This patchset tries to add a new bpf prog type and use it to select
a victim memcg when global OOM is invoked. The main motivation is
the need for customizable OOM victim selection functionality so that
we can protect more important apps from the OOM killer.
Chuyi Zhou (5):
bpf: Introduce BPF_PROG_TYPE_OOM_POLICY
mm: Select victim memcg using bpf prog
libbpf, bpftool: Support BPF_PROG_TYPE_OOM_POLICY
bpf: Add a new bpf helper to get cgroup ino
bpf: Sample BPF program to set oom policy
include/linux/bpf_oom.h | 22 ++++
include/linux/bpf_types.h | 2 +
include/linux/memcontrol.h | 6 ++
include/uapi/linux/bpf.h | 21 ++++
kernel/bpf/core.c | 1 +
kernel/bpf/helpers.c | 17 +++
kernel/bpf/syscall.c | 10 ++
mm/memcontrol.c | 50 +++++++++
mm/oom_kill.c | 185 +++++++++++++++++++++++++++++++++
samples/bpf/Makefile | 3 +
samples/bpf/oom_kern.c | 42 ++++++++
samples/bpf/oom_user.c | 128 +++++++++++++++++++++++
tools/bpf/bpftool/common.c | 1 +
tools/include/uapi/linux/bpf.h | 21 ++++
tools/lib/bpf/libbpf.c | 3 +
tools/lib/bpf/libbpf_probes.c | 2 +
16 files changed, 514 insertions(+)
create mode 100644 include/linux/bpf_oom.h
create mode 100644 samples/bpf/oom_kern.c
create mode 100644 samples/bpf/oom_user.c
--
2.20.1
This patch adds a sample showing how to set an OOM victim selection policy
to protect certain cgroups.
The BPF program, oom_kern.c, compares the scores of two sibling memcgs and
selects the larger one. The userspace program oom_user.c maintains a score
map using cgroup inode numbers as the keys and the scores as the values.
Users can set a lower score for some cgroups compared to their siblings to
avoid them being selected.
Suggested-by: Abel Wu <[email protected]>
Signed-off-by: Chuyi Zhou <[email protected]>
---
samples/bpf/Makefile | 3 +
samples/bpf/oom_kern.c | 42 ++++++++++++++
samples/bpf/oom_user.c | 128 +++++++++++++++++++++++++++++++++++++++++
3 files changed, 173 insertions(+)
create mode 100644 samples/bpf/oom_kern.c
create mode 100644 samples/bpf/oom_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 615f24ebc49c..09dbdec22dad 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -56,6 +56,7 @@ tprogs-y += xdp_redirect_map_multi
tprogs-y += xdp_redirect_map
tprogs-y += xdp_redirect
tprogs-y += xdp_monitor
+tprogs-y += oom
# Libbpf dependencies
LIBBPF_SRC = $(TOOLS_PATH)/lib/bpf
@@ -118,6 +119,7 @@ xdp_redirect_map-objs := xdp_redirect_map_user.o $(XDP_SAMPLE)
xdp_redirect-objs := xdp_redirect_user.o $(XDP_SAMPLE)
xdp_monitor-objs := xdp_monitor_user.o $(XDP_SAMPLE)
xdp_router_ipv4-objs := xdp_router_ipv4_user.o $(XDP_SAMPLE)
+oom-objs := oom_user.o
# Tell kbuild to always build the programs
always-y := $(tprogs-y)
@@ -173,6 +175,7 @@ always-y += xdp_sample_pkts_kern.o
always-y += ibumad_kern.o
always-y += hbm_out_kern.o
always-y += hbm_edt_kern.o
+always-y += oom_kern.o
ifeq ($(ARCH), arm)
# Strip all except -D__LINUX_ARM_ARCH__ option needed to handle linux
diff --git a/samples/bpf/oom_kern.c b/samples/bpf/oom_kern.c
new file mode 100644
index 000000000000..1e0e2de1e06e
--- /dev/null
+++ b/samples/bpf/oom_kern.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <uapi/linux/bpf.h>
+#include <linux/version.h>
+#include <bpf/bpf_helpers.h>
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 1024);
+ __type(key, u64);
+ __type(value, u32);
+} sc_map SEC(".maps");
+
+SEC("oom_policy")
+int bpf_prog1(struct bpf_oom_ctx *ctx)
+{
+ u64 cg_ino_1, cg_ino_2;
+ u32 cs_1, sc_2;
+ u32 *value;
+
+ cs_1 = sc_2 = 250;
+ cg_ino_1 = bpf_get_ino_from_cgroup_id(ctx->cg_id_1);
+ cg_ino_2 = bpf_get_ino_from_cgroup_id(ctx->cg_id_2);
+
+ value = bpf_map_lookup_elem(&sc_map, &cg_ino_1);
+ if (value)
+ cs_1 = *value;
+
+ value = bpf_map_lookup_elem(&sc_map, &cg_ino_2);
+ if (value)
+ sc_2 = *value;
+
+ if (cs_1 > sc_2)
+ ctx->cmp_ret = BPF_OOM_CMP_GREATER;
+ else if (cs_1 < sc_2)
+ ctx->cmp_ret = BPF_OOM_CMP_LESS;
+ else
+ ctx->cmp_ret = BPF_OOM_CMP_EQUAL;
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/oom_user.c b/samples/bpf/oom_user.c
new file mode 100644
index 000000000000..7bd2d56ba910
--- /dev/null
+++ b/samples/bpf/oom_user.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <bpf/libbpf.h>
+#include <bpf/bpf.h>
+#include "trace_helpers.h"
+
+static int map_fd, prog_fd;
+
+static unsigned long long get_cgroup_inode(const char *path)
+{
+ unsigned long long inode;
+ struct stat file_stat;
+ int fd, ret;
+
+ fd = open(path, O_RDONLY);
+ if (fd < 0)
+ return 0;
+
+ ret = fstat(fd, &file_stat);
+ if (ret < 0)
+ return 0;
+
+ inode = file_stat.st_ino;
+ close(fd);
+ return inode;
+}
+
+static int set_cgroup_oom_score(const char *cg_path, int score)
+{
+ unsigned long long ino = get_cgroup_inode(cg_path);
+
+ if (!ino) {
+ fprintf(stderr, "ERROR: get inode for %s failed\n", cg_path);
+ return 1;
+ }
+ if (bpf_map_update_elem(map_fd, &ino, &score, BPF_ANY)) {
+ fprintf(stderr, "ERROR: update map failed\n");
+ return 1;
+ }
+
+ return 0;
+}
+
+/**
+ * A simple sample of prefer select /root/blue/instance_1 as victim memcg
+ * and protect /root/blue/instance_2
+ * root
+ * / \
+ * user ... blue
+ * / \ / \
+ * .. instance_1 instance_2
+ */
+
+int main(int argc, char **argv)
+{
+ struct bpf_object *obj = NULL;
+ struct bpf_program *prog;
+ int target_fd = 0;
+ unsigned int prog_cnt;
+
+ obj = bpf_object__open_file("oom_kern.o", NULL);
+ if (libbpf_get_error(obj)) {
+ fprintf(stderr, "ERROR: opening BPF object file failed\n");
+ obj = NULL;
+ goto cleanup;
+ }
+
+ prog = bpf_object__next_program(obj, NULL);
+ bpf_program__set_type(prog, BPF_PROG_TYPE_OOM_POLICY);
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ fprintf(stderr, "ERROR: loading BPF object file failed\n");
+ goto cleanup;
+ }
+
+ map_fd = bpf_object__find_map_fd_by_name(obj, "sc_map");
+
+ if (map_fd < 0) {
+ fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+ goto cleanup;
+ }
+
+ /*
+ * In this sample, default score is 250 (see oom_kern.c).
+ * set high score for /blue and /blue/instance_1,
+ * so when global oom happened, /blue/instance_1 would
+ * be chosed as victim memcg
+ */
+ if (set_cgroup_oom_score("/sys/fs/cgroup/blue/", 500)) {
+ fprintf(stderr, "ERROR: set score for /blue failed\n");
+ goto cleanup;
+ }
+ if (set_cgroup_oom_score("/sys/fs/cgroup/blue/instance_1", 500)) {
+ fprintf(stderr, "ERROR: set score for /blue/instance_2 failed\n");
+ goto cleanup;
+ }
+
+ /* set low score to protect /blue/instance_2 */
+ if (set_cgroup_oom_score("/sys/fs/cgroup/blue/instance_2", 100)) {
+ fprintf(stderr, "ERROR: set score for /blue/instance_1 failed\n");
+ goto cleanup;
+ }
+
+ prog_fd = bpf_program__fd(prog);
+
+ /* Attach bpf program */
+ if (bpf_prog_attach(prog_fd, target_fd, BPF_OOM_POLICY, 0)) {
+ fprintf(stderr, "Failed to attach BPF_OOM_POLICY program");
+ goto cleanup;
+ }
+ if (bpf_prog_query(target_fd, BPF_OOM_POLICY, 0, NULL, NULL, &prog_cnt)) {
+ fprintf(stderr, "Failed to query attached programs\n");
+ goto cleanup;
+ }
+ printf("prog_cnt: %d\n", prog_cnt);
+
+cleanup:
+ bpf_object__close(obj);
+ return 0;
+}
--
2.20.1
Support BPF_PROG_TYPE_OOM_POLICY program in libbpf and bpftool, so that
we can identify and use BPF_PROG_TYPE_OOM_POLICY in our application.
Signed-off-by: Chuyi Zhou <[email protected]>
---
tools/bpf/bpftool/common.c | 1 +
tools/include/uapi/linux/bpf.h | 14 ++++++++++++++
tools/lib/bpf/libbpf.c | 3 +++
tools/lib/bpf/libbpf_probes.c | 2 ++
4 files changed, 20 insertions(+)
diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index cc6e6aae2447..c5c311299c4a 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -1089,6 +1089,7 @@ const char *bpf_attach_type_input_str(enum bpf_attach_type t)
case BPF_TRACE_FENTRY: return "fentry";
case BPF_TRACE_FEXIT: return "fexit";
case BPF_MODIFY_RETURN: return "mod_ret";
+ case BPF_OOM_POLICY: return "oom_policy";
case BPF_SK_REUSEPORT_SELECT: return "sk_skb_reuseport_select";
case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: return "sk_skb_reuseport_select_or_migrate";
default: return libbpf_bpf_attach_type_str(t);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 60a9d59beeab..9da0d61cf703 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -987,6 +987,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_SK_LOOKUP,
BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
BPF_PROG_TYPE_NETFILTER,
+ BPF_PROG_TYPE_OOM_POLICY,
};
enum bpf_attach_type {
@@ -1036,6 +1037,7 @@ enum bpf_attach_type {
BPF_LSM_CGROUP,
BPF_STRUCT_OPS,
BPF_NETFILTER,
+ BPF_OOM_POLICY,
__MAX_BPF_ATTACH_TYPE
};
@@ -6825,6 +6827,18 @@ struct bpf_cgroup_dev_ctx {
__u32 minor;
};
+enum {
+ BPF_OOM_CMP_EQUAL = (1ULL << 0),
+ BPF_OOM_CMP_GREATER = (1ULL << 1),
+ BPF_OOM_CMP_LESS = (1ULL << 2),
+};
+
+struct bpf_oom_ctx {
+ __u64 cg_id_1;
+ __u64 cg_id_2;
+ __u8 cmp_ret;
+};
+
struct bpf_raw_tracepoint_args {
__u64 args[0];
};
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 214f828ece6b..10496bb9b3bc 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -118,6 +118,7 @@ static const char * const attach_type_name[] = {
[BPF_TRACE_KPROBE_MULTI] = "trace_kprobe_multi",
[BPF_STRUCT_OPS] = "struct_ops",
[BPF_NETFILTER] = "netfilter",
+ [BPF_OOM_POLICY] = "oom_policy",
};
static const char * const link_type_name[] = {
@@ -204,6 +205,7 @@ static const char * const prog_type_name[] = {
[BPF_PROG_TYPE_SK_LOOKUP] = "sk_lookup",
[BPF_PROG_TYPE_SYSCALL] = "syscall",
[BPF_PROG_TYPE_NETFILTER] = "netfilter",
+ [BPF_PROG_TYPE_OOM_POLICY] = "oom_policy",
};
static int __base_pr(enum libbpf_print_level level, const char *format,
@@ -8738,6 +8740,7 @@ static const struct bpf_sec_def section_defs[] = {
SEC_DEF("struct_ops.s+", STRUCT_OPS, 0, SEC_SLEEPABLE),
SEC_DEF("sk_lookup", SK_LOOKUP, BPF_SK_LOOKUP, SEC_ATTACHABLE),
SEC_DEF("netfilter", NETFILTER, BPF_NETFILTER, SEC_NONE),
+ SEC_DEF("oom_policy", OOM_POLICY, BPF_OOM_POLICY, SEC_ATTACHABLE_OPT),
};
static size_t custom_sec_def_cnt;
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index 9c4db90b92b6..dbac3e98a2d7 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -129,6 +129,8 @@ static int probe_prog_load(enum bpf_prog_type prog_type,
case BPF_PROG_TYPE_LIRC_MODE2:
opts.expected_attach_type = BPF_LIRC_MODE2;
break;
+ case BPF_PROG_TYPE_OOM_POLICY:
+ opts.expected_attach_type = BPF_OOM_POLICY;
case BPF_PROG_TYPE_TRACING:
case BPF_PROG_TYPE_LSM:
opts.log_buf = buf;
--
2.20.1
This patch introduces a BPF_PROG_TYPE_OOM_POLICY program type. This
prog will be used to select a leaf memcg as the victim from the memcg tree
when global oom is invoked.
The program takes two sibling cgroups' ids as parameters and returns a
comparison result indicating which one should be chosen as the victim.
Suggested-by: Abel Wu <[email protected]>
Signed-off-by: Chuyi Zhou <[email protected]>
---
include/linux/bpf_oom.h | 22 +++++
include/linux/bpf_types.h | 2 +
include/uapi/linux/bpf.h | 14 ++++
kernel/bpf/syscall.c | 10 +++
mm/oom_kill.c | 168 ++++++++++++++++++++++++++++++++++++++
5 files changed, 216 insertions(+)
create mode 100644 include/linux/bpf_oom.h
diff --git a/include/linux/bpf_oom.h b/include/linux/bpf_oom.h
new file mode 100644
index 000000000000..f4235a83d3bb
--- /dev/null
+++ b/include/linux/bpf_oom.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _BPF_OOM_H
+#define _BPF_OOM_H
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <uapi/linux/bpf.h>
+
+struct bpf_oom_policy {
+ struct bpf_prog_array __rcu *progs;
+};
+
+int oom_policy_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog);
+int oom_policy_prog_detach(const union bpf_attr *attr);
+int oom_policy_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr);
+
+int __bpf_run_oom_policy(u64 cg_id_1, u64 cg_id_2);
+
+bool bpf_oom_policy_enabled(void);
+
+#endif
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index fc0d6f32c687..8ab6009b7dd9 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -83,6 +83,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SYSCALL, bpf_syscall,
BPF_PROG_TYPE(BPF_PROG_TYPE_NETFILTER, netfilter,
struct bpf_nf_ctx, struct bpf_nf_ctx)
#endif
+BPF_PROG_TYPE(BPF_PROG_TYPE_OOM_POLICY, oom_policy,
+ struct bpf_oom_ctx, struct bpf_oom_ctx)
BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 60a9d59beeab..9da0d61cf703 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -987,6 +987,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_SK_LOOKUP,
BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
BPF_PROG_TYPE_NETFILTER,
+ BPF_PROG_TYPE_OOM_POLICY,
};
enum bpf_attach_type {
@@ -1036,6 +1037,7 @@ enum bpf_attach_type {
BPF_LSM_CGROUP,
BPF_STRUCT_OPS,
BPF_NETFILTER,
+ BPF_OOM_POLICY,
__MAX_BPF_ATTACH_TYPE
};
@@ -6825,6 +6827,18 @@ struct bpf_cgroup_dev_ctx {
__u32 minor;
};
+enum {
+ BPF_OOM_CMP_EQUAL = (1ULL << 0),
+ BPF_OOM_CMP_GREATER = (1ULL << 1),
+ BPF_OOM_CMP_LESS = (1ULL << 2),
+};
+
+struct bpf_oom_ctx {
+ __u64 cg_id_1;
+ __u64 cg_id_2;
+ __u8 cmp_ret;
+};
+
struct bpf_raw_tracepoint_args {
__u64 args[0];
};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a2aef900519c..fb6fb6294eba 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -5,6 +5,7 @@
#include <linux/bpf-cgroup.h>
#include <linux/bpf_trace.h>
#include <linux/bpf_lirc.h>
+#include <linux/bpf_oom.h>
#include <linux/bpf_verifier.h>
#include <linux/bsearch.h>
#include <linux/btf.h>
@@ -3588,6 +3589,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
return BPF_PROG_TYPE_XDP;
case BPF_LSM_CGROUP:
return BPF_PROG_TYPE_LSM;
+ case BPF_OOM_POLICY:
+ return BPF_PROG_TYPE_OOM_POLICY;
default:
return BPF_PROG_TYPE_UNSPEC;
}
@@ -3634,6 +3637,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_PROG_TYPE_FLOW_DISSECTOR:
ret = netns_bpf_prog_attach(attr, prog);
break;
+ case BPF_PROG_TYPE_OOM_POLICY:
+ ret = oom_policy_prog_attach(attr, prog);
+ break;
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SKB:
case BPF_PROG_TYPE_CGROUP_SOCK:
@@ -3676,6 +3682,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
return lirc_prog_detach(attr);
case BPF_PROG_TYPE_FLOW_DISSECTOR:
return netns_bpf_prog_detach(attr, ptype);
+ case BPF_PROG_TYPE_OOM_POLICY:
+ return oom_policy_prog_detach(attr);
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SKB:
case BPF_PROG_TYPE_CGROUP_SOCK:
@@ -3733,6 +3741,8 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_FLOW_DISSECTOR:
case BPF_SK_LOOKUP:
return netns_bpf_prog_query(attr, uattr);
+ case BPF_OOM_POLICY:
+ return oom_policy_prog_query(attr, uattr);
case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT:
case BPF_SK_MSG_VERDICT:
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 612b5597d3af..01af8adaa16c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -19,6 +19,7 @@
*/
#include <linux/oom.h>
+#include <linux/bpf_oom.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/gfp.h>
@@ -73,6 +74,9 @@ static inline bool is_memcg_oom(struct oom_control *oc)
return oc->memcg != NULL;
}
+DEFINE_MUTEX(oom_policy_lock);
+static struct bpf_oom_policy global_oom_policy;
+
#ifdef CONFIG_NUMA
/**
* oom_cpuset_eligible() - check task eligibility for kill
@@ -1258,3 +1262,167 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
return -ENOSYS;
#endif /* CONFIG_MMU */
}
+
+const struct bpf_prog_ops oom_policy_prog_ops = {
+};
+
+static const struct bpf_func_proto *
+oom_policy_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ return bpf_base_func_proto(func_id);
+}
+
+static bool oom_policy_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < 0 || off + size > sizeof(struct bpf_oom_ctx) || off % size)
+ return false;
+
+ switch (off) {
+ case bpf_ctx_range(struct bpf_oom_ctx, cg_id_1):
+ case bpf_ctx_range(struct bpf_oom_ctx, cg_id_2):
+ if (type != BPF_READ)
+ return false;
+ bpf_ctx_record_field_size(info, sizeof(__u64));
+ return bpf_ctx_narrow_access_ok(off, size, sizeof(__u64));
+ case bpf_ctx_range(struct bpf_oom_ctx, cmp_ret):
+ if (type == BPF_READ) {
+ bpf_ctx_record_field_size(info, sizeof(__u8));
+ return bpf_ctx_narrow_access_ok(off, size, sizeof(__u8));
+ } else {
+ return size == sizeof(__u8);
+ }
+ default:
+ return false;
+ }
+}
+
+const struct bpf_verifier_ops oom_policy_verifier_ops = {
+ .get_func_proto = oom_policy_func_proto,
+ .is_valid_access = oom_policy_is_valid_access,
+};
+
+#define BPF_MAX_PROGS 10
+
+int oom_policy_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_prog_array *old_array;
+ struct bpf_prog_array *new_array;
+ int ret;
+
+ mutex_lock(&oom_policy_lock);
+ old_array = rcu_dereference(global_oom_policy.progs);
+ if (old_array && bpf_prog_array_length(old_array) >= BPF_MAX_PROGS) {
+ ret = -E2BIG;
+ goto unlock;
+ }
+ ret = bpf_prog_array_copy(old_array, NULL, prog, 0, &new_array);
+ if (ret < 0)
+ goto unlock;
+
+ rcu_assign_pointer(global_oom_policy.progs, new_array);
+ bpf_prog_array_free(old_array);
+
+unlock:
+ mutex_unlock(&oom_policy_lock);
+ return ret;
+}
+
+static int detach_prog(struct bpf_prog *prog)
+{
+ struct bpf_prog_array *old_array;
+ struct bpf_prog_array *new_array;
+ int ret;
+
+ mutex_lock(&oom_policy_lock);
+ old_array = rcu_dereference(global_oom_policy.progs);
+ ret = bpf_prog_array_copy(old_array, prog, NULL, 0, &new_array);
+
+ if (ret)
+ goto unlock;
+
+ rcu_assign_pointer(global_oom_policy.progs, new_array);
+ bpf_prog_array_free(old_array);
+ bpf_prog_put(prog);
+unlock:
+ mutex_unlock(&oom_policy_lock);
+ return ret;
+}
+
+int oom_policy_prog_detach(const union bpf_attr *attr)
+{
+ struct bpf_prog *prog;
+ int ret;
+
+ if (attr->attach_flags)
+ return -EINVAL;
+
+ prog = bpf_prog_get_type(attr->attach_bpf_fd,
+ BPF_PROG_TYPE_OOM_POLICY);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ ret = detach_prog(prog);
+ bpf_prog_put(prog);
+
+ return ret;
+}
+
+int oom_policy_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr)
+{
+ __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+ struct bpf_prog_array *progs;
+ u32 cnt, flags;
+ int ret = 0;
+
+ if (attr->query.query_flags)
+ return -EINVAL;
+
+ mutex_lock(&oom_policy_lock);
+ progs = rcu_dereference(global_oom_policy.progs);
+ cnt = progs ? bpf_prog_array_length(progs) : 0;
+ if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) {
+ ret = -EFAULT;
+ goto unlock;
+ }
+ if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) {
+ ret = -EFAULT;
+ goto unlock;
+ }
+ if (attr->query.prog_cnt != 0 && prog_ids && cnt)
+ ret = bpf_prog_array_copy_to_user(progs, prog_ids,
+ attr->query.prog_cnt);
+
+unlock:
+ mutex_unlock(&oom_policy_lock);
+ return ret;
+}
+
+int __bpf_run_oom_policy(u64 cg_id_1, u64 cg_id_2)
+{
+ struct bpf_oom_ctx ctx = {
+ .cg_id_1 = cg_id_1,
+ .cg_id_2 = cg_id_2,
+ .cmp_ret = BPF_OOM_CMP_EQUAL,
+ };
+ rcu_read_lock();
+ bpf_prog_run_array(rcu_dereference(global_oom_policy.progs),
+ &ctx, bpf_prog_run);
+ rcu_read_unlock();
+ return ctx.cmp_ret;
+}
+
+bool bpf_oom_policy_enabled(void)
+{
+ struct bpf_prog_array *prog_array;
+ bool empty = true;
+
+ rcu_read_lock();
+ prog_array = rcu_dereference(global_oom_policy.progs);
+ if (prog_array)
+ empty = bpf_prog_array_is_empty(prog_array);
+ rcu_read_unlock();
+ return !empty;
+}
--
2.20.1
This patch uses a BPF prog to bypass the default select_bad_process method
and select a victim memcg when global oom is invoked. Specifically, we
iterate root_mem_cgroup's children and select a next iteration root
through __bpf_run_oom_policy(). Repeat until we finally find a leaf
memcg in the last layer. Then we use oom_evaluate_task() to find a
victim task in the selected memcg. If there is no suitable process
to be killed in the memcg, we fall back to the default method.
Suggested-by: Abel Wu <[email protected]>
Signed-off-by: Chuyi Zhou <[email protected]>
---
include/linux/memcontrol.h | 6 +++++
mm/memcontrol.c | 50 ++++++++++++++++++++++++++++++++++++++
mm/oom_kill.c | 17 +++++++++++++
3 files changed, 73 insertions(+)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5818af8eca5a..7fedc2521c8b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1155,6 +1155,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
gfp_t gfp_mask,
unsigned long *total_scanned);
+struct mem_cgroup *select_victim_memcg(void);
#else /* CONFIG_MEMCG */
#define MEM_CGROUP_ID_SHIFT 0
@@ -1588,6 +1589,11 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
{
return 0;
}
+
+static inline struct mem_cgroup *select_victim_memcg(void)
+{
+ return NULL;
+}
#endif /* CONFIG_MEMCG */
static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e8ca4bdcb03c..c6b42635f1af 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -64,6 +64,7 @@
#include <linux/psi.h>
#include <linux/seq_buf.h>
#include <linux/sched/isolation.h>
+#include <linux/bpf_oom.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
@@ -2638,6 +2639,55 @@ void mem_cgroup_handle_over_high(void)
css_put(&memcg->css);
}
+struct mem_cgroup *select_victim_memcg(void)
+{
+ struct cgroup_subsys_state *pos, *parent, *victim;
+ struct mem_cgroup *victim_memcg;
+
+ parent = &root_mem_cgroup->css;
+ victim_memcg = NULL;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return NULL;
+
+ rcu_read_lock();
+ while (parent) {
+ struct cgroup_subsys_state *chosen = NULL;
+ struct mem_cgroup *pos_mem, *chosen_mem;
+ u64 chosen_id, pos_id;
+ int cmp_ret;
+
+ victim = parent;
+
+ list_for_each_entry_rcu(pos, &parent->children, sibling) {
+ pos_id = cgroup_id(pos->cgroup);
+ if (!chosen)
+ goto chose;
+
+ cmp_ret = __bpf_run_oom_policy(chosen_id, pos_id);
+ if (cmp_ret == BPF_OOM_CMP_GREATER)
+ continue;
+ if (cmp_ret == BPF_OOM_CMP_EQUAL) {
+ pos_mem = mem_cgroup_from_css(pos);
+ chosen_mem = mem_cgroup_from_css(chosen);
+ if (page_counter_read(&pos_mem->memory) <=
+ page_counter_read(&chosen_mem->memory))
+ continue;
+ }
+chose:
+ chosen = pos;
+ chosen_id = pos_id;
+ }
+ parent = chosen;
+ }
+
+ if (victim && css_tryget(victim))
+ victim_memcg = mem_cgroup_from_css(victim);
+ rcu_read_unlock();
+
+ return victim_memcg;
+}
+
static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages)
{
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 01af8adaa16c..b88c8c7d4ee4 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -361,6 +361,19 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
return 1;
}
+static bool bpf_select_bad_process(struct oom_control *oc)
+{
+ struct mem_cgroup *victim_memcg;
+
+ victim_memcg = select_victim_memcg();
+ if (victim_memcg) {
+ mem_cgroup_scan_tasks(victim_memcg, oom_evaluate_task, oc);
+ css_put(&victim_memcg->css);
+ }
+
+ return !!oc->chosen;
+}
+
/*
* Simple selection loop. We choose the process with the highest number of
* 'points'. In case scan was aborted, oc->chosen is set to -1.
@@ -372,6 +385,9 @@ static void select_bad_process(struct oom_control *oc)
if (is_memcg_oom(oc))
mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
else {
+ if (bpf_oom_policy_enabled() && bpf_select_bad_process(oc))
+ return;
+
struct task_struct *p;
rcu_read_lock();
@@ -1426,3 +1442,4 @@ bool bpf_oom_policy_enabled(void)
rcu_read_unlock();
return !empty;
}
+
--
2.20.1
On Thu 27-07-23 15:36:27, Chuyi Zhou wrote:
> This patchset tries to add a new bpf prog type and use it to select
> a victim memcg when global OOM is invoked. The mainly motivation is
> the need to customizable OOM victim selection functionality so that
> we can protect more important app from OOM killer.
This is rather modest to give an idea how the whole thing is supposed to
work. I have looked through patches very quickly but there is no overall
design described anywhere either.
Please could you give us a high level design description and reasoning
why certain decisions have been made? e.g. why is this limited to the
global oom situation, why is the BPF program forced to operate on memcgs
as entities etc...
Also it would be very helpful to call out limitations of the BPF
program, if there are any.
Thanks!
--
Michal Hocko
SUSE Labs
This patch adds a new bpf helper, bpf_get_ino_from_cgroup_id, so that
we can get the inode number once we know the cgroup id.
Cgroup_id is used to identify a cgroup in a BPF prog. However, we can't
get the cgroup id directly in userspace applications. In userspace,
we are used to identifying cgroups by their paths or their inodes.
However, the cgroup id is not always equal to the inode number, depending
on the size of ino_t.
For example, given some cgroup paths, we only care about the events
related to those cgroups. We can only do this by updating these paths
in a map and doing string comparison in the BPF prog, which is not very
convenient. However, with this new helper, we just need to record
the inode in a map and look up an inode number in the BPF prog.
Signed-off-by: Chuyi Zhou <[email protected]>
---
include/uapi/linux/bpf.h | 7 +++++++
kernel/bpf/core.c | 1 +
kernel/bpf/helpers.c | 17 +++++++++++++++++
tools/include/uapi/linux/bpf.h | 7 +++++++
4 files changed, 32 insertions(+)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 9da0d61cf703..01efb289fa14 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5575,6 +5575,12 @@ union bpf_attr {
* 0 on success.
*
* **-ENOENT** if the bpf_local_storage cannot be found.
+ *
+ * u64 bpf_get_ino_from_cgroup_id(u64 id)
+ * Description
+ * Get inode number from a *cgroup id*.
+ * Return
+ * Inode number.
*/
#define ___BPF_FUNC_MAPPER(FN, ctx...) \
FN(unspec, 0, ##ctx) \
@@ -5789,6 +5795,7 @@ union bpf_attr {
FN(user_ringbuf_drain, 209, ##ctx) \
FN(cgrp_storage_get, 210, ##ctx) \
FN(cgrp_storage_delete, 211, ##ctx) \
+ FN(get_ino_from_cgroup_id, 212, ##ctx) \
/* */
/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index dc85240a0134..49dfdb2dd336 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2666,6 +2666,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto __weak;
const struct bpf_func_proto bpf_seq_printf_btf_proto __weak;
const struct bpf_func_proto bpf_set_retval_proto __weak;
const struct bpf_func_proto bpf_get_retval_proto __weak;
+const struct bpf_func_proto bpf_get_ino_from_cgroup_id_proto __weak;
const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
{
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 9e80efa59a5d..e87328b008d3 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -433,6 +433,21 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
};
+
+BPF_CALL_1(bpf_get_ino_from_cgroup_id, u64, id)
+{
+ u64 ino = kernfs_id_ino(id);
+
+ return ino;
+}
+
+const struct bpf_func_proto bpf_get_ino_from_cgroup_id_proto = {
+ .func = bpf_get_ino_from_cgroup_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+};
+
#endif /* CONFIG_CGROUPS */
#define BPF_STRTOX_BASE_MASK 0x1F
@@ -1767,6 +1782,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_get_current_cgroup_id_proto;
case BPF_FUNC_get_current_ancestor_cgroup_id:
return &bpf_get_current_ancestor_cgroup_id_proto;
+ case BPF_FUNC_get_ino_from_cgroup_id:
+ return &bpf_get_ino_from_cgroup_id_proto;
#endif
default:
break;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 9da0d61cf703..661d97aacb85 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5575,6 +5575,12 @@ union bpf_attr {
* 0 on success.
*
* **-ENOENT** if the bpf_local_storage cannot be found.
+ *
+ * u64 bpf_get_ino_from_cgroup_id(u64 id)
+ * Description
+ * Get inode number from a *cgroup id*.
+ * Return
+ * Inode number.
*/
#define ___BPF_FUNC_MAPPER(FN, ctx...) \
FN(unspec, 0, ##ctx) \
@@ -5789,6 +5795,7 @@ union bpf_attr {
FN(user_ringbuf_drain, 209, ##ctx) \
FN(cgrp_storage_get, 210, ##ctx) \
FN(cgrp_storage_delete, 211, ##ctx) \
+ FN(get_ino_from_cgroup_id, 212, ##ctx) \
/* */
/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't
--
2.20.1
On 27/07/2023 08:36, Chuyi Zhou wrote:
> This patchset tries to add a new bpf prog type and use it to select
> a victim memcg when global OOM is invoked. The mainly motivation is
> the need to customizable OOM victim selection functionality so that
> we can protect more important app from OOM killer.
>
It's a nice use case, but at a high level, the approach pursued here
is, as I understand it, discouraged for new BPF program development.
Specifically, adding a new BPF program type with semantics like this
is not preferred. Instead, can you look at using something like
- using "fmod_ret" instead of a new program type
- use BPF kfuncs instead of helpers.
- add selftests in tools/testing/selftests/bpf not samples.
There's some examples of how solutions have evolved from the traditional
approach (adding a new program type, helpers etc) to using kfuncs etc on
this list - for example HID-BPF and the BPF scheduler series - which
should help orient you. There are presentations from Linux Plumbers 2022
that walk through some of this too.
Judging by the sample program example, all you should need here is a way
to override the return value of bpf_oom_set_policy() - a noinline
function that by default returns a no-op. It can then be overridden by
an "fmod_ret" BPF program.
One thing you lose is cgroup specificity at BPF attach time, but you can
always add predicates based on the cgroup to your BPF program if needed.
Alan
> Chuyi Zhou (5):
> bpf: Introduce BPF_PROG_TYPE_OOM_POLICY
> mm: Select victim memcg using bpf prog
> libbpf, bpftool: Support BPF_PROG_TYPE_OOM_POLICY
> bpf: Add a new bpf helper to get cgroup ino
> bpf: Sample BPF program to set oom policy
>
> include/linux/bpf_oom.h | 22 ++++
> include/linux/bpf_types.h | 2 +
> include/linux/memcontrol.h | 6 ++
> include/uapi/linux/bpf.h | 21 ++++
> kernel/bpf/core.c | 1 +
> kernel/bpf/helpers.c | 17 +++
> kernel/bpf/syscall.c | 10 ++
> mm/memcontrol.c | 50 +++++++++
> mm/oom_kill.c | 185 +++++++++++++++++++++++++++++++++
> samples/bpf/Makefile | 3 +
> samples/bpf/oom_kern.c | 42 ++++++++
> samples/bpf/oom_user.c | 128 +++++++++++++++++++++++
> tools/bpf/bpftool/common.c | 1 +
> tools/include/uapi/linux/bpf.h | 21 ++++
> tools/lib/bpf/libbpf.c | 3 +
> tools/lib/bpf/libbpf_probes.c | 2 +
> 16 files changed, 514 insertions(+)
> create mode 100644 include/linux/bpf_oom.h
> create mode 100644 samples/bpf/oom_kern.c
> create mode 100644 samples/bpf/oom_user.c
>
2023-07-27 15:36 UTC+0800 ~ Chuyi Zhou <[email protected]>
> Support BPF_PROG_TYPE_OOM_POLICY program in libbpf and bpftool, so that
> we can identify and use BPF_PROG_TYPE_OOM_POLICY in our application.
>
> Signed-off-by: Chuyi Zhou <[email protected]>
> ---
> tools/bpf/bpftool/common.c | 1 +
> tools/include/uapi/linux/bpf.h | 14 ++++++++++++++
> tools/lib/bpf/libbpf.c | 3 +++
> tools/lib/bpf/libbpf_probes.c | 2 ++
> 4 files changed, 20 insertions(+)
>
> diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
> index cc6e6aae2447..c5c311299c4a 100644
> --- a/tools/bpf/bpftool/common.c
> +++ b/tools/bpf/bpftool/common.c
> @@ -1089,6 +1089,7 @@ const char *bpf_attach_type_input_str(enum bpf_attach_type t)
> case BPF_TRACE_FENTRY: return "fentry";
> case BPF_TRACE_FEXIT: return "fexit";
> case BPF_MODIFY_RETURN: return "mod_ret";
> + case BPF_OOM_POLICY: return "oom_policy";
This case is not necessary. This block is here to keep legacy attach
type strings supported by bpftool. In your case, the name is the same as
the one provided by libbpf, so...
> case BPF_SK_REUSEPORT_SELECT: return "sk_skb_reuseport_select";
> case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: return "sk_skb_reuseport_select_or_migrate";
> default: return libbpf_bpf_attach_type_str(t);
... we just want to pick it up from libbpf directly, here.
[...]
> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> index 214f828ece6b..10496bb9b3bc 100644
> --- a/tools/lib/bpf/libbpf.c
> +++ b/tools/lib/bpf/libbpf.c
> @@ -118,6 +118,7 @@ static const char * const attach_type_name[] = {
> [BPF_TRACE_KPROBE_MULTI] = "trace_kprobe_multi",
> [BPF_STRUCT_OPS] = "struct_ops",
> [BPF_NETFILTER] = "netfilter",
> + [BPF_OOM_POLICY] = "oom_policy",
> };
在 2023/7/27 16:15, Michal Hocko 写道:
> On Thu 27-07-23 15:36:27, Chuyi Zhou wrote:
>> This patchset tries to add a new bpf prog type and use it to select
>> a victim memcg when global OOM is invoked. The main motivation is
>> the need for customizable OOM victim selection functionality so that
>> we can protect more important apps from the OOM killer.
>
> This is rather modest to give an idea how the whole thing is supposed to
> work. I have looked through patches very quickly but there is no overall
> design described anywhere either.
>
> Please could you give us a high level design description and reasoning
> why certain decisions have been made? e.g. why is this limited to the
> global oom situation, why is the BPF program forced to operate on memcgs
> as entities etc...
> Also it would be very helpful to call out limitations of the BPF
> program, if there are any.
>
> Thanks!
Hi,
Thanks for your advice.
The global/memcg OOM victim selection uses process as the base search
granularity. However, we can see a need for cgroup level protection and
there's been some discussion[1]. It seems reasonable to consider using
memcg as a search granularity in victim selection algorithm.
Besides, it seems a pretty good fit for offloading policy decisions to a
BPF program, since BPF is scalable and flexible. That's why the new BPF
program operates on memcgs as entities.
The idea is to let user choose which leaf in the memcg tree should be
selected as the victim. At the first layer, if we choose A, then it
protects the memcg under the B, C, and D subtrees.
root
/ | \ \
A B C D
/\
E F
Using the BPF prog, we are allowed to compare the OOM priority between
two siblings so that we can choose the best victim in each layer.
For example:
run_prog(B, C) -> choose B
run_prog(B, D) -> choose D
run_prog(A, D) -> choose A
Once we select A as the victim in the first layer, the victim in next
layer would be selected among A's children. Finally, we select a leaf
memcg as victim.
In our scenarios, the impact caused by global OOMs is much more common,
so we only considered global in this patchset. But it seems that the
idea can also be applied to memcg OOM.
[1]https://lore.kernel.org/lkml/ZIgodGWoC%[email protected]/
Thanks!
--
Chuyi Zhou
On Thu, Jul 27, 2023 at 4:45 AM Alan Maguire <[email protected]> wrote:
>
> On 27/07/2023 08:36, Chuyi Zhou wrote:
> > This patchset tries to add a new bpf prog type and use it to select
> > a victim memcg when global OOM is invoked. The main motivation is
> > the need for customizable OOM victim selection functionality so that
> > we can protect more important apps from the OOM killer.
> >
>
> It's a nice use case, but at a high level, the approach pursued here
> is, as I understand it, discouraged for new BPF program development.
> Specifically, adding a new BPF program type with semantics like this
> is not preferred. Instead, can you look at using something like
>
> - using "fmod_ret" instead of a new program type
> - use BPF kfuncs instead of helpers.
> - add selftests in tools/testing/selftests/bpf not samples.
+1 to what Alan said above and below.
Also as Michal said there needs to be a design doc with pros and cons.
We see far too often that people attempt to use BPF in places where it
shouldn't be.
If programmability is not strictly necessary then BPF is not a good fit.
> There's some examples of how solutions have evolved from the traditional
> approach (adding a new program type, helpers etc) to using kfuncs etc on
> this list - for example HID-BPF and the BPF scheduler series - which
> should help orient you. There are presentations from Linux Plumbers 2022
> that walk through some of this too.
>
> Judging by the sample program example, all you should need here is a way
> to override the return value of bpf_oom_set_policy() - a noinline
> function that by default returns a no-op. It can then be overridden by
> an "fmod_ret" BPF program.
>
> One thing you lose is cgroup specificity at BPF attach time, but you can
> always add predicates based on the cgroup to your BPF program if needed.
>
> Alan
>
> > Chuyi Zhou (5):
> > bpf: Introduce BPF_PROG_TYPE_OOM_POLICY
> > mm: Select victim memcg using bpf prog
> > libbpf, bpftool: Support BPF_PROG_TYPE_OOM_POLICY
> > bpf: Add a new bpf helper to get cgroup ino
> > bpf: Sample BPF program to set oom policy
> >
> > include/linux/bpf_oom.h | 22 ++++
> > include/linux/bpf_types.h | 2 +
> > include/linux/memcontrol.h | 6 ++
> > include/uapi/linux/bpf.h | 21 ++++
> > kernel/bpf/core.c | 1 +
> > kernel/bpf/helpers.c | 17 +++
> > kernel/bpf/syscall.c | 10 ++
> > mm/memcontrol.c | 50 +++++++++
> > mm/oom_kill.c | 185 +++++++++++++++++++++++++++++++++
> > samples/bpf/Makefile | 3 +
> > samples/bpf/oom_kern.c | 42 ++++++++
> > samples/bpf/oom_user.c | 128 +++++++++++++++++++++++
> > tools/bpf/bpftool/common.c | 1 +
> > tools/include/uapi/linux/bpf.h | 21 ++++
> > tools/lib/bpf/libbpf.c | 3 +
> > tools/lib/bpf/libbpf_probes.c | 2 +
> > 16 files changed, 514 insertions(+)
> > create mode 100644 include/linux/bpf_oom.h
> > create mode 100644 samples/bpf/oom_kern.c
> > create mode 100644 samples/bpf/oom_user.c
> >
On Thu 27-07-23 20:12:01, Chuyi Zhou wrote:
>
>
> 在 2023/7/27 16:15, Michal Hocko 写道:
> > On Thu 27-07-23 15:36:27, Chuyi Zhou wrote:
> > > This patchset tries to add a new bpf prog type and use it to select
> > > a victim memcg when global OOM is invoked. The main motivation is
> > > the need for customizable OOM victim selection functionality so that
> > > we can protect more important apps from the OOM killer.
> >
> > This is rather modest to give an idea how the whole thing is supposed to
> > work. I have looked through patches very quickly but there is no overall
> > design described anywhere either.
> >
> > Please could you give us a high level design description and reasoning
> > why certain decisions have been made? e.g. why is this limited to the
> > global oom situation, why is the BPF program forced to operate on memcgs
> > as entities etc...
> > Also it would be very helpful to call out limitations of the BPF
> > program, if there are any.
> >
> > Thanks!
>
> Hi,
>
> Thanks for your advice.
>
> The global/memcg OOM victim selection uses process as the base search
> granularity. However, we can see a need for cgroup level protection and
> there's been some discussion[1]. It seems reasonable to consider using memcg
> as a search granularity in victim selection algorithm.
Yes, it can be reasonable for some policies but making it central to the
design is very limiting.
> Besides, it seems pretty well fit for offloading policy decisions to a BPF
> program, since BPF is scalable and flexible. That's why the new BPF
> program operate on memcgs as entities.
I do not follow your line of argumentation here. The same could be
argued for processes or beans.
> The idea is to let user choose which leaf in the memcg tree should be
> selected as the victim. At the first layer, if we choose A, then it protects
> the memcg under the B, C, and D subtrees.
>
> root
> / | \ \
> A B C D
> /\
> E F
>
>
> Using the BPF prog, we are allowed to compare the OOM priority between
> two siblings so that we can choose the best victim in each layer.
How is the priority defined and communicated to the userspace.
> For example:
>
> run_prog(B, C) -> choose B
> run_prog(B, D) -> choose D
> run_prog(A, D) -> choose A
>
> Once we select A as the victim in the first layer, the victim in next layer
> would be selected among A's children. Finally, we select a leaf memcg as
> victim.
This sounds like a very specific oom policy and that is fine. But the
interface shouldn't be bound to any concepts like priorities let alone
be bound to memcg based selection. Ideally the BPF program should get
the oom_control as an input and either get a hook to kill process or if
that is not possible then return an entity to kill (either process or
set of processes).
> In our scenarios, the impact caused by global OOM's is much more common, so
> we only considered global in this patchset. But it seems that the idea can
> also be applied to memcg OOM.
The global and memcg OOMs shouldn't have a different interface. If a
specific BPF program wants to implement a different policy for global
vs. memcg OOM then be it but this should be a decision of the said
program not an inherent limitation of the interface.
>
> [1]https://lore.kernel.org/lkml/ZIgodGWoC%[email protected]/
>
> Thanks!
> --
> Chuyi Zhou
--
Michal Hocko
SUSE Labs
On Thu 27-07-23 08:57:03, Alexei Starovoitov wrote:
> On Thu, Jul 27, 2023 at 4:45 AM Alan Maguire <[email protected]> wrote:
> >
> > On 27/07/2023 08:36, Chuyi Zhou wrote:
> > > This patchset tries to add a new bpf prog type and use it to select
> > > a victim memcg when global OOM is invoked. The main motivation is
> > > the need for customizable OOM victim selection functionality so that
> > > we can protect more important apps from the OOM killer.
> > >
> >
> > It's a nice use case, but at a high level, the approach pursued here
> > is, as I understand it, discouraged for new BPF program development.
> > Specifically, adding a new BPF program type with semantics like this
> > is not preferred. Instead, can you look at using something like
> >
> > - using "fmod_ret" instead of a new program type
> > - use BPF kfuncs instead of helpers.
> > - add selftests in tools/testing/selftests/bpf not samples.
>
> +1 to what Alan said above and below.
>
> Also as Michal said there needs to be a design doc with pros and cons.
> We see far too often that people attempt to use BPF in places where it
> shouldn't be.
> If programmability is not strictly necessary then BPF is not a good fit.
To be completely honest I am not sure whether BPF is the right fit
myself. It is definitely a path to be explored but maybe we will learn
not the proper one in the end. The primary reason for considering it
though is that there is endless (+1) different policies how to select a
victim to kill on OOM. So having an interface to hook into and make that
decision sounds very promising. This might be something very static
like EXPORT_SYMBOL that a kernel module can hook into or a more broader
BPF callback that could be more generic and therefore allow for easier
to define policy AFAIU.
--
Michal Hocko
SUSE Labs
Hi,
在 2023/7/27 19:43, Alan Maguire 写道:
> On 27/07/2023 08:36, Chuyi Zhou wrote:
>> This patchset tries to add a new bpf prog type and use it to select
>> a victim memcg when global OOM is invoked. The main motivation is
>> the need for customizable OOM victim selection functionality so that
>> we can protect more important apps from the OOM killer.
>>
>
> It's a nice use case, but at a high level, the approach pursued here
> is, as I understand it, discouraged for new BPF program development.
> Specifically, adding a new BPF program type with semantics like this
> is not preferred. Instead, can you look at using something like
>
> - using "fmod_ret" instead of a new program type
> - use BPF kfuncs instead of helpers.
> - add selftests in tools/testing/selftests/bpf not samples.
>
> There's some examples of how solutions have evolved from the traditional
> approach (adding a new program type, helpers etc) to using kfuncs etc on
> this list - for example HID-BPF and the BPF scheduler series - which
> should help orient you. There are presentations from Linux Plumbers 2022
> that walk through some of this too.
>
> Judging by the sample program example, all you should need here is a way
> to override the return value of bpf_oom_set_policy() - a noinline
> function that by default returns a no-op. It can then be overridden by
> an "fmod_ret" BPF program.
>
Indeed, I'll try to use kfuncs & fmod_ret.
Thanks for your advice.
--
Chuyi Zhou
> One thing you lose is cgroup specificity at BPF attach time, but you can
> always add predicates based on the cgroup to your BPF program if needed.
>
> Alan
>
>> Chuyi Zhou (5):
>> bpf: Introduce BPF_PROG_TYPE_OOM_POLICY
>> mm: Select victim memcg using bpf prog
>> libbpf, bpftool: Support BPF_PROG_TYPE_OOM_POLICY
>> bpf: Add a new bpf helper to get cgroup ino
>> bpf: Sample BPF program to set oom policy
>>
>> include/linux/bpf_oom.h | 22 ++++
>> include/linux/bpf_types.h | 2 +
>> include/linux/memcontrol.h | 6 ++
>> include/uapi/linux/bpf.h | 21 ++++
>> kernel/bpf/core.c | 1 +
>> kernel/bpf/helpers.c | 17 +++
>> kernel/bpf/syscall.c | 10 ++
>> mm/memcontrol.c | 50 +++++++++
>> mm/oom_kill.c | 185 +++++++++++++++++++++++++++++++++
>> samples/bpf/Makefile | 3 +
>> samples/bpf/oom_kern.c | 42 ++++++++
>> samples/bpf/oom_user.c | 128 +++++++++++++++++++++++
>> tools/bpf/bpftool/common.c | 1 +
>> tools/include/uapi/linux/bpf.h | 21 ++++
>> tools/lib/bpf/libbpf.c | 3 +
>> tools/lib/bpf/libbpf_probes.c | 2 +
>> 16 files changed, 514 insertions(+)
>> create mode 100644 include/linux/bpf_oom.h
>> create mode 100644 samples/bpf/oom_kern.c
>> create mode 100644 samples/bpf/oom_user.c
>>
Hi,
在 2023/7/27 20:26, Quentin Monnet 写道:
> 2023-07-27 15:36 UTC+0800 ~ Chuyi Zhou <[email protected]>
>> Support BPF_PROG_TYPE_OOM_POLICY program in libbpf and bpftool, so that
>> we can identify and use BPF_PROG_TYPE_OOM_POLICY in our application.
>>
>> Signed-off-by: Chuyi Zhou <[email protected]>
>> ---
>> tools/bpf/bpftool/common.c | 1 +
>> tools/include/uapi/linux/bpf.h | 14 ++++++++++++++
>> tools/lib/bpf/libbpf.c | 3 +++
>> tools/lib/bpf/libbpf_probes.c | 2 ++
>> 4 files changed, 20 insertions(+)
>>
>> diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
>> index cc6e6aae2447..c5c311299c4a 100644
>> --- a/tools/bpf/bpftool/common.c
>> +++ b/tools/bpf/bpftool/common.c
>> @@ -1089,6 +1089,7 @@ const char *bpf_attach_type_input_str(enum bpf_attach_type t)
>> case BPF_TRACE_FENTRY: return "fentry";
>> case BPF_TRACE_FEXIT: return "fexit";
>> case BPF_MODIFY_RETURN: return "mod_ret";
>> + case BPF_OOM_POLICY: return "oom_policy";
>
> This case is not necessary. This block is here to keep legacy attach
> type strings supported by bpftool. In your case, the name is the same as
> the one provided by libbpf, so...
>
>> case BPF_SK_REUSEPORT_SELECT: return "sk_skb_reuseport_select";
>> case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: return "sk_skb_reuseport_select_or_migrate";
>> default: return libbpf_bpf_attach_type_str(t);
>
> ... we just want to pick it up from libbpf directly, here.
>
I see..
Thanks.
--
Chuyi Zhou
> [...]
>
>> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
>> index 214f828ece6b..10496bb9b3bc 100644
>> --- a/tools/lib/bpf/libbpf.c
>> +++ b/tools/lib/bpf/libbpf.c
>> @@ -118,6 +118,7 @@ static const char * const attach_type_name[] = {
>> [BPF_TRACE_KPROBE_MULTI] = "trace_kprobe_multi",
>> [BPF_STRUCT_OPS] = "struct_ops",
>> [BPF_NETFILTER] = "netfilter",
>> + [BPF_OOM_POLICY] = "oom_policy",
>> };
>
On Thu, Jul 27, 2023 at 10:15:16AM +0200, Michal Hocko wrote:
> On Thu 27-07-23 15:36:27, Chuyi Zhou wrote:
> > This patchset tries to add a new bpf prog type and use it to select
> > a victim memcg when global OOM is invoked. The main motivation is
> > the need for customizable OOM victim selection functionality so that
> > we can protect more important apps from the OOM killer.
>
> This is rather modest to give an idea how the whole thing is supposed to
> work. I have looked through patches very quickly but there is no overall
> design described anywhere either.
>
> Please could you give us a high level design description and reasoning
> why certain decisions have been made? e.g. why is this limited to the
> global oom situation, why is the BPF program forced to operate on memcgs
> as entities etc...
> Also it would be very helpful to call out limitations of the BPF
> program, if there are any.
One thing I realized recently: we don't have to make a victim selection
during the OOM, we [almost always] can do it in advance.
Kernel OOMs must guarantee the forward progress under heavy memory pressure
and it creates a lot of limitations on what can and what can't be done in
these circumstances.
But in practice most policies except maybe those which aim to catch very fast
memory spikes rely on things which are fairly static: a logical importance of
several workloads in comparison to some other workloads, "age", memory footprint
etc.
So I wonder if the right path is to create a kernel interface which allows
to define an OOM victim (maybe several victims, also depending on if it's
a global or a memcg oom) and update it periodically from userspace.
In fact, the second part is already implemented by tools like oomd, systemd-oomd etc.
Someone might say that the first part is also implemented by the oom_score
interface, but I don't think it's an example of a convenient interface.
It's also not a memcg-level interface.
Just some thoughts.
Thanks!
On Thu 27-07-23 21:30:01, Roman Gushchin wrote:
> On Thu, Jul 27, 2023 at 10:15:16AM +0200, Michal Hocko wrote:
> > On Thu 27-07-23 15:36:27, Chuyi Zhou wrote:
> > > This patchset tries to add a new bpf prog type and use it to select
> > > a victim memcg when global OOM is invoked. The main motivation is
> > > the need for customizable OOM victim selection functionality so that
> > > we can protect more important apps from the OOM killer.
> >
> > This is rather modest to give an idea how the whole thing is supposed to
> > work. I have looked through patches very quickly but there is no overall
> > design described anywhere either.
> >
> > Please could you give us a high level design description and reasoning
> > why certain decisions have been made? e.g. why is this limited to the
> > > global oom situation, why is the BPF program forced to operate on memcgs
> > as entities etc...
> > Also it would be very helpful to call out limitations of the BPF
> > program, if there are any.
>
> One thing I realized recently: we don't have to make a victim selection
> during the OOM, we [almost always] can do it in advance.
>
> Kernel OOM's must guarantee the forward progress under heavy memory pressure
> and it creates a lot of limitations on what can and what can't be done in
> these circumstances.
>
> But in practice most policies except maybe those which aim to catch very fast
> memory spikes rely on things which are fairly static: a logical importance of
> several workloads in comparison to some other workloads, "age", memory footprint
> etc.
>
> So I wonder if the right path is to create a kernel interface which allows
> to define an OOM victim (maybe several victims, also depending on if it's
> a global or a memcg oom) and update it periodically from userspace.
We already have that interface. Just echo OOM_SCORE_ADJ_MAX to any tasks
that are to be killed with a priority...
Not a great interface but still something available.
> In fact, the second part is already implemented by tools like oomd, systemd-oomd etc.
> Someone might say that the first part is also implemented by the oom_score
> interface, but I don't think it's an example of a convenient interface.
> It's also not a memcg-level interface.
What do you mean by not memcg-level interface? What kind of interface
would you propose instead?
> Just some thoughts.
>
> Thanks!
--
Michal Hocko
SUSE Labs
On Fri, Jul 28, 2023 at 10:06:38AM +0200, Michal Hocko wrote:
> On Thu 27-07-23 21:30:01, Roman Gushchin wrote:
> > On Thu, Jul 27, 2023 at 10:15:16AM +0200, Michal Hocko wrote:
> > > On Thu 27-07-23 15:36:27, Chuyi Zhou wrote:
> > > > This patchset tries to add a new bpf prog type and use it to select
> > > > a victim memcg when global OOM is invoked. The main motivation is
> > > > the need for customizable OOM victim selection functionality so that
> > > > we can protect more important apps from the OOM killer.
> > >
> > > This is rather modest to give an idea how the whole thing is supposed to
> > > work. I have looked through patches very quickly but there is no overall
> > > design described anywhere either.
> > >
> > > Please could you give us a high level design description and reasoning
> > > why certain decisions have been made? e.g. why is this limited to the
> > > global oom situation, why is the BPF program forced to operate on memcgs
> > > as entities etc...
> > > Also it would be very helpful to call out limitations of the BPF
> > > program, if there are any.
> >
> > One thing I realized recently: we don't have to make a victim selection
> > during the OOM, we [almost always] can do it in advance.
> >
> > Kernel OOM's must guarantee the forward progress under heavy memory pressure
> > and it creates a lot of limitations on what can and what can't be done in
> > these circumstances.
> >
> > But in practice most policies except maybe those which aim to catch very fast
> > memory spikes rely on things which are fairly static: a logical importance of
> > several workloads in comparison to some other workloads, "age", memory footprint
> > etc.
> >
> > So I wonder if the right path is to create a kernel interface which allows
> > to define an OOM victim (maybe several victims, also depending on if it's
> > a global or a memcg oom) and update it periodically from userspace.
>
> We already have that interface. Just echo OOM_SCORE_ADJ_MAX to any tasks
> that are to be killed with a priority...
> Not a great interface but still something available.
>
> > In fact, the second part is already implemented by tools like oomd, systemd-oomd etc.
> > Someone might say that the first part is also implemented by the oom_score
> > interface, but I don't think it's an example of a convenient interface.
> > It's also not a memcg-level interface.
>
> What do you mean by not memcg-level interface? What kind of interface
> would you propose instead?
Something like memory.oom.priority, which is 0 by default, but if set to 1,
the memory cgroup is considered a good oom victim. Idk if we need priorities
or just fine with a binary thing.
Under oom conditions the kernel can look if we have a pre-selected oom target
and if not fallback to the current behavior.
Thanks!
Hello, Michal
在 2023/7/28 01:23, Michal Hocko 写道:
> On Thu 27-07-23 20:12:01, Chuyi Zhou wrote:
>>
>>
>> 在 2023/7/27 16:15, Michal Hocko 写道:
>>> On Thu 27-07-23 15:36:27, Chuyi Zhou wrote:
>>>> This patchset tries to add a new bpf prog type and use it to select
>>>> a victim memcg when global OOM is invoked. The main motivation is
>>>> the need for customizable OOM victim selection functionality so that
>>>> we can protect more important apps from the OOM killer.
>>>
>>> This is rather modest to give an idea how the whole thing is supposed to
>>> work. I have looked through patches very quickly but there is no overall
>>> design described anywhere either.
>>>
>>> Please could you give us a high level design description and reasoning
>>> why certain decisions have been made? e.g. why is this limited to the
>>> global oom situation, why is the BPF program forced to operate on memcgs
>>> as entities etc...
>>> Also it would be very helpful to call out limitations of the BPF
>>> program, if there are any.
>>>
>>> Thanks!
>>
>> Hi,
>>
>> Thanks for your advice.
>>
>> The global/memcg OOM victim selection uses process as the base search
>> granularity. However, we can see a need for cgroup level protection and
>> there's been some discussion[1]. It seems reasonable to consider using memcg
>> as a search granularity in victim selection algorithm.
>
> Yes, it can be reasonable for some policies but making it central to the
> design is very limiting.
>
>> Besides, it seems pretty well fit for offloading policy decisions to a BPF
>> program, since BPF is scalable and flexible. That's why the new BPF
>> program operate on memcgs as entities.
>
> I do not follow your line of argumentation here. The same could be
> argued for processes or beans.
>
>> The idea is to let user choose which leaf in the memcg tree should be
>> selected as the victim. At the first layer, if we choose A, then it protects
>> the memcg under the B, C, and D subtrees.
>>
>> root
>> / | \ \
>> A B C D
>> /\
>> E F
>>
>>
>> Using the BPF prog, we are allowed to compare the OOM priority between
>> two siblings so that we can choose the best victim in each layer.
>
> How is the priority defined and communicated to the userspace.
>
>> For example:
>>
>> run_prog(B, C) -> choose B
>> run_prog(B, D) -> choose D
>> run_prog(A, D) -> choose A
>>
>> Once we select A as the victim in the first layer, the victim in next layer
>> would be selected among A's children. Finally, we select a leaf memcg as
>> victim.
>
> This sounds like a very specific oom policy and that is fine. But the
> interface shouldn't be bound to any concepts like priorities let alone
> be bound to memcg based selection. Ideally the BPF program should get
> the oom_control as an input and either get a hook to kill process or if
> that is not possible then return an entity to kill (either process or
> set of processes).
Here are two interfaces I can think of. I was wondering if you could
give me some feedback.
1. Add a new hook in select_bad_process(), we can attach it and return a
set of pids or cgroup_ids which are pre-selected by user-defined policy,
suggested by Roman. Then we could use oom_evaluate_task to find a
final victim among them. It's user-friendly and we can offload the OOM
policy to userspace.
2. Add a new hook in oom_evaluate_task() and return a point to override
the default oom_badness return-value. The simplest way to use this is to
protect certain processes by setting the minimum score.
Of course if you have a better idea, please let me know.
Thanks!
---
Chuyi Zhou
>
>> In our scenarios, the impact caused by global OOM's is much more common, so
>> we only considered global in this patchset. But it seems that the idea can
>> also be applied to memcg OOM.
>
> The global and memcg OOMs shouldn't have a different interface. If a
> specific BPF program wants to implement a different policy for global
> vs. memcg OOM then be it but this should be a decision of the said
> program not an inherent limitation of the interface.
>
>>
>> [1]https://lore.kernel.org/lkml/ZIgodGWoC%[email protected]/
>>
>> Thanks!
>> --
>> Chuyi Zhou
>
On Mon 31-07-23 14:00:22, Chuyi Zhou wrote:
> Hello, Michal
>
> 在 2023/7/28 01:23, Michal Hocko 写道:
[...]
> > This sounds like a very specific oom policy and that is fine. But the
> > interface shouldn't be bound to any concepts like priorities let alone
> > be bound to memcg based selection. Ideally the BPF program should get
> > the oom_control as an input and either get a hook to kill process or if
> > that is not possible then return an entity to kill (either process or
> > set of processes).
>
> Here are two interfaces I can think of. I was wondering if you could give me
> some feedback.
>
> 1. Add a new hook in select_bad_process(), we can attach it and return a set
> of pids or cgroup_ids which are pre-selected by user-defined policy,
> suggested by Roman. Then we could use oom_evaluate_task to find a final
> victim among them. It's user-friendly and we can offload the OOM policy to
> userspace.
>
> 2. Add a new hook in oom_evaluate_task() and return a point to override the
> default oom_badness return-value. The simplest way to use this is to protect
> certain processes by setting the minimum score.
>
> Of course if you have a better idea, please let me know.
Hooking into oom_evaluate_task seems the least disruptive to the
existing oom killer implementation. I would start by playing with that
and see whether useful oom policies could be defined this way. I am not
sure what is the best way to communicate user input so that a BPF program
can consume it though. The interface should be generic enough that it
doesn't really pre-define any specific class of policies. Maybe we can
add something completely opaque to each memcg/task? Does BPF
infrastructure allow anything like that already?
--
Michal Hocko
SUSE Labs
On Fri 28-07-23 11:42:27, Roman Gushchin wrote:
> On Fri, Jul 28, 2023 at 10:06:38AM +0200, Michal Hocko wrote:
> > On Thu 27-07-23 21:30:01, Roman Gushchin wrote:
> > > On Thu, Jul 27, 2023 at 10:15:16AM +0200, Michal Hocko wrote:
> > > > On Thu 27-07-23 15:36:27, Chuyi Zhou wrote:
> > > > > This patchset tries to add a new bpf prog type and use it to select
> > > > > a victim memcg when global OOM is invoked. The main motivation is
> > > > > the need for customizable OOM victim selection functionality so that
> > > > > we can protect more important apps from the OOM killer.
> > > >
> > > > This is rather modest to give an idea how the whole thing is supposed to
> > > > work. I have looked through patches very quickly but there is no overall
> > > > design described anywhere either.
> > > >
> > > > Please could you give us a high level design description and reasoning
> > > > why certain decisions have been made? e.g. why is this limited to the
> > > > global oom situation, why is the BPF program forced to operate on memcgs
> > > > as entities etc...
> > > > Also it would be very helpful to call out limitations of the BPF
> > > > program, if there are any.
> > >
> > > One thing I realized recently: we don't have to make a victim selection
> > > during the OOM, we [almost always] can do it in advance.
> > >
> > > Kernel OOM's must guarantee the forward progress under heavy memory pressure
> > > and it creates a lot of limitations on what can and what can't be done in
> > > these circumstances.
> > >
> > > But in practice most policies except maybe those which aim to catch very fast
> > > memory spikes rely on things which are fairly static: a logical importance of
> > > several workloads in comparison to some other workloads, "age", memory footprint
> > > etc.
> > >
> > > So I wonder if the right path is to create a kernel interface which allows
> > > to define an OOM victim (maybe several victims, also depending on if it's
> > > a global or a memcg oom) and update it periodically from userspace.
> >
> > We already have that interface. Just echo OOM_SCORE_ADJ_MAX to any tasks
> > that are to be killed with a priority...
> > Not a great interface but still something available.
> >
> > > In fact, the second part is already implemented by tools like oomd, systemd-oomd etc.
> > > Someone might say that the first part is also implemented by the oom_score
> > > interface, but I don't think it's an example of a convenient interface.
> > > It's also not a memcg-level interface.
> >
> > What do you mean by not memcg-level interface? What kind of interface
> > would you propose instead?
>
> Something like memory.oom.priority, which is 0 by default, but if set to 1,
> the memory cgroup is considered a good oom victim. Idk if we need priorities
> or just fine with a binary thing.
Priorities as a general API have been discussed at several occasions
(e.g http://lkml.kernel.org/r/[email protected]). Their usage
is rather limited, hierarchical semantics are not trivial etc.
--
Michal Hocko
SUSE Labs
在 2023/7/31 21:23, Michal Hocko 写道:
> On Mon 31-07-23 14:00:22, Chuyi Zhou wrote:
>> Hello, Michal
>>
>> 在 2023/7/28 01:23, Michal Hocko 写道:
> [...]
>>> This sounds like a very specific oom policy and that is fine. But the
>>> interface shouldn't be bound to any concepts like priorities let alone
>>> be bound to memcg based selection. Ideally the BPF program should get
>>> the oom_control as an input and either get a hook to kill process or if
>>> that is not possible then return an entity to kill (either process or
>>> set of processes).
>>
>> Here are two interfaces I can think of. I was wondering if you could give me
>> some feedback.
>>
>> 1. Add a new hook in select_bad_process(), we can attach it and return a set
>> of pids or cgroup_ids which are pre-selected by user-defined policy,
>> suggested by Roman. Then we could use oom_evaluate_task to find a final
>> victim among them. It's user-friendly and we can offload the OOM policy to
>> userspace.
>>
>> 2. Add a new hook in oom_evaluate_task() and return a point to override the
>> default oom_badness return-value. The simplest way to use this is to protect
>> certain processes by setting the minimum score.
>>
>> Of course if you have a better idea, please let me know.
>
> Hooking into oom_evaluate_task seems the least disruptive to the
> existing oom killer implementation. I would start by playing with that
> and see whether useful oom policies could be defined this way. I am not
> sure what is the best way to communicate user input so that a BPF program
> can consume it though. The interface should be generic enough that it
> doesn't really pre-define any specific class of policies. Maybe we can
> add something completely opaque to each memcg/task? Does BPF
> infrastructure allow anything like that already?
>
“Maybe we can add something completely opaque to each memcg/task?”
Sorry, I don't understand what you mean.
I think we probably don't need to expose too much to the user, the
following might be sufficient:
noinline int bpf_get_score(struct oom_control *oc,
struct task_struct *task);
static int oom_evaluate_task()
{
...
points = bpf_get_score(oc, task);
if (!check_points_valid(points))
points = oom_badness(task, oc->totalpages);
...
}
There are several reasons:
1. The implementation of user-defined OOM policy, such as iteration,
sorting and other complex operations, is more suitable to be placed in
the userspace rather than in the bpf program. It is more convenient to
implement these operations in userspace in which the useful information
(memory usage of each task and memcg, memory allocation speed, etc.) can
also be captured. For example, oomd implements multiple policies[1]
without kernel-space input.
2. Userspace apps, such as oomd, can import useful information into bpf
program, e.g., through bpf_map, and update it periodically. For example,
we can do the scoring directly in userspace and maintain a score hash,
so that in the bpf program, we only need to look for the corresponding
score of the process.
Userspace policy(oomd)
bpf_map_update
score_hash
------------------> BPF program
look up score in
score_hash
---------------> kernel space
Just some thoughts.
Thanks!
[1]https://github.com/facebookincubator/oomd/tree/main/src/oomd/plugins)
On 7/28/23 12:30 PM, Roman Gushchin wrote:
> On Thu, Jul 27, 2023 at 10:15:16AM +0200, Michal Hocko wrote:
>> On Thu 27-07-23 15:36:27, Chuyi Zhou wrote:
>>> This patchset tries to add a new bpf prog type and use it to select
>>> a victim memcg when global OOM is invoked. The main motivation is
>>> the need for customizable OOM victim selection functionality so that
>>> we can protect more important apps from the OOM killer.
>>
>> This is rather modest to give an idea how the whole thing is supposed to
>> work. I have looked through patches very quickly but there is no overall
>> design described anywhere either.
>>
>> Please could you give us a high level design description and reasoning
>> why certain decisions have been made? e.g. why is this limited to the
>> global oom situation, why is the BPF program forced to operate on memcgs
>> as entities etc...
>> Also it would be very helpful to call out limitations of the BPF
>> program, if there are any.
>
> One thing I realized recently: we don't have to make a victim selection
> during the OOM, we [almost always] can do it in advance.
I agree. We take precautions against memory shortage on over-committed
machines through oomd-like userspace tools, to mitigate possible SLO
violations on important services. The kernel OOM-killer in our scenario
works as a last resort, since userspace tools are not that reliable.
IMHO it would be useful for the kernel to provide such flexibility.
>
> Kernel OOM's must guarantee the forward progress under heavy memory pressure
> and it creates a lot of limitations on what can and what can't be done in
> these circumstances.
>
> But in practice most policies except maybe those which aim to catch very fast
> memory spikes rely on things which are fairly static: a logical importance of
> several workloads in comparison to some other workloads, "age", memory footprint
> etc.
>
> So I wonder if the right path is to create a kernel interface which allows
> to define a OOM victim (maybe several victims, also depending on if it's
> a global or a memcg oom) and update it periodically from an userspace.
Something like [1] proposed by Chuyi? IIUC there is still lack of some
triggers to invoke the procedure so we can actually do this in advance.
[1]
https://lore.kernel.org/lkml/[email protected]/
Thanks & Best,
Abel
On Tue 01-08-23 00:26:20, Chuyi Zhou wrote:
>
>
> 在 2023/7/31 21:23, Michal Hocko 写道:
> > On Mon 31-07-23 14:00:22, Chuyi Zhou wrote:
> > > Hello, Michal
> > >
> > > 在 2023/7/28 01:23, Michal Hocko 写道:
> > [...]
> > > > This sounds like a very specific oom policy and that is fine. But the
> > > > interface shouldn't be bound to any concepts like priorities let alone
> > > > be bound to memcg based selection. Ideally the BPF program should get
> > > > the oom_control as an input and either get a hook to kill process or if
> > > > that is not possible then return an entity to kill (either process or
> > > > set of processes).
> > >
> > > Here are two interfaces I can think of. I was wondering if you could give me
> > > some feedback.
> > >
> > > 1. Add a new hook in select_bad_process(), we can attach it and return a set
> > > of pids or cgroup_ids which are pre-selected by user-defined policy,
> > > suggested by Roman. Then we could use oom_evaluate_task to find a final
> > > victim among them. It's user-friendly and we can offload the OOM policy to
> > > userspace.
> > >
> > > 2. Add a new hook in oom_evaluate_task() and return a point to override the
> > > default oom_badness return-value. The simplest way to use this is to protect
> > > certain processes by setting the minimum score.
> > >
> > > Of course if you have a better idea, please let me know.
> >
> > Hooking into oom_evaluate_task seems the least disruptive to the
> > existing oom killer implementation. I would start by playing with that
> > and see whether useful oom policies could be defined this way. I am not
> > sure what is the best way to communicate user input so that a BPF program
> > can consume it though. The interface should be generic enough that it
> > doesn't really pre-define any specific class of policies. Maybe we can
> > add something completely opaque to each memcg/task? Does BPF
> > infrastructure allow anything like that already?
> >
>
> “Maybe we can add something completely opaque to each memcg/task?”
> Sorry, I don't understand what you mean.
What I meant to say is to add a very non-specific interface that only a
specific BPF program would understand. Mostly an opaque value from the
memcg POV.
> I think we probably don't need to expose too much to the user, the following
> might be sufficient:
>
> noinline int bpf_get_score(struct oom_control *oc,
> struct task_struct *task);
>
> static int oom_evaluate_task()
> {
> ...
> points = bpf_get_score(oc, task);
> if (!check_points_valid(points))
> points = oom_badness(task, oc->totalpages);
> ...
> }
>
> There are several reasons:
>
> 1. The implementation of user-defined OOM policy, such as iteration, sorting
> and other complex operations, is more suitable to be placed in the userspace
> rather than in the bpf program. It is more convenient to implement these
> operations in userspace in which the useful information (memory usage of
> each task and memcg, memory allocation speed, etc.) can also be captured.
> For example, oomd implements multiple policies[1] without kernel-space
> input.
I do agree that userspace can handle a lot on its own and provide the
input to the BPF program to make a decision.
> 2. Userspace apps, such as oomd, can import useful information into bpf
> program, e.g., through bpf_map, and update it periodically. For example, we
> can do the scoring directly in userspace and maintain a score hash, so that
> in the bpf program, we only need to look for the corresponding score of the
> process.
Sure, why not. But all that is an implementation detail. We are
currently talking about a proper abstraction and layering that would
allow what you do currently but also much more.
> Userspace policy(oomd)
> bpf_map_update
> score_hash
> ------------------> BPF program
> look up score in
> score_hash
> ---------------> kernel space
> Just some thoughts.
I believe all the above should be possible if BPF program is hooked at
the oom_evaluate_task layer and allow to bypass the default logic. BPF
program can process whatever data it has available. The oom scope iteration
will be implemented already in the kernel so all the BPF program has to
do is to rank processes and/or memcgs if oom.group is enabled. Would
that work for your usecase?
> Thanks!
>
> [1]https://github.com/facebookincubator/oomd/tree/main/src/oomd/plugins)
--
Michal Hocko
SUSE Labs
Hello,
在 2023/8/1 16:18, Michal Hocko 写道:
> On Tue 01-08-23 00:26:20, Chuyi Zhou wrote:
>>
>>
>> 在 2023/7/31 21:23, Michal Hocko 写道:
>>> On Mon 31-07-23 14:00:22, Chuyi Zhou wrote:
>>>> Hello, Michal
>>>>
>>>> 在 2023/7/28 01:23, Michal Hocko 写道:
>>> [...]
>>>>> This sounds like a very specific oom policy and that is fine. But the
>>>>> interface shouldn't be bound to any concepts like priorities let alone
>>>>> be bound to memcg based selection. Ideally the BPF program should get
>>>>> the oom_control as an input and either get a hook to kill process or if
>>>>> that is not possible then return an entity to kill (either process or
>>>>> set of processes).
>>>>
>>>> Here are two interfaces I can think of. I was wondering if you could give me
>>>> some feedback.
>>>>
>>>> 1. Add a new hook in select_bad_process(), we can attach it and return a set
>>>> of pids or cgroup_ids which are pre-selected by user-defined policy,
>>>> suggested by Roman. Then we could use oom_evaluate_task to find a final
>>>> victim among them. It's user-friendly and we can offload the OOM policy to
>>>> userspace.
>>>>
>>>> 2. Add a new hook in oom_evaluate_task() and return a point to override the
>>>> default oom_badness return-value. The simplest way to use this is to protect
>>>> certain processes by setting the minimum score.
>>>>
>>>> Of course if you have a better idea, please let me know.
>>>
>>> Hooking into oom_evaluate_task seems the least disruptive to the
>>> existing oom killer implementation. I would start by playing with that
>>> and see whether useful oom policies could be defined this way. I am not
>>> sure what is the best way to communicate user input so that a BPF program
>>> can consume it though. The interface should be generic enough that it
>>> doesn't really pre-define any specific class of policies. Maybe we can
>>> add something completely opaque to each memcg/task? Does BPF
>>> infrastructure allow anything like that already?
>>>
>>
>> “Maybe we can add something completely opaque to each memcg/task?”
>> Sorry, I don't understand what you mean.
>
> What I meant to say is to add a very non-specific interface that only a
> specific BPF program would understand. Mostly an opaque value from the
> memcg POV.
>
>> I think we probably don't need to expose too much to the user, the following
>> might be sufficient:
>>
>> noinline int bpf_get_score(struct oom_control *oc,
>> struct task_struct *task);
>>
>> static int oom_evaluate_task()
>> {
>> ...
>> points = bpf_get_score(oc, task);
>> if (!check_points_valid(points))
>> points = oom_badness(task, oc->totalpages);
>> ...
>> }
>>
>> There are several reasons:
>>
>> 1. The implementation of user-defined OOM policy, such as iteration, sorting
>> and other complex operations, is more suitable to be placed in the userspace
>> rather than in the bpf program. It is more convenient to implement these
>> operations in userspace in which the useful information (memory usage of
>> each task and memcg, memory allocation speed, etc.) can also be captured.
>> For example, oomd implements multiple policies[1] without kernel-space
>> input.
>
> I do agree that userspace can handle a lot on its own and provide the
> input to the BPF program to make a decision.
>
>> 2. Userspace apps, such as oomd, can import useful information into bpf
>> program, e.g., through bpf_map, and update it periodically. For example, we
>> can do the scoring directly in userspace and maintain a score hash, so that
>> in the bpf program, we only need to look for the corresponding score of the
>> process.
>
> Sure, why not. But all that is an implementation detail. We are
> currently talking about a proper abstraction and layering that would
> allow what you do currently but also much more.
>
>> Userspace policy(oomd)
>> bpf_map_update
>> score_hash
>> ------------------> BPF program
>> look up score in
>> score_hash
>> ---------------> kernel space
>> Just some thoughts.
>
> I believe all the above should be possible if BPF program is hooked at
> the oom_evaluate_task layer and allow to bypass the default logic. BPF
> program can process whatever data it has available. The oom scope iteration
> will be implemented already in the kernel so all the BPF program has to
> do is to rank processes and/or memcgs if oom.group is enabled. Would
> that work for your usecase?
Yes, I think the above interface can works well for our usecase.
In our scenario, we want to protect applications with higher priority
and try to select a lower-priority one as the victim.
Specifically, We can set priority for memcgs in userspace. In BPF
program, we can find the memcg to which the given process belongs, and
then rank according to the memcg's priority.
Thanks.
>
>> Thanks!
>>
>> [1]https://github.com/facebookincubator/oomd/tree/main/src/oomd/plugins)
>