LinuxLists.cc - [PATCH bpf-next v2 4/8] bpf: Introduce cgroup iter

2022-06-10 19:56:52

Subject: [PATCH bpf-next v2 4/8] bpf: Introduce cgroup iter

From: Hao Luo <[email protected]>

Cgroup_iter is a type of bpf_iter. It walks over cgroups in two modes:

- walking a cgroup's descendants.
- walking a cgroup's ancestors.

When attaching cgroup_iter, one can set a cgroup to the iter_link
created from attaching. This cgroup is passed as a file descriptor and
serves as the starting point of the walk. If no cgroup is specified,
the starting point will be the root cgroup.

For walking descendants, one can specify the order: either pre-order or
post-order. For walking ancestors, the walk starts at the specified
cgroup and ends at the root.

One can also terminate the walk early by returning 1 from the iter
program.

Note that because walking cgroup hierarchy holds cgroup_mutex, the iter
program is called with cgroup_mutex held.

Signed-off-by: Hao Luo <[email protected]>
Signed-off-by: Yosry Ahmed <[email protected]>
---
include/linux/bpf.h | 8 ++
include/uapi/linux/bpf.h | 21 +++
kernel/bpf/Makefile | 2 +-
kernel/bpf/cgroup_iter.c | 235 +++++++++++++++++++++++++++++++++
tools/include/uapi/linux/bpf.h | 21 +++
5 files changed, 286 insertions(+), 1 deletion(-)
create mode 100644 kernel/bpf/cgroup_iter.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8e6092d0ea956..48d8e836b9748 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -44,6 +44,7 @@ struct kobject;
struct mem_cgroup;
struct module;
struct bpf_func_state;
+struct cgroup;

extern struct idr btf_idr;
extern spinlock_t btf_idr_lock;
@@ -1590,7 +1591,14 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
int __init bpf_iter_ ## target(args) { return 0; }

struct bpf_iter_aux_info {
+ /* for map_elem iter */
struct bpf_map *map;
+
+ /* for cgroup iter */
+ struct {
+ struct cgroup *start; /* starting cgroup */
+ int order;
+ } cgroup;
};

typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f4009dbdf62da..4fd05cde19116 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -87,10 +87,27 @@ struct bpf_cgroup_storage_key {
__u32 attach_type; /* program attach type (enum bpf_attach_type) */
};

+enum bpf_iter_cgroup_traversal_order {
+ BPF_ITER_CGROUP_PRE = 0, /* pre-order traversal */
+ BPF_ITER_CGROUP_POST, /* post-order traversal */
+ BPF_ITER_CGROUP_PARENT_UP, /* traversal of ancestors up to the root */
+};
+
union bpf_iter_link_info {
struct {
__u32 map_fd;
} map;
+
+ /* cgroup_iter walks either the live descendants of a cgroup subtree, or the ancestors
+ * of a given cgroup.
+ */
+ struct {
+ /* Cgroup file descriptor. This is root of the subtree if for walking the
+ * descendants; this is the starting cgroup if for walking the ancestors.
+ */
+ __u32 cgroup_fd;
+ __u32 traversal_order;
+ } cgroup;
};

/* BPF syscall commands, see bpf(2) man-page for more details. */
@@ -6050,6 +6067,10 @@ struct bpf_link_info {
struct {
__u32 map_id;
} map;
+ struct {
+ __u32 traversal_order;
+ __aligned_u64 cgroup_id;
+ } cgroup;
};
} iter;
struct {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 057ba8e01e70f..9741b9314fb46 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -8,7 +8,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)

obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
-obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
+obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o cgroup_iter.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
new file mode 100644
index 0000000000000..88deb655efa71
--- /dev/null
+++ b/kernel/bpf/cgroup_iter.c
@@ -0,0 +1,235 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Google */
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/cgroup.h>
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+
+#include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */
+
+/* cgroup_iter provides two modes of traversal to the cgroup hierarchy.
+ *
+ * 1. Walk the descendants of a cgroup.
+ * 2. Walk the ancestors of a cgroup.
+ *
+ * For walking descendants, cgroup_iter can walk in either pre-order or
+ * post-order. For walking ancestors, the iter walks up from a cgroup to
+ * the root.
+ *
+ * The iter program can terminate the walk early by returning 1. Walk
+ * continues if prog returns 0.
+ *
+ * The prog can check (seq->num == 0) to determine whether this is
+ * the first element. The prog may also be passed a NULL cgroup,
+ * which means the walk has completed and the prog has a chance to
+ * do post-processing, such as outputing an epilogue.
+ *
+ * Note: the iter_prog is called with cgroup_mutex held.
+ */
+
+struct bpf_iter__cgroup {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct cgroup *, cgroup);
+};
+
+struct cgroup_iter_priv {
+ struct cgroup_subsys_state *start_css;
+ bool terminate;
+ int order;
+};
+
+static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct cgroup_iter_priv *p = seq->private;
+
+ mutex_lock(&cgroup_mutex);
+
+ /* support only one session */
+ if (*pos > 0)
+ return NULL;
+
+ ++*pos;
+ p->terminate = false;
+ if (p->order == BPF_ITER_CGROUP_PRE)
+ return css_next_descendant_pre(NULL, p->start_css);
+ else if (p->order == BPF_ITER_CGROUP_POST)
+ return css_next_descendant_post(NULL, p->start_css);
+ else /* BPF_ITER_CGROUP_PARENT_UP */
+ return p->start_css;
+}
+
+static int __cgroup_iter_seq_show(struct seq_file *seq,
+ struct cgroup_subsys_state *css, int in_stop);
+
+static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
+{
+ /* pass NULL to the prog for post-processing */
+ if (!v)
+ __cgroup_iter_seq_show(seq, NULL, true);
+ mutex_unlock(&cgroup_mutex);
+}
+
+static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
+ struct cgroup_iter_priv *p = seq->private;
+
+ ++*pos;
+ if (p->terminate)
+ return NULL;
+
+ if (p->order == BPF_ITER_CGROUP_PRE)
+ return css_next_descendant_pre(curr, p->start_css);
+ else if (p->order == BPF_ITER_CGROUP_POST)
+ return css_next_descendant_post(curr, p->start_css);
+ else
+ return curr->parent;
+}
+
+static int __cgroup_iter_seq_show(struct seq_file *seq,
+ struct cgroup_subsys_state *css, int in_stop)
+{
+ struct cgroup_iter_priv *p = seq->private;
+ struct bpf_iter__cgroup ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ /* cgroup is dead, skip this element */
+ if (css && cgroup_is_dead(css->cgroup))
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.cgroup = css ? css->cgroup : NULL;
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (prog)
+ ret = bpf_iter_run_prog(prog, &ctx);
+
+ /* if prog returns > 0, terminate after this element. */
+ if (ret != 0)
+ p->terminate = true;
+
+ return 0;
+}
+
+static int cgroup_iter_seq_show(struct seq_file *seq, void *v)
+{
+ return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v,
+ false);
+}
+
+static const struct seq_operations cgroup_iter_seq_ops = {
+ .start = cgroup_iter_seq_start,
+ .next = cgroup_iter_seq_next,
+ .stop = cgroup_iter_seq_stop,
+ .show = cgroup_iter_seq_show,
+};
+
+BTF_ID_LIST_SINGLE(bpf_cgroup_btf_id, struct, cgroup)
+
+static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux)
+{
+ struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
+ struct cgroup *cgrp = aux->cgroup.start;
+
+ p->start_css = &cgrp->self;
+ p->terminate = false;
+ p->order = aux->cgroup.order;
+ return 0;
+}
+
+static const struct bpf_iter_seq_info cgroup_iter_seq_info = {
+ .seq_ops = &cgroup_iter_seq_ops,
+ .init_seq_private = cgroup_iter_seq_init,
+ .seq_priv_size = sizeof(struct cgroup_iter_priv),
+};
+
+static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
+{
+ int fd = linfo->cgroup.cgroup_fd;
+ struct cgroup *cgrp;
+
+ if (fd)
+ cgrp = cgroup_get_from_fd(fd);
+ else /* walk the entire hierarchy by default. */
+ cgrp = cgroup_get_from_path("/");
+
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ aux->cgroup.start = cgrp;
+ aux->cgroup.order = linfo->cgroup.traversal_order;
+ return 0;
+}
+
+static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info *aux)
+{
+ cgroup_put(aux->cgroup.start);
+}
+
+static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux,
+ struct seq_file *seq)
+{
+ char *buf;
+
+ buf = kzalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf) {
+ seq_puts(seq, "cgroup_path:\n");
+ goto show_order;
+ }
+
+ /* If cgroup_path_ns() fails, buf will be an empty string, cgroup_path
+ * will print nothing.
+ *
+ * Path is in the calling process's cgroup namespace.
+ */
+ cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX,
+ current->nsproxy->cgroup_ns);
+ seq_printf(seq, "cgroup_path:\t%s\n", buf);
+ kfree(buf);
+
+show_order:
+ if (aux->cgroup.order == BPF_ITER_CGROUP_PRE)
+ seq_puts(seq, "traversal_order: pre\n");
+ else if (aux->cgroup.order == BPF_ITER_CGROUP_POST)
+ seq_puts(seq, "traversal_order: post\n");
+ else /* BPF_ITER_CGROUP_PARENT_UP */
+ seq_puts(seq, "traversal_order: parent_up\n");
+}
+
+static int bpf_iter_cgroup_fill_link_info(const struct bpf_iter_aux_info *aux,
+ struct bpf_link_info *info)
+{
+ info->iter.cgroup.traversal_order = aux->cgroup.order;
+ info->iter.cgroup.cgroup_id = cgroup_id(aux->cgroup.start);
+ return 0;
+}
+
+DEFINE_BPF_ITER_FUNC(cgroup, struct bpf_iter_meta *meta,
+ struct cgroup *cgroup)
+
+static struct bpf_iter_reg bpf_cgroup_reg_info = {
+ .target = "cgroup",
+ .attach_target = bpf_iter_attach_cgroup,
+ .detach_target = bpf_iter_detach_cgroup,
+ .show_fdinfo = bpf_iter_cgroup_show_fdinfo,
+ .fill_link_info = bpf_iter_cgroup_fill_link_info,
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__cgroup, cgroup),
+ PTR_TO_BTF_ID },
+ },
+ .seq_info = &cgroup_iter_seq_info,
+};
+
+static int __init bpf_cgroup_iter_init(void)
+{
+ bpf_cgroup_reg_info.ctx_arg_info[0].btf_id = bpf_cgroup_btf_id[0];
+ return bpf_iter_reg_target(&bpf_cgroup_reg_info);
+}
+
+late_initcall(bpf_cgroup_iter_init);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f4009dbdf62da..4fd05cde19116 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -87,10 +87,27 @@ struct bpf_cgroup_storage_key {
__u32 attach_type; /* program attach type (enum bpf_attach_type) */
};

+enum bpf_iter_cgroup_traversal_order {
+ BPF_ITER_CGROUP_PRE = 0, /* pre-order traversal */
+ BPF_ITER_CGROUP_POST, /* post-order traversal */
+ BPF_ITER_CGROUP_PARENT_UP, /* traversal of ancestors up to the root */
+};
+
union bpf_iter_link_info {
struct {
__u32 map_fd;
} map;
+
+ /* cgroup_iter walks either the live descendants of a cgroup subtree, or the ancestors
+ * of a given cgroup.
+ */
+ struct {
+ /* Cgroup file descriptor. This is root of the subtree if for walking the
+ * descendants; this is the starting cgroup if for walking the ancestors.
+ */
+ __u32 cgroup_fd;
+ __u32 traversal_order;
+ } cgroup;
};

/* BPF syscall commands, see bpf(2) man-page for more details. */
@@ -6050,6 +6067,10 @@ struct bpf_link_info {
struct {
__u32 map_id;
} map;
+ struct {
+ __u32 traversal_order;
+ __aligned_u64 cgroup_id;
+ } cgroup;
};
} iter;
struct {
--
2.36.1.476.g0c4daa206d-goog

2022-06-11 09:31:10

by kernel test robot

[permalink] [raw]

Subject: Re: [PATCH bpf-next v2 4/8] bpf: Introduce cgroup iter

2022-06-11 09:33:42

by kernel test robot

[permalink] [raw]

Subject: Re: [PATCH bpf-next v2 4/8] bpf: Introduce cgroup iter

Hi Yosry,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on bpf-next/master]

url: https://github.com/intel-lab-lkp/linux/commits/Yosry-Ahmed/bpf-rstat-cgroup-hierarchical-stats/20220611-034720
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: m68k-defconfig (https://download.01.org/0day-ci/archive/20220611/[email protected]/config)
compiler: m68k-linux-gcc (GCC) 11.3.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/619857fd1ec4f351376ffcaaec20acc9aae9486f
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Yosry-Ahmed/bpf-rstat-cgroup-hierarchical-stats/20220611-034720
git checkout 619857fd1ec4f351376ffcaaec20acc9aae9486f
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.3.0 make.cross W=1 O=build_dir ARCH=m68k SHELL=/bin/bash kernel/bpf/ kernel/cgroup/

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <[email protected]>

All warnings (new ones prefixed by >>):

In file included from kernel/bpf/cgroup_iter.c:9:
kernel/bpf/../cgroup/cgroup-internal.h:78:41: error: field 'iter' has incomplete type
78 | struct css_task_iter iter;
| ^~~~
kernel/bpf/../cgroup/cgroup-internal.h: In function 'cgroup_is_dead':
kernel/bpf/../cgroup/cgroup-internal.h:188:22: error: invalid use of undefined type 'const struct cgroup'
188 | return !(cgrp->self.flags & CSS_ONLINE);
| ^~
kernel/bpf/../cgroup/cgroup-internal.h:188:37: error: 'CSS_ONLINE' undeclared (first use in this function); did you mean 'N_ONLINE'?
188 | return !(cgrp->self.flags & CSS_ONLINE);
| ^~~~~~~~~~
| N_ONLINE
kernel/bpf/../cgroup/cgroup-internal.h:188:37: note: each undeclared identifier is reported only once for each function it appears in
kernel/bpf/../cgroup/cgroup-internal.h: In function 'notify_on_release':
kernel/bpf/../cgroup/cgroup-internal.h:193:25: error: 'CGRP_NOTIFY_ON_RELEASE' undeclared (first use in this function)
193 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
| ^~~~~~~~~~~~~~~~~~~~~~
kernel/bpf/../cgroup/cgroup-internal.h:193:54: error: invalid use of undefined type 'const struct cgroup'
193 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
| ^~
kernel/bpf/../cgroup/cgroup-internal.h: In function 'put_css_set':
kernel/bpf/../cgroup/cgroup-internal.h:207:39: error: invalid use of undefined type 'struct css_set'
207 | if (refcount_dec_not_one(&cset->refcount))
| ^~
kernel/bpf/../cgroup/cgroup-internal.h: In function 'get_css_set':
kernel/bpf/../cgroup/cgroup-internal.h:220:27: error: invalid use of undefined type 'struct css_set'
220 | refcount_inc(&cset->refcount);
| ^~
kernel/bpf/../cgroup/cgroup-internal.h: At top level:
kernel/bpf/../cgroup/cgroup-internal.h:284:22: error: array type has incomplete element type 'struct cftype'
284 | extern struct cftype cgroup1_base_files[];
| ^~~~~~~~~~~~~~~~~~
kernel/bpf/cgroup_iter.c: In function 'cgroup_iter_seq_start':
kernel/bpf/cgroup_iter.c:55:24: error: implicit declaration of function 'css_next_descendant_pre' [-Werror=implicit-function-declaration]
55 | return css_next_descendant_pre(NULL, p->start_css);
| ^~~~~~~~~~~~~~~~~~~~~~~
>> kernel/bpf/cgroup_iter.c:55:24: warning: returning 'int' from a function with return type 'void *' makes pointer from integer without a cast [-Wint-conversion]
55 | return css_next_descendant_pre(NULL, p->start_css);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
kernel/bpf/cgroup_iter.c:57:24: error: implicit declaration of function 'css_next_descendant_post' [-Werror=implicit-function-declaration]
57 | return css_next_descendant_post(NULL, p->start_css);
| ^~~~~~~~~~~~~~~~~~~~~~~~
kernel/bpf/cgroup_iter.c:57:24: warning: returning 'int' from a function with return type 'void *' makes pointer from integer without a cast [-Wint-conversion]
57 | return css_next_descendant_post(NULL, p->start_css);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
kernel/bpf/cgroup_iter.c: In function 'cgroup_iter_seq_next':
kernel/bpf/cgroup_iter.c:83:24: warning: returning 'int' from a function with return type 'void *' makes pointer from integer without a cast [-Wint-conversion]
83 | return css_next_descendant_pre(curr, p->start_css);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
kernel/bpf/cgroup_iter.c:85:24: warning: returning 'int' from a function with return type 'void *' makes pointer from integer without a cast [-Wint-conversion]
85 | return css_next_descendant_post(curr, p->start_css);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
kernel/bpf/cgroup_iter.c:87:28: error: invalid use of undefined type 'struct cgroup_subsys_state'
87 | return curr->parent;
| ^~
kernel/bpf/cgroup_iter.c: In function '__cgroup_iter_seq_show':
kernel/bpf/cgroup_iter.c:100:38: error: invalid use of undefined type 'struct cgroup_subsys_state'
100 | if (css && cgroup_is_dead(css->cgroup))
| ^~
kernel/bpf/cgroup_iter.c:104:31: error: invalid use of undefined type 'struct cgroup_subsys_state'
104 | ctx.cgroup = css ? css->cgroup : NULL;
| ^~
kernel/bpf/cgroup_iter.c: In function 'cgroup_iter_seq_init':
kernel/bpf/cgroup_iter.c:137:29: error: invalid use of undefined type 'struct cgroup'
137 | p->start_css = &cgrp->self;
| ^~
kernel/bpf/cgroup_iter.c: In function 'bpf_iter_attach_cgroup':
kernel/bpf/cgroup_iter.c:157:24: error: implicit declaration of function 'cgroup_get_from_fd'; did you mean 'cgroup_get_from_id'? [-Werror=implicit-function-declaration]
157 | cgrp = cgroup_get_from_fd(fd);
| ^~~~~~~~~~~~~~~~~~
| cgroup_get_from_id
>> kernel/bpf/cgroup_iter.c:157:22: warning: assignment to 'struct cgroup *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
157 | cgrp = cgroup_get_from_fd(fd);
| ^
kernel/bpf/cgroup_iter.c:159:24: error: implicit declaration of function 'cgroup_get_from_path'; did you mean 'cgroup_get_from_id'? [-Werror=implicit-function-declaration]
159 | cgrp = cgroup_get_from_path("/");
| ^~~~~~~~~~~~~~~~~~~~
| cgroup_get_from_id
kernel/bpf/cgroup_iter.c:159:22: warning: assignment to 'struct cgroup *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
159 | cgrp = cgroup_get_from_path("/");
| ^
kernel/bpf/cgroup_iter.c: In function 'bpf_iter_cgroup_show_fdinfo':
kernel/bpf/cgroup_iter.c:190:9: error: implicit declaration of function 'cgroup_path_ns'; did you mean 'cgroup_parent'? [-Werror=implicit-function-declaration]
190 | cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX,
| ^~~~~~~~~~~~~~
| cgroup_parent
kernel/bpf/cgroup_iter.c: In function 'cgroup_iter_seq_next':
kernel/bpf/cgroup_iter.c:88:1: error: control reaches end of non-void function [-Werror=return-type]
88 | }
| ^
cc1: some warnings being treated as errors

vim +55 kernel/bpf/cgroup_iter.c

41
42 static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
43 {
44 struct cgroup_iter_priv *p = seq->private;
45
46 mutex_lock(&cgroup_mutex);
47
48 /* support only one session */
49 if (*pos > 0)
50 return NULL;
51
52 ++*pos;
53 p->terminate = false;
54 if (p->order == BPF_ITER_CGROUP_PRE)
> 55 return css_next_descendant_pre(NULL, p->start_css);
56 else if (p->order == BPF_ITER_CGROUP_POST)
57 return css_next_descendant_post(NULL, p->start_css);
58 else /* BPF_ITER_CGROUP_PARENT_UP */
59 return p->start_css;
60 }
61
62 static int __cgroup_iter_seq_show(struct seq_file *seq,
63 struct cgroup_subsys_state *css, int in_stop);
64
65 static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
66 {
67 /* pass NULL to the prog for post-processing */
68 if (!v)
69 __cgroup_iter_seq_show(seq, NULL, true);
70 mutex_unlock(&cgroup_mutex);
71 }
72
73 static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
74 {
75 struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
76 struct cgroup_iter_priv *p = seq->private;
77
78 ++*pos;
79 if (p->terminate)
80 return NULL;
81
82 if (p->order == BPF_ITER_CGROUP_PRE)
83 return css_next_descendant_pre(curr, p->start_css);
84 else if (p->order == BPF_ITER_CGROUP_POST)
85 return css_next_descendant_post(curr, p->start_css);
86 else
87 return curr->parent;
88 }
89
90 static int __cgroup_iter_seq_show(struct seq_file *seq,
91 struct cgroup_subsys_state *css, int in_stop)
92 {
93 struct cgroup_iter_priv *p = seq->private;
94 struct bpf_iter__cgroup ctx;
95 struct bpf_iter_meta meta;
96 struct bpf_prog *prog;
97 int ret = 0;
98
99 /* cgroup is dead, skip this element */
100 if (css && cgroup_is_dead(css->cgroup))
101 return 0;
102
103 ctx.meta = &meta;
104 ctx.cgroup = css ? css->cgroup : NULL;
105 meta.seq = seq;
106 prog = bpf_iter_get_info(&meta, in_stop);
107 if (prog)
108 ret = bpf_iter_run_prog(prog, &ctx);
109
110 /* if prog returns > 0, terminate after this element. */
111 if (ret != 0)
112 p->terminate = true;
113
114 return 0;
115 }
116
117 static int cgroup_iter_seq_show(struct seq_file *seq, void *v)
118 {
119 return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v,
120 false);
121 }
122
123 static const struct seq_operations cgroup_iter_seq_ops = {
124 .start = cgroup_iter_seq_start,
125 .next = cgroup_iter_seq_next,
126 .stop = cgroup_iter_seq_stop,
127 .show = cgroup_iter_seq_show,
128 };
129
130 BTF_ID_LIST_SINGLE(bpf_cgroup_btf_id, struct, cgroup)
131
132 static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux)
133 {
134 struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
135 struct cgroup *cgrp = aux->cgroup.start;
136
137 p->start_css = &cgrp->self;
138 p->terminate = false;
139 p->order = aux->cgroup.order;
140 return 0;
141 }
142
143 static const struct bpf_iter_seq_info cgroup_iter_seq_info = {
144 .seq_ops = &cgroup_iter_seq_ops,
145 .init_seq_private = cgroup_iter_seq_init,
146 .seq_priv_size = sizeof(struct cgroup_iter_priv),
147 };
148
149 static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
150 union bpf_iter_link_info *linfo,
151 struct bpf_iter_aux_info *aux)
152 {
153 int fd = linfo->cgroup.cgroup_fd;
154 struct cgroup *cgrp;
155
156 if (fd)
> 157 cgrp = cgroup_get_from_fd(fd);
158 else /* walk the entire hierarchy by default. */
159 cgrp = cgroup_get_from_path("/");
160
161 if (IS_ERR(cgrp))
162 return PTR_ERR(cgrp);
163
164 aux->cgroup.start = cgrp;
165 aux->cgroup.order = linfo->cgroup.traversal_order;
166 return 0;
167 }
168

--
0-DAY CI Kernel Test Service
https://01.org/lkp

2022-06-11 12:57:50

by kernel test robot

[permalink] [raw]

Subject: Re: [PATCH bpf-next v2 4/8] bpf: Introduce cgroup iter

Hi Yosry,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on bpf-next/master]

url: https://github.com/intel-lab-lkp/linux/commits/Yosry-Ahmed/bpf-rstat-cgroup-hierarchical-stats/20220611-034720
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: s390-randconfig-r035-20220611 (https://download.01.org/0day-ci/archive/20220611/[email protected]/config)
compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project ff4abe755279a3a47cc416ef80dbc900d9a98a19)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# install s390 cross compiling tool for clang build
# apt-get install binutils-s390x-linux-gnu
# https://github.com/intel-lab-lkp/linux/commit/619857fd1ec4f351376ffcaaec20acc9aae9486f
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Yosry-Ahmed/bpf-rstat-cgroup-hierarchical-stats/20220611-034720
git checkout 619857fd1ec4f351376ffcaaec20acc9aae9486f
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=s390 SHELL=/bin/bash kernel/bpf/ kernel/cgroup/

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <[email protected]>

All warnings (new ones prefixed by >>):

In file included from kernel/bpf/cgroup_iter.c:9:
kernel/bpf/../cgroup/cgroup-internal.h:78:24: error: field has incomplete type 'struct css_task_iter'
struct css_task_iter iter;
^
kernel/bpf/../cgroup/cgroup-internal.h:78:10: note: forward declaration of 'struct css_task_iter'
struct css_task_iter iter;
^
kernel/bpf/../cgroup/cgroup-internal.h:188:15: error: incomplete definition of type 'struct cgroup'
return !(cgrp->self.flags & CSS_ONLINE);
~~~~^
include/linux/sched/task.h:35:9: note: forward declaration of 'struct cgroup'
struct cgroup *cgrp;
^
In file included from kernel/bpf/cgroup_iter.c:9:
kernel/bpf/../cgroup/cgroup-internal.h:188:30: error: use of undeclared identifier 'CSS_ONLINE'; did you mean 'N_ONLINE'?
return !(cgrp->self.flags & CSS_ONLINE);
^~~~~~~~~~
N_ONLINE
include/linux/nodemask.h:392:2: note: 'N_ONLINE' declared here
N_ONLINE, /* The node is online */
^
In file included from kernel/bpf/cgroup_iter.c:9:
kernel/bpf/../cgroup/cgroup-internal.h:193:47: error: incomplete definition of type 'struct cgroup'
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
~~~~^
include/linux/sched/task.h:35:9: note: forward declaration of 'struct cgroup'
struct cgroup *cgrp;
^
In file included from kernel/bpf/cgroup_iter.c:9:
kernel/bpf/../cgroup/cgroup-internal.h:193:18: error: use of undeclared identifier 'CGRP_NOTIFY_ON_RELEASE'
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
^
kernel/bpf/../cgroup/cgroup-internal.h:207:32: error: incomplete definition of type 'struct css_set'
if (refcount_dec_not_one(&cset->refcount))
~~~~^
include/linux/sched/task.h:16:8: note: forward declaration of 'struct css_set'
struct css_set;
^
In file included from kernel/bpf/cgroup_iter.c:9:
kernel/bpf/../cgroup/cgroup-internal.h:220:20: error: incomplete definition of type 'struct css_set'
refcount_inc(&cset->refcount);
~~~~^
include/linux/sched/task.h:16:8: note: forward declaration of 'struct css_set'
struct css_set;
^
In file included from kernel/bpf/cgroup_iter.c:9:
kernel/bpf/../cgroup/cgroup-internal.h:284:40: error: array has incomplete element type 'struct cftype'
extern struct cftype cgroup1_base_files[];
^
kernel/bpf/../cgroup/cgroup-internal.h:284:15: note: forward declaration of 'struct cftype'
extern struct cftype cgroup1_base_files[];
^
kernel/bpf/cgroup_iter.c:55:10: error: call to undeclared function 'css_next_descendant_pre'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
return css_next_descendant_pre(NULL, p->start_css);
^
>> kernel/bpf/cgroup_iter.c:55:10: warning: incompatible integer to pointer conversion returning 'int' from a function with result type 'void *' [-Wint-conversion]
return css_next_descendant_pre(NULL, p->start_css);
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
kernel/bpf/cgroup_iter.c:57:10: error: call to undeclared function 'css_next_descendant_post'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
return css_next_descendant_post(NULL, p->start_css);
^
kernel/bpf/cgroup_iter.c:57:10: warning: incompatible integer to pointer conversion returning 'int' from a function with result type 'void *' [-Wint-conversion]
return css_next_descendant_post(NULL, p->start_css);
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
kernel/bpf/cgroup_iter.c:83:10: error: call to undeclared function 'css_next_descendant_pre'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
return css_next_descendant_pre(curr, p->start_css);
^
kernel/bpf/cgroup_iter.c:83:10: warning: incompatible integer to pointer conversion returning 'int' from a function with result type 'void *' [-Wint-conversion]
return css_next_descendant_pre(curr, p->start_css);
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
kernel/bpf/cgroup_iter.c:85:10: error: call to undeclared function 'css_next_descendant_post'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
return css_next_descendant_post(curr, p->start_css);
^
kernel/bpf/cgroup_iter.c:85:10: warning: incompatible integer to pointer conversion returning 'int' from a function with result type 'void *' [-Wint-conversion]
return css_next_descendant_post(curr, p->start_css);
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
kernel/bpf/cgroup_iter.c:87:14: error: incomplete definition of type 'struct cgroup_subsys_state'
return curr->parent;
~~~~^
include/linux/kthread.h:218:8: note: forward declaration of 'struct cgroup_subsys_state'
struct cgroup_subsys_state;
^
kernel/bpf/cgroup_iter.c:100:31: error: incomplete definition of type 'struct cgroup_subsys_state'
if (css && cgroup_is_dead(css->cgroup))
~~~^
include/linux/kthread.h:218:8: note: forward declaration of 'struct cgroup_subsys_state'
struct cgroup_subsys_state;
^
kernel/bpf/cgroup_iter.c:104:24: error: incomplete definition of type 'struct cgroup_subsys_state'
ctx.cgroup = css ? css->cgroup : NULL;
~~~^
include/linux/kthread.h:218:8: note: forward declaration of 'struct cgroup_subsys_state'
struct cgroup_subsys_state;
^
kernel/bpf/cgroup_iter.c:137:22: error: incomplete definition of type 'struct cgroup'
p->start_css = &cgrp->self;
~~~~^
include/linux/sched/task.h:35:9: note: forward declaration of 'struct cgroup'
struct cgroup *cgrp;
^
kernel/bpf/cgroup_iter.c:157:10: error: call to undeclared function 'cgroup_get_from_fd'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
cgrp = cgroup_get_from_fd(fd);
^
kernel/bpf/cgroup_iter.c:157:10: note: did you mean 'cgroup_get_from_id'?
include/linux/cgroup.h:756:30: note: 'cgroup_get_from_id' declared here
static inline struct cgroup *cgroup_get_from_id(u64 id)
^
>> kernel/bpf/cgroup_iter.c:157:8: warning: incompatible integer to pointer conversion assigning to 'struct cgroup *' from 'int' [-Wint-conversion]
cgrp = cgroup_get_from_fd(fd);
^ ~~~~~~~~~~~~~~~~~~~~~~
kernel/bpf/cgroup_iter.c:159:10: error: call to undeclared function 'cgroup_get_from_path'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
cgrp = cgroup_get_from_path("/");
^
kernel/bpf/cgroup_iter.c:159:8: warning: incompatible integer to pointer conversion assigning to 'struct cgroup *' from 'int' [-Wint-conversion]
cgrp = cgroup_get_from_path("/");
^ ~~~~~~~~~~~~~~~~~~~~~~~~~
kernel/bpf/cgroup_iter.c:190:2: error: call to undeclared function 'cgroup_path_ns'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX,
^
kernel/bpf/cgroup_iter.c:190:2: note: did you mean 'cgroup_parent'?
include/linux/cgroup.h:732:30: note: 'cgroup_parent' declared here
static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
^
6 warnings and 19 errors generated.

vim +55 kernel/bpf/cgroup_iter.c

41
42 static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
43 {
44 struct cgroup_iter_priv *p = seq->private;
45
46 mutex_lock(&cgroup_mutex);
47
48 /* support only one session */
49 if (*pos > 0)
50 return NULL;
51
52 ++*pos;
53 p->terminate = false;
54 if (p->order == BPF_ITER_CGROUP_PRE)
> 55 return css_next_descendant_pre(NULL, p->start_css);
56 else if (p->order == BPF_ITER_CGROUP_POST)
57 return css_next_descendant_post(NULL, p->start_css);
58 else /* BPF_ITER_CGROUP_PARENT_UP */
59 return p->start_css;
60 }
61
62 static int __cgroup_iter_seq_show(struct seq_file *seq,
63 struct cgroup_subsys_state *css, int in_stop);
64
65 static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
66 {
67 /* pass NULL to the prog for post-processing */
68 if (!v)
69 __cgroup_iter_seq_show(seq, NULL, true);
70 mutex_unlock(&cgroup_mutex);
71 }
72
73 static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
74 {
75 struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
76 struct cgroup_iter_priv *p = seq->private;
77
78 ++*pos;
79 if (p->terminate)
80 return NULL;
81
82 if (p->order == BPF_ITER_CGROUP_PRE)
83 return css_next_descendant_pre(curr, p->start_css);
84 else if (p->order == BPF_ITER_CGROUP_POST)
85 return css_next_descendant_post(curr, p->start_css);
86 else
87 return curr->parent;
88 }
89
90 static int __cgroup_iter_seq_show(struct seq_file *seq,
91 struct cgroup_subsys_state *css, int in_stop)
92 {
93 struct cgroup_iter_priv *p = seq->private;
94 struct bpf_iter__cgroup ctx;
95 struct bpf_iter_meta meta;
96 struct bpf_prog *prog;
97 int ret = 0;
98
99 /* cgroup is dead, skip this element */
100 if (css && cgroup_is_dead(css->cgroup))
101 return 0;
102
103 ctx.meta = &meta;
104 ctx.cgroup = css ? css->cgroup : NULL;
105 meta.seq = seq;
106 prog = bpf_iter_get_info(&meta, in_stop);
107 if (prog)
108 ret = bpf_iter_run_prog(prog, &ctx);
109
110 /* if prog returns > 0, terminate after this element. */
111 if (ret != 0)
112 p->terminate = true;
113
114 return 0;
115 }
116
117 static int cgroup_iter_seq_show(struct seq_file *seq, void *v)
118 {
119 return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v,
120 false);
121 }
122
123 static const struct seq_operations cgroup_iter_seq_ops = {
124 .start = cgroup_iter_seq_start,
125 .next = cgroup_iter_seq_next,
126 .stop = cgroup_iter_seq_stop,
127 .show = cgroup_iter_seq_show,
128 };
129
130 BTF_ID_LIST_SINGLE(bpf_cgroup_btf_id, struct, cgroup)
131
132 static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux)
133 {
134 struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
135 struct cgroup *cgrp = aux->cgroup.start;
136
137 p->start_css = &cgrp->self;
138 p->terminate = false;
139 p->order = aux->cgroup.order;
140 return 0;
141 }
142
143 static const struct bpf_iter_seq_info cgroup_iter_seq_info = {
144 .seq_ops = &cgroup_iter_seq_ops,
145 .init_seq_private = cgroup_iter_seq_init,
146 .seq_priv_size = sizeof(struct cgroup_iter_priv),
147 };
148
149 static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
150 union bpf_iter_link_info *linfo,
151 struct bpf_iter_aux_info *aux)
152 {
153 int fd = linfo->cgroup.cgroup_fd;
154 struct cgroup *cgrp;
155
156 if (fd)
> 157 cgrp = cgroup_get_from_fd(fd);
158 else /* walk the entire hierarchy by default. */
159 cgrp = cgroup_get_from_path("/");
160
161 if (IS_ERR(cgrp))
162 return PTR_ERR(cgrp);
163
164 aux->cgroup.start = cgrp;
165 aux->cgroup.order = linfo->cgroup.traversal_order;
166 return 0;
167 }
168

--
0-DAY CI Kernel Test Service
https://01.org/lkp

2022-06-11 13:41:46

by kernel test robot

[permalink] [raw]

Subject: Re: [PATCH bpf-next v2 4/8] bpf: Introduce cgroup iter

Hi Yosry,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on bpf-next/master]

url: https://github.com/intel-lab-lkp/linux/commits/Yosry-Ahmed/bpf-rstat-cgroup-hierarchical-stats/20220611-034720
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: m68k-randconfig-r004-20220611 (https://download.01.org/0day-ci/archive/20220611/[email protected]/config)
compiler: m68k-linux-gcc (GCC) 11.3.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/619857fd1ec4f351376ffcaaec20acc9aae9486f
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Yosry-Ahmed/bpf-rstat-cgroup-hierarchical-stats/20220611-034720
git checkout 619857fd1ec4f351376ffcaaec20acc9aae9486f
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.3.0 make.cross W=1 O=build_dir ARCH=m68k SHELL=/bin/bash kernel/bpf/ kernel/cgroup/

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <[email protected]>

All error/warnings (new ones prefixed by >>):

In file included from kernel/bpf/cgroup_iter.c:9:
>> kernel/bpf/../cgroup/cgroup-internal.h:78:41: error: field 'iter' has incomplete type
78 | struct css_task_iter iter;
| ^~~~
kernel/bpf/../cgroup/cgroup-internal.h: In function 'cgroup_is_dead':
>> kernel/bpf/../cgroup/cgroup-internal.h:188:22: error: invalid use of undefined type 'const struct cgroup'
188 | return !(cgrp->self.flags & CSS_ONLINE);
| ^~
>> kernel/bpf/../cgroup/cgroup-internal.h:188:37: error: 'CSS_ONLINE' undeclared (first use in this function); did you mean 'N_ONLINE'?
188 | return !(cgrp->self.flags & CSS_ONLINE);
| ^~~~~~~~~~
| N_ONLINE
kernel/bpf/../cgroup/cgroup-internal.h:188:37: note: each undeclared identifier is reported only once for each function it appears in
kernel/bpf/../cgroup/cgroup-internal.h: In function 'notify_on_release':
>> kernel/bpf/../cgroup/cgroup-internal.h:193:25: error: 'CGRP_NOTIFY_ON_RELEASE' undeclared (first use in this function)
193 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
| ^~~~~~~~~~~~~~~~~~~~~~
kernel/bpf/../cgroup/cgroup-internal.h:193:54: error: invalid use of undefined type 'const struct cgroup'
193 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
| ^~
kernel/bpf/../cgroup/cgroup-internal.h: In function 'put_css_set':
>> kernel/bpf/../cgroup/cgroup-internal.h:207:39: error: invalid use of undefined type 'struct css_set'
207 | if (refcount_dec_not_one(&cset->refcount))
| ^~
kernel/bpf/../cgroup/cgroup-internal.h: In function 'get_css_set':
kernel/bpf/../cgroup/cgroup-internal.h:220:27: error: invalid use of undefined type 'struct css_set'
220 | refcount_inc(&cset->refcount);
| ^~
kernel/bpf/../cgroup/cgroup-internal.h: At top level:
>> kernel/bpf/../cgroup/cgroup-internal.h:284:22: error: array type has incomplete element type 'struct cftype'
284 | extern struct cftype cgroup1_base_files[];
| ^~~~~~~~~~~~~~~~~~
kernel/bpf/cgroup_iter.c: In function 'cgroup_iter_seq_start':
>> kernel/bpf/cgroup_iter.c:55:24: error: implicit declaration of function 'css_next_descendant_pre' [-Werror=implicit-function-declaration]
55 | return css_next_descendant_pre(NULL, p->start_css);
| ^~~~~~~~~~~~~~~~~~~~~~~
>> kernel/bpf/cgroup_iter.c:55:24: warning: returning 'int' from a function with return type 'void *' makes pointer from integer without a cast [-Wint-conversion]
55 | return css_next_descendant_pre(NULL, p->start_css);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>> kernel/bpf/cgroup_iter.c:57:24: error: implicit declaration of function 'css_next_descendant_post' [-Werror=implicit-function-declaration]
57 | return css_next_descendant_post(NULL, p->start_css);
| ^~~~~~~~~~~~~~~~~~~~~~~~
kernel/bpf/cgroup_iter.c:57:24: warning: returning 'int' from a function with return type 'void *' makes pointer from integer without a cast [-Wint-conversion]
57 | return css_next_descendant_post(NULL, p->start_css);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
kernel/bpf/cgroup_iter.c: In function 'cgroup_iter_seq_next':
kernel/bpf/cgroup_iter.c:83:24: warning: returning 'int' from a function with return type 'void *' makes pointer from integer without a cast [-Wint-conversion]
83 | return css_next_descendant_pre(curr, p->start_css);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
kernel/bpf/cgroup_iter.c:85:24: warning: returning 'int' from a function with return type 'void *' makes pointer from integer without a cast [-Wint-conversion]
85 | return css_next_descendant_post(curr, p->start_css);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>> kernel/bpf/cgroup_iter.c:87:28: error: invalid use of undefined type 'struct cgroup_subsys_state'
87 | return curr->parent;
| ^~
kernel/bpf/cgroup_iter.c: In function '__cgroup_iter_seq_show':
kernel/bpf/cgroup_iter.c:100:38: error: invalid use of undefined type 'struct cgroup_subsys_state'
100 | if (css && cgroup_is_dead(css->cgroup))
| ^~
kernel/bpf/cgroup_iter.c:104:31: error: invalid use of undefined type 'struct cgroup_subsys_state'
104 | ctx.cgroup = css ? css->cgroup : NULL;
| ^~
kernel/bpf/cgroup_iter.c: In function 'cgroup_iter_seq_init':
>> kernel/bpf/cgroup_iter.c:137:29: error: invalid use of undefined type 'struct cgroup'
137 | p->start_css = &cgrp->self;
| ^~
kernel/bpf/cgroup_iter.c: In function 'bpf_iter_attach_cgroup':
>> kernel/bpf/cgroup_iter.c:157:24: error: implicit declaration of function 'cgroup_get_from_fd'; did you mean 'cgroup_get_from_id'? [-Werror=implicit-function-declaration]
157 | cgrp = cgroup_get_from_fd(fd);
| ^~~~~~~~~~~~~~~~~~
| cgroup_get_from_id
>> kernel/bpf/cgroup_iter.c:157:22: warning: assignment to 'struct cgroup *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
157 | cgrp = cgroup_get_from_fd(fd);
| ^
>> kernel/bpf/cgroup_iter.c:159:24: error: implicit declaration of function 'cgroup_get_from_path'; did you mean 'cgroup_get_from_id'? [-Werror=implicit-function-declaration]
159 | cgrp = cgroup_get_from_path("/");
| ^~~~~~~~~~~~~~~~~~~~
| cgroup_get_from_id
kernel/bpf/cgroup_iter.c:159:22: warning: assignment to 'struct cgroup *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
159 | cgrp = cgroup_get_from_path("/");
| ^
kernel/bpf/cgroup_iter.c: In function 'bpf_iter_cgroup_show_fdinfo':
>> kernel/bpf/cgroup_iter.c:190:9: error: implicit declaration of function 'cgroup_path_ns'; did you mean 'cgroup_parent'? [-Werror=implicit-function-declaration]
190 | cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX,
| ^~~~~~~~~~~~~~
| cgroup_parent
kernel/bpf/cgroup_iter.c: In function 'cgroup_iter_seq_next':
kernel/bpf/cgroup_iter.c:88:1: error: control reaches end of non-void function [-Werror=return-type]
88 | }
| ^
cc1: some warnings being treated as errors

vim +/iter +78 kernel/bpf/../cgroup/cgroup-internal.h

0d2b5955b36250 Tejun Heo 2022-01-06 68
0d2b5955b36250 Tejun Heo 2022-01-06 69 struct cgroup_file_ctx {
e57457641613fe Tejun Heo 2022-01-06 70 struct cgroup_namespace *ns;
e57457641613fe Tejun Heo 2022-01-06 71
0d2b5955b36250 Tejun Heo 2022-01-06 72 struct {
0d2b5955b36250 Tejun Heo 2022-01-06 73 void *trigger;
0d2b5955b36250 Tejun Heo 2022-01-06 74 } psi;
0d2b5955b36250 Tejun Heo 2022-01-06 75
0d2b5955b36250 Tejun Heo 2022-01-06 76 struct {
0d2b5955b36250 Tejun Heo 2022-01-06 77 bool started;
0d2b5955b36250 Tejun Heo 2022-01-06 @78 struct css_task_iter iter;
0d2b5955b36250 Tejun Heo 2022-01-06 79 } procs;
0d2b5955b36250 Tejun Heo 2022-01-06 80
0d2b5955b36250 Tejun Heo 2022-01-06 81 struct {
0d2b5955b36250 Tejun Heo 2022-01-06 82 struct cgroup_pidlist *pidlist;
0d2b5955b36250 Tejun Heo 2022-01-06 83 } procs1;
0d2b5955b36250 Tejun Heo 2022-01-06 84 };
0d2b5955b36250 Tejun Heo 2022-01-06 85
0a268dbd7932c7 Tejun Heo 2016-12-27 86 /*
0a268dbd7932c7 Tejun Heo 2016-12-27 87 * A cgroup can be associated with multiple css_sets as different tasks may
0a268dbd7932c7 Tejun Heo 2016-12-27 88 * belong to different cgroups on different hierarchies. In the other
0a268dbd7932c7 Tejun Heo 2016-12-27 89 * direction, a css_set is naturally associated with multiple cgroups.
0a268dbd7932c7 Tejun Heo 2016-12-27 90 * This M:N relationship is represented by the following link structure
0a268dbd7932c7 Tejun Heo 2016-12-27 91 * which exists for each association and allows traversing the associations
0a268dbd7932c7 Tejun Heo 2016-12-27 92 * from both sides.
0a268dbd7932c7 Tejun Heo 2016-12-27 93 */
0a268dbd7932c7 Tejun Heo 2016-12-27 94 struct cgrp_cset_link {
0a268dbd7932c7 Tejun Heo 2016-12-27 95 /* the cgroup and css_set this link associates */
0a268dbd7932c7 Tejun Heo 2016-12-27 96 struct cgroup *cgrp;
0a268dbd7932c7 Tejun Heo 2016-12-27 97 struct css_set *cset;
0a268dbd7932c7 Tejun Heo 2016-12-27 98
0a268dbd7932c7 Tejun Heo 2016-12-27 99 /* list of cgrp_cset_links anchored at cgrp->cset_links */
0a268dbd7932c7 Tejun Heo 2016-12-27 100 struct list_head cset_link;
0a268dbd7932c7 Tejun Heo 2016-12-27 101
0a268dbd7932c7 Tejun Heo 2016-12-27 102 /* list of cgrp_cset_links anchored at css_set->cgrp_links */
0a268dbd7932c7 Tejun Heo 2016-12-27 103 struct list_head cgrp_link;
0a268dbd7932c7 Tejun Heo 2016-12-27 104 };
0a268dbd7932c7 Tejun Heo 2016-12-27 105
e595cd706982bf Tejun Heo 2017-01-15 106 /* used to track tasks and csets during migration */
e595cd706982bf Tejun Heo 2017-01-15 107 struct cgroup_taskset {
e595cd706982bf Tejun Heo 2017-01-15 108 /* the src and dst cset list running through cset->mg_node */
e595cd706982bf Tejun Heo 2017-01-15 109 struct list_head src_csets;
e595cd706982bf Tejun Heo 2017-01-15 110 struct list_head dst_csets;
e595cd706982bf Tejun Heo 2017-01-15 111
610467270fb368 Tejun Heo 2017-07-08 112 /* the number of tasks in the set */
610467270fb368 Tejun Heo 2017-07-08 113 int nr_tasks;
610467270fb368 Tejun Heo 2017-07-08 114
e595cd706982bf Tejun Heo 2017-01-15 115 /* the subsys currently being processed */
e595cd706982bf Tejun Heo 2017-01-15 116 int ssid;
e595cd706982bf Tejun Heo 2017-01-15 117
e595cd706982bf Tejun Heo 2017-01-15 118 /*
e595cd706982bf Tejun Heo 2017-01-15 119 * Fields for cgroup_taskset_*() iteration.
e595cd706982bf Tejun Heo 2017-01-15 120 *
e595cd706982bf Tejun Heo 2017-01-15 121 * Before migration is committed, the target migration tasks are on
e595cd706982bf Tejun Heo 2017-01-15 122 * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
e595cd706982bf Tejun Heo 2017-01-15 123 * the csets on ->dst_csets. ->csets point to either ->src_csets
e595cd706982bf Tejun Heo 2017-01-15 124 * or ->dst_csets depending on whether migration is committed.
e595cd706982bf Tejun Heo 2017-01-15 125 *
e595cd706982bf Tejun Heo 2017-01-15 126 * ->cur_csets and ->cur_task point to the current task position
e595cd706982bf Tejun Heo 2017-01-15 127 * during iteration.
e595cd706982bf Tejun Heo 2017-01-15 128 */
e595cd706982bf Tejun Heo 2017-01-15 129 struct list_head *csets;
e595cd706982bf Tejun Heo 2017-01-15 130 struct css_set *cur_cset;
e595cd706982bf Tejun Heo 2017-01-15 131 struct task_struct *cur_task;
e595cd706982bf Tejun Heo 2017-01-15 132 };
e595cd706982bf Tejun Heo 2017-01-15 133
e595cd706982bf Tejun Heo 2017-01-15 134 /* migration context also tracks preloading */
e595cd706982bf Tejun Heo 2017-01-15 135 struct cgroup_mgctx {
e595cd706982bf Tejun Heo 2017-01-15 136 /*
e595cd706982bf Tejun Heo 2017-01-15 137 * Preloaded source and destination csets. Used to guarantee
e595cd706982bf Tejun Heo 2017-01-15 138 * atomic success or failure on actual migration.
e595cd706982bf Tejun Heo 2017-01-15 139 */
e595cd706982bf Tejun Heo 2017-01-15 140 struct list_head preloaded_src_csets;
e595cd706982bf Tejun Heo 2017-01-15 141 struct list_head preloaded_dst_csets;
e595cd706982bf Tejun Heo 2017-01-15 142
e595cd706982bf Tejun Heo 2017-01-15 143 /* tasks and csets to migrate */
e595cd706982bf Tejun Heo 2017-01-15 144 struct cgroup_taskset tset;
bfc2cf6f61fcea Tejun Heo 2017-01-15 145
bfc2cf6f61fcea Tejun Heo 2017-01-15 146 /* subsystems affected by migration */
bfc2cf6f61fcea Tejun Heo 2017-01-15 147 u16 ss_mask;
e595cd706982bf Tejun Heo 2017-01-15 148 };
e595cd706982bf Tejun Heo 2017-01-15 149
e595cd706982bf Tejun Heo 2017-01-15 150 #define CGROUP_TASKSET_INIT(tset) \
e595cd706982bf Tejun Heo 2017-01-15 151 { \
e595cd706982bf Tejun Heo 2017-01-15 152 .src_csets = LIST_HEAD_INIT(tset.src_csets), \
e595cd706982bf Tejun Heo 2017-01-15 153 .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \
e595cd706982bf Tejun Heo 2017-01-15 154 .csets = &tset.src_csets, \
e595cd706982bf Tejun Heo 2017-01-15 155 }
e595cd706982bf Tejun Heo 2017-01-15 156
e595cd706982bf Tejun Heo 2017-01-15 157 #define CGROUP_MGCTX_INIT(name) \
e595cd706982bf Tejun Heo 2017-01-15 158 { \
e595cd706982bf Tejun Heo 2017-01-15 159 LIST_HEAD_INIT(name.preloaded_src_csets), \
e595cd706982bf Tejun Heo 2017-01-15 160 LIST_HEAD_INIT(name.preloaded_dst_csets), \
e595cd706982bf Tejun Heo 2017-01-15 161 CGROUP_TASKSET_INIT(name.tset), \
e595cd706982bf Tejun Heo 2017-01-15 162 }
e595cd706982bf Tejun Heo 2017-01-15 163
e595cd706982bf Tejun Heo 2017-01-15 164 #define DEFINE_CGROUP_MGCTX(name) \
e595cd706982bf Tejun Heo 2017-01-15 165 struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
e595cd706982bf Tejun Heo 2017-01-15 166
0a268dbd7932c7 Tejun Heo 2016-12-27 167 extern struct mutex cgroup_mutex;
0a268dbd7932c7 Tejun Heo 2016-12-27 168 extern spinlock_t css_set_lock;
0a268dbd7932c7 Tejun Heo 2016-12-27 169 extern struct cgroup_subsys *cgroup_subsys[];
0a268dbd7932c7 Tejun Heo 2016-12-27 170 extern struct list_head cgroup_roots;
0a268dbd7932c7 Tejun Heo 2016-12-27 171 extern struct file_system_type cgroup_fs_type;
0a268dbd7932c7 Tejun Heo 2016-12-27 172
0a268dbd7932c7 Tejun Heo 2016-12-27 173 /* iterate across the hierarchies */
0a268dbd7932c7 Tejun Heo 2016-12-27 174 #define for_each_root(root) \
0a268dbd7932c7 Tejun Heo 2016-12-27 175 list_for_each_entry((root), &cgroup_roots, root_list)
0a268dbd7932c7 Tejun Heo 2016-12-27 176
0a268dbd7932c7 Tejun Heo 2016-12-27 177 /**
0a268dbd7932c7 Tejun Heo 2016-12-27 178 * for_each_subsys - iterate all enabled cgroup subsystems
0a268dbd7932c7 Tejun Heo 2016-12-27 179 * @ss: the iteration cursor
0a268dbd7932c7 Tejun Heo 2016-12-27 180 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
0a268dbd7932c7 Tejun Heo 2016-12-27 181 */
0a268dbd7932c7 Tejun Heo 2016-12-27 182 #define for_each_subsys(ss, ssid) \
0a268dbd7932c7 Tejun Heo 2016-12-27 183 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
0a268dbd7932c7 Tejun Heo 2016-12-27 184 (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
0a268dbd7932c7 Tejun Heo 2016-12-27 185
0a268dbd7932c7 Tejun Heo 2016-12-27 186 static inline bool cgroup_is_dead(const struct cgroup *cgrp)
0a268dbd7932c7 Tejun Heo 2016-12-27 187 {
0a268dbd7932c7 Tejun Heo 2016-12-27 @188 return !(cgrp->self.flags & CSS_ONLINE);
0a268dbd7932c7 Tejun Heo 2016-12-27 189 }
0a268dbd7932c7 Tejun Heo 2016-12-27 190
0a268dbd7932c7 Tejun Heo 2016-12-27 191 static inline bool notify_on_release(const struct cgroup *cgrp)
0a268dbd7932c7 Tejun Heo 2016-12-27 192 {
0a268dbd7932c7 Tejun Heo 2016-12-27 @193 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
0a268dbd7932c7 Tejun Heo 2016-12-27 194 }
0a268dbd7932c7 Tejun Heo 2016-12-27 195
dcfe149b9f45aa Tejun Heo 2016-12-27 196 void put_css_set_locked(struct css_set *cset);
dcfe149b9f45aa Tejun Heo 2016-12-27 197
dcfe149b9f45aa Tejun Heo 2016-12-27 198 static inline void put_css_set(struct css_set *cset)
dcfe149b9f45aa Tejun Heo 2016-12-27 199 {
dcfe149b9f45aa Tejun Heo 2016-12-27 200 unsigned long flags;
dcfe149b9f45aa Tejun Heo 2016-12-27 201
dcfe149b9f45aa Tejun Heo 2016-12-27 202 /*
dcfe149b9f45aa Tejun Heo 2016-12-27 203 * Ensure that the refcount doesn't hit zero while any readers
dcfe149b9f45aa Tejun Heo 2016-12-27 204 * can see it. Similar to atomic_dec_and_lock(), but for an
dcfe149b9f45aa Tejun Heo 2016-12-27 205 * rwlock
dcfe149b9f45aa Tejun Heo 2016-12-27 206 */
4b9502e63b5e2b Elena Reshetova 2017-03-08 @207 if (refcount_dec_not_one(&cset->refcount))
dcfe149b9f45aa Tejun Heo 2016-12-27 208 return;
dcfe149b9f45aa Tejun Heo 2016-12-27 209
dcfe149b9f45aa Tejun Heo 2016-12-27 210 spin_lock_irqsave(&css_set_lock, flags);
dcfe149b9f45aa Tejun Heo 2016-12-27 211 put_css_set_locked(cset);
dcfe149b9f45aa Tejun Heo 2016-12-27 212 spin_unlock_irqrestore(&css_set_lock, flags);
dcfe149b9f45aa Tejun Heo 2016-12-27 213 }
dcfe149b9f45aa Tejun Heo 2016-12-27 214

--
0-DAY CI Kernel Test Service
https://01.org/lkp

2022-06-28 04:18:01

by Yonghong Song

[permalink] [raw]

Subject: Re: [PATCH bpf-next v2 4/8] bpf: Introduce cgroup iter

On 6/10/22 12:44 PM, Yosry Ahmed wrote:
> From: Hao Luo <[email protected]>
>
> Cgroup_iter is a type of bpf_iter. It walks over cgroups in two modes:
>
> - walking a cgroup's descendants.
> - walking a cgroup's ancestors.
>
> When attaching cgroup_iter, one can set a cgroup to the iter_link
> created from attaching. This cgroup is passed as a file descriptor and
> serves as the starting point of the walk. If no cgroup is specified,
> the starting point will be the root cgroup.
>
> For walking descendants, one can specify the order: either pre-order or
> post-order. For walking ancestors, the walk starts at the specified
> cgroup and ends at the root.
>
> One can also terminate the walk early by returning 1 from the iter
> program.
>
> Note that because walking cgroup hierarchy holds cgroup_mutex, the iter
> program is called with cgroup_mutex held.
>
> Signed-off-by: Hao Luo <[email protected]>
> Signed-off-by: Yosry Ahmed <[email protected]>
> ---
> include/linux/bpf.h | 8 ++
> include/uapi/linux/bpf.h | 21 +++
> kernel/bpf/Makefile | 2 +-
> kernel/bpf/cgroup_iter.c | 235 +++++++++++++++++++++++++++++++++
> tools/include/uapi/linux/bpf.h | 21 +++
> 5 files changed, 286 insertions(+), 1 deletion(-)
> create mode 100644 kernel/bpf/cgroup_iter.c
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 8e6092d0ea956..48d8e836b9748 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -44,6 +44,7 @@ struct kobject;
> struct mem_cgroup;
> struct module;
> struct bpf_func_state;
> +struct cgroup;
>
> extern struct idr btf_idr;
> extern spinlock_t btf_idr_lock;
> @@ -1590,7 +1591,14 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
> int __init bpf_iter_ ## target(args) { return 0; }
>
> struct bpf_iter_aux_info {
> + /* for map_elem iter */
> struct bpf_map *map;
> +
> + /* for cgroup iter */
> + struct {
> + struct cgroup *start; /* starting cgroup */
> + int order;
> + } cgroup;
> };
>
[...]
> +
> +static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
> +{
> + struct cgroup_iter_priv *p = seq->private;
> +
> + mutex_lock(&cgroup_mutex);
> +
> + /* support only one session */
> + if (*pos > 0)
> + return NULL;
> +
> + ++*pos;
> + p->terminate = false;
> + if (p->order == BPF_ITER_CGROUP_PRE)
> + return css_next_descendant_pre(NULL, p->start_css);
> + else if (p->order == BPF_ITER_CGROUP_POST)
> + return css_next_descendant_post(NULL, p->start_css);
> + else /* BPF_ITER_CGROUP_PARENT_UP */
> + return p->start_css;
> +}
> +
> +static int __cgroup_iter_seq_show(struct seq_file *seq,
> + struct cgroup_subsys_state *css, int in_stop);
> +
> +static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
> +{
> + /* pass NULL to the prog for post-processing */
> + if (!v)
> + __cgroup_iter_seq_show(seq, NULL, true);
> + mutex_unlock(&cgroup_mutex);
> +}
> +
> +static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> +{
> + struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
> + struct cgroup_iter_priv *p = seq->private;
> +
> + ++*pos;
> + if (p->terminate)
> + return NULL;
> +
> + if (p->order == BPF_ITER_CGROUP_PRE)
> + return css_next_descendant_pre(curr, p->start_css);
> + else if (p->order == BPF_ITER_CGROUP_POST)
> + return css_next_descendant_post(curr, p->start_css);
> + else
> + return curr->parent;
> +}
> +
> +static int __cgroup_iter_seq_show(struct seq_file *seq,
> + struct cgroup_subsys_state *css, int in_stop)
> +{
> + struct cgroup_iter_priv *p = seq->private;
> + struct bpf_iter__cgroup ctx;
> + struct bpf_iter_meta meta;
> + struct bpf_prog *prog;
> + int ret = 0;
> +
> + /* cgroup is dead, skip this element */
> + if (css && cgroup_is_dead(css->cgroup))
> + return 0;
> +
> + ctx.meta = &meta;
> + ctx.cgroup = css ? css->cgroup : NULL;
> + meta.seq = seq;
> + prog = bpf_iter_get_info(&meta, in_stop);
> + if (prog)
> + ret = bpf_iter_run_prog(prog, &ctx);

Do we need to do anything special to ensure bpf program gets
up-to-date stat from ctx.cgroup?

> +
> + /* if prog returns > 0, terminate after this element. */
> + if (ret != 0)
> + p->terminate = true;
> +
> + return 0;
> +}
> +
[...]

2022-06-28 04:18:01

by Yonghong Song

[permalink] [raw]

Subject: Re: [PATCH bpf-next v2 4/8] bpf: Introduce cgroup iter

On 6/10/22 12:44 PM, Yosry Ahmed wrote:
> From: Hao Luo <[email protected]>
>
> Cgroup_iter is a type of bpf_iter. It walks over cgroups in two modes:
>
> - walking a cgroup's descendants.
> - walking a cgroup's ancestors.

The implementation has another choice, BPF_ITER_CGROUP_PARENT_UP.
We should add it here as well.

>
> When attaching cgroup_iter, one can set a cgroup to the iter_link
> created from attaching. This cgroup is passed as a file descriptor and
> serves as the starting point of the walk. If no cgroup is specified,
> the starting point will be the root cgroup.
>
> For walking descendants, one can specify the order: either pre-order or
> post-order. For walking ancestors, the walk starts at the specified
> cgroup and ends at the root.
>
> One can also terminate the walk early by returning 1 from the iter
> program.
>
> Note that because walking cgroup hierarchy holds cgroup_mutex, the iter
> program is called with cgroup_mutex held.

Overall looks good to me with a few nits below.

Acked-by: Yonghong Song <[email protected]>

>
> Signed-off-by: Hao Luo <[email protected]>
> Signed-off-by: Yosry Ahmed <[email protected]>
> ---
> include/linux/bpf.h | 8 ++
> include/uapi/linux/bpf.h | 21 +++
> kernel/bpf/Makefile | 2 +-
> kernel/bpf/cgroup_iter.c | 235 +++++++++++++++++++++++++++++++++
> tools/include/uapi/linux/bpf.h | 21 +++
> 5 files changed, 286 insertions(+), 1 deletion(-)
> create mode 100644 kernel/bpf/cgroup_iter.c
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 8e6092d0ea956..48d8e836b9748 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -44,6 +44,7 @@ struct kobject;
> struct mem_cgroup;
> struct module;
> struct bpf_func_state;
> +struct cgroup;
>
> extern struct idr btf_idr;
> extern spinlock_t btf_idr_lock;
> @@ -1590,7 +1591,14 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
> int __init bpf_iter_ ## target(args) { return 0; }
>
> struct bpf_iter_aux_info {
> + /* for map_elem iter */
> struct bpf_map *map;
> +
> + /* for cgroup iter */
> + struct {
> + struct cgroup *start; /* starting cgroup */
> + int order;
> + } cgroup;
> };
>
> typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index f4009dbdf62da..4fd05cde19116 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -87,10 +87,27 @@ struct bpf_cgroup_storage_key {
> __u32 attach_type; /* program attach type (enum bpf_attach_type) */
> };
>
> +enum bpf_iter_cgroup_traversal_order {
> + BPF_ITER_CGROUP_PRE = 0, /* pre-order traversal */
> + BPF_ITER_CGROUP_POST, /* post-order traversal */
> + BPF_ITER_CGROUP_PARENT_UP, /* traversal of ancestors up to the root */
> +};
> +
> union bpf_iter_link_info {
> struct {
> __u32 map_fd;
> } map;
> +
> + /* cgroup_iter walks either the live descendants of a cgroup subtree, or the ancestors
> + * of a given cgroup.
> + */
> + struct {
> + /* Cgroup file descriptor. This is root of the subtree if for walking the
> + * descendants; this is the starting cgroup if for walking the ancestors.
> + */
> + __u32 cgroup_fd;
> + __u32 traversal_order;
> + } cgroup;
> };
>
> /* BPF syscall commands, see bpf(2) man-page for more details. */
> @@ -6050,6 +6067,10 @@ struct bpf_link_info {
> struct {
> __u32 map_id;
> } map;
> + struct {
> + __u32 traversal_order;
> + __aligned_u64 cgroup_id;
> + } cgroup;
> };
> } iter;
> struct {
> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> index 057ba8e01e70f..9741b9314fb46 100644
> --- a/kernel/bpf/Makefile
> +++ b/kernel/bpf/Makefile
> @@ -8,7 +8,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
>
> obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
> obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
> -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
> +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o cgroup_iter.o
> obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
> obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
> obj-$(CONFIG_BPF_SYSCALL) += disasm.o
> diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
> new file mode 100644
> index 0000000000000..88deb655efa71
> --- /dev/null
> +++ b/kernel/bpf/cgroup_iter.c
> @@ -0,0 +1,235 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2022 Google */
> +#include <linux/bpf.h>
> +#include <linux/btf_ids.h>
> +#include <linux/cgroup.h>
> +#include <linux/kernel.h>
> +#include <linux/seq_file.h>
> +
> +#include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */
> +
> +/* cgroup_iter provides two modes of traversal to the cgroup hierarchy.
> + *
> + * 1. Walk the descendants of a cgroup.
> + * 2. Walk the ancestors of a cgroup.

three modes here?

> + *
> + * For walking descendants, cgroup_iter can walk in either pre-order or
> + * post-order. For walking ancestors, the iter walks up from a cgroup to
> + * the root.
> + *
> + * The iter program can terminate the walk early by returning 1. Walk
> + * continues if prog returns 0.
> + *
> + * The prog can check (seq->num == 0) to determine whether this is
> + * the first element. The prog may also be passed a NULL cgroup,
> + * which means the walk has completed and the prog has a chance to
> + * do post-processing, such as outputing an epilogue.
> + *
> + * Note: the iter_prog is called with cgroup_mutex held.
> + */
> +
> +struct bpf_iter__cgroup {
> + __bpf_md_ptr(struct bpf_iter_meta *, meta);
> + __bpf_md_ptr(struct cgroup *, cgroup);
> +};
> +
> +struct cgroup_iter_priv {
> + struct cgroup_subsys_state *start_css;
> + bool terminate;
> + int order;
> +};
> +
> +static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
> +{
> + struct cgroup_iter_priv *p = seq->private;
> +
> + mutex_lock(&cgroup_mutex);
> +
> + /* support only one session */
> + if (*pos > 0)
> + return NULL;
> +
> + ++*pos;
> + p->terminate = false;
> + if (p->order == BPF_ITER_CGROUP_PRE)
> + return css_next_descendant_pre(NULL, p->start_css);
> + else if (p->order == BPF_ITER_CGROUP_POST)
> + return css_next_descendant_post(NULL, p->start_css);
> + else /* BPF_ITER_CGROUP_PARENT_UP */
> + return p->start_css;
> +}
> +
> +static int __cgroup_iter_seq_show(struct seq_file *seq,
> + struct cgroup_subsys_state *css, int in_stop);
> +
> +static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
> +{
> + /* pass NULL to the prog for post-processing */
> + if (!v)
> + __cgroup_iter_seq_show(seq, NULL, true);
> + mutex_unlock(&cgroup_mutex);
> +}
> +
> +static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> +{
> + struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
> + struct cgroup_iter_priv *p = seq->private;
> +
> + ++*pos;
> + if (p->terminate)
> + return NULL;
> +
> + if (p->order == BPF_ITER_CGROUP_PRE)
> + return css_next_descendant_pre(curr, p->start_css);
> + else if (p->order == BPF_ITER_CGROUP_POST)
> + return css_next_descendant_post(curr, p->start_css);
> + else
> + return curr->parent;
> +}
> +
> +static int __cgroup_iter_seq_show(struct seq_file *seq,
> + struct cgroup_subsys_state *css, int in_stop)
> +{
> + struct cgroup_iter_priv *p = seq->private;
> + struct bpf_iter__cgroup ctx;
> + struct bpf_iter_meta meta;
> + struct bpf_prog *prog;
> + int ret = 0;
> +
> + /* cgroup is dead, skip this element */
> + if (css && cgroup_is_dead(css->cgroup))
> + return 0;
> +
> + ctx.meta = &meta;
> + ctx.cgroup = css ? css->cgroup : NULL;
> + meta.seq = seq;
> + prog = bpf_iter_get_info(&meta, in_stop);
> + if (prog)
> + ret = bpf_iter_run_prog(prog, &ctx);
> +
> + /* if prog returns > 0, terminate after this element. */
> + if (ret != 0)
> + p->terminate = true;
> +
> + return 0;
> +}
> +
> +static int cgroup_iter_seq_show(struct seq_file *seq, void *v)
> +{
> + return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v,
> + false);
> +}
> +
> +static const struct seq_operations cgroup_iter_seq_ops = {
> + .start = cgroup_iter_seq_start,
> + .next = cgroup_iter_seq_next,
> + .stop = cgroup_iter_seq_stop,
> + .show = cgroup_iter_seq_show,
> +};
> +
> +BTF_ID_LIST_SINGLE(bpf_cgroup_btf_id, struct, cgroup)
> +
> +static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux)
> +{
> + struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
> + struct cgroup *cgrp = aux->cgroup.start;
> +
> + p->start_css = &cgrp->self;
> + p->terminate = false;
> + p->order = aux->cgroup.order;
> + return 0;
> +}
> +
> +static const struct bpf_iter_seq_info cgroup_iter_seq_info = {
> + .seq_ops = &cgroup_iter_seq_ops,
> + .init_seq_private = cgroup_iter_seq_init,
> + .seq_priv_size = sizeof(struct cgroup_iter_priv),
> +};
> +
> +static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
> + union bpf_iter_link_info *linfo,
> + struct bpf_iter_aux_info *aux)
> +{
> + int fd = linfo->cgroup.cgroup_fd;
> + struct cgroup *cgrp;
> +
> + if (fd)
> + cgrp = cgroup_get_from_fd(fd);
> + else /* walk the entire hierarchy by default. */
> + cgrp = cgroup_get_from_path("/");
> +
> + if (IS_ERR(cgrp))
> + return PTR_ERR(cgrp);
> +
> + aux->cgroup.start = cgrp;
> + aux->cgroup.order = linfo->cgroup.traversal_order;

The legality of traversal_order should be checked.

> + return 0;
> +}
> +
> +static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info *aux)
> +{
> + cgroup_put(aux->cgroup.start);
> +}
> +
> +static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux,
> + struct seq_file *seq)
> +{
> + char *buf;
> +
> + buf = kzalloc(PATH_MAX, GFP_KERNEL);
> + if (!buf) {
> + seq_puts(seq, "cgroup_path:\n");

This is a really unlikely case. maybe "cgroup_path:<unknown>"?

> + goto show_order;
> + }
> +
> + /* If cgroup_path_ns() fails, buf will be an empty string, cgroup_path
> + * will print nothing.
> + *
> + * Path is in the calling process's cgroup namespace.
> + */
> + cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX,
> + current->nsproxy->cgroup_ns);
> + seq_printf(seq, "cgroup_path:\t%s\n", buf);
> + kfree(buf);
> +
> +show_order:
> + if (aux->cgroup.order == BPF_ITER_CGROUP_PRE)
> + seq_puts(seq, "traversal_order: pre\n");
> + else if (aux->cgroup.order == BPF_ITER_CGROUP_POST)
> + seq_puts(seq, "traversal_order: post\n");
> + else /* BPF_ITER_CGROUP_PARENT_UP */
> + seq_puts(seq, "traversal_order: parent_up\n");
> +}
> +
[...]

2022-06-28 06:13:41

by Yosry Ahmed

[permalink] [raw]

Subject: Re: [PATCH bpf-next v2 4/8] bpf: Introduce cgroup iter

On Mon, Jun 27, 2022 at 9:09 PM Yonghong Song <[email protected]> wrote:
>
>
>
> On 6/10/22 12:44 PM, Yosry Ahmed wrote:
> > From: Hao Luo <[email protected]>
> >
> > Cgroup_iter is a type of bpf_iter. It walks over cgroups in two modes:
> >
> > - walking a cgroup's descendants.
> > - walking a cgroup's ancestors.
>
> The implementation has another choice, BPF_ITER_CGROUP_PARENT_UP.
> We should add it here as well.
>

BPF_ITER_CGROUP_PARENT_UP is expressed here, I think what's actually
missing here (and down below where only 2 modes are specified again)
is that walking descendants is broken down into two separate modes,
pre and post order traversals.

> >
> > When attaching cgroup_iter, one can set a cgroup to the iter_link
> > created from attaching. This cgroup is passed as a file descriptor and
> > serves as the starting point of the walk. If no cgroup is specified,
> > the starting point will be the root cgroup.
> >
> > For walking descendants, one can specify the order: either pre-order or
> > post-order. For walking ancestors, the walk starts at the specified
> > cgroup and ends at the root.
> >
> > One can also terminate the walk early by returning 1 from the iter
> > program.
> >
> > Note that because walking cgroup hierarchy holds cgroup_mutex, the iter
> > program is called with cgroup_mutex held.
>
> Overall looks good to me with a few nits below.
>
> Acked-by: Yonghong Song <[email protected]>
>
> >
> > Signed-off-by: Hao Luo <[email protected]>
> > Signed-off-by: Yosry Ahmed <[email protected]>
> > ---
> > include/linux/bpf.h | 8 ++
> > include/uapi/linux/bpf.h | 21 +++
> > kernel/bpf/Makefile | 2 +-
> > kernel/bpf/cgroup_iter.c | 235 +++++++++++++++++++++++++++++++++
> > tools/include/uapi/linux/bpf.h | 21 +++
> > 5 files changed, 286 insertions(+), 1 deletion(-)
> > create mode 100644 kernel/bpf/cgroup_iter.c
> >
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index 8e6092d0ea956..48d8e836b9748 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -44,6 +44,7 @@ struct kobject;
> > struct mem_cgroup;
> > struct module;
> > struct bpf_func_state;
> > +struct cgroup;
> >
> > extern struct idr btf_idr;
> > extern spinlock_t btf_idr_lock;
> > @@ -1590,7 +1591,14 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
> > int __init bpf_iter_ ## target(args) { return 0; }
> >
> > struct bpf_iter_aux_info {
> > + /* for map_elem iter */
> > struct bpf_map *map;
> > +
> > + /* for cgroup iter */
> > + struct {
> > + struct cgroup *start; /* starting cgroup */
> > + int order;
> > + } cgroup;
> > };
> >
> > typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index f4009dbdf62da..4fd05cde19116 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -87,10 +87,27 @@ struct bpf_cgroup_storage_key {
> > __u32 attach_type; /* program attach type (enum bpf_attach_type) */
> > };
> >
> > +enum bpf_iter_cgroup_traversal_order {
> > + BPF_ITER_CGROUP_PRE = 0, /* pre-order traversal */
> > + BPF_ITER_CGROUP_POST, /* post-order traversal */
> > + BPF_ITER_CGROUP_PARENT_UP, /* traversal of ancestors up to the root */
> > +};
> > +
> > union bpf_iter_link_info {
> > struct {
> > __u32 map_fd;
> > } map;
> > +
> > + /* cgroup_iter walks either the live descendants of a cgroup subtree, or the ancestors
> > + * of a given cgroup.
> > + */
> > + struct {
> > + /* Cgroup file descriptor. This is root of the subtree if for walking the
> > + * descendants; this is the starting cgroup if for walking the ancestors.
> > + */
> > + __u32 cgroup_fd;
> > + __u32 traversal_order;
> > + } cgroup;
> > };
> >
> > /* BPF syscall commands, see bpf(2) man-page for more details. */
> > @@ -6050,6 +6067,10 @@ struct bpf_link_info {
> > struct {
> > __u32 map_id;
> > } map;
> > + struct {
> > + __u32 traversal_order;
> > + __aligned_u64 cgroup_id;
> > + } cgroup;
> > };
> > } iter;
> > struct {
> > diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> > index 057ba8e01e70f..9741b9314fb46 100644
> > --- a/kernel/bpf/Makefile
> > +++ b/kernel/bpf/Makefile
> > @@ -8,7 +8,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
> >
> > obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
> > obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
> > -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
> > +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o cgroup_iter.o
> > obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
> > obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
> > obj-$(CONFIG_BPF_SYSCALL) += disasm.o
> > diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
> > new file mode 100644
> > index 0000000000000..88deb655efa71
> > --- /dev/null
> > +++ b/kernel/bpf/cgroup_iter.c
> > @@ -0,0 +1,235 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/* Copyright (c) 2022 Google */
> > +#include <linux/bpf.h>
> > +#include <linux/btf_ids.h>
> > +#include <linux/cgroup.h>
> > +#include <linux/kernel.h>
> > +#include <linux/seq_file.h>
> > +
> > +#include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */
> > +
> > +/* cgroup_iter provides two modes of traversal to the cgroup hierarchy.
> > + *
> > + * 1. Walk the descendants of a cgroup.
> > + * 2. Walk the ancestors of a cgroup.
>
> three modes here?
>
> > + *
> > + * For walking descendants, cgroup_iter can walk in either pre-order or
> > + * post-order. For walking ancestors, the iter walks up from a cgroup to
> > + * the root.
> > + *
> > + * The iter program can terminate the walk early by returning 1. Walk
> > + * continues if prog returns 0.
> > + *
> > + * The prog can check (seq->num == 0) to determine whether this is
> > + * the first element. The prog may also be passed a NULL cgroup,
> > + * which means the walk has completed and the prog has a chance to
> > + * do post-processing, such as outputing an epilogue.
> > + *
> > + * Note: the iter_prog is called with cgroup_mutex held.
> > + */
> > +
> > +struct bpf_iter__cgroup {
> > + __bpf_md_ptr(struct bpf_iter_meta *, meta);
> > + __bpf_md_ptr(struct cgroup *, cgroup);
> > +};
> > +
> > +struct cgroup_iter_priv {
> > + struct cgroup_subsys_state *start_css;
> > + bool terminate;
> > + int order;
> > +};
> > +
> > +static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
> > +{
> > + struct cgroup_iter_priv *p = seq->private;
> > +
> > + mutex_lock(&cgroup_mutex);
> > +
> > + /* support only one session */
> > + if (*pos > 0)
> > + return NULL;
> > +
> > + ++*pos;
> > + p->terminate = false;
> > + if (p->order == BPF_ITER_CGROUP_PRE)
> > + return css_next_descendant_pre(NULL, p->start_css);
> > + else if (p->order == BPF_ITER_CGROUP_POST)
> > + return css_next_descendant_post(NULL, p->start_css);
> > + else /* BPF_ITER_CGROUP_PARENT_UP */
> > + return p->start_css;
> > +}
> > +
> > +static int __cgroup_iter_seq_show(struct seq_file *seq,
> > + struct cgroup_subsys_state *css, int in_stop);
> > +
> > +static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
> > +{
> > + /* pass NULL to the prog for post-processing */
> > + if (!v)
> > + __cgroup_iter_seq_show(seq, NULL, true);
> > + mutex_unlock(&cgroup_mutex);
> > +}
> > +
> > +static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> > +{
> > + struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
> > + struct cgroup_iter_priv *p = seq->private;
> > +
> > + ++*pos;
> > + if (p->terminate)
> > + return NULL;
> > +
> > + if (p->order == BPF_ITER_CGROUP_PRE)
> > + return css_next_descendant_pre(curr, p->start_css);
> > + else if (p->order == BPF_ITER_CGROUP_POST)
> > + return css_next_descendant_post(curr, p->start_css);
> > + else
> > + return curr->parent;
> > +}
> > +
> > +static int __cgroup_iter_seq_show(struct seq_file *seq,
> > + struct cgroup_subsys_state *css, int in_stop)
> > +{
> > + struct cgroup_iter_priv *p = seq->private;
> > + struct bpf_iter__cgroup ctx;
> > + struct bpf_iter_meta meta;
> > + struct bpf_prog *prog;
> > + int ret = 0;
> > +
> > + /* cgroup is dead, skip this element */
> > + if (css && cgroup_is_dead(css->cgroup))
> > + return 0;
> > +
> > + ctx.meta = &meta;
> > + ctx.cgroup = css ? css->cgroup : NULL;
> > + meta.seq = seq;
> > + prog = bpf_iter_get_info(&meta, in_stop);
> > + if (prog)
> > + ret = bpf_iter_run_prog(prog, &ctx);
> > +
> > + /* if prog returns > 0, terminate after this element. */
> > + if (ret != 0)
> > + p->terminate = true;
> > +
> > + return 0;
> > +}
> > +
> > +static int cgroup_iter_seq_show(struct seq_file *seq, void *v)
> > +{
> > + return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v,
> > + false);
> > +}
> > +
> > +static const struct seq_operations cgroup_iter_seq_ops = {
> > + .start = cgroup_iter_seq_start,
> > + .next = cgroup_iter_seq_next,
> > + .stop = cgroup_iter_seq_stop,
> > + .show = cgroup_iter_seq_show,
> > +};
> > +
> > +BTF_ID_LIST_SINGLE(bpf_cgroup_btf_id, struct, cgroup)
> > +
> > +static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux)
> > +{
> > + struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
> > + struct cgroup *cgrp = aux->cgroup.start;
> > +
> > + p->start_css = &cgrp->self;
> > + p->terminate = false;
> > + p->order = aux->cgroup.order;
> > + return 0;
> > +}
> > +
> > +static const struct bpf_iter_seq_info cgroup_iter_seq_info = {
> > + .seq_ops = &cgroup_iter_seq_ops,
> > + .init_seq_private = cgroup_iter_seq_init,
> > + .seq_priv_size = sizeof(struct cgroup_iter_priv),
> > +};
> > +
> > +static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
> > + union bpf_iter_link_info *linfo,
> > + struct bpf_iter_aux_info *aux)
> > +{
> > + int fd = linfo->cgroup.cgroup_fd;
> > + struct cgroup *cgrp;
> > +
> > + if (fd)
> > + cgrp = cgroup_get_from_fd(fd);
> > + else /* walk the entire hierarchy by default. */
> > + cgrp = cgroup_get_from_path("/");
> > +
> > + if (IS_ERR(cgrp))
> > + return PTR_ERR(cgrp);
> > +
> > + aux->cgroup.start = cgrp;
> > + aux->cgroup.order = linfo->cgroup.traversal_order;
>
> The legality of traversal_order should be checked.
>
> > + return 0;
> > +}
> > +
> > +static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info *aux)
> > +{
> > + cgroup_put(aux->cgroup.start);
> > +}
> > +
> > +static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux,
> > + struct seq_file *seq)
> > +{
> > + char *buf;
> > +
> > + buf = kzalloc(PATH_MAX, GFP_KERNEL);
> > + if (!buf) {
> > + seq_puts(seq, "cgroup_path:\n");
>
> This is a really unlikely case. maybe "cgroup_path:<unknown>"?
>
> > + goto show_order;
> > + }
> > +
> > + /* If cgroup_path_ns() fails, buf will be an empty string, cgroup_path
> > + * will print nothing.
> > + *
> > + * Path is in the calling process's cgroup namespace.
> > + */
> > + cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX,
> > + current->nsproxy->cgroup_ns);
> > + seq_printf(seq, "cgroup_path:\t%s\n", buf);
> > + kfree(buf);
> > +
> > +show_order:
> > + if (aux->cgroup.order == BPF_ITER_CGROUP_PRE)
> > + seq_puts(seq, "traversal_order: pre\n");
> > + else if (aux->cgroup.order == BPF_ITER_CGROUP_POST)
> > + seq_puts(seq, "traversal_order: post\n");
> > + else /* BPF_ITER_CGROUP_PARENT_UP */
> > + seq_puts(seq, "traversal_order: parent_up\n");
> > +}
> > +
> [...]

2022-06-28 06:14:03

by Yosry Ahmed

[permalink] [raw]

Subject: Re: [PATCH bpf-next v2 4/8] bpf: Introduce cgroup iter

On Mon, Jun 27, 2022 at 9:14 PM Yonghong Song <[email protected]> wrote:
>
>
>
> On 6/10/22 12:44 PM, Yosry Ahmed wrote:
> > From: Hao Luo <[email protected]>
> >
> > Cgroup_iter is a type of bpf_iter. It walks over cgroups in two modes:
> >
> > - walking a cgroup's descendants.
> > - walking a cgroup's ancestors.
> >
> > When attaching cgroup_iter, one can set a cgroup to the iter_link
> > created from attaching. This cgroup is passed as a file descriptor and
> > serves as the starting point of the walk. If no cgroup is specified,
> > the starting point will be the root cgroup.
> >
> > For walking descendants, one can specify the order: either pre-order or
> > post-order. For walking ancestors, the walk starts at the specified
> > cgroup and ends at the root.
> >
> > One can also terminate the walk early by returning 1 from the iter
> > program.
> >
> > Note that because walking cgroup hierarchy holds cgroup_mutex, the iter
> > program is called with cgroup_mutex held.
> >
> > Signed-off-by: Hao Luo <[email protected]>
> > Signed-off-by: Yosry Ahmed <[email protected]>
> > ---
> > include/linux/bpf.h | 8 ++
> > include/uapi/linux/bpf.h | 21 +++
> > kernel/bpf/Makefile | 2 +-
> > kernel/bpf/cgroup_iter.c | 235 +++++++++++++++++++++++++++++++++
> > tools/include/uapi/linux/bpf.h | 21 +++
> > 5 files changed, 286 insertions(+), 1 deletion(-)
> > create mode 100644 kernel/bpf/cgroup_iter.c
> >
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index 8e6092d0ea956..48d8e836b9748 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -44,6 +44,7 @@ struct kobject;
> > struct mem_cgroup;
> > struct module;
> > struct bpf_func_state;
> > +struct cgroup;
> >
> > extern struct idr btf_idr;
> > extern spinlock_t btf_idr_lock;
> > @@ -1590,7 +1591,14 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
> > int __init bpf_iter_ ## target(args) { return 0; }
> >
> > struct bpf_iter_aux_info {
> > + /* for map_elem iter */
> > struct bpf_map *map;
> > +
> > + /* for cgroup iter */
> > + struct {
> > + struct cgroup *start; /* starting cgroup */
> > + int order;
> > + } cgroup;
> > };
> >
> [...]
> > +
> > +static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
> > +{
> > + struct cgroup_iter_priv *p = seq->private;
> > +
> > + mutex_lock(&cgroup_mutex);
> > +
> > + /* support only one session */
> > + if (*pos > 0)
> > + return NULL;
> > +
> > + ++*pos;
> > + p->terminate = false;
> > + if (p->order == BPF_ITER_CGROUP_PRE)
> > + return css_next_descendant_pre(NULL, p->start_css);
> > + else if (p->order == BPF_ITER_CGROUP_POST)
> > + return css_next_descendant_post(NULL, p->start_css);
> > + else /* BPF_ITER_CGROUP_PARENT_UP */
> > + return p->start_css;
> > +}
> > +
> > +static int __cgroup_iter_seq_show(struct seq_file *seq,
> > + struct cgroup_subsys_state *css, int in_stop);
> > +
> > +static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
> > +{
> > + /* pass NULL to the prog for post-processing */
> > + if (!v)
> > + __cgroup_iter_seq_show(seq, NULL, true);
> > + mutex_unlock(&cgroup_mutex);
> > +}
> > +
> > +static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> > +{
> > + struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
> > + struct cgroup_iter_priv *p = seq->private;
> > +
> > + ++*pos;
> > + if (p->terminate)
> > + return NULL;
> > +
> > + if (p->order == BPF_ITER_CGROUP_PRE)
> > + return css_next_descendant_pre(curr, p->start_css);
> > + else if (p->order == BPF_ITER_CGROUP_POST)
> > + return css_next_descendant_post(curr, p->start_css);
> > + else
> > + return curr->parent;
> > +}
> > +
> > +static int __cgroup_iter_seq_show(struct seq_file *seq,
> > + struct cgroup_subsys_state *css, int in_stop)
> > +{
> > + struct cgroup_iter_priv *p = seq->private;
> > + struct bpf_iter__cgroup ctx;
> > + struct bpf_iter_meta meta;
> > + struct bpf_prog *prog;
> > + int ret = 0;
> > +
> > + /* cgroup is dead, skip this element */
> > + if (css && cgroup_is_dead(css->cgroup))
> > + return 0;
> > +
> > + ctx.meta = &meta;
> > + ctx.cgroup = css ? css->cgroup : NULL;
> > + meta.seq = seq;
> > + prog = bpf_iter_get_info(&meta, in_stop);
> > + if (prog)
> > + ret = bpf_iter_run_prog(prog, &ctx);
>
> Do we need to do anything special to ensure bpf program gets
> up-to-date stat from ctx.cgroup?

Later patches in the series add cgroup_flush_rstat() kfunc which
flushes cgroup stats that use rstat (e.g. memcg stats). It can be
called directly from the bpf program if needed.

It would be better to leave this to the bpf program, it's an
unnecessary toll to flush the stats for any cgroup_iter program, that
could be not accessing stats, or stats that are not maintained using
rstat.

>
> > +
> > + /* if prog returns > 0, terminate after this element. */
> > + if (ret != 0)
> > + p->terminate = true;
> > +
> > + return 0;
> > +}
> > +
> [...]

2022-06-28 17:44:06

by Yonghong Song

[permalink] [raw]

Subject: Re: [PATCH bpf-next v2 4/8] bpf: Introduce cgroup iter

On 6/27/22 11:03 PM, Yosry Ahmed wrote:
> On Mon, Jun 27, 2022 at 9:14 PM Yonghong Song <[email protected]> wrote:
>>
>>
>>
>> On 6/10/22 12:44 PM, Yosry Ahmed wrote:
>>> From: Hao Luo <[email protected]>
>>>
>>> Cgroup_iter is a type of bpf_iter. It walks over cgroups in two modes:
>>>
>>> - walking a cgroup's descendants.
>>> - walking a cgroup's ancestors.
>>>
>>> When attaching cgroup_iter, one can set a cgroup to the iter_link
>>> created from attaching. This cgroup is passed as a file descriptor and
>>> serves as the starting point of the walk. If no cgroup is specified,
>>> the starting point will be the root cgroup.
>>>
>>> For walking descendants, one can specify the order: either pre-order or
>>> post-order. For walking ancestors, the walk starts at the specified
>>> cgroup and ends at the root.
>>>
>>> One can also terminate the walk early by returning 1 from the iter
>>> program.
>>>
>>> Note that because walking cgroup hierarchy holds cgroup_mutex, the iter
>>> program is called with cgroup_mutex held.
>>>
>>> Signed-off-by: Hao Luo <[email protected]>
>>> Signed-off-by: Yosry Ahmed <[email protected]>
>>> ---
>>> include/linux/bpf.h | 8 ++
>>> include/uapi/linux/bpf.h | 21 +++
>>> kernel/bpf/Makefile | 2 +-
>>> kernel/bpf/cgroup_iter.c | 235 +++++++++++++++++++++++++++++++++
>>> tools/include/uapi/linux/bpf.h | 21 +++
>>> 5 files changed, 286 insertions(+), 1 deletion(-)
>>> create mode 100644 kernel/bpf/cgroup_iter.c
>>>
>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>>> index 8e6092d0ea956..48d8e836b9748 100644
>>> --- a/include/linux/bpf.h
>>> +++ b/include/linux/bpf.h
>>> @@ -44,6 +44,7 @@ struct kobject;
>>> struct mem_cgroup;
>>> struct module;
>>> struct bpf_func_state;
>>> +struct cgroup;
>>>
>>> extern struct idr btf_idr;
>>> extern spinlock_t btf_idr_lock;
>>> @@ -1590,7 +1591,14 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
>>> int __init bpf_iter_ ## target(args) { return 0; }
>>>
>>> struct bpf_iter_aux_info {
>>> + /* for map_elem iter */
>>> struct bpf_map *map;
>>> +
>>> + /* for cgroup iter */
>>> + struct {
>>> + struct cgroup *start; /* starting cgroup */
>>> + int order;
>>> + } cgroup;
>>> };
>>>
>> [...]
>>> +
>>> +static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
>>> +{
>>> + struct cgroup_iter_priv *p = seq->private;
>>> +
>>> + mutex_lock(&cgroup_mutex);
>>> +
>>> + /* support only one session */
>>> + if (*pos > 0)
>>> + return NULL;
>>> +
>>> + ++*pos;
>>> + p->terminate = false;
>>> + if (p->order == BPF_ITER_CGROUP_PRE)
>>> + return css_next_descendant_pre(NULL, p->start_css);
>>> + else if (p->order == BPF_ITER_CGROUP_POST)
>>> + return css_next_descendant_post(NULL, p->start_css);
>>> + else /* BPF_ITER_CGROUP_PARENT_UP */
>>> + return p->start_css;
>>> +}
>>> +
>>> +static int __cgroup_iter_seq_show(struct seq_file *seq,
>>> + struct cgroup_subsys_state *css, int in_stop);
>>> +
>>> +static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
>>> +{
>>> + /* pass NULL to the prog for post-processing */
>>> + if (!v)
>>> + __cgroup_iter_seq_show(seq, NULL, true);
>>> + mutex_unlock(&cgroup_mutex);
>>> +}
>>> +
>>> +static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
>>> +{
>>> + struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
>>> + struct cgroup_iter_priv *p = seq->private;
>>> +
>>> + ++*pos;
>>> + if (p->terminate)
>>> + return NULL;
>>> +
>>> + if (p->order == BPF_ITER_CGROUP_PRE)
>>> + return css_next_descendant_pre(curr, p->start_css);
>>> + else if (p->order == BPF_ITER_CGROUP_POST)
>>> + return css_next_descendant_post(curr, p->start_css);
>>> + else
>>> + return curr->parent;
>>> +}
>>> +
>>> +static int __cgroup_iter_seq_show(struct seq_file *seq,
>>> + struct cgroup_subsys_state *css, int in_stop)
>>> +{
>>> + struct cgroup_iter_priv *p = seq->private;
>>> + struct bpf_iter__cgroup ctx;
>>> + struct bpf_iter_meta meta;
>>> + struct bpf_prog *prog;
>>> + int ret = 0;
>>> +
>>> + /* cgroup is dead, skip this element */
>>> + if (css && cgroup_is_dead(css->cgroup))
>>> + return 0;
>>> +
>>> + ctx.meta = &meta;
>>> + ctx.cgroup = css ? css->cgroup : NULL;
>>> + meta.seq = seq;
>>> + prog = bpf_iter_get_info(&meta, in_stop);
>>> + if (prog)
>>> + ret = bpf_iter_run_prog(prog, &ctx);
>>
>> Do we need to do anything special to ensure bpf program gets
>> up-to-date stat from ctx.cgroup?
>
> Later patches in the series add cgroup_flush_rstat() kfunc which
> flushes cgroup stats that use rstat (e.g. memcg stats). It can be
> called directly from the bpf program if needed.
>
> It would be better to leave this to the bpf program, it's an
> unnecessary toll to flush the stats for any cgroup_iter program, that
> could be not accessing stats, or stats that are not maintained using
> rstat.

Okay, this should work.

>
>>
>>> +
>>> + /* if prog returns > 0, terminate after this element. */
>>> + if (ret != 0)
>>> + p->terminate = true;
>>> +
>>> + return 0;
>>> +}
>>> +
>> [...]

2022-07-07 23:38:18

by Hao Luo

[permalink] [raw]

Subject: Re: [PATCH bpf-next v2 4/8] bpf: Introduce cgroup iter

2022-07-07 23:56:03

by Hao Luo

[permalink] [raw]

Subject: Re: [PATCH bpf-next v2 4/8] bpf: Introduce cgroup iter

On Mon, Jun 27, 2022 at 9:14 PM Yonghong Song <[email protected]> wrote:
>
> > +static int __cgroup_iter_seq_show(struct seq_file *seq,
> > + struct cgroup_subsys_state *css, int in_stop)
> > +{
> > + struct cgroup_iter_priv *p = seq->private;
> > + struct bpf_iter__cgroup ctx;
> > + struct bpf_iter_meta meta;
> > + struct bpf_prog *prog;
> > + int ret = 0;
> > +
> > + /* cgroup is dead, skip this element */
> > + if (css && cgroup_is_dead(css->cgroup))
> > + return 0;
> > +
> > + ctx.meta = &meta;
> > + ctx.cgroup = css ? css->cgroup : NULL;
> > + meta.seq = seq;
> > + prog = bpf_iter_get_info(&meta, in_stop);
> > + if (prog)
> > + ret = bpf_iter_run_prog(prog, &ctx);
>
> Do we need to do anything special to ensure bpf program gets
> up-to-date stat from ctx.cgroup?
>

Let's leave that to be handled by bpf programs. The kfunc rstat_flush
can be called to sync stats, if using rstat.

> > +
> > + /* if prog returns > 0, terminate after this element. */
> > + if (ret != 0)
> > + p->terminate = true;
> > +
> > + return 0;
> > +}
> > +
> [...]