Every cgroup knows all its ancestors through its ->ancestor_ids[]. While all
the existing users only need the IDs, there's no advantage to remembering
the IDs instead of the pointers directly. Let's replace
cgroup->ancestor_ids[] with ->ancestors[] so that it's easy to access non-ID
ancestor fields.
This patch shouldn't cause any behavior differences.
Signed-off-by: Tejun Heo <[email protected]>
---
Namhyung, I think the change in bperf_cgroup is correct but couldn't figure
out how to test it (normal perf build wouldn't compile it). Can you please
see whether it's correct?
Thanks.
include/linux/cgroup-defs.h | 10 +++++-----
include/linux/cgroup.h | 2 +-
kernel/cgroup/cgroup.c | 7 +++----
net/netfilter/nft_socket.c | 5 +++--
tools/perf/util/bpf_skel/bperf_cgroup.bpf.c | 2 +-
5 files changed, 13 insertions(+), 13 deletions(-)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 63bf43c7ca3b..51fa744c2e9d 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -379,7 +379,7 @@ struct cgroup {
/*
* The depth this cgroup is at. The root is at depth zero and each
* step down the hierarchy increments the level. This along with
- * ancestor_ids[] can determine whether a given cgroup is a
+ * ancestors[] can determine whether a given cgroup is a
* descendant of another without traversing the hierarchy.
*/
int level;
@@ -499,8 +499,8 @@ struct cgroup {
/* Used to store internal freezer state */
struct cgroup_freezer_state freezer;
- /* ids of the ancestors at each level including self */
- u64 ancestor_ids[];
+ /* All ancestors including self */
+ struct cgroup *ancestors[];
};
/*
@@ -520,8 +520,8 @@ struct cgroup_root {
/* The root cgroup. Root is destroyed on its release. */
struct cgroup cgrp;
- /* for cgrp->ancestor_ids[0] */
- u64 cgrp_ancestor_id_storage;
+ /* for cgrp->ancestors[0] */
+ u64 cgrp_ancestor_storage;
/* Number of cgroups in the hierarchy, used only for /proc/cgroups */
atomic_t nr_cgrps;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ed53bfe7c46c..5334b6134399 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -574,7 +574,7 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp,
{
if (cgrp->root != ancestor->root || cgrp->level < ancestor->level)
return false;
- return cgrp->ancestor_ids[ancestor->level] == cgroup_id(ancestor);
+ return cgrp->ancestors[ancestor->level] == ancestor;
}
/**
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 85fa4c8587a8..ce587fe43dab 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2047,7 +2047,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
}
root_cgrp->kn = kernfs_root_to_node(root->kf_root);
WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
- root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
+ root_cgrp->ancestors[0] = root_cgrp;
ret = css_populate_dir(&root_cgrp->self);
if (ret)
@@ -5391,8 +5391,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
int ret;
/* allocate the cgroup and its ID, 0 is reserved for the root */
- cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
- GFP_KERNEL);
+ cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL);
if (!cgrp)
return ERR_PTR(-ENOMEM);
@@ -5444,7 +5443,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
spin_lock_irq(&css_set_lock);
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
- cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
+ cgrp->ancestors[tcgrp->level] = tcgrp;
if (tcgrp != cgrp) {
tcgrp->nr_descendants++;
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index 05ae5a338b6f..d64784d4bd02 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -40,6 +40,7 @@ static noinline bool
nft_sock_get_eval_cgroupv2(u32 *dest, struct sock *sk, const struct nft_pktinfo *pkt, u32 level)
{
struct cgroup *cgrp;
+ u64 cgid;
if (!sk_fullsock(sk))
return false;
@@ -48,8 +49,8 @@ nft_sock_get_eval_cgroupv2(u32 *dest, struct sock *sk, const struct nft_pktinfo
if (level > cgrp->level)
return false;
- memcpy(dest, &cgrp->ancestor_ids[level], sizeof(u64));
-
+ cgid = cgroup_id(cgrp->ancestors[level]);
+ memcpy(dest, &cgid, sizeof(u64));
return true;
}
#endif
diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
index 292c430768b5..bd6a420acc8f 100644
--- a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
+++ b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
@@ -68,7 +68,7 @@ static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
break;
// convert cgroup-id to a map index
- cgrp_id = BPF_CORE_READ(cgrp, ancestor_ids[i]);
+ cgrp_id = BPF_CORE_READ(cgrp, ancestors[i], kn, id);
elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
if (!elem)
continue;
Hi Tejun,
On Fri, Jul 29, 2022 at 12:06 PM Tejun Heo <[email protected]> wrote:
>
> Every cgroup knows all its ancestors through its ->ancestor_ids[]. While all
> the existing users only need the IDs, there's no advantage to remembering
> the IDs instead of the pointers directly. Let's replace
> cgroup->ancestor_ids[] with ->ancestors[] so that it's easy to access non-ID
> ancestor fields.
>
> This patch shouldn't cause any behavior differences.
>
> Signed-off-by: Tejun Heo <[email protected]>
> ---
> Namhyung, I think the change in bperf_cgroup is correct but couldn't figure
> out how to test it (normal perf build wouldn't compile it). Can you please
> see whether it's correct?
Looks ok to me. For the perf part,
Acked-by: Namhyung Kim <[email protected]>
Note that you need to pass BUILD_BPF_SKEL=1 when you build perf
to enable BPF stuff. IIRC Arnaldo will make it default at some point.
Thanks,
Namhyung
>
> Thanks.
>
> include/linux/cgroup-defs.h | 10 +++++-----
> include/linux/cgroup.h | 2 +-
> kernel/cgroup/cgroup.c | 7 +++----
> net/netfilter/nft_socket.c | 5 +++--
> tools/perf/util/bpf_skel/bperf_cgroup.bpf.c | 2 +-
> 5 files changed, 13 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
> index 63bf43c7ca3b..51fa744c2e9d 100644
> --- a/include/linux/cgroup-defs.h
> +++ b/include/linux/cgroup-defs.h
> @@ -379,7 +379,7 @@ struct cgroup {
> /*
> * The depth this cgroup is at. The root is at depth zero and each
> * step down the hierarchy increments the level. This along with
> - * ancestor_ids[] can determine whether a given cgroup is a
> + * ancestors[] can determine whether a given cgroup is a
> * descendant of another without traversing the hierarchy.
> */
> int level;
> @@ -499,8 +499,8 @@ struct cgroup {
> /* Used to store internal freezer state */
> struct cgroup_freezer_state freezer;
>
> - /* ids of the ancestors at each level including self */
> - u64 ancestor_ids[];
> + /* All ancestors including self */
> + struct cgroup *ancestors[];
> };
>
> /*
> @@ -520,8 +520,8 @@ struct cgroup_root {
> /* The root cgroup. Root is destroyed on its release. */
> struct cgroup cgrp;
>
> - /* for cgrp->ancestor_ids[0] */
> - u64 cgrp_ancestor_id_storage;
> + /* for cgrp->ancestors[0] */
> + u64 cgrp_ancestor_storage;
>
> /* Number of cgroups in the hierarchy, used only for /proc/cgroups */
> atomic_t nr_cgrps;
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index ed53bfe7c46c..5334b6134399 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -574,7 +574,7 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp,
> {
> if (cgrp->root != ancestor->root || cgrp->level < ancestor->level)
> return false;
> - return cgrp->ancestor_ids[ancestor->level] == cgroup_id(ancestor);
> + return cgrp->ancestors[ancestor->level] == ancestor;
> }
>
> /**
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 85fa4c8587a8..ce587fe43dab 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -2047,7 +2047,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
> }
> root_cgrp->kn = kernfs_root_to_node(root->kf_root);
> WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
> - root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
> + root_cgrp->ancestors[0] = root_cgrp;
>
> ret = css_populate_dir(&root_cgrp->self);
> if (ret)
> @@ -5391,8 +5391,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
> int ret;
>
> /* allocate the cgroup and its ID, 0 is reserved for the root */
> - cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
> - GFP_KERNEL);
> + cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL);
> if (!cgrp)
> return ERR_PTR(-ENOMEM);
>
> @@ -5444,7 +5443,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
>
> spin_lock_irq(&css_set_lock);
> for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
> - cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
> + cgrp->ancestors[tcgrp->level] = tcgrp;
>
> if (tcgrp != cgrp) {
> tcgrp->nr_descendants++;
> diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
> index 05ae5a338b6f..d64784d4bd02 100644
> --- a/net/netfilter/nft_socket.c
> +++ b/net/netfilter/nft_socket.c
> @@ -40,6 +40,7 @@ static noinline bool
> nft_sock_get_eval_cgroupv2(u32 *dest, struct sock *sk, const struct nft_pktinfo *pkt, u32 level)
> {
> struct cgroup *cgrp;
> + u64 cgid;
>
> if (!sk_fullsock(sk))
> return false;
> @@ -48,8 +49,8 @@ nft_sock_get_eval_cgroupv2(u32 *dest, struct sock *sk, const struct nft_pktinfo
> if (level > cgrp->level)
> return false;
>
> - memcpy(dest, &cgrp->ancestor_ids[level], sizeof(u64));
> -
> + cgid = cgroup_id(cgrp->ancestors[level]);
> + memcpy(dest, &cgid, sizeof(u64));
> return true;
> }
> #endif
> diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
> index 292c430768b5..bd6a420acc8f 100644
> --- a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
> +++ b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
> @@ -68,7 +68,7 @@ static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
> break;
>
> // convert cgroup-id to a map index
> - cgrp_id = BPF_CORE_READ(cgrp, ancestor_ids[i]);
> + cgrp_id = BPF_CORE_READ(cgrp, ancestors[i], kn, id);
> elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
> if (!elem)
> continue;
Every cgroup knows all its ancestors through its ->ancestor_ids[]. There's
no advantage to remembering the IDs instead of the pointers directly and
this makes the array useless for finding an actual ancestor cgroup forcing
cgroup_ancestor() to iteratively walk up the hierarchy instead. Let's
replace cgroup->ancestor_ids[] with ->ancestors[] and remove the walking-up
from cgroup_ancestor().
This patch shouldn't cause user-visible behavior differences.
v2: Update cgroup_ancestor() to use ->ancestors[].
Signed-off-by: Tejun Heo <[email protected]>
Acked-by: Namhyung Kim <[email protected]>
---
include/linux/cgroup-defs.h | 10 +++++-----
include/linux/cgroup.h | 8 +++-----
kernel/cgroup/cgroup.c | 7 +++----
net/netfilter/nft_socket.c | 9 +++++----
tools/perf/util/bpf_skel/bperf_cgroup.bpf.c | 2 +-
5 files changed, 17 insertions(+), 19 deletions(-)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 63bf43c7ca3b..51fa744c2e9d 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -379,7 +379,7 @@ struct cgroup {
/*
* The depth this cgroup is at. The root is at depth zero and each
* step down the hierarchy increments the level. This along with
- * ancestor_ids[] can determine whether a given cgroup is a
+ * ancestors[] can determine whether a given cgroup is a
* descendant of another without traversing the hierarchy.
*/
int level;
@@ -499,8 +499,8 @@ struct cgroup {
/* Used to store internal freezer state */
struct cgroup_freezer_state freezer;
- /* ids of the ancestors at each level including self */
- u64 ancestor_ids[];
+ /* All ancestors including self */
+ struct cgroup *ancestors[];
};
/*
@@ -520,8 +520,8 @@ struct cgroup_root {
/* The root cgroup. Root is destroyed on its release. */
struct cgroup cgrp;
- /* for cgrp->ancestor_ids[0] */
- u64 cgrp_ancestor_id_storage;
+ /* for cgrp->ancestors[0] */
+ u64 cgrp_ancestor_storage;
/* Number of cgroups in the hierarchy, used only for /proc/cgroups */
atomic_t nr_cgrps;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ed53bfe7c46c..4d143729b246 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -574,7 +574,7 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp,
{
if (cgrp->root != ancestor->root || cgrp->level < ancestor->level)
return false;
- return cgrp->ancestor_ids[ancestor->level] == cgroup_id(ancestor);
+ return cgrp->ancestors[ancestor->level] == ancestor;
}
/**
@@ -591,11 +591,9 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp,
static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp,
int ancestor_level)
{
- if (cgrp->level < ancestor_level)
+ if (ancestor_level < 0 || ancestor_level > cgrp->level)
return NULL;
- while (cgrp && cgrp->level > ancestor_level)
- cgrp = cgroup_parent(cgrp);
- return cgrp;
+ return cgrp->ancestors[ancestor_level];
}
/**
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 85fa4c8587a8..ce587fe43dab 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2047,7 +2047,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
}
root_cgrp->kn = kernfs_root_to_node(root->kf_root);
WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
- root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
+ root_cgrp->ancestors[0] = root_cgrp;
ret = css_populate_dir(&root_cgrp->self);
if (ret)
@@ -5391,8 +5391,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
int ret;
/* allocate the cgroup and its ID, 0 is reserved for the root */
- cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
- GFP_KERNEL);
+ cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL);
if (!cgrp)
return ERR_PTR(-ENOMEM);
@@ -5444,7 +5443,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
spin_lock_irq(&css_set_lock);
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
- cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
+ cgrp->ancestors[tcgrp->level] = tcgrp;
if (tcgrp != cgrp) {
tcgrp->nr_descendants++;
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index 05ae5a338b6f..d982a7c22a77 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -40,16 +40,17 @@ static noinline bool
nft_sock_get_eval_cgroupv2(u32 *dest, struct sock *sk, const struct nft_pktinfo *pkt, u32 level)
{
struct cgroup *cgrp;
+ u64 cgid;
if (!sk_fullsock(sk))
return false;
- cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- if (level > cgrp->level)
+ cgrp = cgroup_ancestor(sock_cgroup_ptr(&sk->sk_cgrp_data), level);
+ if (!cgrp)
return false;
- memcpy(dest, &cgrp->ancestor_ids[level], sizeof(u64));
-
+ cgid = cgroup_id(cgrp);
+ memcpy(dest, &cgid, sizeof(u64));
return true;
}
#endif
diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
index 292c430768b5..bd6a420acc8f 100644
--- a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
+++ b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
@@ -68,7 +68,7 @@ static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
break;
// convert cgroup-id to a map index
- cgrp_id = BPF_CORE_READ(cgrp, ancestor_ids[i]);
+ cgrp_id = BPF_CORE_READ(cgrp, ancestors[i], kn, id);
elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
if (!elem)
continue;
On Fri, Jul 29, 2022 at 10:58:22AM -1000, Tejun Heo <[email protected]> wrote:
> @@ -520,8 +520,8 @@ struct cgroup_root {
> /* The root cgroup. Root is destroyed on its release. */
> struct cgroup cgrp;
>
> - /* for cgrp->ancestor_ids[0] */
> - u64 cgrp_ancestor_id_storage;
> + /* for cgrp->ancestors[0] */
> + u64 cgrp_ancestor_storage;
Just noticed, this member is (and was AFAICS) superfluous.
Michal
On Sat, Jul 30, 2022 at 12:42:09AM +0200, Michal Koutný wrote:
> On Fri, Jul 29, 2022 at 10:58:22AM -1000, Tejun Heo <[email protected]> wrote:
> > @@ -520,8 +520,8 @@ struct cgroup_root {
> > /* The root cgroup. Root is destroyed on its release. */
> > struct cgroup cgrp;
> >
> > - /* for cgrp->ancestor_ids[0] */
> > - u64 cgrp_ancestor_id_storage;
> > + /* for cgrp->ancestors[0] */
> > + u64 cgrp_ancestor_storage;
>
> Just noticed, this member is (and was AFAICS) superfluous.
I should have changed the type to struct cgroup * but that's the space into
which cgroup_root->cgrp->ancestors[] stretch into.
Thanks.
--
tejun
Every cgroup knows all its ancestors through its ->ancestor_ids[]. There's
no advantage to remembering the IDs instead of the pointers directly and
this makes the array useless for finding an actual ancestor cgroup forcing
cgroup_ancestor() to iteratively walk up the hierarchy instead. Let's
replace cgroup->ancestor_ids[] with ->ancestors[] and remove the walking-up
from cgroup_ancestor().
While at it, improve comments around cgroup_root->cgrp_ancestor_storage.
This patch shouldn't cause user-visible behavior differences.
v2: Update cgroup_ancestor() to use ->ancestors[].
v3: cgroup_root->cgrp_ancestor_storage's type is updated to match
cgroup->ancestors[]. Better comments.
Signed-off-by: Tejun Heo <[email protected]>
Acked-by: Namhyung Kim <[email protected]>
---
include/linux/cgroup-defs.h | 16 ++++++++++------
include/linux/cgroup.h | 8 +++-----
kernel/cgroup/cgroup.c | 7 +++----
net/netfilter/nft_socket.c | 9 +++++----
tools/perf/util/bpf_skel/bperf_cgroup.bpf.c | 2 +-
5 files changed, 22 insertions(+), 20 deletions(-)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 63bf43c7ca3b..52a3c47c89bc 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -379,7 +379,7 @@ struct cgroup {
/*
* The depth this cgroup is at. The root is at depth zero and each
* step down the hierarchy increments the level. This along with
- * ancestor_ids[] can determine whether a given cgroup is a
+ * ancestors[] can determine whether a given cgroup is a
* descendant of another without traversing the hierarchy.
*/
int level;
@@ -499,8 +499,8 @@ struct cgroup {
/* Used to store internal freezer state */
struct cgroup_freezer_state freezer;
- /* ids of the ancestors at each level including self */
- u64 ancestor_ids[];
+ /* All ancestors including self */
+ struct cgroup *ancestors[];
};
/*
@@ -517,11 +517,15 @@ struct cgroup_root {
/* Unique id for this hierarchy. */
int hierarchy_id;
- /* The root cgroup. Root is destroyed on its release. */
+ /*
+ * The root cgroup. The containing cgroup_root will be destroyed on its
+ * release. cgrp->ancestors[0] will be used overflowing into the
+ * following field. cgrp_ancestor_storage must immediately follow.
+ */
struct cgroup cgrp;
- /* for cgrp->ancestor_ids[0] */
- u64 cgrp_ancestor_id_storage;
+ /* must follow cgrp for cgrp->ancestors[0], see above */
+ struct cgroup *cgrp_ancestor_storage;
/* Number of cgroups in the hierarchy, used only for /proc/cgroups */
atomic_t nr_cgrps;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ed53bfe7c46c..4d143729b246 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -574,7 +574,7 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp,
{
if (cgrp->root != ancestor->root || cgrp->level < ancestor->level)
return false;
- return cgrp->ancestor_ids[ancestor->level] == cgroup_id(ancestor);
+ return cgrp->ancestors[ancestor->level] == ancestor;
}
/**
@@ -591,11 +591,9 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp,
static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp,
int ancestor_level)
{
- if (cgrp->level < ancestor_level)
+ if (ancestor_level < 0 || ancestor_level > cgrp->level)
return NULL;
- while (cgrp && cgrp->level > ancestor_level)
- cgrp = cgroup_parent(cgrp);
- return cgrp;
+ return cgrp->ancestors[ancestor_level];
}
/**
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 85fa4c8587a8..ce587fe43dab 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2047,7 +2047,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
}
root_cgrp->kn = kernfs_root_to_node(root->kf_root);
WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
- root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
+ root_cgrp->ancestors[0] = root_cgrp;
ret = css_populate_dir(&root_cgrp->self);
if (ret)
@@ -5391,8 +5391,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
int ret;
/* allocate the cgroup and its ID, 0 is reserved for the root */
- cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
- GFP_KERNEL);
+ cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL);
if (!cgrp)
return ERR_PTR(-ENOMEM);
@@ -5444,7 +5443,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
spin_lock_irq(&css_set_lock);
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
- cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
+ cgrp->ancestors[tcgrp->level] = tcgrp;
if (tcgrp != cgrp) {
tcgrp->nr_descendants++;
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index 05ae5a338b6f..d982a7c22a77 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -40,16 +40,17 @@ static noinline bool
nft_sock_get_eval_cgroupv2(u32 *dest, struct sock *sk, const struct nft_pktinfo *pkt, u32 level)
{
struct cgroup *cgrp;
+ u64 cgid;
if (!sk_fullsock(sk))
return false;
- cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- if (level > cgrp->level)
+ cgrp = cgroup_ancestor(sock_cgroup_ptr(&sk->sk_cgrp_data), level);
+ if (!cgrp)
return false;
- memcpy(dest, &cgrp->ancestor_ids[level], sizeof(u64));
-
+ cgid = cgroup_id(cgrp);
+ memcpy(dest, &cgid, sizeof(u64));
return true;
}
#endif
diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
index 292c430768b5..bd6a420acc8f 100644
--- a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
+++ b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
@@ -68,7 +68,7 @@ static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
break;
// convert cgroup-id to a map index
- cgrp_id = BPF_CORE_READ(cgrp, ancestor_ids[i]);
+ cgrp_id = BPF_CORE_READ(cgrp, ancestors[i], kn, id);
elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
if (!elem)
continue;
On Fri, Jul 29, 2022 at 01:10:16PM -1000, Tejun Heo wrote:
> Every cgroup knows all its ancestors through its ->ancestor_ids[]. There's
> no advantage to remembering the IDs instead of the pointers directly and
> this makes the array useless for finding an actual ancestor cgroup forcing
> cgroup_ancestor() to iteratively walk up the hierarchy instead. Let's
> replace cgroup->ancestor_ids[] with ->ancestors[] and remove the walking-up
> from cgroup_ancestor().
>
> While at it, improve comments around cgroup_root->cgrp_ancestor_storage.
>
> This patch shouldn't cause user-visible behavior differences.
>
> v2: Update cgroup_ancestor() to use ->ancestors[].
>
> v3: cgroup_root->cgrp_ancestor_storage's type is updated to match
> cgroup->ancestors[]. Better comments.
>
> Signed-off-by: Tejun Heo <[email protected]>
> Acked-by: Namhyung Kim <[email protected]>
Applied to cgroup/for-6.1.
--
tejun
Hi Tejun,
On Mon, Aug 15, 2022 at 2:17 PM Tejun Heo <[email protected]> wrote:
>
> On Fri, Jul 29, 2022 at 01:10:16PM -1000, Tejun Heo wrote:
> > Every cgroup knows all its ancestors through its ->ancestor_ids[]. There's
> > no advantage to remembering the IDs instead of the pointers directly and
> > this makes the array useless for finding an actual ancestor cgroup forcing
> > cgroup_ancestor() to iteratively walk up the hierarchy instead. Let's
> > replace cgroup->ancestor_ids[] with ->ancestors[] and remove the walking-up
> > from cgroup_ancestor().
> >
> > While at it, improve comments around cgroup_root->cgrp_ancestor_storage.
> >
> > This patch shouldn't cause user-visible behavior differences.
> >
> > v2: Update cgroup_ancestor() to use ->ancestors[].
> >
> > v3: cgroup_root->cgrp_ancestor_storage's type is updated to match
> > cgroup->ancestors[]. Better comments.
> >
> > Signed-off-by: Tejun Heo <[email protected]>
> > Acked-by: Namhyung Kim <[email protected]>
>
> Applied to cgroup/for-6.1.
I've realized that perf stat change needs backward compatibility.
Will send a fix soon.
Thanks,
Namhyung
The recent change in the cgroup will break the backward compatibility in
the BPF program. It should support both old and new kernels using BPF
CO-RE technique.
Like the task_struct->__state handling in the offcpu analysis, we can
check the field name in the cgroup struct.
Signed-off-by: Namhyung Kim <[email protected]>
---
Arnaldo, I think this should go through the cgroup tree since it depends
on the earlier change there. I don't think it'd conflict with other
perf changes but please let me know if you see any trouble, thanks!
tools/perf/util/bpf_skel/bperf_cgroup.bpf.c | 29 ++++++++++++++++++++-
1 file changed, 28 insertions(+), 1 deletion(-)
diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
index 488bd398f01d..4fe61043de04 100644
--- a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
+++ b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
@@ -43,12 +43,39 @@ struct {
__uint(value_size, sizeof(struct bpf_perf_event_value));
} cgrp_readings SEC(".maps");
+/* new kernel cgroup definition */
+struct cgroup___new {
+ int level;
+ struct cgroup *ancestors[];
+} __attribute__((preserve_access_index));
+
+/* old kernel cgroup definition */
+struct cgroup___old {
+ int level;
+ u64 ancestor_ids[];
+} __attribute__((preserve_access_index));
+
const volatile __u32 num_events = 1;
const volatile __u32 num_cpus = 1;
int enabled = 0;
int use_cgroup_v2 = 0;
+static inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level)
+{
+ /* recast pointer to capture new type for compiler */
+ struct cgroup___new *cgrp_new = (void *)cgrp;
+
+ if (bpf_core_field_exists(cgrp_new->ancestors)) {
+ return BPF_CORE_READ(cgrp_new, ancestors[level], kn, id);
+ } else {
+ /* recast pointer to capture old type for compiler */
+ struct cgroup___old *cgrp_old = (void *)cgrp;
+
+ return BPF_CORE_READ(cgrp_old, ancestor_ids[level]);
+ }
+}
+
static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
{
struct task_struct *p = (void *)bpf_get_current_task();
@@ -70,7 +97,7 @@ static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
break;
// convert cgroup-id to a map index
- cgrp_id = BPF_CORE_READ(cgrp, ancestors[i], kn, id);
+ cgrp_id = get_cgroup_v1_ancestor_id(cgrp, i);
elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
if (!elem)
continue;
--
2.37.3.968.ga6b4b080e4-goog
On Wed, Sep 21, 2022 at 09:14:35PM -0700, Namhyung Kim wrote:
> The recent change in the cgroup will break the backward compatiblity in
> the BPF program. It should support both old and new kernels using BPF
> CO-RE technique.
>
> Like the task_struct->__state handling in the offcpu analysis, we can
> check the field name in the cgroup struct.
>
> Signed-off-by: Namhyung Kim <[email protected]>
> ---
> Arnaldo, I think this should go through the cgroup tree since it depends
> on the earlier change there. I don't think it'd conflict with other
> perf changes but please let me know if you see any trouble, thanks!
FWIW, looks fine to me and I'd be happy to route this through the cgroup
tree once it gets acked.
Thanks.
--
tejun
On Fri, Sep 23, 2022 at 8:22 PM Tejun Heo <[email protected]> wrote:
>
> On Wed, Sep 21, 2022 at 09:14:35PM -0700, Namhyung Kim wrote:
> > The recent change in the cgroup will break the backward compatiblity in
> > the BPF program. It should support both old and new kernels using BPF
> > CO-RE technique.
> >
> > Like the task_struct->__state handling in the offcpu analysis, we can
> > check the field name in the cgroup struct.
> >
> > Signed-off-by: Namhyung Kim <[email protected]>
> > ---
> > Arnaldo, I think this should go through the cgroup tree since it depends
> > on the earlier change there. I don't think it'd conflict with other
> > perf changes but please let me know if you see any trouble, thanks!
>
> FWIW, looks fine to me and I'd be happy to route this through the cgroup
> tree once it gets acked.
Thanks Tejun!
Can any perf + bpf folks take a look?
Hi Jiri,
On Fri, Sep 30, 2022 at 2:44 PM Jiri Olsa <[email protected]> wrote:
>
> On Wed, Sep 21, 2022 at 09:14:35PM -0700, Namhyung Kim wrote:
> > The recent change in the cgroup will break the backward compatiblity in
> > the BPF program. It should support both old and new kernels using BPF
> > CO-RE technique.
> >
> > Like the task_struct->__state handling in the offcpu analysis, we can
> > check the field name in the cgroup struct.
> >
> > Signed-off-by: Namhyung Kim <[email protected]>
> > ---
> > Arnaldo, I think this should go through the cgroup tree since it depends
> > on the earlier change there. I don't think it'd conflict with other
> > perf changes but please let me know if you see any trouble, thanks!
>
> could you please paste the cgroup tree link?
Do you mean this?
https://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
Thanks,
Namhyung
On Fri, Sep 30, 2022 at 3:00 PM Arnaldo Carvalho de Melo
<[email protected]> wrote:
>
>
>
> On September 30, 2022 6:56:40 PM GMT-03:00, Namhyung Kim <[email protected]> wrote:
> >Hi Jiri,
> >
> >On Fri, Sep 30, 2022 at 2:44 PM Jiri Olsa <[email protected]> wrote:
> >>
> >> On Wed, Sep 21, 2022 at 09:14:35PM -0700, Namhyung Kim wrote:
> >> > The recent change in the cgroup will break the backward compatiblity in
> >> > the BPF program. It should support both old and new kernels using BPF
> >> > CO-RE technique.
> >> >
> >> > Like the task_struct->__state handling in the offcpu analysis, we can
> >> > check the field name in the cgroup struct.
> >> >
> >> > Signed-off-by: Namhyung Kim <[email protected]>
> >> > ---
> >> > Arnaldo, I think this should go through the cgroup tree since it depends
> >> > on the earlier change there. I don't think it'd conflict with other
> >> > perf changes but please let me know if you see any trouble, thanks!
> >>
> >> could you please paste the cgroup tree link?
> >
> >Do you mean this?
> >
> > https://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
> >
>
>
> Which branch and cset in that tree does you perf skel depends on?
I believe it's for-6.1 and the cset is in
https://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git/commit/?h=for-6.1&id=7f203bc89eb66d6afde7eae91347fc0352090cc3
Thanks,
Namhyung
On Wed, Sep 21, 2022 at 09:14:35PM -0700, Namhyung Kim wrote:
> The recent change in the cgroup will break the backward compatiblity in
> the BPF program. It should support both old and new kernels using BPF
> CO-RE technique.
>
> Like the task_struct->__state handling in the offcpu analysis, we can
> check the field name in the cgroup struct.
>
> Signed-off-by: Namhyung Kim <[email protected]>
> ---
> Arnaldo, I think this should go through the cgroup tree since it depends
> on the earlier change there. I don't think it'd conflict with other
> perf changes but please let me know if you see any trouble, thanks!
could you please paste the cgroup tree link?
thanks,
jirka
>
> tools/perf/util/bpf_skel/bperf_cgroup.bpf.c | 29 ++++++++++++++++++++-
> 1 file changed, 28 insertions(+), 1 deletion(-)
>
> diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
> index 488bd398f01d..4fe61043de04 100644
> --- a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
> +++ b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
> @@ -43,12 +43,39 @@ struct {
> __uint(value_size, sizeof(struct bpf_perf_event_value));
> } cgrp_readings SEC(".maps");
>
> +/* new kernel cgroup definition */
> +struct cgroup___new {
> + int level;
> + struct cgroup *ancestors[];
> +} __attribute__((preserve_access_index));
> +
> +/* old kernel cgroup definition */
> +struct cgroup___old {
> + int level;
> + u64 ancestor_ids[];
> +} __attribute__((preserve_access_index));
> +
> const volatile __u32 num_events = 1;
> const volatile __u32 num_cpus = 1;
>
> int enabled = 0;
> int use_cgroup_v2 = 0;
>
> +static inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level)
> +{
> + /* recast pointer to capture new type for compiler */
> + struct cgroup___new *cgrp_new = (void *)cgrp;
> +
> + if (bpf_core_field_exists(cgrp_new->ancestors)) {
> + return BPF_CORE_READ(cgrp_new, ancestors[level], kn, id);
> + } else {
> + /* recast pointer to capture old type for compiler */
> + struct cgroup___old *cgrp_old = (void *)cgrp;
> +
> + return BPF_CORE_READ(cgrp_old, ancestor_ids[level]);
> + }
> +}
> +
> static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
> {
> struct task_struct *p = (void *)bpf_get_current_task();
> @@ -70,7 +97,7 @@ static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
> break;
>
> // convert cgroup-id to a map index
> - cgrp_id = BPF_CORE_READ(cgrp, ancestors[i], kn, id);
> + cgrp_id = get_cgroup_v1_ancestor_id(cgrp, i);
> elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
> if (!elem)
> continue;
> --
> 2.37.3.968.ga6b4b080e4-goog
>
On September 30, 2022 6:56:40 PM GMT-03:00, Namhyung Kim <[email protected]> wrote:
>Hi Jiri,
>
>On Fri, Sep 30, 2022 at 2:44 PM Jiri Olsa <[email protected]> wrote:
>>
>> On Wed, Sep 21, 2022 at 09:14:35PM -0700, Namhyung Kim wrote:
>> > The recent change in the cgroup will break the backward compatiblity in
>> > the BPF program. It should support both old and new kernels using BPF
>> > CO-RE technique.
>> >
>> > Like the task_struct->__state handling in the offcpu analysis, we can
>> > check the field name in the cgroup struct.
>> >
>> > Signed-off-by: Namhyung Kim <[email protected]>
>> > ---
>> > Arnaldo, I think this should go through the cgroup tree since it depends
>> > on the earlier change there. I don't think it'd conflict with other
>> > perf changes but please let me know if you see any trouble, thanks!
>>
>> could you please paste the cgroup tree link?
>
>Do you mean this?
>
> https://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
>
Which branch and cset in that tree does your perf skel depend on?
- Arnaldo
>Thanks,.
>Namhyung
On Wed, Sep 21, 2022 at 9:21 PM Namhyung Kim <[email protected]> wrote:
>
> The recent change in the cgroup will break the backward compatiblity in
> the BPF program. It should support both old and new kernels using BPF
> CO-RE technique.
>
> Like the task_struct->__state handling in the offcpu analysis, we can
> check the field name in the cgroup struct.
>
> Signed-off-by: Namhyung Kim <[email protected]>
> ---
> Arnaldo, I think this should go through the cgroup tree since it depends
> on the earlier change there. I don't think it'd conflict with other
> perf changes but please let me know if you see any trouble, thanks!
>
> tools/perf/util/bpf_skel/bperf_cgroup.bpf.c | 29 ++++++++++++++++++++-
> 1 file changed, 28 insertions(+), 1 deletion(-)
>
> diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
> index 488bd398f01d..4fe61043de04 100644
> --- a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
> +++ b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
> @@ -43,12 +43,39 @@ struct {
> __uint(value_size, sizeof(struct bpf_perf_event_value));
> } cgrp_readings SEC(".maps");
>
> +/* new kernel cgroup definition */
> +struct cgroup___new {
> + int level;
> + struct cgroup *ancestors[];
> +} __attribute__((preserve_access_index));
> +
> +/* old kernel cgroup definition */
> +struct cgroup___old {
> + int level;
> + u64 ancestor_ids[];
> +} __attribute__((preserve_access_index));
> +
> const volatile __u32 num_events = 1;
> const volatile __u32 num_cpus = 1;
>
> int enabled = 0;
> int use_cgroup_v2 = 0;
>
> +static inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level)
> +{
> + /* recast pointer to capture new type for compiler */
> + struct cgroup___new *cgrp_new = (void *)cgrp;
> +
> + if (bpf_core_field_exists(cgrp_new->ancestors)) {
> + return BPF_CORE_READ(cgrp_new, ancestors[level], kn, id);
have you checked generated BPF code for this ancestors[level] access?
I'd expect CO-RE relocation for finding ancestors offset and then just
normal + level * 8 arithmetic, but would be nice to confirm. Apart
from this, looks good to me:
Acked-by: Andrii Nakryiko <[email protected]>
> + } else {
> + /* recast pointer to capture old type for compiler */
> + struct cgroup___old *cgrp_old = (void *)cgrp;
> +
> + return BPF_CORE_READ(cgrp_old, ancestor_ids[level]);
> + }
> +}
> +
> static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
> {
> struct task_struct *p = (void *)bpf_get_current_task();
> @@ -70,7 +97,7 @@ static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
> break;
>
> // convert cgroup-id to a map index
> - cgrp_id = BPF_CORE_READ(cgrp, ancestors[i], kn, id);
> + cgrp_id = get_cgroup_v1_ancestor_id(cgrp, i);
> elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
> if (!elem)
> continue;
> --
> 2.37.3.968.ga6b4b080e4-goog
>
Hello,
On Fri, Sep 30, 2022 at 3:48 PM Andrii Nakryiko
<[email protected]> wrote:
>
> On Wed, Sep 21, 2022 at 9:21 PM Namhyung Kim <[email protected]> wrote:
> >
> > The recent change in the cgroup will break the backward compatiblity in
> > the BPF program. It should support both old and new kernels using BPF
> > CO-RE technique.
> >
> > Like the task_struct->__state handling in the offcpu analysis, we can
> > check the field name in the cgroup struct.
> >
> > Signed-off-by: Namhyung Kim <[email protected]>
> > ---
> > Arnaldo, I think this should go through the cgroup tree since it depends
> > on the earlier change there. I don't think it'd conflict with other
> > perf changes but please let me know if you see any trouble, thanks!
> >
> > tools/perf/util/bpf_skel/bperf_cgroup.bpf.c | 29 ++++++++++++++++++++-
> > 1 file changed, 28 insertions(+), 1 deletion(-)
> >
> > diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
> > index 488bd398f01d..4fe61043de04 100644
> > --- a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
> > +++ b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
> > @@ -43,12 +43,39 @@ struct {
> > __uint(value_size, sizeof(struct bpf_perf_event_value));
> > } cgrp_readings SEC(".maps");
> >
> > +/* new kernel cgroup definition */
> > +struct cgroup___new {
> > + int level;
> > + struct cgroup *ancestors[];
> > +} __attribute__((preserve_access_index));
> > +
> > +/* old kernel cgroup definition */
> > +struct cgroup___old {
> > + int level;
> > + u64 ancestor_ids[];
> > +} __attribute__((preserve_access_index));
> > +
> > const volatile __u32 num_events = 1;
> > const volatile __u32 num_cpus = 1;
> >
> > int enabled = 0;
> > int use_cgroup_v2 = 0;
> >
> > +static inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level)
> > +{
> > + /* recast pointer to capture new type for compiler */
> > + struct cgroup___new *cgrp_new = (void *)cgrp;
> > +
> > + if (bpf_core_field_exists(cgrp_new->ancestors)) {
> > + return BPF_CORE_READ(cgrp_new, ancestors[level], kn, id);
>
> have you checked generated BPF code for this ancestors[level] access?
> I'd expect CO-RE relocation for finding ancestors offset and then just
> normal + level * 8 arithmetic, but would be nice to confirm. Apart
> from this, looks good to me:
>
> Acked-by: Andrii Nakryiko <[email protected]>
Thanks for your review!
How can I check the generated code? Do you have something that works with
skeletons or do I have to save the BPF object somehow during the build?
Thanks,
Namhyung
On Wed, Sep 21, 2022 at 09:14:35PM -0700, Namhyung Kim wrote:
> The recent change in the cgroup will break the backward compatiblity in
> the BPF program. It should support both old and new kernels using BPF
> CO-RE technique.
>
> Like the task_struct->__state handling in the offcpu analysis, we can
> check the field name in the cgroup struct.
>
> Signed-off-by: Namhyung Kim <[email protected]>
lgtm
Acked-by: Jiri Olsa <[email protected]>
jirka
> ---
> Arnaldo, I think this should go through the cgroup tree since it depends
> on the earlier change there. I don't think it'd conflict with other
> perf changes but please let me know if you see any trouble, thanks!
>
> tools/perf/util/bpf_skel/bperf_cgroup.bpf.c | 29 ++++++++++++++++++++-
> 1 file changed, 28 insertions(+), 1 deletion(-)
>
> diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
> index 488bd398f01d..4fe61043de04 100644
> --- a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
> +++ b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
> @@ -43,12 +43,39 @@ struct {
> __uint(value_size, sizeof(struct bpf_perf_event_value));
> } cgrp_readings SEC(".maps");
>
> +/* new kernel cgroup definition */
> +struct cgroup___new {
> + int level;
> + struct cgroup *ancestors[];
> +} __attribute__((preserve_access_index));
> +
> +/* old kernel cgroup definition */
> +struct cgroup___old {
> + int level;
> + u64 ancestor_ids[];
> +} __attribute__((preserve_access_index));
> +
> const volatile __u32 num_events = 1;
> const volatile __u32 num_cpus = 1;
>
> int enabled = 0;
> int use_cgroup_v2 = 0;
>
> +static inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level)
> +{
> + /* recast pointer to capture new type for compiler */
> + struct cgroup___new *cgrp_new = (void *)cgrp;
> +
> + if (bpf_core_field_exists(cgrp_new->ancestors)) {
> + return BPF_CORE_READ(cgrp_new, ancestors[level], kn, id);
> + } else {
> + /* recast pointer to capture old type for compiler */
> + struct cgroup___old *cgrp_old = (void *)cgrp;
> +
> + return BPF_CORE_READ(cgrp_old, ancestor_ids[level]);
> + }
> +}
> +
> static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
> {
> struct task_struct *p = (void *)bpf_get_current_task();
> @@ -70,7 +97,7 @@ static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
> break;
>
> // convert cgroup-id to a map index
> - cgrp_id = BPF_CORE_READ(cgrp, ancestors[i], kn, id);
> + cgrp_id = get_cgroup_v1_ancestor_id(cgrp, i);
> elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
> if (!elem)
> continue;
> --
> 2.37.3.968.ga6b4b080e4-goog
>
On Fri, Sep 30, 2022 at 02:56:40PM -0700, Namhyung Kim wrote:
> Hi Jiri,
>
> On Fri, Sep 30, 2022 at 2:44 PM Jiri Olsa <[email protected]> wrote:
> >
> > On Wed, Sep 21, 2022 at 09:14:35PM -0700, Namhyung Kim wrote:
> > > The recent change in the cgroup will break the backward compatiblity in
> > > the BPF program. It should support both old and new kernels using BPF
> > > CO-RE technique.
> > >
> > > Like the task_struct->__state handling in the offcpu analysis, we can
> > > check the field name in the cgroup struct.
> > >
> > > Signed-off-by: Namhyung Kim <[email protected]>
> > > ---
> > > Arnaldo, I think this should go through the cgroup tree since it depends
> > > on the earlier change there. I don't think it'd conflict with other
> > > perf changes but please let me know if you see any trouble, thanks!
> >
> > could you please paste the cgroup tree link?
>
> Do you mean this?
>
> https://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
yea, wanted to try that.. I cherry-picked the 7f203bc89eb6 ;-)
thanks,
jirka
>
> Thanks,.
> Namhyung
On Fri, Sep 30, 2022 at 7:31 PM Namhyung Kim <[email protected]> wrote:
>
> Hello,
>
> On Fri, Sep 30, 2022 at 3:48 PM Andrii Nakryiko
> <[email protected]> wrote:
> >
> > On Wed, Sep 21, 2022 at 9:21 PM Namhyung Kim <[email protected]> wrote:
> > >
> > > The recent change in the cgroup will break the backward compatiblity in
> > > the BPF program. It should support both old and new kernels using BPF
> > > CO-RE technique.
> > >
> > > Like the task_struct->__state handling in the offcpu analysis, we can
> > > check the field name in the cgroup struct.
> > >
> > > Signed-off-by: Namhyung Kim <[email protected]>
> > > ---
> > > Arnaldo, I think this should go through the cgroup tree since it depends
> > > on the earlier change there. I don't think it'd conflict with other
> > > perf changes but please let me know if you see any trouble, thanks!
> > >
> > > tools/perf/util/bpf_skel/bperf_cgroup.bpf.c | 29 ++++++++++++++++++++-
> > > 1 file changed, 28 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
> > > index 488bd398f01d..4fe61043de04 100644
> > > --- a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
> > > +++ b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
> > > @@ -43,12 +43,39 @@ struct {
> > > __uint(value_size, sizeof(struct bpf_perf_event_value));
> > > } cgrp_readings SEC(".maps");
> > >
> > > +/* new kernel cgroup definition */
> > > +struct cgroup___new {
> > > + int level;
> > > + struct cgroup *ancestors[];
> > > +} __attribute__((preserve_access_index));
> > > +
> > > +/* old kernel cgroup definition */
> > > +struct cgroup___old {
> > > + int level;
> > > + u64 ancestor_ids[];
> > > +} __attribute__((preserve_access_index));
> > > +
> > > const volatile __u32 num_events = 1;
> > > const volatile __u32 num_cpus = 1;
> > >
> > > int enabled = 0;
> > > int use_cgroup_v2 = 0;
> > >
> > > +static inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level)
> > > +{
> > > + /* recast pointer to capture new type for compiler */
> > > + struct cgroup___new *cgrp_new = (void *)cgrp;
> > > +
> > > + if (bpf_core_field_exists(cgrp_new->ancestors)) {
> > > + return BPF_CORE_READ(cgrp_new, ancestors[level], kn, id);
> >
> > have you checked generated BPF code for this ancestors[level] access?
> > I'd expect CO-RE relocation for finding ancestors offset and then just
> > normal + level * 8 arithmetic, but would be nice to confirm. Apart
> > from this, looks good to me:
> >
> > Acked-by: Andrii Nakryiko <[email protected]>
>
> Thanks for your review!
>
> How can I check the generated code? Do you have something works with
> skeletons or do I have to save the BPF object somehow during the build?
>
skeleton is generated from ELF BPF object file. You can do
llvm-objdump -d <obj.bpf.o> to see instructions. Unfortunately you
can't see BPF CO-RE relocations this way, you'd have to use something
like my custom tool ([0]).
But anyways, I checked a similar code pattern locally and I think it's
all good from BPF CO-RE perspective. I see appropriate relocations in
all the necessary places. So this should work.
Acked-by: Andrii Nakryiko <[email protected]>
[0] https://github.com/anakryiko/btfdump
> Thanks,
> Namhyung
On Wed, Sep 21, 2022 at 09:14:35PM -0700, Namhyung Kim wrote:
> The recent change in the cgroup will break the backward compatiblity in
> the BPF program. It should support both old and new kernels using BPF
> CO-RE technique.
>
> Like the task_struct->__state handling in the offcpu analysis, we can
> check the field name in the cgroup struct.
>
> Signed-off-by: Namhyung Kim <[email protected]>
Looks like it's acked enough but the patch doesn't apply anymore. Namhyung,
can you please refresh the patch? I'll route this through
cgroup/for-6.1-fixes unless somebody objects.
Thanks.
--
tejun
Hi Tejun,
On Mon, Oct 10, 2022 at 4:59 PM Tejun Heo <[email protected]> wrote:
>
> On Wed, Sep 21, 2022 at 09:14:35PM -0700, Namhyung Kim wrote:
> > The recent change in the cgroup will break the backward compatiblity in
> > the BPF program. It should support both old and new kernels using BPF
> > CO-RE technique.
> >
> > Like the task_struct->__state handling in the offcpu analysis, we can
> > check the field name in the cgroup struct.
> >
> > Signed-off-by: Namhyung Kim <[email protected]>
>
> Looks like it's acked enough but the patch doesn't apply anymore. Namhyung,
> can you please refresh the patch? I'll route this through
> cgroup/for-6.1-fixes unless somebody objects.
Sorry about the conflict. There was another change to get the perf
cgroup subsys id properly. Will send v2 soon.
Thanks,
Namhyung
The recent change in the cgroup will break the backward compatibility in
the BPF program. It should support both old and new kernels using BPF
CO-RE technique.
Like the task_struct->__state handling in the offcpu analysis, we can
check the field name in the cgroup struct.
Acked-by: Jiri Olsa <[email protected]>
Acked-by: Andrii Nakryiko <[email protected]>
Signed-off-by: Namhyung Kim <[email protected]>
---
tools/perf/util/bpf_skel/bperf_cgroup.bpf.c | 29 ++++++++++++++++++++-
1 file changed, 28 insertions(+), 1 deletion(-)
diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
index 435a87556688..6a438e0102c5 100644
--- a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
+++ b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
@@ -43,6 +43,18 @@ struct {
__uint(value_size, sizeof(struct bpf_perf_event_value));
} cgrp_readings SEC(".maps");
+/* new kernel cgroup definition */
+struct cgroup___new {
+ int level;
+ struct cgroup *ancestors[];
+} __attribute__((preserve_access_index));
+
+/* old kernel cgroup definition */
+struct cgroup___old {
+ int level;
+ u64 ancestor_ids[];
+} __attribute__((preserve_access_index));
+
const volatile __u32 num_events = 1;
const volatile __u32 num_cpus = 1;
@@ -50,6 +62,21 @@ int enabled = 0;
int use_cgroup_v2 = 0;
int perf_subsys_id = -1;
+static inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level)
+{
+ /* recast pointer to capture new type for compiler */
+ struct cgroup___new *cgrp_new = (void *)cgrp;
+
+ if (bpf_core_field_exists(cgrp_new->ancestors)) {
+ return BPF_CORE_READ(cgrp_new, ancestors[level], kn, id);
+ } else {
+ /* recast pointer to capture old type for compiler */
+ struct cgroup___old *cgrp_old = (void *)cgrp;
+
+ return BPF_CORE_READ(cgrp_old, ancestor_ids[level]);
+ }
+}
+
static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
{
struct task_struct *p = (void *)bpf_get_current_task();
@@ -77,7 +104,7 @@ static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
break;
// convert cgroup-id to a map index
- cgrp_id = BPF_CORE_READ(cgrp, ancestors[i], kn, id);
+ cgrp_id = get_cgroup_v1_ancestor_id(cgrp, i);
elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
if (!elem)
continue;
--
2.38.0.rc1.362.ged0d419d3c-goog
On Mon, Oct 10, 2022 at 10:28:08PM -0700, Namhyung Kim wrote:
> The recent change in the cgroup will break the backward compatiblity in
> the BPF program. It should support both old and new kernels using BPF
> CO-RE technique.
>
> Like the task_struct->__state handling in the offcpu analysis, we can
> check the field name in the cgroup struct.
>
> Acked-by: Jiri Olsa <[email protected]>
> Acked-by: Andrii Nakryiko <[email protected]>
> Signed-off-by: Namhyung Kim <[email protected]>
Applied to cgroup/for-6.1-fixes.
Thanks.
--
tejun
Em Fri, Oct 14, 2022 at 10:27:40AM -0300, Arnaldo Carvalho de Melo escreveu:
> Em Tue, Oct 11, 2022 at 06:53:43AM -1000, Tejun Heo escreveu:
> > On Mon, Oct 10, 2022 at 10:28:08PM -0700, Namhyung Kim wrote:
> > > The recent change in the cgroup will break the backward compatiblity in
> > > the BPF program. It should support both old and new kernels using BPF
> > > CO-RE technique.
>
> > > Like the task_struct->__state handling in the offcpu analysis, we can
> > > check the field name in the cgroup struct.
>
> > > Acked-by: Jiri Olsa <[email protected]>
> > > Acked-by: Andrii Nakryiko <[email protected]>
> > > Signed-off-by: Namhyung Kim <[email protected]>
>
> > Applied to cgroup/for-6.1-fixes.
>
> Hey, I noticed that the perf build is broken for the
> tools/perf/util/bpf_skel/bperf_cgroup.bpf.c skell, so I tried using b4
> on this Namhyung patch, it ended up getting a newer version, by Tejun,
> that mixes up kernel code and tooling, which, when I tried to apply
> upstream didn't work.
>
> Please try not to mix up kernel and tools/ changes in the same patch to
> avoid these issues.
>
> Also when changing tools/perf, please CC me.
>
> I'm now back trying to apply v2 of this patch to see if it fixes my
> build.
Yeah, applying just Namhyung's v2 patch gets perf back building, I'll
keep it there while processing the other patches so that I can test them
all together.
- Arnaldo
Em Tue, Oct 11, 2022 at 06:53:43AM -1000, Tejun Heo escreveu:
> On Mon, Oct 10, 2022 at 10:28:08PM -0700, Namhyung Kim wrote:
> > The recent change in the cgroup will break the backward compatiblity in
> > the BPF program. It should support both old and new kernels using BPF
> > CO-RE technique.
> > Like the task_struct->__state handling in the offcpu analysis, we can
> > check the field name in the cgroup struct.
> > Acked-by: Jiri Olsa <[email protected]>
> > Acked-by: Andrii Nakryiko <[email protected]>
> > Signed-off-by: Namhyung Kim <[email protected]>
> Applied to cgroup/for-6.1-fixes.
Hey, I noticed that the perf build is broken for the
tools/perf/util/bpf_skel/bperf_cgroup.bpf.c skel, so I tried using b4
on this Namhyung patch, it ended up getting a newer version, by Tejun,
that mixes up kernel code and tooling, which, when I tried to apply
upstream didn't work.
Please try not to mix up kernel and tools/ changes in the same patch to
avoid these issues.
Also when changing tools/perf, please CC me.
I'm now back trying to apply v2 of this patch to see if it fixes my
build.
Thanks,
- Arnaldo
Em Fri, Oct 14, 2022 at 06:40:56AM -1000, Tejun Heo escreveu:
> On Fri, Oct 14, 2022 at 10:27:40AM -0300, Arnaldo Carvalho de Melo wrote:
> > Hey, I noticed that the perf build is broken for the
> > tools/perf/util/bpf_skel/bperf_cgroup.bpf.c skell, so I tried using b4
> > on this Namhyung patch, it ended up getting a newer version, by Tejun,
> > that mixes up kernel code and tooling, which, when I tried to apply
> > upstream didn't work.
> > Please try not to mix up kernel and tools/ changes in the same patch to
> > avoid these issues.
> I didn't write a newer version of this patch. What are you talking about?
So, I saw this message from you in reply to Namhyung's v2 patch:
--------------------------
Date: Tue, 11 Oct 2022 06:53:43 -1000
From: Tejun Heo <[email protected]>
Subject: Re: [PATCH v2] perf stat: Support old kernels for bperf cgroup counting
To: Namhyung Kim <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>, Zefan Li <[email protected]>, Johannes Weiner <[email protected]>, [email protected], Jiri Olsa <[email protected]>, LKML <[email protected]>, [email protected], Song Liu
<[email protected]>, [email protected], Andrii Nakryiko <[email protected]>
Sender: Tejun Heo <[email protected]>
Message-ID: <[email protected]>
On Mon, Oct 10, 2022 at 10:28:08PM -0700, Namhyung Kim wrote:
> The recent change in the cgroup will break the backward compatiblity in
> the BPF program. It should support both old and new kernels using BPF
> CO-RE technique.
> Like the task_struct->__state handling in the offcpu analysis, we can
> check the field name in the cgroup struct.
> Acked-by: Jiri Olsa <[email protected]>
> Acked-by: Andrii Nakryiko <[email protected]>
> Signed-off-by: Namhyung Kim <[email protected]>
Applied to cgroup/for-6.1-fixes.
Thanks.
--
tejun
--------------------------
So, I picked the message id, [email protected], and asked
b4 to pick the patch:
⬢[acme@toolbox perf]$ b4 am --help | grep -A1 -- -c,
-c, --check-newer-revisions
Check if newer patch revisions exist
⬢[acme@toolbox perf]$
⬢[acme@toolbox perf]$ b4 am -ctsl --cc-trailers [email protected]
Grabbing thread from lore.kernel.org/all/Y0Wfl88objrECjSo%40slm.duckdns.org/t.mbox.gz
Checking for newer revisions on https://lore.kernel.org/all/
Analyzing 27 messages in the thread
('Acked-by', 'Andrii Nakryiko <[email protected]>', None)
Will use the latest revision: v3
You can pick other revisions using the -vN flag
Checking attestation on all messages, may take a moment...
---
✓ [PATCH v3] cgroup: Replace cgroup->ancestor_ids[] with ->ancestors[]
---
✓ Signed: DKIM/gmail.com (From: [email protected])
---
Total patches: 1
---
Link: https://lore.kernel.org/r/[email protected]
Base: not specified
git am ./v3_20220729_tj_cgroup_replace_cgroup_ancestor_ids_with_ancestors.mbx
⬢[acme@toolbox perf]$
Which got me this:
⬢[acme@toolbox perf]$ diffstat ./v3_20220729_tj_cgroup_replace_cgroup_ancestor_ids_with_ancestors.mbx
include/linux/cgroup-defs.h | 16 ++++++++++------
include/linux/cgroup.h | 8 +++-----
kernel/cgroup/cgroup.c | 7 +++----
net/netfilter/nft_socket.c | 9 +++++----
tools/perf/util/bpf_skel/bperf_cgroup.bpf.c | 2 +-
5 files changed, 22 insertions(+), 20 deletions(-)
⬢[acme@toolbox perf]$
⬢[acme@toolbox perf]$ grep From: ./v3_20220729_tj_cgroup_replace_cgroup_ancestor_ids_with_ancestors.mbx
From: Tejun Heo <[email protected]>
⬢[acme@toolbox perf]$
That mixes kernel and tools bits and touches
tools/perf/util/bpf_skel/bperf_cgroup.bpf.c, hence my request to add me
to the CC list for patches touching tools/perf/.
My assumption that it was a new patch was because b4 somehow got to
v3_20220729_tj_cgroup_replace_cgroup_ancestor_ids_with_ancestors,
which has v3 and touches the tools cgroup bpf skel.
So it seems b4 is confused somehow.
Hope this clarifies.
- Arnaldo
On Fri, Oct 14, 2022 at 10:27:40AM -0300, Arnaldo Carvalho de Melo wrote:
> Hey, I noticed that the perf build is broken for the
> tools/perf/util/bpf_skel/bperf_cgroup.bpf.c skell, so I tried using b4
> on this Namhyung patch, it ended up getting a newer version, by Tejun,
> that mixes up kernel code and tooling, which, when I tried to apply
> upstream didn't work.
>
> Please try not to mix up kernel and tools/ changes in the same patch to
> avoid these issues.
I didn't write a newer version of this patch. What are you talking about?
--
tejun