Sometimes it is necessary to understand in which memcg tracepoint event
occurred. The function cgroup_name() is a useful tool for this purpose.
To integrate cgroup_name() into the existing memcg tracepoints, this
patch introduces a new tracepoint template for the begin() and end()
events, utilizing static __array() to store the cgroup name.
Signed-off-by: Dmitry Rokosov <[email protected]>
---
include/trace/events/vmscan.h | 77 +++++++++++++++++++++++++++++------
mm/vmscan.c | 10 ++---
2 files changed, 70 insertions(+), 17 deletions(-)
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index d2123dd960d5..9b49cd120ae9 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -141,19 +141,47 @@ DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_b
);
#ifdef CONFIG_MEMCG
-DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin,
- TP_PROTO(int order, gfp_t gfp_flags),
+DECLARE_EVENT_CLASS(mm_vmscan_memcg_reclaim_begin_template,
- TP_ARGS(order, gfp_flags)
+ TP_PROTO(int order, gfp_t gfp_flags, const struct mem_cgroup *memcg),
+
+ TP_ARGS(order, gfp_flags, memcg),
+
+ TP_STRUCT__entry(
+ __field(int, order)
+ __field(unsigned long, gfp_flags)
+ __array(char, name, NAME_MAX + 1)
+ ),
+
+ TP_fast_assign(
+ __entry->order = order;
+ __entry->gfp_flags = (__force unsigned long)gfp_flags;
+ cgroup_name(memcg->css.cgroup,
+ __entry->name,
+ sizeof(__entry->name));
+ ),
+
+ TP_printk("order=%d gfp_flags=%s memcg=%s",
+ __entry->order,
+ show_gfp_flags(__entry->gfp_flags),
+ __entry->name)
);
-DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin,
+DEFINE_EVENT(mm_vmscan_memcg_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin,
- TP_PROTO(int order, gfp_t gfp_flags),
+ TP_PROTO(int order, gfp_t gfp_flags, const struct mem_cgroup *memcg),
- TP_ARGS(order, gfp_flags)
+ TP_ARGS(order, gfp_flags, memcg)
+);
+
+DEFINE_EVENT(mm_vmscan_memcg_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin,
+
+ TP_PROTO(int order, gfp_t gfp_flags, const struct mem_cgroup *memcg),
+
+ TP_ARGS(order, gfp_flags, memcg)
);
+
#endif /* CONFIG_MEMCG */
DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_end_template,
@@ -181,19 +209,44 @@ DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_direct_reclaim_end
);
#ifdef CONFIG_MEMCG
-DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_reclaim_end,
- TP_PROTO(unsigned long nr_reclaimed),
+DECLARE_EVENT_CLASS(mm_vmscan_memcg_reclaim_end_template,
- TP_ARGS(nr_reclaimed)
+ TP_PROTO(unsigned long nr_reclaimed, const struct mem_cgroup *memcg),
+
+ TP_ARGS(nr_reclaimed, memcg),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, nr_reclaimed)
+ __array(char, name, NAME_MAX + 1)
+ ),
+
+ TP_fast_assign(
+ __entry->nr_reclaimed = nr_reclaimed;
+ cgroup_name(memcg->css.cgroup,
+ __entry->name,
+ sizeof(__entry->name));
+ ),
+
+ TP_printk("nr_reclaimed=%lu memcg=%s",
+ __entry->nr_reclaimed,
+ __entry->name)
);
-DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_reclaim_end,
+DEFINE_EVENT(mm_vmscan_memcg_reclaim_end_template, mm_vmscan_memcg_reclaim_end,
- TP_PROTO(unsigned long nr_reclaimed),
+ TP_PROTO(unsigned long nr_reclaimed, const struct mem_cgroup *memcg),
- TP_ARGS(nr_reclaimed)
+ TP_ARGS(nr_reclaimed, memcg)
);
+
+DEFINE_EVENT(mm_vmscan_memcg_reclaim_end_template, mm_vmscan_memcg_softlimit_reclaim_end,
+
+ TP_PROTO(unsigned long nr_reclaimed, const struct mem_cgroup *memcg),
+
+ TP_ARGS(nr_reclaimed, memcg)
+);
+
#endif /* CONFIG_MEMCG */
TRACE_EVENT(mm_shrink_slab_start,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1080209a568b..45780952f4b5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -7088,8 +7088,8 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
- trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
- sc.gfp_mask);
+ trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, sc.gfp_mask,
+ memcg);
/*
* NOTE: Although we can get the priority field, using it
@@ -7100,7 +7100,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
*/
shrink_lruvec(lruvec, &sc);
- trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
+ trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed, memcg);
*nr_scanned = sc.nr_scanned;
@@ -7134,13 +7134,13 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
set_task_reclaim_state(current, &sc.reclaim_state);
- trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
+ trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask, memcg);
noreclaim_flag = memalloc_noreclaim_save();
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
memalloc_noreclaim_restore(noreclaim_flag);
- trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
+ trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed, memcg);
set_task_reclaim_state(current, NULL);
return nr_reclaimed;
--
2.36.0
On Wed, Nov 22, 2023 at 01:01:55PM +0300, Dmitry Rokosov wrote:
> Sometimes it is necessary to understand in which memcg tracepoint event
> occurred. The function cgroup_name() is a useful tool for this purpose.
> To integrate cgroup_name() into the existing memcg tracepoints, this
> patch introduces a new tracepoint template for the begin() and end()
> events, utilizing static __array() to store the cgroup name.
>
> Signed-off-by: Dmitry Rokosov <[email protected]>
> ---
> include/trace/events/vmscan.h | 77 +++++++++++++++++++++++++++++------
> mm/vmscan.c | 10 ++---
> 2 files changed, 70 insertions(+), 17 deletions(-)
>
> diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
> index d2123dd960d5..9b49cd120ae9 100644
> --- a/include/trace/events/vmscan.h
> +++ b/include/trace/events/vmscan.h
> @@ -141,19 +141,47 @@ DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_b
> );
>
> #ifdef CONFIG_MEMCG
> -DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin,
>
> - TP_PROTO(int order, gfp_t gfp_flags),
> +DECLARE_EVENT_CLASS(mm_vmscan_memcg_reclaim_begin_template,
>
> - TP_ARGS(order, gfp_flags)
> + TP_PROTO(int order, gfp_t gfp_flags, const struct mem_cgroup *memcg),
> +
> + TP_ARGS(order, gfp_flags, memcg),
> +
> + TP_STRUCT__entry(
> + __field(int, order)
> + __field(unsigned long, gfp_flags)
> + __array(char, name, NAME_MAX + 1)
> + ),
> +
> + TP_fast_assign(
> + __entry->order = order;
> + __entry->gfp_flags = (__force unsigned long)gfp_flags;
> + cgroup_name(memcg->css.cgroup,
> + __entry->name,
> + sizeof(__entry->name));
Any reason not to use cgroup_ino? cgroup_name may conflict and be
ambiguous.
Hello Shakeel,
Thank you for quick review!
Please find my thoughts below.
On Thu, Nov 23, 2023 at 07:21:26AM +0000, Shakeel Butt wrote:
> On Wed, Nov 22, 2023 at 01:01:55PM +0300, Dmitry Rokosov wrote:
> > Sometimes it is necessary to understand in which memcg tracepoint event
> > occurred. The function cgroup_name() is a useful tool for this purpose.
> > To integrate cgroup_name() into the existing memcg tracepoints, this
> > patch introduces a new tracepoint template for the begin() and end()
> > events, utilizing static __array() to store the cgroup name.
> >
> > Signed-off-by: Dmitry Rokosov <[email protected]>
> > ---
> > include/trace/events/vmscan.h | 77 +++++++++++++++++++++++++++++------
> > mm/vmscan.c | 10 ++---
> > 2 files changed, 70 insertions(+), 17 deletions(-)
> >
> > diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
> > index d2123dd960d5..9b49cd120ae9 100644
> > --- a/include/trace/events/vmscan.h
> > +++ b/include/trace/events/vmscan.h
> > @@ -141,19 +141,47 @@ DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_b
> > );
> >
> > #ifdef CONFIG_MEMCG
> > -DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin,
> >
> > - TP_PROTO(int order, gfp_t gfp_flags),
> > +DECLARE_EVENT_CLASS(mm_vmscan_memcg_reclaim_begin_template,
> >
> > - TP_ARGS(order, gfp_flags)
> > + TP_PROTO(int order, gfp_t gfp_flags, const struct mem_cgroup *memcg),
> > +
> > + TP_ARGS(order, gfp_flags, memcg),
> > +
> > + TP_STRUCT__entry(
> > + __field(int, order)
> > + __field(unsigned long, gfp_flags)
> > + __array(char, name, NAME_MAX + 1)
> > + ),
> > +
> > + TP_fast_assign(
> > + __entry->order = order;
> > + __entry->gfp_flags = (__force unsigned long)gfp_flags;
> > + cgroup_name(memcg->css.cgroup,
> > + __entry->name,
> > + sizeof(__entry->name));
>
> Any reason not to use cgroup_ino? cgroup_name may conflict and be
> ambiguous.
I actually didn't consider it, as the cgroup name serves as a clear tag
for filtering the appropriate cgroup in the entire trace file. However,
you are correct that there might be conflicts with cgroup names.
Therefore, it might be better to display both tags: ino and name. What
do you think on this?
--
Thank you,
Dmitry
On Thu, Nov 23, 2023 at 11:03:34AM +0300, Dmitry Rokosov wrote:
[...]
> > > + cgroup_name(memcg->css.cgroup,
> > > + __entry->name,
> > > + sizeof(__entry->name));
> >
> > Any reason not to use cgroup_ino? cgroup_name may conflict and be
> > ambiguous.
>
> I actually didn't consider it, as the cgroup name serves as a clear tag
> for filtering the appropriate cgroup in the entire trace file. However,
> you are correct that there might be conflicts with cgroup names.
> Therefore, it might be better to display both tags: ino and name. What
> do you think on this?
>
I can see putting cgroup name can avoid pre or post processing, so
putting both are fine. Though keep in mind that cgroup_name acquires a
lock which may impact the applications running on the system.
On Thu, Nov 23, 2023 at 08:15:47AM +0000, Shakeel Butt wrote:
> On Thu, Nov 23, 2023 at 11:03:34AM +0300, Dmitry Rokosov wrote:
> [...]
> > > > + cgroup_name(memcg->css.cgroup,
> > > > + __entry->name,
> > > > + sizeof(__entry->name));
> > >
> > > Any reason not to use cgroup_ino? cgroup_name may conflict and be
> > > ambiguous.
> >
> > I actually didn't consider it, as the cgroup name serves as a clear tag
> > for filtering the appropriate cgroup in the entire trace file. However,
> > you are correct that there might be conflicts with cgroup names.
> > Therefore, it might be better to display both tags: ino and name. What
> > do you think on this?
> >
>
> I can see putting cgroup name can avoid pre or post processing, so
> putting both are fine. Though keep in mind that cgroup_name acquires a
> lock which may impact the applications running on the system.
Are you talking about kernfs_rename_lock? Yes, it's acquired each
time... Unfortunatelly, I don't know a way to save cgroup_name one time
somehow...
--
Thank you,
Dmitry
Shakeel,
On Thu, Nov 23, 2023 at 11:45:10AM +0300, Dmitry Rokosov wrote:
> On Thu, Nov 23, 2023 at 08:15:47AM +0000, Shakeel Butt wrote:
> > On Thu, Nov 23, 2023 at 11:03:34AM +0300, Dmitry Rokosov wrote:
> > [...]
> > > > > + cgroup_name(memcg->css.cgroup,
> > > > > + __entry->name,
> > > > > + sizeof(__entry->name));
> > > >
> > > > Any reason not to use cgroup_ino? cgroup_name may conflict and be
> > > > ambiguous.
> > >
> > > I actually didn't consider it, as the cgroup name serves as a clear tag
> > > for filtering the appropriate cgroup in the entire trace file. However,
> > > you are correct that there might be conflicts with cgroup names.
> > > Therefore, it might be better to display both tags: ino and name. What
> > > do you think on this?
> > >
> >
> > I can see putting cgroup name can avoid pre or post processing, so
> > putting both are fine. Though keep in mind that cgroup_name acquires a
> > lock which may impact the applications running on the system.
>
> Are you talking about kernfs_rename_lock? Yes, it's acquired each
> time... Unfortunatelly, I don't know a way to save cgroup_name one time
> somehow...
I delved deeper and realized that kernfs_rename_lock is a read-write
lock, but it's a global one. While it's true that we only enable
tracepoints during specific periods of the host's lifetime, the trace
system is still a fast way to debug things. So, you're absolutely right,
we shouldn't slow down the system unnecessarily.
Therefore, today I will prepare a new version with only the cgroup ino.
Thank you for pointing that out to me!
--
Thank you,
Dmitry