2022-05-08 09:26:35

by Vasily Averin

Subject: [PATCH] percpu: improve percpu_alloc_percpu event trace

Added bytes_alloc and gfp_flags fields to the output of the
percpu_alloc_percpu ftrace event. This is required to track
memcg-accounted percpu allocations.

Signed-off-by: Vasily Averin <[email protected]>
---
 include/trace/events/percpu.h | 17 ++++++++++++-----
 mm/percpu-internal.h          |  8 ++++----
 mm/percpu.c                   |  3 ++-
 3 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/include/trace/events/percpu.h b/include/trace/events/percpu.h
index df112a64f6c9..a6d640d2cb8b 100644
--- a/include/trace/events/percpu.h
+++ b/include/trace/events/percpu.h
@@ -6,13 +6,16 @@
 #define _TRACE_PERCPU_H
 
 #include <linux/tracepoint.h>
+#include <trace/events/mmflags.h>
 
 TRACE_EVENT(percpu_alloc_percpu,
 
 	TP_PROTO(bool reserved, bool is_atomic, size_t size,
-		 size_t align, void *base_addr, int off, void __percpu *ptr),
+		 size_t align, void *base_addr, int off,
+		 void __percpu *ptr, size_t bytes_alloc, gfp_t gfp_flags),
 
-	TP_ARGS(reserved, is_atomic, size, align, base_addr, off, ptr),
+	TP_ARGS(reserved, is_atomic, size, align, base_addr, off, ptr,
+		bytes_alloc, gfp_flags),
 
 	TP_STRUCT__entry(
 		__field(	bool,			reserved	)
@@ -22,8 +25,9 @@ TRACE_EVENT(percpu_alloc_percpu,
 		__field(	void *,			base_addr	)
 		__field(	int,			off		)
 		__field(	void __percpu *,	ptr		)
+		__field(	size_t,			bytes_alloc	)
+		__field(	gfp_t,			gfp_flags	)
 	),
-
 	TP_fast_assign(
 		__entry->reserved	= reserved;
 		__entry->is_atomic	= is_atomic;
@@ -32,12 +36,15 @@ TRACE_EVENT(percpu_alloc_percpu,
 		__entry->base_addr	= base_addr;
 		__entry->off		= off;
 		__entry->ptr		= ptr;
+		__entry->bytes_alloc	= bytes_alloc;
+		__entry->gfp_flags	= gfp_flags;
 	),
 
-	TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu base_addr=%p off=%d ptr=%p",
+	TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu base_addr=%p off=%d ptr=%p bytes_alloc=%zu gfp_flags=%s",
 		  __entry->reserved, __entry->is_atomic,
 		  __entry->size, __entry->align,
-		  __entry->base_addr, __entry->off, __entry->ptr)
+		  __entry->base_addr, __entry->off, __entry->ptr,
+		  __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags))
 );

TRACE_EVENT(percpu_free_percpu,
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index 411d1593ef23..70b1ea23f4d2 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -113,7 +113,6 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
 	return pcpu_nr_pages_to_map_bits(chunk->nr_pages);
 }
 
-#ifdef CONFIG_MEMCG_KMEM
 /**
  * pcpu_obj_full_size - helper to calculate size of each accounted object
  * @size: size of area to allocate in bytes
@@ -123,13 +122,14 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
  */
 static inline size_t pcpu_obj_full_size(size_t size)
 {
-	size_t extra_size;
+	size_t extra_size = 0;
 
-	extra_size = size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *);
+#ifdef CONFIG_MEMCG_KMEM
+	extra_size += size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *);
+#endif
 
 	return size * num_possible_cpus() + extra_size;
 }
-#endif /* CONFIG_MEMCG_KMEM */
 
 #ifdef CONFIG_PERCPU_STATS

diff --git a/mm/percpu.c b/mm/percpu.c
index ea28db283044..cbeb380c359d 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1885,7 +1885,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 	kmemleak_alloc_percpu(ptr, size, gfp);
 
 	trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
-				  chunk->base_addr, off, ptr);
+				  chunk->base_addr, off, ptr,
+				  pcpu_obj_full_size(size), gfp);
 
 	pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);

--
2.31.1



2022-05-09 04:09:36

by Vasily Averin

Subject: Re: [PATCH] percpu: improve percpu_alloc_percpu event trace

On 5/6/22 07:46, Vasily Averin wrote:
> Added bytes_alloc and gfp_flags fields to the output of the
> percpu_alloc_percpu ftrace event. This is required to track
> memcg-accounted percpu allocations.

Perhaps it makes sense to add call_site too...

2022-05-09 05:16:55

by Vasily Averin

Subject: [PATCH v2] percpu: improve percpu_alloc_percpu event trace

Added call_site, bytes_alloc and gfp_flags fields to the output
of the percpu_alloc_percpu ftrace event:

mkdir-4393 [001] 169.334788: percpu_alloc_percpu:
call_site=mem_cgroup_css_alloc+0xa6 reserved=0 is_atomic=0 size=2408 align=8
base_addr=0xffffc7117fc00000 off=402176 ptr=0x3dc867a62300 bytes_alloc=14448
gfp_flags=GFP_KERNEL_ACCOUNT

This is required to track memcg-accounted percpu allocations.
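
For reference, bytes_alloc is the value that pcpu_obj_full_size() returns
for the request. Assuming the example above was taken on a 64-bit kernel
with CONFIG_MEMCG_KMEM enabled and 4 possible CPUs (PCPU_MIN_ALLOC_SIZE
is 4 bytes, sizeof(struct obj_cgroup *) is 8), size=2408 gives:

  bytes_alloc = 2408 * 4 + 2408 / 4 * 8 = 9632 + 4816 = 14448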

Signed-off-by: Vasily Averin <[email protected]>
---
v2: added call_site, improved patch description
---
 include/trace/events/percpu.h | 23 +++++++++++++++++------
 mm/percpu-internal.h          |  8 ++++----
 mm/percpu.c                   |  5 +++--
 3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/include/trace/events/percpu.h b/include/trace/events/percpu.h
index df112a64f6c9..e989cefc0def 100644
--- a/include/trace/events/percpu.h
+++ b/include/trace/events/percpu.h
@@ -6,15 +6,20 @@
 #define _TRACE_PERCPU_H
 
 #include <linux/tracepoint.h>
+#include <trace/events/mmflags.h>
 
 TRACE_EVENT(percpu_alloc_percpu,
 
-	TP_PROTO(bool reserved, bool is_atomic, size_t size,
-		 size_t align, void *base_addr, int off, void __percpu *ptr),
+	TP_PROTO(unsigned long call_site,
+		 bool reserved, bool is_atomic, size_t size,
+		 size_t align, void *base_addr, int off,
+		 void __percpu *ptr, size_t bytes_alloc, gfp_t gfp_flags),
 
-	TP_ARGS(reserved, is_atomic, size, align, base_addr, off, ptr),
+	TP_ARGS(call_site, reserved, is_atomic, size, align, base_addr, off,
+		ptr, bytes_alloc, gfp_flags),
 
 	TP_STRUCT__entry(
+		__field(	unsigned long,		call_site	)
 		__field(	bool,			reserved	)
 		__field(	bool,			is_atomic	)
 		__field(	size_t,			size		)
@@ -22,9 +27,11 @@ TRACE_EVENT(percpu_alloc_percpu,
 		__field(	void *,			base_addr	)
 		__field(	int,			off		)
 		__field(	void __percpu *,	ptr		)
+		__field(	size_t,			bytes_alloc	)
+		__field(	gfp_t,			gfp_flags	)
 	),
-
 	TP_fast_assign(
+		__entry->call_site	= call_site;
 		__entry->reserved	= reserved;
 		__entry->is_atomic	= is_atomic;
 		__entry->size		= size;
@@ -32,12 +39,16 @@ TRACE_EVENT(percpu_alloc_percpu,
 		__entry->base_addr	= base_addr;
 		__entry->off		= off;
 		__entry->ptr		= ptr;
+		__entry->bytes_alloc	= bytes_alloc;
+		__entry->gfp_flags	= gfp_flags;
 	),
 
-	TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu base_addr=%p off=%d ptr=%p",
+	TP_printk("call_site=%pS reserved=%d is_atomic=%d size=%zu align=%zu base_addr=%p off=%d ptr=%p bytes_alloc=%zu gfp_flags=%s",
+		  (void *)__entry->call_site,
 		  __entry->reserved, __entry->is_atomic,
 		  __entry->size, __entry->align,
-		  __entry->base_addr, __entry->off, __entry->ptr)
+		  __entry->base_addr, __entry->off, __entry->ptr,
+		  __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags))
 );

TRACE_EVENT(percpu_free_percpu,
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index 411d1593ef23..70b1ea23f4d2 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -113,7 +113,6 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
 	return pcpu_nr_pages_to_map_bits(chunk->nr_pages);
 }
 
-#ifdef CONFIG_MEMCG_KMEM
 /**
  * pcpu_obj_full_size - helper to calculate size of each accounted object
  * @size: size of area to allocate in bytes
@@ -123,13 +122,14 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
  */
 static inline size_t pcpu_obj_full_size(size_t size)
 {
-	size_t extra_size;
+	size_t extra_size = 0;
 
-	extra_size = size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *);
+#ifdef CONFIG_MEMCG_KMEM
+	extra_size += size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *);
+#endif
 
 	return size * num_possible_cpus() + extra_size;
 }
-#endif /* CONFIG_MEMCG_KMEM */
 
 #ifdef CONFIG_PERCPU_STATS

diff --git a/mm/percpu.c b/mm/percpu.c
index ea28db283044..3633eeefaa0d 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1884,8 +1884,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 	ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
 	kmemleak_alloc_percpu(ptr, size, gfp);
 
-	trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
-				  chunk->base_addr, off, ptr);
+	trace_percpu_alloc_percpu(_RET_IP_, reserved, is_atomic, size, align,
+				  chunk->base_addr, off, ptr,
+				  pcpu_obj_full_size(size), gfp);
 
 	pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);

--
2.31.1


2022-05-11 05:42:26

by Roman Gushchin

Subject: Re: [PATCH v2] percpu: improve percpu_alloc_percpu event trace

On Fri, May 06, 2022 at 10:29:25PM +0300, Vasily Averin wrote:
> Added call_site, bytes_alloc and gfp_flags fields to the output
> of the percpu_alloc_percpu ftrace event:
>
> mkdir-4393 [001] 169.334788: percpu_alloc_percpu:
> call_site=mem_cgroup_css_alloc+0xa6 reserved=0 is_atomic=0 size=2408 align=8
> base_addr=0xffffc7117fc00000 off=402176 ptr=0x3dc867a62300 bytes_alloc=14448
> gfp_flags=GFP_KERNEL_ACCOUNT
>
> This is required to track memcg-accounted percpu allocations.
>
> Signed-off-by: Vasily Averin <[email protected]>

Acked-by: Roman Gushchin <[email protected]>

LGTM, thanks Vasily!

One minor thing below.

> ---
> v2: added call_site, improved patch description
> ---
>  include/trace/events/percpu.h | 23 +++++++++++++++++------
>  mm/percpu-internal.h          |  8 ++++----
>  mm/percpu.c                   |  5 +++--
>  3 files changed, 24 insertions(+), 12 deletions(-)
>
> diff --git a/include/trace/events/percpu.h b/include/trace/events/percpu.h
> index df112a64f6c9..e989cefc0def 100644
> --- a/include/trace/events/percpu.h
> +++ b/include/trace/events/percpu.h
> @@ -6,15 +6,20 @@
>  #define _TRACE_PERCPU_H
>
>  #include <linux/tracepoint.h>
> +#include <trace/events/mmflags.h>
>
>  TRACE_EVENT(percpu_alloc_percpu,
>
> -	TP_PROTO(bool reserved, bool is_atomic, size_t size,
> -		 size_t align, void *base_addr, int off, void __percpu *ptr),
> +	TP_PROTO(unsigned long call_site,
> +		 bool reserved, bool is_atomic, size_t size,
> +		 size_t align, void *base_addr, int off,
> +		 void __percpu *ptr, size_t bytes_alloc, gfp_t gfp_flags),

Don't we want to preserve the order and add the call_site at the end?
Trace events are not ABI, but if we don't have a strong reason to break it,
I'd preserve the old order.

Thanks!

2022-05-11 09:48:17

by Vasily Averin

Subject: Re: [PATCH v2] percpu: improve percpu_alloc_percpu event trace

On 5/11/22 05:33, Roman Gushchin wrote:
> On Fri, May 06, 2022 at 10:29:25PM +0300, Vasily Averin wrote:
>>  TRACE_EVENT(percpu_alloc_percpu,
>>
>> -	TP_PROTO(bool reserved, bool is_atomic, size_t size,
>> -		 size_t align, void *base_addr, int off, void __percpu *ptr),
>> +	TP_PROTO(unsigned long call_site,
>> +		 bool reserved, bool is_atomic, size_t size,
>> +		 size_t align, void *base_addr, int off,
>> +		 void __percpu *ptr, size_t bytes_alloc, gfp_t gfp_flags),
>
> Don't we want to preserve the order and add the call_site at the end?
> Trace events are not ABI, but if we don't have a strong reason to break it,
> I'd preserve the old order.

I checked recent trace patches and found that changing the order is acceptable.

commit 8c39b8bc82aafcc8dd378bd79c76fac8e8a89c8d
Author: David Howells <[email protected]>
Date:   Fri Jan 14 11:44:54 2022 +0000

    cachefiles: Make some tracepoint adjustments

-	TP_printk("o=%08x i=%lx e=%d",
-		  __entry->obj, __entry->ino, __entry->error)
+	TP_printk("o=%08x dB=%lx B=%lx e=%d",
+		  __entry->obj, __entry->dino, __entry->ino, __entry->error)

On the other hand, I agree that the old order should be kept by default;
that's why I added bytes_alloc and gfp_flags at the end of the output.
However, I think call_site is an exception: in all the cases I found,
call_site is output first. For me personally, that simplifies output
parsing.

So I would like to know Steven's position on this question.

Thank you,
Vasily Averin

2022-05-13 10:08:09

by Roman Gushchin

Subject: Re: [PATCH v2] percpu: improve percpu_alloc_percpu event trace

On Wed, May 11, 2022 at 08:11:54AM +0300, Vasily Averin wrote:
> On 5/11/22 05:33, Roman Gushchin wrote:
> > On Fri, May 06, 2022 at 10:29:25PM +0300, Vasily Averin wrote:
> >>  TRACE_EVENT(percpu_alloc_percpu,
> >>
> >> -	TP_PROTO(bool reserved, bool is_atomic, size_t size,
> >> -		 size_t align, void *base_addr, int off, void __percpu *ptr),
> >> +	TP_PROTO(unsigned long call_site,
> >> +		 bool reserved, bool is_atomic, size_t size,
> >> +		 size_t align, void *base_addr, int off,
> >> +		 void __percpu *ptr, size_t bytes_alloc, gfp_t gfp_flags),
> >
> > Don't we want to preserve the order and add the call_site at the end?
> > Trace events are not ABI, but if we don't have a strong reason to break it,
> > I'd preserve the old order.
>
> I checked recent trace patches and found that changing the order is acceptable.
>
> commit 8c39b8bc82aafcc8dd378bd79c76fac8e8a89c8d
> Author: David Howells <[email protected]>
> Date:   Fri Jan 14 11:44:54 2022 +0000
>
>     cachefiles: Make some tracepoint adjustments
>
> -	TP_printk("o=%08x i=%lx e=%d",
> -		  __entry->obj, __entry->ino, __entry->error)
> +	TP_printk("o=%08x dB=%lx B=%lx e=%d",
> +		  __entry->obj, __entry->dino, __entry->ino, __entry->error)
>
> On the other hand, I agree that the old order should be kept by default;
> that's why I added bytes_alloc and gfp_flags at the end of the output.
> However, I think call_site is an exception: in all the cases I found,
> call_site is output first. For me personally, that simplifies output
> parsing.
>
> So I would like to know Steven's position on this question.

Ok, not a strong opinion, I think both options are acceptable.

Thanks!

2022-05-16 10:56:01

by Steven Rostedt

Subject: Re: [PATCH v2] percpu: improve percpu_alloc_percpu event trace

On Tue, 10 May 2022 19:33:17 -0700
Roman Gushchin <[email protected]> wrote:

> > --- a/include/trace/events/percpu.h
> > +++ b/include/trace/events/percpu.h
> > @@ -6,15 +6,20 @@
> >  #define _TRACE_PERCPU_H
> >
> >  #include <linux/tracepoint.h>
> > +#include <trace/events/mmflags.h>
> >
> >  TRACE_EVENT(percpu_alloc_percpu,
> >
> > -	TP_PROTO(bool reserved, bool is_atomic, size_t size,
> > -		 size_t align, void *base_addr, int off, void __percpu *ptr),
> > +	TP_PROTO(unsigned long call_site,
> > +		 bool reserved, bool is_atomic, size_t size,
> > +		 size_t align, void *base_addr, int off,
> > +		 void __percpu *ptr, size_t bytes_alloc, gfp_t gfp_flags),
>
> Don't we want to preserve the order and add the call_site at the end?
> Trace events are not ABI, but if we don't have a strong reason to break it,
> I'd preserve the old order.

Ideally everyone should be using libtraceevent, which will parse the format
file for the needed entries.

Nothing (important) should be parsing the raw ASCII from the trace files.
It's slow and unreliable. The raw format (trace_pipe_raw) files, along with
libtraceevent, will handle finding the fields you are looking for, even if
the fields move around (internally or externally).
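
For instance, a minimal sketch of that approach in C, assuming a tep
handle whose event formats have already been loaded (e.g. with
tracefs_local_events() from libtracefs); the field is looked up by name,
so it is found no matter where it sits in the record:

#include <stdio.h>
#include <event-parse.h>	/* libtraceevent */

/* Print bytes_alloc from one raw percpu_alloc_percpu record. */
static void print_bytes_alloc(struct tep_handle *tep,
			      struct tep_record *record)
{
	struct tep_event *event;
	struct tep_format_field *field;
	unsigned long long val;

	event = tep_find_event_by_name(tep, "percpu", "percpu_alloc_percpu");
	if (!event)
		return;

	/* Lookup is by field name, not by position in the format string. */
	field = tep_find_field(event, "bytes_alloc");
	if (!field)
		return;

	if (tep_read_number_field(field, record->data, &val) == 0)
		printf("bytes_alloc=%llu\n", val);
}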

Then there's trace-cruncher (a Python script that uses libtracefs and
libtraceevent), which will work too.

https://github.com/vmware/trace-cruncher

-- Steve