The default remains 127, which is good for most cases, and not even hit
most of the time, but then for some cases, as reported by Brendan, 1024+
deep frames are appearing on the radar for things like groovy, ruby.
And in some workloads putting a _lower_ cap on this may make sense. A
per-event cap still needs to be put in place, though.
The new file is:
# cat /proc/sys/kernel/perf_event_max_stack
127
Changing it:
# echo 256 > /proc/sys/kernel/perf_event_max_stack
# cat /proc/sys/kernel/perf_event_max_stack
256
But as soon as there is some event using callchains we get:
# echo 512 > /proc/sys/kernel/perf_event_max_stack
-bash: echo: write error: Device or resource busy
#
Because we only allocate the callchain percpu data structures when there
is a user, which allows for changing the max easily; it's just a matter
of having no callchain users at that point.
Reported-by: Brendan Gregg <[email protected]>
Link: http://lkml.kernel.org/n/[email protected]
Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 57653a44b128..260cde08e92e 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -60,6 +60,7 @@ show up in /proc/sys/kernel:
- panic_on_warn
- perf_cpu_time_max_percent
- perf_event_paranoid
+- perf_event_max_stack
- pid_max
- powersave-nap [ PPC only ]
- printk
@@ -654,6 +655,19 @@ users (without CAP_SYS_ADMIN). The default value is 1.
==============================================================
+perf_event_max_stack:
+
+Controls maximum number of stack frames to copy for (attr.sample_type &
+PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using
+'perf record -g' or 'perf trace --call-graph fp'.
+
+This can only be done when no events are in use that have callchains
+enabled, otherwise writing to this file will return -EBUSY.
+
+The default value is 127.
+
+==============================================================
+
pid_max:
PID allocation wrap value. When the kernel's next PID value
diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c
index 4e02ae5950ff..27563befa8a2 100644
--- a/arch/arm/kernel/perf_callchain.c
+++ b/arch/arm/kernel/perf_callchain.c
@@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
tail = (struct frame_tail __user *)regs->ARM_fp - 1;
- while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+ while ((entry->nr < sysctl_perf_event_max_stack) &&
tail && !((unsigned long)tail & 0x3))
tail = user_backtrace(tail, entry);
}
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index ff4665462a02..32c3c6e70119 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
tail = (struct frame_tail __user *)regs->regs[29];
- while (entry->nr < PERF_MAX_STACK_DEPTH &&
+ while (entry->nr < sysctl_perf_event_max_stack &&
tail && !((unsigned long)tail & 0xf))
tail = user_backtrace(tail, entry);
} else {
@@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
tail = (struct compat_frame_tail __user *)regs->compat_fp - 1;
- while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+ while ((entry->nr < sysctl_perf_event_max_stack) &&
tail && !((unsigned long)tail & 0x3))
tail = compat_user_backtrace(tail, entry);
#endif
diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c
index 315633461a94..252abc12a5a3 100644
--- a/arch/metag/kernel/perf_callchain.c
+++ b/arch/metag/kernel/perf_callchain.c
@@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
--frame;
- while ((entry->nr < PERF_MAX_STACK_DEPTH) && frame)
+ while ((entry->nr < sysctl_perf_event_max_stack) && frame)
frame = user_backtrace(frame, entry);
}
diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c
index c1cf9c6c3f77..5021c546ad07 100644
--- a/arch/mips/kernel/perf_event.c
+++ b/arch/mips/kernel/perf_event.c
@@ -35,7 +35,7 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry,
addr = *sp++;
if (__kernel_text_address(addr)) {
perf_callchain_store(entry, addr);
- if (entry->nr >= PERF_MAX_STACK_DEPTH)
+ if (entry->nr >= sysctl_perf_event_max_stack)
break;
}
}
@@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
}
do {
perf_callchain_store(entry, pc);
- if (entry->nr >= PERF_MAX_STACK_DEPTH)
+ if (entry->nr >= sysctl_perf_event_max_stack)
break;
pc = unwind_stack(current, &sp, pc, &ra);
} while (pc);
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index e04a6752b399..22d9015c1acc 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
sp = regs->gpr[1];
perf_callchain_store(entry, next_ip);
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
fp = (unsigned long __user *) sp;
if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
return;
@@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
sp = regs->gpr[1];
perf_callchain_store(entry, next_ip);
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
fp = (unsigned int __user *) (unsigned long) sp;
if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
return;
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 6596f66ce112..a4b8b5aed21c 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
}
}
#endif
- } while (entry->nr < PERF_MAX_STACK_DEPTH);
+ } while (entry->nr < sysctl_perf_event_max_stack);
}
static inline int
@@ -1790,7 +1790,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
pc = sf.callers_pc;
ufp = (unsigned long)sf.fp + STACK_BIAS;
perf_callchain_store(entry, pc);
- } while (entry->nr < PERF_MAX_STACK_DEPTH);
+ } while (entry->nr < sysctl_perf_event_max_stack);
}
static void perf_callchain_user_32(struct perf_callchain_entry *entry,
@@ -1822,7 +1822,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
ufp = (unsigned long)sf.fp;
}
perf_callchain_store(entry, pc);
- } while (entry->nr < PERF_MAX_STACK_DEPTH);
+ } while (entry->nr < sysctl_perf_event_max_stack);
}
void
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 041e442a3e28..41d93d0e972b 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2277,7 +2277,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
fp = compat_ptr(ss_base + regs->bp);
pagefault_disable();
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
unsigned long bytes;
frame.next_frame = 0;
frame.return_address = 0;
@@ -2337,7 +2337,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
return;
pagefault_disable();
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
unsigned long bytes;
frame.next_frame = NULL;
frame.return_address = 0;
diff --git a/arch/xtensa/kernel/perf_event.c b/arch/xtensa/kernel/perf_event.c
index 54f01188c29c..a6b00b3af429 100644
--- a/arch/xtensa/kernel/perf_event.c
+++ b/arch/xtensa/kernel/perf_event.c
@@ -332,14 +332,14 @@ static int callchain_trace(struct stackframe *frame, void *data)
void perf_callchain_kernel(struct perf_callchain_entry *entry,
struct pt_regs *regs)
{
- xtensa_backtrace_kernel(regs, PERF_MAX_STACK_DEPTH,
+ xtensa_backtrace_kernel(regs, sysctl_perf_event_max_stack,
callchain_trace, NULL, entry);
}
void perf_callchain_user(struct perf_callchain_entry *entry,
struct pt_regs *regs)
{
- xtensa_backtrace_user(regs, PERF_MAX_STACK_DEPTH,
+ xtensa_backtrace_user(regs, sysctl_perf_event_max_stack,
callchain_trace, entry);
}
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b8b195fbe787..d599ed32af92 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -58,7 +58,7 @@ struct perf_guest_info_callbacks {
struct perf_callchain_entry {
__u64 nr;
- __u64 ip[PERF_MAX_STACK_DEPTH];
+ __u64 ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
};
struct perf_raw_record {
@@ -983,9 +983,11 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
extern int get_callchain_buffers(void);
extern void put_callchain_buffers(void);
+extern int sysctl_perf_event_max_stack;
+
static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
{
- if (entry->nr < PERF_MAX_STACK_DEPTH) {
+ if (entry->nr < sysctl_perf_event_max_stack) {
entry->ip[entry->nr++] = ip;
return 0;
} else {
@@ -1007,6 +1009,8 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos);
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
static inline bool perf_paranoid_tracepoint_raw(void)
{
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 343c22f5e867..1390747b2195 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -18,6 +18,14 @@ struct callchain_cpus_entries {
struct perf_callchain_entry *cpu_entries[0];
};
+int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
+
+static size_t perf_callchain_entry__sizeof(void)
+{
+ return sizeof(struct perf_callchain_entry) +
+ sizeof(__u64) * sysctl_perf_event_max_stack;
+}
+
static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
static atomic_t nr_callchain_events;
static DEFINE_MUTEX(callchain_mutex);
@@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
if (!entries)
return -ENOMEM;
- size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+ size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
for_each_possible_cpu(cpu) {
entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
@@ -215,3 +223,25 @@ exit_put:
return entry;
}
+
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int new_value = sysctl_perf_event_max_stack, ret;
+ struct ctl_table new_table = *table;
+
+ new_table.data = &new_value;
+ ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos);
+ if (ret || !write)
+ return ret;
+
+ mutex_lock(&callchain_mutex);
+ if (atomic_read(&nr_callchain_events))
+ ret = -EBUSY;
+ else
+ sysctl_perf_event_max_stack = new_value;
+
+ mutex_unlock(&callchain_mutex);
+
+ return ret;
+}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 725587f10667..afbe9fb32ec5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1144,6 +1144,14 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one_hundred,
},
+ {
+ .procname = "perf_event_max_stack",
+ .data = NULL, /* filled in by handler */
+ .maxlen = sizeof(sysctl_perf_event_max_stack),
+ .mode = 0644,
+ .proc_handler = perf_event_max_stack_handler,
+ .extra1 = &zero,
+ },
#endif
#ifdef CONFIG_KMEMCHECK
{
On Wed, Apr 20, 2016 at 07:47:30PM -0300, Arnaldo Carvalho de Melo wrote:
> The default remains 127, which is good for most cases, and not even hit
> most of the time, but then for some cases, as reported by Brendan, 1024+
> deep frames are appearing on the radar for things like groovy, ruby.
>
> And in some workloads putting a _lower_ cap on this may make sense. A
> per-event cap still needs to be put in place, though.
>
> The new file is:
>
> # cat /proc/sys/kernel/perf_event_max_stack
> 127
>
> Changing it:
>
> # echo 256 > /proc/sys/kernel/perf_event_max_stack
> # cat /proc/sys/kernel/perf_event_max_stack
> 256
>
> But as soon as there is some event using callchains we get:
>
> # echo 512 > /proc/sys/kernel/perf_event_max_stack
> -bash: echo: write error: Device or resource busy
> #
>
> Because we only allocate the callchain percpu data structures when there
> is a user, which allows for changing the max easily; it's just a matter
> of having no callchain users at that point.
>
> Reported-by: Brendan Gregg <[email protected]>
> Link: http://lkml.kernel.org/n/[email protected]
> Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
Nice. I like it. That's a great approach to a hard problem.
Java guys will be happy too.
Please also adjust two places in kernel/bpf/stackmap.c
> + {
> + .procname = "perf_event_max_stack",
> + .data = NULL, /* filled in by handler */
> + .maxlen = sizeof(sysctl_perf_event_max_stack),
> + .mode = 0644,
> + .proc_handler = perf_event_max_stack_handler,
> + .extra1 = &zero,
zero seems to be the wrong minimum. I think it should be at least 2 to
fit user/kernel tags?
Probably needs to define a max as well.
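Something like this, as a sketch (the bound values here are made up for
illustration, they are not from the patch):

	static int perf_event_max_stack_min = 2;	/* user + kernel context tags */
	static int perf_event_max_stack_max = 32 * 1024;	/* arbitrary example cap */

	{
		.procname	= "perf_event_max_stack",
		.data		= NULL, /* filled in by handler */
		.maxlen		= sizeof(sysctl_perf_event_max_stack),
		.mode		= 0644,
		.proc_handler	= perf_event_max_stack_handler,
		.extra1		= &perf_event_max_stack_min,
		.extra2		= &perf_event_max_stack_max,
	},

Since the handler copies the whole ctl_table and calls
proc_dointvec_minmax(), which rejects writes outside [extra1, extra2],
the handler itself wouldn't need to change.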
On 4/20/16 4:47 PM, Arnaldo Carvalho de Melo wrote:
> The new file is:
>
> # cat /proc/sys/kernel/perf_event_max_stack
> 127
>
> Changing it:
>
> # echo 256 > /proc/sys/kernel/perf_event_max_stack
> # cat /proc/sys/kernel/perf_event_max_stack
> 256
>
> But as soon as there is some event using callchains we get:
>
> # echo 512 > /proc/sys/kernel/perf_event_max_stack
> -bash: echo: write error: Device or resource busy
> #
>
> Because we only allocate the callchain percpu data structures when there
> is a user, which allows for changing the max easily; it's just a matter
> of having no callchain users at that point.
>
> Reported-by: Brendan Gregg <[email protected]>
> Link: http://lkml.kernel.org/n/[email protected]
> Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
>
I would love to see something like this go in. Right now I have to
recompile the kernel because I want a lower max count.
In the past we talked about making this part of the attribute with
separate controls for both kernel stack and userspace stack. Have you
given up on that option?
On Wed, Apr 20, 2016 at 05:10:17PM -0600, David Ahern wrote:
> On 4/20/16 4:47 PM, Arnaldo Carvalho de Melo wrote:
> > # echo 256 > /proc/sys/kernel/perf_event_max_stack
> > # cat /proc/sys/kernel/perf_event_max_stack
> > 256
> I would love to see something like this go in. Right now I have to recompile
> the kernel because I want a lower max count.
> In the past we talked about making this part of the attribute with
> separate controls for both kernel stack and userspace stack. Have you given
> up on that option?
Nope, I'll try to work on that too; I even mentioned it above, in the part
"a per-event cap still needs to be put in place".
- Arnaldo
On Wed, Apr 20, 2016 at 07:47:30PM -0300, Arnaldo Carvalho de Melo wrote:
> The default remains 127, which is good for most cases, and not even hit
> most of the time, but then for some cases, as reported by Brendan, 1024+
> deep frames are appearing on the radar for things like groovy, ruby.
yea gawds ;-)
> index 343c22f5e867..1390747b2195 100644
> --- a/kernel/events/callchain.c
> +++ b/kernel/events/callchain.c
> @@ -18,6 +18,14 @@ struct callchain_cpus_entries {
> struct perf_callchain_entry *cpu_entries[0];
> };
>
> +int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
> +
> +static size_t perf_callchain_entry__sizeof(void)
> +{
> + return sizeof(struct perf_callchain_entry) +
> + sizeof(__u64) * sysctl_perf_event_max_stack;
> +}
> +
> static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
> static atomic_t nr_callchain_events;
> static DEFINE_MUTEX(callchain_mutex);
> @@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
> if (!entries)
> return -ENOMEM;
>
> - size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
> + size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
>
> for_each_possible_cpu(cpu) {
> entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
And this alloc _will_ fail if you put in a decent sized value..
Should we put in a dmesg WARN if this alloc fails and
perf_event_max_stack is 'large' ?
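Something along these lines, as a sketch (the message text is
illustrative, and it assumes the existing fail path in
alloc_callchain_buffers()):

	for_each_possible_cpu(cpu) {
		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
							 cpu_to_node(cpu));
		if (!entries->cpu_entries[cpu]) {
			WARN_ONCE(1, "perf: failed to allocate %zu bytes for callchain buffers, consider lowering /proc/sys/kernel/perf_event_max_stack\n",
				  size);
			goto fail;
		}
	}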
> @@ -215,3 +223,25 @@ exit_put:
>
> return entry;
> }
> +
> +int perf_event_max_stack_handler(struct ctl_table *table, int write,
> + void __user *buffer, size_t *lenp, loff_t *ppos)
> +{
> + int new_value = sysctl_perf_event_max_stack, ret;
> + struct ctl_table new_table = *table;
> +
> + new_table.data = &new_value;
cute :-)
> + ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos);
> + if (ret || !write)
> + return ret;
> +
> + mutex_lock(&callchain_mutex);
> + if (atomic_read(&nr_callchain_events))
> + ret = -EBUSY;
> + else
> + sysctl_perf_event_max_stack = new_value;
> +
> + mutex_unlock(&callchain_mutex);
> +
> + return ret;
> +}
On Thu, Apr 21, 2016 at 12:48:58PM +0200, Peter Zijlstra wrote:
> On Wed, Apr 20, 2016 at 07:47:30PM -0300, Arnaldo Carvalho de Melo wrote:
> > The default remains 127, which is good for most cases, and not even hit
> > most of the time, but then for some cases, as reported by Brendan, 1024+
> > deep frames are appearing on the radar for things like groovy, ruby.
> yea gawds ;-)
> > +++ b/kernel/events/callchain.c
> > @@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
> > if (!entries)
> > return -ENOMEM;
> >
> > - size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
> > + size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
> >
> > for_each_possible_cpu(cpu) {
> > entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
>
> And this alloc _will_ fail if you put in a decent sized value..
>
> Should we put in a dmesg WARN if this alloc fails and
> perf_event_max_stack is 'large' ?
Unsure, it already returns -ENOMEM, see, some lines above, i.e. it
better have error handling up this, ho-hum, call chain, I'm checking...
> > @@ -215,3 +223,25 @@ exit_put:
> >
> > return entry;
> > }
> > +
> > +int perf_event_max_stack_handler(struct ctl_table *table, int write,
> > + void __user *buffer, size_t *lenp, loff_t *ppos)
> > +{
> > + int new_value = sysctl_perf_event_max_stack, ret;
> > + struct ctl_table new_table = *table;
> > +
> > + new_table.data = &new_value;
> cute :-)
Hey, I found it on sysctl_schedstats() and sysctl_numa_balancing(), as a
way to read that value but only make it take effect if some condition
was true (nr_callchain_events == 0 in this case), granted, could be
better, less clever, but I leave this for later ;-)
> > + ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos);
> > + if (ret || !write)
> > + return ret;
> > +
> > + mutex_lock(&callchain_mutex);
> > + if (atomic_read(&nr_callchain_events))
> > + ret = -EBUSY;
> > + else
> > + sysctl_perf_event_max_stack = new_value;
> > +
> > + mutex_unlock(&callchain_mutex);
> > +
> > + return ret;
> > +}
On Thu, Apr 21, 2016 at 12:17:07PM -0300, Arnaldo Carvalho de Melo wrote:
> On Thu, Apr 21, 2016 at 12:48:58PM +0200, Peter Zijlstra wrote:
> > On Wed, Apr 20, 2016 at 07:47:30PM -0300, Arnaldo Carvalho de Melo wrote:
> > > +++ b/kernel/events/callchain.c
>
> > > @@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
> > > if (!entries)
> > > return -ENOMEM;
> > > - size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
> > > + size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
> > > for_each_possible_cpu(cpu) {
> > > entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
> > And this alloc _will_ fail if you put in a decent sized value..
> > Should we put in a dmesg WARN if this alloc fails and
> > perf_event_max_stack is 'large' ?
>
> Unsure, it already returns -ENOMEM, see, some lines above, i.e. it
> better have error handling up this, ho-hum, call chain, I'm checking...
So, checking it using the tools themselves, since this fits the bill:
[root@jouet acme]# perf probe get_callchain_buffers
Added new event:
probe:get_callchain_buffers (on get_callchain_buffers)
You can now use it in all perf tools, such as:
perf record -e probe:get_callchain_buffers -aR sleep 1
[root@jouet acme]# perf trace -e perf_event_open --event probe:get_callchain_buffers/call-graph=fp/ -- perf record -g usleep 1
[ perf record: Woken up 1 times to write data ]
26.740 ( 0.008 ms): perf/1264 perf_event_open(attr_uptr: 0x1e6b618, pid: 1265, group_fd: -1, flags: FD_CLOEXEC) ...
26.740 ( ): probe:get_callchain_buffers:(ffffffff811a9480))
get_callchain_buffers+0xfe200001 ([kernel.kallsyms])
SYSC_perf_event_open+0xfe2003bc ([kernel.kallsyms])
sys_perf_event_open+0xfe200009 ([kernel.kallsyms])
do_syscall_64+0xfe200062 ([kernel.kallsyms])
return_from_SYSCALL_64+0xfe200000 ([kernel.kallsyms])
syscall+0xffff0179d3c22019 (/usr/lib64/libc-2.22.so)
cmd_record+0xffffffffff8005fc (/home/acme/bin/perf)
run_builtin+0xffffffffff800061 (/home/acme/bin/perf)
main+0xffffffffff800672 (/home/acme/bin/perf)
__libc_start_main+0xffff0179d3c220f0 (/usr/lib64/libc-2.22.so)
26.749 ( 0.016 ms): perf/1264 ... [continued]: perf_event_open()) = 4
<SNIP the perf_event_open syscalls for the other 3 CPUs in this system>
So, if we put some too big value there and the allocation fails, people asking
for callchains will get that -ENOMEM back in their face.
I'll do this test, i.e. put some huge value there and ask for callchains...
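Something like this (hypothetical value, and the outcome described is my
expectation, to be confirmed):

# echo 100000000 > /proc/sys/kernel/perf_event_max_stack
# perf record -g usleep 1

With that value each per-CPU buffer would be about 3.2 GB (8 bytes *
100000000 frames * PERF_NR_CONTEXTS (4)), so the kmalloc_node() should
fail and whoever asked for callchains should get that -ENOMEM back.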
- Arnaldo
On Wed, Apr 20, 2016 at 04:04:12PM -0700, Alexei Starovoitov wrote:
> On Wed, Apr 20, 2016 at 07:47:30PM -0300, Arnaldo Carvalho de Melo wrote:
> Nice. I like it. That's a great approach to a hard problem.
> Java guys will be happy too.
> Please also adjust two places in kernel/bpf/stackmap.c
> > + {
> > + .procname = "perf_event_max_stack",
> > + .data = NULL, /* filled in by handler */
> > + .maxlen = sizeof(sysctl_perf_event_max_stack),
> > + .mode = 0644,
> > + .proc_handler = perf_event_max_stack_handler,
> > + .extra1 = &zero,
> zero seems to be the wrong minimum. I think it should be at least 2 to
> to fit user/kernel tags ? Probably needs to define max as well.
So, if someone asks for zero, it will not copy anything, but then, this
would be what the user had asked for :-)
Ditto for the max, if someone asks for too big a callchain, then when
allocating it, it will fail and no callchain will be produced; that, or it
will be able to allocate but will take too long copying that many
addresses, and we would be prevented from doing so by some other
protection, iirc there is perf_cpu_time_max_percent, and then buffer
space will run out.
So I think that leaving it as is is enough, no?
Can I keep your Acked-by? David, can I keep yours?
I'll improve tools/perf to look at this file and then push this to Ingo,
if nobody hollers.
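Probably something like this minimal sketch on the tooling side (the
helper name and fallback behaviour are hypothetical, just to show the
idea):

	#include <stdio.h>

	/* read the current kernel max, falling back if the file is absent */
	static int perf_event_max_stack(int fallback)
	{
		FILE *fp = fopen("/proc/sys/kernel/perf_event_max_stack", "r");
		int max = fallback;

		if (fp) {
			if (fscanf(fp, "%d", &max) != 1)
				max = fallback;
			fclose(fp);
		}
		return max;
	}

i.e. callers would use perf_event_max_stack(PERF_MAX_STACK_DEPTH) instead
of the constant.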
Peter, I think my response about reporting allocation failure was
enough, right?
Brendan tested it and it solved the problem he reported; the callchains
he got really are that big, and now he has good-looking flamegraphs, yay!
- Arnaldo
Here is V2, with stackmap.c adjusted, only build-tested.
commit 7c24bb249e392e3dd7dd71a26277c64e313bab4e
Author: Arnaldo Carvalho de Melo <[email protected]>
Date: Thu Apr 21 12:28:50 2016 -0300
perf core: Allow setting up max frame stack depth via sysctl
The default remains 127, which is good for most cases, and not even hit
most of the time, but then for some cases, as reported by Brendan, 1024+
deep frames are appearing on the radar for things like groovy, ruby.
And in some workloads putting a _lower_ cap on this may make sense. A
per-event cap still needs to be put in place, though.
The new file is:
# cat /proc/sys/kernel/perf_event_max_stack
127
Changing it:
# echo 256 > /proc/sys/kernel/perf_event_max_stack
# cat /proc/sys/kernel/perf_event_max_stack
256
But as soon as there is some event using callchains we get:
# echo 512 > /proc/sys/kernel/perf_event_max_stack
-bash: echo: write error: Device or resource busy
#
Because we only allocate the callchain percpu data structures when there
is a user, which allows for changing the max easily; it's just a matter
of having no callchain users at that point.
Reported-and-Tested-by: Brendan Gregg <[email protected]>
Acked-by: Alexei Starovoitov <[email protected]>
Acked-by: David Ahern <[email protected]>
Cc: Adrian Hunter <[email protected]>
Cc: Alexander Shishkin <[email protected]>
Cc: He Kuang <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Masami Hiramatsu <[email protected]>
Cc: Milian Wolff <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Stephane Eranian <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Vince Weaver <[email protected]>
Cc: Wang Nan <[email protected]>
Cc: Zefan Li <[email protected]>
Link: http://lkml.kernel.org/n/[email protected]
Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 57653a44b128..260cde08e92e 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -60,6 +60,7 @@ show up in /proc/sys/kernel:
- panic_on_warn
- perf_cpu_time_max_percent
- perf_event_paranoid
+- perf_event_max_stack
- pid_max
- powersave-nap [ PPC only ]
- printk
@@ -654,6 +655,19 @@ users (without CAP_SYS_ADMIN). The default value is 1.
==============================================================
+perf_event_max_stack:
+
+Controls maximum number of stack frames to copy for (attr.sample_type &
+PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using
+'perf record -g' or 'perf trace --call-graph fp'.
+
+This can only be done when no events are in use that have callchains
+enabled, otherwise writing to this file will return -EBUSY.
+
+The default value is 127.
+
+==============================================================
+
pid_max:
PID allocation wrap value. When the kernel's next PID value
diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c
index 4e02ae5950ff..27563befa8a2 100644
--- a/arch/arm/kernel/perf_callchain.c
+++ b/arch/arm/kernel/perf_callchain.c
@@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
tail = (struct frame_tail __user *)regs->ARM_fp - 1;
- while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+ while ((entry->nr < sysctl_perf_event_max_stack) &&
tail && !((unsigned long)tail & 0x3))
tail = user_backtrace(tail, entry);
}
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index ff4665462a02..32c3c6e70119 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
tail = (struct frame_tail __user *)regs->regs[29];
- while (entry->nr < PERF_MAX_STACK_DEPTH &&
+ while (entry->nr < sysctl_perf_event_max_stack &&
tail && !((unsigned long)tail & 0xf))
tail = user_backtrace(tail, entry);
} else {
@@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
tail = (struct compat_frame_tail __user *)regs->compat_fp - 1;
- while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+ while ((entry->nr < sysctl_perf_event_max_stack) &&
tail && !((unsigned long)tail & 0x3))
tail = compat_user_backtrace(tail, entry);
#endif
diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c
index 315633461a94..252abc12a5a3 100644
--- a/arch/metag/kernel/perf_callchain.c
+++ b/arch/metag/kernel/perf_callchain.c
@@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
--frame;
- while ((entry->nr < PERF_MAX_STACK_DEPTH) && frame)
+ while ((entry->nr < sysctl_perf_event_max_stack) && frame)
frame = user_backtrace(frame, entry);
}
diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c
index c1cf9c6c3f77..5021c546ad07 100644
--- a/arch/mips/kernel/perf_event.c
+++ b/arch/mips/kernel/perf_event.c
@@ -35,7 +35,7 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry,
addr = *sp++;
if (__kernel_text_address(addr)) {
perf_callchain_store(entry, addr);
- if (entry->nr >= PERF_MAX_STACK_DEPTH)
+ if (entry->nr >= sysctl_perf_event_max_stack)
break;
}
}
@@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
}
do {
perf_callchain_store(entry, pc);
- if (entry->nr >= PERF_MAX_STACK_DEPTH)
+ if (entry->nr >= sysctl_perf_event_max_stack)
break;
pc = unwind_stack(current, &sp, pc, &ra);
} while (pc);
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index e04a6752b399..22d9015c1acc 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
sp = regs->gpr[1];
perf_callchain_store(entry, next_ip);
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
fp = (unsigned long __user *) sp;
if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
return;
@@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
sp = regs->gpr[1];
perf_callchain_store(entry, next_ip);
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
fp = (unsigned int __user *) (unsigned long) sp;
if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
return;
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 6596f66ce112..a4b8b5aed21c 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
}
}
#endif
- } while (entry->nr < PERF_MAX_STACK_DEPTH);
+ } while (entry->nr < sysctl_perf_event_max_stack);
}
static inline int
@@ -1790,7 +1790,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
pc = sf.callers_pc;
ufp = (unsigned long)sf.fp + STACK_BIAS;
perf_callchain_store(entry, pc);
- } while (entry->nr < PERF_MAX_STACK_DEPTH);
+ } while (entry->nr < sysctl_perf_event_max_stack);
}
static void perf_callchain_user_32(struct perf_callchain_entry *entry,
@@ -1822,7 +1822,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
ufp = (unsigned long)sf.fp;
}
perf_callchain_store(entry, pc);
- } while (entry->nr < PERF_MAX_STACK_DEPTH);
+ } while (entry->nr < sysctl_perf_event_max_stack);
}
void
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 041e442a3e28..41d93d0e972b 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2277,7 +2277,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
fp = compat_ptr(ss_base + regs->bp);
pagefault_disable();
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
unsigned long bytes;
frame.next_frame = 0;
frame.return_address = 0;
@@ -2337,7 +2337,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
return;
pagefault_disable();
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
unsigned long bytes;
frame.next_frame = NULL;
frame.return_address = 0;
diff --git a/arch/xtensa/kernel/perf_event.c b/arch/xtensa/kernel/perf_event.c
index 54f01188c29c..a6b00b3af429 100644
--- a/arch/xtensa/kernel/perf_event.c
+++ b/arch/xtensa/kernel/perf_event.c
@@ -332,14 +332,14 @@ static int callchain_trace(struct stackframe *frame, void *data)
void perf_callchain_kernel(struct perf_callchain_entry *entry,
struct pt_regs *regs)
{
- xtensa_backtrace_kernel(regs, PERF_MAX_STACK_DEPTH,
+ xtensa_backtrace_kernel(regs, sysctl_perf_event_max_stack,
callchain_trace, NULL, entry);
}
void perf_callchain_user(struct perf_callchain_entry *entry,
struct pt_regs *regs)
{
- xtensa_backtrace_user(regs, PERF_MAX_STACK_DEPTH,
+ xtensa_backtrace_user(regs, sysctl_perf_event_max_stack,
callchain_trace, entry);
}
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b8b195fbe787..d599ed32af92 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -58,7 +58,7 @@ struct perf_guest_info_callbacks {
struct perf_callchain_entry {
__u64 nr;
- __u64 ip[PERF_MAX_STACK_DEPTH];
+ __u64 ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
};
struct perf_raw_record {
@@ -983,9 +983,11 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
extern int get_callchain_buffers(void);
extern void put_callchain_buffers(void);
+extern int sysctl_perf_event_max_stack;
+
static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
{
- if (entry->nr < PERF_MAX_STACK_DEPTH) {
+ if (entry->nr < sysctl_perf_event_max_stack) {
entry->ip[entry->nr++] = ip;
return 0;
} else {
@@ -1007,6 +1009,8 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos);
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
static inline bool perf_paranoid_tracepoint_raw(void)
{
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 499d9e933f8e..f5a19548be12 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -66,7 +66,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
value_size < 8 || value_size % 8 ||
- value_size / 8 > PERF_MAX_STACK_DEPTH)
+ value_size / 8 > sysctl_perf_event_max_stack)
return ERR_PTR(-EINVAL);
/* hash table size must be power of 2 */
@@ -124,8 +124,8 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
struct perf_callchain_entry *trace;
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
u32 max_depth = map->value_size / 8;
- /* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */
- u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth;
+ /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
+ u32 init_nr = sysctl_perf_event_max_stack - max_depth;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
u32 hash, id, trace_nr, trace_len;
bool user = flags & BPF_F_USER_STACK;
@@ -143,7 +143,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
return -EFAULT;
/* get_perf_callchain() guarantees that trace->nr >= init_nr
- * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth
+ * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
*/
trace_nr = trace->nr - init_nr;
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 343c22f5e867..6fe77349fa9d 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -18,6 +18,14 @@ struct callchain_cpus_entries {
struct perf_callchain_entry *cpu_entries[0];
};
+int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
+
+static size_t perf_callchain_entry__sizeof(void)
+{
+ return sizeof(struct perf_callchain_entry) +
+ sizeof(__u64) * sysctl_perf_event_max_stack;
+}
+
static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
static atomic_t nr_callchain_events;
static DEFINE_MUTEX(callchain_mutex);
@@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
if (!entries)
return -ENOMEM;
- size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+ size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
for_each_possible_cpu(cpu) {
entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
@@ -215,3 +223,25 @@ exit_put:
return entry;
}
+
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int new_value = sysctl_perf_event_max_stack, ret;
+ struct ctl_table new_table = *table;
+
+ new_table.data = &new_value;
+ ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos);
+ if (ret || !write)
+ return ret;
+
+ mutex_lock(&callchain_mutex);
+ if (atomic_read(&nr_callchain_events))
+ ret = -EBUSY;
+ else
+ sysctl_perf_event_max_stack = new_value;
+
+ mutex_unlock(&callchain_mutex);
+
+ return ret;
+}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 725587f10667..afbe9fb32ec5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1144,6 +1144,14 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one_hundred,
},
+ {
+ .procname = "perf_event_max_stack",
+ .data = NULL, /* filled in by handler */
+ .maxlen = sizeof(sysctl_perf_event_max_stack),
+ .mode = 0644,
+ .proc_handler = perf_event_max_stack_handler,
+ .extra1 = &zero,
+ },
#endif
#ifdef CONFIG_KMEMCHECK
{
On Fri, Apr 22, 2016 at 05:52:32PM -0300, Arnaldo Carvalho de Melo wrote:
> On Wed, Apr 20, 2016 at 04:04:12PM -0700, Alexei Starovoitov wrote:
> > On Wed, Apr 20, 2016 at 07:47:30PM -0300, Arnaldo Carvalho de Melo wrote:
>
> > Nice. I like it. That's a great approach to a hard problem.
> > Java guys will be happy too.
> > Please also adjust two places in kernel/bpf/stackmap.c
>
> > > + {
> > > + .procname = "perf_event_max_stack",
> > > + .data = NULL, /* filled in by handler */
> > > + .maxlen = sizeof(sysctl_perf_event_max_stack),
> > > + .mode = 0644,
> > > + .proc_handler = perf_event_max_stack_handler,
> > > + .extra1 = &zero,
>
> zero seems to be the wrong minimum. I think it should be at least 2 to
> fit user/kernel tags? Probably needs to define a max as well.
>
> So, if someone asks for zero, it will not copy anything, but then, this
> would be what the user had asked for :-)
>
> Ditto for the max, if someone asks for too big a callchain, then when
> allocating it, it will fail and no callchain will be produced; that, or it
> will be able to allocate but will take too long copying that many
> addresses, and we would be prevented from doing so by some other
> protection, iirc there is perf_cpu_time_max_percent, and then buffer
> space will run out.
>
> So I think that leaving it as is is enough, no?
>
> Can I keep your Acked-by? David, can I keep yours?
I'm still a bit concerned with perf_event_max_stack==0, since it
means that alloc_callchain_buffers() will be failing, so
perf_event_open() will also fail with ENOMEM, which will be hard
for any user to understand; it's also not clear whether perf user
space can print any hints, since such an errno is ambiguous.
Also I'm concerned that a very large perf_event_max_stack can
cause integer overflow.
So I still think we have to set a reasonable min and max.
Other than that it's an ack.
On 4/22/16 2:52 PM, Arnaldo Carvalho de Melo wrote:
> On Wed, Apr 20, 2016 at 04:04:12PM -0700, Alexei Starovoitov wrote:
>> On Wed, Apr 20, 2016 at 07:47:30PM -0300, Arnaldo Carvalho de Melo wrote:
>
>> Nice. I like it. That's a great approach to a hard problem.
>> Java guys will be happy too.
>> Please also adjust two places in kernel/bpf/stackmap.c
>
>>> + {
>>> + .procname = "perf_event_max_stack",
>>> + .data = NULL, /* filled in by handler */
>>> + .maxlen = sizeof(sysctl_perf_event_max_stack),
>>> + .mode = 0644,
>>> + .proc_handler = perf_event_max_stack_handler,
>>> + .extra1 = &zero,
>
>> zero seems to be the wrong minimum. I think it should be at least 2 to
>> fit user/kernel tags? Probably needs to define a max as well.
>
> So, if someone asks for zero, it will not copy anything, but then, this
> would be what the user had asked for :-)
>
> Ditto for the max, if someone asks for too big a callchain, then when
> allocating it, it will fail and no callchain will be produced; that, or it
> will be able to allocate but will take too long copying that many
> addresses, and we would be prevented from doing so by some other
> protection, iirc there is perf_cpu_time_max_percent, and then buffer
> space will run out.
>
> So I think that leaving it as is is enough, no?
>
> Can I keep your Acked-by? David, can I keep yours?
Yes
> diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
> index 343c22f5e867..6fe77349fa9d 100644
> --- a/kernel/events/callchain.c
> +++ b/kernel/events/callchain.c
> @@ -18,6 +18,14 @@ struct callchain_cpus_entries {
> struct perf_callchain_entry *cpu_entries[0];
> };
>
> +int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
> +
> +static size_t perf_callchain_entry__sizeof(void)
> +{
> + return sizeof(struct perf_callchain_entry) +
> + sizeof(__u64) * sysctl_perf_event_max_stack;
> +}
> +
To Alexei's comment, a max_stack of 0 still has a non-zero alloc size, so
that should be ok.
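For reference, assuming the flexible-array layout above: even with
max_stack == 0, perf_callchain_entry__sizeof() is still sizeof(struct
perf_callchain_entry) == 8 bytes, so the per-CPU allocation is
8 * PERF_NR_CONTEXTS == 32 bytes, which kmalloc_node() will happily
satisfy.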
> static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
> static atomic_t nr_callchain_events;
> static DEFINE_MUTEX(callchain_mutex);
> @@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
> if (!entries)
> return -ENOMEM;
>
> - size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
> + size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
>
> for_each_possible_cpu(cpu) {
> entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
On Fri, Apr 22, 2016 at 04:05:31PM -0600, David Ahern wrote:
> On 4/22/16 2:52 PM, Arnaldo Carvalho de Melo wrote:
> >On Wed, Apr 20, 2016 at 04:04:12PM -0700, Alexei Starovoitov wrote:
> >>On Wed, Apr 20, 2016 at 07:47:30PM -0300, Arnaldo Carvalho de Melo wrote:
> >
> >>Nice. I like it. That's a great approach to a hard problem.
> >>Java guys will be happy too.
> >>Please also adjust two places in kernel/bpf/stackmap.c
> >
> >>>+ {
> >>>+ .procname = "perf_event_max_stack",
> >>>+ .data = NULL, /* filled in by handler */
> >>>+ .maxlen = sizeof(sysctl_perf_event_max_stack),
> >>>+ .mode = 0644,
> >>>+ .proc_handler = perf_event_max_stack_handler,
> >>>+ .extra1 = &zero,
> >
> >>zero seems to be the wrong minimum. I think it should be at least 2 to
> >>fit user/kernel tags? Probably needs to define a max as well.
> >
> >So, if someone asks for zero, it will not copy anything, but then, this
> >would be what the user had asked for :-)
> >
> >Ditto for the max, if someone asks for too big a callchain, then when
> >allocating it, it will fail and no callchain will be produced; that, or it
> >will be able to allocate but will take too long copying that many
> >addresses, and we would be prevented from doing so by some other
> >protection, iirc there is perf_cpu_time_max_percent, and then buffer
> >space will run out.
> >
> >So I think that leaving it as is is enough, no?
> >
> >Can I keep your Acked-by? David, can I keep yours?
>
> Yes
>
>
> >diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
> >index 343c22f5e867..6fe77349fa9d 100644
> >--- a/kernel/events/callchain.c
> >+++ b/kernel/events/callchain.c
> >@@ -18,6 +18,14 @@ struct callchain_cpus_entries {
> > struct perf_callchain_entry *cpu_entries[0];
> > };
> >
> >+int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
> >+
> >+static size_t perf_callchain_entry__sizeof(void)
> >+{
> >+ return sizeof(struct perf_callchain_entry) +
> >+ sizeof(__u64) * sysctl_perf_event_max_stack;
> >+}
> >+
>
> To Alexei's comment, a max_stack of 0 still has a non-zero alloc size so
> that should be ok.
>
>
> > static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
> > static atomic_t nr_callchain_events;
> > static DEFINE_MUTEX(callchain_mutex);
> >@@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
> > if (!entries)
> > return -ENOMEM;
> >
> >- size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
> >+ size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
> >
> > for_each_possible_cpu(cpu) {
> > entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
right... and looking into it further, realized that the patch is broken,
since get_callchain_entry() is doing:
return &entries->cpu_entries[cpu][*rctx];
whereas it should be dynamic offset based on sysctl_perf_event_max_stack*8
So definitely needs another respin.
On Fri, Apr 22, 2016 at 03:18:08PM -0700, Alexei Starovoitov wrote:
> On Fri, Apr 22, 2016 at 04:05:31PM -0600, David Ahern wrote:
> > On 4/22/16 2:52 PM, Arnaldo Carvalho de Melo wrote:
> > >+int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
> > >+
> > >+static size_t perf_callchain_entry__sizeof(void)
> > >+{
> > >+ return sizeof(struct perf_callchain_entry) +
> > >+ sizeof(__u64) * sysctl_perf_event_max_stack;
> > >+}
> > >+
> > To Alexei's comment, a max_stack of 0 still has a non-zero alloc size so
> > that should be ok.
> > > static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
> > > static atomic_t nr_callchain_events;
> > > static DEFINE_MUTEX(callchain_mutex);
> > >@@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
> > > if (!entries)
> > > return -ENOMEM;
> > >- size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
> > >+ size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
> > > for_each_possible_cpu(cpu) {
> > > entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
> right... and looking into it further, realized that the patch is broken,
> since get_callchain_entry() is doing:
> return &entries->cpu_entries[cpu][*rctx];
> whereas it should be dynamic offset based on sysctl_perf_event_max_stack*8
> So definitely needs another respin.
Huh? Can you elaborate a bit more?
Are you saying this is a bug introduced by this patch or something
pre-existing?
- Arnaldo
On Mon, Apr 25, 2016 at 01:14:25PM -0300, Arnaldo Carvalho de Melo wrote:
> On Fri, Apr 22, 2016 at 03:18:08PM -0700, Alexei Starovoitov wrote:
> > On Fri, Apr 22, 2016 at 04:05:31PM -0600, David Ahern wrote:
> > > On 4/22/16 2:52 PM, Arnaldo Carvalho de Melo wrote:
> > > >+int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
> > > >+
> > > >+static size_t perf_callchain_entry__sizeof(void)
> > > >+{
> > > >+ return sizeof(struct perf_callchain_entry) +
> > > >+ sizeof(__u64) * sysctl_perf_event_max_stack;
> > > >+}
> > > >+
>
> > > To Alexei's comment, a max_stack of 0 still has a non-zero alloc size so
> > > that should be ok.
>
> > > > static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
> > > > static atomic_t nr_callchain_events;
> > > > static DEFINE_MUTEX(callchain_mutex);
> > > >@@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
> > > > if (!entries)
> > > > return -ENOMEM;
>
> > > >- size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
> > > >+ size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
>
> > > > for_each_possible_cpu(cpu) {
> > > > entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
>
> > right... and looking into it further, realized that the patch is broken,
> > since get_callchain_entry() is doing:
> > return &entries->cpu_entries[cpu][*rctx];
> > whereas it should be dynamic offset based on sysctl_perf_event_max_stack*8
> > So definitely needs another respin.
>
> Huh? Can you elaborate a bit more?
>
> Are you saying this is a bug introduced by this patch or something
> pre-existing?
When we alloc, we alloc:
/*
* Number of contexts where an event can trigger:
* task, softirq, hardirq, nmi.
*/
#define PERF_NR_CONTEXTS 4
callchain_cpus_entries[0 .. (nr_cpu_ids - 1)][in_nmi() (3)]
callchain_cpus_entries[0 .. (nr_cpu_ids - 1)][in_irq() (2)]
callchain_cpus_entries[0 .. (nr_cpu_ids - 1)][in_softirq() (1)]
callchain_cpus_entries[0 .. (nr_cpu_ids - 1)][(0)]
And all of these have this type:
struct perf_callchain_entry {
__u64 nr;
__u64 ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
};
So, it will return a struct perf_callchain_entry pointer to an 8-byte sized
chunk, with just perf_callchain_entry->nr, and will not account for
perf_callchain_entry->ip[], since its declared size in the struct is zero.
But perf_callchain_entry->ip is not a pointer... Got it ;-\
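Spelling out the bug (sizes assumed from the definitions above): with the
flexible array, sizeof(struct perf_callchain_entry) == 8, so:

	/* broken: array indexing steps sizeof(*entry) == 8 bytes per rctx,
	 * landing inside the previous context's ip[] */
	return &entries->cpu_entries[cpu][*rctx];

	/* intended: step by the real per-entry size, i.e.
	 * 8 + 8 * sysctl_perf_event_max_stack bytes */
	return (void *)entries->cpu_entries[cpu] +
	       *rctx * (sizeof(struct perf_callchain_entry) +
			sizeof(__u64) * sysctl_perf_event_max_stack);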
Touché, respinning...
Brendan, this probably affected your results...
- Arnaldo
On Mon, Apr 25, 2016 at 01:27:06PM -0300, Arnaldo Carvalho de Melo wrote:
> On Mon, Apr 25, 2016 at 01:14:25PM -0300, Arnaldo Carvalho de Melo wrote:
> > On Fri, Apr 22, 2016 at 03:18:08PM -0700, Alexei Starovoitov wrote:
> > > right... and looking into it further, realized that the patch is broken,
> > > since get_callchain_entry() is doing:
> > > return &entries->cpu_entries[cpu][*rctx];
> > > whereas it should be dynamic offset based on sysctl_perf_event_max_stack*8
> > > So definitely needs another respin.
> struct perf_callchain_entry {
> __u64 nr;
> __u64 ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
> };
> But perf_callchain_entry->ip is not a pointer... Got it ;-\
This is what I am building now, a patch on top of the previous, that
will be combined and sent as v3, if I don't find any more funnies:
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 6fe77349fa9d..40657892a7e0 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -20,11 +20,10 @@ struct callchain_cpus_entries {
int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
-static size_t perf_callchain_entry__sizeof(void)
-{
- return sizeof(struct perf_callchain_entry) +
- sizeof(__u64) * sysctl_perf_event_max_stack;
-}
+#define __perf_callchain_entry_size(entries) \
+ (sizeof(struct perf_callchain_entry) + sizeof(__u64) * entries)
+
+static size_t perf_callchain_entry_size __read_mostly = __perf_callchain_entry_size(PERF_MAX_STACK_DEPTH);
static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
static atomic_t nr_callchain_events;
@@ -81,7 +80,7 @@ static int alloc_callchain_buffers(void)
if (!entries)
return -ENOMEM;
- size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
+ size = perf_callchain_entry_size * PERF_NR_CONTEXTS;
for_each_possible_cpu(cpu) {
entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
@@ -155,7 +154,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
cpu = smp_processor_id();
- return &entries->cpu_entries[cpu][*rctx];
+ return (((void *)&entries->cpu_entries[cpu][0]) +
+ (*rctx * perf_callchain_entry_size));
}
static void
@@ -236,10 +236,12 @@ int perf_event_max_stack_handler(struct ctl_table *table, int write,
return ret;
mutex_lock(&callchain_mutex);
- if (atomic_read(&nr_callchain_events))
+ if (atomic_read(&nr_callchain_events)) {
ret = -EBUSY;
- else
+ } else {
sysctl_perf_event_max_stack = new_value;
+ perf_callchain_entry_size = __perf_callchain_entry_size(new_value);
+ }
mutex_unlock(&callchain_mutex);
On Mon, Apr 25, 2016 at 04:22:29PM -0300, Arnaldo Carvalho de Melo wrote:
> On Mon, Apr 25, 2016 at 01:27:06PM -0300, Arnaldo Carvalho de Melo wrote:
> > On Mon, Apr 25, 2016 at 01:14:25PM -0300, Arnaldo Carvalho de Melo wrote:
> > > On Fri, Apr 22, 2016 at 03:18:08PM -0700, Alexei Starovoitov wrote:
> > > > right... and looking into it further, realized that the patch is broken,
> > > > since get_callchain_entry() is doing:
> > > > return &entries->cpu_entries[cpu][*rctx];
> > > > whereas it should be dynamic offset based on sysctl_perf_event_max_stack*8
> > > > So definitely needs another respin.
>
> > struct perf_callchain_entry {
> > __u64 nr;
> > __u64 ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
> > };
>
> > But perf_callchain_entry->ip is not a pointer... Got it ;-\
>
> This is what I am building now, a patch on top of the previous, that
> will be combined and sent as v3, if I don't find any more funnies:
>
> diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
> index 6fe77349fa9d..40657892a7e0 100644
> --- a/kernel/events/callchain.c
> +++ b/kernel/events/callchain.c
> @@ -20,11 +20,10 @@ struct callchain_cpus_entries {
>
> int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
>
> -static size_t perf_callchain_entry__sizeof(void)
> -{
> - return sizeof(struct perf_callchain_entry) +
> - sizeof(__u64) * sysctl_perf_event_max_stack;
> -}
> +#define __perf_callchain_entry_size(entries) \
> + (sizeof(struct perf_callchain_entry) + sizeof(__u64) * entries)
> +
> +static size_t perf_callchain_entry_size __read_mostly = __perf_callchain_entry_size(PERF_MAX_STACK_DEPTH);
>
> static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
> static atomic_t nr_callchain_events;
> @@ -81,7 +80,7 @@ static int alloc_callchain_buffers(void)
> if (!entries)
> return -ENOMEM;
>
> - size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
> + size = perf_callchain_entry_size * PERF_NR_CONTEXTS;
>
> for_each_possible_cpu(cpu) {
> entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
> @@ -155,7 +154,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
>
> cpu = smp_processor_id();
>
> - return &entries->cpu_entries[cpu][*rctx];
> + return (((void *)&entries->cpu_entries[cpu][0]) +
> + (*rctx * perf_callchain_entry_size));
I think the following would be easier to read:
+ return (void *)entries->cpu_entries[cpu] +
+ *rctx * perf_callchain_entry_size;
?
if I didn't mix up the ordering...
and probably we could do the math on the spot instead of introducing
a perf_callchain_entry_size global variable?
On Mon, Apr 25, 2016 at 01:06:48PM -0700, Alexei Starovoitov wrote:
> On Mon, Apr 25, 2016 at 04:22:29PM -0300, Arnaldo Carvalho de Melo wrote:
> > On Mon, Apr 25, 2016 at 01:27:06PM -0300, Arnaldo Carvalho de Melo wrote:
> > > On Mon, Apr 25, 2016 at 01:14:25PM -0300, Arnaldo Carvalho de Melo wrote:
> > > > On Fri, Apr 22, 2016 at 03:18:08PM -0700, Alexei Starovoitov wrote:
> > > > > right... and looking into it further, realized that the patch is broken,
> > > > > since get_callchain_entry() is doing:
> > > > > return &entries->cpu_entries[cpu][*rctx];
> > > > > whereas it should be dynamic offset based on sysctl_perf_event_max_stack*8
> > > > > So definitely needs another respin.
> >
> > > struct perf_callchain_entry {
> > > __u64 nr;
> > > __u64 ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
> > > };
> >
> > > But perf_callchain_entry->ip is not a pointer... Got it ;-\
> >
> > This is what I am building now, a patch on top of the previous, that
> > will be combined and sent as v3, if I don't find any more funnies:
> >
> > diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
> > index 6fe77349fa9d..40657892a7e0 100644
> > --- a/kernel/events/callchain.c
> > +++ b/kernel/events/callchain.c
> > @@ -20,11 +20,10 @@ struct callchain_cpus_entries {
> >
> > int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
> >
> > -static size_t perf_callchain_entry__sizeof(void)
> > -{
> > - return sizeof(struct perf_callchain_entry) +
> > - sizeof(__u64) * sysctl_perf_event_max_stack;
> > -}
> > +#define __perf_callchain_entry_size(entries) \
> > + (sizeof(struct perf_callchain_entry) + sizeof(__u64) * entries)
> > +
> > +static size_t perf_callchain_entry_size __read_mostly = __perf_callchain_entry_size(PERF_MAX_STACK_DEPTH);
> >
> > static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
> > static atomic_t nr_callchain_events;
> > @@ -81,7 +80,7 @@ static int alloc_callchain_buffers(void)
> > if (!entries)
> > return -ENOMEM;
> >
> > - size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
> > + size = perf_callchain_entry_size * PERF_NR_CONTEXTS;
> >
> > for_each_possible_cpu(cpu) {
> > entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
> > @@ -155,7 +154,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
> >
> > cpu = smp_processor_id();
> >
> > - return &entries->cpu_entries[cpu][*rctx];
> > + return (((void *)&entries->cpu_entries[cpu][0]) +
> > + (*rctx * perf_callchain_entry_size));
>
> I think the following would be easier to read:
>
> + return (void *)entries->cpu_entries[cpu] +
> + *rctx * perf_callchain_entry_size;
Well, I thought that multiline expressions required parentheses, to make
them easier to read for someone, maybe Ingo? ;-)
Whatever, both generate the same result; do you really want me to change this?
> ?
> if I didn't mix up the ordering...
If you are not sure, then it's not clearer, huh? ;-P
> and probably we could do the math on the spot instead of introducing
> perf_callchain_entry_size global variable?
I was trying to avoid the calc for each alloc, just doing it when we
change that number via the sysctl; probably not that big a deal. Do you
really think we should do the math per-alloc instead of
per-sysctl-change?
- Arnaldo
On Mon, Apr 25, 2016 at 05:17:50PM -0300, Arnaldo Carvalho de Melo wrote:
> On Mon, Apr 25, 2016 at 01:06:48PM -0700, Alexei Starovoitov wrote:
> > On Mon, Apr 25, 2016 at 04:22:29PM -0300, Arnaldo Carvalho de Melo wrote:
> > > On Mon, Apr 25, 2016 at 01:27:06PM -0300, Arnaldo Carvalho de Melo wrote:
> > > > On Mon, Apr 25, 2016 at 01:14:25PM -0300, Arnaldo Carvalho de Melo wrote:
> > > > > On Fri, Apr 22, 2016 at 03:18:08PM -0700, Alexei Starovoitov wrote:
> > > > > > right... and looking into it further, realized that the patch is broken,
> > > > > > since get_callchain_entry() is doing:
> > > > > > return &entries->cpu_entries[cpu][*rctx];
> > > > > > whereas it should be dynamic offset based on sysctl_perf_event_max_stack*8
> > > > > > So definitely needs another respin.
> > >
> > > > struct perf_callchain_entry {
> > > > __u64 nr;
> > > > __u64 ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
> > > > };
> > >
> > > > But perf_callchain_entry->ip is not a pointer... Got it ;-\
> > >
> > > This is what I am building now, a patch on top of the previous, that
> > > will be combined and sent as v3, if I don't find any more funnies:
> > >
> > > diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
> > > index 6fe77349fa9d..40657892a7e0 100644
> > > --- a/kernel/events/callchain.c
> > > +++ b/kernel/events/callchain.c
> > > @@ -20,11 +20,10 @@ struct callchain_cpus_entries {
> > >
> > > int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
> > >
> > > -static size_t perf_callchain_entry__sizeof(void)
> > > -{
> > > - return sizeof(struct perf_callchain_entry) +
> > > - sizeof(__u64) * sysctl_perf_event_max_stack;
> > > -}
> > > +#define __perf_callchain_entry_size(entries) \
> > > + (sizeof(struct perf_callchain_entry) + sizeof(__u64) * entries)
> > > +
> > > +static size_t perf_callchain_entry_size __read_mostly = __perf_callchain_entry_size(PERF_MAX_STACK_DEPTH);
> > >
> > > static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
> > > static atomic_t nr_callchain_events;
> > > @@ -81,7 +80,7 @@ static int alloc_callchain_buffers(void)
> > > if (!entries)
> > > return -ENOMEM;
> > >
> > > - size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
> > > + size = perf_callchain_entry_size * PERF_NR_CONTEXTS;
> > >
> > > for_each_possible_cpu(cpu) {
> > > entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
> > > @@ -155,7 +154,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
> > >
> > > cpu = smp_processor_id();
> > >
> > > - return &entries->cpu_entries[cpu][*rctx];
> > > + return (((void *)&entries->cpu_entries[cpu][0]) +
> > > + (*rctx * perf_callchain_entry_size));
> >
> > I think the following would be easier to read:
> >
> > + return (void *)entries->cpu_entries[cpu] +
> > + *rctx * perf_callchain_entry_size;
>
> Well, I thought that multiline expressions required parentheses, to make
> them easier to read for someone, maybe Ingo? ;-)
>
> Whatever, both generate the same result; do you really want me to change this?
I was mainly complaining about unnecessary &..[0]
> > ?
> > if I didn't mix up the ordering...
>
> If you are not sure, then it's not clearer, huh? ;-P
matter of taste. No strong opinion
> > and probably we could do the math on the spot instead of introducing
> > perf_callchain_entry_size global variable?
>
> I was trying to avoid the calc for each alloc, just doing it when we
> change that number via the sysctl; probably not that big a deal, but do you
> really think we should do the math per-alloc instead of
> per-sysctl-changing?
The math is trivial:
#define __perf_callchain_entry_size(entries) \
(sizeof(struct perf_callchain_entry) + sizeof(__u64) * entries)
A shift and add after the load is almost the same speed as the load
alone, since the integer multiply right after dominates the cost,
whereas updating two global variables can potentially cause
race conditions that need to be analyzed.
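To make that window concrete, here is a sketch of the torn update that
two separately-written globals would allow; the interleaving and the
comments are illustrative, not committed code:

  /* writer, sysctl handler updating two globals non-atomically: */
  sysctl_perf_event_max_stack = new_max;
  /* <-- a sampling NMI on another CPU can land right here */
  perf_callchain_entry_size = __perf_callchain_entry_size(new_max);

  /* reader in that window: */
  if (entry->nr < sysctl_perf_event_max_stack)  /* already sees new_max */
          entry->ip[entry->nr++] = ip;          /* buffer still laid out
                                                   with the old size */

The v3 below sidesteps this by keeping a single global and recomputing
the size in perf_callchain_entry__sizeof(), with writes to the global
done under callchain_mutex only while there are no callchain users.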
Em Mon, Apr 25, 2016 at 02:59:49PM -0700, Alexei Starovoitov escreveu:
> On Mon, Apr 25, 2016 at 05:17:50PM -0300, Arnaldo Carvalho de Melo wrote:
> > Em Mon, Apr 25, 2016 at 01:06:48PM -0700, Alexei Starovoitov escreveu:
> > > On Mon, Apr 25, 2016 at 04:22:29PM -0300, Arnaldo Carvalho de Melo wrote:
> > Well, I thought that multiline expressions required parentheses, to make
> > them easier to read for someone, maybe Ingo? ;-)
> >
> > Whatever, both generate the same result; do you really want me to change this?
>
> I was mainly complaining about unnecessary &..[0]
>
> > > ?
> > > if I didn't mix up the ordering...
> >
> > If you are not sure, then it's not clearer, huh? ;-P
>
> matter of taste. No strong opinion
>
> > > and probably we could do the math on the spot instead of introducing
> > > perf_callchain_entry_size global variable?
> >
> > I was trying to avoid the calc for each alloc, just doing it when we
> > change that number via the sysctl; probably not that big a deal, but do you
> > really think we should do the math per-alloc instead of
> > per-sysctl-changing?
>
> The math is trivial:
> #define __perf_callchain_entry_size(entries) \
> (sizeof(struct perf_callchain_entry) + sizeof(__u64) * entries)
> A shift and add after the load is almost the same speed as the load
> alone, since the integer multiply right after dominates the cost,
> whereas updating two global variables can potentially cause
> race conditions that need to be analyzed.
Ok, I thought the places where we changed those variables were always
under that callchain_mutex, but nah, too much trouble checking. Find v3
below and see if I can have your Acked-by.
Thanks,
- Arnaldo
commit 565e8b138157fbe1c4495f083b88db681f52c6b3
Author: Arnaldo Carvalho de Melo <[email protected]>
Date: Thu Apr 21 12:28:50 2016 -0300
perf core: Allow setting up max frame stack depth via sysctl
The default remains 127, which is good for most cases, and not even hit
most of the time, but then for some cases, as reported by Brendan, 1024+
deep frames are appearing on the radar for things like groovy, ruby.
And in some workloads putting a _lower_ cap on this may make sense. A
per-event cap still needs to be put in place tho.
The new file is:
# cat /proc/sys/kernel/perf_event_max_stack
127
Changing it:
# echo 256 > /proc/sys/kernel/perf_event_max_stack
# cat /proc/sys/kernel/perf_event_max_stack
256
But as soon as there is some event using callchains we get:
# echo 512 > /proc/sys/kernel/perf_event_max_stack
-bash: echo: write error: Device or resource busy
#
Because we only allocate the callchain percpu data structures when there
is a user, which allows for changing the max easily, it's just a matter
of having no callchain users at that point.
Reported-and-Tested-by: Brendan Gregg <[email protected]>
Acked-by: Alexei Starovoitov <[email protected]>
Acked-by: David Ahern <[email protected]>
Cc: Adrian Hunter <[email protected]>
Cc: Alexander Shishkin <[email protected]>
Cc: He Kuang <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Masami Hiramatsu <[email protected]>
Cc: Milian Wolff <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Stephane Eranian <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Vince Weaver <[email protected]>
Cc: Wang Nan <[email protected]>
Cc: Zefan Li <[email protected]>
Link: http://lkml.kernel.org/n/[email protected]
Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 57653a44b128..260cde08e92e 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -60,6 +60,7 @@ show up in /proc/sys/kernel:
- panic_on_warn
- perf_cpu_time_max_percent
- perf_event_paranoid
+- perf_event_max_stack
- pid_max
- powersave-nap [ PPC only ]
- printk
@@ -654,6 +655,19 @@ users (without CAP_SYS_ADMIN). The default value is 1.
==============================================================
+perf_event_max_stack:
+
+Controls maximum number of stack frames to copy for (attr.sample_type &
+PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using
+'perf record -g' or 'perf trace --call-graph fp'.
+
+This can only be done when no events are in use that have callchains
+enabled, otherwise writing to this file will return -EBUSY.
+
+The default value is 127.
+
+==============================================================
+
pid_max:
PID allocation wrap value. When the kernel's next PID value
diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c
index 4e02ae5950ff..27563befa8a2 100644
--- a/arch/arm/kernel/perf_callchain.c
+++ b/arch/arm/kernel/perf_callchain.c
@@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
tail = (struct frame_tail __user *)regs->ARM_fp - 1;
- while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+ while ((entry->nr < sysctl_perf_event_max_stack) &&
tail && !((unsigned long)tail & 0x3))
tail = user_backtrace(tail, entry);
}
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index ff4665462a02..32c3c6e70119 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
tail = (struct frame_tail __user *)regs->regs[29];
- while (entry->nr < PERF_MAX_STACK_DEPTH &&
+ while (entry->nr < sysctl_perf_event_max_stack &&
tail && !((unsigned long)tail & 0xf))
tail = user_backtrace(tail, entry);
} else {
@@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
tail = (struct compat_frame_tail __user *)regs->compat_fp - 1;
- while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+ while ((entry->nr < sysctl_perf_event_max_stack) &&
tail && !((unsigned long)tail & 0x3))
tail = compat_user_backtrace(tail, entry);
#endif
diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c
index 315633461a94..252abc12a5a3 100644
--- a/arch/metag/kernel/perf_callchain.c
+++ b/arch/metag/kernel/perf_callchain.c
@@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
--frame;
- while ((entry->nr < PERF_MAX_STACK_DEPTH) && frame)
+ while ((entry->nr < sysctl_perf_event_max_stack) && frame)
frame = user_backtrace(frame, entry);
}
diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c
index c1cf9c6c3f77..5021c546ad07 100644
--- a/arch/mips/kernel/perf_event.c
+++ b/arch/mips/kernel/perf_event.c
@@ -35,7 +35,7 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry,
addr = *sp++;
if (__kernel_text_address(addr)) {
perf_callchain_store(entry, addr);
- if (entry->nr >= PERF_MAX_STACK_DEPTH)
+ if (entry->nr >= sysctl_perf_event_max_stack)
break;
}
}
@@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
}
do {
perf_callchain_store(entry, pc);
- if (entry->nr >= PERF_MAX_STACK_DEPTH)
+ if (entry->nr >= sysctl_perf_event_max_stack)
break;
pc = unwind_stack(current, &sp, pc, &ra);
} while (pc);
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index e04a6752b399..22d9015c1acc 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
sp = regs->gpr[1];
perf_callchain_store(entry, next_ip);
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
fp = (unsigned long __user *) sp;
if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
return;
@@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
sp = regs->gpr[1];
perf_callchain_store(entry, next_ip);
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
fp = (unsigned int __user *) (unsigned long) sp;
if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
return;
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 6596f66ce112..a4b8b5aed21c 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
}
}
#endif
- } while (entry->nr < PERF_MAX_STACK_DEPTH);
+ } while (entry->nr < sysctl_perf_event_max_stack);
}
static inline int
@@ -1790,7 +1790,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
pc = sf.callers_pc;
ufp = (unsigned long)sf.fp + STACK_BIAS;
perf_callchain_store(entry, pc);
- } while (entry->nr < PERF_MAX_STACK_DEPTH);
+ } while (entry->nr < sysctl_perf_event_max_stack);
}
static void perf_callchain_user_32(struct perf_callchain_entry *entry,
@@ -1822,7 +1822,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
ufp = (unsigned long)sf.fp;
}
perf_callchain_store(entry, pc);
- } while (entry->nr < PERF_MAX_STACK_DEPTH);
+ } while (entry->nr < sysctl_perf_event_max_stack);
}
void
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 041e442a3e28..41d93d0e972b 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2277,7 +2277,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
fp = compat_ptr(ss_base + regs->bp);
pagefault_disable();
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
unsigned long bytes;
frame.next_frame = 0;
frame.return_address = 0;
@@ -2337,7 +2337,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
return;
pagefault_disable();
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
unsigned long bytes;
frame.next_frame = NULL;
frame.return_address = 0;
diff --git a/arch/xtensa/kernel/perf_event.c b/arch/xtensa/kernel/perf_event.c
index 54f01188c29c..a6b00b3af429 100644
--- a/arch/xtensa/kernel/perf_event.c
+++ b/arch/xtensa/kernel/perf_event.c
@@ -332,14 +332,14 @@ static int callchain_trace(struct stackframe *frame, void *data)
void perf_callchain_kernel(struct perf_callchain_entry *entry,
struct pt_regs *regs)
{
- xtensa_backtrace_kernel(regs, PERF_MAX_STACK_DEPTH,
+ xtensa_backtrace_kernel(regs, sysctl_perf_event_max_stack,
callchain_trace, NULL, entry);
}
void perf_callchain_user(struct perf_callchain_entry *entry,
struct pt_regs *regs)
{
- xtensa_backtrace_user(regs, PERF_MAX_STACK_DEPTH,
+ xtensa_backtrace_user(regs, sysctl_perf_event_max_stack,
callchain_trace, entry);
}
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 85749ae8cb5f..a090700cccca 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -58,7 +58,7 @@ struct perf_guest_info_callbacks {
struct perf_callchain_entry {
__u64 nr;
- __u64 ip[PERF_MAX_STACK_DEPTH];
+ __u64 ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
};
struct perf_raw_record {
@@ -993,9 +993,11 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
extern int get_callchain_buffers(void);
extern void put_callchain_buffers(void);
+extern int sysctl_perf_event_max_stack;
+
static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
{
- if (entry->nr < PERF_MAX_STACK_DEPTH) {
+ if (entry->nr < sysctl_perf_event_max_stack) {
entry->ip[entry->nr++] = ip;
return 0;
} else {
@@ -1017,6 +1019,8 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos);
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
static inline bool perf_paranoid_tracepoint_raw(void)
{
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 499d9e933f8e..f5a19548be12 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -66,7 +66,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
value_size < 8 || value_size % 8 ||
- value_size / 8 > PERF_MAX_STACK_DEPTH)
+ value_size / 8 > sysctl_perf_event_max_stack)
return ERR_PTR(-EINVAL);
/* hash table size must be power of 2 */
@@ -124,8 +124,8 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
struct perf_callchain_entry *trace;
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
u32 max_depth = map->value_size / 8;
- /* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */
- u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth;
+ /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
+ u32 init_nr = sysctl_perf_event_max_stack - max_depth;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
u32 hash, id, trace_nr, trace_len;
bool user = flags & BPF_F_USER_STACK;
@@ -143,7 +143,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
return -EFAULT;
/* get_perf_callchain() guarantees that trace->nr >= init_nr
- * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth
+ * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
*/
trace_nr = trace->nr - init_nr;
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 343c22f5e867..b9325e7dcba1 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -18,6 +18,14 @@ struct callchain_cpus_entries {
struct perf_callchain_entry *cpu_entries[0];
};
+int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
+
+static inline size_t perf_callchain_entry__sizeof(void)
+{
+ return (sizeof(struct perf_callchain_entry) +
+ sizeof(__u64) * sysctl_perf_event_max_stack);
+}
+
static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
static atomic_t nr_callchain_events;
static DEFINE_MUTEX(callchain_mutex);
@@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
if (!entries)
return -ENOMEM;
- size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+ size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
for_each_possible_cpu(cpu) {
entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
@@ -147,7 +155,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
cpu = smp_processor_id();
- return &entries->cpu_entries[cpu][*rctx];
+ return (((void *)entries->cpu_entries[cpu]) +
+ (*rctx * perf_callchain_entry__sizeof()));
}
static void
@@ -215,3 +224,25 @@ exit_put:
return entry;
}
+
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int new_value = sysctl_perf_event_max_stack, ret;
+ struct ctl_table new_table = *table;
+
+ new_table.data = &new_value;
+ ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos);
+ if (ret || !write)
+ return ret;
+
+ mutex_lock(&callchain_mutex);
+ if (atomic_read(&nr_callchain_events))
+ ret = -EBUSY;
+ else
+ sysctl_perf_event_max_stack = new_value;
+
+ mutex_unlock(&callchain_mutex);
+
+ return ret;
+}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 725587f10667..afbe9fb32ec5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1144,6 +1144,14 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one_hundred,
},
+ {
+ .procname = "perf_event_max_stack",
+ .data = NULL, /* filled in by handler */
+ .maxlen = sizeof(sysctl_perf_event_max_stack),
+ .mode = 0644,
+ .proc_handler = perf_event_max_stack_handler,
+ .extra1 = &zero,
+ },
#endif
#ifdef CONFIG_KMEMCHECK
{
On Mon, Apr 25, 2016 at 08:41:39PM -0300, Arnaldo Carvalho de Melo wrote:
>
> +int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
> +
> +static inline size_t perf_callchain_entry__sizeof(void)
> +{
> + return (sizeof(struct perf_callchain_entry) +
> + sizeof(__u64) * sysctl_perf_event_max_stack);
> +}
> @@ -1144,6 +1144,14 @@ static struct ctl_table kern_table[] = {
> .extra1 = &zero,
> .extra2 = &one_hundred,
> },
> + {
> + .procname = "perf_event_max_stack",
> + .data = NULL, /* filled in by handler */
> + .maxlen = sizeof(sysctl_perf_event_max_stack),
> + .mode = 0644,
> + .proc_handler = perf_event_max_stack_handler,
> + .extra1 = &zero,
> + },
You need to define a max value, otherwise perf_callchain_entry__sizeof
will overflow. Sure, it's a root-only facility, but still not nice.
1M? Anything above 1M stack frames would be insane anyway.
The rest looks good. Thanks!
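A sketch of the overflow being pointed out, assuming a 32-bit kernel
where size_t is 32 bits; the written value is illustrative:

  /* with only a lower bound, root could write e.g. 0x20000000: */
  size = sizeof(struct perf_callchain_entry)   /* 8 */
       + sizeof(__u64) * 0x20000000;           /* 8 * 2^29 == 2^32 */
  /* the multiply wraps to 0 in a 32-bit size_t, so size ends up as
   * just 8: the allocation succeeds but is tiny, and later
   * perf_callchain_store() calls write entry->ip[] far past it */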
Em Mon, Apr 25, 2016 at 05:07:26PM -0700, Alexei Starovoitov escreveu:
> > + {
> > + .procname = "perf_event_max_stack",
> > + .data = NULL, /* filled in by handler */
> > + .maxlen = sizeof(sysctl_perf_event_max_stack),
> > + .mode = 0644,
> > + .proc_handler = perf_event_max_stack_handler,
> > + .extra1 = &zero,
> > + },
> You need to define a max value, otherwise perf_callchain_entry__sizeof
> will overflow. Sure, it's a root-only facility, but still not nice.
> 1M? Anything above 1M stack frames would be insane anyway.
> The rest looks good. Thanks!
Something else? ;-)
:-)
- Arnaldo
commit cd544af4f7fede01cb512d52bb3efe62aa19271d
Author: Arnaldo Carvalho de Melo <[email protected]>
Date: Thu Apr 21 12:28:50 2016 -0300
perf core: Allow setting up max frame stack depth via sysctl
The default remains 127, which is good for most cases, and not even hit
most of the time, but then for some cases, as reported by Brendan, 1024+
deep frames are appearing on the radar for things like groovy, ruby.
And in some workloads putting a _lower_ cap on this may make sense. A
per-event cap still needs to be put in place tho.
The new file is:
# cat /proc/sys/kernel/perf_event_max_stack
127
Changing it:
# echo 256 > /proc/sys/kernel/perf_event_max_stack
# cat /proc/sys/kernel/perf_event_max_stack
256
But as soon as there is some event using callchains we get:
# echo 512 > /proc/sys/kernel/perf_event_max_stack
-bash: echo: write error: Device or resource busy
#
Because we only allocate the callchain percpu data structures when there
is a user, which allows for changing the max easily, it's just a matter
of having no callchain users at that point.
Reported-and-Tested-by: Brendan Gregg <[email protected]>
Acked-by: Alexei Starovoitov <[email protected]>
Acked-by: David Ahern <[email protected]>
Cc: Adrian Hunter <[email protected]>
Cc: Alexander Shishkin <[email protected]>
Cc: He Kuang <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Masami Hiramatsu <[email protected]>
Cc: Milian Wolff <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Stephane Eranian <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Vince Weaver <[email protected]>
Cc: Wang Nan <[email protected]>
Cc: Zefan Li <[email protected]>
Link: http://lkml.kernel.org/n/[email protected]
Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 57653a44b128..260cde08e92e 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -60,6 +60,7 @@ show up in /proc/sys/kernel:
- panic_on_warn
- perf_cpu_time_max_percent
- perf_event_paranoid
+- perf_event_max_stack
- pid_max
- powersave-nap [ PPC only ]
- printk
@@ -654,6 +655,19 @@ users (without CAP_SYS_ADMIN). The default value is 1.
==============================================================
+perf_event_max_stack:
+
+Controls maximum number of stack frames to copy for (attr.sample_type &
+PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using
+'perf record -g' or 'perf trace --call-graph fp'.
+
+This can only be done when no events are in use that have callchains
+enabled, otherwise writing to this file will return -EBUSY.
+
+The default value is 127.
+
+==============================================================
+
pid_max:
PID allocation wrap value. When the kernel's next PID value
diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c
index 4e02ae5950ff..27563befa8a2 100644
--- a/arch/arm/kernel/perf_callchain.c
+++ b/arch/arm/kernel/perf_callchain.c
@@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
tail = (struct frame_tail __user *)regs->ARM_fp - 1;
- while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+ while ((entry->nr < sysctl_perf_event_max_stack) &&
tail && !((unsigned long)tail & 0x3))
tail = user_backtrace(tail, entry);
}
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index ff4665462a02..32c3c6e70119 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
tail = (struct frame_tail __user *)regs->regs[29];
- while (entry->nr < PERF_MAX_STACK_DEPTH &&
+ while (entry->nr < sysctl_perf_event_max_stack &&
tail && !((unsigned long)tail & 0xf))
tail = user_backtrace(tail, entry);
} else {
@@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
tail = (struct compat_frame_tail __user *)regs->compat_fp - 1;
- while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+ while ((entry->nr < sysctl_perf_event_max_stack) &&
tail && !((unsigned long)tail & 0x3))
tail = compat_user_backtrace(tail, entry);
#endif
diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c
index 315633461a94..252abc12a5a3 100644
--- a/arch/metag/kernel/perf_callchain.c
+++ b/arch/metag/kernel/perf_callchain.c
@@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
--frame;
- while ((entry->nr < PERF_MAX_STACK_DEPTH) && frame)
+ while ((entry->nr < sysctl_perf_event_max_stack) && frame)
frame = user_backtrace(frame, entry);
}
diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c
index c1cf9c6c3f77..5021c546ad07 100644
--- a/arch/mips/kernel/perf_event.c
+++ b/arch/mips/kernel/perf_event.c
@@ -35,7 +35,7 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry,
addr = *sp++;
if (__kernel_text_address(addr)) {
perf_callchain_store(entry, addr);
- if (entry->nr >= PERF_MAX_STACK_DEPTH)
+ if (entry->nr >= sysctl_perf_event_max_stack)
break;
}
}
@@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
}
do {
perf_callchain_store(entry, pc);
- if (entry->nr >= PERF_MAX_STACK_DEPTH)
+ if (entry->nr >= sysctl_perf_event_max_stack)
break;
pc = unwind_stack(current, &sp, pc, &ra);
} while (pc);
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index e04a6752b399..22d9015c1acc 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
sp = regs->gpr[1];
perf_callchain_store(entry, next_ip);
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
fp = (unsigned long __user *) sp;
if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
return;
@@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
sp = regs->gpr[1];
perf_callchain_store(entry, next_ip);
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
fp = (unsigned int __user *) (unsigned long) sp;
if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
return;
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 6596f66ce112..a4b8b5aed21c 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
}
}
#endif
- } while (entry->nr < PERF_MAX_STACK_DEPTH);
+ } while (entry->nr < sysctl_perf_event_max_stack);
}
static inline int
@@ -1790,7 +1790,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
pc = sf.callers_pc;
ufp = (unsigned long)sf.fp + STACK_BIAS;
perf_callchain_store(entry, pc);
- } while (entry->nr < PERF_MAX_STACK_DEPTH);
+ } while (entry->nr < sysctl_perf_event_max_stack);
}
static void perf_callchain_user_32(struct perf_callchain_entry *entry,
@@ -1822,7 +1822,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
ufp = (unsigned long)sf.fp;
}
perf_callchain_store(entry, pc);
- } while (entry->nr < PERF_MAX_STACK_DEPTH);
+ } while (entry->nr < sysctl_perf_event_max_stack);
}
void
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 041e442a3e28..41d93d0e972b 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2277,7 +2277,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
fp = compat_ptr(ss_base + regs->bp);
pagefault_disable();
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
unsigned long bytes;
frame.next_frame = 0;
frame.return_address = 0;
@@ -2337,7 +2337,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
return;
pagefault_disable();
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
unsigned long bytes;
frame.next_frame = NULL;
frame.return_address = 0;
diff --git a/arch/xtensa/kernel/perf_event.c b/arch/xtensa/kernel/perf_event.c
index 54f01188c29c..a6b00b3af429 100644
--- a/arch/xtensa/kernel/perf_event.c
+++ b/arch/xtensa/kernel/perf_event.c
@@ -332,14 +332,14 @@ static int callchain_trace(struct stackframe *frame, void *data)
void perf_callchain_kernel(struct perf_callchain_entry *entry,
struct pt_regs *regs)
{
- xtensa_backtrace_kernel(regs, PERF_MAX_STACK_DEPTH,
+ xtensa_backtrace_kernel(regs, sysctl_perf_event_max_stack,
callchain_trace, NULL, entry);
}
void perf_callchain_user(struct perf_callchain_entry *entry,
struct pt_regs *regs)
{
- xtensa_backtrace_user(regs, PERF_MAX_STACK_DEPTH,
+ xtensa_backtrace_user(regs, sysctl_perf_event_max_stack,
callchain_trace, entry);
}
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 85749ae8cb5f..a090700cccca 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -58,7 +58,7 @@ struct perf_guest_info_callbacks {
struct perf_callchain_entry {
__u64 nr;
- __u64 ip[PERF_MAX_STACK_DEPTH];
+ __u64 ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
};
struct perf_raw_record {
@@ -993,9 +993,11 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
extern int get_callchain_buffers(void);
extern void put_callchain_buffers(void);
+extern int sysctl_perf_event_max_stack;
+
static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
{
- if (entry->nr < PERF_MAX_STACK_DEPTH) {
+ if (entry->nr < sysctl_perf_event_max_stack) {
entry->ip[entry->nr++] = ip;
return 0;
} else {
@@ -1017,6 +1019,8 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos);
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
static inline bool perf_paranoid_tracepoint_raw(void)
{
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 499d9e933f8e..f5a19548be12 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -66,7 +66,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
value_size < 8 || value_size % 8 ||
- value_size / 8 > PERF_MAX_STACK_DEPTH)
+ value_size / 8 > sysctl_perf_event_max_stack)
return ERR_PTR(-EINVAL);
/* hash table size must be power of 2 */
@@ -124,8 +124,8 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
struct perf_callchain_entry *trace;
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
u32 max_depth = map->value_size / 8;
- /* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */
- u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth;
+ /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
+ u32 init_nr = sysctl_perf_event_max_stack - max_depth;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
u32 hash, id, trace_nr, trace_len;
bool user = flags & BPF_F_USER_STACK;
@@ -143,7 +143,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
return -EFAULT;
/* get_perf_callchain() guarantees that trace->nr >= init_nr
- * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth
+ * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
*/
trace_nr = trace->nr - init_nr;
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 343c22f5e867..b9325e7dcba1 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -18,6 +18,14 @@ struct callchain_cpus_entries {
struct perf_callchain_entry *cpu_entries[0];
};
+int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
+
+static inline size_t perf_callchain_entry__sizeof(void)
+{
+ return (sizeof(struct perf_callchain_entry) +
+ sizeof(__u64) * sysctl_perf_event_max_stack);
+}
+
static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
static atomic_t nr_callchain_events;
static DEFINE_MUTEX(callchain_mutex);
@@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
if (!entries)
return -ENOMEM;
- size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+ size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
for_each_possible_cpu(cpu) {
entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
@@ -147,7 +155,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
cpu = smp_processor_id();
- return &entries->cpu_entries[cpu][*rctx];
+ return (((void *)entries->cpu_entries[cpu]) +
+ (*rctx * perf_callchain_entry__sizeof()));
}
static void
@@ -215,3 +224,25 @@ exit_put:
return entry;
}
+
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int new_value = sysctl_perf_event_max_stack, ret;
+ struct ctl_table new_table = *table;
+
+ new_table.data = &new_value;
+ ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos);
+ if (ret || !write)
+ return ret;
+
+ mutex_lock(&callchain_mutex);
+ if (atomic_read(&nr_callchain_events))
+ ret = -EBUSY;
+ else
+ sysctl_perf_event_max_stack = new_value;
+
+ mutex_unlock(&callchain_mutex);
+
+ return ret;
+}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 725587f10667..c8b318663525 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -130,6 +130,9 @@ static int one_thousand = 1000;
#ifdef CONFIG_PRINTK
static int ten_thousand = 10000;
#endif
+#ifdef CONFIG_PERF_EVENTS
+static int six_hundred_forty_kb = 640 * 1024;
+#endif
/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
@@ -1144,6 +1147,15 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one_hundred,
},
+ {
+ .procname = "perf_event_max_stack",
+ .data = NULL, /* filled in by handler */
+ .maxlen = sizeof(sysctl_perf_event_max_stack),
+ .mode = 0644,
+ .proc_handler = perf_event_max_stack_handler,
+ .extra1 = &zero,
+ .extra2 = &six_hundred_forty_kb,
+ },
#endif
#ifdef CONFIG_KMEMCHECK
{
On Mon, Apr 25, 2016 at 09:29:28PM -0300, Arnaldo Carvalho de Melo wrote:
> Em Mon, Apr 25, 2016 at 05:07:26PM -0700, Alexei Starovoitov escreveu:
> > > + {
> > > + .procname = "perf_event_max_stack",
> > > + .data = NULL, /* filled in by handler */
> > > + .maxlen = sizeof(sysctl_perf_event_max_stack),
> > > + .mode = 0644,
> > > + .proc_handler = perf_event_max_stack_handler,
> > > + .extra1 = &zero,
> > > + },
>
> > You need to define a max value, otherwise perf_callchain_entry__sizeof
> > will overflow. Sure, it's a root-only facility, but still not nice.
> > 1M? Anything above 1M stack frames would be insane anyway.
> > The rest looks good. Thanks!
>
> Something else? ;-)
all looks good to me. Thanks a bunch!
>
> commit cd544af4f7fede01cb512d52bb3efe62aa19271d
> Author: Arnaldo Carvalho de Melo <[email protected]>
> Date: Thu Apr 21 12:28:50 2016 -0300
>
> perf core: Allow setting up max frame stack depth via sysctl
>
> The default remains 127, which is good for most cases, and not even hit
> most of the time, but then for some cases, as reported by Brendan, 1024+
> deep frames are appearing on the radar for things like groovy, ruby.
>
> And in some workloads putting a _lower_ cap on this may make sense. A
> per-event cap still needs to be put in place tho.
>
> The new file is:
>
> # cat /proc/sys/kernel/perf_event_max_stack
> 127
>
> Changing it:
>
> # echo 256 > /proc/sys/kernel/perf_event_max_stack
> # cat /proc/sys/kernel/perf_event_max_stack
> 256
>
> But as soon as there is some event using callchains we get:
>
> # echo 512 > /proc/sys/kernel/perf_event_max_stack
> -bash: echo: write error: Device or resource busy
> #
>
> Because we only allocate the callchain percpu data structures when there
> is a user, which allows for changing the max easily, it's just a matter
> of having no callchain users at that point.
>
> Reported-and-Tested-by: Brendan Gregg <[email protected]>
> Acked-by: Alexei Starovoitov <[email protected]>
yep :)
hopefully Brendan can give it another spin.
Em Mon, Apr 25, 2016 at 05:44:00PM -0700, Alexei Starovoitov escreveu:
> On Mon, Apr 25, 2016 at 09:29:28PM -0300, Arnaldo Carvalho de Melo wrote:
> > Em Mon, Apr 25, 2016 at 05:07:26PM -0700, Alexei Starovoitov escreveu:
> > > > + {
> > > > + .procname = "perf_event_max_stack",
> > > > + .data = NULL, /* filled in by handler */
> > > > + .maxlen = sizeof(sysctl_perf_event_max_stack),
> > > > + .mode = 0644,
> > > > + .proc_handler = perf_event_max_stack_handler,
> > > > + .extra1 = &zero,
> > > > + },
> >
> > > You need to define a max value, otherwise perf_callchain_entry__sizeof
> > > will overflow. Sure, it's a root-only facility, but still not nice.
> > > 1M? Anything above 1M stack frames would be insane anyway.
> > > The rest looks good. Thanks!
> >
> > Something else? ;-)
>
> all looks good to me. Thanks a bunch!
Thanks for checking!
> > Because we only allocate the callchain percpu data structures when there
> > is a user, which allows for changing the max easily, it's just a matter
> > of having no callchain users at that point.
> >
> > Reported-and-Tested-by: Brendan Gregg <[email protected]>
> > Acked-by: Alexei Starovoitov <[email protected]>
>
> yep :)
> hopefully Brendan can give it another spin.
Agreed, and I'm calling it a day anyway, Brendan, please consider
retesting, thanks,
- Arnaldo
On Mon, Apr 25, 2016 at 5:47 PM, Arnaldo Carvalho de Melo
<[email protected]> wrote:
> Em Mon, Apr 25, 2016 at 05:44:00PM -0700, Alexei Starovoitov escreveu:
>> On Mon, Apr 25, 2016 at 09:29:28PM -0300, Arnaldo Carvalho de Melo wrote:
>> > Em Mon, Apr 25, 2016 at 05:07:26PM -0700, Alexei Starovoitov escreveu:
>> > > > + {
>> > > > + .procname = "perf_event_max_stack",
>> > > > + .data = NULL, /* filled in by handler */
>> > > > + .maxlen = sizeof(sysctl_perf_event_max_stack),
>> > > > + .mode = 0644,
>> > > > + .proc_handler = perf_event_max_stack_handler,
>> > > > + .extra1 = &zero,
>> > > > + },
>> >
>> > > You need to define a max value, otherwise perf_callchain_entry__sizeof
>> > > will overflow. Sure, it's a root-only facility, but still not nice.
>> > > 1M? Anything above 1M stack frames would be insane anyway.
>> > > The rest looks good. Thanks!
>> >
>> > Something else? ;-)
>>
>> all looks good to me. Thanks a bunch!
>
> Thanks for checking!
>
>> > Because we only allocate the callchain percpu data structures when there
>> > is a user, which allows for changing the max easily, it's just a matter
>> > of having no callchain users at that point.
>> >
>> > Reported-and-Tested-by: Brendan Gregg <[email protected]>
>> > Acked-by: Alexei Starovoitov <[email protected]>
>>
>> yep :)
>> hopefully Brendan can give it another spin.
>
> Agreed, and I'm calling it a day anyway, Brendan, please consider
> retesting, thanks,
>
Will do, thanks!
Brendan
> - Arnaldo
Em Mon, Apr 25, 2016 at 05:49:38PM -0700, Brendan Gregg escreveu:
> On Mon, Apr 25, 2016 at 5:47 PM, Arnaldo Carvalho de Melo
> <[email protected]> wrote:
> > Em Mon, Apr 25, 2016 at 05:44:00PM -0700, Alexei Starovoitov escreveu:
> >> On Mon, Apr 25, 2016 at 09:29:28PM -0300, Arnaldo Carvalho de Melo wrote:
> >> > Because we only allocate the callchain percpu data structures when there
> >> > is a user, which allows for changing the max easily, it's just a matter
> >> > of having no callchain users at that point.
> >> >
> >> > Reported-and-Tested-by: Brendan Gregg <[email protected]>
> >> > Acked-by: Alexei Starovoitov <[email protected]>
> >>
> >> yep :)
> >> hopefully Brendan can give it another spin.
> >
> > Agreed, and I'm calling it a day anyway, Brendan, please consider
> > retesting, thanks,
> Will do, thanks!
> Brendan
So, for completeness, further testing it to see how far it goes on an 8GB
machine I got:
[root@emilia ~]# echo 131100 > /proc/sys/kernel/perf_event_max_stack
[root@emilia ~]# perf record -g ls
Error:
The sys_perf_event_open() syscall returned with 12 (Cannot allocate memory) for event (cycles).
/bin/dmesg may provide additional information.
No CONFIG_PERF_EVENTS=y kernel support configured?
[root@emilia ~]#
[root@emilia ~]# echo 131000 > /proc/sys/kernel/perf_event_max_stack
[root@emilia ~]# perf record -g usleep
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.013 MB perf.data (9 samples) ]
[root@emilia ~]# ls -la perf.data
-rw-------. 1 root root 15736 Apr 26 13:33 perf.data
[root@emilia ~]#
- Arnaldo
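Where that boundary sits is no accident -- a back-of-the-envelope
check, assuming PERF_NR_CONTEXTS == 4 and kmalloc's usual 4 MiB ceiling
(4 KiB pages, default MAX_ORDER):

  (8 + 8 * 131100) * 4 = 4195232 bytes  > 4194304 (4 MiB)  ->  -ENOMEM
  (8 + 8 * 131000) * 4 = 4192032 bytes  < 4194304 (4 MiB)  ->  just fits

so it is the per-cpu kmalloc_node() in alloc_callchain_buffers() that
draws the line here, well before the 640K-frame sysctl cap.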
On 4/26/16 10:33 AM, Arnaldo Carvalho de Melo wrote:
> So, for completeness, further testing it to see how far it goes on an 8GB
> machine I got:
>
> [root@emilia ~]# echo 131100 > /proc/sys/kernel/perf_event_max_stack
> [root@emilia ~]# perf record -g ls
> Error:
> The sys_perf_event_open() syscall returned with 12 (Cannot allocate memory) for event (cycles).
> /bin/dmesg may provide additional information.
> No CONFIG_PERF_EVENTS=y kernel support configured?
I thought we fixed up the last line to not display for errors like this.
How about adding an ENOMEM case to perf_evsel__open_strerror?
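A sketch of what such an ENOMEM case might look like; the surrounding
switch and the hint text are assumptions, not the actual follow-up
patch:

  case ENOMEM:
          if (evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
                  return scnprintf(msg, size,
                                   "Not enough memory to set up the event with a callchain.\n"
                                   "Hint: try lowering /proc/sys/kernel/perf_event_max_stack");
          break;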
On Mon, Apr 25, 2016 at 5:49 PM, Brendan Gregg
<[email protected]> wrote:
> On Mon, Apr 25, 2016 at 5:47 PM, Arnaldo Carvalho de Melo
> <[email protected]> wrote:
>> Em Mon, Apr 25, 2016 at 05:44:00PM -0700, Alexei Starovoitov escreveu:
>>> On Mon, Apr 25, 2016 at 09:29:28PM -0300, Arnaldo Carvalho de Melo wrote:
>>> > Em Mon, Apr 25, 2016 at 05:07:26PM -0700, Alexei Starovoitov escreveu:
>>> > > > + {
>>> > > > + .procname = "perf_event_max_stack",
>>> > > > + .data = NULL, /* filled in by handler */
>>> > > > + .maxlen = sizeof(sysctl_perf_event_max_stack),
>>> > > > + .mode = 0644,
>>> > > > + .proc_handler = perf_event_max_stack_handler,
>>> > > > + .extra1 = &zero,
>>> > > > + },
>>> >
>>> > > You need to define a max value, otherwise perf_callchain_entry__sizeof
>>> > > will overflow. Sure, it's a root-only facility, but still not nice.
>>> > > 1M? Anything above 1M stack frames would be insane anyway.
>>> > > The rest looks good. Thanks!
>>> >
>>> > Something else? ;-)
>>>
>>> all looks good to me. Thanks a bunch!
>>
>> Thanks for checking!
>>
>>> > Because we only allocate the callchain percpu data structures when there
>>> > is a user, which allows for changing the max easily, it's just a matter
>>> > of having no callchain users at that point.
>>> >
>>> > Reported-and-Tested-by: Brendan Gregg <[email protected]>
>>> > Acked-by: Alexei Starovoitov <[email protected]>
>>>
>>> yep :)
>>> hopefully Brendan can give it another spin.
>>
>> Agreed, and I'm calling it a day anyway, Brendan, please consider
>> retesting, thanks,
>>
>
> Will do, thanks!
>
Looks good.
I started with max depth = 512, and even that was still truncated, and
had to profile again at 1024 to capture the full stacks. Seems to
generally match the flame graph I generated with V1, which made me
want to check that I'm running the new patch, and am:
# grep six_hundred_forty_kb /proc/kallsyms
ffffffff81c431e0 d six_hundred_forty_kb
I was mucking around and was able to get "corrupted callchain.
skipping..." errors, but these look to be expected -- that was
profiling a binary (bash) that doesn't have frame pointers. Some perf
script -D output:
16 3204735442777 0x18f0d8 [0x2030]: PERF_RECORD_SAMPLE(IP, 0x1):
18134/18134: 0xffffffff8118b6a4 period: 1001001 addr: 0
... FP chain: nr:1023
..... 0: ffffffffffffff80
..... 1: ffffffff8118b6a4
..... 2: ffffffff8118bc47
..... 3: ffffffff811d8c85
..... 4: ffffffff811b18f8
..... 5: ffffffff811b2a55
..... 6: ffffffff811b5ea0
..... 7: ffffffff810663c0
..... 8: ffffffff810666e0
..... 9: ffffffff817b9d28
..... 10: fffffffffffffe00
..... 11: 00000000004b45e2
..... 12: 000000000000610f
..... 13: 0000000000006110
..... 14: 0000000000006111
..... 15: 0000000000006112
..... 16: 0000000000006113
..... 17: 0000000000006114
..... 18: 0000000000006115
..... 19: 0000000000006116
..... 20: 0000000000006117
[...]
..... 1021: 000000000000650b
..... 1022: 000000000000650c
... thread: bash:18134
...... dso: /lib/modules/4.6.0-rc5-virtual/build/vmlinux
bash 18134 [016] 3204.735442: 1001001 cpu-clock:
Brendan
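For reference, the FP unwinder can only walk user stacks that keep
frame pointers, so binaries like that need a rebuild to resolve; a
hedged example, with flags and target purely illustrative:

  $ CFLAGS='-g -fno-omit-frame-pointer' ./configure
  $ make
  # perf record -F 99 -g -p $(pgrep -n bash) -- sleep 10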
Em Tue, Apr 26, 2016 at 01:02:34PM -0700, Brendan Gregg escreveu:
> On Mon, Apr 25, 2016 at 5:49 PM, Brendan Gregg <[email protected]> wrote:
> > On Mon, Apr 25, 2016 at 5:47 PM, Arnaldo Carvalho de Melo <[email protected]> wrote:
> >> Em Mon, Apr 25, 2016 at 05:44:00PM -0700, Alexei Starovoitov escreveu:
> >>> yep :)
> >>> hopefully Brendan can give it another spin.
> >>
> >> Agreed, and I'm calling it a day anyway, Brendan, please consider
> >> retesting, thanks,
> >
> > Will do, thanks!
>
> Looks good.
>
> I started with max depth = 512, and even that was still truncated, and
> had to profile again at 1024 to capture the full stacks. Seems to
> generally match the flame graph I generated with V1, which made me
> want to check that I'm running the new patch, and am:
>
> # grep six_hundred_forty_kb /proc/kallsyms
> ffffffff81c431e0 d six_hundred_forty_kb
>
> I was mucking around and was able to get "corrupted callchain.
> skipping..." errors, but these look to be expected -- that was
Yeah, thanks for testing!
And since you talked about userspace without frame pointers, have you
played with '--call-graph lbr'?
- Arnaldo
> profiling a binary (bash) that doesn't have frame pointers. Some perf
> script -D output:
>
> 16 3204735442777 0x18f0d8 [0x2030]: PERF_RECORD_SAMPLE(IP, 0x1):
> 18134/18134: 0xffffffff8118b6a4 period: 1001001 addr: 0
> ... FP chain: nr:1023
> ..... 0: ffffffffffffff80
> ..... 1: ffffffff8118b6a4
> ..... 2: ffffffff8118bc47
> ..... 3: ffffffff811d8c85
> ..... 4: ffffffff811b18f8
> ..... 5: ffffffff811b2a55
> ..... 6: ffffffff811b5ea0
> ..... 7: ffffffff810663c0
> ..... 8: ffffffff810666e0
> ..... 9: ffffffff817b9d28
> ..... 10: fffffffffffffe00
> ..... 11: 00000000004b45e2
> ..... 12: 000000000000610f
> ..... 13: 0000000000006110
> ..... 14: 0000000000006111
> ..... 15: 0000000000006112
> ..... 16: 0000000000006113
> ..... 17: 0000000000006114
> ..... 18: 0000000000006115
> ..... 19: 0000000000006116
> ..... 20: 0000000000006117
> [...]
> ..... 1021: 000000000000650b
> ..... 1022: 000000000000650c
> ... thread: bash:18134
> ...... dso: /lib/modules/4.6.0-rc5-virtual/build/vmlinux
> bash 18134 [016] 3204.735442: 1001001 cpu-clock:
>
> Brendan
On Tue, Apr 26, 2016 at 06:05:00PM -0300, Arnaldo Carvalho de Melo wrote:
> > I started with max depth = 512, and even that was still truncated, and
> > had to profile again at 1024 to capture the full stacks. Seems to
^^^^^^
> > generally match the flame graph I generated with V1, which made me
> > want to check that I'm running the new patch, and am:
> >
> > # grep six_hundred_forty_kb /proc/kallsyms
> > ffffffff81c431e0 d six_hundred_forty_kb
> >
> > I was mucking around and was able to get "corrupted callchain.
> > skipping..." errors, but these look to be expected -- that was
>
> Yeah, thanks for testing!
>
> And since you talked about userspace without frame pointers, have you
> played with '--call-graph lbr'?
That seems to be at odds with his requirements; he needs 1024 entries to
capture full stacks, while LBR is limited to 16/32 or so entries. That's 2
orders of magnitude difference.
On Mon, Apr 25, 2016 at 09:29:28PM -0300, Arnaldo Carvalho de Melo wrote:
> commit cd544af4f7fede01cb512d52bb3efe62aa19271d
> Author: Arnaldo Carvalho de Melo <[email protected]>
> Date: Thu Apr 21 12:28:50 2016 -0300
>
> perf core: Allow setting up max frame stack depth via sysctl
>
> The default remains 127, which is good for most cases, and not even hit
> most of the time, but then for some cases, as reported by Brendan, 1024+
> deep frames are appearing on the radar for things like groovy, ruby.
>
> And in some workloads putting a _lower_ cap on this may make sense. A
> per-event cap still needs to be put in place tho.
>
> The new file is:
>
> # cat /proc/sys/kernel/perf_event_max_stack
> 127
>
> Changing it:
>
> # echo 256 > /proc/sys/kernel/perf_event_max_stack
> # cat /proc/sys/kernel/perf_event_max_stack
> 256
>
> But as soon as there is some event using callchains we get:
>
> # echo 512 > /proc/sys/kernel/perf_event_max_stack
> -bash: echo: write error: Device or resource busy
> #
>
> Because we only allocate the callchain percpu data structures when there
> is a user, which allows for changing the max easily, it's just a matter
> of having no callchain users at that point.
>
> Reported-and-Tested-by: Brendan Gregg <[email protected]>
> Acked-by: Alexei Starovoitov <[email protected]>
> Acked-by: David Ahern <[email protected]>
Reviewed-by: Frederic Weisbecker <[email protected]>
I first thought that this should be a per-event tunable instead of a global sysctl,
but then I realized that we still need a root-only-tunable maximum limit value to check
any future per-event limit against, and this sysctl value does the job.
Nice patch!
Thanks.
On Tue, Apr 26, 2016 at 2:05 PM, Arnaldo Carvalho de Melo
<[email protected]> wrote:
> Em Tue, Apr 26, 2016 at 01:02:34PM -0700, Brendan Gregg escreveu:
>> On Mon, Apr 25, 2016 at 5:49 PM, Brendan Gregg <[email protected]> wrote:
>> > On Mon, Apr 25, 2016 at 5:47 PM, Arnaldo Carvalho de Melo <[email protected]> wrote:
>> >> Em Mon, Apr 25, 2016 at 05:44:00PM -0700, Alexei Starovoitov escreveu:
>> >>> yep :)
>> >>> hopefully Brendan can give it another spin.
>> >>
>> >> Agreed, and I'm calling it a day anyway, Brendan, please consider
>> >> retesting, thanks,
>> >
>> > Will do, thanks!
>>
>> Looks good.
>>
>> I started with max depth = 512, and even that was still truncated, and
>> had to profile again at 1024 to capture the full stacks. Seems to
>> generally match the flame graph I generated with V1, which made me
>> want to check that I'm running the new patch, and am:
>>
>> # grep six_hundred_forty_kb /proc/kallsyms
>> ffffffff81c431e0 d six_hundred_forty_kb
>>
>> I was mucking around and was able to get "corrupted callchain.
>> skipping..." errors, but these look to be expected -- that was
>
> Yeah, thanks for testing!
>
> And since you talked about userspace without frame pointers, have you
> played with '--call-graph lbr'?
Not really. Isn't it only 16 levels deep max?
Most of our Linux instances are Xen guests (EC2), and I'd have to check if the
MSRs are available for LBR (perf record --call-graph lbr ... returns
"The sys_perf_event_open() syscall returned with 95 (Operation not
supported) for event (cpu-clock).", so I'd guess not, although many
other MSRs are exposed).
BTS seemed more promising (deeper stacks), and there's already Xen
support for it (need to boot the Xen host with vpmu=bts, preferably
vpmu=bts,arch for some PMCs as well :).
Brendan
On Tue, Apr 26, 2016 at 02:58:41PM -0700, Brendan Gregg wrote:
> BTS seemed more promising (deeper stacks), and there's already Xen
> support for it (need to boot the Xen host with vpmu=bts, preferably
> vpmu=bts,arch for some PMCs as well :).
BTS is a branch tracer; it simply traces _all_ branches, including
calls. It has fairly significant overhead.
But yes, by tracing all branches, you should get really shiny call
traces :-)
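On hardware that exposes it, BTS shows up as its own PMU; a hedged
example of driving it directly (event name and path as found on typical
Intel boxes, output elided):

  # ls /sys/bus/event_source/devices/ | grep bts
  intel_bts
  # perf record -e intel_bts// -- ./workload
  # perf script    # decoded branch records, calls included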
Em Tue, Apr 26, 2016 at 11:55:36PM +0200, Peter Zijlstra escreveu:
> On Tue, Apr 26, 2016 at 06:05:00PM -0300, Arnaldo Carvalho de Melo wrote:
> > > I started with max depth = 512, and even that was still truncated, and
> > > had to profile again at 1024 to capture the full stacks. Seems to
> ^^^^^^
> > And since you talked about userspace without frame pointers, have you
> > played with '--call-graph lbr'?
> That seems to be at odds with his requirements; he needs 1024 entries to
> capture full stacks, while LBR is limited to 16/32 or so entries. That's 2
> orders of magnitude difference.
Duh, you're right, I must've confused it with that BTS thing, but as you
said in another message, that generates way too much info; have to read
more about it and PT as well :-\
Anyway, for syscall backtraces, using 16 or 32 as the default in 'perf
trace' seems interesting, automatically switching to DWARF (or FP, if
configured by the user on a system built with -fno-omit-frame-pointer)
when, say, --max-stack 38 is specified.
Have to take a look at how to do that for raw_syscalls:sys_enter.
- Arnaldo
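A hypothetical invocation of that idea -- 'perf trace' had no
--max-stack option at this point, so the flags below are illustrative,
not documented behavior:

  # echo 1024 > /proc/sys/kernel/perf_event_max_stack
  # perf trace --call-graph dwarf --max-stack 38 -e nanosleep usleep 1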
Em Tue, Apr 26, 2016 at 11:58:10PM +0200, Frederic Weisbecker escreveu:
> On Mon, Apr 25, 2016 at 09:29:28PM -0300, Arnaldo Carvalho de Melo wrote:
> > commit cd544af4f7fede01cb512d52bb3efe62aa19271d
> > Author: Arnaldo Carvalho de Melo <[email protected]>
> > Date: Thu Apr 21 12:28:50 2016 -0300
> >
> > perf core: Allow setting up max frame stack depth via sysctl
> >
> > The default remains 127, which is good for most cases, and not even hit
> > most of the time, but then for some cases, as reported by Brendan, 1024+
> > deep frames are appearing on the radar for things like groovy, ruby.
> >
> > And in some workloads putting a _lower_ cap on this may make sense. A
> > per-event cap still needs to be put in place tho.
> >
> > The new file is:
> >
> > # cat /proc/sys/kernel/perf_event_max_stack
> > 127
> >
> > Changing it:
> >
> > # echo 256 > /proc/sys/kernel/perf_event_max_stack
> > # cat /proc/sys/kernel/perf_event_max_stack
> > 256
> >
> > But as soon as there is some event using callchains we get:
> >
> > # echo 512 > /proc/sys/kernel/perf_event_max_stack
> > -bash: echo: write error: Device or resource busy
> > #
> >
> > Because we only allocate the callchain percpu data structures when there
> > is a user, which allows for changing the max easily, it's just a matter
> > of having no callchain users at that point.
> >
> > Reported-and-Tested-by: Brendan Gregg <[email protected]>
> > Acked-by: Alexei Starovoitov <[email protected]>
> > Acked-by: David Ahern <[email protected]>
>
> Reviewed-by: Frederic Weisbecker <[email protected]>
>
> I first thought that this should be a per-event tunable instead of a global sysctl,
Yeah, I'll work on that too.
> but then I realized that we still need a root-only-tunable maximum limit value to check
> any future per-event limit against, and this sysctl value does the job.
> Nice patch!
Thanks for reviewing it!
- Arnaldo
Em Tue, Apr 26, 2016 at 02:58:41PM -0700, Brendan Gregg escreveu:
> On Tue, Apr 26, 2016 at 2:05 PM, Arnaldo Carvalho de Melo
> <[email protected]> wrote:
> > Em Tue, Apr 26, 2016 at 01:02:34PM -0700, Brendan Gregg escreveu:
> >> On Mon, Apr 25, 2016 at 5:49 PM, Brendan Gregg <[email protected]> wrote:
> >> > On Mon, Apr 25, 2016 at 5:47 PM, Arnaldo Carvalho de Melo <[email protected]> wrote:
> >> >> Em Mon, Apr 25, 2016 at 05:44:00PM -0700, Alexei Starovoitov escreveu:
> >> >>> yep :)
> >> >>> hopefully Brendan can give it another spin.
> >> >>
> >> >> Agreed, and I'm calling it a day anyway, Brendan, please consider
> >> >> retesting, thanks,
> >> >
> >> > Will do, thanks!
> >>
> >> Looks good.
> >>
> >> I started with max depth = 512, and even that was still truncated, and
> >> had to profile again at 1024 to capture the full stacks. Seems to
> >> generally match the flame graph I generated with V1, which made me
> >> want to check that I'm running the new patch, and am:
> >>
> >> # grep six_hundred_forty_kb /proc/kallsyms
> >> ffffffff81c431e0 d six_hundred_forty_kb
> >>
> >> I was mucking around and was able to get "corrupted callchain.
> >> skipping..." errors, but these look to be expected -- that was
> >
> > Yeah, thanks for testing!
> >
> > And since you talked about userspace without frame pointers, have you
> > played with '--call-graph lbr'?
>
> Not really. Isn't it only 16 levels deep max?
Yeah, stoopid me :-\
> Most of our Linux instances are Xen guests (EC2), and I'd have to check if the
> MSRs are available for LBR (perf record --call-graph lbr ... returns
> "The sys_perf_event_open() syscall returned with 95 (Operation not
That is because it is only accepted for PERF_TYPE_HARDWARE on x86, i.e.
it returns EOPNOTSUPP for all the other cases, such as PERF_TYPE_SOFTWARE
counters like "cpu-clock".
> supported) for event (cpu-clock).", so I'd guess not, although many
> other MSRs are exposed).
>
> BTS seemed more promising (deeper stacks), and there's already Xen
> support for it (need to boot the Xen host with vpmu=bts, preferably
> vpmu=bts,arch for some PMCs as well :).
Yeah, worth a look, I guess, but it doesn't look like low-hanging fruit
tho.
- Arnaldo
Em Tue, Apr 26, 2016 at 10:45:28AM -0600, David Ahern escreveu:
> On 4/26/16 10:33 AM, Arnaldo Carvalho de Melo wrote:
> >So, for completeness, further testing it to see how far it goes on an 8GB
> >machine I got:
> >
> >[root@emilia ~]# echo 131100 > /proc/sys/kernel/perf_event_max_stack
> >[root@emilia ~]# perf record -g ls
> >Error:
> >The sys_perf_event_open() syscall returned with 12 (Cannot allocate memory) for event (cycles).
> >/bin/dmesg may provide additional information.
> >No CONFIG_PERF_EVENTS=y kernel support configured?
>
> I thought we fixed up the last line to not display for errors like this.
For most cases, yes.
> How about adding ENOMEM case to perf_evsel__open_strerror?
Will do.
- Arnaldo
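
A quick back-of-the-envelope on why 131100 trips ENOMEM, based on the
allocation scheme in the patch below (the sizes are derived from the quoted
code, not measured):

#include <stdio.h>

#define PERF_NR_CONTEXTS 4	/* task, softirq, hardirq, NMI */

int main(void)
{
	unsigned long long max_stack = 131100;
	/* One perf_callchain_entry: a __u64 nr plus max_stack __u64 ips */
	unsigned long long entry_sz  = 8 * (1 + max_stack);	/* ~1MB */
	unsigned long long percpu_sz = entry_sz * PERF_NR_CONTEXTS;

	/* ~4MB of contiguous kmalloc_node() per possible CPU; such a
	 * high-order allocation can easily fail, hence the ENOMEM. */
	printf("per-cpu buffer: %llu bytes\n", percpu_sz);
	return 0;
}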
Commit-ID: c5dfd78eb79851e278b7973031b9ca363da87a7e
Gitweb: http://git.kernel.org/tip/c5dfd78eb79851e278b7973031b9ca363da87a7e
Author: Arnaldo Carvalho de Melo <[email protected]>
AuthorDate: Thu, 21 Apr 2016 12:28:50 -0300
Committer: Arnaldo Carvalho de Melo <[email protected]>
CommitDate: Wed, 27 Apr 2016 10:20:39 -0300
perf core: Allow setting up max frame stack depth via sysctl
The default remains 127, which is good for most cases, and not even hit
most of the time, but then for some cases, as reported by Brendan, 1024+
deep frames are appearing on the radar for things like groovy, ruby.
And in some workloads putting a _lower_ cap on this may make sense. One
that is per event still needs to be put in place, though.
The new file is:
# cat /proc/sys/kernel/perf_event_max_stack
127
Changing it:
# echo 256 > /proc/sys/kernel/perf_event_max_stack
# cat /proc/sys/kernel/perf_event_max_stack
256
But as soon as there is some event using callchains we get:
# echo 512 > /proc/sys/kernel/perf_event_max_stack
-bash: echo: write error: Device or resource busy
#
Because we only allocate the callchain percpu data structures when there
is a user, which allows for changing the max easily, it's just a matter
of having no callchain users at that point.
Reported-and-Tested-by: Brendan Gregg <[email protected]>
Reviewed-by: Frederic Weisbecker <[email protected]>
Acked-by: Alexei Starovoitov <[email protected]>
Acked-by: David Ahern <[email protected]>
Cc: Adrian Hunter <[email protected]>
Cc: Alexander Shishkin <[email protected]>
Cc: He Kuang <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Masami Hiramatsu <[email protected]>
Cc: Milian Wolff <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Stephane Eranian <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Vince Weaver <[email protected]>
Cc: Wang Nan <[email protected]>
Cc: Zefan Li <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
---
Documentation/sysctl/kernel.txt | 14 ++++++++++++++
arch/arm/kernel/perf_callchain.c | 2 +-
arch/arm64/kernel/perf_callchain.c | 4 ++--
arch/metag/kernel/perf_callchain.c | 2 +-
arch/mips/kernel/perf_event.c | 4 ++--
arch/powerpc/perf/callchain.c | 4 ++--
arch/sparc/kernel/perf_event.c | 6 +++---
arch/x86/events/core.c | 4 ++--
arch/xtensa/kernel/perf_event.c | 4 ++--
include/linux/perf_event.h | 8 ++++++--
kernel/bpf/stackmap.c | 8 ++++----
kernel/events/callchain.c | 35 +++++++++++++++++++++++++++++++++--
kernel/sysctl.c | 12 ++++++++++++
13 files changed, 84 insertions(+), 23 deletions(-)
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 57653a4..260cde0 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -60,6 +60,7 @@ show up in /proc/sys/kernel:
- panic_on_warn
- perf_cpu_time_max_percent
- perf_event_paranoid
+- perf_event_max_stack
- pid_max
- powersave-nap [ PPC only ]
- printk
@@ -654,6 +655,19 @@ users (without CAP_SYS_ADMIN). The default value is 1.
==============================================================
+perf_event_max_stack:
+
+Controls maximum number of stack frames to copy for (attr.sample_type &
+PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using
+'perf record -g' or 'perf trace --call-graph fp'.
+
+This can only be done when no events are in use that have callchains
+enabled, otherwise writing to this file will return -EBUSY.
+
+The default value is 127.
+
+==============================================================
+
pid_max:
PID allocation wrap value. When the kernel's next PID value
diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c
index 4e02ae5..27563be 100644
--- a/arch/arm/kernel/perf_callchain.c
+++ b/arch/arm/kernel/perf_callchain.c
@@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
tail = (struct frame_tail __user *)regs->ARM_fp - 1;
- while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+ while ((entry->nr < sysctl_perf_event_max_stack) &&
tail && !((unsigned long)tail & 0x3))
tail = user_backtrace(tail, entry);
}
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index ff46654..32c3c6e 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
tail = (struct frame_tail __user *)regs->regs[29];
- while (entry->nr < PERF_MAX_STACK_DEPTH &&
+ while (entry->nr < sysctl_perf_event_max_stack &&
tail && !((unsigned long)tail & 0xf))
tail = user_backtrace(tail, entry);
} else {
@@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
tail = (struct compat_frame_tail __user *)regs->compat_fp - 1;
- while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+ while ((entry->nr < sysctl_perf_event_max_stack) &&
tail && !((unsigned long)tail & 0x3))
tail = compat_user_backtrace(tail, entry);
#endif
diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c
index 3156334..252abc1 100644
--- a/arch/metag/kernel/perf_callchain.c
+++ b/arch/metag/kernel/perf_callchain.c
@@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
--frame;
- while ((entry->nr < PERF_MAX_STACK_DEPTH) && frame)
+ while ((entry->nr < sysctl_perf_event_max_stack) && frame)
frame = user_backtrace(frame, entry);
}
diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c
index c1cf9c6..5021c54 100644
--- a/arch/mips/kernel/perf_event.c
+++ b/arch/mips/kernel/perf_event.c
@@ -35,7 +35,7 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry,
addr = *sp++;
if (__kernel_text_address(addr)) {
perf_callchain_store(entry, addr);
- if (entry->nr >= PERF_MAX_STACK_DEPTH)
+ if (entry->nr >= sysctl_perf_event_max_stack)
break;
}
}
@@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
}
do {
perf_callchain_store(entry, pc);
- if (entry->nr >= PERF_MAX_STACK_DEPTH)
+ if (entry->nr >= sysctl_perf_event_max_stack)
break;
pc = unwind_stack(current, &sp, pc, &ra);
} while (pc);
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index e04a675..22d9015 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
sp = regs->gpr[1];
perf_callchain_store(entry, next_ip);
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
fp = (unsigned long __user *) sp;
if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
return;
@@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
sp = regs->gpr[1];
perf_callchain_store(entry, next_ip);
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
fp = (unsigned int __user *) (unsigned long) sp;
if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
return;
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 6596f66..a4b8b5a 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
}
}
#endif
- } while (entry->nr < PERF_MAX_STACK_DEPTH);
+ } while (entry->nr < sysctl_perf_event_max_stack);
}
static inline int
@@ -1790,7 +1790,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
pc = sf.callers_pc;
ufp = (unsigned long)sf.fp + STACK_BIAS;
perf_callchain_store(entry, pc);
- } while (entry->nr < PERF_MAX_STACK_DEPTH);
+ } while (entry->nr < sysctl_perf_event_max_stack);
}
static void perf_callchain_user_32(struct perf_callchain_entry *entry,
@@ -1822,7 +1822,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
ufp = (unsigned long)sf.fp;
}
perf_callchain_store(entry, pc);
- } while (entry->nr < PERF_MAX_STACK_DEPTH);
+ } while (entry->nr < sysctl_perf_event_max_stack);
}
void
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 041e442..41d93d0 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2277,7 +2277,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
fp = compat_ptr(ss_base + regs->bp);
pagefault_disable();
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
unsigned long bytes;
frame.next_frame = 0;
frame.return_address = 0;
@@ -2337,7 +2337,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
return;
pagefault_disable();
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
unsigned long bytes;
frame.next_frame = NULL;
frame.return_address = 0;
diff --git a/arch/xtensa/kernel/perf_event.c b/arch/xtensa/kernel/perf_event.c
index 54f0118..a6b00b3 100644
--- a/arch/xtensa/kernel/perf_event.c
+++ b/arch/xtensa/kernel/perf_event.c
@@ -332,14 +332,14 @@ static int callchain_trace(struct stackframe *frame, void *data)
void perf_callchain_kernel(struct perf_callchain_entry *entry,
struct pt_regs *regs)
{
- xtensa_backtrace_kernel(regs, PERF_MAX_STACK_DEPTH,
+ xtensa_backtrace_kernel(regs, sysctl_perf_event_max_stack,
callchain_trace, NULL, entry);
}
void perf_callchain_user(struct perf_callchain_entry *entry,
struct pt_regs *regs)
{
- xtensa_backtrace_user(regs, PERF_MAX_STACK_DEPTH,
+ xtensa_backtrace_user(regs, sysctl_perf_event_max_stack,
callchain_trace, entry);
}
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 85749ae..a090700 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -58,7 +58,7 @@ struct perf_guest_info_callbacks {
struct perf_callchain_entry {
__u64 nr;
- __u64 ip[PERF_MAX_STACK_DEPTH];
+ __u64 ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
};
struct perf_raw_record {
@@ -993,9 +993,11 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
extern int get_callchain_buffers(void);
extern void put_callchain_buffers(void);
+extern int sysctl_perf_event_max_stack;
+
static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
{
- if (entry->nr < PERF_MAX_STACK_DEPTH) {
+ if (entry->nr < sysctl_perf_event_max_stack) {
entry->ip[entry->nr++] = ip;
return 0;
} else {
@@ -1017,6 +1019,8 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos);
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
static inline bool perf_paranoid_tracepoint_raw(void)
{
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 499d9e9..f5a1954 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -66,7 +66,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
value_size < 8 || value_size % 8 ||
- value_size / 8 > PERF_MAX_STACK_DEPTH)
+ value_size / 8 > sysctl_perf_event_max_stack)
return ERR_PTR(-EINVAL);
/* hash table size must be power of 2 */
@@ -124,8 +124,8 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
struct perf_callchain_entry *trace;
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
u32 max_depth = map->value_size / 8;
- /* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */
- u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth;
+ /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
+ u32 init_nr = sysctl_perf_event_max_stack - max_depth;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
u32 hash, id, trace_nr, trace_len;
bool user = flags & BPF_F_USER_STACK;
@@ -143,7 +143,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
return -EFAULT;
/* get_perf_callchain() guarantees that trace->nr >= init_nr
- * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth
+ * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
*/
trace_nr = trace->nr - init_nr;
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 343c22f..b9325e7 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -18,6 +18,14 @@ struct callchain_cpus_entries {
struct perf_callchain_entry *cpu_entries[0];
};
+int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
+
+static inline size_t perf_callchain_entry__sizeof(void)
+{
+ return (sizeof(struct perf_callchain_entry) +
+ sizeof(__u64) * sysctl_perf_event_max_stack);
+}
+
static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
static atomic_t nr_callchain_events;
static DEFINE_MUTEX(callchain_mutex);
@@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
if (!entries)
return -ENOMEM;
- size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+ size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
for_each_possible_cpu(cpu) {
entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
@@ -147,7 +155,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
cpu = smp_processor_id();
- return &entries->cpu_entries[cpu][*rctx];
+ return (((void *)entries->cpu_entries[cpu]) +
+ (*rctx * perf_callchain_entry__sizeof()));
}
static void
@@ -215,3 +224,25 @@ exit_put:
return entry;
}
+
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int new_value = sysctl_perf_event_max_stack, ret;
+ struct ctl_table new_table = *table;
+
+ new_table.data = &new_value;
+ ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos);
+ if (ret || !write)
+ return ret;
+
+ mutex_lock(&callchain_mutex);
+ if (atomic_read(&nr_callchain_events))
+ ret = -EBUSY;
+ else
+ sysctl_perf_event_max_stack = new_value;
+
+ mutex_unlock(&callchain_mutex);
+
+ return ret;
+}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 725587f..c8b3186 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -130,6 +130,9 @@ static int one_thousand = 1000;
#ifdef CONFIG_PRINTK
static int ten_thousand = 10000;
#endif
+#ifdef CONFIG_PERF_EVENTS
+static int six_hundred_forty_kb = 640 * 1024;
+#endif
/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
@@ -1144,6 +1147,15 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one_hundred,
},
+ {
+ .procname = "perf_event_max_stack",
+ .data = NULL, /* filled in by handler */
+ .maxlen = sizeof(sysctl_perf_event_max_stack),
+ .mode = 0644,
+ .proc_handler = perf_event_max_stack_handler,
+ .extra1 = &zero,
+ .extra2 = &six_hundred_forty_kb,
+ },
#endif
#ifdef CONFIG_KMEMCHECK
{
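
As a usage sketch against the handler above (assuming root and a kernel
with this patch applied), the -EBUSY from perf_event_max_stack_handler()
surfaces to userspace as a failed write:

#include <stdio.h>
#include <string.h>
#include <errno.h>

static int set_max_stack(const char *val)
{
	FILE *f = fopen("/proc/sys/kernel/perf_event_max_stack", "w");
	int err = 0;

	if (!f)
		return -errno;
	/* stdio may defer the write, so check fflush() too; with an
	 * active callchain user the kernel answers -EBUSY here. */
	if (fputs(val, f) < 0 || fflush(f) != 0)
		err = -errno;
	fclose(f);
	return err;
}

int main(void)
{
	int err = set_max_stack("1024");

	if (err)
		fprintf(stderr, "write failed: %s\n", strerror(-err));
	return 0;
}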
On Wed, Apr 27, 2016 at 09:53:58AM -0300, Arnaldo Carvalho de Melo wrote:
> On Tue, Apr 26, 2016 at 11:58:10PM +0200, Frederic Weisbecker wrote:
> > On Mon, Apr 25, 2016 at 09:29:28PM -0300, Arnaldo Carvalho de Melo wrote:
> > > commit cd544af4f7fede01cb512d52bb3efe62aa19271d
> > > Author: Arnaldo Carvalho de Melo <[email protected]>
> > > Date: Thu Apr 21 12:28:50 2016 -0300
> > >
> > > perf core: Allow setting up max frame stack depth via sysctl
> > >
> > > The default remains 127, which is good for most cases, and not even hit
> > > most of the time, but then for some cases, as reported by Brendan, 1024+
> > > deep frames are appearing on the radar for things like groovy, ruby.
> > >
> > > And in some workloads putting a _lower_ cap on this may make sense. One
> > > that is per event still needs to be put in place, though.
> > >
> > > The new file is:
> > >
> > > # cat /proc/sys/kernel/perf_event_max_stack
> > > 127
> > >
> > > Changing it:
> > >
> > > # echo 256 > /proc/sys/kernel/perf_event_max_stack
> > > # cat /proc/sys/kernel/perf_event_max_stack
> > > 256
> > >
> > > But as soon as there is some event using callchains we get:
> > >
> > > # echo 512 > /proc/sys/kernel/perf_event_max_stack
> > > -bash: echo: write error: Device or resource busy
> > > #
> > >
> > > Because we only allocate the callchain percpu data structures when there
> > > is a user, which allows for changing the max easily, it's just a matter
> > > of having no callchain users at that point.
> > >
> > > Reported-and-Tested-by: Brendan Gregg <[email protected]>
> > > Acked-by: Alexei Starovoitov <[email protected]>
> > > Acked-by: David Ahern <[email protected]>
> >
> > Reviewed-by: Frederic Weisbecker <[email protected]>
> >
> > I first thought that this should be a tunable per event instead of a global sysctl
>
> Yeah, I'll work on that too.
There is no rush though. The sysctl limit will probably be enough for most users. Unless
someone has requested it?
On 4/27/16 10:09 AM, Frederic Weisbecker wrote:
>>> I first thought that this should be a tunable per event instead of a global sysctl
>>
>> Yeah, I'll work on that too.
>
> There is no rush though. The sysfs limit will probably be enough for most users. Unless
> someone requested it?
>
I have. I spent time last winter (2015) looking into how to do it. The
userspace syntax was more of a pain than passing the parameters to the
kernel side, as the intent is to specify N kernel frames and M user frames.
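
For illustration only, a sketch of what the per-event side might look like
in perf_event_attr; the sample_max_stack field and its clamping semantics
are assumptions here, not part of the patch above, and the kernel/user
split David mentions would presumably need two such fields:

#include <linux/perf_event.h>
#include <string.h>

/* Hypothetical sketch: the per-event field below does not exist in the
 * patch above and its name is made up for illustration. */
static void setup_event(struct perf_event_attr *attr)
{
	memset(attr, 0, sizeof(*attr));
	attr->size        = sizeof(*attr);
	attr->type        = PERF_TYPE_HARDWARE;
	attr->config      = PERF_COUNT_HW_CPU_CYCLES;
	attr->sample_type = PERF_SAMPLE_CALLCHAIN;
	/* attr->sample_max_stack = 64;
	 *	hypothetical per-event cap, which the kernel would clamp
	 *	to the global perf_event_max_stack sysctl */
}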