Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751929AbcDTWrj (ORCPT ); Wed, 20 Apr 2016 18:47:39 -0400 Received: from mail.kernel.org ([198.145.29.136]:42472 "EHLO mail.kernel.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751335AbcDTWrh (ORCPT ); Wed, 20 Apr 2016 18:47:37 -0400 Date: Wed, 20 Apr 2016 19:47:30 -0300 From: Arnaldo Carvalho de Melo To: Peter Zijlstra Cc: Frederic Weisbecker , Ingo Molnar , Adrian Hunter , Brendan Gregg , Alexander Shishkin , Alexei Starovoitov , David Ahern , He Kuang , Jiri Olsa , Masami Hiramatsu , Milian Wolff , Namhyung Kim , Stephane Eranian , Thomas Gleixner , Vince Weaver , Wang Nan , Zefan Li , Linux Kernel Mailing List Subject: [PATCH/RFC] perf core: Allow setting up max frame stack depth via sysctl Message-ID: <20160420224730.GX3677@kernel.org> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline X-Url: http://acmel.wordpress.com User-Agent: Mutt/1.5.24 (2015-08-30) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 11760 Lines: 338 The default remains 127, which is good for most cases, and not even hit most of the time, but then for some cases, as reported by Brendan, 1024+ deep frames are appearing on the radar for things like groovy, ruby. And in some workloads putting a _lower_ cap on this may make sense. One that is per event still needs to be put in place tho. The new file is: # cat /proc/sys/kernel/perf_event_max_stack 127 Chaging it: # echo 256 > /proc/sys/kernel/perf_event_max_stack # cat /proc/sys/kernel/perf_event_max_stack 256 But as soon as there is some event using callchains we get: # echo 512 > /proc/sys/kernel/perf_event_max_stack -bash: echo: write error: Device or resource busy # Because we only allocate the callchain percpu data structures when there is a user, which allows for changing the max easily, its just a matter of having no callchain users at that point. Reported-by: Brendan Gregg Link: http://lkml.kernel.org/n/tip-cgls6uuncwjtq969tys1j6b0@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 57653a44b128..260cde08e92e 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -60,6 +60,7 @@ show up in /proc/sys/kernel: - panic_on_warn - perf_cpu_time_max_percent - perf_event_paranoid +- perf_event_max_stack - pid_max - powersave-nap [ PPC only ] - printk @@ -654,6 +655,19 @@ users (without CAP_SYS_ADMIN). The default value is 1. ============================================================== +perf_event_max_stack: + +Controls maximum number of stack frames to copy for (attr.sample_type & +PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using +'perf record -g' or 'perf trace --call-graph fp'. + +This can only be done when no events are in use that have callchains +enabled, otherwise writing to this file will return -EBUSY. + +The default value is 127. + +============================================================== + pid_max: PID allocation wrap value. When the kernel's next PID value diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c index 4e02ae5950ff..27563befa8a2 100644 --- a/arch/arm/kernel/perf_callchain.c +++ b/arch/arm/kernel/perf_callchain.c @@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) tail = (struct frame_tail __user *)regs->ARM_fp - 1; - while ((entry->nr < PERF_MAX_STACK_DEPTH) && + while ((entry->nr < sysctl_perf_event_max_stack) && tail && !((unsigned long)tail & 0x3)) tail = user_backtrace(tail, entry); } diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index ff4665462a02..32c3c6e70119 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry, tail = (struct frame_tail __user *)regs->regs[29]; - while (entry->nr < PERF_MAX_STACK_DEPTH && + while (entry->nr < sysctl_perf_event_max_stack && tail && !((unsigned long)tail & 0xf)) tail = user_backtrace(tail, entry); } else { @@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry, tail = (struct compat_frame_tail __user *)regs->compat_fp - 1; - while ((entry->nr < PERF_MAX_STACK_DEPTH) && + while ((entry->nr < sysctl_perf_event_max_stack) && tail && !((unsigned long)tail & 0x3)) tail = compat_user_backtrace(tail, entry); #endif diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c index 315633461a94..252abc12a5a3 100644 --- a/arch/metag/kernel/perf_callchain.c +++ b/arch/metag/kernel/perf_callchain.c @@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) --frame; - while ((entry->nr < PERF_MAX_STACK_DEPTH) && frame) + while ((entry->nr < sysctl_perf_event_max_stack) && frame) frame = user_backtrace(frame, entry); } diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c index c1cf9c6c3f77..5021c546ad07 100644 --- a/arch/mips/kernel/perf_event.c +++ b/arch/mips/kernel/perf_event.c @@ -35,7 +35,7 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry, addr = *sp++; if (__kernel_text_address(addr)) { perf_callchain_store(entry, addr); - if (entry->nr >= PERF_MAX_STACK_DEPTH) + if (entry->nr >= sysctl_perf_event_max_stack) break; } } @@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry, } do { perf_callchain_store(entry, pc); - if (entry->nr >= PERF_MAX_STACK_DEPTH) + if (entry->nr >= sysctl_perf_event_max_stack) break; pc = unwind_stack(current, &sp, pc, &ra); } while (pc); diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index e04a6752b399..22d9015c1acc 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry, sp = regs->gpr[1]; perf_callchain_store(entry, next_ip); - while (entry->nr < PERF_MAX_STACK_DEPTH) { + while (entry->nr < sysctl_perf_event_max_stack) { fp = (unsigned long __user *) sp; if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp)) return; @@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry, sp = regs->gpr[1]; perf_callchain_store(entry, next_ip); - while (entry->nr < PERF_MAX_STACK_DEPTH) { + while (entry->nr < sysctl_perf_event_max_stack) { fp = (unsigned int __user *) (unsigned long) sp; if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp)) return; diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 6596f66ce112..a4b8b5aed21c 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry, } } #endif - } while (entry->nr < PERF_MAX_STACK_DEPTH); + } while (entry->nr < sysctl_perf_event_max_stack); } static inline int @@ -1790,7 +1790,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry, pc = sf.callers_pc; ufp = (unsigned long)sf.fp + STACK_BIAS; perf_callchain_store(entry, pc); - } while (entry->nr < PERF_MAX_STACK_DEPTH); + } while (entry->nr < sysctl_perf_event_max_stack); } static void perf_callchain_user_32(struct perf_callchain_entry *entry, @@ -1822,7 +1822,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry, ufp = (unsigned long)sf.fp; } perf_callchain_store(entry, pc); - } while (entry->nr < PERF_MAX_STACK_DEPTH); + } while (entry->nr < sysctl_perf_event_max_stack); } void diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 041e442a3e28..41d93d0e972b 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2277,7 +2277,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) fp = compat_ptr(ss_base + regs->bp); pagefault_disable(); - while (entry->nr < PERF_MAX_STACK_DEPTH) { + while (entry->nr < sysctl_perf_event_max_stack) { unsigned long bytes; frame.next_frame = 0; frame.return_address = 0; @@ -2337,7 +2337,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) return; pagefault_disable(); - while (entry->nr < PERF_MAX_STACK_DEPTH) { + while (entry->nr < sysctl_perf_event_max_stack) { unsigned long bytes; frame.next_frame = NULL; frame.return_address = 0; diff --git a/arch/xtensa/kernel/perf_event.c b/arch/xtensa/kernel/perf_event.c index 54f01188c29c..a6b00b3af429 100644 --- a/arch/xtensa/kernel/perf_event.c +++ b/arch/xtensa/kernel/perf_event.c @@ -332,14 +332,14 @@ static int callchain_trace(struct stackframe *frame, void *data) void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) { - xtensa_backtrace_kernel(regs, PERF_MAX_STACK_DEPTH, + xtensa_backtrace_kernel(regs, sysctl_perf_event_max_stack, callchain_trace, NULL, entry); } void perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) { - xtensa_backtrace_user(regs, PERF_MAX_STACK_DEPTH, + xtensa_backtrace_user(regs, sysctl_perf_event_max_stack, callchain_trace, entry); } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b8b195fbe787..d599ed32af92 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -58,7 +58,7 @@ struct perf_guest_info_callbacks { struct perf_callchain_entry { __u64 nr; - __u64 ip[PERF_MAX_STACK_DEPTH]; + __u64 ip[0]; /* /proc/sys/kernel/perf_event_max_stack */ }; struct perf_raw_record { @@ -983,9 +983,11 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, extern int get_callchain_buffers(void); extern void put_callchain_buffers(void); +extern int sysctl_perf_event_max_stack; + static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip) { - if (entry->nr < PERF_MAX_STACK_DEPTH) { + if (entry->nr < sysctl_perf_event_max_stack) { entry->ip[entry->nr++] = ip; return 0; } else { @@ -1007,6 +1009,8 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +int perf_event_max_stack_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); static inline bool perf_paranoid_tracepoint_raw(void) { diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 343c22f5e867..1390747b2195 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -18,6 +18,14 @@ struct callchain_cpus_entries { struct perf_callchain_entry *cpu_entries[0]; }; +int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; + +static size_t perf_callchain_entry__sizeof(void) +{ + return sizeof(struct perf_callchain_entry) + + sizeof(__u64) * sysctl_perf_event_max_stack; +} + static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); static atomic_t nr_callchain_events; static DEFINE_MUTEX(callchain_mutex); @@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void) if (!entries) return -ENOMEM; - size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; + size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS; for_each_possible_cpu(cpu) { entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, @@ -215,3 +223,25 @@ exit_put: return entry; } + +int perf_event_max_stack_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int new_value = sysctl_perf_event_max_stack, ret; + struct ctl_table new_table = *table; + + new_table.data = &new_value; + ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + mutex_lock(&callchain_mutex); + if (atomic_read(&nr_callchain_events)) + ret = -EBUSY; + else + sysctl_perf_event_max_stack = new_value; + + mutex_unlock(&callchain_mutex); + + return ret; +} diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 725587f10667..afbe9fb32ec5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1144,6 +1144,14 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one_hundred, }, + { + .procname = "perf_event_max_stack", + .data = NULL, /* filled in by handler */ + .maxlen = sizeof(sysctl_perf_event_max_stack), + .mode = 0644, + .proc_handler = perf_event_max_stack_handler, + .extra1 = &zero, + }, #endif #ifdef CONFIG_KMEMCHECK {