Date:	Thu, 11 Jan 2018 16:29:30 +0100
From:	Peter Zijlstra
To:	Vince Weaver
Cc:	Ingo Molnar, linux-kernel@vger.kernel.org, Ingo Molnar,
	Arnaldo Carvalho de Melo, Thomas Gleixner
Subject: Re: perf: perf_fuzzer quickly locks up on 4.15-rc7
Message-ID: <20180111152930.GF2369@hirez.programming.kicks-ass.net>
References: <20180108173005.lkglqrixb2ota6g2@gmail.com>
 <20180109102507.GG6176@hirez.programming.kicks-ass.net>
 <20180109132602.GA2369@hirez.programming.kicks-ass.net>
 <20180111091353.GW6176@hirez.programming.kicks-ass.net>
In-Reply-To: <20180111091353.GW6176@hirez.programming.kicks-ass.net>
User-Agent: Mutt/1.9.2 (2017-12-15)

On Thu, Jan 11, 2018 at 10:13:53AM +0100, Peter Zijlstra wrote:
> On Tue, Jan 09, 2018 at 08:44:23AM -0500, Vince Weaver wrote:
> > On Tue, 9 Jan 2018, Peter Zijlstra wrote:
> >
> > > So remind me again, how are you running that fuzzer? I'm running
> > > ./fast_repro99.sh as root.
> >
> > I'm running ./fast_repro98.sh on a regular haswell machine with paranoid
> > set to "0".
>
> I'm seeing things like:
>
>   Cannot open /sys/kernel/tracing/kprobe_events
>
> this is likely caused by me not having anything mounted there. Rostedt
> provided the magic incantation to make that work, I'll go try now.
>
> At least I seem to have cured all lockdep splats.. of course, the moment
> I hit send on this something bad is bound to happen.

So while the IVB-EP (6/62/4) goes wobbly after a while, I also started
fuzzing on my HSW-EX (6/63/4) and there it runs without issue for over
30 minutes.

(I get an unchecked MSR splat from the RAPL pmu, but that is non-fatal)

But so far no weird hangs.. running tip/master + below.

Sadly the only other HSW I have is a laptop, and those are absolutely
useless for debugging (no serial :/) but I'll go see if I can reproduce.

FWIW these test-boxes are running debian/testing

---
 Documentation/admin-guide/kernel-parameters.txt |   5 +
 arch/x86/events/intel/ds.c                      |  33 ++++---
 kernel/events/core.c                            |  47 +++++++++-
 kernel/locking/lockdep.c                        |  10 ++
 kernel/printk/printk.c                          | 119 +++++++++++++++++-------
 kernel/smp.c                                    |  10 ++
 6 files changed, 171 insertions(+), 53 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 8bc512782bd2..7fa1a59461f7 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1169,6 +1169,11 @@
 			parameter will force ia64_sal_cache_flush to call
 			ia64_pal_cache_flush instead of SAL_CACHE_FLUSH.
 
+	force_early_printk
+			Forcefully uses early_console (as per earlyprintk=)
+			usage for regular printk, bypassing everything,
+			including the syslog (dmesg will be empty).
+
 	forcepae [X86-32]
 			Forcefully enable Physical Address Extension (PAE).
 			Many Pentium M systems disable PAE but may have a
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 8156e47da7ba..18c25ab28557 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -372,10 +372,9 @@ static int alloc_pebs_buffer(int cpu)
 static void release_pebs_buffer(int cpu)
 {
 	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
-	struct debug_store *ds = hwev->ds;
 	void *cea;
 
-	if (!ds || !x86_pmu.pebs)
+	if (!x86_pmu.pebs)
 		return;
 
 	kfree(per_cpu(insn_buffer, cpu));
@@ -384,7 +383,6 @@ static void release_pebs_buffer(int cpu)
 	/* Clear the fixmap */
 	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
 	ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
-	ds->pebs_buffer_base = 0;
 	dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
 	hwev->ds_pebs_vaddr = NULL;
 }
@@ -419,16 +417,14 @@ static int alloc_bts_buffer(int cpu)
 static void release_bts_buffer(int cpu)
 {
 	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
-	struct debug_store *ds = hwev->ds;
 	void *cea;
 
-	if (!ds || !x86_pmu.bts)
+	if (!x86_pmu.bts)
 		return;
 
 	/* Clear the fixmap */
 	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
 	ds_clear_cea(cea, BTS_BUFFER_SIZE);
-	ds->bts_buffer_base = 0;
 	dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
 	hwev->ds_bts_vaddr = NULL;
 }
@@ -454,16 +450,22 @@ void release_ds_buffers(void)
 	if (!x86_pmu.bts && !x86_pmu.pebs)
 		return;
 
-	get_online_cpus();
-	for_each_online_cpu(cpu)
+	for_each_possible_cpu(cpu)
+		release_ds_buffer(cpu);
+
+	for_each_possible_cpu(cpu) {
+		/*
+		 * Again, ignore errors from offline CPUs, they will no longer
+		 * observe cpu_hw_events.ds and not program the DS_AREA when
+		 * they come up.
+		 */
 		fini_debug_store_on_cpu(cpu);
+	}
 
 	for_each_possible_cpu(cpu) {
 		release_pebs_buffer(cpu);
 		release_bts_buffer(cpu);
-		release_ds_buffer(cpu);
 	}
-	put_online_cpus();
 }
 
 void reserve_ds_buffers(void)
@@ -483,8 +485,6 @@ void reserve_ds_buffers(void)
 	if (!x86_pmu.pebs)
 		pebs_err = 1;
 
-	get_online_cpus();
-
 	for_each_possible_cpu(cpu) {
 		if (alloc_ds_buffer(cpu)) {
 			bts_err = 1;
@@ -521,11 +521,14 @@ void reserve_ds_buffers(void)
 		if (x86_pmu.pebs && !pebs_err)
 			x86_pmu.pebs_active = 1;
 
-		for_each_online_cpu(cpu)
+		for_each_possible_cpu(cpu) {
+			/*
+			 * Ignores wrmsr_on_cpu() errors for offline CPUs they
+			 * will get this call through intel_pmu_cpu_starting().
+			 */
 			init_debug_store_on_cpu(cpu);
+		}
 	}
-
-	put_online_cpus();
 }
 
 /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4e1a1bf8d867..fb079be8971a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1231,6 +1231,10 @@ static void put_ctx(struct perf_event_context *ctx)
  *	      perf_event_context::lock
  *	    perf_event::mmap_mutex
  *	    mmap_sem
+ *
+ *    cpu_hotplug_lock
+ *      pmus_lock
+ *	  cpuctx->mutex / perf_event_context::mutex
  */
 static struct perf_event_context *
 perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
@@ -4196,6 +4200,7 @@ int perf_event_release_kernel(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
 	struct perf_event *child, *tmp;
+	LIST_HEAD(free_list);
 
 	/*
 	 * If we got here through err_file: fput(event_file); we will not have
@@ -4268,8 +4273,7 @@ int perf_event_release_kernel(struct perf_event *event)
 					       struct perf_event, child_list);
 		if (tmp == child) {
 			perf_remove_from_context(child, DETACH_GROUP);
-			list_del(&child->child_list);
-			free_event(child);
+			list_move(&child->child_list, &free_list);
 			/*
 			 * This matches the refcount bump in inherit_event();
 			 * this can't be the last reference.
@@ -4284,6 +4288,11 @@ int perf_event_release_kernel(struct perf_event *event)
 	}
 	mutex_unlock(&event->child_mutex);
 
+	list_for_each_entry_safe(child, tmp, &free_list, child_list) {
+		list_del(&child->child_list);
+		free_event(child);
+	}
+
 no_ctx:
 	put_event(event); /* Must be the 'last' reference */
 	return 0;
@@ -8526,6 +8535,29 @@ perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
 	return ret;
 }
 
+static int
+perf_tracepoint_set_filter(struct perf_event *event, char *filter_str)
+{
+	struct perf_event_context *ctx = event->ctx;
+	int ret;
+
+	/*
+	 * Beware, here be dragons!!
+	 *
+	 * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint
+	 * stuff does not actually need it. So temporarily drop ctx->mutex. As per
+	 * perf_event_ctx_lock() we already have a reference on ctx.
+	 *
+	 * This can result in event getting moved to a different ctx, but that
+	 * does not affect the tracepoint state.
+	 */
+	mutex_unlock(&ctx->mutex);
+	ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+	mutex_lock(&ctx->mutex);
+
+	return ret;
+}
+
 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
 {
 	char *filter_str;
@@ -8542,8 +8574,7 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg)
 
 	if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
 	    event->attr.type == PERF_TYPE_TRACEPOINT)
-		ret = ftrace_profile_set_filter(event, event->attr.config,
-						filter_str);
+		ret = perf_tracepoint_set_filter(event, filter_str);
 	else if (has_addr_filter(event))
 		ret = perf_event_set_addr_filter(event, filter_str);
 
@@ -9178,7 +9209,13 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
 	if (!try_module_get(pmu->module))
 		return -ENODEV;
 
-	if (event->group_leader != event) {
+	/*
+	 * A number of pmu->event_init() methods iterate the sibling_list to,
+	 * for example, validate if the group fits on the PMU. Therefore,
+	 * if this is a sibling event, acquire the ctx->mutex to protect
+	 * the sibling_list.
+	 */
+	if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
 		/*
 		 * This ctx->mutex can nest when we're called through
 		 * inheritance. See the perf_event_ctx_lock_nested() comment.
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 5fa1324a4f29..fb81dc92aaa6 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2668,8 +2668,12 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
 			this, 0, irqclass);
 }
 
+DECLARE_PER_CPU(struct cpumask *, smp_call_mask);
+
 void print_irqtrace_events(struct task_struct *curr)
 {
+	struct cpumask *__mask = this_cpu_read(smp_call_mask);
+
 	printk("irq event stamp: %u\n", curr->irq_events);
 	printk("hardirqs last enabled at (%u): [<%p>] %pS\n",
 		curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip,
@@ -2683,6 +2687,12 @@ void print_irqtrace_events(struct task_struct *curr)
 	printk("softirqs last disabled at (%u): [<%p>] %pS\n",
 		curr->softirq_disable_event, (void *)curr->softirq_disable_ip,
 		(void *)curr->softirq_disable_ip);
+
+	if (__mask) {
+		printk("smp_function_call mask: %*pbl\n", cpumask_pr_args(__mask));
+	} else {
+		printk("smp_function_call mask: (NULL)\n");
+	}
 }
 
 static int HARDIRQ_verbose(struct lock_class *class)
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index b9006617710f..c1a4ed17c22c 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -365,6 +365,75 @@ __packed __aligned(4)
 #endif
 ;
 
+#ifdef CONFIG_EARLY_PRINTK
+struct console *early_console;
+
+static bool __read_mostly force_early_printk;
+
+static int __init force_early_printk_setup(char *str)
+{
+	force_early_printk = true;
+	return 0;
+}
+early_param("force_early_printk", force_early_printk_setup);
+
+static int early_printk_cpu = -1;
+
+static int early_vprintk(const char *fmt, va_list args)
+{
+	int n, cpu, old;
+	char buf[512];
+
+	cpu = get_cpu();
+	/*
+	 * Test-and-Set inter-cpu spinlock with recursion.
+	 */
+	for (;;) {
+		/*
+		 * c-cas to avoid the exclusive bouncing on spin.
+		 * Depends on the memory barrier implied by cmpxchg
+		 * for ACQUIRE semantics.
+		 */
+		old = READ_ONCE(early_printk_cpu);
+		if (old == -1) {
+			old = cmpxchg(&early_printk_cpu, -1, cpu);
+			if (old == -1)
+				break;
+		}
+		/*
+		 * Allow recursion for interrupts and the like.
+		 */
+		if (old == cpu)
+			break;
+
+		cpu_relax();
+	}
+
+	n = vscnprintf(buf, sizeof(buf), fmt, args);
+	early_console->write(early_console, buf, n);
+
+	/*
+	 * Unlock -- in case @old == @cpu, this is a no-op.
+	 */
+	smp_store_release(&early_printk_cpu, old);
+	put_cpu();
+
+	return n;
+}
+
+asmlinkage __visible void early_printk(const char *fmt, ...)
+{
+	va_list ap;
+
+	if (!early_console)
+		return;
+
+	va_start(ap, fmt);
+	early_vprintk(fmt, ap);
+	va_end(ap);
+}
+#endif
+
 /*
  * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken
  * within the scheduler's rq lock. It must be released before calling
@@ -1692,6 +1761,16 @@ asmlinkage int vprintk_emit(int facility, int level,
 	int printed_len;
 	bool in_sched = false;
 
+#ifdef CONFIG_KGDB_KDB
+	if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0))
+		return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
+#endif
+
+#ifdef CONFIG_EARLY_PRINTK
+	if (force_early_printk && early_console)
+		return early_vprintk(fmt, args);
+#endif
+
 	if (level == LOGLEVEL_SCHED) {
 		level = LOGLEVEL_DEFAULT;
 		in_sched = true;
@@ -1784,18 +1863,7 @@ EXPORT_SYMBOL(printk_emit);
 
 int vprintk_default(const char *fmt, va_list args)
 {
-	int r;
-
-#ifdef CONFIG_KGDB_KDB
-	/* Allow to pass printk() to kdb but avoid a recursion. */
-	if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) {
-		r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
-		return r;
-	}
-#endif
-	r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
-
-	return r;
+	return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
 }
 EXPORT_SYMBOL_GPL(vprintk_default);
@@ -1826,7 +1894,12 @@ asmlinkage __visible int printk(const char *fmt, ...)
 	int r;
 
 	va_start(args, fmt);
-	r = vprintk_func(fmt, args);
+#ifdef CONFIG_EARLY_PRINTK
+	if (force_early_printk && early_console)
+		r = vprintk_default(fmt, args);
+	else
+#endif
+		r = vprintk_func(fmt, args);
 	va_end(args);
 
 	return r;
@@ -1863,26 +1936,6 @@ static bool suppress_message_printing(int level) { return false; }
 
 #endif /* CONFIG_PRINTK */
 
-#ifdef CONFIG_EARLY_PRINTK
-struct console *early_console;
-
-asmlinkage __visible void early_printk(const char *fmt, ...)
-{
-	va_list ap;
-	char buf[512];
-	int n;
-
-	if (!early_console)
-		return;
-
-	va_start(ap, fmt);
-	n = vscnprintf(buf, sizeof(buf), fmt, ap);
-	va_end(ap);
-
-	early_console->write(early_console, buf, n);
-}
-#endif
-
 static int __add_preferred_console(char *name, int idx, char *options,
 				   char *brl_options)
 {
diff --git a/kernel/smp.c b/kernel/smp.c
index 084c8b3a2681..d014602e13f7 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -257,6 +257,8 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
 		irq_work_run();
 }
 
+DEFINE_PER_CPU(const struct cpumask *, smp_call_mask);
+
 /*
  * smp_call_function_single - Run a function on a specific CPU
  * @func: The function to run. This must be fast and non-blocking.
@@ -296,11 +298,15 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
 		csd_lock(csd);
 	}
 
+	this_cpu_write(smp_call_mask, cpumask_of(cpu));
+
 	err = generic_exec_single(cpu, csd, func, info);
 
 	if (wait)
 		csd_lock_wait(csd);
 
+	this_cpu_write(smp_call_mask, NULL);
+
 	put_cpu();
 
 	return err;
@@ -457,6 +463,8 @@ void smp_call_function_many(const struct cpumask *mask,
 		__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
 	}
 
+	this_cpu_write(smp_call_mask, cfd->cpumask_ipi);
+
 	/* Send a message to all CPUs in the map */
 	arch_send_call_function_ipi_mask(cfd->cpumask_ipi);
 
@@ -468,6 +476,8 @@ void smp_call_function_many(const struct cpumask *mask,
 			csd_lock_wait(csd);
 		}
 	}
+
+	this_cpu_write(smp_call_mask, NULL);
 }
 EXPORT_SYMBOL(smp_call_function_many);
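
[ Editorial note, not part of the patch above: the early_vprintk() locking in
  the printk.c hunk is a test-and-set spinlock keyed on the owning CPU, which
  makes it recursion-safe -- a nested printk from an interrupt or NMI on the
  owning CPU falls through instead of deadlocking, and its unlock store writes
  the old owner back, so it is a no-op. Below is a minimal user-space sketch of
  the same idea using C11 atomics, with a caller id standing in for the CPU id;
  the names (emergency_printf, console_owner, emergency_write) are invented for
  illustration only. ]

/* console_lock_demo.c - illustrative sketch, not kernel code. */
#include <stdarg.h>
#include <stdatomic.h>
#include <stdio.h>

/* -1 means unlocked, otherwise the id of the current owner. */
static atomic_int console_owner = -1;

/* Stand-in for the kernel's early_console->write(); dumps bytes to stderr. */
static void emergency_write(const char *buf, int n)
{
	fwrite(buf, 1, (size_t)n, stderr);
}

/* 'self' plays the role of the CPU id in the kernel version. */
static int emergency_printf(int self, const char *fmt, ...)
{
	char buf[512];
	va_list ap;
	int n, old;

	for (;;) {
		/* Plain load first so the spin does not bounce the line with CAS. */
		old = atomic_load_explicit(&console_owner, memory_order_relaxed);
		if (old == -1 &&
		    atomic_compare_exchange_strong_explicit(&console_owner, &old, self,
							    memory_order_acquire,
							    memory_order_relaxed))
			break;		/* lock taken, old stays -1 */
		if (old == self)
			break;		/* recursion: we already own it */
		/* someone else owns it; keep spinning */
	}

	va_start(ap, fmt);
	n = vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);
	if (n > (int)sizeof(buf) - 1)
		n = (int)sizeof(buf) - 1;	/* clamp on truncation */
	emergency_write(buf, n);

	/* Storing 'old' back makes the unlock a no-op in the recursive case. */
	atomic_store_explicit(&console_owner, old, memory_order_release);
	return n;
}

int main(void)
{
	emergency_printf(0, "hello from caller %d\n", 0);
	/* A nested call with the same id (think: NMI on the owning CPU)
	 * would also complete instead of deadlocking. */
	return 0;
}

[ The point of releasing with 'old' rather than -1 is exactly the recursive
  case: the inner call must leave the lock owned by the outer frame, which is
  what the smp_store_release(&early_printk_cpu, old) in the patch does. ]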