From: kan.liang@intel.com
To: peterz@infradead.org, mingo@redhat.com, linux-kernel@vger.kernel.org
Cc: acme@kernel.org, tglx@linutronix.de, alexander.shishkin@linux.intel.com,
    vince@deater.net, eranian@google.com, andi@firstfloor.org,
    Kan Liang <kan.liang@intel.com>
Subject: [PATCH] perf/core: introduce context per CPU event list
Date: Wed, 9 Nov 2016 14:04:46 -0500
Message-Id: <1478718286-12824-1-git-send-email-kan.liang@intel.com>
X-Mailer: git-send-email 2.5.5

From: Kan Liang <kan.liang@intel.com>

The perf per-process monitoring overhead grows rapidly with the number of
events and the number of CPUs. Here is some data from an overhead test on a
Skylake server with 64 logical CPUs. The elapsed time of AIM7 is used to
measure the overhead:

  perf record -e $event_list -p $pid    # $pid is the pid of AIM7

  Elapsed time A: elapsed time of AIM7 alone
  Elapsed time B: elapsed time of AIM7 while perf is running in parallel
  Overhead = (B - A) / A

  Events#   Overhead (%)
  1          0.45
  2          1.20
  4          3.85
  8         15.82
  16        50.24

Most of the increased overhead comes from perf_iterate_ctx(), because it
iterates over the whole RCU list ctx->event_list, and that list grows
quickly as events are added: opening one event for a per-process profile
actually inserts one event per CPU into the event_list, so the list length
scales with the number of CPUs. Iterating the whole list on every call
becomes very expensive on systems with many CPUs.

In fact, the whole RCU list does not need to be iterated every time; often
only the events on the current CPU need to be output. Introduce a per-CPU
event list that splits the single big RCU list into small per-CPU RCU
lists, which significantly reduces the overhead:

  Events#   Overhead (%)
  1          0.15
  2          1.06
  4          1.85
  8          9.99
  16        17.26
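For readers less familiar with the kernel's per-CPU allocation API, here is a
minimal sketch of the pattern the patch relies on. It is an illustration only,
not part of the patch; the demo_* names are made up for the example:

  /* Illustration only -- not part of this patch. */
  #include <linux/percpu.h>
  #include <linux/list.h>
  #include <linux/rculist.h>
  #include <linux/errno.h>

  struct demo_ctx {
          struct list_head __percpu *event_list;  /* one list head per CPU */
  };

  /* Allocate and initialize one list head for every possible CPU. */
  static int demo_ctx_init(struct demo_ctx *ctx)
  {
          int cpu;

          ctx->event_list = alloc_percpu(struct list_head);
          if (!ctx->event_list)
                  return -ENOMEM;
          for_each_possible_cpu(cpu)
                  INIT_LIST_HEAD(per_cpu_ptr(ctx->event_list, cpu));
          return 0;
  }

  /* Add an entry to the list of its owning CPU (CPU 0 if cpu == -1). */
  static void demo_add(struct demo_ctx *ctx, struct list_head *entry, int cpu)
  {
          list_add_rcu(entry, per_cpu_ptr(ctx->event_list, cpu == -1 ? 0 : cpu));
  }

  /*
   * Fast path: walk only the current CPU's list instead of one big global
   * list. Caller is expected to be in a non-preemptible, RCU-protected section.
   */
  #define demo_for_each_local(pos, ctx, member) \
          list_for_each_entry_rcu(pos, this_cpu_ptr((ctx)->event_list), member)

Note that in the patch itself a per-CPU reader must also visit CPU 0's list,
because events with event->cpu == -1 are parked there (see the cpu != 0 /
goto again blocks below).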
Signed-off-by: Kan Liang <kan.liang@intel.com>
---
 include/linux/perf_event.h |   3 +-
 kernel/events/core.c       | 177 ++++++++++++++++++++++++++++++++++-----------
 2 files changed, 138 insertions(+), 42 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4741ecd..1222a33 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -734,7 +734,8 @@ struct perf_event_context {
         struct list_head        active_ctx_list;
         struct list_head        pinned_groups;
         struct list_head        flexible_groups;
-        struct list_head        event_list;
+        struct list_head        *__percpu event_list;
+
         int                     nr_events;
         int                     nr_active;
         int                     is_active;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0e29213..00c12df 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1118,6 +1118,9 @@ static void free_ctx(struct rcu_head *head)
 
         ctx = container_of(head, struct perf_event_context, rcu_head);
         kfree(ctx->task_ctx_data);
+
+        free_percpu(ctx->event_list);
+
         kfree(ctx);
 }
 
@@ -1461,6 +1464,7 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 static void
 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 {
+        struct list_head *t_list;
 
         lockdep_assert_held(&ctx->lock);
 
@@ -1483,7 +1487,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 
         list_update_cgroup_event(event, ctx, true);
 
-        list_add_rcu(&event->event_entry, &ctx->event_list);
+        /* If event CPU is not set, add the event to the list of CPU 0 */
+        t_list = per_cpu_ptr(ctx->event_list, event->cpu == -1 ? 0 : event->cpu);
+        list_add_rcu(&event->event_entry, t_list);
         ctx->nr_events++;
         if (event->attr.inherit_stat)
                 ctx->nr_stat++;
@@ -2749,25 +2755,30 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
                                  struct perf_event_context *next_ctx)
 {
         struct perf_event *event, *next_event;
+        int cpu;
 
         if (!ctx->nr_stat)
                 return;
 
         update_context_time(ctx);
 
-        event = list_first_entry(&ctx->event_list,
-                                 struct perf_event, event_entry);
+        for_each_possible_cpu(cpu) {
+                struct list_head *list, *next_list;
 
-        next_event = list_first_entry(&next_ctx->event_list,
-                                      struct perf_event, event_entry);
+                list = per_cpu_ptr(ctx->event_list, cpu);
+                next_list = per_cpu_ptr(next_ctx->event_list, cpu);
 
-        while (&event->event_entry != &ctx->event_list &&
-               &next_event->event_entry != &next_ctx->event_list) {
+                event = list_first_entry(list, struct perf_event, event_entry);
+                next_event = list_first_entry(next_list, struct perf_event, event_entry);
 
-                __perf_event_sync_stat(event, next_event);
+                while (&event->event_entry != list &&
+                       &next_event->event_entry != next_list) {
 
-                event = list_next_entry(event, event_entry);
-                next_event = list_next_entry(next_event, event_entry);
+                        __perf_event_sync_stat(event, next_event);
+
+                        event = list_next_entry(event, event_entry);
+                        next_event = list_next_entry(next_event, event_entry);
+                }
         }
 }
 
@@ -3241,7 +3252,9 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
         struct perf_event *event;
         struct hw_perf_event *hwc;
         u64 now, period = TICK_NSEC;
+        struct list_head *t_list;
         s64 delta;
+        int cpu;
 
         /*
          * only need to iterate over all events iff:
@@ -3254,7 +3267,11 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
         raw_spin_lock(&ctx->lock);
         perf_pmu_disable(ctx->pmu);
 
-        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+        cpu = smp_processor_id();
+again:
+        t_list = per_cpu_ptr(ctx->event_list, cpu);
+
+        list_for_each_entry_rcu(event, t_list, event_entry) {
                 if (event->state != PERF_EVENT_STATE_ACTIVE)
                         continue;
 
@@ -3298,6 +3315,15 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
                 perf_pmu_enable(event->pmu);
         }
 
+        /*
+         * the event->cpu may be -1.
+         * If so, the event is stored in CPU0's event_list.
+         */
+        if (cpu != 0) {
+                cpu = 0;
+                goto again;
+        }
+
         perf_pmu_enable(ctx->pmu);
         raw_spin_unlock(&ctx->lock);
 }
@@ -3385,6 +3411,12 @@ static int event_enable_on_exec(struct perf_event *event,
         return 1;
 }
 
+#define for_each_ctx_event_list(__ctx, __cpu, __list)                          \
+        for (__cpu = cpumask_first(cpu_possible_mask),                         \
+             __list = per_cpu_ptr(__ctx->event_list, __cpu);                   \
+             __list;                                                           \
+             __cpu = cpumask_next(__cpu, cpu_possible_mask),                   \
+             __list = (__cpu < nr_cpu_ids) ? per_cpu_ptr(__ctx->event_list, __cpu) : NULL)
 /*
  * Enable all of a task's events that have been marked enable-on-exec.
  * This expects task == current.
@@ -3394,8 +3426,10 @@ static void perf_event_enable_on_exec(int ctxn)
         struct perf_event_context *ctx, *clone_ctx = NULL;
         struct perf_cpu_context *cpuctx;
         struct perf_event *event;
+        struct list_head *list;
         unsigned long flags;
         int enabled = 0;
+        int cpu;
 
         local_irq_save(flags);
         ctx = current->perf_event_ctxp[ctxn];
@@ -3405,8 +3439,11 @@ static void perf_event_enable_on_exec(int ctxn)
         cpuctx = __get_cpu_context(ctx);
         perf_ctx_lock(cpuctx, ctx);
         ctx_sched_out(ctx, cpuctx, EVENT_TIME);
-        list_for_each_entry(event, &ctx->event_list, event_entry)
-                enabled |= event_enable_on_exec(event, ctx);
+
+        for_each_ctx_event_list(ctx, cpu, list) {
+                list_for_each_entry(event, list, event_entry)
+                        enabled |= event_enable_on_exec(event, ctx);
+        }
 
         /*
          * Unclone and reschedule this context if we enabled any event.
@@ -3623,15 +3660,26 @@ static int perf_event_read(struct perf_event *event, bool group)
 /*
  * Initialize the perf_event context in a task_struct:
  */
-static void __perf_event_init_context(struct perf_event_context *ctx)
+static int __perf_event_init_context(struct perf_event_context *ctx)
 {
+        struct list_head *list;
+        int cpu;
+
         raw_spin_lock_init(&ctx->lock);
         mutex_init(&ctx->mutex);
         INIT_LIST_HEAD(&ctx->active_ctx_list);
         INIT_LIST_HEAD(&ctx->pinned_groups);
         INIT_LIST_HEAD(&ctx->flexible_groups);
-        INIT_LIST_HEAD(&ctx->event_list);
+
+        ctx->event_list = alloc_percpu(struct list_head);
+        if (!ctx->event_list)
+                return -1;
+        for_each_ctx_event_list(ctx, cpu, list)
+                INIT_LIST_HEAD(list);
+
         atomic_set(&ctx->refcount, 1);
+
+        return 0;
 }
 
 static struct perf_event_context *
@@ -3643,7 +3691,11 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task)
         if (!ctx)
                 return NULL;
 
-        __perf_event_init_context(ctx);
+        if (__perf_event_init_context(ctx)) {
+                kfree(ctx);
+                return NULL;
+        }
+
         if (task) {
                 ctx->task = task;
                 get_task_struct(task);
@@ -3978,13 +4030,17 @@ static bool exclusive_event_installable(struct perf_event *event,
 {
         struct perf_event *iter_event;
         struct pmu *pmu = event->pmu;
+        struct list_head *list;
+        int cpu;
 
         if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
                 return true;
 
-        list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
-                if (exclusive_event_match(iter_event, event))
-                        return false;
+        for_each_ctx_event_list(ctx, cpu, list) {
+                list_for_each_entry(iter_event, list, event_entry) {
+                        if (exclusive_event_match(iter_event, event))
+                                return false;
+                }
         }
 
         return true;
@@ -6067,16 +6123,29 @@ perf_iterate_ctx(struct perf_event_context *ctx,
                    void *data, bool all)
 {
         struct perf_event *event;
+        struct list_head *t_list;
+        int cpu;
 
-        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
-                if (!all) {
+        if (all) {
+                for_each_ctx_event_list(ctx, cpu, t_list) {
+                        list_for_each_entry_rcu(event, t_list, event_entry)
+                                output(event, data);
+                }
+        } else {
+                cpu = smp_processor_id();
+again:
+                t_list = this_cpu_ptr(ctx->event_list);
+                list_for_each_entry_rcu(event, t_list, event_entry) {
                         if (event->state < PERF_EVENT_STATE_INACTIVE)
                                 continue;
                         if (!event_filter_match(event))
                                 continue;
+                        output(event, data);
+                }
+                if (cpu != 0) {
+                        cpu = 0;
+                        goto again;
                 }
-
-                output(event, data);
         }
 }
 
@@ -7605,6 +7674,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 {
         struct perf_sample_data data;
         struct perf_event *event;
+        struct list_head *list;
+        int cpu;
         struct perf_raw_record raw = {
                 .frag = {
@@ -7636,13 +7707,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
                 if (!ctx)
                         goto unlock;
 
-                list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
-                        if (event->attr.type != PERF_TYPE_TRACEPOINT)
-                                continue;
-                        if (event->attr.config != entry->type)
-                                continue;
-                        if (perf_tp_event_match(event, &data, regs))
-                                perf_swevent_event(event, count, &data, regs);
+                for_each_ctx_event_list(ctx, cpu, list) {
+                        list_for_each_entry_rcu(event, list, event_entry) {
+                                if (event->attr.type != PERF_TYPE_TRACEPOINT)
+                                        continue;
+                                if (event->attr.config != entry->type)
+                                        continue;
+                                if (perf_tp_event_match(event, &data, regs))
+                                        perf_swevent_event(event, count, &data, regs);
+                        }
                 }
 unlock:
                 rcu_read_unlock();
@@ -8590,6 +8663,7 @@ static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
 static void free_pmu_context(struct pmu *pmu)
 {
         struct pmu *i;
+        int cpu;
 
         mutex_lock(&pmus_lock);
         /*
@@ -8601,7 +8675,12 @@ static void free_pmu_context(struct pmu *pmu)
                         goto out;
                 }
         }
+        for_each_possible_cpu(cpu) {
+                struct perf_cpu_context *cpuctx;
 
+                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+                free_percpu(cpuctx->ctx.event_list);
+        }
         free_percpu(pmu->pmu_cpu_context);
 out:
         mutex_unlock(&pmus_lock);
@@ -8801,7 +8880,8 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
                 struct perf_cpu_context *cpuctx;
 
                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
-                __perf_event_init_context(&cpuctx->ctx);
+                if (__perf_event_init_context(&cpuctx->ctx))
+                        goto free_pmu_cpu_context;
                 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
                 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
                 cpuctx->ctx.pmu = pmu;
@@ -8845,6 +8925,9 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 
         return ret;
 
+free_pmu_cpu_context:
+        free_pmu_context(pmu);
+
 free_dev:
         device_del(pmu->dev);
         put_device(pmu->dev);
@@ -9969,6 +10052,8 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
         struct perf_event_context *src_ctx;
         struct perf_event_context *dst_ctx;
         struct perf_event *event, *tmp;
+        struct list_head *list;
+        int cpu;
         LIST_HEAD(events);
 
         src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
@@ -9979,12 +10064,14 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
          * of swizzling perf_event::ctx.
          */
         mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
-        list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
-                                 event_entry) {
-                perf_remove_from_context(event, 0);
-                unaccount_event_cpu(event, src_cpu);
-                put_ctx(src_ctx);
-                list_add(&event->migrate_entry, &events);
+
+        for_each_ctx_event_list(src_ctx, cpu, list) {
+                list_for_each_entry_safe(event, tmp, list, event_entry) {
+                        perf_remove_from_context(event, 0);
+                        unaccount_event_cpu(event, src_cpu);
+                        put_ctx(src_ctx);
+                        list_add(&event->migrate_entry, &events);
+                }
         }
 
         /*
@@ -10111,6 +10198,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 {
         struct perf_event_context *child_ctx, *clone_ctx = NULL;
         struct perf_event *child_event, *next;
+        struct list_head *list;
+        int cpu;
 
         WARN_ON_ONCE(child != current);
 
@@ -10160,8 +10249,10 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
          */
         perf_event_task(child, child_ctx, 0);
 
-        list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
-                perf_event_exit_event(child_event, child_ctx, child);
+        for_each_ctx_event_list(child_ctx, cpu, list) {
+                list_for_each_entry_safe(child_event, next, list, event_entry)
+                        perf_event_exit_event(child_event, child_ctx, child);
+        }
 
         mutex_unlock(&child_ctx->mutex);
 
@@ -10611,10 +10702,14 @@ static void __perf_event_exit_context(void *__info)
         struct perf_event_context *ctx = __info;
         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
         struct perf_event *event;
+        struct list_head *list;
+        int cpu;
 
         raw_spin_lock(&ctx->lock);
-        list_for_each_entry(event, &ctx->event_list, event_entry)
-                __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
+        for_each_ctx_event_list(ctx, cpu, list) {
+                list_for_each_entry(event, list, event_entry)
+                        __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
+        }
         raw_spin_unlock(&ctx->lock);
 }
-- 
2.5.5