Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S932104AbdHWOXi (ORCPT ); Wed, 23 Aug 2017 10:23:38 -0400 Received: from mga14.intel.com ([192.55.52.115]:8336 "EHLO mga14.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1754098AbdHWOXh (ORCPT ); Wed, 23 Aug 2017 10:23:37 -0400 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.41,417,1498546800"; d="scan'208";a="1006851043" From: kan.liang@intel.com To: peterz@infradead.org, mingo@redhat.com, linux-kernel@vger.kernel.org Cc: acme@kernel.org, jolsa@redhat.com, tglx@linutronix.de, eranian@google.com, ak@linux.intel.com, Kan Liang Subject: [PATCH V6] perf: Add PERF_SAMPLE_PHYS_ADDR Date: Wed, 23 Aug 2017 10:22:46 -0400 Message-Id: <1503498166-3887-1-git-send-email-kan.liang@intel.com> X-Mailer: git-send-email 2.4.3 Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 5412 Lines: 174 From: Kan Liang For understanding how the workload maps to memory channels and hardware behavior, it's very important to collect address maps with physical addresses. For example, 3D XPoint access can only be found by filtering the physical address. However, perf doesn't collect physical address information in sampling. The virtual address which is stored in data->addr can be used to calculate the physical address. For kernel direct mapping addresses, virt_to_phys is used to convert the virtual addresses to physical addresses. For user virtual addresses, __get_user_pages_fast is used to walk the page tables to find the user physical address. This does not work for vmalloc addresses. Right now these are not resolved, but code to do that could be added. For security, the physical address can only be exposed to root or privileged users. A new sample type PERF_SAMPLE_PHYS_ADDR is introduced to expose the physical addresses. Signed-off-by: Kan Liang --- This patch is the kernel patch. The user space patch can be found here. 
https://www.spinics.net/lists/kernel/msg2587093.html Changes since V5 - Move virt_to_phys to generic code (PeterZ) arch/x86/events/perf_event.h | 2 +- include/linux/perf_event.h | 2 ++ include/uapi/linux/perf_event.h | 4 +++- kernel/events/core.c | 47 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 53 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 476aec3..65bb91e 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -91,7 +91,7 @@ struct amd_nb { (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \ PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \ PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ - PERF_SAMPLE_TRANSACTION) + PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR) /* * A debug store configuration. diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b14095b..74fb87e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -944,6 +944,8 @@ struct perf_sample_data { struct perf_regs regs_intr; u64 stack_user_size; + + u64 phys_addr; } ____cacheline_aligned; /* default value for data source */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 642db5f..cbea02f 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -139,8 +139,9 @@ enum perf_event_sample_format { PERF_SAMPLE_IDENTIFIER = 1U << 16, PERF_SAMPLE_TRANSACTION = 1U << 17, PERF_SAMPLE_REGS_INTR = 1U << 18, + PERF_SAMPLE_PHYS_ADDR = 1U << 19, - PERF_SAMPLE_MAX = 1U << 19, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */ }; /* @@ -814,6 +815,7 @@ enum perf_event_type { * { u64 transaction; } && PERF_SAMPLE_TRANSACTION * { u64 abi; # enum perf_sample_regs_abi * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR + * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR * }; */ PERF_RECORD_SAMPLE = 9, diff --git a/kernel/events/core.c b/kernel/events/core.c index d704e23..b991af3 100644 --- 
a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1570,6 +1570,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type & PERF_SAMPLE_TRANSACTION) size += sizeof(data->txn); + if (sample_type & PERF_SAMPLE_PHYS_ADDR) + size += sizeof(data->phys_addr); + event->header_size = size; } @@ -6012,6 +6015,9 @@ void perf_output_sample(struct perf_output_handle *handle, } } + if (sample_type & PERF_SAMPLE_PHYS_ADDR) + perf_output_put(handle, data->phys_addr); + if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; @@ -6027,6 +6033,38 @@ void perf_output_sample(struct perf_output_handle *handle, } } +static u64 perf_virt_to_phys(u64 virt) +{ + u64 phys_addr = 0; + struct page *p = NULL; + + if (!virt) + return 0; + + if (virt >= TASK_SIZE) { + /* If it's vmalloc()d memory, leave phys_addr as 0 */ + if (virt_addr_valid(virt) && + !(virt >= VMALLOC_START && virt < VMALLOC_END)) + phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt); + } else { + /* + * Walking the pages tables for user address. + * Interrupts are disabled, so it prevents any tear down + * of the page tables. + * Try IRQ-safe __get_user_pages_fast first. + * If failed, leave phys_addr as 0. 
+ */ + if ((current->mm != NULL) && + (__get_user_pages_fast(virt, 1, 0, &p) == 1)) + phys_addr = page_to_phys(p) + virt % PAGE_SIZE; + + if (p) + put_page(p); + } + + return phys_addr; +} + void perf_prepare_sample(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event, @@ -6145,6 +6183,9 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } + + if (sample_type & PERF_SAMPLE_PHYS_ADDR) + data->phys_addr = perf_virt_to_phys(data->addr); } static void __always_inline @@ -9892,6 +9933,12 @@ SYSCALL_DEFINE5(perf_event_open, return -EINVAL; } + /* Only privileged users can get physical addresses */ + if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) && + perf_paranoid_kernel() && + !capable(CAP_SYS_ADMIN)) + return -EACCES; + if (!attr.sample_max_stack) attr.sample_max_stack = sysctl_perf_event_max_stack; -- 2.4.3