From: Kan Liang <[email protected]>
For understanding how the workload maps to memory channels and hardware
behavior, it's very important to collect address maps with physical
addresses. For example, 3D XPoint access can only be found by filtering
the physical address.
However, perf doesn't collect physical address information in sampling.
The load latency/DLA information in PEBS can be used to calculate the
physical address.
For kernel direct mapping addresses, virt_to_phys is used to convert the
virtual addresses from DLA to physical address.
For user virtual addresses, __get_user_pages_fast is used to walk the
page tables and obtain the user physical address.
This does not work for vmalloc addresses. Right now these are not
resolved, but code to do that could be added.
For security, the physical address can only be exposed to root or
privileged user.
A new sample type PERF_SAMPLE_PHYS_ADDR is introduced to expose the
physical addresses.
Signed-off-by: Kan Liang <[email protected]>
---
This patch is the kernel patch.
The user space patch will be sent out later separately.
Changes since V4
- Correct PHYS_ADDR placement in perf_output_sample (Stephane)
Changes since V3
- Move the code dla->phys to separate function (Stephane)
- Correct PHYS_ADDR misplacement in header file (Stephane)
Changes since V2
- Only the kernel patch
- Add example in changelog
- Include a perf_paranoid_kernel() test (PeterZ)
- Fix minor compiler warning
arch/x86/events/intel/ds.c | 32 ++++++++++++++++++++++++++++++++
arch/x86/events/perf_event.h | 2 +-
include/linux/perf_event.h | 3 +++
include/uapi/linux/perf_event.h | 4 +++-
kernel/events/core.c | 12 ++++++++++++
5 files changed, 51 insertions(+), 2 deletions(-)
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index a322fed..f0e8d9c 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1065,6 +1065,35 @@ static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs)
return txn;
}
+static u64 dla_to_phys(u64 dla)
+{
+ u64 phys_addr = 0;
+ struct page *p = NULL;
+
+ if (dla >= TASK_SIZE) {
+ /* If it's vmalloc()d memory, leave phys_addr as 0 */
+ if (virt_addr_valid(dla) &&
+ !(dla >= VMALLOC_START && dla < VMALLOC_END))
+ phys_addr = (u64)virt_to_phys((void *)(uintptr_t)dla);
+ } else {
+ /*
+ * Walking the pages tables for user address.
+ * Interrupts are disabled, so it prevents any tear down
+ * of the page tables.
+ * Try IRQ-safe __get_user_pages_fast first.
+ * If failed, leave phys_addr as 0.
+ */
+ if ((current->mm != NULL) &&
+ (__get_user_pages_fast(dla, 1, 0, &p) == 1))
+ phys_addr = page_to_phys(p) + dla % PAGE_SIZE;
+
+ if (p)
+ put_page(p);
+ }
+
+ return phys_addr;
+}
+
static void setup_pebs_sample_data(struct perf_event *event,
struct pt_regs *iregs, void *__pebs,
struct perf_sample_data *data,
@@ -1179,6 +1208,9 @@ static void setup_pebs_sample_data(struct perf_event *event,
x86_pmu.intel_cap.pebs_format >= 1)
data->addr = pebs->dla;
+ if ((sample_type & PERF_SAMPLE_PHYS_ADDR) && (data->addr != 0))
+ data->phys_addr = dla_to_phys(data->addr);
+
if (x86_pmu.intel_cap.pebs_format >= 2) {
/* Only set the TSX weight when no memory weight. */
if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 476aec3..65bb91e 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -91,7 +91,7 @@ struct amd_nb {
(PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
- PERF_SAMPLE_TRANSACTION)
+ PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR)
/*
* A debug store configuration.
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index a3b873f..6783c69 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -944,6 +944,8 @@ struct perf_sample_data {
struct perf_regs regs_intr;
u64 stack_user_size;
+
+ u64 phys_addr;
} ____cacheline_aligned;
/* default value for data source */
@@ -964,6 +966,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data,
data->weight = 0;
data->data_src.val = PERF_MEM_NA;
data->txn = 0;
+ data->phys_addr = 0;
}
extern void perf_output_sample(struct perf_output_handle *handle,
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 642db5f..cbea02f 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -139,8 +139,9 @@ enum perf_event_sample_format {
PERF_SAMPLE_IDENTIFIER = 1U << 16,
PERF_SAMPLE_TRANSACTION = 1U << 17,
PERF_SAMPLE_REGS_INTR = 1U << 18,
+ PERF_SAMPLE_PHYS_ADDR = 1U << 19,
- PERF_SAMPLE_MAX = 1U << 19, /* non-ABI */
+ PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */
};
/*
@@ -814,6 +815,7 @@ enum perf_event_type {
* { u64 transaction; } && PERF_SAMPLE_TRANSACTION
* { u64 abi; # enum perf_sample_regs_abi
* u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
+ * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR
* };
*/
PERF_RECORD_SAMPLE = 9,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 426c2ff..d5da77f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1570,6 +1570,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
if (sample_type & PERF_SAMPLE_TRANSACTION)
size += sizeof(data->txn);
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ size += sizeof(data->phys_addr);
+
event->header_size = size;
}
@@ -5972,6 +5975,9 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ perf_output_put(handle, data->phys_addr);
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
@@ -9852,6 +9858,12 @@ SYSCALL_DEFINE5(perf_event_open,
return -EINVAL;
}
+ /* Only privileged users can get kernel addresses */
+ if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
+ perf_paranoid_kernel() &&
+ !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
if (!attr.sample_max_stack)
attr.sample_max_stack = sysctl_perf_event_max_stack;
--
2.4.3
On Thu, Aug 17, 2017 at 02:17:23PM -0400, [email protected] wrote:
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index a3b873f..6783c69 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -944,6 +944,8 @@ struct perf_sample_data {
>
> struct perf_regs regs_intr;
> u64 stack_user_size;
> +
> + u64 phys_addr;
> } ____cacheline_aligned;
>
> /* default value for data source */
> @@ -964,6 +966,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data,
> data->weight = 0;
> data->data_src.val = PERF_MEM_NA;
> data->txn = 0;
> + data->phys_addr = 0;
> }
So this is very unfortunate...
struct perf_sample_data {
u64 addr; /* 0 8 */
struct perf_raw_record * raw; /* 8 8 */
struct perf_branch_stack * br_stack; /* 16 8 */
u64 period; /* 24 8 */
u64 weight; /* 32 8 */
u64 txn; /* 40 8 */
union perf_mem_data_src data_src; /* 48 8 */
u64 type; /* 56 8 */
/* --- cacheline 1 boundary (64 bytes) --- */
u64 ip; /* 64 8 */
struct {
u32 pid; /* 72 4 */
u32 tid; /* 76 4 */
} tid_entry; /* 72 8 */
u64 time; /* 80 8 */
u64 id; /* 88 8 */
u64 stream_id; /* 96 8 */
struct {
u32 cpu; /* 104 4 */
u32 reserved; /* 108 4 */
} cpu_entry; /* 104 8 */
struct perf_callchain_entry * callchain; /* 112 8 */
struct perf_regs regs_user; /* 120 16 */
/* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */
struct pt_regs regs_user_copy; /* 136 168 */
/* --- cacheline 4 boundary (256 bytes) was 48 bytes ago --- */
struct perf_regs regs_intr; /* 304 16 */
/* --- cacheline 5 boundary (320 bytes) --- */
u64 stack_user_size; /* 320 8 */
/* size: 384, cachelines: 6, members: 19 */
/* padding: 56 */
};
static inline void perf_sample_data_init(struct perf_sample_data *data,
u64 addr, u64 period)
{
/* remaining struct members initialized in perf_prepare_sample() */
data->addr = addr;
data->raw = NULL;
data->br_stack = NULL;
data->period = period;
data->weight = 0;
data->data_src.val = PERF_MEM_NA;
data->txn = 0;
}
You'll note that that only touches the first cacheline of the data
structure, and you just wrecked that. Back when I did that this made a
measurable difference.
On Thu, Aug 17, 2017 at 02:17:23PM -0400, [email protected] wrote:
> diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
> index a322fed..f0e8d9c 100644
> --- a/arch/x86/events/intel/ds.c
> +++ b/arch/x86/events/intel/ds.c
> @@ -1065,6 +1065,35 @@ static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs)
> return txn;
> }
>
> +static u64 dla_to_phys(u64 dla)
> +{
> + u64 phys_addr = 0;
> + struct page *p = NULL;
> +
> + if (dla >= TASK_SIZE) {
> + /* If it's vmalloc()d memory, leave phys_addr as 0 */
> + if (virt_addr_valid(dla) &&
> + !(dla >= VMALLOC_START && dla < VMALLOC_END))
> + phys_addr = (u64)virt_to_phys((void *)(uintptr_t)dla);
> + } else {
> + /*
> + * Walking the pages tables for user address.
> + * Interrupts are disabled, so it prevents any tear down
> + * of the page tables.
> + * Try IRQ-safe __get_user_pages_fast first.
> + * If failed, leave phys_addr as 0.
> + */
> + if ((current->mm != NULL) &&
> + (__get_user_pages_fast(dla, 1, 0, &p) == 1))
> + phys_addr = page_to_phys(p) + dla % PAGE_SIZE;
> +
> + if (p)
> + put_page(p);
> + }
> +
> + return phys_addr;
> +}
Is this in any way x86 specific? AFAICT this should work in generic code
as long as data->addr is provided.
> static void setup_pebs_sample_data(struct perf_event *event,
> struct pt_regs *iregs, void *__pebs,
> struct perf_sample_data *data,
> @@ -1179,6 +1208,9 @@ static void setup_pebs_sample_data(struct perf_event *event,
> x86_pmu.intel_cap.pebs_format >= 1)
> data->addr = pebs->dla;
>
> + if ((sample_type & PERF_SAMPLE_PHYS_ADDR) && (data->addr != 0))
> + data->phys_addr = dla_to_phys(data->addr);
> +
> if (x86_pmu.intel_cap.pebs_format >= 2) {
> /* Only set the TSX weight when no memory weight. */
> if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
> On Thu, Aug 17, 2017 at 02:17:23PM -0400, [email protected] wrote:
> > diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
> > index a322fed..f0e8d9c 100644
> > --- a/arch/x86/events/intel/ds.c
> > +++ b/arch/x86/events/intel/ds.c
> > @@ -1065,6 +1065,35 @@ static inline u64 intel_hsw_transaction(struct
> pebs_record_skl *pebs)
> > return txn;
> > }
> >
> > +static u64 dla_to_phys(u64 dla)
> > +{
> > + u64 phys_addr = 0;
> > + struct page *p = NULL;
> > +
> > + if (dla >= TASK_SIZE) {
> > + /* If it's vmalloc()d memory, leave phys_addr as 0 */
> > + if (virt_addr_valid(dla) &&
> > + !(dla >= VMALLOC_START && dla < VMALLOC_END))
> > + phys_addr = (u64)virt_to_phys((void *)(uintptr_t)dla);
> > + } else {
> > + /*
> > + * Walking the pages tables for user address.
> > + * Interrupts are disabled, so it prevents any tear down
> > + * of the page tables.
> > + * Try IRQ-safe __get_user_pages_fast first.
> > + * If failed, leave phys_addr as 0.
> > + */
> > + if ((current->mm != NULL) &&
> > + (__get_user_pages_fast(dla, 1, 0, &p) == 1))
> > + phys_addr = page_to_phys(p) + dla % PAGE_SIZE;
> > +
> > + if (p)
> > + put_page(p);
> > + }
> > +
> > + return phys_addr;
> > +}
>
> Is this in any way x86 specific? AFAICT this should work in generic code as
> long as data->addr is provided.
No, it's not X86 specific. I think it can be used as generic code to convert
virtual address to physical address.
Thanks,
Kan
>
> > static void setup_pebs_sample_data(struct perf_event *event,
> > struct pt_regs *iregs, void *__pebs,
> > struct perf_sample_data *data, @@ -
> 1179,6 +1208,9 @@ static
> > void setup_pebs_sample_data(struct perf_event *event,
> > x86_pmu.intel_cap.pebs_format >= 1)
> > data->addr = pebs->dla;
> >
> > + if ((sample_type & PERF_SAMPLE_PHYS_ADDR) && (data->addr != 0))
> > + data->phys_addr = dla_to_phys(data->addr);
> > +
> > if (x86_pmu.intel_cap.pebs_format >= 2) {
> > /* Only set the TSX weight when no memory weight. */
> > if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
>
> Is this in any way x86 specific? AFAICT this should work in generic code
> as long as data->addr is provided.
It won't work on architectures that split the address space for user/kernel
(if TASK_SIZE is meaningless). But perhaps some generic config for that would
work.
-Andi
>
> On Thu, Aug 17, 2017 at 02:17:23PM -0400, [email protected] wrote:
> > diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> > index a3b873f..6783c69 100644
> > --- a/include/linux/perf_event.h
> > +++ b/include/linux/perf_event.h
> > @@ -944,6 +944,8 @@ struct perf_sample_data {
> >
> > struct perf_regs regs_intr;
> > u64 stack_user_size;
> > +
> > + u64 phys_addr;
> > } ____cacheline_aligned;
> >
> > /* default value for data source */
> > @@ -964,6 +966,7 @@ static inline void perf_sample_data_init(struct
> perf_sample_data *data,
> > data->weight = 0;
> > data->data_src.val = PERF_MEM_NA;
> > data->txn = 0;
> > + data->phys_addr = 0;
> > }
>
> So this is very unfortunate...
>
> struct perf_sample_data {
> u64 addr; /* 0 8 */
> struct perf_raw_record * raw; /* 8 8 */
> struct perf_branch_stack * br_stack; /* 16 8 */
> u64 period; /* 24 8 */
> u64 weight; /* 32 8 */
> u64 txn; /* 40 8 */
> union perf_mem_data_src data_src; /* 48 8 */
> u64 type; /* 56 8 */
> /* --- cacheline 1 boundary (64 bytes) --- */
> u64 ip; /* 64 8 */
> struct {
> u32 pid; /* 72 4 */
> u32 tid; /* 76 4 */
> } tid_entry; /* 72 8 */
> u64 time; /* 80 8 */
> u64 id; /* 88 8 */
> u64 stream_id; /* 96 8 */
> struct {
> u32 cpu; /* 104 4 */
> u32 reserved; /* 108 4 */
> } cpu_entry; /* 104 8 */
> struct perf_callchain_entry * callchain; /* 112 8 */
> struct perf_regs regs_user; /* 120 16 */
> /* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */
> struct pt_regs regs_user_copy; /* 136 168 */
> /* --- cacheline 4 boundary (256 bytes) was 48 bytes ago --- */
> struct perf_regs regs_intr; /* 304 16 */
> /* --- cacheline 5 boundary (320 bytes) --- */
> u64 stack_user_size; /* 320 8 */
>
> /* size: 384, cachelines: 6, members: 19 */
> /* padding: 56 */
> };
>
>
> static inline void perf_sample_data_init(struct perf_sample_data *data,
> u64 addr, u64 period)
> {
> /* remaining struct members initialized in perf_prepare_sample() */
> data->addr = addr;
> data->raw = NULL;
> data->br_stack = NULL;
> data->period = period;
> data->weight = 0;
> data->data_src.val = PERF_MEM_NA;
> data->txn = 0;
> }
>
> You'll note that that only touches the first cacheline of the data structure,
> and you just wrecked that. Back when I did that this made a measurable
> difference.
It looks there is still one room in cacheline 1.
Could I use it?
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b14095b..bcd1007 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -915,6 +915,7 @@ struct perf_sample_data {
u64 weight;
u64 txn;
union perf_mem_data_src data_src;
+ u64 phys_addr;
/*
* The other fields, optionally {set,used} by
@@ -964,6 +966,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data,
data->weight = 0;
data->data_src.val = PERF_MEM_NA;
data->txn = 0;
+ data->phys_addr = 0;
}
Thanks,
Kan
On Tue, Aug 22, 2017 at 05:58:34PM +0000, Liang, Kan wrote:
> It looks there is still one room in cacheline 1.
> > So this is very unfortunate...
> >
> > struct perf_sample_data {
> > u64 addr; /* 0 8 */
> > struct perf_raw_record * raw; /* 8 8 */
> > struct perf_branch_stack * br_stack; /* 16 8 */
> > u64 period; /* 24 8 */
> > u64 weight; /* 32 8 */
> > u64 txn; /* 40 8 */
> > union perf_mem_data_src data_src; /* 48 8 */
> > u64 type; /* 56 8 */
You mean @type, right? That is unconditionally used by the output code.
> > /* --- cacheline 1 boundary (64 bytes) --- */
> > u64 ip; /* 64 8 */
> > struct {
> > u32 pid; /* 72 4 */
> > u32 tid; /* 76 4 */
> > } tid_entry; /* 72 8 */
> > u64 time; /* 80 8 */
> > u64 id; /* 88 8 */
> > u64 stream_id; /* 96 8 */
> > struct {
> > u32 cpu; /* 104 4 */
> > u32 reserved; /* 108 4 */
> > } cpu_entry; /* 104 8 */
> > struct perf_callchain_entry * callchain; /* 112 8 */
> > struct perf_regs regs_user; /* 120 16 */
> > /* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */
> > struct pt_regs regs_user_copy; /* 136 168 */
> > /* --- cacheline 4 boundary (256 bytes) was 48 bytes ago --- */
> > struct perf_regs regs_intr; /* 304 16 */
> > /* --- cacheline 5 boundary (320 bytes) --- */
> > u64 stack_user_size; /* 320 8 */
> >
> > /* size: 384, cachelines: 6, members: 19 */
> > /* padding: 56 */
> > };
Now, I was hoping, that if you move the entire thing into generic code
(PPC also supports PERF_SAMPLE_DATA) then we can avoid the init here and
rely on perf_prepare_sample().