2018-08-10 14:04:24

by Liang, Kan

[permalink] [raw]
Subject: [PATCH RFC 1/7] perf/core, x86: Add PERF_SAMPLE_PAGE_SIZE

From: Kan Liang <[email protected]>

Current perf can report both virtual address and physical address, but
it doesn't report page size. Users have no idea how large the utilized
page is. They cannot promote/demote large pages to optimize memory use.

Add a new sample type for page size.

Current perf already has a facility to collect data virtual address.
A function, to retrieve page size by full page-table walk of a given
virtual address, is introduced for x86. Other architectures can
implement their own functions later separately.
The function must be IRQ-safe. For x86, disabling IRQs over the walk is
sufficient to prevent any tear down of the page tables.

The new sample type requires collecting the virtual address. The
virtual address will not be output unless SAMPLE_ADDR is applied.

Although only a few bits are needed to indicate the page size, a u64
type is still claimed for page_size. Because struct perf_sample_data
requires cacheline_aligned.

Signed-off-by: Kan Liang <[email protected]>
---
arch/x86/events/core.c | 25 +++++++++++++++++++++++++
arch/x86/events/intel/ds.c | 2 +-
arch/x86/events/perf_event.h | 2 +-
include/linux/perf_event.h | 1 +
include/uapi/linux/perf_event.h | 13 ++++++++++++-
kernel/events/core.c | 15 +++++++++++++++
6 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 5f4829f..719e527 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2573,3 +2573,28 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
cap->events_mask_len = x86_pmu.events_mask_len;
}
EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
+
+u64 perf_get_page_size(u64 virt)
+{
+ unsigned long flags;
+ unsigned int level;
+ pte_t *pte;
+
+ if (!virt)
+ return 0;
+
+ /*
+ * Interrupts are disabled, so it prevents any tear down
+ * of the page tables.
+ * See the comment near struct mmu_table_batch.
+ */
+ local_irq_save(flags);
+ if (virt >= TASK_SIZE)
+ pte = lookup_address(virt, &level);
+ else
+ pte = lookup_address_in_pgd(pgd_offset(current->mm, virt),
+ virt, &level);
+ local_irq_restore(flags);
+
+ return (u64)level;
+}
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index b7b01d7..a3e56c7 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1274,7 +1274,7 @@ static void setup_pebs_sample_data(struct perf_event *event,
}


- if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) &&
+ if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_PAGE_SIZE)) &&
x86_pmu.intel_cap.pebs_format >= 1)
data->addr = pebs->dla;

diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 1562863..affcd26 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -94,7 +94,7 @@ struct amd_nb {
PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER | \
- PERF_SAMPLE_PERIOD)
+ PERF_SAMPLE_PERIOD | PERF_SAMPLE_PAGE_SIZE)

#define PEBS_REGS \
(PERF_REG_X86_AX | \
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 53c500f..9d13745 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -937,6 +937,7 @@ struct perf_sample_data {
u64 stack_user_size;

u64 phys_addr;
+ u64 page_size;
} ____cacheline_aligned;

/* default value for data source */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index eeb787b..5473443 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -141,8 +141,9 @@ enum perf_event_sample_format {
PERF_SAMPLE_TRANSACTION = 1U << 17,
PERF_SAMPLE_REGS_INTR = 1U << 18,
PERF_SAMPLE_PHYS_ADDR = 1U << 19,
+ PERF_SAMPLE_PAGE_SIZE = 1U << 20,

- PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */
+ PERF_SAMPLE_MAX = 1U << 21, /* non-ABI */

__PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63,
};
@@ -861,6 +862,7 @@ enum perf_event_type {
* { u64 abi; # enum perf_sample_regs_abi
* u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
* { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR
+ * { u64 page_size;} && PERF_SAMPLE_PAGE_SIZE
* };
*/
PERF_RECORD_SAMPLE = 9,
@@ -1099,6 +1101,15 @@ union perf_mem_data_src {
#define PERF_MEM_S(a, s) \
(((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)

+
+enum perf_mem_page_size {
+ PERF_MEM_PAGE_SIZE_NONE,
+ PERF_MEM_PAGE_SIZE_4K,
+ PERF_MEM_PAGE_SIZE_2M,
+ PERF_MEM_PAGE_SIZE_1G,
+ PERF_MEM_PAGE_SIZE_512G,
+};
+
/*
* single taken branch record layout:
*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f6ea33a..e848e9b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1751,6 +1751,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
size += sizeof(data->phys_addr);

+ if (sample_type & PERF_SAMPLE_PAGE_SIZE)
+ size += sizeof(data->page_size);
+
event->header_size = size;
}

@@ -6294,6 +6297,9 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
perf_output_put(handle, data->phys_addr);

+ if (sample_type & PERF_SAMPLE_PAGE_SIZE)
+ perf_output_put(handle, data->page_size);
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;

@@ -6341,6 +6347,12 @@ static u64 perf_virt_to_phys(u64 virt)
return phys_addr;
}

+/* Return page size of given virtual address. IRQ-safe required. */
+u64 __weak perf_get_page_size(u64 virt)
+{
+ return PERF_MEM_PAGE_SIZE_NONE;
+}
+
static struct perf_callchain_entry __empty_callchain = { .nr = 0, };

struct perf_callchain_entry *
@@ -6482,6 +6494,9 @@ void perf_prepare_sample(struct perf_event_header *header,

if (sample_type & PERF_SAMPLE_PHYS_ADDR)
data->phys_addr = perf_virt_to_phys(data->addr);
+
+ if (sample_type & PERF_SAMPLE_PAGE_SIZE)
+ data->page_size = perf_get_page_size(data->addr);
}

static __always_inline void
--
2.7.4



2018-08-10 13:39:24

by Liang, Kan

[permalink] [raw]
Subject: [PATCH RFC 4/7] perf sort: Add sort option for page size

From: Kan Liang <[email protected]>

Add a new sort option "page_size" for --mem-mode sort. With this
option applied, perf can sort and report by sample's page size.

Here is an example.
perf report --stdio --mem-mode --sort=comm,symbol,phys_daddr,page_size

# To display the perf.data header info, please use
# --header/--header-only options.
#
#
# Total Lost Samples: 0
#
# Samples: 9K of event 'mem-loads:uP'
# Total weight : 9028
# Sort order : comm,symbol,phys_daddr,page_size
#
# Overhead Command Symbol Data Physical Address
# Page Size
# ........ ....... ............................
# ...................... ......................
#
11.19% dtlb [.] touch_buffer [.]
0x00000003fec82ea8 4K
8.61% dtlb [.] GetTickCount [.]
0x00000003c4f2c8a8 4K
4.52% dtlb [.] GetTickCount [.]
0x00000003fec82f58 4K
4.33% dtlb [.] __gettimeofday [.]
0x00000003fec82f48 4K
4.32% dtlb [.] GetTickCount [.]
0x00000003fec82f78 4K
4.28% dtlb [.] GetTickCount [.]
0x00000003fec82f50 4K
4.23% dtlb [.] GetTickCount [.]
0x00000003fec82f70 4K
4.11% dtlb [.] GetTickCount [.]
0x00000003fec82f68 4K
4.00% dtlb [.] Calibrate [.]
0x00000003fec82f98 4K
3.91% dtlb [.] Calibrate [.]
0x00000003fec82f90 4K
3.43% dtlb [.] touch_buffer [.]
0x00000003fec82e98 4K
3.42% dtlb [.] touch_buffer [.]
0x00000003fec82e90 4K
0.09% dtlb [.] DoDependentLoads [.]
0x000000036ea084c0 2M
0.08% dtlb [.] DoDependentLoads [.]
0x000000032b010b80 2M

Signed-off-by: Kan Liang <[email protected]>
---
tools/perf/Documentation/perf-report.txt | 1 +
tools/perf/util/hist.c | 3 +++
tools/perf/util/hist.h | 1 +
tools/perf/util/machine.c | 7 +++++--
tools/perf/util/sort.c | 28 ++++++++++++++++++++++++++++
tools/perf/util/sort.h | 1 +
tools/perf/util/symbol.h | 1 +
7 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 917e36f..53fadad 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -139,6 +139,7 @@ OPTIONS
- snoop: type of snoop (if any) for the data at the time of the sample
- dcacheline: the cacheline the data address is on at the time of the sample
- phys_daddr: physical address of data being executed on at the time of sample
+ - page_size: the page size of data being executed on at the time of sample

And the default sort keys are changed to local_weight, mem, sym, dso,
symbol_daddr, dso_daddr, snoop, tlb, locked, see '--mem-mode'.
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 828cb97..96d2b40 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -174,6 +174,9 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
hists__new_col_len(hists, HISTC_MEM_PHYS_DADDR,
unresolved_col_width + 4 + 2);

+ hists__new_col_len(hists, HISTC_MEM_PAGE_SIZE,
+ unresolved_col_width + 4 + 2);
+
} else {
symlen = unresolved_col_width + 4 + 2;
hists__new_col_len(hists, HISTC_MEM_DADDR_SYMBOL, symlen);
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 3badd7f..1c42c8e 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -49,6 +49,7 @@ enum hist_column {
HISTC_MEM_DADDR_SYMBOL,
HISTC_MEM_DADDR_DSO,
HISTC_MEM_PHYS_DADDR,
+ HISTC_MEM_PAGE_SIZE,
HISTC_MEM_LOCKED,
HISTC_MEM_TLB,
HISTC_MEM_LVL,
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index b300a39..e79bbc8 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -1829,11 +1829,12 @@ static void ip__resolve_ams(struct thread *thread,
ams->sym = al.sym;
ams->map = al.map;
ams->phys_addr = 0;
+ ams->page_size = PERF_MEM_PAGE_SIZE_NONE;
}

static void ip__resolve_data(struct thread *thread,
u8 m, struct addr_map_symbol *ams,
- u64 addr, u64 phys_addr)
+ u64 addr, u64 phys_addr, u64 daddr_page_size)
{
struct addr_location al;

@@ -1846,6 +1847,7 @@ static void ip__resolve_data(struct thread *thread,
ams->sym = al.sym;
ams->map = al.map;
ams->phys_addr = phys_addr;
+ ams->page_size = daddr_page_size;
}

struct mem_info *sample__resolve_mem(struct perf_sample *sample,
@@ -1858,7 +1860,8 @@ struct mem_info *sample__resolve_mem(struct perf_sample *sample,

ip__resolve_ams(al->thread, &mi->iaddr, sample->ip);
ip__resolve_data(al->thread, al->cpumode, &mi->daddr,
- sample->addr, sample->phys_addr);
+ sample->addr, sample->phys_addr,
+ sample->page_size);
mi->data_src.val = sample->data_src;

return mi;
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index fed2952..68b0f9e 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -1334,6 +1334,33 @@ struct sort_entry sort_mem_phys_daddr = {
};

static int64_t
+sort__page_size_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+ uint64_t l = 0, r = 0;
+
+ if (left->mem_info)
+ l = left->mem_info->daddr.page_size;
+ if (right->mem_info)
+ r = right->mem_info->daddr.page_size;
+
+ return (int64_t)(r - l);
+}
+
+static int hist_entry__page_size_snprintf(struct hist_entry *he, char *bf,
+ size_t size, unsigned int width)
+{
+ return repsep_snprintf(bf, size, "%-*s", width,
+ get_page_size_name(he->mem_info->daddr.page_size));
+}
+
+struct sort_entry sort_mem_page_size = {
+ .se_header = "Page Size",
+ .se_cmp = sort__page_size_cmp,
+ .se_snprintf = hist_entry__page_size_snprintf,
+ .se_width_idx = HISTC_MEM_PAGE_SIZE,
+};
+
+static int64_t
sort__abort_cmp(struct hist_entry *left, struct hist_entry *right)
{
if (!left->branch_info || !right->branch_info)
@@ -1607,6 +1634,7 @@ static struct sort_dimension memory_sort_dimensions[] = {
DIM(SORT_MEM_SNOOP, "snoop", sort_mem_snoop),
DIM(SORT_MEM_DCACHELINE, "dcacheline", sort_mem_dcacheline),
DIM(SORT_MEM_PHYS_DADDR, "phys_daddr", sort_mem_phys_daddr),
+ DIM(SORT_MEM_PAGE_SIZE, "page_size", sort_mem_page_size),
};

#undef DIM
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 8bf302c..c6b2f30 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -254,6 +254,7 @@ enum sort_type {
SORT_MEM_DCACHELINE,
SORT_MEM_IADDR_SYMBOL,
SORT_MEM_PHYS_DADDR,
+ SORT_MEM_PAGE_SIZE,
};

/*
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index f25fae4..567a1e1 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -187,6 +187,7 @@ struct addr_map_symbol {
u64 addr;
u64 al_addr;
u64 phys_addr;
+ u64 page_size;
};

struct branch_info {
--
2.7.4


2018-08-10 13:39:26

by Liang, Kan

[permalink] [raw]
Subject: [PATCH RFC 5/7] perf mem: Clean up output format and sort order string

From: Kan Liang <[email protected]>

Now, "--phys-data" is the only option which impacts the output format and
sort order. A simple "if else" is enough to handle the option.
But there will be more options added, e.g. "--page-size", which also
impact the output format and sort order. The code will become too
complex to be maintained.

Divide the big printf into several small pieces. Output the specific
piece only if the related option is applied.

Divide the big sort order string into several small pieces as well.
Appends specific sort string only if the related option is applied.

No functional change.

Signed-off-by: Kan Liang <[email protected]>
---
tools/perf/builtin-mem.c | 132 +++++++++++++++++++++++------------------------
tools/perf/util/sort.h | 2 +
2 files changed, 66 insertions(+), 68 deletions(-)

diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
index 57393e9..6048fca 100644
--- a/tools/perf/builtin-mem.c
+++ b/tools/perf/builtin-mem.c
@@ -14,6 +14,7 @@
#include "util/mem-events.h"
#include "util/debug.h"
#include "util/symbol.h"
+#include "util/sort.h"

#define MEM_OPERATION_LOAD 0x1
#define MEM_OPERATION_STORE 0x2
@@ -153,7 +154,7 @@ dump_raw_samples(struct perf_tool *tool,
{
struct perf_mem *mem = container_of(tool, struct perf_mem, tool);
struct addr_location al;
- const char *fmt;
+ const char *fmt, *field_sep;

if (machine__resolve(machine, &al, sample) < 0) {
fprintf(stderr, "problem processing %d event, skipping it.\n",
@@ -167,60 +168,45 @@ dump_raw_samples(struct perf_tool *tool,
if (al.map != NULL)
al.map->dso->hit = 1;

- if (mem->phys_addr) {
- if (symbol_conf.field_sep) {
- fmt = "%d%s%d%s0x%"PRIx64"%s0x%"PRIx64"%s0x%016"PRIx64
- "%s%"PRIu64"%s0x%"PRIx64"%s%s:%s\n";
- } else {
- fmt = "%5d%s%5d%s0x%016"PRIx64"%s0x016%"PRIx64
- "%s0x%016"PRIx64"%s%5"PRIu64"%s0x%06"PRIx64
- "%s%s:%s\n";
- symbol_conf.field_sep = " ";
- }
-
- printf(fmt,
- sample->pid,
- symbol_conf.field_sep,
- sample->tid,
- symbol_conf.field_sep,
- sample->ip,
- symbol_conf.field_sep,
- sample->addr,
- symbol_conf.field_sep,
- sample->phys_addr,
- symbol_conf.field_sep,
- sample->weight,
- symbol_conf.field_sep,
- sample->data_src,
- symbol_conf.field_sep,
- al.map ? (al.map->dso ? al.map->dso->long_name : "???") : "???",
- al.sym ? al.sym->name : "???");
+ field_sep = symbol_conf.field_sep;
+ if (field_sep) {
+ fmt = "%d%s%d%s0x%"PRIx64"%s0x%"PRIx64"%s";
} else {
- if (symbol_conf.field_sep) {
- fmt = "%d%s%d%s0x%"PRIx64"%s0x%"PRIx64"%s%"PRIu64
- "%s0x%"PRIx64"%s%s:%s\n";
- } else {
- fmt = "%5d%s%5d%s0x%016"PRIx64"%s0x016%"PRIx64
- "%s%5"PRIu64"%s0x%06"PRIx64"%s%s:%s\n";
- symbol_conf.field_sep = " ";
- }
+ fmt = "%5d%s%5d%s0x%016"PRIx64"%s0x016%"PRIx64"%s";
+ symbol_conf.field_sep = " ";
+ }
+ printf(fmt,
+ sample->pid,
+ symbol_conf.field_sep,
+ sample->tid,
+ symbol_conf.field_sep,
+ sample->ip,
+ symbol_conf.field_sep,
+ sample->addr,
+ symbol_conf.field_sep);

+ if (mem->phys_addr) {
+ if (field_sep)
+ fmt = "0x%"PRIx64"%s";
+ else
+ fmt = "0x%016"PRIx64"%s";
printf(fmt,
- sample->pid,
- symbol_conf.field_sep,
- sample->tid,
- symbol_conf.field_sep,
- sample->ip,
- symbol_conf.field_sep,
- sample->addr,
- symbol_conf.field_sep,
- sample->weight,
- symbol_conf.field_sep,
- sample->data_src,
- symbol_conf.field_sep,
- al.map ? (al.map->dso ? al.map->dso->long_name : "???") : "???",
- al.sym ? al.sym->name : "???");
+ sample->phys_addr,
+ symbol_conf.field_sep);
}
+
+ if (field_sep)
+ fmt = "%"PRIu64"%s0x%"PRIx64"%s%s:%s\n";
+ else
+ fmt = "%5"PRIu64"%s0x%06"PRIx64"%s%s:%s\n";
+
+ printf(fmt,
+ sample->weight,
+ symbol_conf.field_sep,
+ sample->data_src,
+ symbol_conf.field_sep,
+ al.map ? (al.map->dso ? al.map->dso->long_name : "???") : "???",
+ al.sym ? al.sym->name : "???");
out_put:
addr_location__put(&al);
return 0;
@@ -262,10 +248,12 @@ static int report_raw_events(struct perf_mem *mem)
if (ret < 0)
goto out_delete;

+ printf("# PID, TID, IP, ADDR, ");
+
if (mem->phys_addr)
- printf("# PID, TID, IP, ADDR, PHYS ADDR, LOCAL WEIGHT, DSRC, SYMBOL\n");
- else
- printf("# PID, TID, IP, ADDR, LOCAL WEIGHT, DSRC, SYMBOL\n");
+ printf("PHYS ADDR, ");
+
+ printf("LOCAL WEIGHT, DSRC, SYMBOL\n");

ret = perf_session__process_events(session);

@@ -273,11 +261,30 @@ static int report_raw_events(struct perf_mem *mem)
perf_session__delete(session);
return ret;
}
+static char *get_sort_order(struct perf_mem *mem)
+{
+ char sort[MAX_SORT_ORDER_STR];
+
+ /*
+ * there is no weight (cost) associated with stores, so don't print
+ * the column
+ */
+ if (mem->operation & MEM_OPERATION_STORE)
+ strcpy(sort, "--sort=mem,sym,dso,symbol_daddr,dso_daddr,tlb,locked");
+ else
+ strcpy(sort, default_mem_sort_order);
+
+ if (mem->phys_addr)
+ strcat(sort, ",phys_daddr");
+
+ return strdup(sort);
+}

static int report_events(int argc, const char **argv, struct perf_mem *mem)
{
const char **rep_argv;
int ret, i = 0, j, rep_argc;
+ char *new_sort_order;

if (mem->dump_raw)
return report_raw_events(mem);
@@ -291,20 +298,9 @@ static int report_events(int argc, const char **argv, struct perf_mem *mem)
rep_argv[i++] = "--mem-mode";
rep_argv[i++] = "-n"; /* display number of samples */

- /*
- * there is no weight (cost) associated with stores, so don't print
- * the column
- */
- if (!(mem->operation & MEM_OPERATION_LOAD)) {
- if (mem->phys_addr)
- rep_argv[i++] = "--sort=mem,sym,dso,symbol_daddr,"
- "dso_daddr,tlb,locked,phys_daddr";
- else
- rep_argv[i++] = "--sort=mem,sym,dso,symbol_daddr,"
- "dso_daddr,tlb,locked";
- } else if (mem->phys_addr)
- rep_argv[i++] = "--sort=local_weight,mem,sym,dso,symbol_daddr,"
- "dso_daddr,snoop,tlb,locked,phys_daddr";
+ new_sort_order = get_sort_order(mem);
+ if (new_sort_order)
+ rep_argv[i++] = new_sort_order;

for (j = 1; j < argc; j++, i++)
rep_argv[i] = argv[j];
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index c6b2f30..4632e25 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -46,6 +46,8 @@ extern struct sort_entry sort_srcline;
extern enum sort_type sort__first_dimension;
extern const char default_mem_sort_order[];

+#define MAX_SORT_ORDER_STR 128
+
struct he_stat {
u64 period;
u64 period_sys;
--
2.7.4


2018-08-10 13:40:04

by Liang, Kan

[permalink] [raw]
Subject: [PATCH RFC 2/7] perf tools: Support new sample type for page size

From: Kan Liang <[email protected]>

Support new sample type PERF_SAMPLE_PAGE_SIZE for page size.

Add new option --page-size to record sample page size.

Signed-off-by: Kan Liang <[email protected]>
---
tools/include/uapi/linux/perf_event.h | 13 ++++++++++++-
tools/perf/Documentation/perf-record.txt | 3 +++
tools/perf/builtin-record.c | 2 ++
tools/perf/perf.h | 1 +
tools/perf/util/event.h | 1 +
tools/perf/util/evsel.c | 19 ++++++++++++++++++-
6 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index eeb787b..5473443 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -141,8 +141,9 @@ enum perf_event_sample_format {
PERF_SAMPLE_TRANSACTION = 1U << 17,
PERF_SAMPLE_REGS_INTR = 1U << 18,
PERF_SAMPLE_PHYS_ADDR = 1U << 19,
+ PERF_SAMPLE_PAGE_SIZE = 1U << 20,

- PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */
+ PERF_SAMPLE_MAX = 1U << 21, /* non-ABI */

__PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63,
};
@@ -861,6 +862,7 @@ enum perf_event_type {
* { u64 abi; # enum perf_sample_regs_abi
* u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
* { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR
+ * { u64 page_size;} && PERF_SAMPLE_PAGE_SIZE
* };
*/
PERF_RECORD_SAMPLE = 9,
@@ -1099,6 +1101,15 @@ union perf_mem_data_src {
#define PERF_MEM_S(a, s) \
(((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)

+
+enum perf_mem_page_size {
+ PERF_MEM_PAGE_SIZE_NONE,
+ PERF_MEM_PAGE_SIZE_4K,
+ PERF_MEM_PAGE_SIZE_2M,
+ PERF_MEM_PAGE_SIZE_1G,
+ PERF_MEM_PAGE_SIZE_512G,
+};
+
/*
* single taken branch record layout:
*
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 246dee0..ddfe4be 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -264,6 +264,9 @@ OPTIONS
--phys-data::
Record the sample physical addresses.

+--page-size::
+ Record the sample page size
+
-T::
--timestamp::
Record the sample timestamps. Use it with 'perf report -D' to see the
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 22ebeb92..7f27dbe 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -1600,6 +1600,8 @@ static struct option __record_options[] = {
OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
"Record the sample physical addresses"),
+ OPT_BOOLEAN(0, "page-size", &record.opts.sample_page_size,
+ "Record the sample page size"),
OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
&record.opts.sample_time_set,
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 21bf7f5..db06458 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -48,6 +48,7 @@ struct record_opts {
bool raw_samples;
bool sample_address;
bool sample_phys_addr;
+ bool sample_page_size;
bool sample_weight;
bool sample_time;
bool sample_time_set;
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index bfa60bc..51456b8 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -202,6 +202,7 @@ struct perf_sample {
u32 raw_size;
u64 data_src;
u64 phys_addr;
+ u64 page_size;
u32 flags;
u16 insn_len;
u8 cpumode;
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index ddf84b9..8cf7ec3 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1015,6 +1015,9 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts,
if (opts->sample_phys_addr)
perf_evsel__set_sample_bit(evsel, PHYS_ADDR);

+ if (opts->sample_page_size)
+ perf_evsel__set_sample_bit(evsel, PAGE_SIZE);
+
if (opts->no_buffering) {
attr->watermark = 0;
attr->wakeup_events = 1;
@@ -1540,7 +1543,7 @@ static void __p_sample_type(char *buf, size_t size, u64 value)
bit_name(PERIOD), bit_name(STREAM_ID), bit_name(RAW),
bit_name(BRANCH_STACK), bit_name(REGS_USER), bit_name(STACK_USER),
bit_name(IDENTIFIER), bit_name(REGS_INTR), bit_name(DATA_SRC),
- bit_name(WEIGHT), bit_name(PHYS_ADDR),
+ bit_name(WEIGHT), bit_name(PHYS_ADDR), bit_name(PAGE_SIZE),
{ .name = NULL, }
};
#undef bit_name
@@ -2357,6 +2360,12 @@ int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
array++;
}

+ data->page_size = 0;
+ if (type & PERF_SAMPLE_PAGE_SIZE) {
+ data->page_size = *array;
+ array++;
+ }
+
return 0;
}

@@ -2509,6 +2518,9 @@ size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type,
if (type & PERF_SAMPLE_PHYS_ADDR)
result += sizeof(u64);

+ if (type & PERF_SAMPLE_PAGE_SIZE)
+ result += sizeof(u64);
+
return result;
}

@@ -2678,6 +2690,11 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type,
array++;
}

+ if (type & PERF_SAMPLE_PAGE_SIZE) {
+ *array = sample->page_size;
+ array++;
+ }
+
return 0;
}

--
2.7.4


2018-08-10 14:53:33

by Liang, Kan

[permalink] [raw]
Subject: [PATCH RFC 3/7] perf script: Support page size

From: Kan Liang <[email protected]>

Display the page size if it is available.

Can be configured by the user, for example:
perf script --fields comm,event,phys_addr,page_size
dtlb mem-loads:uP: 3fec82ea8 4K
dtlb mem-loads:uP: 3fec82e90 4K
dtlb mem-loads:uP: 3e23700a4 4K
dtlb mem-loads:uP: 3fec82f20 4K
dtlb mem-loads:uP: 3e23700a4 4K
dtlb mem-loads:uP: 3b4211bec 4K
dtlb mem-loads:uP: 382205dc0 2M
dtlb mem-loads:uP: 36fa082c0 2M
dtlb mem-loads:uP: 377607340 2M
dtlb mem-loads:uP: 330010180 2M
dtlb mem-loads:uP: 33200fd80 2M
dtlb mem-loads:uP: 31b012b80 2M

Signed-off-by: Kan Liang <[email protected]>
---
tools/perf/Documentation/perf-script.txt | 2 +-
tools/perf/builtin-script.c | 18 ++++++++++++++++--
tools/perf/util/event.h | 2 ++
tools/perf/util/session.c | 19 +++++++++++++++++++
4 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index afdafe2..c7cd34d 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -117,7 +117,7 @@ OPTIONS
Comma separated list of fields to print. Options are:
comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
srcline, period, iregs, uregs, brstack, brstacksym, flags, bpf-output, brstackinsn,
- brstackoff, callindent, insn, insnlen, synth, phys_addr, metric, misc.
+ brstackoff, callindent, insn, insnlen, synth, phys_addr, metric, misc, page_size.
Field list can be prepended with the type, trace, sw or hw,
to indicate to which event type the field list applies.
e.g., -F sw:comm,tid,time,ip,sym and -F trace:time,cpu,trace
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 568ddfa..fa8e487 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -27,6 +27,7 @@
#include "util/thread-stack.h"
#include "util/time-utils.h"
#include "util/path.h"
+#include "util/event.h"
#include "print_binary.h"
#include <linux/bitmap.h>
#include <linux/kernel.h>
@@ -95,6 +96,7 @@ enum perf_output_field {
PERF_OUTPUT_UREGS = 1U << 27,
PERF_OUTPUT_METRIC = 1U << 28,
PERF_OUTPUT_MISC = 1U << 29,
+ PERF_OUTPUT_PAGE_SIZE = 1U << 30,
};

struct output_option {
@@ -131,6 +133,7 @@ struct output_option {
{.str = "phys_addr", .field = PERF_OUTPUT_PHYS_ADDR},
{.str = "metric", .field = PERF_OUTPUT_METRIC},
{.str = "misc", .field = PERF_OUTPUT_MISC},
+ {.str = "page_size", .field = PERF_OUTPUT_PAGE_SIZE},
};

enum {
@@ -201,7 +204,8 @@ static struct {
PERF_OUTPUT_SYM | PERF_OUTPUT_SYMOFFSET |
PERF_OUTPUT_DSO | PERF_OUTPUT_PERIOD |
PERF_OUTPUT_ADDR | PERF_OUTPUT_DATA_SRC |
- PERF_OUTPUT_WEIGHT | PERF_OUTPUT_PHYS_ADDR,
+ PERF_OUTPUT_WEIGHT | PERF_OUTPUT_PHYS_ADDR |
+ PERF_OUTPUT_PAGE_SIZE,

.invalid_fields = PERF_OUTPUT_TRACE | PERF_OUTPUT_BPF_OUTPUT,
},
@@ -465,6 +469,11 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
PERF_OUTPUT_PHYS_ADDR))
return -EINVAL;

+ if (PRINT_FIELD(PAGE_SIZE) &&
+ perf_evsel__check_stype(evsel, PERF_SAMPLE_PAGE_SIZE, "PAGE_SIZE",
+ PERF_OUTPUT_PAGE_SIZE))
+ return -EINVAL;
+
return 0;
}

@@ -1706,6 +1715,10 @@ static void process_event(struct perf_script *script,

if (PRINT_FIELD(PHYS_ADDR))
fprintf(fp, "%16" PRIx64, sample->phys_addr);
+
+ if (PRINT_FIELD(PAGE_SIZE))
+ fprintf(fp, " %s", get_page_size_name(sample->page_size));
+
fprintf(fp, "\n");

if (PRINT_FIELD(METRIC))
@@ -3150,7 +3163,8 @@ int cmd_script(int argc, const char **argv)
"Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,"
"addr,symoff,srcline,period,iregs,uregs,brstack,"
"brstacksym,flags,bpf-output,brstackinsn,brstackoff,"
- "callindent,insn,insnlen,synth,phys_addr,metric,misc",
+ "callindent,insn,insnlen,synth,phys_addr,metric,misc,"
+ "page_size",
parse_output_fields),
OPT_BOOLEAN('a', "all-cpus", &system_wide,
"system-wide collection from all CPUs"),
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 51456b8..7e7a820 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -831,4 +831,6 @@ int perf_event_paranoid(void);
extern int sysctl_perf_event_max_stack;
extern int sysctl_perf_event_max_contexts_per_stack;

+const char *get_page_size_name(u64 level);
+
#endif /* __PERF_RECORD_H */
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 8b93693..5c61820 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1080,6 +1080,22 @@ static void dump_event(struct perf_evlist *evlist, union perf_event *event,
event->header.size, perf_event__name(event->header.type));
}

+const char *get_page_size_name(u64 level)
+{
+ switch (level) {
+ case PERF_MEM_PAGE_SIZE_4K:
+ return "4K";
+ case PERF_MEM_PAGE_SIZE_2M:
+ return "2M";
+ case PERF_MEM_PAGE_SIZE_1G:
+ return "1G";
+ case PERF_MEM_PAGE_SIZE_512G:
+ return "512G";
+ default:
+ return "N/A";
+ }
+}
+
static void dump_sample(struct perf_evsel *evsel, union perf_event *event,
struct perf_sample *sample)
{
@@ -1118,6 +1134,9 @@ static void dump_sample(struct perf_evsel *evsel, union perf_event *event,
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
printf(" .. phys_addr: 0x%"PRIx64"\n", sample->phys_addr);

+ if (sample_type & PERF_SAMPLE_PAGE_SIZE)
+ printf(" .. page size: %s\n", get_page_size_name(sample->page_size));
+
if (sample_type & PERF_SAMPLE_TRANSACTION)
printf("... transaction: %" PRIx64 "\n", sample->transaction);

--
2.7.4


2018-08-10 14:53:36

by Liang, Kan

[permalink] [raw]
Subject: [PATCH RFC 7/7] perf test: Add test case for PERF_SAMPLE_PAGE_SIZE

From: Kan Liang <[email protected]>

Extend sample-parsing test cases to support new sample type
PERF_SAMPLE_PAGE_SIZE.

Signed-off-by: Kan Liang <[email protected]>
---
tools/perf/tests/sample-parsing.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tools/perf/tests/sample-parsing.c b/tools/perf/tests/sample-parsing.c
index 0e2d00d..cfba33d 100644
--- a/tools/perf/tests/sample-parsing.c
+++ b/tools/perf/tests/sample-parsing.c
@@ -145,6 +145,9 @@ static bool samples_same(const struct perf_sample *s1,
if (type & PERF_SAMPLE_PHYS_ADDR)
COMP(phys_addr);

+ if (type & PERF_SAMPLE_PAGE_SIZE)
+ COMP(page_size);
+
return true;
}

@@ -210,7 +213,9 @@ static int do_test(u64 sample_type, u64 sample_regs, u64 read_format)
.mask = sample_regs,
.regs = regs,
},
+
.phys_addr = 113,
+ .page_size = PERF_MEM_PAGE_SIZE_4K,
};
struct sample_read_value values[] = {{1, 5}, {9, 3}, {2, 7}, {6, 4},};
struct perf_sample sample_out;
@@ -310,7 +315,7 @@ int test__sample_parsing(struct test *test __maybe_unused, int subtest __maybe_u
* were added. Please actually update the test rather than just change
* the condition below.
*/
- if (PERF_SAMPLE_MAX > PERF_SAMPLE_PHYS_ADDR << 1) {
+ if (PERF_SAMPLE_MAX > PERF_SAMPLE_PAGE_SIZE << 1) {
pr_debug("sample format has changed, some new PERF_SAMPLE_ bit was introduced - test needs updating\n");
return -1;
}
--
2.7.4


2018-08-10 14:53:36

by Liang, Kan

[permalink] [raw]
Subject: [PATCH RFC 6/7] perf mem: Support page size

From: Kan Liang <[email protected]>

Add option --page-size in "perf mem" to record/report page size.

Here are some examples.
perf mem --phys-data --page-size report -D

# PID, TID, IP, ADDR, PHYS ADDR, PAGE SIZE, LOCAL WEIGHT, DSRC, SYMBOL
20134 20134 0xffffffffb5bd2fd0 0x016ffff9a274e96a308 0x000000044e96a308
4K 1168 0x5080144
/lib/modules/4.18.0-rc7+/build/vmlinux:perf_ctx_unlock
20134 20134 0xffffffffb63f645c 0xffffffffb752b814 0xcfb52b814 2M 225
0x26a100142 /lib/modules/4.18.0-rc7+/build/vmlinux:_raw_spin_lock
20134 20134 0xffffffffb660300c 0xfffffe00016b8bb0 0x0 4K 0 0x5080144
/lib/modules/4.18.0-rc7+/build/vmlinux:__x86_indirect_thunk_rax


perf mem --phys-data --page-size report --stdio

# To display the perf.data header info, please use
# --header/--header-only options.
#
#
# Total Lost Samples: 0
#
# Samples: 5K of event 'cpu/mem-loads,ldlat=30/P'
# Total weight : 281234
# Sort order :
# mem,sym,dso,symbol_daddr,dso_daddr,tlb,locked,phys_daddr,page_size
#
# Overhead Samples Memory access Symbol
# Shared Object Data Symbol Data
# Object TLB access Locked Data Physical
# Address Page Size
# ........ ............ ........................
# ................................ ................
# ........................................... .......................
# ...................... ...... ......................
# ......................
#
28.54% 1826 L1 or L1 hit [k]
__x86_indirect_thunk_rax [kernel.vmlinux] [k] 0xffffb0df31b0ff28
[unknown] L1 or L2 hit No [k]
0000000000000000 4K
6.02% 256 L1 or L1 hit [.] touch_buffer
dtlb [.] 0x00007ffd50109da8 [stack]
L1 or L2 hit No [.] 0x000000042454ada8 4K
3.23% 5 L1 or L1 hit [k] clear_huge_page
[kernel.vmlinux] [k] 0xffff9a2753b8ce60 [unknown]
L1 or L2 hit No [k] 0x0000000453b8ce60 2M
2.98% 4 L1 or L1 hit [k] clear_page_erms
[kernel.vmlinux] [k] 0xffffb0df31b0fd00 [unknown]
L1 or L2 hit No [k] 0000000000000000 4K

Signed-off-by: Kan Liang <[email protected]>
---
tools/perf/Documentation/perf-mem.txt | 4 ++++
tools/perf/builtin-mem.c | 17 +++++++++++++++++
2 files changed, 21 insertions(+)

diff --git a/tools/perf/Documentation/perf-mem.txt b/tools/perf/Documentation/perf-mem.txt
index f8d2167..1708689 100644
--- a/tools/perf/Documentation/perf-mem.txt
+++ b/tools/perf/Documentation/perf-mem.txt
@@ -63,6 +63,10 @@ OPTIONS
--phys-data::
Record/Report sample physical addresses

+-s::
+--page-size::
+ Record/Report sample page size
+
RECORD OPTIONS
--------------
-e::
diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
index 6048fca..0dad6c2 100644
--- a/tools/perf/builtin-mem.c
+++ b/tools/perf/builtin-mem.c
@@ -26,6 +26,7 @@ struct perf_mem {
bool dump_raw;
bool force;
bool phys_addr;
+ bool page_size;
int operation;
const char *cpu_list;
DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
@@ -107,6 +108,9 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
if (mem->phys_addr)
rec_argv[i++] = "--phys-data";

+ if (mem->phys_addr)
+ rec_argv[i++] = "--page-size";
+
for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
if (!perf_mem_events[j].record)
continue;
@@ -195,6 +199,12 @@ dump_raw_samples(struct perf_tool *tool,
symbol_conf.field_sep);
}

+ if (mem->page_size) {
+ printf("%s%s",
+ get_page_size_name(sample->page_size),
+ symbol_conf.field_sep);
+ }
+
if (field_sep)
fmt = "%"PRIu64"%s0x%"PRIx64"%s%s:%s\n";
else
@@ -253,6 +263,9 @@ static int report_raw_events(struct perf_mem *mem)
if (mem->phys_addr)
printf("PHYS ADDR, ");

+ if (mem->page_size)
+ printf("PAGE SIZE, ");
+
printf("LOCAL WEIGHT, DSRC, SYMBOL\n");

ret = perf_session__process_events(session);
@@ -277,6 +290,9 @@ static char *get_sort_order(struct perf_mem *mem)
if (mem->phys_addr)
strcat(sort, ",phys_daddr");

+ if (mem->page_size)
+ strcat(sort, ",page_size");
+
return strdup(sort);
}

@@ -418,6 +434,7 @@ int cmd_mem(int argc, const char **argv)
" between columns '.' is reserved."),
OPT_BOOLEAN('f', "force", &mem.force, "don't complain, do it"),
OPT_BOOLEAN('p', "phys-data", &mem.phys_addr, "Record/Report sample physical addresses"),
+ OPT_BOOLEAN('s', "page-size", &mem.page_size, "Record/Report sample page size"),
OPT_END()
};
const char *const mem_subcommands[] = { "record", "report", NULL };
--
2.7.4


2018-08-10 20:25:29

by Stephane Eranian

[permalink] [raw]
Subject: Re: [PATCH RFC 1/7] perf/core, x86: Add PERF_SAMPLE_PAGE_SIZE

On Fri, Aug 10, 2018 at 6:37 AM <[email protected]> wrote:
>
> From: Kan Liang <[email protected]>
>
> Current perf can report both virtual address and physical address, but
> it doesn't report page size. Users have no idea how large the utilized
> page is. They cannot promote/demote large pages to optimize memory use.
>
> Add a new sample type for page size.
>
> Current perf already has a facility to collect data virtual address.
> A function, to retrieve page size by full page-table walk of a given
> virtual address, is introduced for x86. Other architectures can
> implement their own functions later separately.
> The function must be IRQ-safe. For x86, disabling IRQs over the walk is
> sufficient to prevent any tear down of the page tables.
>
> The new sample type requires collecting the virtual address. The
> virtual address will not be output unless SAMPLE_ADDR is applied.
>
I welcome this feature, been wanting it for some time now. There is
simply not enough support in /proc/PID/maps or smaps to get this
information. This is important to improve code and data layouts.

I would like to see the following changes to your proposal:
- call it PERF_SAMPLE_DATA_PAGE_SIZE

That would allow two things:
1 - not tied to PERF_SAMPLE_ADDR
2 - Allow PERF_SAMPLE_CODE_PAGE_SIZE to be added

In some measurements, you may just care about the distribution of accesses
across page sizes. No need to use double the buffer space to save the address
you will not use.

Layout is important for code as well, in fact, that's what most people
want first.
Having a CODE_PAGE_SIZE is therefore useful. I am happy adding it on top on your
proposal. Note that PERF_SAMPLE_CODE_PAGE_SIZE would not have to be tied
to PEBS unlike DATA_PAGE_SIZE.

Thanks.

> Although only a few bits are needed to indicate the page size, a u64
> type is still claimed for page_size. Because struct perf_sample_data
> requires cacheline_aligned.
>
> Signed-off-by: Kan Liang <[email protected]>
> ---
> arch/x86/events/core.c | 25 +++++++++++++++++++++++++
> arch/x86/events/intel/ds.c | 2 +-
> arch/x86/events/perf_event.h | 2 +-
> include/linux/perf_event.h | 1 +
> include/uapi/linux/perf_event.h | 13 ++++++++++++-
> kernel/events/core.c | 15 +++++++++++++++
> 6 files changed, 55 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> index 5f4829f..719e527 100644
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -2573,3 +2573,28 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
> cap->events_mask_len = x86_pmu.events_mask_len;
> }
> EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
> +
> +u64 perf_get_page_size(u64 virt)
> +{
> + unsigned long flags;
> + unsigned int level;
> + pte_t *pte;
> +
> + if (!virt)
> + return 0;
> +
> + /*
> + * Interrupts are disabled, so it prevents any tear down
> + * of the page tables.
> + * See the comment near struct mmu_table_batch.
> + */
> + local_irq_save(flags);
> + if (virt >= TASK_SIZE)
> + pte = lookup_address(virt, &level);
> + else
> + pte = lookup_address_in_pgd(pgd_offset(current->mm, virt),
> + virt, &level);
> + local_irq_restore(flags);
> +
> + return (u64)level;
> +}
> diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
> index b7b01d7..a3e56c7 100644
> --- a/arch/x86/events/intel/ds.c
> +++ b/arch/x86/events/intel/ds.c
> @@ -1274,7 +1274,7 @@ static void setup_pebs_sample_data(struct perf_event *event,
> }
>
>
> - if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) &&
> + if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_PAGE_SIZE)) &&
> x86_pmu.intel_cap.pebs_format >= 1)
> data->addr = pebs->dla;
>
> diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
> index 1562863..affcd26 100644
> --- a/arch/x86/events/perf_event.h
> +++ b/arch/x86/events/perf_event.h
> @@ -94,7 +94,7 @@ struct amd_nb {
> PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
> PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
> PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER | \
> - PERF_SAMPLE_PERIOD)
> + PERF_SAMPLE_PERIOD | PERF_SAMPLE_PAGE_SIZE)
>
> #define PEBS_REGS \
> (PERF_REG_X86_AX | \
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 53c500f..9d13745 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -937,6 +937,7 @@ struct perf_sample_data {
> u64 stack_user_size;
>
> u64 phys_addr;
> + u64 page_size;
> } ____cacheline_aligned;
>
> /* default value for data source */
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index eeb787b..5473443 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -141,8 +141,9 @@ enum perf_event_sample_format {
> PERF_SAMPLE_TRANSACTION = 1U << 17,
> PERF_SAMPLE_REGS_INTR = 1U << 18,
> PERF_SAMPLE_PHYS_ADDR = 1U << 19,
> + PERF_SAMPLE_PAGE_SIZE = 1U << 20,
>
> - PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */
> + PERF_SAMPLE_MAX = 1U << 21, /* non-ABI */
>
> __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63,
> };
> @@ -861,6 +862,7 @@ enum perf_event_type {
> * { u64 abi; # enum perf_sample_regs_abi
> * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
> * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR
> + * { u64 page_size;} && PERF_SAMPLE_PAGE_SIZE
> * };
> */
> PERF_RECORD_SAMPLE = 9,
> @@ -1099,6 +1101,15 @@ union perf_mem_data_src {
> #define PERF_MEM_S(a, s) \
> (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)
>
> +
> +enum perf_mem_page_size {
> + PERF_MEM_PAGE_SIZE_NONE,
> + PERF_MEM_PAGE_SIZE_4K,
> + PERF_MEM_PAGE_SIZE_2M,
> + PERF_MEM_PAGE_SIZE_1G,
> + PERF_MEM_PAGE_SIZE_512G,
> +};
> +
> /*
> * single taken branch record layout:
> *
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index f6ea33a..e848e9b 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -1751,6 +1751,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
> if (sample_type & PERF_SAMPLE_PHYS_ADDR)
> size += sizeof(data->phys_addr);
>
> + if (sample_type & PERF_SAMPLE_PAGE_SIZE)
> + size += sizeof(data->page_size);
> +
> event->header_size = size;
> }
>
> @@ -6294,6 +6297,9 @@ void perf_output_sample(struct perf_output_handle *handle,
> if (sample_type & PERF_SAMPLE_PHYS_ADDR)
> perf_output_put(handle, data->phys_addr);
>
> + if (sample_type & PERF_SAMPLE_PAGE_SIZE)
> + perf_output_put(handle, data->page_size);
> +
> if (!event->attr.watermark) {
> int wakeup_events = event->attr.wakeup_events;
>
> @@ -6341,6 +6347,12 @@ static u64 perf_virt_to_phys(u64 virt)
> return phys_addr;
> }
>
> +/* Return page size of given virtual address. IRQ-safe required. */
> +u64 __weak perf_get_page_size(u64 virt)
> +{
> + return PERF_MEM_PAGE_SIZE_NONE;
> +}
> +
> static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
>
> struct perf_callchain_entry *
> @@ -6482,6 +6494,9 @@ void perf_prepare_sample(struct perf_event_header *header,
>
> if (sample_type & PERF_SAMPLE_PHYS_ADDR)
> data->phys_addr = perf_virt_to_phys(data->addr);
> +
> + if (sample_type & PERF_SAMPLE_PAGE_SIZE)
> + data->page_size = perf_get_page_size(data->addr);
> }
>
> static __always_inline void
> --
> 2.7.4
>