This patchkit adds support for the Intel Skylake core PMU to perf, documented in the
recently released SDM 054[1] Vol3, 17.9 and 18.12.
The main user visible feature is timed branch records, which allows to get cycle counts
for individual basic blocks, and a time stamp for PEBS records which improves
multi-record PEBS. The LBRs (branch records) also have been extended to 32, which allows
more accurate branch sampling and deeper call stacks.
v2:
Fix time stamp handling with non default clock.
Fix LBR freezing.
Some minor cleanups.
Moved user tools support for cycles into separate patchkit.
-Andi
[1] http://www.cps.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html
From: Andi Kleen <[email protected]>
PEBSv3 has a raw TSC time stamp in its memory buffer that
later needs to to be converted to perf_clock. Add a
native_sched_clock_from_tsc() that works the same
as native_sched_clock, but starts with an already given TSC
value.
Right now I ignore paravirt for this. It will just get
the native clock. But there isn't para virtualized PEBS anyways.
Cc: [email protected]
Signed-off-by: Andi Kleen <[email protected]>
---
arch/x86/include/asm/tsc.h | 1 +
arch/x86/kernel/tsc.c | 8 ++++++++
2 files changed, 9 insertions(+)
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 94605c0..aad56eb 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -51,6 +51,7 @@ extern int unsynchronized_tsc(void);
extern int check_tsc_unstable(void);
extern int check_tsc_disabled(void);
extern unsigned long native_calibrate_tsc(void);
+extern unsigned long long native_sched_clock_from_tsc(u64 tsc);
extern int tsc_clocksource_reliable;
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 5054497..c16afd5 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -296,6 +296,14 @@ u64 native_sched_clock(void)
return cycles_2_ns(tsc_now);
}
+/*
+ * Generate a sched_clock if you already have a TSC value.
+ */
+u64 native_sched_clock_from_tsc(u64 tsc)
+{
+ return cycles_2_ns(tsc);
+}
+
/* We need to define a real function for sched_clock, to override the
weak default version */
#ifdef CONFIG_PARAVIRT
--
1.9.3
From: Andi Kleen <[email protected]>
PEBSv3 is the same as the existing PEBSv2 used on Haswell,
but it adds a new TSC field. Add support to the generic
PEBS handler to handle the new format, and overwrite
the perf time stamp using the new native_sched_clock_from_tsc()
Right now the time stamp is just slightly more accurate,
as it is nearer the actual event trigger point. With
the PEBS threshold > 1 patchkit it will be much more accurate,
avoid the problems with MMAP mismatches earlier.
The accurate time stamping is only implemented for
the default trace clock for now.
v2: Use _skl prefix. Check for default clock_id.
Signed-off-by: Andi Kleen <[email protected]>
---
arch/x86/kernel/cpu/perf_event_intel_ds.c | 36 ++++++++++++++++++++++++++++---
1 file changed, 33 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 813f75d..2eed6fc 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -224,6 +224,19 @@ union hsw_tsx_tuning {
#define PEBS_HSW_TSX_FLAGS 0xff00000000ULL
+/* Same as HSW, plus TSC */
+
+struct pebs_record_skl {
+ u64 flags, ip;
+ u64 ax, bx, cx, dx;
+ u64 si, di, bp, sp;
+ u64 r8, r9, r10, r11;
+ u64 r12, r13, r14, r15;
+ u64 status, dla, dse, lat;
+ u64 real_ip, tsx_tuning;
+ u64 tsc;
+};
+
void init_debug_store_on_cpu(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
@@ -827,7 +840,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
return 0;
}
-static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs)
+static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs)
{
if (pebs->tsx_tuning) {
union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning };
@@ -836,7 +849,7 @@ static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs)
return 0;
}
-static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs)
+static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs)
{
u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32;
@@ -858,7 +871,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
* unconditionally access the 'extra' entries.
*/
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- struct pebs_record_hsw *pebs = __pebs;
+ struct pebs_record_skl *pebs = __pebs;
struct perf_sample_data data;
struct pt_regs regs;
u64 sample_type;
@@ -958,6 +971,16 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
data.txn = intel_hsw_transaction(pebs);
}
+ /*
+ * v3 supplies an accurate time stamp, so we use that
+ * for the time stamp.
+ *
+ * We can only do this for the default trace clock.
+ */
+ if (x86_pmu.intel_cap.pebs_format >= 3 &&
+ event->attr.use_clockid == 0)
+ data.time = native_sched_clock_from_tsc(pebs->tsc);
+
if (has_branch_stack(event))
data.br_stack = &cpuc->lbr_stack;
@@ -1098,6 +1121,13 @@ void __init intel_ds_init(void)
x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
break;
+ case 3:
+ pr_cont("PEBS fmt3%c, ", pebs_type);
+ x86_pmu.pebs_record_size =
+ sizeof(struct pebs_record_skl);
+ x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+ break;
+
default:
printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
x86_pmu.pebs = 0;
--
1.9.3
From: Andi Kleen <[email protected]>
Add new MSRs (LBR_INFO) and some new MSR bits used by the Skylake
PMU driver.
Signed-off-by: Andi Kleen <[email protected]>
---
arch/x86/include/asm/perf_event.h | 7 +++++++
arch/x86/include/uapi/asm/msr-index.h | 6 ++++++
2 files changed, 13 insertions(+)
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index dc0f6ed..7bcb861 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -159,6 +159,13 @@ struct x86_pmu_capability {
*/
#define INTEL_PMC_IDX_FIXED_BTS (INTEL_PMC_IDX_FIXED + 16)
+#define GLOBAL_STATUS_COND_CHG BIT_ULL(63)
+#define GLOBAL_STATUS_BUFFER_OVF BIT_ULL(62)
+#define GLOBAL_STATUS_UNC_OVF BIT_ULL(61)
+#define GLOBAL_STATUS_ASIF BIT_ULL(60)
+#define GLOBAL_STATUS_COUNTERS_FROZEN BIT_ULL(59)
+#define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(58)
+
/*
* IBS cpuid feature detection
*/
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index c469490..a6e1a2d 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -72,6 +72,12 @@
#define MSR_LBR_CORE_FROM 0x00000040
#define MSR_LBR_CORE_TO 0x00000060
+#define MSR_LBR_INFO_0 0x00000dc0 /* ... 0xddf for _31 */
+#define LBR_INFO_MISPRED (1UL << 63)
+#define LBR_INFO_IN_TX (1UL << 62)
+#define LBR_INFO_ABORT (1UL << 61)
+#define LBR_INFO_CYCLES 0xffff
+
#define MSR_IA32_PEBS_ENABLE 0x000003f1
#define MSR_IA32_DS_AREA 0x00000600
#define MSR_IA32_PERF_CAPABILITIES 0x00000345
--
1.9.3
From: Andi Kleen <[email protected]>
Skylake supports reporting the time in cycles a branch in the LBR
took, to give a rough indication of the basic block performance.
Export the cycle information in the branch_info structure.
This can be done by just reusing some currently zero padding.
This is just the generic header change. The architecture
still needs to fill it in.
There's no attempt to convert to real time, as we really
want cycles here.
Signed-off-by: Andi Kleen <[email protected]>
---
include/uapi/linux/perf_event.h | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 309211b..9ef489e 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -904,6 +904,7 @@ union perf_mem_data_src {
*
* in_tx: running in a hardware transaction
* abort: aborting a hardware transaction
+ * cycles: cycles from last branch (or 0 if not supported)
*/
struct perf_branch_entry {
__u64 from;
@@ -912,7 +913,8 @@ struct perf_branch_entry {
predicted:1,/* target predicted */
in_tx:1, /* in transaction */
abort:1, /* transaction abort */
- reserved:60;
+ cycles:16, /* cycle count to last branch */
+ reserved:44;
};
#endif /* _UAPI_LINUX_PERF_EVENT_H */
--
1.9.3
From: Andi Kleen <[email protected]>
Add support for the new LBRv5 format used on Skylake.
The flags for mispredict, abort, in_tx etc. moved to range of separate
LBR_INFO_* MSRs. Teach the LBR code to read those. The original
LBR registers stay the same, except they have full sign
extension now.
LBR_INFO also reports a cycle count to the last branch.
Report the cycle information using the new "cycles" branch_info
output field.
In addition we have to context switch and clear the new INFO
MSRs to avoid any information leaks.
Signed-off-by: Andi Kleen <[email protected]>
---
arch/x86/kernel/cpu/perf_event.h | 1 +
arch/x86/kernel/cpu/perf_event_intel_lbr.c | 21 ++++++++++++++++++++-
2 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index c2031cb..6dabfdc 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -595,6 +595,7 @@ struct x86_pmu {
struct x86_perf_task_context {
u64 lbr_from[MAX_LBR_ENTRIES];
u64 lbr_to[MAX_LBR_ENTRIES];
+ u64 lbr_info[MAX_LBR_ENTRIES];
int lbr_callstack_users;
int lbr_stack_state;
};
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 94e5b50..d938cae 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -13,7 +13,8 @@ enum {
LBR_FORMAT_EIP = 0x02,
LBR_FORMAT_EIP_FLAGS = 0x03,
LBR_FORMAT_EIP_FLAGS2 = 0x04,
- LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_EIP_FLAGS2,
+ LBR_FORMAT_INFO = 0x05,
+ LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_INFO,
};
static enum {
@@ -184,6 +185,8 @@ static void intel_pmu_lbr_reset_64(void)
for (i = 0; i < x86_pmu.lbr_nr; i++) {
wrmsrl(x86_pmu.lbr_from + i, 0);
wrmsrl(x86_pmu.lbr_to + i, 0);
+ if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+ wrmsrl(MSR_LBR_INFO_0 + i, 0);
}
}
@@ -232,6 +235,8 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
lbr_idx = (tos - i) & mask;
wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+ if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+ wrmsrl(MSR_LBR_INFO_0 + i, task_ctx->lbr_info[i]);
}
task_ctx->lbr_stack_state = LBR_NONE;
}
@@ -253,6 +258,8 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
lbr_idx = (tos - i) & mask;
rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+ if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+ rdmsrl(MSR_LBR_INFO_0 + i, task_ctx->lbr_info[i]);
}
task_ctx->lbr_stack_state = LBR_VALID;
}
@@ -417,11 +424,22 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
unsigned long lbr_idx = (tos - i) & mask;
u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0;
int skip = 0;
+ u16 cycles = 0;
int lbr_flags = lbr_desc[lbr_format];
rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
+ if (lbr_format == LBR_FORMAT_INFO) {
+ u64 info;
+
+ rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info);
+ mis = !!(info & LBR_INFO_MISPRED);
+ pred = !mis;
+ in_tx = !!(info & LBR_INFO_IN_TX);
+ abort = !!(info & LBR_INFO_ABORT);
+ cycles = (info & LBR_INFO_CYCLES);
+ }
if (lbr_flags & LBR_EIP_FLAGS) {
mis = !!(from & LBR_FROM_FLAG_MISPRED);
pred = !mis;
@@ -451,6 +469,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
cpuc->lbr_entries[out].predicted = pred;
cpuc->lbr_entries[out].in_tx = in_tx;
cpuc->lbr_entries[out].abort = abort;
+ cpuc->lbr_entries[out].cycles = cycles;
cpuc->lbr_entries[out].reserved = 0;
out++;
}
--
1.9.3
From: Andi Kleen <[email protected]>
Add perf core PMU support for future Intel Skylake CPU cores.
The code is based on Haswell/Broadwell.
There is a new cache event list, based on the updated Haswell
event list.
Skylake has removed most counter constraints on basic
events, so the basic constraints table now only has a single
entry (plus the fixed counters)
TSX support and various other setups are all shared with Haswell.
Skylake has 32 LBR entries. Add a new LBR init function
to set this up. The filters are all the same as Haswell.
It also has a new LBR format with a separate LBR_INFO_* MSR,
but that has been already added earlier.
Signed-off-by: Andi Kleen <[email protected]>
---
arch/x86/kernel/cpu/perf_event.h | 6 +-
arch/x86/kernel/cpu/perf_event_intel.c | 232 +++++++++++++++++++++++++++++
arch/x86/kernel/cpu/perf_event_intel_ds.c | 22 +++
arch/x86/kernel/cpu/perf_event_intel_lbr.c | 20 +++
4 files changed, 279 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 6dabfdc..a7577a5 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -138,7 +138,7 @@ struct intel_excl_cntrs {
unsigned core_id; /* per-core: core id */
};
-#define MAX_LBR_ENTRIES 16
+#define MAX_LBR_ENTRIES 32
enum {
X86_PERF_KFREE_SHARED = 0,
@@ -841,6 +841,8 @@ extern struct event_constraint intel_ivb_pebs_event_constraints[];
extern struct event_constraint intel_hsw_pebs_event_constraints[];
+extern struct event_constraint intel_skl_pebs_event_constraints[];
+
struct event_constraint *intel_pebs_constraints(struct perf_event *event);
void intel_pmu_pebs_enable(struct perf_event *event);
@@ -877,6 +879,8 @@ void intel_pmu_lbr_init_snb(void);
void intel_pmu_lbr_init_hsw(void);
+void intel_pmu_lbr_init_skl(void);
+
int intel_pmu_setup_lbr_filter(struct perf_event *event);
void intel_pt_interrupt(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 219d3fb..39fcd07 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -177,6 +177,14 @@ static struct event_constraint intel_slm_event_constraints[] __read_mostly =
EVENT_CONSTRAINT_END
};
+struct event_constraint intel_skl_event_constraints[] = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */
+ EVENT_CONSTRAINT_END
+};
+
static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
@@ -193,6 +201,13 @@ static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {
EVENT_EXTRA_END
};
+static struct extra_reg intel_skl_extra_regs[] __read_mostly = {
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+ EVENT_EXTRA_END
+};
+
EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3");
EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3");
EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2");
@@ -244,6 +259,200 @@ static u64 intel_pmu_event_map(int hw_event)
return intel_perfmon_event_map[hw_event];
}
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts.
+ * - icache miss does not include decoded icache
+ */
+
+#define SKL_DEMAND_DATA_RD BIT_ULL(0)
+#define SKL_DEMAND_RFO BIT_ULL(1)
+#define SKL_ANY_RESPONSE BIT_ULL(16)
+#define SKL_SUPPLIER_NONE BIT_ULL(17)
+#define SKL_L3_MISS_LOCAL_DRAM BIT_ULL(26)
+#define SKL_L3_MISS_REMOTE_HOP0_DRAM BIT_ULL(27)
+#define SKL_L3_MISS_REMOTE_HOP1_DRAM BIT_ULL(28)
+#define SKL_L3_MISS_REMOTE_HOP2P_DRAM BIT_ULL(29)
+#define SKL_L3_MISS (SKL_L3_MISS_LOCAL_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP0_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP1_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP2P_DRAM)
+#define SKL_SPL_HIT BIT_ULL(30)
+#define SKL_SNOOP_NONE BIT_ULL(31)
+#define SKL_SNOOP_NOT_NEEDED BIT_ULL(32)
+#define SKL_SNOOP_MISS BIT_ULL(33)
+#define SKL_SNOOP_HIT_NO_FWD BIT_ULL(34)
+#define SKL_SNOOP_HIT_WITH_FWD BIT_ULL(35)
+#define SKL_SNOOP_HITM BIT_ULL(36)
+#define SKL_SNOOP_NON_DRAM BIT_ULL(37)
+#define SKL_ANY_SNOOP (SKL_SPL_HIT|SKL_SNOOP_NONE| \
+ SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
+ SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
+ SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM)
+#define SKL_DEMAND_READ SKL_DEMAND_DATA_RD
+#define SKL_SNOOP_DRAM (SKL_SNOOP_NONE| \
+ SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
+ SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
+ SKL_SNOOP_HITM|SKL_SPL_HIT)
+#define SKL_DEMAND_WRITE SKL_DEMAND_RFO
+#define SKL_LLC_ACCESS SKL_ANY_RESPONSE
+#define SKL_L3_MISS_REMOTE (SKL_L3_MISS_REMOTE_HOP0_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP1_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP2P_DRAM)
+
+static __initconst const u64 skl_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x283, /* ICACHE_64B.MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0x608, /* DTLB_LOAD_MISSES.WALK_COMPLETED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0x649, /* DTLB_STORE_MISSES.WALK_COMPLETED */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2085, /* ITLB_MISSES.STLB_HIT */
+ [ C(RESULT_MISS) ] = 0xe85, /* ITLB_MISSES.WALK_COMPLETED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
+static __initconst const u64 skl_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
+ SKL_LLC_ACCESS|SKL_ANY_SNOOP,
+ [ C(RESULT_MISS) ] = SKL_DEMAND_READ|
+ SKL_L3_MISS|SKL_ANY_SNOOP|
+ SKL_SUPPLIER_NONE,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
+ SKL_LLC_ACCESS|SKL_ANY_SNOOP,
+ [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE|
+ SKL_L3_MISS|SKL_ANY_SNOOP|
+ SKL_SUPPLIER_NONE,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
+ SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
+ [ C(RESULT_MISS) ] = SKL_DEMAND_READ|
+ SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
+ SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
+ [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE|
+ SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
#define SNB_DMND_DATA_RD (1ULL << 0)
#define SNB_DMND_RFO (1ULL << 1)
#define SNB_DMND_IFETCH (1ULL << 2)
@@ -3292,6 +3501,29 @@ __init int intel_pmu_init(void)
pr_cont("Broadwell events, ");
break;
+ case 78: /* 14nm Skylake Mobile */
+ case 94: /* 14nm Skylake Desktop */
+ x86_pmu.late_ack = true;
+ memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+ intel_pmu_lbr_init_skl();
+
+ x86_pmu.event_constraints = intel_skl_event_constraints;
+ x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_skl_extra_regs;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+ x86_pmu.hw_config = hsw_hw_config;
+ x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ x86_pmu.cpu_events = hsw_events_attrs;
+ WARN_ON(!x86_pmu.format_attrs);
+ x86_pmu.cpu_events = hsw_events_attrs;
+ pr_cont("Skylake events, ");
+ break;
+
default:
switch (x86_pmu.version) {
case 1:
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 2eed6fc..1cbce6f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -678,6 +678,28 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = {
EVENT_CONSTRAINT_END
};
+struct event_constraint intel_skl_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
+ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ INTEL_PLD_CONSTRAINT(0x1cd, 0xf), /* MEM_TRANS_RETIRED.* */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x22d0, 0xf), /* MEM_INST_RETIRED.LOCK_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_L3_MISS_RETIRED.* */
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+ EVENT_CONSTRAINT_END
+};
+
struct event_constraint *intel_pebs_constraints(struct perf_event *event)
{
struct event_constraint *c;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index d938cae..6c48c97 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -961,6 +961,26 @@ void intel_pmu_lbr_init_hsw(void)
pr_cont("16-deep LBR, ");
}
+/* skylake */
+__init void intel_pmu_lbr_init_skl(void)
+{
+ x86_pmu.lbr_nr = 32;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+ x86_pmu.lbr_to = MSR_LBR_NHM_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
+
+ /*
+ * SW branch filter usage:
+ * - support syscall, sysret capture.
+ * That requires LBR_FAR but that means far
+ * jmp need to be filtered out
+ */
+ pr_cont("32-deep LBR, ");
+}
+
/* atom */
void __init intel_pmu_lbr_init_atom(void)
{
--
1.9.3
From: Andi Kleen <[email protected]>
ArchPerfmon v4 has some new status bits in GLOBAL_STATUS.
These need to be ignored when deciding whether a NMI
was a NMI to avoid eating all NMIs when they
stay set (see b292d7a104)
This patch ignores the new ASIF bit, which indicates
that SGX interfered with the PMU, and also the new
LBR freezing bits, which are set when the LBRs get
frozen, plus the existing CondChange (set by JTAG debuggers
and some buggy BIOSes)
Signed-off-by: Andi Kleen <[email protected]>
---
arch/x86/kernel/cpu/perf_event_intel.c | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 39fcd07..2c8a268 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1821,13 +1821,14 @@ again:
intel_pmu_lbr_read();
/*
- * CondChgd bit 63 doesn't mean any overflow status. Ignore
- * and clear the bit.
+ * Ignore a range of extra bits in status that do not indicate
+ * overflow by themselves.
*/
- if (__test_and_clear_bit(63, (unsigned long *)&status)) {
- if (!status)
- goto done;
- }
+ status &= ~(GLOBAL_STATUS_COND_CHG |
+ GLOBAL_STATUS_ASIF |
+ GLOBAL_STATUS_LBRS_FROZEN);
+ if (!status)
+ goto done;
/*
* PEBS overflow sets bit 62 in the global status register
--
1.9.3
From: Andi Kleen <[email protected]>
In Arch perfmon v4 the GLOBAL_STATUS reset automatically unfreezes
LBRs. So no need to do it manually in the LBR code. Add a check
to skip it.
v2: Move test up to beginning of function.
Signed-off-by: Andi Kleen <[email protected]>
---
arch/x86/kernel/cpu/perf_event_intel_lbr.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 6c48c97..ae54ea9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -139,6 +139,13 @@ static void __intel_pmu_lbr_enable(bool pmi)
u64 debugctl, lbr_select = 0, orig_debugctl;
/*
+ * No need to unfreeze manually, as v4 can do that as part
+ * of the GLOBAL_STATUS ack.
+ */
+ if (pmi && x86_pmu.version >= 4)
+ return;
+
+ /*
* No need to reprogram LBR_SELECT in a PMI, as it
* did not change.
*/
--
1.9.3
From: Andi Kleen <[email protected]>
With Arch Perfmon v4 the PMU ack unfreezes the LBRs. So we need to do
the PMU ack after the LBR reading, otherwise the LBRs would be polluted by the
PMI handler.
This is a minimal change. In principle the ACK could be moved much later.
Signed-off-by: Andi Kleen <[email protected]>
---
arch/x86/kernel/cpu/perf_event_intel.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 2c8a268..a6de05e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1804,6 +1804,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
loops = 0;
again:
+ intel_pmu_lbr_read();
intel_pmu_ack_status(status);
if (++loops > 100) {
static bool warned = false;
@@ -1818,7 +1819,6 @@ again:
inc_irq_stat(apic_perf_irqs);
- intel_pmu_lbr_read();
/*
* Ignore a range of extra bits in status that do not indicate
--
1.9.3
On Sun, May 10, 2015 at 12:22:47PM -0700, Andi Kleen wrote:
> From: Andi Kleen <[email protected]>
>
> With Arch Perfmon v4 the PMU ack unfreezes the LBRs. So we need to do
> the PMU ack after the LBR reading, otherwise the LBRs would be polluted by the
> PMI handler.
Hmm, we should move these last three patches before the SKL enablement
patch no, otherwise things will misbehave -- say a bisection lands in
between.
/me shuffles patches.
> This is a minimal change. In principle the ACK could be moved much later.
Right, so the more complete change would be to use the new and improved
FREEZE_ON_PMI and reenable both the LBRs and the CTRs with the
STATUS_RESET MSR, right?
Does it make sense to have a new handle_irq() routine for that?
On Mon, May 11, 2015 at 9:23 AM, Peter Zijlstra <[email protected]> wrote:
> On Sun, May 10, 2015 at 12:22:47PM -0700, Andi Kleen wrote:
>> From: Andi Kleen <[email protected]>
>>
>> With Arch Perfmon v4 the PMU ack unfreezes the LBRs. So we need to do
>> the PMU ack after the LBR reading, otherwise the LBRs would be polluted by the
>> PMI handler.
>
> Hmm, we should move these last three patches before the SKL enablement
> patch no, otherwise things will misbehave -- say a bisection lands in
> between.
>
> /me shuffles patches.
>
>> This is a minimal change. In principle the ACK could be moved much later.
>
> Right, so the more complete change would be to use the new and improved
> FREEZE_ON_PMI and reenable both the LBRs and the CTRs with the
> STATUS_RESET MSR, right?
>
> Does it make sense to have a new handle_irq() routine for that?
Were we not already using FREEZE_ON_PMI with LBR (except for one
erratum on HSW)?
It would make sense to me to have an "optimized" and clean handle_irq
for the newer PMU.
We the caveat that any change to the core of it would now have to be done twice.
On Mon, May 11, 2015 at 09:32:33AM -0700, Stephane Eranian wrote:
> >> This is a minimal change. In principle the ACK could be moved much later.
> >
> > Right, so the more complete change would be to use the new and improved
> > FREEZE_ON_PMI and reenable both the LBRs and the CTRs with the
> > STATUS_RESET MSR, right?
> >
> > Does it make sense to have a new handle_irq() routine for that?
>
> Were we not already using FREEZE_ON_PMI with LBR (except for one
> erratum on HSW)?
That's FREEZE_LBRS_ON_PMI, I was referring to FREEZE_PERFMON_ON_PMI,
which we've not used so far.
I think Andi tried using it before, but there's some issues with it on
v3, but v4 should have fixed all that.
Andi can you perhaps explain what the problem with FREEZE_PERFMON_ON_PMI
on v3 was again?
> It would make sense to me to have an "optimized" and clean handle_irq
> for the newer PMU.
> We the caveat that any change to the core of it would now have to be done twice.
We could pull that out in a shared function of course, if possible.
On Mon, May 11, 2015 at 9:36 AM, Peter Zijlstra <[email protected]> wrote:
> On Mon, May 11, 2015 at 09:32:33AM -0700, Stephane Eranian wrote:
>> >> This is a minimal change. In principle the ACK could be moved much later.
>> >
>> > Right, so the more complete change would be to use the new and improved
>> > FREEZE_ON_PMI and reenable both the LBRs and the CTRs with the
>> > STATUS_RESET MSR, right?
>> >
>> > Does it make sense to have a new handle_irq() routine for that?
>>
>> Were we not already using FREEZE_ON_PMI with LBR (except for one
>> erratum on HSW)?
>
> That's FREEZE_LBRS_ON_PMI, I was referring to FREEZE_PERFMON_ON_PMI,
> which we've not used so far.
>
Ah, yes that one was not used so far. I don't quite remember why.
I think with PEBS, you don't need it or it should be off or something like this.
> I think Andi tried using it before, but there's some issues with it on
> v3, but v4 should have fixed all that.
>
I was referring to a LBR issue on v3 (HSW) and call stack mode.
> Andi can you perhaps explain what the problem with FREEZE_PERFMON_ON_PMI
> on v3 was again?
>
Andi, Is that what I am alluding to above?
>> It would make sense to me to have an "optimized" and clean handle_irq
>> for the newer PMU.
>> We the caveat that any change to the core of it would now have to be done twice.
>
> We could pull that out in a shared function of course, if possible.
Good.
On Mon, May 11, 2015 at 09:43:41AM -0700, Stephane Eranian wrote:
> On Mon, May 11, 2015 at 9:36 AM, Peter Zijlstra <[email protected]> wrote:
> > On Mon, May 11, 2015 at 09:32:33AM -0700, Stephane Eranian wrote:
> >> >> This is a minimal change. In principle the ACK could be moved much later.
> >> >
> >> > Right, so the more complete change would be to use the new and improved
> >> > FREEZE_ON_PMI and reenable both the LBRs and the CTRs with the
> >> > STATUS_RESET MSR, right?
> >> >
> >> > Does it make sense to have a new handle_irq() routine for that?
> >>
> >> Were we not already using FREEZE_ON_PMI with LBR (except for one
> >> erratum on HSW)?
> >
> > That's FREEZE_LBRS_ON_PMI, I was referring to FREEZE_PERFMON_ON_PMI,
> > which we've not used so far.
> >
> Ah, yes that one was not used so far. I don't quite remember why.
> I think with PEBS, you don't need it or it should be off or something like this.
The LBR freeze should work with call stack mode if you use PEBS events.
So in theory we could allow call-stack lbr for kernel with such a restriction.
But we have a working kernel backtrace anyways, so it's not really critical.
>
> > I think Andi tried using it before, but there's some issues with it on
> > v3, but v4 should have fixed all that.
> >
> I was referring to a LBR issue on v3 (HSW) and call stack mode.
>
> > Andi can you perhaps explain what the problem with FREEZE_PERFMON_ON_PMI
> > on v3 was again?
I'm not sure why we never used it.
The update of the DEBUGCTL can race with other operations, but it's rather
obscure.
But I would not do it right now.
-Andi
--
[email protected] -- Speaking for myself only
Commit-ID: a94cab2376cb35f236be14e2833cef63a8762a31
Gitweb: http://git.kernel.org/tip/a94cab2376cb35f236be14e2833cef63a8762a31
Author: Andi Kleen <[email protected]>
AuthorDate: Sun, 10 May 2015 12:22:39 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Tue, 4 Aug 2015 10:16:55 +0200
perf/x86: Add a native_perf_sched_clock_from_tsc()
PEBSv3 has a raw TSC time stamp in its memory buffer that
later needs to to be converted to perf_clock.
Add a native_sched_clock_from_tsc() that works the same
as native_sched_clock(), but starts with an already given
TSC value.
Paravirt is ignored, it will just get the native clock.
But there isn't a para virtualized PEBS anyway.
Signed-off-by: Andi Kleen <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/include/asm/tsc.h | 1 +
arch/x86/kernel/tsc.c | 8 ++++++++
2 files changed, 9 insertions(+)
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 94605c0..aad56eb 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -51,6 +51,7 @@ extern int unsynchronized_tsc(void);
extern int check_tsc_unstable(void);
extern int check_tsc_disabled(void);
extern unsigned long native_calibrate_tsc(void);
+extern unsigned long long native_sched_clock_from_tsc(u64 tsc);
extern int tsc_clocksource_reliable;
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7437b41..88e9a38 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -296,6 +296,14 @@ u64 native_sched_clock(void)
return cycles_2_ns(tsc_now);
}
+/*
+ * Generate a sched_clock if you already have a TSC value.
+ */
+u64 native_sched_clock_from_tsc(u64 tsc)
+{
+ return cycles_2_ns(tsc);
+}
+
/* We need to define a real function for sched_clock, to override the
weak default version */
#ifdef CONFIG_PARAVIRT
Commit-ID: 2f7ebf2ec2a7b311318aae10b8373b0bd93001a7
Gitweb: http://git.kernel.org/tip/2f7ebf2ec2a7b311318aae10b8373b0bd93001a7
Author: Andi Kleen <[email protected]>
AuthorDate: Sun, 10 May 2015 12:22:40 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Tue, 4 Aug 2015 10:16:56 +0200
perf/x86/intel: Add support for PEBSv3 profiling
PEBSv3 is the same as the existing PEBSv2 used on Haswell,
but it adds a new TSC field. Add support to the generic
PEBS handler to handle the new format, and overwrite
the perf time stamp using the new native_sched_clock_from_tsc().
Right now the time stamp is just slightly more accurate,
as it is nearer the actual event trigger point. With
the PEBS threshold > 1 patchkit it will be much more accurate,
avoid the problems with MMAP mismatches earlier.
The accurate time stamping is only implemented for
the default trace clock for now.
v2: Use _skl prefix. Check for default clock_id.
Signed-off-by: Andi Kleen <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/kernel/cpu/perf_event_intel_ds.c | 36 ++++++++++++++++++++++++++++---
1 file changed, 33 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 71fc402..410270a 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -224,6 +224,19 @@ union hsw_tsx_tuning {
#define PEBS_HSW_TSX_FLAGS 0xff00000000ULL
+/* Same as HSW, plus TSC */
+
+struct pebs_record_skl {
+ u64 flags, ip;
+ u64 ax, bx, cx, dx;
+ u64 si, di, bp, sp;
+ u64 r8, r9, r10, r11;
+ u64 r12, r13, r14, r15;
+ u64 status, dla, dse, lat;
+ u64 real_ip, tsx_tuning;
+ u64 tsc;
+};
+
void init_debug_store_on_cpu(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
@@ -885,7 +898,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
return 0;
}
-static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs)
+static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs)
{
if (pebs->tsx_tuning) {
union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning };
@@ -894,7 +907,7 @@ static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs)
return 0;
}
-static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs)
+static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs)
{
u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32;
@@ -918,7 +931,7 @@ static void setup_pebs_sample_data(struct perf_event *event,
* unconditionally access the 'extra' entries.
*/
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- struct pebs_record_hsw *pebs = __pebs;
+ struct pebs_record_skl *pebs = __pebs;
u64 sample_type;
int fll, fst, dsrc;
int fl = event->hw.flags;
@@ -1016,6 +1029,16 @@ static void setup_pebs_sample_data(struct perf_event *event,
data->txn = intel_hsw_transaction(pebs);
}
+ /*
+ * v3 supplies an accurate time stamp, so we use that
+ * for the time stamp.
+ *
+ * We can only do this for the default trace clock.
+ */
+ if (x86_pmu.intel_cap.pebs_format >= 3 &&
+ event->attr.use_clockid == 0)
+ data->time = native_sched_clock_from_tsc(pebs->tsc);
+
if (has_branch_stack(event))
data->br_stack = &cpuc->lbr_stack;
}
@@ -1245,6 +1268,13 @@ void __init intel_ds_init(void)
x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
break;
+ case 3:
+ pr_cont("PEBS fmt3%c, ", pebs_type);
+ x86_pmu.pebs_record_size =
+ sizeof(struct pebs_record_skl);
+ x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+ break;
+
default:
printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
x86_pmu.pebs = 0;
Commit-ID: b83ff1c8617aac03a1cf807aafa848fe0f0908f2
Gitweb: http://git.kernel.org/tip/b83ff1c8617aac03a1cf807aafa848fe0f0908f2
Author: Andi Kleen <[email protected]>
AuthorDate: Sun, 10 May 2015 12:22:41 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Tue, 4 Aug 2015 10:16:56 +0200
x86: Add new MSRs and MSR bits used for Intel Skylake PMU support
Add new MSRs (LBR_INFO) and some new MSR bits used by the Intel Skylake
PMU driver.
Signed-off-by: Andi Kleen <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/include/asm/msr-index.h | 6 ++++++
arch/x86/include/asm/perf_event.h | 7 +++++++
2 files changed, 13 insertions(+)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index c665d34..fcd17c1 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -73,6 +73,12 @@
#define MSR_LBR_CORE_FROM 0x00000040
#define MSR_LBR_CORE_TO 0x00000060
+#define MSR_LBR_INFO_0 0x00000dc0 /* ... 0xddf for _31 */
+#define LBR_INFO_MISPRED BIT_ULL(63)
+#define LBR_INFO_IN_TX BIT_ULL(62)
+#define LBR_INFO_ABORT BIT_ULL(61)
+#define LBR_INFO_CYCLES 0xffff
+
#define MSR_IA32_PEBS_ENABLE 0x000003f1
#define MSR_IA32_DS_AREA 0x00000600
#define MSR_IA32_PERF_CAPABILITIES 0x00000345
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index dc0f6ed..7bcb861 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -159,6 +159,13 @@ struct x86_pmu_capability {
*/
#define INTEL_PMC_IDX_FIXED_BTS (INTEL_PMC_IDX_FIXED + 16)
+#define GLOBAL_STATUS_COND_CHG BIT_ULL(63)
+#define GLOBAL_STATUS_BUFFER_OVF BIT_ULL(62)
+#define GLOBAL_STATUS_UNC_OVF BIT_ULL(61)
+#define GLOBAL_STATUS_ASIF BIT_ULL(60)
+#define GLOBAL_STATUS_COUNTERS_FROZEN BIT_ULL(59)
+#define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(58)
+
/*
* IBS cpuid feature detection
*/
Commit-ID: 71ef3c6b9d4665ee7afbbe4c208a98917dcfc32f
Gitweb: http://git.kernel.org/tip/71ef3c6b9d4665ee7afbbe4c208a98917dcfc32f
Author: Andi Kleen <[email protected]>
AuthorDate: Sun, 10 May 2015 12:22:42 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Tue, 4 Aug 2015 10:16:57 +0200
perf: Add cycles to branch_info
Intel Skylake supports reporting the time in cycles a branch in the LBR
took, to give a rough indication of the basic block performance.
Export the cycle information in the branch_info structure.
This can be done by just reusing some currently zero padding.
This is just the generic header change. The architecture
still needs to fill it in.
There's no attempt to convert to real time, as we really
want cycles here.
Signed-off-by: Andi Kleen <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
include/uapi/linux/perf_event.h | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 022d0ac..2881145 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -951,6 +951,7 @@ union perf_mem_data_src {
*
* in_tx: running in a hardware transaction
* abort: aborting a hardware transaction
+ * cycles: cycles from last branch (or 0 if not supported)
*/
struct perf_branch_entry {
__u64 from;
@@ -959,7 +960,8 @@ struct perf_branch_entry {
predicted:1,/* target predicted */
in_tx:1, /* in transaction */
abort:1, /* transaction abort */
- reserved:60;
+ cycles:16, /* cycle count to last branch */
+ reserved:44;
};
#endif /* _UAPI_LINUX_PERF_EVENT_H */
Commit-ID: 50eab8f6ecd77ae4f9742f8e21ea50705ce0f830
Gitweb: http://git.kernel.org/tip/50eab8f6ecd77ae4f9742f8e21ea50705ce0f830
Author: Andi Kleen <[email protected]>
AuthorDate: Sun, 10 May 2015 12:22:43 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Tue, 4 Aug 2015 10:16:57 +0200
perf/x86/intel/lbr: Add support for LBRv5
Add support for the new LBRv5 format used on Intel Skylake CPUs.
The flags for mispredict, abort, in_tx etc. moved to range of separate
LBR_INFO_* MSRs. Teach the LBR code to read those. The original
LBR registers stay the same, except they have full sign
extension now.
LBR_INFO also reports a cycle count to the last branch.
Report the cycle information using the new "cycles" branch_info
output field.
In addition we have to context switch and clear the new INFO
MSRs to avoid any information leaks.
Signed-off-by: Andi Kleen <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/kernel/cpu/perf_event.h | 1 +
arch/x86/kernel/cpu/perf_event_intel_lbr.c | 21 ++++++++++++++++++++-
2 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 7378b10..e9c5bdf 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -625,6 +625,7 @@ struct x86_pmu {
struct x86_perf_task_context {
u64 lbr_from[MAX_LBR_ENTRIES];
u64 lbr_to[MAX_LBR_ENTRIES];
+ u64 lbr_info[MAX_LBR_ENTRIES];
int lbr_callstack_users;
int lbr_stack_state;
};
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 452a7bd..2fb5737 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -13,7 +13,8 @@ enum {
LBR_FORMAT_EIP = 0x02,
LBR_FORMAT_EIP_FLAGS = 0x03,
LBR_FORMAT_EIP_FLAGS2 = 0x04,
- LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_EIP_FLAGS2,
+ LBR_FORMAT_INFO = 0x05,
+ LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_INFO,
};
static enum {
@@ -186,6 +187,8 @@ static void intel_pmu_lbr_reset_64(void)
for (i = 0; i < x86_pmu.lbr_nr; i++) {
wrmsrl(x86_pmu.lbr_from + i, 0);
wrmsrl(x86_pmu.lbr_to + i, 0);
+ if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+ wrmsrl(MSR_LBR_INFO_0 + i, 0);
}
}
@@ -234,6 +237,8 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
lbr_idx = (tos - i) & mask;
wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+ if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+ wrmsrl(MSR_LBR_INFO_0 + i, task_ctx->lbr_info[i]);
}
task_ctx->lbr_stack_state = LBR_NONE;
}
@@ -255,6 +260,8 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
lbr_idx = (tos - i) & mask;
rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+ if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+ rdmsrl(MSR_LBR_INFO_0 + i, task_ctx->lbr_info[i]);
}
task_ctx->lbr_stack_state = LBR_VALID;
}
@@ -416,11 +423,22 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
unsigned long lbr_idx = (tos - i) & mask;
u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0;
int skip = 0;
+ u16 cycles = 0;
int lbr_flags = lbr_desc[lbr_format];
rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
+ if (lbr_format == LBR_FORMAT_INFO) {
+ u64 info;
+
+ rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info);
+ mis = !!(info & LBR_INFO_MISPRED);
+ pred = !mis;
+ in_tx = !!(info & LBR_INFO_IN_TX);
+ abort = !!(info & LBR_INFO_ABORT);
+ cycles = (info & LBR_INFO_CYCLES);
+ }
if (lbr_flags & LBR_EIP_FLAGS) {
mis = !!(from & LBR_FROM_FLAG_MISPRED);
pred = !mis;
@@ -450,6 +468,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
cpuc->lbr_entries[out].predicted = pred;
cpuc->lbr_entries[out].in_tx = in_tx;
cpuc->lbr_entries[out].abort = abort;
+ cpuc->lbr_entries[out].cycles = cycles;
cpuc->lbr_entries[out].reserved = 0;
out++;
}
Commit-ID: d8020bee1d0caa90e7b9d6f39ac1fdfaaee7f67f
Gitweb: http://git.kernel.org/tip/d8020bee1d0caa90e7b9d6f39ac1fdfaaee7f67f
Author: Andi Kleen <[email protected]>
AuthorDate: Sun, 10 May 2015 12:22:45 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Tue, 4 Aug 2015 10:16:57 +0200
perf/x86/intel: Handle new arch perfmon v4 status bits
ArchPerfmon v4 has some new status bits in GLOBAL_STATUS.
These need to be ignored when deciding whether a NMI
was an NMI, to avoid eating all NMIs when they
stay set, see:
b292d7a10487 ("perf/x86/intel: ignore CondChgd bit to avoid false NMI handling")
This patch ignores the new ASIF bit, which indicates
that SGX interfered with the PMU, and also the new
LBR freezing bits, which are set when the LBRs get
frozen, plus the existing CondChange (set by JTAG
debuggers and some buggy BIOSes)
Signed-off-by: Andi Kleen <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/kernel/cpu/perf_event_intel.c | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index cb112bf..52c9ded 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1611,13 +1611,14 @@ again:
intel_pmu_lbr_read();
/*
- * CondChgd bit 63 doesn't mean any overflow status. Ignore
- * and clear the bit.
+ * Ignore a range of extra bits in status that do not indicate
+ * overflow by themselves.
*/
- if (__test_and_clear_bit(63, (unsigned long *)&status)) {
- if (!status)
- goto done;
- }
+ status &= ~(GLOBAL_STATUS_COND_CHG |
+ GLOBAL_STATUS_ASIF |
+ GLOBAL_STATUS_LBRS_FROZEN);
+ if (!status)
+ goto done;
/*
* PEBS overflow sets bit 62 in the global status register
Commit-ID: 0f29e573dd32bb8598e74271454e97c962da5e05
Gitweb: http://git.kernel.org/tip/0f29e573dd32bb8598e74271454e97c962da5e05
Author: Andi Kleen <[email protected]>
AuthorDate: Sun, 10 May 2015 12:22:47 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Tue, 4 Aug 2015 10:16:58 +0200
perf/x86/intel: Move PMU ACK to after LBR read
With Arch Perfmon v4 the PMU ack unfreezes the LBRs. So we need to do
the PMU ack after the LBR reading, otherwise the LBRs would be polluted
by the PMI handler.
This is a minimal change. In principle the ACK could be moved much later.
Signed-off-by: Andi Kleen <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/kernel/cpu/perf_event_intel.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 52c9ded..da93b4b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1594,6 +1594,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
loops = 0;
again:
+ intel_pmu_lbr_read();
intel_pmu_ack_status(status);
if (++loops > 100) {
static bool warned = false;
@@ -1608,7 +1609,6 @@ again:
inc_irq_stat(apic_perf_irqs);
- intel_pmu_lbr_read();
/*
* Ignore a range of extra bits in status that do not indicate
Commit-ID: 425507fa5f321bb5ce1b5eb57a9586e0cf0b9802
Gitweb: http://git.kernel.org/tip/425507fa5f321bb5ce1b5eb57a9586e0cf0b9802
Author: Andi Kleen <[email protected]>
AuthorDate: Sun, 10 May 2015 12:22:46 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Tue, 4 Aug 2015 10:16:58 +0200
perf/x86/intel/lbr: Optimize v4 LBR unfreezing
In Arch perfmon v4 the GLOBAL_STATUS reset automatically unfreezes
LBRs. So no need to do it manually in the LBR code. Add a check
to skip it.
v2: Move test up to beginning of function.
Signed-off-by: Andi Kleen <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/kernel/cpu/perf_event_intel_lbr.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 2fb5737..769a42f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -141,6 +141,13 @@ static void __intel_pmu_lbr_enable(bool pmi)
u64 debugctl, lbr_select = 0, orig_debugctl;
/*
+ * No need to unfreeze manually, as v4 can do that as part
+ * of the GLOBAL_STATUS ack.
+ */
+ if (pmi && x86_pmu.version >= 4)
+ return;
+
+ /*
* No need to reprogram LBR_SELECT in a PMI, as it
* did not change.
*/
Commit-ID: 9a92e16fd7b4ccd9aabcbc4d42a3fb5f9a3cf4a1
Gitweb: http://git.kernel.org/tip/9a92e16fd7b4ccd9aabcbc4d42a3fb5f9a3cf4a1
Author: Andi Kleen <[email protected]>
AuthorDate: Sun, 10 May 2015 12:22:44 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Tue, 4 Aug 2015 10:16:58 +0200
perf/x86/intel: Add Intel Skylake PMU support
Add perf core PMU support for future Intel Skylake CPU cores.
The code is based on Haswell/Broadwell.
There is a new cache event list, based on the updated Haswell
event list.
Skylake has removed most counter constraints on basic
events, so the basic constraints table now only has a single
entry (plus the fixed counters).
TSX support and various other setups are all shared with Haswell.
Skylake has 32 LBR entries. Add a new LBR init function
to set this up. The filters are all the same as Haswell.
It also has a new LBR format with a separate LBR_INFO_* MSR,
but that has been already added earlier.
Signed-off-by: Andi Kleen <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/kernel/cpu/perf_event.h | 6 +-
arch/x86/kernel/cpu/perf_event_intel.c | 232 +++++++++++++++++++++++++++++
arch/x86/kernel/cpu/perf_event_intel_ds.c | 22 +++
arch/x86/kernel/cpu/perf_event_intel_lbr.c | 20 +++
4 files changed, 279 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index e9c5bdf..8ad9241 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -165,7 +165,7 @@ struct intel_excl_cntrs {
unsigned core_id; /* per-core: core id */
};
-#define MAX_LBR_ENTRIES 16
+#define MAX_LBR_ENTRIES 32
enum {
X86_PERF_KFREE_SHARED = 0,
@@ -861,6 +861,8 @@ extern struct event_constraint intel_ivb_pebs_event_constraints[];
extern struct event_constraint intel_hsw_pebs_event_constraints[];
+extern struct event_constraint intel_skl_pebs_event_constraints[];
+
struct event_constraint *intel_pebs_constraints(struct perf_event *event);
void intel_pmu_pebs_enable(struct perf_event *event);
@@ -899,6 +901,8 @@ void intel_pmu_lbr_init_snb(void);
void intel_pmu_lbr_init_hsw(void);
+void intel_pmu_lbr_init_skl(void);
+
int intel_pmu_setup_lbr_filter(struct perf_event *event);
void intel_pt_interrupt(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index da93b4b..28fc272 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -177,6 +177,14 @@ static struct event_constraint intel_slm_event_constraints[] __read_mostly =
EVENT_CONSTRAINT_END
};
+struct event_constraint intel_skl_event_constraints[] = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */
+ EVENT_CONSTRAINT_END
+};
+
static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
@@ -193,6 +201,13 @@ static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {
EVENT_EXTRA_END
};
+static struct extra_reg intel_skl_extra_regs[] __read_mostly = {
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+ EVENT_EXTRA_END
+};
+
EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3");
EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3");
EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2");
@@ -244,6 +259,200 @@ static u64 intel_pmu_event_map(int hw_event)
return intel_perfmon_event_map[hw_event];
}
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts.
+ * - icache miss does not include decoded icache
+ */
+
+#define SKL_DEMAND_DATA_RD BIT_ULL(0)
+#define SKL_DEMAND_RFO BIT_ULL(1)
+#define SKL_ANY_RESPONSE BIT_ULL(16)
+#define SKL_SUPPLIER_NONE BIT_ULL(17)
+#define SKL_L3_MISS_LOCAL_DRAM BIT_ULL(26)
+#define SKL_L3_MISS_REMOTE_HOP0_DRAM BIT_ULL(27)
+#define SKL_L3_MISS_REMOTE_HOP1_DRAM BIT_ULL(28)
+#define SKL_L3_MISS_REMOTE_HOP2P_DRAM BIT_ULL(29)
+#define SKL_L3_MISS (SKL_L3_MISS_LOCAL_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP0_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP1_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP2P_DRAM)
+#define SKL_SPL_HIT BIT_ULL(30)
+#define SKL_SNOOP_NONE BIT_ULL(31)
+#define SKL_SNOOP_NOT_NEEDED BIT_ULL(32)
+#define SKL_SNOOP_MISS BIT_ULL(33)
+#define SKL_SNOOP_HIT_NO_FWD BIT_ULL(34)
+#define SKL_SNOOP_HIT_WITH_FWD BIT_ULL(35)
+#define SKL_SNOOP_HITM BIT_ULL(36)
+#define SKL_SNOOP_NON_DRAM BIT_ULL(37)
+#define SKL_ANY_SNOOP (SKL_SPL_HIT|SKL_SNOOP_NONE| \
+ SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
+ SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
+ SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM)
+#define SKL_DEMAND_READ SKL_DEMAND_DATA_RD
+#define SKL_SNOOP_DRAM (SKL_SNOOP_NONE| \
+ SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
+ SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
+ SKL_SNOOP_HITM|SKL_SPL_HIT)
+#define SKL_DEMAND_WRITE SKL_DEMAND_RFO
+#define SKL_LLC_ACCESS SKL_ANY_RESPONSE
+#define SKL_L3_MISS_REMOTE (SKL_L3_MISS_REMOTE_HOP0_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP1_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP2P_DRAM)
+
+static __initconst const u64 skl_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x283, /* ICACHE_64B.MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0x608, /* DTLB_LOAD_MISSES.WALK_COMPLETED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0x649, /* DTLB_STORE_MISSES.WALK_COMPLETED */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2085, /* ITLB_MISSES.STLB_HIT */
+ [ C(RESULT_MISS) ] = 0xe85, /* ITLB_MISSES.WALK_COMPLETED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
+static __initconst const u64 skl_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
+ SKL_LLC_ACCESS|SKL_ANY_SNOOP,
+ [ C(RESULT_MISS) ] = SKL_DEMAND_READ|
+ SKL_L3_MISS|SKL_ANY_SNOOP|
+ SKL_SUPPLIER_NONE,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
+ SKL_LLC_ACCESS|SKL_ANY_SNOOP,
+ [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE|
+ SKL_L3_MISS|SKL_ANY_SNOOP|
+ SKL_SUPPLIER_NONE,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
+ SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
+ [ C(RESULT_MISS) ] = SKL_DEMAND_READ|
+ SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
+ SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
+ [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE|
+ SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
#define SNB_DMND_DATA_RD (1ULL << 0)
#define SNB_DMND_RFO (1ULL << 1)
#define SNB_DMND_IFETCH (1ULL << 2)
@@ -3278,6 +3487,29 @@ __init int intel_pmu_init(void)
pr_cont("Broadwell events, ");
break;
+ case 78: /* 14nm Skylake Mobile */
+ case 94: /* 14nm Skylake Desktop */
+ x86_pmu.late_ack = true;
+ memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+ intel_pmu_lbr_init_skl();
+
+ x86_pmu.event_constraints = intel_skl_event_constraints;
+ x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_skl_extra_regs;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+ x86_pmu.hw_config = hsw_hw_config;
+ x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ x86_pmu.cpu_events = hsw_events_attrs;
+ WARN_ON(!x86_pmu.format_attrs);
+ x86_pmu.cpu_events = hsw_events_attrs;
+ pr_cont("Skylake events, ");
+ break;
+
default:
switch (x86_pmu.version) {
case 1:
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 03773c2..2f7ee05 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -688,6 +688,28 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = {
EVENT_CONSTRAINT_END
};
+struct event_constraint intel_skl_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
+ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ INTEL_PLD_CONSTRAINT(0x1cd, 0xf), /* MEM_TRANS_RETIRED.* */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x22d0, 0xf), /* MEM_INST_RETIRED.LOCK_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_L3_MISS_RETIRED.* */
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+ EVENT_CONSTRAINT_END
+};
+
struct event_constraint *intel_pebs_constraints(struct perf_event *event)
{
struct event_constraint *c;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 769a42f..b432c47 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -973,6 +973,26 @@ void intel_pmu_lbr_init_hsw(void)
pr_cont("16-deep LBR, ");
}
+/* skylake */
+__init void intel_pmu_lbr_init_skl(void)
+{
+ x86_pmu.lbr_nr = 32;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+ x86_pmu.lbr_to = MSR_LBR_NHM_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
+
+ /*
+ * SW branch filter usage:
+ * - support syscall, sysret capture.
+ * That requires LBR_FAR but that means far
+ * jmp need to be filtered out
+ */
+ pr_cont("32-deep LBR, ");
+}
+
/* atom */
void __init intel_pmu_lbr_init_atom(void)
{