LinuxLists.cc - perf, x86: Add parts of the remaining haswell PMU functionality v3

2013-08-21 23:47:31

Subject: perf, x86: Add parts of the remaining haswell PMU functionality v3

I hope this version is ok for everyone now.

[v2: Added Peter's changes to the PEBS handler]
[v3: Addressed Arnaldo's feedback for the perf stat -T change
and avoid conflict]

Add some more TSX functionality to the basic Haswell PMU.

A lot of the infrastructure needed for these patches has
been merged earlier, so it is all quite straight forward
now.

- Add the checkpointed counter workaround.
(Parts of this have been already merged earlier)
- Add support for reporting PEBS transaction abort cost as weight.
This is useful to judge the cost of aborts and concentrate
on expensive ones first.
(Large parts of this have been already merged earlier,
this is just adding the final few lines to the PEBS handler)
- Add TSX event aliases, needed for perf stat -T and general
usability.
(Infrastructure also already in)
- Add perf stat -T support to give a user friendly highlevel
counting frontend for transaction..
This version should also be usable for POWER8 eventually.

Not included:

Support for transaction flags and TSX LBR flags.

-Andi

2013-08-21 23:47:36

by Andi Kleen

[permalink] [raw]

Subject: [PATCH 2/4] perf, x86: Report TSX transaction abort cost as weight v2

From: Andi Kleen <[email protected]>

Use the existing weight reporting facility to report the transaction
abort cost, that is the number of cycles wasted in aborts.
Haswell reports this in the PEBS record.

This was in fact the original user for weight.

This is a very useful sort key to concentrate on the most
costly aborts and a good metric for TSX tuning.

v2: Add Peter's changes with minor modifications. More comments.
Signed-off-by: Andi Kleen <[email protected]>
---
arch/x86/kernel/cpu/perf_event_intel_ds.c | 55 +++++++++++++++++++++++--------
1 file changed, 42 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 3065c57..ede2e40 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -182,16 +182,29 @@ struct pebs_record_nhm {
* Same as pebs_record_nhm, with two additional fields.
*/
struct pebs_record_hsw {
- struct pebs_record_nhm nhm;
- /*
- * Real IP of the event. In the Intel documentation this
- * is called eventingrip.
- */
- u64 real_ip;
- /*
- * TSX tuning information field: abort cycles and abort flags.
- */
- u64 tsx_tuning;
+ u64 flags, ip;
+ u64 ax, bx, cx, dx;
+ u64 si, di, bp, sp;
+ u64 r8, r9, r10, r11;
+ u64 r12, r13, r14, r15;
+ u64 status, dla, dse, lat;
+ u64 real_ip; /* the actual eventing ip */
+ u64 tsx_tuning; /* TSX abort cycles and flags */
+};
+
+union hsw_tsx_tuning {
+ struct {
+ u32 cycles_last_block : 32,
+ hle_abort : 1,
+ rtm_abort : 1,
+ instruction_abort : 1,
+ non_instruction_abort : 1,
+ retry : 1,
+ data_conflict : 1,
+ capacity_writes : 1,
+ capacity_reads : 1;
+ };
+ u64 value;
};

void init_debug_store_on_cpu(int cpu)
@@ -759,16 +772,26 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
return 0;
}

+static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs)
+{
+ if (pebs->tsx_tuning) {
+ union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning };
+ return tsx.cycles_last_block;
+ }
+ return 0;
+}
+
static void __intel_pmu_pebs_event(struct perf_event *event,
struct pt_regs *iregs, void *__pebs)
{
/*
* We cast to pebs_record_nhm to get the load latency data
* if extra_reg MSR_PEBS_LD_LAT_THRESHOLD used
+ * We cast to the biggest PEBS record are careful not
+ * to access out-of-bounds members.
*/
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct pebs_record_nhm *pebs = __pebs;
- struct pebs_record_hsw *pebs_hsw = __pebs;
+ struct pebs_record_hsw *pebs= __pebs;
struct perf_sample_data data;
struct pt_regs regs;
u64 sample_type;
@@ -827,7 +850,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
regs.sp = pebs->sp;

if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) {
- regs.ip = pebs_hsw->real_ip;
+ regs.ip = pebs->real_ip;
regs.flags |= PERF_EFLAGS_EXACT;
} else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(&regs))
regs.flags |= PERF_EFLAGS_EXACT;
@@ -838,6 +861,12 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
x86_pmu.intel_cap.pebs_format >= 1)
data.addr = pebs->dla;

+ /* Only set the TSX weight when no memory weight was requested. */
+ if ((event->attr.sample_type & PERF_SAMPLE_WEIGHT) &&
+ !fll &&
+ (x86_pmu.intel_cap.pebs_format >= 2))
+ data.weight = intel_hsw_weight(pebs);
+
if (has_branch_stack(event))
data.br_stack = &cpuc->lbr_stack;

--
1.8.3.1

2013-08-21 23:48:00

by Andi Kleen

[permalink] [raw]

Subject: [PATCH 4/4] perf, tools: Add perf stat --transaction v5

From: Andi Kleen <[email protected]>

Add support to perf stat to print the basic transactional execution statistics:
Total cycles, Cycles in Transaction, Cycles in aborted transsactions
using the in_tx and in_tx_checkpoint qualifiers.
Transaction Starts and Elision Starts, to compute the average transaction
length.

This is a reasonable overview over the success of the transactions.

Also support architectures that have a transaction aborted cycles
counter like POWER8. Since that is awkward to handle in the kernel
abstract handle both cases here.

Enable with a new --transaction / -T option.

This requires measuring these events in a group, since they depend on each
other.

This is implemented by using TM sysfs events exported by the kernel

v2: Only print the extended statistics when the option is enabled.
This avoids negative output when the user specifies the -T events
in separate groups.
v3: Port to latest tree
v4: Remove merge error. Avoid linear walks for comparisons. Check
transaction_run earlier. Minor fixes.
v5: Move option to avoid conflict. Improve description.
Signed-off-by: Andi Kleen <[email protected]>
---
tools/perf/Documentation/perf-stat.txt | 5 ++
tools/perf/builtin-stat.c | 144 ++++++++++++++++++++++++++++++++-
tools/perf/util/evsel.h | 6 ++
tools/perf/util/pmu.c | 16 ++++
tools/perf/util/pmu.h | 1 +
5 files changed, 171 insertions(+), 1 deletion(-)

diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 2fe87fb..40bc65a 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -132,6 +132,11 @@ is a useful mode to detect imbalance between physical cores. To enable this mod
use --per-core in addition to -a. (system-wide). The output includes the
core number and the number of online logical processors on that physical processor.

+-T::
+--transaction::
+
+Print statistics of transactional execution if supported.
+
EXAMPLES
--------

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 352fbd7..6bd90e4 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -46,6 +46,7 @@
#include "util/util.h"
#include "util/parse-options.h"
#include "util/parse-events.h"
+#include "util/pmu.h"
#include "util/event.h"
#include "util/evlist.h"
#include "util/evsel.h"
@@ -70,6 +71,41 @@ static void print_counter_aggr(struct perf_evsel *counter, char *prefix);
static void print_counter(struct perf_evsel *counter, char *prefix);
static void print_aggr(char *prefix);

+/* Default events used for perf stat -T */
+static const char * const transaction_attrs[] = {
+ "task-clock",
+ "{"
+ "instructions,"
+ "cycles,"
+ "cpu/cycles-t/,"
+ "cpu/tx-start/,"
+ "cpu/el-start/,"
+ "cpu/cycles-ct/"
+ "}"
+};
+
+/* More limited version when the CPU does not have all events. */
+static const char * const transaction_limited_attrs[] = {
+ "task-clock",
+ "{"
+ "instructions,"
+ "cycles,"
+ "cpu/cycles-t/,"
+ "cpu/tx-start/"
+ "}"
+};
+
+/* must match transaction_attrs and the beginning limited_attrs */
+enum {
+ T_TASK_CLOCK,
+ T_INSTRUCTIONS,
+ T_CYCLES,
+ T_CYCLES_IN_TX,
+ T_TRANSACTION_START,
+ T_ELISION_START,
+ T_CYCLES_IN_TX_CP,
+};
+
static struct perf_evlist *evsel_list;

static struct perf_target target = {
@@ -90,6 +126,7 @@ static enum aggr_mode aggr_mode = AGGR_GLOBAL;
static volatile pid_t child_pid = -1;
static bool null_run = false;
static int detailed_run = 0;
+static bool transaction_run;
static bool big_num = true;
static int big_num_opt = -1;
static const char *csv_sep = NULL;
@@ -213,7 +250,10 @@ static struct stats runtime_l1_icache_stats[MAX_NR_CPUS];
static struct stats runtime_ll_cache_stats[MAX_NR_CPUS];
static struct stats runtime_itlb_cache_stats[MAX_NR_CPUS];
static struct stats runtime_dtlb_cache_stats[MAX_NR_CPUS];
+static struct stats runtime_cycles_in_tx_stats[MAX_NR_CPUS];
static struct stats walltime_nsecs_stats;
+static struct stats runtime_transaction_stats[MAX_NR_CPUS];
+static struct stats runtime_elision_stats[MAX_NR_CPUS];

static void perf_stat__reset_stats(struct perf_evlist *evlist)
{
@@ -235,6 +275,11 @@ static void perf_stat__reset_stats(struct perf_evlist *evlist)
memset(runtime_ll_cache_stats, 0, sizeof(runtime_ll_cache_stats));
memset(runtime_itlb_cache_stats, 0, sizeof(runtime_itlb_cache_stats));
memset(runtime_dtlb_cache_stats, 0, sizeof(runtime_dtlb_cache_stats));
+ memset(runtime_cycles_in_tx_stats, 0,
+ sizeof(runtime_cycles_in_tx_stats));
+ memset(runtime_transaction_stats, 0,
+ sizeof(runtime_transaction_stats));
+ memset(runtime_elision_stats, 0, sizeof(runtime_elision_stats));
memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats));
}

@@ -272,6 +317,29 @@ static inline int nsec_counter(struct perf_evsel *evsel)
return 0;
}

+static struct perf_evsel *nth_evsel(int n)
+{
+ static struct perf_evsel **array;
+ static int array_len;
+ struct perf_evsel *ev;
+ int j;
+
+ /* Assumes this only called when evsel_list does not change anymore. */
+ if (!array) {
+ list_for_each_entry(ev, &evsel_list->entries, node)
+ array_len++;
+ array = malloc(array_len * sizeof(void *));
+ if (!array)
+ exit(ENOMEM);
+ j = 0;
+ list_for_each_entry(ev, &evsel_list->entries, node)
+ array[j++] = ev;
+ }
+ if (n < array_len)
+ return array[n];
+ return NULL;
+}
+
/*
* Update various tracking values we maintain to print
* more semantic information such as miss/hit ratios,
@@ -283,6 +351,15 @@ static void update_shadow_stats(struct perf_evsel *counter, u64 *count)
update_stats(&runtime_nsecs_stats[0], count[0]);
else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
update_stats(&runtime_cycles_stats[0], count[0]);
+ else if (transaction_run &&
+ perf_evsel__cmp(counter, nth_evsel(T_CYCLES_IN_TX)))
+ update_stats(&runtime_cycles_in_tx_stats[0], count[0]);
+ else if (transaction_run &&
+ perf_evsel__cmp(counter, nth_evsel(T_TRANSACTION_START)))
+ update_stats(&runtime_transaction_stats[0], count[0]);
+ else if (transaction_run &&
+ perf_evsel__cmp(counter, nth_evsel(T_ELISION_START)))
+ update_stats(&runtime_elision_stats[0], count[0]);
else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
update_stats(&runtime_stalled_cycles_front_stats[0], count[0]);
else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
@@ -807,7 +884,7 @@ static void print_ll_cache_misses(int cpu,

static void abs_printout(int cpu, int nr, struct perf_evsel *evsel, double avg)
{
- double total, ratio = 0.0;
+ double total, ratio = 0.0, total2;
const char *fmt;

if (csv_output)
@@ -903,6 +980,43 @@ static void abs_printout(int cpu, int nr, struct perf_evsel *evsel, double avg)
ratio = 1.0 * avg / total;

fprintf(output, " # %8.3f GHz ", ratio);
+ } else if (transaction_run &&
+ perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX))) {
+ total = avg_stats(&runtime_cycles_stats[cpu]);
+ if (total)
+ fprintf(output,
+ " # %5.2f%% transactional cycles ",
+ 100.0 * (avg / total));
+ } else if (transaction_run &&
+ perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX_CP))) {
+ total = avg_stats(&runtime_cycles_stats[cpu]);
+ total2 = avg_stats(&runtime_cycles_in_tx_stats[cpu]);
+ if (total2 < avg)
+ total2 = avg;
+ if (total)
+ fprintf(output,
+ " # %5.2f%% aborted cycles ",
+ 100.0 * ((total2-avg) / total));
+ } else if (transaction_run &&
+ perf_evsel__cmp(evsel, nth_evsel(T_TRANSACTION_START)) &&
+ avg > 0 &&
+ runtime_cycles_in_tx_stats[cpu].n != 0) {
+ total = avg_stats(&runtime_cycles_in_tx_stats[cpu]);
+
+ if (total)
+ ratio = total / avg;
+
+ fprintf(output, " # %8.0f cycles / transaction ", ratio);
+ } else if (transaction_run &&
+ perf_evsel__cmp(evsel, nth_evsel(T_ELISION_START)) &&
+ avg > 0 &&
+ runtime_cycles_in_tx_stats[cpu].n != 0) {
+ total = avg_stats(&runtime_cycles_in_tx_stats[cpu]);
+
+ if (total)
+ ratio = total / avg;
+
+ fprintf(output, " # %8.0f cycles / elision ", ratio);
} else if (runtime_nsecs_stats[cpu].n != 0) {
char unit = 'M';

@@ -1216,6 +1330,16 @@ static int perf_stat_init_aggr_mode(void)
return 0;
}

+static int setup_events(const char * const *attrs, unsigned len)
+{
+ unsigned i;
+
+ for (i = 0; i < len; i++) {
+ if (parse_events(evsel_list, attrs[i]))
+ return -1;
+ }
+ return 0;
+}

/*
* Add default attributes, if there were no attributes specified or
@@ -1334,6 +1458,22 @@ static int add_default_attributes(void)
if (null_run)
return 0;

+ if (transaction_run) {
+ int err;
+ if (pmu_have_event("cpu", "cycles-ct") &&
+ pmu_have_event("cpu", "el-start"))
+ err = setup_events(transaction_attrs,
+ ARRAY_SIZE(transaction_attrs));
+ else
+ err = setup_events(transaction_limited_attrs,
+ ARRAY_SIZE(transaction_limited_attrs));
+ if (err < 0) {
+ fprintf(stderr, "Cannot set up transaction events\n");
+ return -1;
+ }
+ return 0;
+ }
+
if (!evsel_list->nr_entries) {
if (perf_evlist__add_default_attrs(evsel_list, default_attrs) < 0)
return -1;
@@ -1368,6 +1508,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
int output_fd = 0;
const char *output_name = NULL;
const struct option options[] = {
+ OPT_BOOLEAN('T', "transaction", &transaction_run,
+ "hardware transaction statistics"),
OPT_CALLBACK('e', "event", &evsel_list, "event",
"event selector. use 'perf list' to list available events",
parse_events_option),
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 3f156cc..2f3dc86 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -180,6 +180,12 @@ static inline bool perf_evsel__match2(struct perf_evsel *e1,
(e1->attr.config == e2->attr.config);
}

+#define perf_evsel__cmp(a, b) \
+ ((a) && \
+ (b) && \
+ (a)->attr.type == (b)->attr.type && \
+ (a)->attr.config == (b)->attr.config)
+
int __perf_evsel__read_on_cpu(struct perf_evsel *evsel,
int cpu, int thread, bool scale);

diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index bc9d806..64362fe 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -637,3 +637,19 @@ void print_pmu_events(const char *event_glob, bool name_only)
printf("\n");
free(aliases);
}
+
+bool pmu_have_event(const char *pname, const char *name)
+{
+ struct perf_pmu *pmu;
+ struct perf_pmu_alias *alias;
+
+ pmu = NULL;
+ while ((pmu = perf_pmu__scan(pmu)) != NULL) {
+ if (strcmp(pname, pmu->name))
+ continue;
+ list_for_each_entry(alias, &pmu->aliases, list)
+ if (!strcmp(alias->name, name))
+ return true;
+ }
+ return false;
+}
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index 6b2cbe2..1179b26 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -42,6 +42,7 @@ int perf_pmu__format_parse(char *dir, struct list_head *head);
struct perf_pmu *perf_pmu__scan(struct perf_pmu *pmu);

void print_pmu_events(const char *event_glob, bool name_only);
+bool pmu_have_event(const char *pname, const char *name);

int perf_pmu__test(void);
#endif /* __PMU_H */
--
1.8.3.1

2013-08-21 23:48:41

by Andi Kleen

[permalink] [raw]

Subject: [PATCH 1/4] perf, x86: Avoid checkpointed counters causing excessive TSX aborts v4

From: Andi Kleen <[email protected]>

With checkpointed counters there can be a situation where the counter
is overflowing, aborts the transaction, is set back to a non overflowing
checkpoint, causes interupt. The interrupt doesn't see the overflow
because it has been checkpointed. This is then a spurious PMI, typically with
a ugly NMI message. It can also lead to excessive aborts.

Avoid this problem by:
- Using the full counter width for counting counters (earlier patch)
- Forbid sampling for checkpointed counters. It's not too useful anyways,
checkpointing is mainly for counting. The check is approximate
(to still handle KVM), but should catch the majority of cases.
- On a PMI always set back checkpointed counters to zero.

v2: Add unlikely. Add comment
v3: Allow large sampling periods with CP for KVM
v4: Use event_is_checkpointed. Use EOPNOTSUPP. (Stephane Eranian)
Signed-off-by: Andi Kleen <[email protected]>
---
arch/x86/kernel/cpu/perf_event_intel.c | 39 ++++++++++++++++++++++++++++++++++
1 file changed, 39 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index a45d8d4..9218025 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1134,6 +1134,11 @@ static void intel_pmu_enable_event(struct perf_event *event)
__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
}

+static inline bool event_is_checkpointed(struct perf_event *event)
+{
+ return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0;
+}
+
/*
* Save and restart an expired event. Called by NMI contexts,
* so it has to be careful about preempting normal event ops:
@@ -1141,6 +1146,17 @@ static void intel_pmu_enable_event(struct perf_event *event)
int intel_pmu_save_and_restart(struct perf_event *event)
{
x86_perf_event_update(event);
+ /*
+ * For a checkpointed counter always reset back to 0. This
+ * avoids a situation where the counter overflows, aborts the
+ * transaction and is then set back to shortly before the
+ * overflow, and overflows and aborts again.
+ */
+ if (unlikely(event_is_checkpointed(event))) {
+ /* No race with NMIs because the counter should not be armed */
+ wrmsrl(event->hw.event_base, 0);
+ local64_set(&event->hw.prev_count, 0);
+ }
return x86_perf_event_set_period(event);
}

@@ -1224,6 +1240,15 @@ again:
x86_pmu.drain_pebs(regs);
}

+ /*
+ * To avoid spurious interrupts with perf stat always reset checkpointed
+ * counters.
+ *
+ * XXX move somewhere else.
+ */
+ if (cpuc->events[2] && event_is_checkpointed(cpuc->events[2]))
+ status |= (1ULL << 2);
+
for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
struct perf_event *event = cpuc->events[bit];

@@ -1689,6 +1714,20 @@ static int hsw_hw_config(struct perf_event *event)
event->attr.precise_ip > 0))
return -EOPNOTSUPP;

+ if (event_is_checkpointed(event)) {
+ /*
+ * Sampling of checkpointed events can cause situations where
+ * the CPU constantly aborts because of a overflow, which is
+ * then checkpointed back and ignored. Forbid checkpointing
+ * for sampling.
+ *
+ * But still allow a long sampling period, so that perf stat
+ * from KVM works.
+ */
+ if (event->attr.sample_period > 0 &&
+ event->attr.sample_period < 0x7fffffff)
+ return -EOPNOTSUPP;
+ }
return 0;
}

--
1.8.3.1

2013-08-21 23:48:53

by Andi Kleen

[permalink] [raw]

Subject: [PATCH 3/4] perf, x86: Add Haswell TSX event aliases v6

From: Andi Kleen <[email protected]>

Add TSX event aliases, and export them from the kernel to perf.

These are used by perf stat -T and to allow
more user friendly access to events. The events are designed to
be fairly generic and may also apply to other architectures
implementing HTM. They all cover common situations that
happens during tuning of transactional code.

For Haswell we have to separate the HLE and RTM events,
as they are separate in the PMU.

This adds the following events.

tx-start Count start transaction (used by perf stat -T)
tx-commit Count commit of transaction
tx-abort Count all aborts
tx-conflict Count aborts due to conflict with another CPU.
tx-capacity Count capacity aborts (transaction too large)

Then matching el-* events for HLE

cycles-t Transactional cycles (used by perf stat -T)
* also exists on POWER8
cycles-ct Transactional cycles commited (used by perf stat -T)
* according to Michael Ellerman POWER8 has a cycles-transactional-committed,
* perf stat -T handles both cases

Note for useful abort profiling often precise has to be set,
as Haswell can only report the point inside the transaction
with precise=2.

(I had another patchkit to allow exporting precise too, but Vince
Weaver pointed out it violates the ABI, so dropped now)

For some classes of aborts, like conflicts, this is not needed,
as it makes more sense to look at the complete critical section.

This gives a clean set of generalized events to examine transaction
success and aborts. Haswell has additional events for TSX, but those are more
specialized for very specific situations.

v2: Move to new sysfs infrastructure
v3: Use own sysfs functions now
v4: Add tx/el-abort-return for better conflict sampling
v5: Different white space.
v6: Cut down events, rewrite description.
Signed-off-by: Andi Kleen <[email protected]>
---
arch/x86/kernel/cpu/perf_event_intel.c | 27 +++++++++++++++++++++++++++
1 file changed, 27 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 9218025..ca7f02c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2076,7 +2076,34 @@ static __init void intel_nehalem_quirk(void)
EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3");
EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82")

+/* Haswell special events */
+EVENT_ATTR_STR(tx-start, tx_start, "event=0xc9,umask=0x1");
+EVENT_ATTR_STR(tx-commit, tx_commit, "event=0xc9,umask=0x2");
+EVENT_ATTR_STR(tx-abort, tx_abort, "event=0xc9,umask=0x4");
+EVENT_ATTR_STR(tx-capacity, tx_capacity, "event=0x54,umask=0x2");
+EVENT_ATTR_STR(tx-conflict, tx_conflict, "event=0x54,umask=0x1");
+EVENT_ATTR_STR(el-start, el_start, "event=0xc8,umask=0x1");
+EVENT_ATTR_STR(el-commit, el_commit, "event=0xc8,umask=0x2");
+EVENT_ATTR_STR(el-abort, el_abort, "event=0xc8,umask=0x4");
+EVENT_ATTR_STR(el-capacity, el_capacity, "event=0x54,umask=0x2");
+EVENT_ATTR_STR(el-conflict, el_conflict, "event=0x54,umask=0x1");
+EVENT_ATTR_STR(cycles-t, cycles_t, "event=0x3c,in_tx=1");
+EVENT_ATTR_STR(cycles-ct, cycles_ct,
+ "event=0x3c,in_tx=1,in_tx_cp=1");
+
static struct attribute *hsw_events_attrs[] = {
+ EVENT_PTR(tx_start),
+ EVENT_PTR(tx_commit),
+ EVENT_PTR(tx_abort),
+ EVENT_PTR(tx_capacity),
+ EVENT_PTR(tx_conflict),
+ EVENT_PTR(el_start),
+ EVENT_PTR(el_commit),
+ EVENT_PTR(el_abort),
+ EVENT_PTR(el_capacity),
+ EVENT_PTR(el_conflict),
+ EVENT_PTR(cycles_t),
+ EVENT_PTR(cycles_ct),
EVENT_PTR(mem_ld_hsw),
EVENT_PTR(mem_st_hsw),
NULL
--
1.8.3.1

2013-08-22 15:41:55

by Arnaldo Carvalho de Melo

[permalink] [raw]

Subject: Re: [PATCH 4/4] perf, tools: Add perf stat --transaction v5

Em Wed, Aug 21, 2013 at 04:47:26PM -0700, Andi Kleen escreveu:
> From: Andi Kleen <[email protected]>
>
> Add support to perf stat to print the basic transactional execution statistics:
> Total cycles, Cycles in Transaction, Cycles in aborted transsactions
> using the in_tx and in_tx_checkpoint qualifiers.
> Transaction Starts and Elision Starts, to compute the average transaction
> length.

Ack

- Arnaldo

2013-08-30 16:02:22

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [PATCH 1/4] perf, x86: Avoid checkpointed counters causing excessive TSX aborts v4

On Wed, Aug 21, 2013 at 04:47:23PM -0700, Andi Kleen wrote:
> @@ -1224,6 +1240,15 @@ again:
> x86_pmu.drain_pebs(regs);
> }
>
> + /*
> + * To avoid spurious interrupts with perf stat always reset checkpointed
> + * counters.
> + *
> + * XXX move somewhere else.
> + */
> + if (cpuc->events[2] && event_is_checkpointed(cpuc->events[2]))
> + status |= (1ULL << 2);
> +
> for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
> struct perf_event *event = cpuc->events[bit];
>

I'm sure I commented on that XXX thing..

Yep, right here:

http://marc.info/?l=linux-kernel&m=137638980210140

Sup?

2013-08-30 20:44:48

by Andi Kleen

[permalink] [raw]

Subject: Re: [PATCH 1/4] perf, x86: Avoid checkpointed counters causing excessive TSX aborts v4

On Fri, Aug 30, 2013 at 06:02:15PM +0200, Peter Zijlstra wrote:
> On Wed, Aug 21, 2013 at 04:47:23PM -0700, Andi Kleen wrote:
> > @@ -1224,6 +1240,15 @@ again:
> > x86_pmu.drain_pebs(regs);
> > }
> >
> > + /*
> > + * To avoid spurious interrupts with perf stat always reset checkpointed
> > + * counters.
> > + *
> > + * XXX move somewhere else.
> > + */
> > + if (cpuc->events[2] && event_is_checkpointed(cpuc->events[2]))
> > + status |= (1ULL << 2);
> > +
> > for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
> > struct perf_event *event = cpuc->events[bit];
> >
>
> I'm sure I commented on that XXX thing..
>
> Yep, right here:

Ok.

>
> http://marc.info/?l=linux-kernel&m=137638980210140
>
> Sup?

I originally thought about precomputing the mask somewhere,
but it's not really that bad a place.

So can just drop the XXX comment. Ok?

-Andi

--
[email protected] -- Speaking for myself only

2013-08-31 15:08:09

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [PATCH 1/4] perf, x86: Avoid checkpointed counters causing excessive TSX aborts v4

On Fri, Aug 30, 2013 at 01:44:45PM -0700, Andi Kleen wrote:
> On Fri, Aug 30, 2013 at 06:02:15PM +0200, Peter Zijlstra wrote:
> > On Wed, Aug 21, 2013 at 04:47:23PM -0700, Andi Kleen wrote:
> > > @@ -1224,6 +1240,15 @@ again:
> > > x86_pmu.drain_pebs(regs);
> > > }
> > >
> > > + /*
> > > + * To avoid spurious interrupts with perf stat always reset checkpointed
> > > + * counters.
> > > + *
> > > + * XXX move somewhere else.
> > > + */
> > > + if (cpuc->events[2] && event_is_checkpointed(cpuc->events[2]))
> > > + status |= (1ULL << 2);
> > > +
> > > for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
> > > struct perf_event *event = cpuc->events[bit];
> > >

> So can just drop the XXX comment. Ok?

How about hiding the entire thing in a hsw function. I'm fairly sure
that eventually we'll need to check all counters for this nonsense.

Something like so perhaps?

---
arch/x86/kernel/cpu/perf_event_intel.c | 16 ++++++++++++++++
1 file changed, 16 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index a45d8d4..2a400b7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1170,6 +1170,20 @@ static void intel_pmu_reset(void)
local_irq_restore(flags);
}

+static void intel_pmu_hsw_tsx_status(struct cpu_hw_event *cpuc, u64 *status)
+{
+ const int idx = 2; /* only cnt2 supports TSX for now */
+ struct perf_event *event = cpuc->event[idx];
+
+ if (event_is_checkpoint(event)) {
+ /*
+ * In order to avoid spurious interrupts always reset
+ * checkpointed counters.
+ */
+ *status |= (1ULL << idx);
+ }
+}
+
/*
* This handler is triggered by the local APIC, so the APIC IRQ handling
* rules apply:
@@ -1224,6 +1238,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
x86_pmu.drain_pebs(regs);
}

+ intel_pmu_hsw_tsx_status(cpuc, &status);
+
for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
struct perf_event *event = cpuc->events[bit];

2013-08-31 15:42:26

by Andi Kleen

[permalink] [raw]

Subject: Re: [PATCH 1/4] perf, x86: Avoid checkpointed counters causing excessive TSX aborts v4

> > So can just drop the XXX comment. Ok?
>
> How about hiding the entire thing in a hsw function. I'm fairly sure
> that eventually we'll need to check all counters for this nonsense.

AFAIK there are no plans to do so.

>
> Something like so perhaps?

It's ok for me, except it's not for TSX (that's intx), but only for
intx_checkpointed.

Should I send a new patch?

-Andi

2013-10-04 17:33:04

by tip-bot for Vasyl Gomonovych

[permalink] [raw]

Subject: [tip:perf/core] tools/perf/stat: Add perf stat --transaction

Commit-ID: 4cabc3d1cb6a46f581a2628d1d11c483d5f300e5
Gitweb: http://git.kernel.org/tip/4cabc3d1cb6a46f581a2628d1d11c483d5f300e5
Author: Andi Kleen <[email protected]>
AuthorDate: Wed, 21 Aug 2013 16:47:26 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Fri, 4 Oct 2013 10:06:07 +0200

tools/perf/stat: Add perf stat --transaction

Add support to perf stat to print the basic transactional execution statistics:
Total cycles, Cycles in Transaction, Cycles in aborted transsactions
using the in_tx and in_tx_checkpoint qualifiers.
Transaction Starts and Elision Starts, to compute the average transaction
length.

This is a reasonable overview over the success of the transactions.

Also support architectures that have a transaction aborted cycles
counter like POWER8. Since that is awkward to handle in the kernel
abstract handle both cases here.

Enable with a new --transaction / -T option.

This requires measuring these events in a group, since they depend on each
other.

This is implemented by using TM sysfs events exported by the kernel

Signed-off-by: Andi Kleen <[email protected]>
Acked-by: Arnaldo Carvalho de Melo <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
tools/perf/Documentation/perf-stat.txt | 5 ++
tools/perf/builtin-stat.c | 144 ++++++++++++++++++++++++++++++++-
tools/perf/util/evsel.h | 6 ++
tools/perf/util/pmu.c | 16 ++++
tools/perf/util/pmu.h | 1 +
5 files changed, 171 insertions(+), 1 deletion(-)

diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 73c9759..80c7da6 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -137,6 +137,11 @@ core number and the number of online logical processors on that physical process
After starting the program, wait msecs before measuring. This is useful to
filter out the startup phase of the program, which is often very different.

+-T::
+--transaction::
+
+Print statistics of transactional execution if supported.
+
EXAMPLES
--------

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index f686d5f..cc7efee 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -46,6 +46,7 @@
#include "util/util.h"
#include "util/parse-options.h"
#include "util/parse-events.h"
+#include "util/pmu.h"
#include "util/event.h"
#include "util/evlist.h"
#include "util/evsel.h"
@@ -70,6 +71,41 @@ static void print_counter_aggr(struct perf_evsel *counter, char *prefix);
static void print_counter(struct perf_evsel *counter, char *prefix);
static void print_aggr(char *prefix);

+/* Default events used for perf stat -T */
+static const char * const transaction_attrs[] = {
+ "task-clock",
+ "{"
+ "instructions,"
+ "cycles,"
+ "cpu/cycles-t/,"
+ "cpu/tx-start/,"
+ "cpu/el-start/,"
+ "cpu/cycles-ct/"
+ "}"
+};
+
+/* More limited version when the CPU does not have all events. */
+static const char * const transaction_limited_attrs[] = {
+ "task-clock",
+ "{"
+ "instructions,"
+ "cycles,"
+ "cpu/cycles-t/,"
+ "cpu/tx-start/"
+ "}"
+};
+
+/* must match transaction_attrs and the beginning limited_attrs */
+enum {
+ T_TASK_CLOCK,
+ T_INSTRUCTIONS,
+ T_CYCLES,
+ T_CYCLES_IN_TX,
+ T_TRANSACTION_START,
+ T_ELISION_START,
+ T_CYCLES_IN_TX_CP,
+};
+
static struct perf_evlist *evsel_list;

static struct perf_target target = {
@@ -90,6 +126,7 @@ static enum aggr_mode aggr_mode = AGGR_GLOBAL;
static volatile pid_t child_pid = -1;
static bool null_run = false;
static int detailed_run = 0;
+static bool transaction_run;
static bool big_num = true;
static int big_num_opt = -1;
static const char *csv_sep = NULL;
@@ -214,7 +251,10 @@ static struct stats runtime_l1_icache_stats[MAX_NR_CPUS];
static struct stats runtime_ll_cache_stats[MAX_NR_CPUS];
static struct stats runtime_itlb_cache_stats[MAX_NR_CPUS];
static struct stats runtime_dtlb_cache_stats[MAX_NR_CPUS];
+static struct stats runtime_cycles_in_tx_stats[MAX_NR_CPUS];
static struct stats walltime_nsecs_stats;
+static struct stats runtime_transaction_stats[MAX_NR_CPUS];
+static struct stats runtime_elision_stats[MAX_NR_CPUS];

static void perf_stat__reset_stats(struct perf_evlist *evlist)
{
@@ -236,6 +276,11 @@ static void perf_stat__reset_stats(struct perf_evlist *evlist)
memset(runtime_ll_cache_stats, 0, sizeof(runtime_ll_cache_stats));
memset(runtime_itlb_cache_stats, 0, sizeof(runtime_itlb_cache_stats));
memset(runtime_dtlb_cache_stats, 0, sizeof(runtime_dtlb_cache_stats));
+ memset(runtime_cycles_in_tx_stats, 0,
+ sizeof(runtime_cycles_in_tx_stats));
+ memset(runtime_transaction_stats, 0,
+ sizeof(runtime_transaction_stats));
+ memset(runtime_elision_stats, 0, sizeof(runtime_elision_stats));
memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats));
}

@@ -274,6 +319,29 @@ static inline int nsec_counter(struct perf_evsel *evsel)
return 0;
}

+static struct perf_evsel *nth_evsel(int n)
+{
+ static struct perf_evsel **array;
+ static int array_len;
+ struct perf_evsel *ev;
+ int j;
+
+ /* Assumes this only called when evsel_list does not change anymore. */
+ if (!array) {
+ list_for_each_entry(ev, &evsel_list->entries, node)
+ array_len++;
+ array = malloc(array_len * sizeof(void *));
+ if (!array)
+ exit(ENOMEM);
+ j = 0;
+ list_for_each_entry(ev, &evsel_list->entries, node)
+ array[j++] = ev;
+ }
+ if (n < array_len)
+ return array[n];
+ return NULL;
+}
+
/*
* Update various tracking values we maintain to print
* more semantic information such as miss/hit ratios,
@@ -285,6 +353,15 @@ static void update_shadow_stats(struct perf_evsel *counter, u64 *count)
update_stats(&runtime_nsecs_stats[0], count[0]);
else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
update_stats(&runtime_cycles_stats[0], count[0]);
+ else if (transaction_run &&
+ perf_evsel__cmp(counter, nth_evsel(T_CYCLES_IN_TX)))
+ update_stats(&runtime_cycles_in_tx_stats[0], count[0]);
+ else if (transaction_run &&
+ perf_evsel__cmp(counter, nth_evsel(T_TRANSACTION_START)))
+ update_stats(&runtime_transaction_stats[0], count[0]);
+ else if (transaction_run &&
+ perf_evsel__cmp(counter, nth_evsel(T_ELISION_START)))
+ update_stats(&runtime_elision_stats[0], count[0]);
else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
update_stats(&runtime_stalled_cycles_front_stats[0], count[0]);
else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
@@ -827,7 +904,7 @@ static void print_ll_cache_misses(int cpu,

static void abs_printout(int cpu, int nr, struct perf_evsel *evsel, double avg)
{
- double total, ratio = 0.0;
+ double total, ratio = 0.0, total2;
const char *fmt;

if (csv_output)
@@ -923,6 +1000,43 @@ static void abs_printout(int cpu, int nr, struct perf_evsel *evsel, double avg)
ratio = 1.0 * avg / total;

fprintf(output, " # %8.3f GHz ", ratio);
+ } else if (transaction_run &&
+ perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX))) {
+ total = avg_stats(&runtime_cycles_stats[cpu]);
+ if (total)
+ fprintf(output,
+ " # %5.2f%% transactional cycles ",
+ 100.0 * (avg / total));
+ } else if (transaction_run &&
+ perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX_CP))) {
+ total = avg_stats(&runtime_cycles_stats[cpu]);
+ total2 = avg_stats(&runtime_cycles_in_tx_stats[cpu]);
+ if (total2 < avg)
+ total2 = avg;
+ if (total)
+ fprintf(output,
+ " # %5.2f%% aborted cycles ",
+ 100.0 * ((total2-avg) / total));
+ } else if (transaction_run &&
+ perf_evsel__cmp(evsel, nth_evsel(T_TRANSACTION_START)) &&
+ avg > 0 &&
+ runtime_cycles_in_tx_stats[cpu].n != 0) {
+ total = avg_stats(&runtime_cycles_in_tx_stats[cpu]);
+
+ if (total)
+ ratio = total / avg;
+
+ fprintf(output, " # %8.0f cycles / transaction ", ratio);
+ } else if (transaction_run &&
+ perf_evsel__cmp(evsel, nth_evsel(T_ELISION_START)) &&
+ avg > 0 &&
+ runtime_cycles_in_tx_stats[cpu].n != 0) {
+ total = avg_stats(&runtime_cycles_in_tx_stats[cpu]);
+
+ if (total)
+ ratio = total / avg;
+
+ fprintf(output, " # %8.0f cycles / elision ", ratio);
} else if (runtime_nsecs_stats[cpu].n != 0) {
char unit = 'M';

@@ -1236,6 +1350,16 @@ static int perf_stat_init_aggr_mode(void)
return 0;
}

+static int setup_events(const char * const *attrs, unsigned len)
+{
+ unsigned i;
+
+ for (i = 0; i < len; i++) {
+ if (parse_events(evsel_list, attrs[i]))
+ return -1;
+ }
+ return 0;
+}

/*
* Add default attributes, if there were no attributes specified or
@@ -1354,6 +1478,22 @@ static int add_default_attributes(void)
if (null_run)
return 0;

+ if (transaction_run) {
+ int err;
+ if (pmu_have_event("cpu", "cycles-ct") &&
+ pmu_have_event("cpu", "el-start"))
+ err = setup_events(transaction_attrs,
+ ARRAY_SIZE(transaction_attrs));
+ else
+ err = setup_events(transaction_limited_attrs,
+ ARRAY_SIZE(transaction_limited_attrs));
+ if (err < 0) {
+ fprintf(stderr, "Cannot set up transaction events\n");
+ return -1;
+ }
+ return 0;
+ }
+
if (!evsel_list->nr_entries) {
if (perf_evlist__add_default_attrs(evsel_list, default_attrs) < 0)
return -1;
@@ -1388,6 +1528,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
int output_fd = 0;
const char *output_name = NULL;
const struct option options[] = {
+ OPT_BOOLEAN('T', "transaction", &transaction_run,
+ "hardware transaction statistics"),
OPT_CALLBACK('e', "event", &evsel_list, "event",
"event selector. use 'perf list' to list available events",
parse_events_option),
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 4a7bdc7..5aa68cd 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -197,6 +197,12 @@ static inline bool perf_evsel__match2(struct perf_evsel *e1,
(e1->attr.config == e2->attr.config);
}

+#define perf_evsel__cmp(a, b) \
+ ((a) && \
+ (b) && \
+ (a)->attr.type == (b)->attr.type && \
+ (a)->attr.config == (b)->attr.config)
+
int __perf_evsel__read_on_cpu(struct perf_evsel *evsel,
int cpu, int thread, bool scale);

diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index bc9d806..64362fe 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -637,3 +637,19 @@ void print_pmu_events(const char *event_glob, bool name_only)
printf("\n");
free(aliases);
}
+
+bool pmu_have_event(const char *pname, const char *name)
+{
+ struct perf_pmu *pmu;
+ struct perf_pmu_alias *alias;
+
+ pmu = NULL;
+ while ((pmu = perf_pmu__scan(pmu)) != NULL) {
+ if (strcmp(pname, pmu->name))
+ continue;
+ list_for_each_entry(alias, &pmu->aliases, list)
+ if (!strcmp(alias->name, name))
+ return true;
+ }
+ return false;
+}
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index 6b2cbe2..1179b26 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -42,6 +42,7 @@ int perf_pmu__format_parse(char *dir, struct list_head *head);
struct perf_pmu *perf_pmu__scan(struct perf_pmu *pmu);

void print_pmu_events(const char *event_glob, bool name_only);
+bool pmu_have_event(const char *pname, const char *name);

int perf_pmu__test(void);
#endif /* __PMU_H */