This patchs adds a new sample record type called
PERF_SAMPLE_SKID_IP. The goal is to record
the unmodified interrupted instruction pointer (IP) as seen by
the kernel and reflected in the machine state.
On some architectures, it is possible to avoid the IP skid using
hardware support. For instance, on Intel x86, the use of PEBS helps
eliminate the skid on Haswell and later processors. On older Intel
processor, software, i..e, the kernel, may succeed in eliminating
the skid.
Without this patch, on Haswell processors, if you set:
- attr.precise = 0, then you get the skid IP
- attr.precise = 1, then you get the skid PEBS ip (off-by-1)
- attr.precise = 2, then you get the skidless PEBS ip
The IP is captured when the event has PERF_SAMPLE_IP set in sample_type.
However, there are certain measurements where you need to have BOTH
the skidless IP and the skid IP. For instance, when studying branches,
the skid IP usually points to the target of the branch while the skidless
IP points to the branch instruction itself. Today, it is not possible to retrieve
both at the same time. This patch makes this possible by specifying
PERF_SAMPLE_IP|PERF_SAMPLE_SKID_IP.
As an example, consider the following code snipet:
37.51 │42c2ed je 42c2f3
│42c2ef add $0x1,%rdx ▒
│42c2f3 sub $0x1,%rax ▒
When using PEBS (precise=2) and sampling on BR_INST_RETIRED.CONDITIONAL,
the IP always points to 0x42c2ed. With precise=1, the IP would point to
0x42c2f3. It is interesting to collect both IPs in a single run to determine
how often the conditional branch is taken vs. non-taken.
Stephane Eranian (5):
perf/core: add PERF_SAMPLE_SKID_IP record type
perf/x86: add PERF_SAMPLE_SKID_IP support for X86 PEBS
perf/tools: add support for PERF_SAMPLE_SKID_IP
perf/record: add support for sampling skid ip
perf/script: add support for skid ip
arch/x86/events/intel/ds.c | 7 +++++++
include/linux/perf_event.h | 2 ++
include/uapi/linux/perf_event.h | 4 +++-
kernel/events/core.c | 14 ++++++++++++++
tools/include/uapi/linux/perf_event.h | 4 +++-
tools/perf/Documentation/perf-record.txt | 8 ++++++++
tools/perf/builtin-record.c | 2 ++
tools/perf/builtin-script.c | 12 +++++++++++-
tools/perf/perf.h | 1 +
tools/perf/util/event.h | 1 +
tools/perf/util/evsel.c | 11 ++++++++++-
tools/perf/util/session.c | 3 +++
12 files changed, 65 insertions(+), 4 deletions(-)
--
2.13.1.518.g3df882009-goog
This patch adds the support code to handle the PERF_SAMPLE_SKID_IP
record type.
Signed-off-by: Stephane Eranian <[email protected]>
---
tools/include/uapi/linux/perf_event.h | 4 +++-
tools/perf/perf.h | 1 +
tools/perf/util/event.h | 1 +
tools/perf/util/evsel.c | 11 ++++++++++-
tools/perf/util/session.c | 3 +++
5 files changed, 18 insertions(+), 2 deletions(-)
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index b1c0b187acfe..0ebb6bee75a9 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -139,8 +139,9 @@ enum perf_event_sample_format {
PERF_SAMPLE_IDENTIFIER = 1U << 16,
PERF_SAMPLE_TRANSACTION = 1U << 17,
PERF_SAMPLE_REGS_INTR = 1U << 18,
+ PERF_SAMPLE_SKID_IP = 1U << 19,
- PERF_SAMPLE_MAX = 1U << 19, /* non-ABI */
+ PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */
};
/*
@@ -791,6 +792,7 @@ enum perf_event_type {
* { u64 transaction; } && PERF_SAMPLE_TRANSACTION
* { u64 abi; # enum perf_sample_regs_abi
* u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
+ * { u64 skid_ip; } && PERF_SAMPLE_SKID_IP
* };
*/
PERF_RECORD_SAMPLE = 9,
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 806c216a1078..64ce89a61f8c 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -57,6 +57,7 @@ struct record_opts {
bool tail_synthesize;
bool overwrite;
bool ignore_missing_thread;
+ bool skid_ip;
unsigned int freq;
unsigned int mmap_pages;
unsigned int auxtrace_mmap_pages;
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 7c3fa1c8cbcd..f19bf5e5696c 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -199,6 +199,7 @@ struct perf_sample {
u32 cpu;
u32 raw_size;
u64 data_src;
+ u64 skid_ip;
u32 flags;
u16 insn_len;
u8 cpumode;
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index e4f7902d5afa..08d71e83bd07 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -937,6 +937,9 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts,
if (opts->sample_weight)
perf_evsel__set_sample_bit(evsel, WEIGHT);
+ if (opts->skid_ip)
+ perf_evsel__set_sample_bit(evsel, SKID_IP);
+
attr->task = track;
attr->mmap = track;
attr->mmap2 = track && !perf_missing_features.mmap2;
@@ -1319,7 +1322,7 @@ static void __p_sample_type(char *buf, size_t size, u64 value)
bit_name(PERIOD), bit_name(STREAM_ID), bit_name(RAW),
bit_name(BRANCH_STACK), bit_name(REGS_USER), bit_name(STACK_USER),
bit_name(IDENTIFIER), bit_name(REGS_INTR), bit_name(DATA_SRC),
- bit_name(WEIGHT),
+ bit_name(WEIGHT), bit_name(SKID_IP),
{ .name = NULL, }
};
#undef bit_name
@@ -2043,6 +2046,12 @@ int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
}
}
+ data->skid_ip = 0;
+ if (type & PERF_SAMPLE_SKID_IP) {
+ data->skid_ip = *array;
+ array++;
+ }
+
return 0;
}
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 7dc1096264c5..0bfc1051c708 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1123,6 +1123,9 @@ static void dump_sample(struct perf_evsel *evsel, union perf_event *event,
if (sample_type & PERF_SAMPLE_READ)
sample_read__printf(sample, evsel->attr.read_format);
+
+ if (sample_type & PERF_SAMPLE_SKID_IP)
+ printf("... skid_ip: %" PRIu64 "\n", sample->skid_ip);
}
static struct machine *machines__find_for_cpumode(struct machines *machines,
--
2.13.1.518.g3df882009-goog
This patch adds a new --skid-ip option to perf record
to capture the unmodified interrupted instruction pointer in
each sample. With this option, the perf tool can request that
the kernel records both the ip and skid IP in each sample.
Unless precise mode is enabled both IPis are the same. They may be
different in precise mode depending on the event.
$ perf record --skid-ip .....
Signed-off-by: Stephane Eranian <[email protected]>
---
tools/perf/Documentation/perf-record.txt | 8 ++++++++
tools/perf/builtin-record.c | 2 ++
2 files changed, 10 insertions(+)
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index b0e9e921d534..fa24376c20e0 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -477,6 +477,14 @@ config terms. For example: 'cycles/overwrite/' and 'instructions/no-overwrite/'.
Implies --tail-synthesize.
+--skid-ip::
+Capture the unmodified interrupt instruction pointer (IP) in each sample. Usually
+with event-based sampling, the ip has skid and rarely points to the instruction which
+caused the event to overflow. On some architectures, the hardware can eliminate the
+skid and perf_events returns it as the IP when precise sampling is enabled. But for
+certain measurements, it may be useful to have both the corrected and skid ip. This
+option enables capturing the skid ip in addition to the corrected ip. Default: off
+
SEE ALSO
--------
linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index ee7d0a82ccd0..66b497dc4333 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -1669,6 +1669,8 @@ static struct option __record_options[] = {
"signal"),
OPT_BOOLEAN(0, "dry-run", &dry_run,
"Parse options then exit"),
+ OPT_BOOLEAN(0, "skid-ip", &record.opts.skid_ip,
+ "capture skid ip in addition to ip"),
OPT_END()
};
--
2.13.1.518.g3df882009-goog
This patch adds a skid_ip field to perf script
to dump the raw value of the PERF_SAMPLE_SKID_IP
field in each sample.
$ perf script -F ip,skid_ip ......
Signed-off-by: Stephane Eranian <[email protected]>
---
tools/perf/builtin-script.c | 12 +++++++++++-
tools/perf/util/session.c | 2 +-
2 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 4761b0d7fcb5..ba6720cf70ce 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -85,6 +85,7 @@ enum perf_output_field {
PERF_OUTPUT_INSN = 1U << 21,
PERF_OUTPUT_INSNLEN = 1U << 22,
PERF_OUTPUT_BRSTACKINSN = 1U << 23,
+ PERF_OUTPUT_SKID_IP = 1U << 24,
};
struct output_option {
@@ -115,6 +116,7 @@ struct output_option {
{.str = "insn", .field = PERF_OUTPUT_INSN},
{.str = "insnlen", .field = PERF_OUTPUT_INSNLEN},
{.str = "brstackinsn", .field = PERF_OUTPUT_BRSTACKINSN},
+ {.str = "skid_ip", .field = PERF_OUTPUT_SKID_IP},
};
/* default set to maintain compatibility with current format */
@@ -1125,6 +1127,11 @@ static size_t data_src__printf(u64 data_src)
return printf("%-*s", maxlen, out);
}
+static void print_sample_skid_ip(struct perf_sample *sample)
+{
+ printf(" %"PRIx64" ", sample->skid_ip);
+}
+
static void process_event(struct perf_script *script,
struct perf_sample *sample, struct perf_evsel *evsel,
struct addr_location *al,
@@ -1193,7 +1200,10 @@ static void process_event(struct perf_script *script,
if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT))
print_sample_bpf_output(sample);
- print_insn(sample, attr, thread, machine);
+
+ if (PRINT_FIELD(SKID_IP))
+ print_sample_skid_ip(sample);
+
printf("\n");
}
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 0bfc1051c708..51b90630cb32 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1125,7 +1125,7 @@ static void dump_sample(struct perf_evsel *evsel, union perf_event *event,
sample_read__printf(sample, evsel->attr.read_format);
if (sample_type & PERF_SAMPLE_SKID_IP)
- printf("... skid_ip: %" PRIu64 "\n", sample->skid_ip);
+ printf("... skid_ip: 0x%" PRIx64 "\n", sample->skid_ip);
}
static struct machine *machines__find_for_cpumode(struct machines *machines,
--
2.13.1.518.g3df882009-goog
This patchs adds a new sample record type called
PERF_SAMPLE_SKID_IP. The goal is to record
the interrupted instruction pointer (IP) as seen by the
kernel and reflected in the machine state.
On some architectures, it is possible to avoid the IP skid using
hardware support. For instance, on Intel x86, the use of PEBS helps
eliminate the skid on Haswell and later processors.
Without this patch, on Haswell processors, if you set:
- attr.precise = 0, then you get the skid IP
- attr.precise = 1, then you get the skid PEBS ip (off-by-1)
- attr.precise = 2, then you get the skidless PEBS
The IP is captured when the event has PERF_RECORD_SAMPLE_IP set in sample_type.
However, there are certain measurements where you need to have BOTH
the skidless IP and the skid IP. For instance, when studying branches,
the skid IP usually points to the target of the branch while the skidless
IP points to the branch instruction itself. Today, it is not possible to retrieve
both at the same time. This patch makes this possible by specifying
PERF_SAMPLE_IP|PERF_SAMPLE_SKID_IP.
Signed-off-by: Stephane Eranian <[email protected]>
---
include/linux/perf_event.h | 2 ++
include/uapi/linux/perf_event.h | 4 +++-
kernel/events/core.c | 14 ++++++++++++++
3 files changed, 19 insertions(+), 1 deletion(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 7d6aa29094b2..ac63f6af6f53 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -944,6 +944,7 @@ struct perf_sample_data {
struct perf_regs regs_intr;
u64 stack_user_size;
+ u64 skid_ip;
} ____cacheline_aligned;
/* default value for data source */
@@ -964,6 +965,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data,
data->weight = 0;
data->data_src.val = PERF_MEM_NA;
data->txn = 0;
+ data->skid_ip = 0; /* mark as uinitialized */
}
extern void perf_output_sample(struct perf_output_handle *handle,
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index b1c0b187acfe..48e736eb20bd 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -139,8 +139,9 @@ enum perf_event_sample_format {
PERF_SAMPLE_IDENTIFIER = 1U << 16,
PERF_SAMPLE_TRANSACTION = 1U << 17,
PERF_SAMPLE_REGS_INTR = 1U << 18,
+ PERF_SAMPLE_SKID_IP = 1U << 19,
- PERF_SAMPLE_MAX = 1U << 19, /* non-ABI */
+ PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */
};
/*
@@ -791,6 +792,7 @@ enum perf_event_type {
* { u64 transaction; } && PERF_SAMPLE_TRANSACTION
* { u64 abi; # enum perf_sample_regs_abi
* u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
+ * { u64 skid_ip; } && PERF_SAMPLE_SKID_IP
* };
*/
PERF_RECORD_SAMPLE = 9,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4d2c32f98482..e5afc3cbf287 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1563,6 +1563,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
if (sample_type & PERF_SAMPLE_TRANSACTION)
size += sizeof(data->txn);
+ if (sample_type & PERF_SAMPLE_SKID_IP)
+ size += sizeof(data->skid_ip);
+
event->header_size = size;
}
@@ -5947,6 +5950,9 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}
+ if (sample_type & PERF_SAMPLE_SKID_IP)
+ perf_output_put(handle, data->skid_ip);
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
@@ -5980,6 +5986,14 @@ void perf_prepare_sample(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_IP)
data->ip = perf_instruction_pointer(regs);
+ /*
+ * if skid_ip has not been set by arch specific code, then
+ * we initialize it to IP as interrupt-based sampling has
+ * skid
+ */
+ if (!data->skid_ip && (sample_type & PERF_SAMPLE_SKID_IP))
+ data->skid_ip = perf_instruction_pointer(regs);
+
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
int size = 1;
--
2.13.1.518.g3df882009-goog
This patch adds support for SKID_IP to Intel x86 processors
in PEBS mode. In that case, the off-by-1 IP from PEBS is returned in
the SKID_IP field.
Signed-off-by: Stephane Eranian <[email protected]>
---
arch/x86/events/intel/ds.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index c6d23ffe422d..ee17de5d6b8d 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1169,6 +1169,13 @@ static void setup_pebs_sample_data(struct perf_event *event,
x86_pmu.intel_cap.pebs_format >= 1)
data->addr = pebs->dla;
+ /*
+ * unmodified, skid IP which is guaranteed to be the next
+ * dyanmic instruction
+ */
+ if (sample_type & PERF_SAMPLE_SKID_IP)
+ data->skid_ip = pebs->ip;
+
if (x86_pmu.intel_cap.pebs_format >= 2) {
/* Only set the TSX weight when no memory weight. */
if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
--
2.13.1.518.g3df882009-goog
On Thu, Jun 15, 2017 at 06:56:24AM -0700, Stephane Eranian wrote:
> This patchs adds a new sample record type called
> PERF_SAMPLE_SKID_IP. The goal is to record
> the unmodified interrupted instruction pointer (IP) as seen by
> the kernel and reflected in the machine state.
Patches look reasonable for me.
If you only cared about branches it would be more natural to model
it like a 1 entry LBR. That would make a lot more tooling work
automatically.
-Andi
> This patch adds support for SKID_IP to Intel x86 processors in PEBS mode. In
> that case, the off-by-1 IP from PEBS is returned in the SKID_IP field.
It looks we can only get different skid_ip and ip with :pp event (attr.precise = 2).
With the :p event (attr.precise = 1), the skid_ip and ip are the same. Right?
Thanks,
Kan
>
> Signed-off-by: Stephane Eranian <[email protected]>
> ---
> arch/x86/events/intel/ds.c | 7 +++++++
> 1 file changed, 7 insertions(+)
>
> diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index
> c6d23ffe422d..ee17de5d6b8d 100644
> --- a/arch/x86/events/intel/ds.c
> +++ b/arch/x86/events/intel/ds.c
> @@ -1169,6 +1169,13 @@ static void setup_pebs_sample_data(struct
> perf_event *event,
> x86_pmu.intel_cap.pebs_format >= 1)
> data->addr = pebs->dla;
>
> + /*
> + * unmodified, skid IP which is guaranteed to be the next
> + * dyanmic instruction
> + */
> + if (sample_type & PERF_SAMPLE_SKID_IP)
> + data->skid_ip = pebs->ip;
> +
> if (x86_pmu.intel_cap.pebs_format >= 2) {
> /* Only set the TSX weight when no memory weight. */
> if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
> --
> 2.13.1.518.g3df882009-goog
On Thu, Jun 15, 2017 at 8:40 AM, Liang, Kan <[email protected]> wrote:
>
>
>> This patch adds support for SKID_IP to Intel x86 processors in PEBS mode. In
>> that case, the off-by-1 IP from PEBS is returned in the SKID_IP field.
>
> It looks we can only get different skid_ip and ip with :pp event (attr.precise = 2).
> With the :p event (attr.precise = 1), the skid_ip and ip are the same. Right?
>
Correct, because skid_ip would be equal to the pebs->ip.
> Thanks,
> Kan
>
>>
>> Signed-off-by: Stephane Eranian <[email protected]>
>> ---
>> arch/x86/events/intel/ds.c | 7 +++++++
>> 1 file changed, 7 insertions(+)
>>
>> diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index
>> c6d23ffe422d..ee17de5d6b8d 100644
>> --- a/arch/x86/events/intel/ds.c
>> +++ b/arch/x86/events/intel/ds.c
>> @@ -1169,6 +1169,13 @@ static void setup_pebs_sample_data(struct
>> perf_event *event,
>> x86_pmu.intel_cap.pebs_format >= 1)
>> data->addr = pebs->dla;
>>
>> + /*
>> + * unmodified, skid IP which is guaranteed to be the next
>> + * dyanmic instruction
>> + */
>> + if (sample_type & PERF_SAMPLE_SKID_IP)
>> + data->skid_ip = pebs->ip;
>> +
>> if (x86_pmu.intel_cap.pebs_format >= 2) {
>> /* Only set the TSX weight when no memory weight. */
>> if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
>> --
>> 2.13.1.518.g3df882009-goog
>
On Thu, Jun 15, 2017 at 8:10 AM, Andi Kleen <[email protected]> wrote:
> On Thu, Jun 15, 2017 at 06:56:24AM -0700, Stephane Eranian wrote:
>> This patchs adds a new sample record type called
>> PERF_SAMPLE_SKID_IP. The goal is to record
>> the unmodified interrupted instruction pointer (IP) as seen by
>> the kernel and reflected in the machine state.
>
> Patches look reasonable for me.
>
> If you only cared about branches it would be more natural to model
> it like a 1 entry LBR. That would make a lot more tooling work
> automatically.
>
You'd still have to modify tooling to present correct column headers.
It is mostly interesting for branches for sure but any situation where you
want to skidless IP and the skid IP would work because they tell you
something about the sampled instruction.
> On Thu, Jun 15, 2017 at 8:40 AM, Liang, Kan <[email protected]> wrote:
> >
> >
> >> This patch adds support for SKID_IP to Intel x86 processors in PEBS
> >> mode. In that case, the off-by-1 IP from PEBS is returned in the SKID_IP
> field.
> >
> > It looks we can only get different skid_ip and ip with :pp event (attr.precise
> = 2).
> > With the :p event (attr.precise = 1), the skid_ip and ip are the same. Right?
> >
> Correct, because skid_ip would be equal to the pebs->ip.
I think it's better to make it clear for user by adding a check in perf tool.
If --skid-ip is applied with :p event/non-PEBS event, the tool may
suggest them to use :pp event instead.
Thanks,
Kan
> >
> >>
> >> Signed-off-by: Stephane Eranian <[email protected]>
> >> ---
> >> arch/x86/events/intel/ds.c | 7 +++++++
> >> 1 file changed, 7 insertions(+)
> >>
> >> diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
> >> index c6d23ffe422d..ee17de5d6b8d 100644
> >> --- a/arch/x86/events/intel/ds.c
> >> +++ b/arch/x86/events/intel/ds.c
> >> @@ -1169,6 +1169,13 @@ static void setup_pebs_sample_data(struct
> >> perf_event *event,
> >> x86_pmu.intel_cap.pebs_format >= 1)
> >> data->addr = pebs->dla;
> >>
> >> + /*
> >> + * unmodified, skid IP which is guaranteed to be the next
> >> + * dyanmic instruction
> >> + */
> >> + if (sample_type & PERF_SAMPLE_SKID_IP)
> >> + data->skid_ip = pebs->ip;
> >> +
> >> if (x86_pmu.intel_cap.pebs_format >= 2) {
> >> /* Only set the TSX weight when no memory weight. */
> >> if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
> >> --
> >> 2.13.1.518.g3df882009-goog
> >
On Thu, Jun 15, 2017 at 09:44:07AM -0700, Stephane Eranian wrote:
> On Thu, Jun 15, 2017 at 8:10 AM, Andi Kleen <[email protected]> wrote:
> > On Thu, Jun 15, 2017 at 06:56:24AM -0700, Stephane Eranian wrote:
> >> This patchs adds a new sample record type called
> >> PERF_SAMPLE_SKID_IP. The goal is to record
> >> the unmodified interrupted instruction pointer (IP) as seen by
> >> the kernel and reflected in the machine state.
> >
> > Patches look reasonable for me.
> >
> > If you only cared about branches it would be more natural to model
> > it like a 1 entry LBR. That would make a lot more tooling work
> > automatically.
> >
> You'd still have to modify tooling to present correct column headers.
Why? It's from/to?
-Andi
Hi,
On Thu, Jun 15, 2017 at 9:39 AM, Stephane Eranian <[email protected]> wrote:
> On Thu, Jun 15, 2017 at 8:40 AM, Liang, Kan <[email protected]> wrote:
>>
>>
>>> This patch adds support for SKID_IP to Intel x86 processors in PEBS mode. In
>>> that case, the off-by-1 IP from PEBS is returned in the SKID_IP field.
>>
>> It looks we can only get different skid_ip and ip with :pp event (attr.precise = 2).
>> With the :p event (attr.precise = 1), the skid_ip and ip are the same. Right?
>>
> Correct, because skid_ip would be equal to the pebs->ip.
>
To be more precise, you can always specify PERF_SAMPLE_SKID_IP
with any event or sampling mode. By default, the value saved in each
sample will be the same as the one used for PERF_SAMPLE_IP. On
Intel x86, it only differs with precise mode=2 for events supporting PEBS.
>>> Signed-off-by: Stephane Eranian <[email protected]>
>>> ---
>>> arch/x86/events/intel/ds.c | 7 +++++++
>>> 1 file changed, 7 insertions(+)
>>>
>>> diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index
>>> c6d23ffe422d..ee17de5d6b8d 100644
>>> --- a/arch/x86/events/intel/ds.c
>>> +++ b/arch/x86/events/intel/ds.c
>>> @@ -1169,6 +1169,13 @@ static void setup_pebs_sample_data(struct
>>> perf_event *event,
>>> x86_pmu.intel_cap.pebs_format >= 1)
>>> data->addr = pebs->dla;
>>>
>>> + /*
>>> + * unmodified, skid IP which is guaranteed to be the next
>>> + * dyanmic instruction
>>> + */
>>> + if (sample_type & PERF_SAMPLE_SKID_IP)
>>> + data->skid_ip = pebs->ip;
>>> +
>>> if (x86_pmu.intel_cap.pebs_format >= 2) {
>>> /* Only set the TSX weight when no memory weight. */
>>> if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
>>> --
>>> 2.13.1.518.g3df882009-goog
>>
On Thu, Jun 15, 2017 at 10:23 AM, Andi Kleen <[email protected]> wrote:
> On Thu, Jun 15, 2017 at 09:44:07AM -0700, Stephane Eranian wrote:
>> On Thu, Jun 15, 2017 at 8:10 AM, Andi Kleen <[email protected]> wrote:
>> > On Thu, Jun 15, 2017 at 06:56:24AM -0700, Stephane Eranian wrote:
>> >> This patchs adds a new sample record type called
>> >> PERF_SAMPLE_SKID_IP. The goal is to record
>> >> the unmodified interrupted instruction pointer (IP) as seen by
>> >> the kernel and reflected in the machine state.
>> >
>> > Patches look reasonable for me.
>> >
>> > If you only cared about branches it would be more natural to model
>> > it like a 1 entry LBR. That would make a lot more tooling work
>> > automatically.
>> >
>> You'd still have to modify tooling to present correct column headers.
>
> Why? It's from/to?
>
Ah, yes you are right, but it is not clear to me how you would specify
this cleanly with the interface.
Especially in the case where this could be used for non-branch instructions.
> -Andi
On Thu, Jun 15, 2017 at 12:35:39PM -0700, Stephane Eranian wrote:
> On Thu, Jun 15, 2017 at 10:23 AM, Andi Kleen <[email protected]> wrote:
> > On Thu, Jun 15, 2017 at 09:44:07AM -0700, Stephane Eranian wrote:
> >> On Thu, Jun 15, 2017 at 8:10 AM, Andi Kleen <[email protected]> wrote:
> >> > On Thu, Jun 15, 2017 at 06:56:24AM -0700, Stephane Eranian wrote:
> >> >> This patchs adds a new sample record type called
> >> >> PERF_SAMPLE_SKID_IP. The goal is to record
> >> >> the unmodified interrupted instruction pointer (IP) as seen by
> >> >> the kernel and reflected in the machine state.
> >> >
> >> > Patches look reasonable for me.
> >> >
> >> > If you only cared about branches it would be more natural to model
> >> > it like a 1 entry LBR. That would make a lot more tooling work
> >> > automatically.
> >> >
> >> You'd still have to modify tooling to present correct column headers.
> >
> > Why? It's from/to?
> >
> Ah, yes you are right, but it is not clear to me how you would specify
> this cleanly with the interface.
> Especially in the case where this could be used for non-branch instructions.
Generally the skid ip is only interesting for instructions that have some kind of
control flow change, or an exception/interrupt.
So if it's interesting the headers would be correct.
Actually it's even correct for non control flow change, because the IP
moves from "FROM" to "TO" ...
-Andi
On Thu, Jun 15, 2017 at 1:02 PM, Andi Kleen <[email protected]> wrote:
> On Thu, Jun 15, 2017 at 12:35:39PM -0700, Stephane Eranian wrote:
>> On Thu, Jun 15, 2017 at 10:23 AM, Andi Kleen <[email protected]> wrote:
>> > On Thu, Jun 15, 2017 at 09:44:07AM -0700, Stephane Eranian wrote:
>> >> On Thu, Jun 15, 2017 at 8:10 AM, Andi Kleen <[email protected]> wrote:
>> >> > On Thu, Jun 15, 2017 at 06:56:24AM -0700, Stephane Eranian wrote:
>> >> >> This patchs adds a new sample record type called
>> >> >> PERF_SAMPLE_SKID_IP. The goal is to record
>> >> >> the unmodified interrupted instruction pointer (IP) as seen by
>> >> >> the kernel and reflected in the machine state.
>> >> >
>> >> > Patches look reasonable for me.
>> >> >
>> >> > If you only cared about branches it would be more natural to model
>> >> > it like a 1 entry LBR. That would make a lot more tooling work
>> >> > automatically.
>> >> >
>> >> You'd still have to modify tooling to present correct column headers.
>> >
>> > Why? It's from/to?
>> >
>> Ah, yes you are right, but it is not clear to me how you would specify
>> this cleanly with the interface.
>> Especially in the case where this could be used for non-branch instructions.
>
> Generally the skid ip is only interesting for instructions that have some kind of
> control flow change, or an exception/interrupt.
>
True, right now I found use for the control flow changes, but any PEBS enabled
event + precise=2 could potentially be useful.
> So if it's interesting the headers would be correct.
>
> Actually it's even correct for non control flow change, because the IP
> moves from "FROM" to "TO" ...
>
Sure.
But I am not too fond of tying this to a branch abstraction. You'd
have to express
this with the perf_events interface.To trigger LBR-style hardware, you'd have
to define a new PERF_SAMPLE_BRANCH_SKID_IP of some sort and then you
tie this to a branch kind of sampling feature:
$ perf record -j skid_ip -e BR_INST_RETIRED.CONDITIONAL:pp
Is what I think you are thinking about.
On Thu, Jun 15, 2017 at 1:20 PM, Stephane Eranian <[email protected]> wrote:
> On Thu, Jun 15, 2017 at 1:02 PM, Andi Kleen <[email protected]> wrote:
>> On Thu, Jun 15, 2017 at 12:35:39PM -0700, Stephane Eranian wrote:
>>> On Thu, Jun 15, 2017 at 10:23 AM, Andi Kleen <[email protected]> wrote:
>>> > On Thu, Jun 15, 2017 at 09:44:07AM -0700, Stephane Eranian wrote:
>>> >> On Thu, Jun 15, 2017 at 8:10 AM, Andi Kleen <[email protected]> wrote:
>>> >> > On Thu, Jun 15, 2017 at 06:56:24AM -0700, Stephane Eranian wrote:
>>> >> >> This patchs adds a new sample record type called
>>> >> >> PERF_SAMPLE_SKID_IP. The goal is to record
>>> >> >> the unmodified interrupted instruction pointer (IP) as seen by
>>> >> >> the kernel and reflected in the machine state.
>>> >> >
>>> >> > Patches look reasonable for me.
>>> >> >
>>> >> > If you only cared about branches it would be more natural to model
>>> >> > it like a 1 entry LBR. That would make a lot more tooling work
>>> >> > automatically.
>>> >> >
>>> >> You'd still have to modify tooling to present correct column headers.
>>> >
>>> > Why? It's from/to?
>>> >
>>> Ah, yes you are right, but it is not clear to me how you would specify
>>> this cleanly with the interface.
>>> Especially in the case where this could be used for non-branch instructions.
>>
>> Generally the skid ip is only interesting for instructions that have some kind of
>> control flow change, or an exception/interrupt.
>>
> True, right now I found use for the control flow changes, but any PEBS enabled
> event + precise=2 could potentially be useful.
>> So if it's interesting the headers would be correct.
>>
>> Actually it's even correct for non control flow change, because the IP
>> moves from "FROM" to "TO" ...
>>
> Sure.
> But I am not too fond of tying this to a branch abstraction. You'd
> have to express
> this with the perf_events interface.To trigger LBR-style hardware, you'd have
> to define a new PERF_SAMPLE_BRANCH_SKID_IP of some sort and then you
> tie this to a branch kind of sampling feature:
>
> $ perf record -j skid_ip -e BR_INST_RETIRED.CONDITIONAL:pp
>
> Is what I think you are thinking about.
Looking at this approach, the user interface is straightforward,
implementation in the x86 code is a bit more hairy because of the way
the branch_stack is captured, via the cpuc->lbr_entries. If you assume
that SKID_IP cannot be used with any other branch stack mode, then it
is easy. It becomes messy if you don't.
> Looking at this approach, the user interface is straightforward,
> implementation in the x86 code is a bit more hairy because of the way
> the branch_stack is captured, via the cpuc->lbr_entries. If you assume
> that SKID_IP cannot be used with any other branch stack mode, then it
> is easy. It becomes messy if you don't.
That should be fine. After all if you have real LBRs you don't need
the skid IP.
-Andi
Andi,
On Thu, Jun 15, 2017 at 4:18 PM, Andi Kleen <[email protected]> wrote:
>> Looking at this approach, the user interface is straightforward,
>> implementation in the x86 code is a bit more hairy because of the way
>> the branch_stack is captured, via the cpuc->lbr_entries. If you assume
>> that SKID_IP cannot be used with any other branch stack mode, then it
>> is easy. It becomes messy if you don't.
>
> That should be fine. After all if you have real LBRs you don't need
> the skid IP.
>
Yes, you still do. This is not the same thing. LBR captures only taken branches.
I care about taken AND non-taken branches and I don't want to sample on a
non-taken event, assuming it is available.
You need to inject the skid ip into the LBR stack somehow. Either
directly in the
registers or in the cpuc->lbr_entries. The injection can only happen
on interrupt
whereas the LBR captures 32 branches. So you'd have the skid ip info
only for the
most recent branch in the stack.
On Thu, Jun 15, 2017 at 11:52:07PM -0700, Stephane Eranian wrote:
> Andi,
>
> On Thu, Jun 15, 2017 at 4:18 PM, Andi Kleen <[email protected]> wrote:
> >> Looking at this approach, the user interface is straightforward,
> >> implementation in the x86 code is a bit more hairy because of the way
> >> the branch_stack is captured, via the cpuc->lbr_entries. If you assume
> >> that SKID_IP cannot be used with any other branch stack mode, then it
> >> is easy. It becomes messy if you don't.
> >
> > That should be fine. After all if you have real LBRs you don't need
> > the skid IP.
> >
> Yes, you still do. This is not the same thing. LBR captures only taken branches.
> I care about taken AND non-taken branches and I don't want to sample on a
> non-taken event, assuming it is available.
Ok that's a reasonable argument for reporting it separately, like
in your original patch.
-Andi
On Fri, Jun 16, 2017 at 9:06 AM, Andi Kleen <[email protected]> wrote:
> On Thu, Jun 15, 2017 at 11:52:07PM -0700, Stephane Eranian wrote:
>> Andi,
>>
>> On Thu, Jun 15, 2017 at 4:18 PM, Andi Kleen <[email protected]> wrote:
>> >> Looking at this approach, the user interface is straightforward,
>> >> implementation in the x86 code is a bit more hairy because of the way
>> >> the branch_stack is captured, via the cpuc->lbr_entries. If you assume
>> >> that SKID_IP cannot be used with any other branch stack mode, then it
>> >> is easy. It becomes messy if you don't.
>> >
>> > That should be fine. After all if you have real LBRs you don't need
>> > the skid IP.
>> >
>> Yes, you still do. This is not the same thing. LBR captures only taken branches.
>> I care about taken AND non-taken branches and I don't want to sample on a
>> non-taken event, assuming it is available.
>
> Ok that's a reasonable argument for reporting it separately, like
> in your original patch.
>
Yeah, I think it is easier and more portable, especially on hardware with a
PEBS-like mechanism but no branch buffer (like LBR). FYI, I did do a test
implementation yesterday to evaluate the difficulty.
> -Andi
On Fri, Jun 16, 2017 at 10:08 AM, Stephane Eranian <[email protected]> wrote:
> On Fri, Jun 16, 2017 at 9:06 AM, Andi Kleen <[email protected]> wrote:
>> On Thu, Jun 15, 2017 at 11:52:07PM -0700, Stephane Eranian wrote:
>>> Andi,
>>>
>>> On Thu, Jun 15, 2017 at 4:18 PM, Andi Kleen <[email protected]> wrote:
>>> >> Looking at this approach, the user interface is straightforward,
>>> >> implementation in the x86 code is a bit more hairy because of the way
>>> >> the branch_stack is captured, via the cpuc->lbr_entries. If you assume
>>> >> that SKID_IP cannot be used with any other branch stack mode, then it
>>> >> is easy. It becomes messy if you don't.
>>> >
>>> > That should be fine. After all if you have real LBRs you don't need
>>> > the skid IP.
>>> >
>>> Yes, you still do. This is not the same thing. LBR captures only taken branches.
>>> I care about taken AND non-taken branches and I don't want to sample on a
>>> non-taken event, assuming it is available.
>>
>> Ok that's a reasonable argument for reporting it separately, like
>> in your original patch.
>>
> Yeah, I think it is easier and more portable, especially on hardware with a
> PEBS-like mechanism but no branch buffer (like LBR). FYI, I did do a test
> implementation yesterday to evaluate the difficulty.
>
A more generalized usage of the feature is to evaluate the amount of skid
for any precise event.
>> -Andi
> > Yeah, I think it is easier and more portable, especially on hardware with a
> > PEBS-like mechanism but no branch buffer (like LBR). FYI, I did do a test
> > implementation yesterday to evaluate the difficulty.
> >
> A more generalized usage of the feature is to evaluate the amount of skid
> for any precise event.
It should be always the same (one instruction), except for the control flow
change case.
-Andi
On Fri, Jun 16, 2017 at 10:50 AM, Andi Kleen <[email protected]> wrote:
>> > Yeah, I think it is easier and more portable, especially on hardware with a
>> > PEBS-like mechanism but no branch buffer (like LBR). FYI, I did do a test
>> > implementation yesterday to evaluate the difficulty.
>> >
>> A more generalized usage of the feature is to evaluate the amount of skid
>> for any precise event.
>
> It should be always the same (one instruction), except for the control flow
> change case.
>
That's on Intel X86. What about the other arch?
> -Andi