2015-02-19 12:13:11

by Adrian Hunter

[permalink] [raw]
Subject: [PATCH 0/2] perf/x86: Add ability to sample TSC

Hi

With the advent of switching perf_clock to CLOCK_MONOTONIC,
it will not be possible to convert perf_clock directly to/from
TSC. So add the ability to sample TSC instead.


Adrian Hunter (2):
perf: Sample additional clock value
perf/x86: Provide TSC for PERF_SAMPLE_CLOCK_ARCH

arch/x86/include/asm/perf_event.h | 6 ++++++
arch/x86/kernel/cpu/perf_event.c | 10 ++++++++++
include/linux/perf_event.h | 3 ++-
include/uapi/linux/perf_event.h | 19 +++++++++++++++++--
kernel/events/core.c | 30 ++++++++++++++++++++++++++++++
kernel/events/internal.h | 4 ++++
6 files changed, 69 insertions(+), 3 deletions(-)


Regards
Adrian


2015-02-19 12:13:15

by Adrian Hunter

[permalink] [raw]
Subject: [PATCH 1/2] perf: Sample additional clock value

Add PERF_SAMPLE_CLOCK to sample some other clock.
The patch allows for 16 possible clock selections
with the only initial possibility an architecture-specific
clock which will be used for TSC on x86.

Although there are only 16 possible clock selections,
it is envisioned that POSIX clock ids would be a
single selection, with the actual clock id provided
in another perf_event_attr member.

Based-on-patch-by: Pawel Moll <[email protected]>
Signed-off-by: Adrian Hunter <[email protected]>
---
include/linux/perf_event.h | 3 ++-
include/uapi/linux/perf_event.h | 19 +++++++++++++++++--
kernel/events/core.c | 30 ++++++++++++++++++++++++++++++
kernel/events/internal.h | 4 ++++
4 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index efe2d2d..e86637e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -655,7 +655,7 @@ extern void perf_pmu_migrate_context(struct pmu *pmu,
int src_cpu, int dst_cpu);
extern u64 perf_event_read_value(struct perf_event *event,
u64 *enabled, u64 *running);
-
+u64 perf_sample_clock_arch(void);

struct perf_sample_data {
/*
@@ -687,6 +687,7 @@ struct perf_sample_data {
u32 cpu;
u32 reserved;
} cpu_entry;
+ u64 clock;
struct perf_callchain_entry *callchain;

/*
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index be9ff06..4d081d5 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -139,8 +139,9 @@ enum perf_event_sample_format {
PERF_SAMPLE_IDENTIFIER = 1U << 16,
PERF_SAMPLE_TRANSACTION = 1U << 17,
PERF_SAMPLE_REGS_INTR = 1U << 18,
+ PERF_SAMPLE_CLOCK = 1U << 19,

- PERF_SAMPLE_MAX = 1U << 19, /* non-ABI */
+ PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */
};

/*
@@ -228,6 +229,16 @@ enum {
};

/*
+ * Values to determine clock to sample.
+ */
+enum perf_sample_clock_type {
+ /* Architecture-specific clock (TSC on x86) */
+ PERF_SAMPLE_CLOCK_ARCH = 0,
+
+ PERF_SAMPLE_CLOCK_MAX /* non-ABI */
+};
+
+/*
* The format of the data returned by read() on a perf event fd,
* as specified by attr.read_format:
*
@@ -328,7 +339,9 @@ struct perf_event_attr {
exclude_callchain_user : 1, /* exclude user callchains */
mmap2 : 1, /* include mmap with inode data */
comm_exec : 1, /* flag comm events that are due to an exec */
- __reserved_1 : 39;
+ /* clock: see enum perf_sample_clock_type */
+ clock : 4, /* which clock */
+ __reserved_1 : 35;

union {
__u32 wakeup_events; /* wakeup every n events */
@@ -601,6 +614,7 @@ enum perf_event_type {
* { u64 id; } && PERF_SAMPLE_ID
* { u64 stream_id;} && PERF_SAMPLE_STREAM_ID
* { u32 cpu, res; } && PERF_SAMPLE_CPU
+ * { u64 clock; } && PERF_SAMPLE_CLOCK
* { u64 id; } && PERF_SAMPLE_IDENTIFIER
* } && perf_event_attr::sample_id_all
*
@@ -746,6 +760,7 @@ enum perf_event_type {
* { u64 transaction; } && PERF_SAMPLE_TRANSACTION
* { u64 abi; # enum perf_sample_regs_abi
* u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
+ * { u64 clock; } && PERF_SAMPLE_CLOCK
* };
*/
PERF_RECORD_SAMPLE = 9,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 799f034..0f6e7c8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1323,6 +1323,9 @@ static void perf_event__id_header_size(struct perf_event *event)
if (sample_type & PERF_SAMPLE_CPU)
size += sizeof(data->cpu_entry);

+ if (sample_type & PERF_SAMPLE_CLOCK)
+ size += sizeof(data->clock);
+
event->id_header_size = size;
}

@@ -4915,6 +4918,11 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
}
}

+u64 __weak perf_sample_clock_arch(void)
+{
+ return 0;
+}
+
static void __perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event)
@@ -4943,6 +4951,16 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
data->cpu_entry.cpu = raw_smp_processor_id();
data->cpu_entry.reserved = 0;
}
+
+ if (sample_type & PERF_SAMPLE_CLOCK) {
+ switch (event->attr.clock) {
+ case PERF_SAMPLE_CLOCK_ARCH:
+ data->clock = perf_sample_clock_arch();
+ break;
+ default:
+ data->clock = 0;
+ }
+ }
}

void perf_event_header__init_id(struct perf_event_header *header,
@@ -4973,6 +4991,9 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_CPU)
perf_output_put(handle, data->cpu_entry);

+ if (sample_type & PERF_SAMPLE_CLOCK)
+ perf_output_put(handle, data->clock);
+
if (sample_type & PERF_SAMPLE_IDENTIFIER)
perf_output_put(handle, data->id);
}
@@ -5218,6 +5239,9 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}

+ if (sample_type & PERF_SAMPLE_CLOCK)
+ perf_output_put(handle, data->clock);
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;

@@ -7632,6 +7656,12 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,

if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
ret = perf_reg_validate(attr->sample_regs_intr);
+
+ if ((attr->sample_type & PERF_SAMPLE_CLOCK) &&
+ (attr->clock >= PERF_SAMPLE_CLOCK_MAX ||
+ (!HAVE_PERF_SAMPLE_CLOCK_ARCH &&
+ attr->clock == PERF_SAMPLE_CLOCK_ARCH)))
+ return -EINVAL;
out:
return ret;

diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 9f6ce9b..c3dca1e 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -228,4 +228,8 @@ static inline bool arch_perf_have_user_stack_dump(void)
#define perf_user_stack_pointer(regs) 0
#endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */

+#ifndef HAVE_PERF_SAMPLE_CLOCK_ARCH
+#define HAVE_PERF_SAMPLE_CLOCK_ARCH 0
+#endif
+
#endif /* _KERNEL_EVENTS_INTERNAL_H */
--
1.9.1

2015-02-19 12:13:22

by Adrian Hunter

[permalink] [raw]
Subject: [PATCH 2/2] perf/x86: Provide TSC for PERF_SAMPLE_CLOCK_ARCH

Provide TSC for PERF_SAMPLE_CLOCK_ARCH. This is needed
to match perf events against hardware traces like
Intel PT.

Signed-off-by: Adrian Hunter <[email protected]>
---
arch/x86/include/asm/perf_event.h | 6 ++++++
arch/x86/kernel/cpu/perf_event.c | 10 ++++++++++
2 files changed, 16 insertions(+)

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index dc0f6ed..ced9b5f 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -261,6 +261,12 @@ struct perf_guest_switch_msr {
extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap);
extern void perf_check_microcode(void);
+
+#ifdef CONFIG_X86_TSC
+#define HAVE_PERF_SAMPLE_CLOCK_ARCH 1
+u64 perf_sample_clock_arch(void);
+#endif
+
#else
static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
{
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 8271d6b..23ae9d4 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2256,3 +2256,13 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
cap->events_mask_len = x86_pmu.events_mask_len;
}
EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
+
+#ifdef CONFIG_X86_TSC
+u64 perf_sample_clock_arch(void)
+{
+ u64 tsc;
+
+ rdtscll(tsc);
+ return tsc;
+}
+#endif
--
1.9.1

2015-02-19 13:50:31

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 0/2] perf/x86: Add ability to sample TSC

On Thu, Feb 19, 2015 at 02:11:08PM +0200, Adrian Hunter wrote:
> Hi
>
> With the advent of switching perf_clock to CLOCK_MONOTONIC,
> it will not be possible to convert perf_clock directly to/from
> TSC. So add the ability to sample TSC instead.

Well, you can, mostly. MONOTONIC is only affected by NTP slew rate
changes, not offset changes.

And NTP limits the slew rate to 500 PPM, so even if you would get a
slew change and then not update the userpage data for a second you'd be
maximally off by 0.0005 seconds.

And that is way below what the current perf clock guarantees on funny
hardware.

If you're really worried about this; we could maybe get John and Thomas
to allow us a callback on every slew change so we can update the
userpage data ASAP, much reducing the max error.

Say it takes a 10e5 cycles to update your userpage, then you're never
further off than 50 cycles, which is below your ART multiplier.

Does that really matter? Also, if you have a stable crystal, the slew
rate change should be minimal and infrequent, never getting you close to
these numbers.

So no, I'm not convinced we need this.

2015-02-19 14:41:18

by Adrian Hunter

[permalink] [raw]
Subject: Re: [PATCH 0/2] perf/x86: Add ability to sample TSC

On 19/02/15 15:50, Peter Zijlstra wrote:
> On Thu, Feb 19, 2015 at 02:11:08PM +0200, Adrian Hunter wrote:
>> Hi
>>
>> With the advent of switching perf_clock to CLOCK_MONOTONIC,
>> it will not be possible to convert perf_clock directly to/from
>> TSC. So add the ability to sample TSC instead.
>
> Well, you can, mostly. MONOTONIC is only affected by NTP slew rate
> changes, not offset changes.

man page says is also subject to adjtime(3)

>
> And NTP limits the slew rate to 500 PPM, so even if you would get a

Assuming it is not broken.

> slew change and then not update the userpage data for a second you'd be
> maximally off by 0.0005 seconds.

That could still be enough to break the decoder. It will certainly
misrepresent the order of events, which is a big loss of information.

>
> And that is way below what the current perf clock guarantees on funny
> hardware.
>
> If you're really worried about this; we could maybe get John and Thomas
> to allow us a callback on every slew change so we can update the
> userpage data ASAP, much reducing the max error.
>
> Say it takes a 10e5 cycles to update your userpage, then you're never
> further off than 50 cycles, which is below your ART multiplier.

You still need to wake up user space to read the userpage.

>
> Does that really matter? Also, if you have a stable crystal, the slew
> rate change should be minimal and infrequent, never getting you close to
> these numbers.
>
> So no, I'm not convinced we need this.

Adding TSC to the sample is a lot simpler and more accurate.

2015-02-19 15:06:03

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 0/2] perf/x86: Add ability to sample TSC

On Thu, Feb 19, 2015 at 04:38:57PM +0200, Adrian Hunter wrote:
> On 19/02/15 15:50, Peter Zijlstra wrote:
> > On Thu, Feb 19, 2015 at 02:11:08PM +0200, Adrian Hunter wrote:
> >> Hi
> >>
> >> With the advent of switching perf_clock to CLOCK_MONOTONIC,
> >> it will not be possible to convert perf_clock directly to/from
> >> TSC. So add the ability to sample TSC instead.
> >
> > Well, you can, mostly. MONOTONIC is only affected by NTP slew rate
> > changes, not offset changes.
>
> man page says is also subject to adjtime(3)

which is slew adjustment; read the adjtime manpage :-)

> > And NTP limits the slew rate to 500 PPM, so even if you would get a
>
> Assuming it is not broken.

NTP people are a cautious crowd, sure they get it wrong just like the
rest of us, but mostly it needs to work.

> > slew change and then not update the userpage data for a second you'd be
> > maximally off by 0.0005 seconds.
>
> That could still be enough to break the decoder. It will certainly
> misrepresent the order of events, which is a big loss of information.

What decoder? perf report is already subject to much larger shifts in
time if you run it on say a core2 machine.

> > And that is way below what the current perf clock guarantees on funny
> > hardware.
> >
> > If you're really worried about this; we could maybe get John and Thomas
> > to allow us a callback on every slew change so we can update the
> > userpage data ASAP, much reducing the max error.
> >
> > Say it takes a 10e5 cycles to update your userpage, then you're never
> > further off than 50 cycles, which is below your ART multiplier.
>
> You still need to wake up user space to read the userpage.

Uhm what? Userspace is already awake.

> > Does that really matter? Also, if you have a stable crystal, the slew
> > rate change should be minimal and infrequent, never getting you close to
> > these numbers.
> >
> > So no, I'm not convinced we need this.
>
> Adding TSC to the sample is a lot simpler and more accurate.

Finding multiple samples and interpolating between them is much simpler
than reading tsc and doing the mult, shift and offset addition?

I suspect you're talking about something else entirely; your changelogs
are inadequate for they tell ntohing of your usecase and have me
guessing. Don't do that.

2015-02-19 15:56:37

by Adrian Hunter

[permalink] [raw]
Subject: Re: [PATCH 0/2] perf/x86: Add ability to sample TSC

On 19/02/2015 5:05 p.m., Peter Zijlstra wrote:
> On Thu, Feb 19, 2015 at 04:38:57PM +0200, Adrian Hunter wrote:
>> On 19/02/15 15:50, Peter Zijlstra wrote:
>>> On Thu, Feb 19, 2015 at 02:11:08PM +0200, Adrian Hunter wrote:
>>>> Hi
>>>>
>>>> With the advent of switching perf_clock to CLOCK_MONOTONIC,
>>>> it will not be possible to convert perf_clock directly to/from
>>>> TSC. So add the ability to sample TSC instead.
>>>
>>> Well, you can, mostly. MONOTONIC is only affected by NTP slew rate
>>> changes, not offset changes.
>>
>> man page says is also subject to adjtime(3)
>
> which is slew adjustment; read the adjtime manpage :-)
>
>>> And NTP limits the slew rate to 500 PPM, so even if you would get a
>>
>> Assuming it is not broken.
>
> NTP people are a cautious crowd, sure they get it wrong just like the
> rest of us, but mostly it needs to work.
>
>>> slew change and then not update the userpage data for a second you'd be
>>> maximally off by 0.0005 seconds.
>>
>> That could still be enough to break the decoder. It will certainly
>> misrepresent the order of events, which is a big loss of information.
>
> What decoder? perf report is already subject to much larger shifts in
> time if you run it on say a core2 machine.

Any decoder of Intel PT data. Side-band events like sched_switch or mmap
have to be sync'ed with Intel PT TSC timestamps to decode the trace. But
synchronizing any kind of event could be useful for analysis.

>
>>> And that is way below what the current perf clock guarantees on funny
>>> hardware.
>>>
>>> If you're really worried about this; we could maybe get John and Thomas
>>> to allow us a callback on every slew change so we can update the
>>> userpage data ASAP, much reducing the max error.
>>>
>>> Say it takes a 10e5 cycles to update your userpage, then you're never
>>> further off than 50 cycles, which is below your ART multiplier.
>>
>> You still need to wake up user space to read the userpage.
>
> Uhm what? Userspace is already awake.

For Intel PT recording, perf record will be sleeping on poll().

>
>>> Does that really matter? Also, if you have a stable crystal, the slew
>>> rate change should be minimal and infrequent, never getting you close to
>>> these numbers.
>>>
>>> So no, I'm not convinced we need this.
>>
>> Adding TSC to the sample is a lot simpler and more accurate.
>
> Finding multiple samples and interpolating between them is much simpler
> than reading tsc and doing the mult, shift and offset addition?
>
> I suspect you're talking about something else entirely; your changelogs
> are inadequate for they tell ntohing of your usecase and have me
> guessing. Don't do that.

Sorry. I did mention Intel PT in patch 2, but I basically assumed the
need to synchronize events with other time sources was understood.

2015-02-19 17:24:19

by John Stultz

[permalink] [raw]
Subject: Re: [PATCH 0/2] perf/x86: Add ability to sample TSC

On Thu, Feb 19, 2015 at 4:11 AM, Adrian Hunter <[email protected]> wrote:
> Hi
>
> With the advent of switching perf_clock to CLOCK_MONOTONIC,
> it will not be possible to convert perf_clock directly to/from
> TSC. So add the ability to sample TSC instead.
>
>
> Adrian Hunter (2):
> perf: Sample additional clock value
> perf/x86: Provide TSC for PERF_SAMPLE_CLOCK_ARCH

This doesn't seem very portable. The CLOCK_MONOTONIC_RAW clockid was
added to provide a arch-neutral abstraction of a free-running hardware
counter that isn't affected by adjtimex slewing (though like any
counter, it will be affected by non-constant drift).

You might consider looking at that if the short term slew adjustments
(which result in more accurate timings in the long term) are
problematic for you.

thanks
-john

2015-02-19 17:40:37

by Adrian Hunter

[permalink] [raw]
Subject: Re: [PATCH 0/2] perf/x86: Add ability to sample TSC

On 19/02/2015 7:24 p.m., John Stultz wrote:
> On Thu, Feb 19, 2015 at 4:11 AM, Adrian Hunter <[email protected]> wrote:
>> Hi
>>
>> With the advent of switching perf_clock to CLOCK_MONOTONIC,
>> it will not be possible to convert perf_clock directly to/from
>> TSC. So add the ability to sample TSC instead.
>>
>>
>> Adrian Hunter (2):
>> perf: Sample additional clock value
>> perf/x86: Provide TSC for PERF_SAMPLE_CLOCK_ARCH
>
> This doesn't seem very portable. The CLOCK_MONOTONIC_RAW clockid was
> added to provide a arch-neutral abstraction of a free-running hardware
> counter that isn't affected by adjtimex slewing (though like any
> counter, it will be affected by non-constant drift).
>
> You might consider looking at that if the short term slew adjustments
> (which result in more accurate timings in the long term) are
> problematic for you.

This is for Intel Processor Trace - which Peter has already
rightly chastised me for not making plain.

Intel Processor Trace (Intel PT) is a trace that is hardware generated. The
hardware does not know about linux or its clocks, so the timestamps
are TSC. I think ARM might have the same issue with ETM or such. i.e. the
need to synchronize a hardware timestamp with a perf event.

There is a description of Intel PT in the Intel Architecture manuals.

http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html

2015-02-19 17:41:06

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 0/2] perf/x86: Add ability to sample TSC

> > And NTP limits the slew rate to 500 PPM, so even if you would get a

Assuming you run NTP

>
> Assuming it is not broken.

It could also easily switch to HPET if the kernel decides the TSC has some
issues (which may be true or not).

The switching of the perf clock to MONOTONIC is a bad idea imho.

-Andi

2015-02-19 17:50:23

by John Stultz

[permalink] [raw]
Subject: Re: [PATCH 0/2] perf/x86: Add ability to sample TSC

On Thu, Feb 19, 2015 at 9:40 AM, Adrian Hunter <[email protected]> wrote:
> On 19/02/2015 7:24 p.m., John Stultz wrote:
>>
>> On Thu, Feb 19, 2015 at 4:11 AM, Adrian Hunter <[email protected]>
>> wrote:
>>>
>>> Hi
>>>
>>> With the advent of switching perf_clock to CLOCK_MONOTONIC,
>>> it will not be possible to convert perf_clock directly to/from
>>> TSC. So add the ability to sample TSC instead.
>>>
>>>
>>> Adrian Hunter (2):
>>> perf: Sample additional clock value
>>> perf/x86: Provide TSC for PERF_SAMPLE_CLOCK_ARCH
>>
>>
>> This doesn't seem very portable. The CLOCK_MONOTONIC_RAW clockid was
>> added to provide a arch-neutral abstraction of a free-running hardware
>> counter that isn't affected by adjtimex slewing (though like any
>> counter, it will be affected by non-constant drift).
>>
>> You might consider looking at that if the short term slew adjustments
>> (which result in more accurate timings in the long term) are
>> problematic for you.
>
>
> This is for Intel Processor Trace - which Peter has already
> rightly chastised me for not making plain.
>
> Intel Processor Trace (Intel PT) is a trace that is hardware generated. The
> hardware does not know about linux or its clocks, so the timestamps
> are TSC. I think ARM might have the same issue with ETM or such. i.e. the
> need to synchronize a hardware timestamp with a perf event.
>
> There is a description of Intel PT in the Intel Architecture manuals.
>
> http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html

Cc'ing Mathieu since he might be familiar w/ any similar issues w/ Coresight.

thanks
-john

2015-02-19 17:58:50

by Pawel Moll

[permalink] [raw]
Subject: Re: [PATCH 0/2] perf/x86: Add ability to sample TSC

On Thu, 2015-02-19 at 17:50 +0000, John Stultz wrote:
> On Thu, Feb 19, 2015 at 9:40 AM, Adrian Hunter <[email protected]> wrote:
> > On 19/02/2015 7:24 p.m., John Stultz wrote:
> >>
> >> On Thu, Feb 19, 2015 at 4:11 AM, Adrian Hunter <[email protected]>
> >> wrote:
> >>>
> >>> Hi
> >>>
> >>> With the advent of switching perf_clock to CLOCK_MONOTONIC,
> >>> it will not be possible to convert perf_clock directly to/from
> >>> TSC. So add the ability to sample TSC instead.
> >>>
> >>>
> >>> Adrian Hunter (2):
> >>> perf: Sample additional clock value
> >>> perf/x86: Provide TSC for PERF_SAMPLE_CLOCK_ARCH
> >>
> >>
> >> This doesn't seem very portable. The CLOCK_MONOTONIC_RAW clockid was
> >> added to provide a arch-neutral abstraction of a free-running hardware
> >> counter that isn't affected by adjtimex slewing (though like any
> >> counter, it will be affected by non-constant drift).
> >>
> >> You might consider looking at that if the short term slew adjustments
> >> (which result in more accurate timings in the long term) are
> >> problematic for you.
> >
> >
> > This is for Intel Processor Trace - which Peter has already
> > rightly chastised me for not making plain.
> >
> > Intel Processor Trace (Intel PT) is a trace that is hardware generated. The
> > hardware does not know about linux or its clocks, so the timestamps
> > are TSC. I think ARM might have the same issue with ETM or such. i.e. the
> > need to synchronize a hardware timestamp with a perf event.
> >
> > There is a description of Intel PT in the Intel Architecture manuals.
> >
> > http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html
>
> Cc'ing Mathieu since he might be familiar w/ any similar issues w/ Coresight.

We haven't got to this yet, but it was discussed (briefly) at Plumbers
in Dusseldorf. In our case the CS processor trace can be timestamped
from yet another clock source, probably hidden behind memory mapped
register. Thus the additional time sample of some sort will be required
in some form.

Peter pointed out that there may be problem with obtaining the "other"
timestamps in NMI context, so I was thinking about injecting the
synchronisation points as separate records:

http://article.gmane.org/gmane.linux.kernel.api/7726/match=perf+sample+additional+clock+value

but maybe I'm making it too complicated and what Adrian did, explicitly
defining possible timestamps at perf_event_attr level instead of
relating them to to posix clock ids is the way to go. No strong opinion
here.

Pawel

2015-02-19 18:01:51

by Pawel Moll

[permalink] [raw]
Subject: Re: [PATCH 0/2] perf/x86: Add ability to sample TSC

On Thu, 2015-02-19 at 17:58 +0000, Pawel Moll wrote:
> and what Adrian did, explicitly
> defining possible timestamps at perf_event_attr level instead of
> relating them to to posix clock ids is the way to go. No strong opinion
> here.

One note here: I'd rather make it "processor trace clock", rather than
"architecturally defined" one. That way you relate it directly to the
time domain and allow userspace decoder to as for it in a
architecturally independent way (yes, I'm assuming that there will be
only one "type of processor trace in a system :-)

Pawel

2015-02-19 19:13:27

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 0/2] perf/x86: Add ability to sample TSC


* Adrian Hunter <[email protected]> wrote:

> > I suspect you're talking about something else entirely;
> > your changelogs are inadequate for they tell ntohing of
> > your usecase and have me guessing. Don't do that.
>
> Sorry. I did mention Intel PT in patch 2, but I basically
> assumed the need to synchronize events with other time
> sources was understood.

Written down, slightly repetitive walk-throughs from first
concepts to implementation that don't assume too much
context are very useful to us slightly overworked
maintainers with an attention span of a slightly retarded
golden retriever. (Make the story interesting and mix in a
few jokes and you've got our attention for sure!)

So please do explain verbosely, especially if a review
discussion turns somewhat discordant, it's a useful tool.

Thanks,

Ingo