Restore Intel LBR call stack from cloned inactive task perf context on
a context switch. This change inherently addresses inconsistency in LBR
call stack data provided on a sample in record profiling mode:
$ perf record -N -B -T -R --call-graph lbr \
-e cpu/period=0xcdfe60,event=0x3c,name=\'CPU_CLK_UNHALTED.THREAD\'/Duk \
--clockid=monotonic_raw -- ./miniFE.x nx 25 ny 25 nz 25
Let's assume threads A, B, C belonging to the same process.
B and C are siblings of A and their perf contexts are treated as equivalent.
At some point B blocks on a futex (non preempt context switch).
B's LBRs are preserved at B's perf context task_ctx_data and B's events
are removed from PMU and disabled. B's perf context becomes inactive.
Later C gets on a cpu, runs, gets profiled and eventually switches to
the awaken but not yet running B. The optimized context switch path is
executed swapping B's and C's task_ctx_data pointers at perf event contexts.
So C's task_ctx_data will refer preserved B's LBRs on the following
switch-in event.
However, as far B's perf context is inactive there is no enabled events
in there and B's task_ctx_data->lbr_callstack_users is equal to 0.
When B gets on the cpu B's events reviving is skipped following
the optimized context switch path and B's task_ctx_data->lbr_callstack_users
remains 0. Thus B's LBR's are not restored by pmu sched_task() code called
in the end of perf context switch-in callback for B.
In the report that manifests as having short fragments of B's
call stack, still tracked by LBR's HW between adjacent samples,
but the whole thread call tree doesn't aggregate.
The fix has been evaluated when profiling miniFE [1] (C++, OpenMP)
workload running 64 threads on Intel Skylake EP(64 core, 2 sockets):
$ perf report --call-graph callee,flat
5.3.0-rc6+ (tip perf/core) - fixed
- 92.66% 82.64% miniFE.x libiomp5.so [.] _INTERNAL_25_______src_kmp_barrier_cpp_1d20fae8::__kmp_hyper_barrier_release
- 69.14% _INTERNAL_25_______src_kmp_barrier_cpp_1d20fae8::__kmp_hyper_barrier_release
__kmp_fork_barrier
__kmp_launch_thread
_INTERNAL_24_______src_z_Linux_util_c_3e0095e6::__kmp_launch_worker
start_thread
__clone
- 21.89% _INTERNAL_25_______src_kmp_barrier_cpp_1d20fae8::__kmp_hyper_barrier_release
__kmp_barrier
__kmpc_reduce_nowait
miniFE::cg_solve<miniFE::CSRMatrix<double, int, int>, miniFE::Vector<double, int, int>, miniFE::matvec_std<miniFE::CSRMatrix<double, int, int>, miniFE::Vector<double, int, in
__kmp_invoke_microtask
__kmp_invoke_task_func
__kmp_launch_thread
_INTERNAL_24_______src_z_Linux_util_c_3e0095e6::__kmp_launch_worker
start_thread
__clone
- 1.63% _INTERNAL_25_______src_kmp_barrier_cpp_1d20fae8::__kmp_hyper_barrier_release
__kmp_barrier
__kmpc_reduce_nowait
main
__kmp_invoke_microtask
__kmp_invoke_task_func
__kmp_launch_thread
_INTERNAL_24_______src_z_Linux_util_c_3e0095e6::__kmp_launch_worker
start_thread
__clone
5.0.13-300.fc30.x86_64 - no fix
- 90.29% 81.01% miniFE.x libiomp5.so [.] _INTERNAL_25_______src_kmp_barrier_cpp_1d20fae8::__kmp_hyper_barrier_release
- 33.45% _INTERNAL_25_______src_kmp_barrier_cpp_1d20fae8::__kmp_hyper_barrier_release
__kmp_fork_barrier
__kmp_launch_thread
_INTERNAL_24_______src_z_Linux_util_c_3e0095e6::__kmp_launch_worker
start_thread
__clone
87.63% _INTERNAL_25_______src_kmp_barrier_cpp_1d20fae8::__kmp_hyper_barrier_release
- 54.79% _INTERNAL_25_______src_kmp_barrier_cpp_1d20fae8::__kmp_hyper_barrier_release
__kmp_fork_barrier
__kmp_launch_thread
- 9.18% _INTERNAL_25_______src_kmp_barrier_cpp_1d20fae8::__kmp_hyper_barrier_release
__kmp_barrier
__kmpc_reduce_nowait
miniFE::cg_solve<miniFE::CSRMatrix<double, int, int>, miniFE::Vector<double, int, int>, miniFE::matvec_std<miniFE::CSRMatrix<double, int, int>, miniFE::Vector<double, int, in
__kmp_invoke_microtask
__kmp_invoke_task_func
__kmp_launch_thread
_INTERNAL_24_______src_z_Linux_util_c_3e0095e6::__kmp_launch_worker
start_thread
__clone
- 41.28% _INTERNAL_25_______src_kmp_barrier_cpp_1d20fae8::__kmp_hyper_barrier_release
__kmp_fork_barrier
__kmp_launch_thread
_INTERNAL_24_______src_z_Linux_util_c_3e0095e6::__kmp_launch_worker
- 15.77% _INTERNAL_25_______src_kmp_barrier_cpp_1d20fae8::__kmp_hyper_barrier_release
__kmp_barrier
__kmpc_reduce_nowait
miniFE::cg_solve<miniFE::CSRMatrix<double, int, int>, miniFE::Vector<double, int, int>, miniFE::matvec_std<miniFE::CSRMatrix<double, int, int>, miniFE::Vector<double, int, in
__kmp_invoke_microtask
__kmp_invoke_task_func
__kmp_launch_thread
- 11.56% _INTERNAL_25_______src_kmp_barrier_cpp_1d20fae8::__kmp_hyper_barrier_release
__kmp_barrier
__kmpc_reduce_nowait
miniFE::cg_solve<miniFE::CSRMatrix<double, int, int>, miniFE::Vector<double, int, int>, miniFE::matvec_std<miniFE::CSRMatrix<double, int, int>, miniFE::Vector<double, int, in
__kmp_invoke_microtask
__kmp_invoke_task_func
__kmp_launch_thread
_INTERNAL_24_______src_z_Linux_util_c_3e0095e6::__kmp_launch_worker
- 2.33% _INTERNAL_25_______src_kmp_barrier_cpp_1d20fae8::__kmp_hyper_barrier_release
__kmp_barrier
__kmpc_reduce_nowait
main
__kmp_invoke_microtask
__kmp_invoke_task_func
__kmp_launch_thread
_INTERNAL_24_______src_z_Linux_util_c_3e0095e6::__kmp_launch_worker
start_thread
__clone
0.67% _INTERNAL_25_______src_kmp_barrier_cpp_1d20fae8::__kmp_hyper_barrier_gather
0.57% __kmp_hardware_timestamp
[1] https://www.hpcadvisorycouncil.com/pdf/miniFE_Analysis_and_Profiling.pdf
---
Alexey Budankov (4):
perf/core,x86: introduce sync_task_ctx() method at struct pmu
perf/x86: install platform specific sync_task_ctx adapter
perf/x86/intel: implement LBR callstacks context synchronization
perf/core,x86: synchronize PMU task contexts on optimized context
switches
arch/x86/events/core.c | 7 +++++++
arch/x86/events/intel/core.c | 7 +++++++
arch/x86/events/intel/lbr.c | 9 +++++++++
arch/x86/events/perf_event.h | 11 +++++++++++
include/linux/perf_event.h | 7 +++++++
kernel/events/core.c | 9 +++++++++
6 files changed, 50 insertions(+)
---
Changes in v2:
- implemented sync_task_ctx() method at perf,x86,intel pmu types;
- employed the method on the optimized context switch path between
equivalent perf event contexts;
--
2.20.1
Declare sync_task_ctx() methods at the generic and x86 specific
pmu types to bridge calls to platform specific pmu code on optimized
context switch path between equivalent task perf event contexts.
Signed-off-by: Alexey Budankov <[email protected]>
---
arch/x86/events/perf_event.h | 8 ++++++++
include/linux/perf_event.h | 7 +++++++
2 files changed, 15 insertions(+)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index ecacfbf4ebc1..a25e6d7eb87b 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -682,6 +682,14 @@ struct x86_pmu {
*/
atomic_t lbr_exclusive[x86_lbr_exclusive_max];
+ /*
+ * perf task context (i.e. struct perf_event_context::task_ctx_data) switch helper
+ * to bridge calls from perf/core to perf/x86. See struct pmu::sync_task_ctx() usage
+ * for examples;
+ */
+ void (*sync_task_ctx)(struct x86_perf_task_context *one,
+ struct x86_perf_task_context *another);
+
/*
* AMD bits
*/
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 61448c19a132..60bf17af69f0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -409,6 +409,13 @@ struct pmu {
*/
size_t task_ctx_size;
+ /*
+ * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data)
+ * can be synchronized using this function. See Intel LBR callstack support
+ * implementation and Perf core context switch handling callbacks for usage
+ * examples.
+ */
+ void (*sync_task_ctx) (void *one, void *another);
/*
* Set up pmu-private data structures for an AUX area
--
2.20.1
Bridge perf core and x86 sync_task_ctx() method calls.
Signed-off-by: Alexey Budankov <[email protected]>
---
arch/x86/events/core.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 15b90b1a8fb1..2c293bbd093f 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2243,6 +2243,12 @@ static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
x86_pmu.sched_task(ctx, sched_in);
}
+static void x86_pmu_sync_task_ctx(void *one, void *another)
+{
+ if (x86_pmu.sync_task_ctx)
+ x86_pmu.sync_task_ctx(one, another);
+}
+
void perf_check_microcode(void)
{
if (x86_pmu.check_microcode)
@@ -2297,6 +2303,7 @@ static struct pmu pmu = {
.event_idx = x86_pmu_event_idx,
.sched_task = x86_pmu_sched_task,
.task_ctx_size = sizeof(struct x86_perf_task_context),
+ .sync_task_ctx = x86_pmu_sync_task_ctx,
.check_period = x86_pmu_check_period,
.aux_output_match = x86_pmu_aux_output_match,
--
2.20.1
Implement intel_pmu_lbr_sync_task_ctx() method that updates counter
of the events that requested LBR callstack data on a sample.
The counter can be zero for the case when task context belongs to
a thread that has just come from a block on a futex and the context
contains saved (lbr_stack_state == LBR_VALID) LBR register values.
For the values to be restored at LBR registers on the next thread's
switch-in event it copies the counter value that is expected to be
non zero from the previous equivalent task perf event context.
Signed-off-by: Alexey Budankov <[email protected]>
---
arch/x86/events/intel/lbr.c | 9 +++++++++
arch/x86/events/perf_event.h | 3 +++
2 files changed, 12 insertions(+)
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index ea54634eabf3..152a3f8b516a 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -417,6 +417,15 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
cpuc->last_log_id = ++task_ctx->log_id;
}
+void intel_pmu_lbr_sync_task_ctx(struct x86_perf_task_context *one,
+ struct x86_perf_task_context *another)
+{
+ if (!one || !another)
+ return;
+
+ one->lbr_callstack_users = another->lbr_callstack_users;
+}
+
void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index a25e6d7eb87b..3e0087c06fc9 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1024,6 +1024,9 @@ void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr);
void intel_ds_init(void);
+void intel_pmu_lbr_sync_task_ctx(struct x86_perf_task_context *one,
+ struct x86_perf_task_context *another);
+
void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
u64 lbr_from_signext_quirk_wr(u64 val);
--
2.20.1
Install Intel specific PMU task context synchronization adapter and
extend optimized context switch path with PMU specific task context
synchronization to fix LBR callstack virtualization on context switches.
Signed-off-by: Alexey Budankov <[email protected]>
---
arch/x86/events/intel/core.c | 7 +++++++
kernel/events/core.c | 9 +++++++++
2 files changed, 16 insertions(+)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 43c966d1208e..7cfa658cce4b 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3819,6 +3819,12 @@ static void intel_pmu_sched_task(struct perf_event_context *ctx,
intel_pmu_lbr_sched_task(ctx, sched_in);
}
+static void intel_pmu_sync_task_ctx(struct x86_perf_task_context *one,
+ struct x86_perf_task_context *another)
+{
+ intel_pmu_lbr_sync_task_ctx(one, another);
+}
+
static int intel_pmu_check_period(struct perf_event *event, u64 value)
{
return intel_pmu_has_bts_period(event, value) ? -EINVAL : 0;
@@ -3954,6 +3960,7 @@ static __initconst const struct x86_pmu intel_pmu = {
.guest_get_msrs = intel_guest_get_msrs,
.sched_task = intel_pmu_sched_task,
+ .sync_task_ctx = intel_pmu_sync_task_ctx,
.check_period = intel_pmu_check_period,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2aad959e6def..3c7edd8454ef 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3204,11 +3204,20 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_lock(&ctx->lock);
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {
+ struct pmu *pmu = ctx->pmu;
+
WRITE_ONCE(ctx->task, next);
WRITE_ONCE(next_ctx->task, task);
swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
+ /*
+ * PMU specific parts of task perf context may require
+ * additional synchronization, at least for proper Intel
+ * LBR callstack data profiling;
+ */
+ pmu->sync_task_ctx(ctx->task_ctx_data,
+ next_ctx->task_ctx_data);
/*
* RCU_INIT_POINTER here is safe because we've not
* modified the ctx and the above modification of
--
2.20.1
On 10/16/2019 5:50 AM, Alexey Budankov wrote:
>
> Implement intel_pmu_lbr_sync_task_ctx() method that updates counter
> of the events that requested LBR callstack data on a sample.
>
> The counter can be zero for the case when task context belongs to
> a thread that has just come from a block on a futex and the context
> contains saved (lbr_stack_state == LBR_VALID) LBR register values.
>
> For the values to be restored at LBR registers on the next thread's
> switch-in event it copies the counter value that is expected to be
> non zero from the previous equivalent task perf event context.
>
> Signed-off-by: Alexey Budankov <[email protected]>
> ---
> arch/x86/events/intel/lbr.c | 9 +++++++++
> arch/x86/events/perf_event.h | 3 +++
> 2 files changed, 12 insertions(+)
>
> diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
> index ea54634eabf3..152a3f8b516a 100644
> --- a/arch/x86/events/intel/lbr.c
> +++ b/arch/x86/events/intel/lbr.c
> @@ -417,6 +417,15 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
> cpuc->last_log_id = ++task_ctx->log_id;
> }
>
> +void intel_pmu_lbr_sync_task_ctx(struct x86_perf_task_context *one,
> + struct x86_perf_task_context *another)
> +{
> + if (!one || !another)
> + return;
> +
> + one->lbr_callstack_users = another->lbr_callstack_users;
We may want to swap here?
Thanks,
Kan
> +}
> +
> void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
> {
> struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
> index a25e6d7eb87b..3e0087c06fc9 100644
> --- a/arch/x86/events/perf_event.h
> +++ b/arch/x86/events/perf_event.h
> @@ -1024,6 +1024,9 @@ void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr);
>
> void intel_ds_init(void);
>
> +void intel_pmu_lbr_sync_task_ctx(struct x86_perf_task_context *one,
> + struct x86_perf_task_context *another);
> +
> void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
>
> u64 lbr_from_signext_quirk_wr(u64 val);
>
On 16.10.2019 15:20, Liang, Kan wrote:
>
>
> On 10/16/2019 5:50 AM, Alexey Budankov wrote:
>>
>> Implement intel_pmu_lbr_sync_task_ctx() method that updates counter
>> of the events that requested LBR callstack data on a sample.
>>
>> The counter can be zero for the case when task context belongs to
>> a thread that has just come from a block on a futex and the context
>> contains saved (lbr_stack_state == LBR_VALID) LBR register values.
>>
>> For the values to be restored at LBR registers on the next thread's
>> switch-in event it copies the counter value that is expected to be
>> non zero from the previous equivalent task perf event context.
>>
>> Signed-off-by: Alexey Budankov <[email protected]>
>> ---
>> arch/x86/events/intel/lbr.c | 9 +++++++++
>> arch/x86/events/perf_event.h | 3 +++
>> 2 files changed, 12 insertions(+)
>>
>> diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
>> index ea54634eabf3..152a3f8b516a 100644
>> --- a/arch/x86/events/intel/lbr.c
>> +++ b/arch/x86/events/intel/lbr.c
>> @@ -417,6 +417,15 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
>> cpuc->last_log_id = ++task_ctx->log_id;
>> }
>> +void intel_pmu_lbr_sync_task_ctx(struct x86_perf_task_context *one,
>> + struct x86_perf_task_context *another)
>> +{
>> + if (!one || !another)
>> + return;
>> +
>> + one->lbr_callstack_users = another->lbr_callstack_users;
>
> We may want to swap here?
Well, in this particular case lbr_callstack_users has to stay consistent
with the amount of events in task perf event context that requested
LBR callstack.
~Alexey
>
> Thanks,
> Kan
>
>> +}
>> +
>> void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
>> {
>> struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
>> diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
>> index a25e6d7eb87b..3e0087c06fc9 100644
>> --- a/arch/x86/events/perf_event.h
>> +++ b/arch/x86/events/perf_event.h
>> @@ -1024,6 +1024,9 @@ void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr);
>> void intel_ds_init(void);
>> +void intel_pmu_lbr_sync_task_ctx(struct x86_perf_task_context *one,
>> + struct x86_perf_task_context *another);
>> +
>> void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
>> u64 lbr_from_signext_quirk_wr(u64 val);
>>
>
On 16.10.2019 16:39, Alexey Budankov wrote:
> On 16.10.2019 15:20, Liang, Kan wrote:
>>
>>
>> On 10/16/2019 5:50 AM, Alexey Budankov wrote:
>>>
>>> Implement intel_pmu_lbr_sync_task_ctx() method that updates counter
>>> of the events that requested LBR callstack data on a sample.
>>>
>>> The counter can be zero for the case when task context belongs to
>>> a thread that has just come from a block on a futex and the context
>>> contains saved (lbr_stack_state == LBR_VALID) LBR register values.
>>>
>>> For the values to be restored at LBR registers on the next thread's
>>> switch-in event it copies the counter value that is expected to be
>>> non zero from the previous equivalent task perf event context.
>>>
>>> Signed-off-by: Alexey Budankov <[email protected]>
>>> ---
>>> arch/x86/events/intel/lbr.c | 9 +++++++++
>>> arch/x86/events/perf_event.h | 3 +++
>>> 2 files changed, 12 insertions(+)
>>>
>>> diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
>>> index ea54634eabf3..152a3f8b516a 100644
>>> --- a/arch/x86/events/intel/lbr.c
>>> +++ b/arch/x86/events/intel/lbr.c
>>> @@ -417,6 +417,15 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
>>> cpuc->last_log_id = ++task_ctx->log_id;
>>> }
>>> +void intel_pmu_lbr_sync_task_ctx(struct x86_perf_task_context *one,
>>> + struct x86_perf_task_context *another)
>>> +{
>>> + if (!one || !another)
>>> + return;
>>> +
>>> + one->lbr_callstack_users = another->lbr_callstack_users;
>>
>> We may want to swap here?
>
> Well, in this particular case lbr_callstack_users has to stay consistent
> with the amount of events in task perf event context that requested
> LBR callstack.
Tested swap version and it also fixes the initial issue.
After more code revising swap looks like the correct operation here.
Nice catch. Thanks!
~Alexey
>
> ~Alexey
>
>>
>> Thanks,
>> Kan
>>
>>> +}
>>> +
>>> void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
>>> {
>>> struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
>>> diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
>>> index a25e6d7eb87b..3e0087c06fc9 100644
>>> --- a/arch/x86/events/perf_event.h
>>> +++ b/arch/x86/events/perf_event.h
>>> @@ -1024,6 +1024,9 @@ void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr);
>>> void intel_ds_init(void);
>>> +void intel_pmu_lbr_sync_task_ctx(struct x86_perf_task_context *one,
>>> + struct x86_perf_task_context *another);
>>> +
>>> void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
>>> u64 lbr_from_signext_quirk_wr(u64 val);
>>>
>>
>