Hello,
It was reported that system-wide events with precise_ip set have a lot
of unknown symbols on Intel machines. Depending on the system load I
can see more than 30% of total symbols are not resolved (actually
don't have DSO mappings).
I found that it happens only when large PEBS is enabled - using
call-graph or the frequency mode will disable large PEBS and produce
valid results. I've verified it by checking whether
intel_pmu_pebs_sched_task() is called, like below:
# perf probe -a intel_pmu_pebs_sched_task
# perf stat -a -e probe:intel_pmu_pebs_sched_task \
> perf record -a -e cycles:ppp -c 100001 sleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 2.625 MB perf.data (10345 samples) ]
Performance counter stats for 'system wide':
0 probe:intel_pmu_pebs_sched_task
2.157533991 seconds time elapsed
Looking at the code, I found out that the pmu::sched_task callback was
changed recently so that it's called only for task events. So cpu events
with large PEBS didn't flush the buffer, and the samples were attributed
to unrelated tasks later, resulting in unresolved symbols.
This series reverts that change but keeps the optimization for task
events. While at it, I also found that the context switch callback was
not enabled for cpu events from the beginning, so I've added that too.
With this applied, I can see the above callbacks are hit as expected
and perf report has valid symbols.
Thanks
Namhyung
Namhyung Kim (2):
perf/core: Enable sched_task callbacks if PMU has it
perf/core: Invoke pmu::sched_task callback for per-cpu events
include/linux/perf_event.h | 1 +
kernel/events/core.c | 42 ++++++++++++++++++++++++++++++++++++--
2 files changed, 41 insertions(+), 2 deletions(-)
--
2.29.1.341.ge80a0c044ae-goog
Commit 44fae179ce73 ("perf/core: Pull pmu::sched_task() into
perf_event_context_sched_out()") moved the pmu::sched_task callback so
that it is called only for the task event context. But it missed
calling it for per-cpu events to flush PMU internal buffers (i.e. for
PEBS, ...).
This commit basically reverts that change, but keeps the optimization
for task events and only adds the missing calls for cpu events.
Fixes: 44fae179ce73 ("perf/core: Pull pmu::sched_task() into perf_event_context_sched_out()")
Signed-off-by: Namhyung Kim <[email protected]>
---
include/linux/perf_event.h | 1 +
kernel/events/core.c | 38 ++++++++++++++++++++++++++++++++++++--
2 files changed, 37 insertions(+), 2 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0defb526cd0c..abb70a557cb5 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -872,6 +872,7 @@ struct perf_cpu_context {
struct list_head cgrp_cpuctx_entry;
#endif
+ struct list_head sched_cb_entry;
int sched_cb_usage;
int online;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index aaa0155c4142..c04d9a913537 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -383,6 +383,7 @@ static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static atomic_t nr_mmap_events __read_mostly;
@@ -3480,11 +3481,16 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
}
}
+static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+
void perf_sched_cb_dec(struct pmu *pmu)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- --cpuctx->sched_cb_usage;
+ this_cpu_dec(perf_sched_cb_usages);
+
+ if (!--cpuctx->sched_cb_usage)
+ list_del(&cpuctx->sched_cb_entry);
}
@@ -3492,7 +3498,10 @@ void perf_sched_cb_inc(struct pmu *pmu)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- cpuctx->sched_cb_usage++;
+ if (!cpuctx->sched_cb_usage++)
+ list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+
+ this_cpu_inc(perf_sched_cb_usages);
}
/*
@@ -3521,6 +3530,24 @@ static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
+static void perf_pmu_sched_task(struct task_struct *prev,
+ struct task_struct *next,
+ bool sched_in)
+{
+ struct perf_cpu_context *cpuctx;
+
+ if (prev == next)
+ return;
+
+ list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
+ /* will be handled in perf_event_context_sched_in/out */
+ if (cpuctx->task_ctx)
+ continue;
+
+ __perf_pmu_sched_task(cpuctx, sched_in);
+ }
+}
+
static void perf_event_switch(struct task_struct *task,
struct task_struct *next_prev, bool sched_in);
@@ -3543,6 +3570,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
{
int ctxn;
+ if (__this_cpu_read(perf_sched_cb_usages))
+ perf_pmu_sched_task(task, next, false);
+
if (atomic_read(&nr_switch_events))
perf_event_switch(task, next, false);
@@ -3850,6 +3880,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
if (atomic_read(&nr_switch_events))
perf_event_switch(task, prev, true);
+
+ if (__this_cpu_read(perf_sched_cb_usages))
+ perf_pmu_sched_task(prev, task, true);
}
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -12999,6 +13032,7 @@ static void __init perf_event_init_all_cpus(void)
#ifdef CONFIG_CGROUP_PERF
INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
#endif
+ INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
}
}
--
2.29.1.341.ge80a0c044ae-goog
If an event is associated with a PMU which has a sched_task callback,
the callback should be called regardless of cpu/task context. For
example, a per-cpu event might enable large PEBS buffers, so it needs
to flush the buffer whenever task scheduling happens.
The underlying PMU may or may not require this for the given event,
but that will be handled in the pmu::sched_task() callback anyway.
Signed-off-by: Namhyung Kim <[email protected]>
---
kernel/events/core.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b458ed3dc81b..aaa0155c4142 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4696,6 +4696,8 @@ static void unaccount_event(struct perf_event *event)
dec = true;
if (has_branch_stack(event))
dec = true;
+ if (event->pmu->sched_task)
+ dec = true;
if (event->attr.ksymbol)
atomic_dec(&nr_ksymbol_events);
if (event->attr.bpf_event)
@@ -11225,6 +11227,8 @@ static void account_event(struct perf_event *event)
inc = true;
if (is_cgroup_event(event))
inc = true;
+ if (event->pmu->sched_task)
+ inc = true;
if (event->attr.ksymbol)
atomic_inc(&nr_ksymbol_events);
if (event->attr.bpf_event)
--
2.29.1.341.ge80a0c044ae-goog
On Mon, Nov 2, 2020 at 6:52 AM Namhyung Kim <[email protected]> wrote:
>
> Hello,
>
> It was reported that system-wide events with precise_ip set have a lot
> of unknown symbols on Intel machines. Depending on the system load I
> can see more than 30% of total symbols are not resolved (actually
> don't have DSO mappings).
>
> I found that it's only large PEBS is enabled - using call-graph or the
> frequency mode will disable it and have valid results. I've verified
> it by checking intel_pmu_pebs_sched_task() is called like below:
>
> # perf probe -a intel_pmu_pebs_sched_task
>
> # perf stat -a -e probe:intel_pmu_pebs_sched_task \
> > perf record -a -e cycles:ppp -c 100001 sleep 1
> [ perf record: Woken up 1 times to write data ]
> [ perf record: Captured and wrote 2.625 MB perf.data (10345 samples) ]
>
> Performance counter stats for 'system wide':
>
> 0 probe:intel_pmu_pebs_sched_task
>
> 2.157533991 seconds time elapsed
>
>
> Looking at the code, I found out that the pmu::sched_task callback was
> changed recently that it's called only for task events. So cpu events
> with large PEBS didn't flush the buffer and they are attributed to
> unrelated tasks later resulted in unresolved symbols.
>
> This patch reverts it and keeps the optimization for task events.
> While at it, I also found the context switch callback was not enabled
> for cpu events from the beginning. So I've added it too. With this
> applied, I can see the above callbacks are hit as expected and perf
> report has valid symbols.
>
This is a serious bug that impacts many kernel versions as soon as
multi-entry PEBS is activated by the kernel in system-wide mode.
I remember this was working in the past, so it must have been broken by
some code refactoring or optimization, or by the extension of
sched_task to other features. PEBS must be flushed on context switch in
per-cpu mode, otherwise you may attribute samples to locations that do
not belong to the process they end up being processed in. PEBS does not
tag samples with PID/TID.
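To make the attribution problem concrete, here is a toy user-space
illustration (not kernel code, and not how the driver is structured; it
only shows why records that carry no PID must be drained before the
task changes):

#include <stdio.h>

#define BUF_SZ 4

struct record { unsigned long ip; };	/* like PEBS: no PID/TID in the record */

static struct record buf[BUF_SZ];
static int nr_pending;

/* Draining attributes every pending record to whoever is current *now*. */
static void drain(int current_pid)
{
	for (int i = 0; i < nr_pending; i++)
		printf("ip=%#lx attributed to pid %d\n", buf[i].ip, current_pid);
	nr_pending = 0;
}

int main(void)
{
	/* pid 1 runs and the hardware logs two records... */
	buf[nr_pending++].ip = 0x1000;
	buf[nr_pending++].ip = 0x1004;

	/* ...context switch to pid 2, and nothing drains the buffer... */

	/* ...pid 2 logs more records, the buffer fills up and is drained: */
	buf[nr_pending++].ip = 0x2000;
	buf[nr_pending++].ip = 0x2004;
	drain(2);	/* pid 1's two records are now credited to pid 2 */

	return 0;
}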
On 11/2/2020 9:52 AM, Namhyung Kim wrote:
> The commit 44fae179ce73 ("perf/core: Pull pmu::sched_task() into
> perf_event_context_sched_out()") moved the pmu::sched_task callback to
> be called for task event context. But it missed to call it for
> per-cpu events to flush PMU internal buffers (i.e. for PEBS, ...).
>
> This commit basically reverts the commit but keeps the optimization
> for task events and only add missing calls for cpu events.
>
> Fixes: 44fae179ce73 ("perf/core: Pull pmu::sched_task() into perf_event_context_sched_out()")
> Signed-off-by: Namhyung Kim <[email protected]>
> ---
> include/linux/perf_event.h | 1 +
> kernel/events/core.c | 38 ++++++++++++++++++++++++++++++++++++--
> 2 files changed, 37 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 0defb526cd0c..abb70a557cb5 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -872,6 +872,7 @@ struct perf_cpu_context {
> struct list_head cgrp_cpuctx_entry;
> #endif
>
> + struct list_head sched_cb_entry;
> int sched_cb_usage;
>
> int online;
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index aaa0155c4142..c04d9a913537 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -383,6 +383,7 @@ static DEFINE_MUTEX(perf_sched_mutex);
> static atomic_t perf_sched_count;
>
> static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
> +static DEFINE_PER_CPU(int, perf_sched_cb_usages);
> static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
>
> static atomic_t nr_mmap_events __read_mostly;
> @@ -3480,11 +3481,16 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
> }
> }
>
> +static DEFINE_PER_CPU(struct list_head, sched_cb_list);
> +
> void perf_sched_cb_dec(struct pmu *pmu)
> {
> struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
>
> - --cpuctx->sched_cb_usage;
> + this_cpu_dec(perf_sched_cb_usages);
> +
> + if (!--cpuctx->sched_cb_usage)
> + list_del(&cpuctx->sched_cb_entry);
> }
>
>
> @@ -3492,7 +3498,10 @@ void perf_sched_cb_inc(struct pmu *pmu)
> {
> struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
>
> - cpuctx->sched_cb_usage++;
> + if (!cpuctx->sched_cb_usage++)
> + list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
> +
> + this_cpu_inc(perf_sched_cb_usages);
> }
>
> /*
> @@ -3521,6 +3530,24 @@ static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in
> perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> }
>
> +static void perf_pmu_sched_task(struct task_struct *prev,
> + struct task_struct *next,
> + bool sched_in)
> +{
> + struct perf_cpu_context *cpuctx;
> +
> + if (prev == next)
> + return;
> +
> + list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
> + /* will be handled in perf_event_context_sched_in/out */
> + if (cpuctx->task_ctx)
> + continue;
> +
> + __perf_pmu_sched_task(cpuctx, sched_in);
> + }
> +}
> +
> static void perf_event_switch(struct task_struct *task,
> struct task_struct *next_prev, bool sched_in);
>
> @@ -3543,6 +3570,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
> {
> int ctxn;
>
> + if (__this_cpu_read(perf_sched_cb_usages))
> + perf_pmu_sched_task(task, next, false);
> +
> if (atomic_read(&nr_switch_events))
> perf_event_switch(task, next, false);
>
> @@ -3850,6 +3880,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
>
> if (atomic_read(&nr_switch_events))
> perf_event_switch(task, prev, true);
> +
> + if (__this_cpu_read(perf_sched_cb_usages))
> + perf_pmu_sched_task(prev, task, true);
> }
It looks like commit ("perf/core: Pull pmu::sched_task() into
perf_event_context_sched_in()") is also reverted in the patch.
>
> static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
> @@ -12999,6 +13032,7 @@ static void __init perf_event_init_all_cpus(void)
> #ifdef CONFIG_CGROUP_PERF
> INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
> #endif
> + INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
> }
> }
>
>
Can we update perf_sched_cb_usages and sched_cb_list only for per-cpu
events, as in the patch below (not tested)?
If the user only uses per-process events, we don't need to go through
the list.
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 6586f7e71cfb..63c9b87cab5e 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -380,7 +380,7 @@ static void power_pmu_bhrb_enable(struct perf_event *event)
cpuhw->bhrb_context = event->ctx;
}
cpuhw->bhrb_users++;
- perf_sched_cb_inc(event->ctx->pmu);
+ perf_sched_cb_inc(event->ctx->pmu, !(event->attach_state & PERF_ATTACH_TASK));
}
static void power_pmu_bhrb_disable(struct perf_event *event)
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 444e5f061d04..a34b90c7fa6d 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1022,9 +1022,9 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
if (needed_cb != pebs_needs_sched_cb(cpuc)) {
if (!needed_cb)
- perf_sched_cb_inc(pmu);
+ perf_sched_cb_inc(pmu, !(event->attach_state & PERF_ATTACH_TASK));
else
- perf_sched_cb_dec(pmu);
+ perf_sched_cb_dec(pmu, !(event->attach_state & PERF_ATTACH_TASK));
update = true;
}
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 8961653c5dd2..8d4d02cde3d4 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -693,7 +693,7 @@ void intel_pmu_lbr_add(struct perf_event *event)
*/
if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0)
cpuc->lbr_pebs_users++;
- perf_sched_cb_inc(event->ctx->pmu);
+ perf_sched_cb_inc(event->ctx->pmu, !(event->attach_state & PERF_ATTACH_TASK));
if (!cpuc->lbr_users++ && !event->total_time_running)
intel_pmu_lbr_reset();
@@ -740,7 +740,7 @@ void intel_pmu_lbr_del(struct perf_event *event)
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
WARN_ON_ONCE(cpuc->lbr_pebs_users < 0);
- perf_sched_cb_dec(event->ctx->pmu);
+ perf_sched_cb_dec(event->ctx->pmu, !(event->attach_state & PERF_ATTACH_TASK));
}
static inline bool vlbr_exclude_host(void)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index abb70a557cb5..5a02239ca8fd 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -966,8 +966,8 @@ extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
extern void perf_event_print_debug(void);
extern void perf_pmu_disable(struct pmu *pmu);
extern void perf_pmu_enable(struct pmu *pmu);
-extern void perf_sched_cb_dec(struct pmu *pmu);
-extern void perf_sched_cb_inc(struct pmu *pmu);
+extern void perf_sched_cb_dec(struct pmu *pmu, bool systemwide);
+extern void perf_sched_cb_inc(struct pmu *pmu, bool systemwide);
extern int perf_event_task_disable(void);
extern int perf_event_task_enable(void);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e6b98507734a..2d7c07af02f8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3484,25 +3484,29 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
static DEFINE_PER_CPU(struct list_head, sched_cb_list);
-void perf_sched_cb_dec(struct pmu *pmu)
+void perf_sched_cb_dec(struct pmu *pmu, bool systemwide)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- this_cpu_dec(perf_sched_cb_usages);
+ --cpuctx->sched_cb_usage;
- if (!--cpuctx->sched_cb_usage)
+ if (systemwide) {
+ this_cpu_dec(perf_sched_cb_usages);
list_del(&cpuctx->sched_cb_entry);
+ }
}
-void perf_sched_cb_inc(struct pmu *pmu)
+void perf_sched_cb_inc(struct pmu *pmu, bool systemwide)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- if (!cpuctx->sched_cb_usage++)
- list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+ cpuctx->sched_cb_usage++;
- this_cpu_inc(perf_sched_cb_usages);
+ if (systemwide) {
+ this_cpu_inc(perf_sched_cb_usages);
+ list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+ }
}
/*
Thanks,
Kan
On 11/2/2020 9:52 AM, Namhyung Kim wrote:
> If an event associated with a PMU which has a sched_task callback,
> it should be called regardless of cpu/task context. For example,
I don't think it's necessary. We should call it when we have to.
Otherwise, it just wastes cycles.
Shouldn't patch 2 be enough?
Thanks,
Kan
> a per-cpu event might enable large PEBS buffers so it needs to flush
> the buffer whenever task scheduling happens. >
> The underlying PMU may or may not require this for the given event,
> but it will be handled in the pmu::sched_task() callback anyway.
>
> Signed-off-by: Namhyung Kim <[email protected]>
> ---
> kernel/events/core.c | 4 ++++
> 1 file changed, 4 insertions(+)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index b458ed3dc81b..aaa0155c4142 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -4696,6 +4696,8 @@ static void unaccount_event(struct perf_event *event)
> dec = true;
> if (has_branch_stack(event))
> dec = true;
> + if (event->pmu->sched_task)
> + dec = true;
> if (event->attr.ksymbol)
> atomic_dec(&nr_ksymbol_events);
> if (event->attr.bpf_event)
> @@ -11225,6 +11227,8 @@ static void account_event(struct perf_event *event)
> inc = true;
> if (is_cgroup_event(event))
> inc = true;
> + if (event->pmu->sched_task)
> + inc = true;
> if (event->attr.ksymbol)
> atomic_inc(&nr_ksymbol_events);
> if (event->attr.bpf_event)
>
Hello,
On Thu, Nov 5, 2020 at 11:47 PM Liang, Kan <[email protected]> wrote:
>
>
>
> On 11/2/2020 9:52 AM, Namhyung Kim wrote:
> > If an event associated with a PMU which has a sched_task callback,
> > it should be called regardless of cpu/task context. For example,
>
>
> I don't think it's necessary. We should call it when we have to.
> Otherwise, it just waste cycles.
> Shouldn't the patch 2 be enough?
I'm not sure; without this patch, __perf_event_task_sched_in/out
cannot be called for per-cpu events (w/o cgroups) IMHO.
And I could not find any other place that checks
perf_sched_cb_usages.
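To be specific, what I mean is the static-key gating of those hooks.
Roughly (a simplified sketch from memory, not a verbatim copy of
include/linux/perf_event.h):

/*
 * The context-switch hooks only run when the perf_sched_events static
 * key is enabled, and account_event() is what enables it.  A pure
 * per-cpu large-PEBS event that doesn't make account_event() set 'inc'
 * never flips the key, so the perf_sched_cb_usages check added in
 * patch 2 would never even be reached.
 */
static inline void perf_event_task_sched_out(struct task_struct *prev,
					     struct task_struct *next)
{
	if (static_branch_unlikely(&perf_sched_events))
		__perf_event_task_sched_out(prev, next);
}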
Thanks
Namhyung
>
> > a per-cpu event might enable large PEBS buffers so it needs to flush
> > the buffer whenever task scheduling happens. >
> > The underlying PMU may or may not require this for the given event,
> > but it will be handled in the pmu::sched_task() callback anyway.
> >
> > Signed-off-by: Namhyung Kim <[email protected]>
> > ---
> > kernel/events/core.c | 4 ++++
> > 1 file changed, 4 insertions(+)
> >
> > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > index b458ed3dc81b..aaa0155c4142 100644
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -4696,6 +4696,8 @@ static void unaccount_event(struct perf_event *event)
> > dec = true;
> > if (has_branch_stack(event))
> > dec = true;
> > + if (event->pmu->sched_task)
> > + dec = true;
> > if (event->attr.ksymbol)
> > atomic_dec(&nr_ksymbol_events);
> > if (event->attr.bpf_event)
> > @@ -11225,6 +11227,8 @@ static void account_event(struct perf_event *event)
> > inc = true;
> > if (is_cgroup_event(event))
> > inc = true;
> > + if (event->pmu->sched_task)
> > + inc = true;
> > if (event->attr.ksymbol)
> > atomic_inc(&nr_ksymbol_events);
> > if (event->attr.bpf_event)
> >
On Thu, Nov 5, 2020 at 11:48 PM Liang, Kan <[email protected]> wrote:
>
>
>
> On 11/2/2020 9:52 AM, Namhyung Kim wrote:
> > The commit 44fae179ce73 ("perf/core: Pull pmu::sched_task() into
> > perf_event_context_sched_out()") moved the pmu::sched_task callback to
> > be called for task event context. But it missed to call it for
> > per-cpu events to flush PMU internal buffers (i.e. for PEBS, ...).
> >
> > This commit basically reverts the commit but keeps the optimization
> > for task events and only add missing calls for cpu events.
> >
> > Fixes: 44fae179ce73 ("perf/core: Pull pmu::sched_task() into perf_event_context_sched_out()")
> > Signed-off-by: Namhyung Kim <[email protected]>
> > ---
[SNIP]
> > @@ -3850,6 +3880,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
> >
> > if (atomic_read(&nr_switch_events))
> > perf_event_switch(task, prev, true);
> > +
> > + if (__this_cpu_read(perf_sched_cb_usages))
> > + perf_pmu_sched_task(prev, task, true);
> > }
>
> It looks like the ("perf/core: Pull pmu::sched_task() into
> perf_event_context_sched_in()") is also reverted in the patch.
Ah, right..
>
> >
> > static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
> > @@ -12999,6 +13032,7 @@ static void __init perf_event_init_all_cpus(void)
> > #ifdef CONFIG_CGROUP_PERF
> > INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
> > #endif
> > + INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
> > }
> > }
> >
> >
>
> Can we only update the perf_sched_cb_usages and sched_cb_list for
> per-cpu event as below patch (not tested)?
>
> If user only uses the per-process event, we don't need to go through the
> list.
Hmm... ok.
>
>
> diff --git a/arch/powerpc/perf/core-book3s.c
> b/arch/powerpc/perf/core-book3s.c
> index 6586f7e71cfb..63c9b87cab5e 100644
> --- a/arch/powerpc/perf/core-book3s.c
> +++ b/arch/powerpc/perf/core-book3s.c
> @@ -380,7 +380,7 @@ static void power_pmu_bhrb_enable(struct perf_event
> *event)
> cpuhw->bhrb_context = event->ctx;
> }
> cpuhw->bhrb_users++;
> - perf_sched_cb_inc(event->ctx->pmu);
> + perf_sched_cb_inc(event->ctx->pmu, !(event->attach_state &
> PERF_ATTACH_TASK));
> }
>
> static void power_pmu_bhrb_disable(struct perf_event *event)
> diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
> index 444e5f061d04..a34b90c7fa6d 100644
> --- a/arch/x86/events/intel/ds.c
> +++ b/arch/x86/events/intel/ds.c
> @@ -1022,9 +1022,9 @@ pebs_update_state(bool needed_cb, struct
> cpu_hw_events *cpuc,
>
> if (needed_cb != pebs_needs_sched_cb(cpuc)) {
> if (!needed_cb)
> - perf_sched_cb_inc(pmu);
> + perf_sched_cb_inc(pmu, !(event->attach_state & PERF_ATTACH_TASK));
> else
> - perf_sched_cb_dec(pmu);
> + perf_sched_cb_dec(pmu, !(event->attach_state & PERF_ATTACH_TASK));
>
> update = true;
> }
> diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
> index 8961653c5dd2..8d4d02cde3d4 100644
> --- a/arch/x86/events/intel/lbr.c
> +++ b/arch/x86/events/intel/lbr.c
> @@ -693,7 +693,7 @@ void intel_pmu_lbr_add(struct perf_event *event)
> */
> if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0)
> cpuc->lbr_pebs_users++;
> - perf_sched_cb_inc(event->ctx->pmu);
> + perf_sched_cb_inc(event->ctx->pmu, !(event->attach_state &
> PERF_ATTACH_TASK));
> if (!cpuc->lbr_users++ && !event->total_time_running)
> intel_pmu_lbr_reset();
>
> @@ -740,7 +740,7 @@ void intel_pmu_lbr_del(struct perf_event *event)
> cpuc->lbr_users--;
> WARN_ON_ONCE(cpuc->lbr_users < 0);
> WARN_ON_ONCE(cpuc->lbr_pebs_users < 0);
> - perf_sched_cb_dec(event->ctx->pmu);
> + perf_sched_cb_dec(event->ctx->pmu, !(event->attach_state &
> PERF_ATTACH_TASK));
> }
>
> static inline bool vlbr_exclude_host(void)
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index abb70a557cb5..5a02239ca8fd 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -966,8 +966,8 @@ extern const struct perf_event_attr
> *perf_event_attrs(struct perf_event *event);
> extern void perf_event_print_debug(void);
> extern void perf_pmu_disable(struct pmu *pmu);
> extern void perf_pmu_enable(struct pmu *pmu);
> -extern void perf_sched_cb_dec(struct pmu *pmu);
> -extern void perf_sched_cb_inc(struct pmu *pmu);
> +extern void perf_sched_cb_dec(struct pmu *pmu, bool systemwide);
> +extern void perf_sched_cb_inc(struct pmu *pmu, bool systemwide);
> extern int perf_event_task_disable(void);
> extern int perf_event_task_enable(void);
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index e6b98507734a..2d7c07af02f8 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -3484,25 +3484,29 @@ static void perf_event_context_sched_out(struct
> task_struct *task, int ctxn,
>
> static DEFINE_PER_CPU(struct list_head, sched_cb_list);
>
> -void perf_sched_cb_dec(struct pmu *pmu)
> +void perf_sched_cb_dec(struct pmu *pmu, bool systemwide)
> {
> struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
>
> - this_cpu_dec(perf_sched_cb_usages);
> + --cpuctx->sched_cb_usage;
>
> - if (!--cpuctx->sched_cb_usage)
> + if (systemwide) {
> + this_cpu_dec(perf_sched_cb_usages);
> list_del(&cpuctx->sched_cb_entry);
> + }
> }
>
>
> -void perf_sched_cb_inc(struct pmu *pmu)
> +void perf_sched_cb_inc(struct pmu *pmu, bool systemwide)
> {
> struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
>
> - if (!cpuctx->sched_cb_usage++)
> - list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
> + cpuctx->sched_cb_usage++;
>
> - this_cpu_inc(perf_sched_cb_usages);
> + if (systemwide) {
> + this_cpu_inc(perf_sched_cb_usages);
> + list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
You need to check the value and make sure it's added only once.
Thanks
Namhyung
> + }
> }
>
> /*
>
> Thanks,
> Kan
On 11/5/2020 10:45 AM, Namhyung Kim wrote:
> Hello,
>
> On Thu, Nov 5, 2020 at 11:47 PM Liang, Kan <[email protected]> wrote:
>>
>>
>>
>> On 11/2/2020 9:52 AM, Namhyung Kim wrote:
>>> If an event associated with a PMU which has a sched_task callback,
>>> it should be called regardless of cpu/task context. For example,
>>
>>
>> I don't think it's necessary. We should call it when we have to.
>> Otherwise, it just waste cycles.
>> Shouldn't the patch 2 be enough?
>
> I'm not sure, without this patch __perf_event_task_sched_in/out
> cannot be called for per-cpu events (w/o cgroups) IMHO.
> And I could not find any other place to check the
> perf_sched_cb_usages.
>
Yes, it is a bug for large PEBS, and it has always been there since
large PEBS was introduced. I just tried some older kernels (before the
recent change). Large PEBS is not flushed with per-cpu events.
But from your description, it looks like the issue was only found after
the recent change. Could you please double check whether the issue can
also be reproduced before the recent change?
>>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>>> index b458ed3dc81b..aaa0155c4142 100644
>>> --- a/kernel/events/core.c
>>> +++ b/kernel/events/core.c
>>> @@ -4696,6 +4696,8 @@ static void unaccount_event(struct perf_event *event)
>>> dec = true;
>>> if (has_branch_stack(event))
>>> dec = true;
>>> + if (event->pmu->sched_task)
>>> + dec = true;
I think sched_task is too big a hammer. With it,
__perf_event_task_sched_in/out will be invoked even for non-PEBS
per-cpu events, which is not necessary.
Maybe we can introduce a flag to indicate the case. How about the patch
below?
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index c79748f6921d..953a4bb98330 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3565,8 +3565,10 @@ static int intel_pmu_hw_config(struct perf_event *event)
if (!(event->attr.freq || (event->attr.wakeup_events &&
!event->attr.watermark))) {
event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
if (!(event->attr.sample_type &
- ~intel_pmu_large_pebs_flags(event)))
+ ~intel_pmu_large_pebs_flags(event))) {
event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS;
+ event->attach_state |= PERF_ATTACH_SCHED_DATA;
+ }
}
if (x86_pmu.pebs_aliases)
x86_pmu.pebs_aliases(event);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0defb526cd0c..3eef7142aa11 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -606,6 +606,7 @@ struct swevent_hlist {
#define PERF_ATTACH_TASK 0x04
#define PERF_ATTACH_TASK_DATA 0x08
#define PERF_ATTACH_ITRACE 0x10
+#define PERF_ATTACH_SCHED_DATA 0x20
struct perf_cgroup;
struct perf_buffer;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index dba4ea4e648b..a38133b5543a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4675,7 +4675,7 @@ static void unaccount_event(struct perf_event *event)
if (event->parent)
return;
- if (event->attach_state & PERF_ATTACH_TASK)
+ if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_DATA))
dec = true;
if (event->attr.mmap || event->attr.mmap_data)
atomic_dec(&nr_mmap_events);
@@ -11204,7 +11204,7 @@ static void account_event(struct perf_event *event)
if (event->parent)
return;
- if (event->attach_state & PERF_ATTACH_TASK)
+ if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_DATA))
inc = true;
if (event->attr.mmap || event->attr.mmap_data)
atomic_inc(&nr_mmap_events);
Thanks,
Kan
On 11/5/2020 10:54 AM, Namhyung Kim wrote:
>> -void perf_sched_cb_inc(struct pmu *pmu)
>> +void perf_sched_cb_inc(struct pmu *pmu, bool systemwide)
>> {
>> struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
>>
>> - if (!cpuctx->sched_cb_usage++)
>> - list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
>> + cpuctx->sched_cb_usage++;
>>
>> - this_cpu_inc(perf_sched_cb_usages);
>> + if (systemwide) {
>> + this_cpu_inc(perf_sched_cb_usages);
>> + list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
> You need to check the value and make sure it's added only once.
Right, maybe we have to add a new variable for that.
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 6586f7e71cfb..63c9b87cab5e 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -380,7 +380,7 @@ static void power_pmu_bhrb_enable(struct perf_event *event)
cpuhw->bhrb_context = event->ctx;
}
cpuhw->bhrb_users++;
- perf_sched_cb_inc(event->ctx->pmu);
+ perf_sched_cb_inc(event->ctx->pmu, !(event->attach_state & PERF_ATTACH_TASK));
}
static void power_pmu_bhrb_disable(struct perf_event *event)
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 444e5f061d04..a34b90c7fa6d 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1022,9 +1022,9 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
if (needed_cb != pebs_needs_sched_cb(cpuc)) {
if (!needed_cb)
- perf_sched_cb_inc(pmu);
+ perf_sched_cb_inc(pmu, !(event->attach_state & PERF_ATTACH_TASK));
else
- perf_sched_cb_dec(pmu);
+ perf_sched_cb_dec(pmu, !(event->attach_state & PERF_ATTACH_TASK));
update = true;
}
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 8961653c5dd2..8d4d02cde3d4 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -693,7 +693,7 @@ void intel_pmu_lbr_add(struct perf_event *event)
*/
if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0)
cpuc->lbr_pebs_users++;
- perf_sched_cb_inc(event->ctx->pmu);
+ perf_sched_cb_inc(event->ctx->pmu, !(event->attach_state & PERF_ATTACH_TASK));
if (!cpuc->lbr_users++ && !event->total_time_running)
intel_pmu_lbr_reset();
@@ -740,7 +740,7 @@ void intel_pmu_lbr_del(struct perf_event *event)
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
WARN_ON_ONCE(cpuc->lbr_pebs_users < 0);
- perf_sched_cb_dec(event->ctx->pmu);
+ perf_sched_cb_dec(event->ctx->pmu, !(event->attach_state & PERF_ATTACH_TASK));
}
static inline bool vlbr_exclude_host(void)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index a1b91f2de264..14f936385cc8 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -875,6 +875,7 @@ struct perf_cpu_context {
struct list_head sched_cb_entry;
int sched_cb_usage;
+ int sched_cb_sw_usage;
int online;
/*
@@ -967,8 +968,8 @@ extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
extern void perf_event_print_debug(void);
extern void perf_pmu_disable(struct pmu *pmu);
extern void perf_pmu_enable(struct pmu *pmu);
-extern void perf_sched_cb_dec(struct pmu *pmu);
-extern void perf_sched_cb_inc(struct pmu *pmu);
+extern void perf_sched_cb_dec(struct pmu *pmu, bool systemwide);
+extern void perf_sched_cb_inc(struct pmu *pmu, bool systemwide);
extern int perf_event_task_disable(void);
extern int perf_event_task_enable(void);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 66a9bd71f3da..af75859c9138 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3484,22 +3484,32 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
static DEFINE_PER_CPU(struct list_head, sched_cb_list);
-void perf_sched_cb_dec(struct pmu *pmu)
+void perf_sched_cb_dec(struct pmu *pmu, bool systemwide)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ --cpuctx->sched_cb_usage;
+
+ if (!systemwide)
+ return;
+
this_cpu_dec(perf_sched_cb_usages);
- if (!--cpuctx->sched_cb_usage)
+ if (!--cpuctx->sched_cb_sw_usage)
list_del(&cpuctx->sched_cb_entry);
}
-void perf_sched_cb_inc(struct pmu *pmu)
+void perf_sched_cb_inc(struct pmu *pmu, bool systemwide)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- if (!cpuctx->sched_cb_usage++)
+ cpuctx->sched_cb_usage++;
+
+ if (!systemwide)
+ return;
+
+ if (!cpuctx->sched_cb_sw_usage++)
list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
this_cpu_inc(perf_sched_cb_usages);
Thanks,
Kan
On Thu, Nov 5, 2020 at 11:40 AM Liang, Kan <[email protected]> wrote:
>
>
>
> On 11/5/2020 10:54 AM, Namhyung Kim wrote:
> >> -void perf_sched_cb_inc(struct pmu *pmu)
> >> +void perf_sched_cb_inc(struct pmu *pmu, bool systemwide)
> >> {
> >> struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
> >>
> >> - if (!cpuctx->sched_cb_usage++)
> >> - list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
> >> + cpuctx->sched_cb_usage++;
> >>
> >> - this_cpu_inc(perf_sched_cb_usages);
> >> + if (systemwide) {
> >> + this_cpu_inc(perf_sched_cb_usages);
> >> + list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
> > You need to check the value and make sure it's added only once.
>
> Right, maybe we have to add a new variable for that.
>
Sure, I tend to agree here that we need a narrower-scope trigger, only
when needed, i.e., an event or PMU feature that requires ctxsw work. In
fact, I am also interested in splitting the ctxswin and ctxswout
callbacks. The reason is that you have overhead in the way the callback
is invoked. You may end up calling sched_task on ctxswout when only
ctxswin is needed. In doing that you pay the cost of stopping/starting
the PMU for possibly nothing. Stopping the PMU can be expensive, like
on AMD where you need multiple wrmsr.
So splitting, or adding a flag to convey that either CTXSW_IN or
CTXSW_OUT is needed, would help. I am suggesting this now given you are
adding a flag.
>
> diff --git a/arch/powerpc/perf/core-book3s.c
> b/arch/powerpc/perf/core-book3s.c
> index 6586f7e71cfb..63c9b87cab5e 100644
> --- a/arch/powerpc/perf/core-book3s.c
> +++ b/arch/powerpc/perf/core-book3s.c
> @@ -380,7 +380,7 @@ static void power_pmu_bhrb_enable(struct perf_event
> *event)
> cpuhw->bhrb_context = event->ctx;
> }
> cpuhw->bhrb_users++;
> - perf_sched_cb_inc(event->ctx->pmu);
> + perf_sched_cb_inc(event->ctx->pmu, !(event->attach_state &
> PERF_ATTACH_TASK));
> }
>
> static void power_pmu_bhrb_disable(struct perf_event *event)
> diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
> index 444e5f061d04..a34b90c7fa6d 100644
> --- a/arch/x86/events/intel/ds.c
> +++ b/arch/x86/events/intel/ds.c
> @@ -1022,9 +1022,9 @@ pebs_update_state(bool needed_cb, struct
> cpu_hw_events *cpuc,
>
> if (needed_cb != pebs_needs_sched_cb(cpuc)) {
> if (!needed_cb)
> - perf_sched_cb_inc(pmu);
> + perf_sched_cb_inc(pmu, !(event->attach_state & PERF_ATTACH_TASK));
> else
> - perf_sched_cb_dec(pmu);
> + perf_sched_cb_dec(pmu, !(event->attach_state & PERF_ATTACH_TASK));
>
> update = true;
> }
> diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
> index 8961653c5dd2..8d4d02cde3d4 100644
> --- a/arch/x86/events/intel/lbr.c
> +++ b/arch/x86/events/intel/lbr.c
> @@ -693,7 +693,7 @@ void intel_pmu_lbr_add(struct perf_event *event)
> */
> if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0)
> cpuc->lbr_pebs_users++;
> - perf_sched_cb_inc(event->ctx->pmu);
> + perf_sched_cb_inc(event->ctx->pmu, !(event->attach_state &
> PERF_ATTACH_TASK));
> if (!cpuc->lbr_users++ && !event->total_time_running)
> intel_pmu_lbr_reset();
>
> @@ -740,7 +740,7 @@ void intel_pmu_lbr_del(struct perf_event *event)
> cpuc->lbr_users--;
> WARN_ON_ONCE(cpuc->lbr_users < 0);
> WARN_ON_ONCE(cpuc->lbr_pebs_users < 0);
> - perf_sched_cb_dec(event->ctx->pmu);
> + perf_sched_cb_dec(event->ctx->pmu, !(event->attach_state &
> PERF_ATTACH_TASK));
> }
>
> static inline bool vlbr_exclude_host(void)
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index a1b91f2de264..14f936385cc8 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -875,6 +875,7 @@ struct perf_cpu_context {
>
> struct list_head sched_cb_entry;
> int sched_cb_usage;
> + int sched_cb_sw_usage;
>
> int online;
> /*
> @@ -967,8 +968,8 @@ extern const struct perf_event_attr
> *perf_event_attrs(struct perf_event *event);
> extern void perf_event_print_debug(void);
> extern void perf_pmu_disable(struct pmu *pmu);
> extern void perf_pmu_enable(struct pmu *pmu);
> -extern void perf_sched_cb_dec(struct pmu *pmu);
> -extern void perf_sched_cb_inc(struct pmu *pmu);
> +extern void perf_sched_cb_dec(struct pmu *pmu, bool systemwide);
> +extern void perf_sched_cb_inc(struct pmu *pmu, bool systemwide);
> extern int perf_event_task_disable(void);
> extern int perf_event_task_enable(void);
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 66a9bd71f3da..af75859c9138 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -3484,22 +3484,32 @@ static void perf_event_context_sched_out(struct
> task_struct *task, int ctxn,
>
> static DEFINE_PER_CPU(struct list_head, sched_cb_list);
>
> -void perf_sched_cb_dec(struct pmu *pmu)
> +void perf_sched_cb_dec(struct pmu *pmu, bool systemwide)
> {
> struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
>
> + --cpuctx->sched_cb_usage;
> +
> + if (!systemwide)
> + return;
> +
> this_cpu_dec(perf_sched_cb_usages);
>
> - if (!--cpuctx->sched_cb_usage)
> + if (!--cpuctx->sched_cb_sw_usage)
> list_del(&cpuctx->sched_cb_entry);
> }
>
>
> -void perf_sched_cb_inc(struct pmu *pmu)
> +void perf_sched_cb_inc(struct pmu *pmu, bool systemwide)
> {
> struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
>
> - if (!cpuctx->sched_cb_usage++)
> + cpuctx->sched_cb_usage++;
> +
> + if (!systemwide)
> + return;
> +
> + if (!cpuctx->sched_cb_sw_usage++)
> list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
>
> this_cpu_inc(perf_sched_cb_usages);
>
>
>
> Thanks,
> Kan
On 11/5/2020 4:15 PM, Stephane Eranian wrote:
> On Thu, Nov 5, 2020 at 11:40 AM Liang, Kan <[email protected]> wrote:
>>
>>
>>
>> On 11/5/2020 10:54 AM, Namhyung Kim wrote:
>>>> -void perf_sched_cb_inc(struct pmu *pmu)
>>>> +void perf_sched_cb_inc(struct pmu *pmu, bool systemwide)
>>>> {
>>>> struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
>>>>
>>>> - if (!cpuctx->sched_cb_usage++)
>>>> - list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
>>>> + cpuctx->sched_cb_usage++;
>>>>
>>>> - this_cpu_inc(perf_sched_cb_usages);
>>>> + if (systemwide) {
>>>> + this_cpu_inc(perf_sched_cb_usages);
>>>> + list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
>>> You need to check the value and make sure it's added only once.
>>
>> Right, maybe we have to add a new variable for that.
>>
> Sure, I tend to agree here that we need a narrower scope trigger, only
> when needed, i.e., an event
> or PMU feature that requires ctxsw work. In fact, I am also interested
> in splitting ctxswin and ctswout
> callbacks. The reason is that you have overhead in the way the
> callback is invoked. You may end up
> calling the sched_task on ctxswout when only ctxwin is needed. In
> doing that you pay the cost of
> stopping/starting the PMU for possibly nothing. Stopping the PMU can
> be expensive, like on AMD
> where you need multiple wrmsr.
>
> So splitting or adding a flag to convey that either CTXSW_IN or
> CTXSW_OUT is needed would help.
> I am suggesting this now given you are adding a flag.
>
Yes, adding a new flag would avoid the unnecessary PMU stop/restart
for both per-cpu and per-process events.
How about the patch below?
(I just did a simple test. Another patch would still be needed to split
the callback.)
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 6586f7e71cfb..e5837fdf82e3 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -370,6 +370,7 @@ static void power_pmu_bhrb_reset(void)
static void power_pmu_bhrb_enable(struct perf_event *event)
{
struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events);
+ int state = PERF_SCHED_CB_SW_IN;
if (!ppmu->bhrb_nr)
return;
@@ -380,7 +381,11 @@ static void power_pmu_bhrb_enable(struct perf_event *event)
cpuhw->bhrb_context = event->ctx;
}
cpuhw->bhrb_users++;
- perf_sched_cb_inc(event->ctx->pmu);
+
+ if (!(event->attach_state & PERF_ATTACH_TASK))
+ state |= PERF_SCHED_CB_CPU;
+
+ perf_sched_cb_inc(event->ctx->pmu, state);
}
static void power_pmu_bhrb_disable(struct perf_event *event)
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 444e5f061d04..f4f9d737ca85 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1019,12 +1019,16 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
* that does not hurt:
*/
bool update = cpuc->n_pebs == 1;
+ int state = PERF_SCHED_CB_SW_OUT;
if (needed_cb != pebs_needs_sched_cb(cpuc)) {
+ if (!(event->attach_state & PERF_ATTACH_TASK))
+ state |= PERF_SCHED_CB_CPU;
+
if (!needed_cb)
- perf_sched_cb_inc(pmu);
+ perf_sched_cb_inc(pmu, state);
else
- perf_sched_cb_dec(pmu);
+ perf_sched_cb_dec(pmu, state);
update = true;
}
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 8961653c5dd2..e4c500580df5 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -660,6 +660,7 @@ void intel_pmu_lbr_add(struct perf_event *event)
{
struct kmem_cache *kmem_cache = event->pmu->task_ctx_cache;
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int state = PERF_SCHED_CB_SW_IN;
if (!x86_pmu.lbr_nr)
return;
@@ -693,7 +694,13 @@ void intel_pmu_lbr_add(struct perf_event *event)
*/
if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0)
cpuc->lbr_pebs_users++;
- perf_sched_cb_inc(event->ctx->pmu);
+
+ if (!(event->attach_state & PERF_ATTACH_TASK))
+ state |= PERF_SCHED_CB_CPU;
+ if (event->attach_state & PERF_ATTACH_TASK_DATA)
+ state |= PERF_SCHED_CB_SW_OUT;
+
+ perf_sched_cb_inc(event->ctx->pmu, state);
if (!cpuc->lbr_users++ && !event->total_time_running)
intel_pmu_lbr_reset();
@@ -724,6 +731,7 @@ void release_lbr_buffers(void)
void intel_pmu_lbr_del(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int state = 0;
if (!x86_pmu.lbr_nr)
return;
@@ -740,7 +748,10 @@ void intel_pmu_lbr_del(struct perf_event *event)
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
WARN_ON_ONCE(cpuc->lbr_pebs_users < 0);
- perf_sched_cb_dec(event->ctx->pmu);
+
+ if (!(event->attach_state & PERF_ATTACH_TASK))
+ state |= PERF_SCHED_CB_CPU;
+ perf_sched_cb_dec(event->ctx->pmu, state);
}
static inline bool vlbr_exclude_host(void)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index a1b91f2de264..1556fdbfc615 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -854,6 +854,10 @@ struct perf_event_context {
*/
#define PERF_NR_CONTEXTS 4
+#define PERF_SCHED_CB_CPU 0x1
+#define PERF_SCHED_CB_SW_IN 0x2
+#define PERF_SCHED_CB_SW_OUT 0x4
+
/**
* struct perf_event_cpu_context - per cpu event context structure
*/
@@ -875,6 +879,8 @@ struct perf_cpu_context {
struct list_head sched_cb_entry;
int sched_cb_usage;
+ int sched_cb_cpu_usage;
+ int sched_cb_state;
int online;
/*
@@ -967,8 +973,8 @@ extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
extern void perf_event_print_debug(void);
extern void perf_pmu_disable(struct pmu *pmu);
extern void perf_pmu_enable(struct pmu *pmu);
-extern void perf_sched_cb_dec(struct pmu *pmu);
-extern void perf_sched_cb_inc(struct pmu *pmu);
+extern void perf_sched_cb_dec(struct pmu *pmu, int state);
+extern void perf_sched_cb_inc(struct pmu *pmu, int state);
extern int perf_event_task_disable(void);
extern int perf_event_task_enable(void);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 66a9bd71f3da..3a9d017a75b6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3384,6 +3384,34 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
}
}
+static inline bool perf_pmu_sched_in_ctx(struct perf_cpu_context *cpuctx)
+{
+ return !!(cpuctx->sched_cb_state & PERF_SCHED_CB_SW_IN);
+}
+
+static inline bool perf_pmu_sched_out_ctx(struct perf_cpu_context *cpuctx)
+{
+ return !!(cpuctx->sched_cb_state & PERF_SCHED_CB_SW_OUT);
+}
+
+static inline bool perf_pmu_has_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
+{
+ struct pmu *pmu;
+
+ pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
+
+ if (!pmu->sched_task)
+ return false;
+
+ if (sched_in && perf_pmu_sched_in_ctx(cpuctx))
+ return true;
+
+ if (!sched_in && perf_pmu_sched_out_ctx(cpuctx))
+ return true;
+
+ return false;
+}
+
static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
struct task_struct *next)
{
@@ -3433,7 +3461,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
perf_pmu_disable(pmu);
- if (cpuctx->sched_cb_usage && pmu->sched_task)
+ if (cpuctx->sched_cb_usage && perf_pmu_has_sched_task(cpuctx, false))
pmu->sched_task(ctx, false);
/*
@@ -3473,7 +3501,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_lock(&ctx->lock);
perf_pmu_disable(pmu);
- if (cpuctx->sched_cb_usage && pmu->sched_task)
+ if (cpuctx->sched_cb_usage && perf_pmu_has_sched_task(cpuctx, false))
pmu->sched_task(ctx, false);
task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
@@ -3484,22 +3512,35 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
static DEFINE_PER_CPU(struct list_head, sched_cb_list);
-void perf_sched_cb_dec(struct pmu *pmu)
+void perf_sched_cb_dec(struct pmu *pmu, int state)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ if (!--cpuctx->sched_cb_usage)
+ cpuctx->sched_cb_state = 0;
+
+ if (!(state & PERF_SCHED_CB_CPU))
+ return;
+
this_cpu_dec(perf_sched_cb_usages);
- if (!--cpuctx->sched_cb_usage)
+ if (!--cpuctx->sched_cb_cpu_usage)
list_del(&cpuctx->sched_cb_entry);
}
-void perf_sched_cb_inc(struct pmu *pmu)
+void perf_sched_cb_inc(struct pmu *pmu, int state)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- if (!cpuctx->sched_cb_usage++)
+ cpuctx->sched_cb_usage++;
+
+ cpuctx->sched_cb_state |= state;
+
+ if (!(state & PERF_SCHED_CB_CPU))
+ return;
+
+ if (!cpuctx->sched_cb_cpu_usage++)
list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
this_cpu_inc(perf_sched_cb_usages);
@@ -3544,8 +3585,8 @@ static void perf_pmu_sched_task(struct task_struct *prev,
/* will be handled in perf_event_context_sched_in/out */
if (cpuctx->task_ctx)
continue;
-
- __perf_pmu_sched_task(cpuctx, sched_in);
+ if (perf_pmu_has_sched_task(cpuctx, sched_in))
+ __perf_pmu_sched_task(cpuctx, sched_in);
}
}
@@ -3809,7 +3850,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
cpuctx = __get_cpu_context(ctx);
if (cpuctx->task_ctx == ctx) {
- if (cpuctx->sched_cb_usage)
+ if (cpuctx->sched_cb_usage &&
+ (cpuctx->sched_cb_state & PERF_SCHED_CB_SW_IN))
__perf_pmu_sched_task(cpuctx, true);
return;
}
@@ -3835,7 +3877,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
perf_event_sched_in(cpuctx, ctx, task);
- if (cpuctx->sched_cb_usage && pmu->sched_task)
+ if (cpuctx->sched_cb_usage && perf_pmu_has_sched_task(cpuctx, true))
pmu->sched_task(cpuctx->task_ctx, true);
perf_pmu_enable(pmu);
Thanks,
Kan
On Fri, Nov 6, 2020 at 4:01 AM Liang, Kan <[email protected]> wrote:
>
>
>
> On 11/5/2020 10:45 AM, Namhyung Kim wrote:
> > Hello,
> >
> > On Thu, Nov 5, 2020 at 11:47 PM Liang, Kan <[email protected]> wrote:
> >>
> >>
> >>
> >> On 11/2/2020 9:52 AM, Namhyung Kim wrote:
> >>> If an event associated with a PMU which has a sched_task callback,
> >>> it should be called regardless of cpu/task context. For example,
> >>
> >>
> >> I don't think it's necessary. We should call it when we have to.
> >> Otherwise, it just waste cycles.
> >> Shouldn't the patch 2 be enough?
> >
> > I'm not sure, without this patch __perf_event_task_sched_in/out
> > cannot be called for per-cpu events (w/o cgroups) IMHO.
> > And I could not find any other place to check the
> > perf_sched_cb_usages.
> >
>
> Yes, it should a bug for large PEBS, and it should has always been there
> since the large PEBS was introduced. I just tried some older kernels
> (before recent change). Large PEBS is not flushed with per-cpu events.
>
> But from your description, it looks like the issue is only found after
> recent change. Could you please double check if the issue can also be
> reproduced before the recent change?
Yep, actually Gabriel reported this problem on v4.4 kernel.
I'm sorry that my description was misleading.
>
>
> >>> diff --git a/kernel/events/core.c b/kernel/events/core.c
> >>> index b458ed3dc81b..aaa0155c4142 100644
> >>> --- a/kernel/events/core.c
> >>> +++ b/kernel/events/core.c
> >>> @@ -4696,6 +4696,8 @@ static void unaccount_event(struct perf_event *event)
> >>> dec = true;
> >>> if (has_branch_stack(event))
> >>> dec = true;
> >>> + if (event->pmu->sched_task)
> >>> + dec = true;
>
> I think sched_task is a too big hammer. The
> __perf_event_task_sched_in/out will be invoked even for non-pebs per-cpu
> events, which is not necessary.
Agreed.
>
> Maybe we can introduce a flag to indicate the case. How about the patch
> as below?
LGTM, but I'd prefer the name PERF_ATTACH_SCHED_CB. :)
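Something like below, i.e. the same accounting change as in your patch
with only the flag renamed (just a sketch, untested):

#define PERF_ATTACH_SCHED_CB	0x20	/* name suggestion only; same value/use as in your patch */

	/* in account_event(), with the mirror check in unaccount_event(): */
	if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
		inc = true;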
Thanks
Namhyung
>
> diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
> index c79748f6921d..953a4bb98330 100644
> --- a/arch/x86/events/intel/core.c
> +++ b/arch/x86/events/intel/core.c
> @@ -3565,8 +3565,10 @@ static int intel_pmu_hw_config(struct perf_event
> *event)
> if (!(event->attr.freq || (event->attr.wakeup_events &&
> !event->attr.watermark))) {
> event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
> if (!(event->attr.sample_type &
> - ~intel_pmu_large_pebs_flags(event)))
> + ~intel_pmu_large_pebs_flags(event))) {
> event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS;
> + event->attach_state |= PERF_ATTACH_SCHED_DATA;
> + }
> }
> if (x86_pmu.pebs_aliases)
> x86_pmu.pebs_aliases(event);
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 0defb526cd0c..3eef7142aa11 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -606,6 +606,7 @@ struct swevent_hlist {
> #define PERF_ATTACH_TASK 0x04
> #define PERF_ATTACH_TASK_DATA 0x08
> #define PERF_ATTACH_ITRACE 0x10
> +#define PERF_ATTACH_SCHED_DATA 0x20
>
> struct perf_cgroup;
> struct perf_buffer;
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index dba4ea4e648b..a38133b5543a 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -4675,7 +4675,7 @@ static void unaccount_event(struct perf_event *event)
> if (event->parent)
> return;
>
> - if (event->attach_state & PERF_ATTACH_TASK)
> + if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_DATA))
> dec = true;
> if (event->attr.mmap || event->attr.mmap_data)
> atomic_dec(&nr_mmap_events);
> @@ -11204,7 +11204,7 @@ static void account_event(struct perf_event *event)
> if (event->parent)
> return;
>
> - if (event->attach_state & PERF_ATTACH_TASK)
> + if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_DATA))
> inc = true;
> if (event->attr.mmap || event->attr.mmap_data)
> atomic_inc(&nr_mmap_events);
>
> Thanks,
> Kan
On 11/5/2020 7:53 PM, Namhyung Kim wrote:
> On Fri, Nov 6, 2020 at 4:01 AM Liang, Kan<[email protected]> wrote:
>>
>>
>> On 11/5/2020 10:45 AM, Namhyung Kim wrote:
>>> Hello,
>>>
>>> On Thu, Nov 5, 2020 at 11:47 PM Liang, Kan<[email protected]> wrote:
>>>>
>>>>
>>>> On 11/2/2020 9:52 AM, Namhyung Kim wrote:
>>>>> If an event associated with a PMU which has a sched_task callback,
>>>>> it should be called regardless of cpu/task context. For example,
>>>>
>>>> I don't think it's necessary. We should call it when we have to.
>>>> Otherwise, it just waste cycles.
>>>> Shouldn't the patch 2 be enough?
>>> I'm not sure, without this patch __perf_event_task_sched_in/out
>>> cannot be called for per-cpu events (w/o cgroups) IMHO.
>>> And I could not find any other place to check the
>>> perf_sched_cb_usages.
>>>
>> Yes, it should a bug for large PEBS, and it should has always been there
>> since the large PEBS was introduced. I just tried some older kernels
>> (before recent change). Large PEBS is not flushed with per-cpu events.
>>
>> But from your description, it looks like the issue is only found after
>> recent change. Could you please double check if the issue can also be
>> reproduced before the recent change?
> Yep, actually Gabriel reported this problem on v4.4 kernel.
Thanks for confirming.
So large PEBS has never worked with per-cpu events. :(
I will send a new patch set to address the issue.
Thanks,
Kan