2024-03-29 16:37:42

by Andrii Nakryiko

[permalink] [raw]
Subject: [PATCH v3 1/2] perf/x86/amd: support capturing LBR from software events

[0] added ability to capture LBR (Last Branch Records) on Intel CPUs
from inside BPF program at pretty much any arbitrary point. This is
extremely useful capability that allows to figure out otherwise
hard-to-debug problems, because LBR is now available based on some
application-defined conditions, not just hardware-supported events.

retsnoop ([1]) is one such tool that takes a huge advantage of this
functionality and has proved to be an extremely useful tool in
practice.

Now, AMD Zen4 CPUs got support for similar LBR functionality, but
necessary wiring inside the kernel is not yet setup. This patch seeks to
rectify this and follows a similar approach to the original patch [0]
for Intel CPUs.

Given LBR can be set up to capture any indirect jumps, it's critical to
minimize indirect jumps on the way to requesting LBR from BPF program,
so we split amd_pmu_lbr_disable_all() into a wrapper with some generic
conditions vs always-inlined __amd_pmu_lbr_disable() called directly
from BPF subsystem (through perf_snapshot_branch_stack static call).

This was tested on AMD Bergamo CPU and worked well when utilized from
the aforementioned retsnoop tool.

[0] https://lore.kernel.org/bpf/[email protected]/
[1] https://github.com/anakryiko/retsnoop

Reviewed-by: Sandipan Das <[email protected]>
Signed-off-by: Andrii Nakryiko <[email protected]>
---
arch/x86/events/amd/core.c | 29 ++++++++++++++++++++++++++++-
arch/x86/events/amd/lbr.c | 7 +------
arch/x86/events/perf_event.h | 11 +++++++++++
3 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index aec16e581f5b..88f6d0701342 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -618,7 +618,7 @@ static void amd_pmu_cpu_dead(int cpu)
}
}

-static inline void amd_pmu_set_global_ctl(u64 ctl)
+static __always_inline void amd_pmu_set_global_ctl(u64 ctl)
{
wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl);
}
@@ -878,6 +878,29 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
return amd_pmu_adjust_nmi_window(handled);
}

+static int amd_pmu_v2_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
+{
+ struct cpu_hw_events *cpuc;
+ unsigned long flags;
+
+ /* must not have branches... */
+ local_irq_save(flags);
+ amd_pmu_core_disable_all();
+ __amd_pmu_lbr_disable();
+ /* ... until here */
+
+ cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ amd_pmu_lbr_read();
+ cnt = min_t(unsigned int, cnt, x86_pmu.lbr_nr);
+ memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt);
+
+ amd_pmu_v2_enable_all(0);
+ local_irq_restore(flags);
+
+ return cnt;
+}
+
static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1414,6 +1437,10 @@ static int __init amd_core_pmu_init(void)
static_call_update(amd_pmu_branch_reset, amd_pmu_lbr_reset);
static_call_update(amd_pmu_branch_add, amd_pmu_lbr_add);
static_call_update(amd_pmu_branch_del, amd_pmu_lbr_del);
+
+ /* only support branch_stack snapshot on perfmon v2 */
+ if (x86_pmu.handle_irq == amd_pmu_v2_handle_irq)
+ static_call_update(perf_snapshot_branch_stack, amd_pmu_v2_snapshot_branch_stack);
} else if (!amd_brs_init()) {
/*
* BRS requires special event constraints and flushing on ctxsw.
diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c
index 4a1e600314d5..0e4de028590d 100644
--- a/arch/x86/events/amd/lbr.c
+++ b/arch/x86/events/amd/lbr.c
@@ -412,16 +412,11 @@ void amd_pmu_lbr_enable_all(void)
void amd_pmu_lbr_disable_all(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- u64 dbg_ctl, dbg_extn_cfg;

if (!cpuc->lbr_users || !x86_pmu.lbr_nr)
return;

- rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg);
- rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl);
-
- wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN);
- wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+ __amd_pmu_lbr_disable();
}

__init int amd_pmu_lbr_init(void)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index fb56518356ec..4dddf0a7e81e 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1329,6 +1329,17 @@ void amd_pmu_lbr_enable_all(void);
void amd_pmu_lbr_disable_all(void);
int amd_pmu_lbr_hw_config(struct perf_event *event);

+static __always_inline void __amd_pmu_lbr_disable(void)
+{
+ u64 dbg_ctl, dbg_extn_cfg;
+
+ rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg);
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl);
+
+ wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN);
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+}
+
#ifdef CONFIG_PERF_EVENTS_AMD_BRS

#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */
--
2.43.0



2024-03-29 16:37:55

by Andrii Nakryiko

[permalink] [raw]
Subject: [PATCH v3 2/2] perf/x86/amd: don't reject sampling events with configured LBR

Now that it's possible to capture LBR on AMD CPU from BPF at arbitrary
point, there is no reason to artificially limit this feature to sampling
events. So corresponding check is removed. AFAIU, there is no
correctness implications of doing this (and it was possible to bypass
this check by just setting perf_event's sample_period to 1 anyways, so
it doesn't guard all that much).

Reviewed-by: Sandipan Das <[email protected]>
Signed-off-by: Andrii Nakryiko <[email protected]>
---
arch/x86/events/amd/lbr.c | 4 ----
1 file changed, 4 deletions(-)

diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c
index 0e4de028590d..75920f895d67 100644
--- a/arch/x86/events/amd/lbr.c
+++ b/arch/x86/events/amd/lbr.c
@@ -310,10 +310,6 @@ int amd_pmu_lbr_hw_config(struct perf_event *event)
{
int ret = 0;

- /* LBR is not recommended in counting mode */
- if (!is_sampling_event(event))
- return -EINVAL;
-
ret = amd_pmu_lbr_setup_filter(event);
if (!ret)
event->attach_state |= PERF_ATTACH_SCHED_CB;
--
2.43.0


2024-03-30 11:07:17

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] perf/x86/amd: support capturing LBR from software events


* Andrii Nakryiko <[email protected]> wrote:

> [0] added ability to capture LBR (Last Branch Records) on Intel CPUs
> from inside BPF program at pretty much any arbitrary point.

Upstream commit ID:

c22ac2a3d4bd ("perf: Enable branch record for software events")

> [...] This is extremely useful capability that allows to figure out
> otherwise hard-to-debug problems, because LBR is now available based
> on some application-defined conditions, not just hardware-supported
> events.
>
> retsnoop ([1]) is one such tool that takes a huge advantage of this
> functionality and has proved to be an extremely useful tool in
> practice.
>
> Now, AMD Zen4 CPUs got support for similar LBR functionality, but
> necessary wiring inside the kernel is not yet setup. This patch seeks to
> rectify this and follows a similar approach to the original patch [0]
> for Intel CPUs.
>
> Given LBR can be set up to capture any indirect jumps, it's critical to
> minimize indirect jumps on the way to requesting LBR from BPF program,
> so we split amd_pmu_lbr_disable_all() into a wrapper with some generic
> conditions vs always-inlined __amd_pmu_lbr_disable() called directly
> from BPF subsystem (through perf_snapshot_branch_stack static call).
>
> This was tested on AMD Bergamo CPU and worked well when utilized from
> the aforementioned retsnoop tool.
>
> [0] https://lore.kernel.org/bpf/[email protected]/
> [1] https://github.com/anakryiko/retsnoop
>
> Reviewed-by: Sandipan Das <[email protected]>
> Signed-off-by: Andrii Nakryiko <[email protected]>
> ---
> arch/x86/events/amd/core.c | 29 ++++++++++++++++++++++++++++-
> arch/x86/events/amd/lbr.c | 7 +------
> arch/x86/events/perf_event.h | 11 +++++++++++
> 3 files changed, 40 insertions(+), 7 deletions(-)
>
> diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
> index aec16e581f5b..88f6d0701342 100644
> --- a/arch/x86/events/amd/core.c
> +++ b/arch/x86/events/amd/core.c
> @@ -618,7 +618,7 @@ static void amd_pmu_cpu_dead(int cpu)
> }
> }
>
> -static inline void amd_pmu_set_global_ctl(u64 ctl)
> +static __always_inline void amd_pmu_set_global_ctl(u64 ctl)

What is this inlining change about? My first guess was that it's to
generate better code, but my guess was wrong: it's to avoid branches.
To not force people to guess, please put it into a separate patch & add
an explanation.

> {
> wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl);
> }
> @@ -878,6 +878,29 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
> return amd_pmu_adjust_nmi_window(handled);
> }
>
> +static int amd_pmu_v2_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
> +{
> + struct cpu_hw_events *cpuc;
> + unsigned long flags;
> +
> + /* must not have branches... */
> + local_irq_save(flags);
> + amd_pmu_core_disable_all();
> + __amd_pmu_lbr_disable();
> + /* ... until here */

Oh ... so it's not about performance or code layout, but to avoid new
branches to contaminate the snapshot, right? Even stronger reason to
put that change into a separate patch.

> +
> + cpuc = this_cpu_ptr(&cpu_hw_events);
> +
> + amd_pmu_lbr_read();
> + cnt = min_t(unsigned int, cnt, x86_pmu.lbr_nr);

Why is min_t() used here? AFAICT all types here are 'unsigned int'.

> + memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt);

The function could use a description comment explaining the arguments,
and that the caller must make sure there's enough space in the
'entries' array.

> +
> + amd_pmu_v2_enable_all(0);
> + local_irq_restore(flags);
> +
> + return cnt;
> +}
> +
> static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
> {
> struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> @@ -1414,6 +1437,10 @@ static int __init amd_core_pmu_init(void)
> static_call_update(amd_pmu_branch_reset, amd_pmu_lbr_reset);
> static_call_update(amd_pmu_branch_add, amd_pmu_lbr_add);
> static_call_update(amd_pmu_branch_del, amd_pmu_lbr_del);
> +
> + /* only support branch_stack snapshot on perfmon v2 */
> + if (x86_pmu.handle_irq == amd_pmu_v2_handle_irq)
> + static_call_update(perf_snapshot_branch_stack, amd_pmu_v2_snapshot_branch_stack);

> } else if (!amd_brs_init()) {
> /*
> * BRS requires special event constraints and flushing on ctxsw.

Please use consistent capitalization in all new comments you add:

/* Properly capitalized comment */


> diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c
> index 4a1e600314d5..0e4de028590d 100644
> --- a/arch/x86/events/amd/lbr.c
> +++ b/arch/x86/events/amd/lbr.c
> @@ -412,16 +412,11 @@ void amd_pmu_lbr_enable_all(void)
> void amd_pmu_lbr_disable_all(void)
> {
> struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> - u64 dbg_ctl, dbg_extn_cfg;
>
> if (!cpuc->lbr_users || !x86_pmu.lbr_nr)
> return;
>
> - rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg);
> - rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl);
> -
> - wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN);
> - wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
> + __amd_pmu_lbr_disable();
> }
>
> __init int amd_pmu_lbr_init(void)
> diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
> index fb56518356ec..4dddf0a7e81e 100644
> --- a/arch/x86/events/perf_event.h
> +++ b/arch/x86/events/perf_event.h
> @@ -1329,6 +1329,17 @@ void amd_pmu_lbr_enable_all(void);
> void amd_pmu_lbr_disable_all(void);
> int amd_pmu_lbr_hw_config(struct perf_event *event);
>
> +static __always_inline void __amd_pmu_lbr_disable(void)
> +{
> + u64 dbg_ctl, dbg_extn_cfg;
> +
> + rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg);
> + rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl);
> +
> + wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN);
> + wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
> +}
> +

This factoring out of __amd_pmu_lbr_disable() should be in a separate
preparatory patch too.

Thanks,

Ingo

2024-03-31 04:10:51

by Andrii Nakryiko

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] perf/x86/amd: support capturing LBR from software events

On Sat, Mar 30, 2024 at 4:07 AM Ingo Molnar <[email protected]> wrote:
>
>
> * Andrii Nakryiko <[email protected]> wrote:
>
> > [0] added ability to capture LBR (Last Branch Records) on Intel CPUs
> > from inside BPF program at pretty much any arbitrary point.
>
> Upstream commit ID:
>
> c22ac2a3d4bd ("perf: Enable branch record for software events")

will include this in commit message

>
> > [...] This is extremely useful capability that allows to figure out
> > otherwise hard-to-debug problems, because LBR is now available based
> > on some application-defined conditions, not just hardware-supported
> > events.
> >
> > retsnoop ([1]) is one such tool that takes a huge advantage of this
> > functionality and has proved to be an extremely useful tool in
> > practice.
> >
> > Now, AMD Zen4 CPUs got support for similar LBR functionality, but
> > necessary wiring inside the kernel is not yet setup. This patch seeks to
> > rectify this and follows a similar approach to the original patch [0]
> > for Intel CPUs.
> >
> > Given LBR can be set up to capture any indirect jumps, it's critical to
> > minimize indirect jumps on the way to requesting LBR from BPF program,
> > so we split amd_pmu_lbr_disable_all() into a wrapper with some generic
> > conditions vs always-inlined __amd_pmu_lbr_disable() called directly
> > from BPF subsystem (through perf_snapshot_branch_stack static call).
> >
> > This was tested on AMD Bergamo CPU and worked well when utilized from
> > the aforementioned retsnoop tool.
> >
> > [0] https://lore.kernel.org/bpf/[email protected]/
> > [1] https://github.com/anakryiko/retsnoop
> >
> > Reviewed-by: Sandipan Das <[email protected]>
> > Signed-off-by: Andrii Nakryiko <[email protected]>
> > ---
> > arch/x86/events/amd/core.c | 29 ++++++++++++++++++++++++++++-
> > arch/x86/events/amd/lbr.c | 7 +------
> > arch/x86/events/perf_event.h | 11 +++++++++++
> > 3 files changed, 40 insertions(+), 7 deletions(-)
> >
> > diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
> > index aec16e581f5b..88f6d0701342 100644
> > --- a/arch/x86/events/amd/core.c
> > +++ b/arch/x86/events/amd/core.c
> > @@ -618,7 +618,7 @@ static void amd_pmu_cpu_dead(int cpu)
> > }
> > }
> >
> > -static inline void amd_pmu_set_global_ctl(u64 ctl)
> > +static __always_inline void amd_pmu_set_global_ctl(u64 ctl)
>
> What is this inlining change about? My first guess was that it's to
> generate better code, but my guess was wrong: it's to avoid branches.
> To not force people to guess, please put it into a separate patch & add
> an explanation.

ok, will do

>
> > {
> > wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl);
> > }
> > @@ -878,6 +878,29 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
> > return amd_pmu_adjust_nmi_window(handled);
> > }
> >
> > +static int amd_pmu_v2_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
> > +{
> > + struct cpu_hw_events *cpuc;
> > + unsigned long flags;
> > +
> > + /* must not have branches... */
> > + local_irq_save(flags);
> > + amd_pmu_core_disable_all();
> > + __amd_pmu_lbr_disable();
> > + /* ... until here */
>
> Oh ... so it's not about performance or code layout, but to avoid new
> branches to contaminate the snapshot, right? Even stronger reason to
> put that change into a separate patch.
>

yep

> > +
> > + cpuc = this_cpu_ptr(&cpu_hw_events);
> > +
> > + amd_pmu_lbr_read();
> > + cnt = min_t(unsigned int, cnt, x86_pmu.lbr_nr);
>
> Why is min_t() used here? AFAICT all types here are 'unsigned int'.

you are right, seems like it's all unsigned int, so I will drop min_t.
Not sure why the original Intel implementation used min_t().

>
> > + memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt);
>
> The function could use a description comment explaining the arguments,
> and that the caller must make sure there's enough space in the
> 'entries' array.
>

There is a comment next to perf_snapshot_branch_stack static call
definition in include/linux/perf_event.h. I'll include a comment that
amd_pmu_v2_snapshot_branch_stack() is an AMD-specific "implementation"
of that static call, so user can refer to it for documentation (which
would be kept in one place this way, instead of copy/pasting it
between Intel and AMD implementations).

> > +
> > + amd_pmu_v2_enable_all(0);
> > + local_irq_restore(flags);
> > +
> > + return cnt;
> > +}
> > +
> > static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
> > {
> > struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> > @@ -1414,6 +1437,10 @@ static int __init amd_core_pmu_init(void)
> > static_call_update(amd_pmu_branch_reset, amd_pmu_lbr_reset);
> > static_call_update(amd_pmu_branch_add, amd_pmu_lbr_add);
> > static_call_update(amd_pmu_branch_del, amd_pmu_lbr_del);
> > +
> > + /* only support branch_stack snapshot on perfmon v2 */
> > + if (x86_pmu.handle_irq == amd_pmu_v2_handle_irq)
> > + static_call_update(perf_snapshot_branch_stack, amd_pmu_v2_snapshot_branch_stack);
>
> > } else if (!amd_brs_init()) {
> > /*
> > * BRS requires special event constraints and flushing on ctxsw.
>
> Please use consistent capitalization in all new comments you add:
>
> /* Properly capitalized comment */
>

ack

>
> > diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c
> > index 4a1e600314d5..0e4de028590d 100644
> > --- a/arch/x86/events/amd/lbr.c
> > +++ b/arch/x86/events/amd/lbr.c
> > @@ -412,16 +412,11 @@ void amd_pmu_lbr_enable_all(void)
> > void amd_pmu_lbr_disable_all(void)
> > {
> > struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> > - u64 dbg_ctl, dbg_extn_cfg;
> >
> > if (!cpuc->lbr_users || !x86_pmu.lbr_nr)
> > return;
> >
> > - rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg);
> > - rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl);
> > -
> > - wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN);
> > - wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
> > + __amd_pmu_lbr_disable();
> > }
> >
> > __init int amd_pmu_lbr_init(void)
> > diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
> > index fb56518356ec..4dddf0a7e81e 100644
> > --- a/arch/x86/events/perf_event.h
> > +++ b/arch/x86/events/perf_event.h
> > @@ -1329,6 +1329,17 @@ void amd_pmu_lbr_enable_all(void);
> > void amd_pmu_lbr_disable_all(void);
> > int amd_pmu_lbr_hw_config(struct perf_event *event);
> >
> > +static __always_inline void __amd_pmu_lbr_disable(void)
> > +{
> > + u64 dbg_ctl, dbg_extn_cfg;
> > +
> > + rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg);
> > + rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl);
> > +
> > + wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN);
> > + wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
> > +}
> > +
>
> This factoring out of __amd_pmu_lbr_disable() should be in a separate
> preparatory patch too.
>

ok

> Thanks,
>
> Ingo