2024-01-09 21:36:41

by Namhyung Kim

Subject: [PATCH RESEND 1/2] perf/core: Update perf_adjust_freq_unthr_context()

It was unnecessarily disabling and enabling PMUs for each event. It
should be done at PMU level. Add pmu_ctx->nr_freq counter to check it
at each PMU. As pmu context has separate active lists for pinned group
and flexible group, factor out a new function to do the job.

Another minor optimization is that it can skip PMUs w/ CAP_NO_INTERRUPT
even if it needs to unthrottle sampling events.
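
Conceptually (a simplified view of the hunks below), the tick handler goes
from disabling and enabling the PMU around every single event:

	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
		perf_pmu_disable(event->pmu);
		/* adjust freq and/or unthrottle this event */
		perf_pmu_enable(event->pmu);
	}

to doing it once per PMU, walking only that PMU's active lists:

	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
		perf_pmu_disable(pmu_ctx->pmu);
		perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
		perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
		perf_pmu_enable(pmu_ctx->pmu);
	}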

Reviewed-by: Ian Rogers <[email protected]>
Reviewed-by: Kan Liang <[email protected]>
Tested-by: Mingwei Zhang <[email protected]>
Signed-off-by: Namhyung Kim <[email protected]>
---
include/linux/perf_event.h | 1 +
kernel/events/core.c | 68 +++++++++++++++++++++++---------------
2 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index d2a15c0c6f8a..b2ff60fa487e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -883,6 +883,7 @@ struct perf_event_pmu_context {

unsigned int nr_events;
unsigned int nr_cgroups;
+ unsigned int nr_freq;

atomic_t refcount; /* event <-> epc */
struct rcu_head rcu_head;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 59b332cce9e7..ce9db9dbfd4c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2277,8 +2277,10 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)

if (!is_software_event(event))
cpc->active_oncpu--;
- if (event->attr.freq && event->attr.sample_freq)
+ if (event->attr.freq && event->attr.sample_freq) {
ctx->nr_freq--;
+ epc->nr_freq--;
+ }
if (event->attr.exclusive || !cpc->active_oncpu)
cpc->exclusive = 0;

@@ -2533,9 +2535,10 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)

if (!is_software_event(event))
cpc->active_oncpu++;
- if (event->attr.freq && event->attr.sample_freq)
+ if (event->attr.freq && event->attr.sample_freq) {
ctx->nr_freq++;
-
+ epc->nr_freq++;
+ }
if (event->attr.exclusive)
cpc->exclusive = 1;

@@ -4098,30 +4101,14 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
}
}

-/*
- * combine freq adjustment with unthrottling to avoid two passes over the
- * events. At the same time, make sure, having freq events does not change
- * the rate of unthrottling as that would introduce bias.
- */
-static void
-perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
+static void perf_adjust_freq_unthr_events(struct list_head *event_list)
{
struct perf_event *event;
struct hw_perf_event *hwc;
u64 now, period = TICK_NSEC;
s64 delta;

- /*
- * only need to iterate over all events iff:
- * - context have events in frequency mode (needs freq adjust)
- * - there are events to unthrottle on this cpu
- */
- if (!(ctx->nr_freq || unthrottle))
- return;
-
- raw_spin_lock(&ctx->lock);
-
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+ list_for_each_entry(event, event_list, active_list) {
if (event->state != PERF_EVENT_STATE_ACTIVE)
continue;

@@ -4129,8 +4116,6 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
if (!event_filter_match(event))
continue;

- perf_pmu_disable(event->pmu);
-
hwc = &event->hw;

if (hwc->interrupts == MAX_INTERRUPTS) {
@@ -4140,7 +4125,7 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
}

if (!event->attr.freq || !event->attr.sample_freq)
- goto next;
+ continue;

/*
* stop the event and update event->count
@@ -4162,8 +4147,39 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
perf_adjust_period(event, period, delta, false);

event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
- next:
- perf_pmu_enable(event->pmu);
+ }
+}
+
+/*
+ * combine freq adjustment with unthrottling to avoid two passes over the
+ * events. At the same time, make sure, having freq events does not change
+ * the rate of unthrottling as that would introduce bias.
+ */
+static void
+perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
+{
+ struct perf_event_pmu_context *pmu_ctx;
+
+ /*
+ * only need to iterate over all events iff:
+ * - context have events in frequency mode (needs freq adjust)
+ * - there are events to unthrottle on this cpu
+ */
+ if (!(ctx->nr_freq || unthrottle))
+ return;
+
+ raw_spin_lock(&ctx->lock);
+
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (!(pmu_ctx->nr_freq || unthrottle))
+ continue;
+ if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
+ continue;
+
+ perf_pmu_disable(pmu_ctx->pmu);
+ perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
+ perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
+ perf_pmu_enable(pmu_ctx->pmu);
}

raw_spin_unlock(&ctx->lock);
--
2.43.0.472.g3155946c3a-goog



2024-01-09 21:36:47

by Namhyung Kim

Subject: [PATCH RESEND 2/2] perf/core: Reduce PMU access to adjust sample freq

For throttled events, it first starts the event and then stops it
unnecessarily. As it's already stopped, it can directly adjust
the frequency and then move on.
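
In other words (a simplified sketch of the resulting flow in
perf_adjust_freq_unthr_events(), not the exact hunk below), a throttled
freq event now has its period adjusted while it is still stopped, and
->start() is called only once at the end:

	if (hwc->interrupts == MAX_INTERRUPTS) {
		hwc->interrupts = 0;
		perf_log_throttle(event, 1);
		/* no ->start() here anymore; a stopped event goes
		   straight to the period adjustment below */
	}
	...
	if (delta > 0)
		perf_adjust_period(event, period, delta, false);

	/* single ->start() for both the unthrottle and freq cases */
	event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);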

Reviewed-by: Ian Rogers <[email protected]>
Reviewed-by: Kan Liang <[email protected]>
Signed-off-by: Namhyung Kim <[email protected]>
---
kernel/events/core.c | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index ce9db9dbfd4c..c80b6aa0e354 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4121,10 +4121,15 @@ static void perf_adjust_freq_unthr_events(struct list_head *event_list)
if (hwc->interrupts == MAX_INTERRUPTS) {
hwc->interrupts = 0;
perf_log_throttle(event, 1);
- event->pmu->start(event, 0);
- }

- if (!event->attr.freq || !event->attr.sample_freq)
+ if (!event->attr.freq || !event->attr.sample_freq) {
+ delta = 0;
+ goto next;
+ }
+
+ if (event->hw.state & PERF_HES_STOPPED)
+ goto adjust;
+ } else if (!event->attr.freq || !event->attr.sample_freq)
continue;

/*
@@ -4132,6 +4137,7 @@ static void perf_adjust_freq_unthr_events(struct list_head *event_list)
*/
event->pmu->stop(event, PERF_EF_UPDATE);

+adjust:
now = local64_read(&event->count);
delta = now - hwc->freq_count_stamp;
hwc->freq_count_stamp = now;
@@ -4146,6 +4152,7 @@ static void perf_adjust_freq_unthr_events(struct list_head *event_list)
if (delta > 0)
perf_adjust_period(event, period, delta, false);

+next:
event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
}
}
--
2.43.0.472.g3155946c3a-goog


2024-01-10 14:49:57

by Mark Rutland

Subject: Re: [PATCH RESEND 1/2] perf/core: Update perf_adjust_freq_unthr_context()

On Tue, Jan 09, 2024 at 01:36:22PM -0800, Namhyung Kim wrote:
> It was unnecessarily disabling and enabling PMUs for each event. It
> should be done at PMU level. Add pmu_ctx->nr_freq counter to check it
> at each PMU. As pmu context has separate active lists for pinned group
> and flexible group, factor out a new function to do the job.
>
> Another minor optimization is that it can skip PMUs w/ CAP_NO_INTERRUPT
> even if it needs to unthrottle sampling events.
>
> Reviewed-by: Ian Rogers <[email protected]>
> Reviewed-by: Kan Liang <[email protected]>
> Tested-by: Mingwei Zhang <[email protected]>
> Signed-off-by: Namhyung Kim <[email protected]>

Hi,

I've taken a quick look and I don't think this is quite right for
hybrid/big.LITTLE, but I think that should be relatively simple to fix (more on
that below).

This seems to be a bunch of optimizations; was that based on inspection alone,
or have you found a workload where this has a measurable impact?

> ---
> include/linux/perf_event.h | 1 +
> kernel/events/core.c | 68 +++++++++++++++++++++++---------------
> 2 files changed, 43 insertions(+), 26 deletions(-)
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index d2a15c0c6f8a..b2ff60fa487e 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -883,6 +883,7 @@ struct perf_event_pmu_context {
>
> unsigned int nr_events;
> unsigned int nr_cgroups;
> + unsigned int nr_freq;
>
> atomic_t refcount; /* event <-> epc */
> struct rcu_head rcu_head;
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 59b332cce9e7..ce9db9dbfd4c 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -2277,8 +2277,10 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
>
> if (!is_software_event(event))
> cpc->active_oncpu--;
> - if (event->attr.freq && event->attr.sample_freq)
> + if (event->attr.freq && event->attr.sample_freq) {
> ctx->nr_freq--;
> + epc->nr_freq--;
> + }
> if (event->attr.exclusive || !cpc->active_oncpu)
> cpc->exclusive = 0;
>
> @@ -2533,9 +2535,10 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
>
> if (!is_software_event(event))
> cpc->active_oncpu++;
> - if (event->attr.freq && event->attr.sample_freq)
> + if (event->attr.freq && event->attr.sample_freq) {
> ctx->nr_freq++;
> -
> + epc->nr_freq++;
> + }
> if (event->attr.exclusive)
> cpc->exclusive = 1;
>
> @@ -4098,30 +4101,14 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
> }
> }
>
> -/*
> - * combine freq adjustment with unthrottling to avoid two passes over the
> - * events. At the same time, make sure, having freq events does not change
> - * the rate of unthrottling as that would introduce bias.
> - */
> -static void
> -perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> +static void perf_adjust_freq_unthr_events(struct list_head *event_list)
> {
> struct perf_event *event;
> struct hw_perf_event *hwc;
> u64 now, period = TICK_NSEC;
> s64 delta;
>
> - /*
> - * only need to iterate over all events iff:
> - * - context have events in frequency mode (needs freq adjust)
> - * - there are events to unthrottle on this cpu
> - */
> - if (!(ctx->nr_freq || unthrottle))
> - return;
> -
> - raw_spin_lock(&ctx->lock);
> -
> - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
> + list_for_each_entry(event, event_list, active_list) {
> if (event->state != PERF_EVENT_STATE_ACTIVE)
> continue;
>
> @@ -4129,8 +4116,6 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> if (!event_filter_match(event))
> continue;
>
> - perf_pmu_disable(event->pmu);
> -
> hwc = &event->hw;
>
> if (hwc->interrupts == MAX_INTERRUPTS) {
> @@ -4140,7 +4125,7 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> }
>
> if (!event->attr.freq || !event->attr.sample_freq)
> - goto next;
> + continue;
>
> /*
> * stop the event and update event->count
> @@ -4162,8 +4147,39 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> perf_adjust_period(event, period, delta, false);
>
> event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
> - next:
> - perf_pmu_enable(event->pmu);
> + }
> +}
> +
> +/*
> + * combine freq adjustment with unthrottling to avoid two passes over the
> + * events. At the same time, make sure, having freq events does not change
> + * the rate of unthrottling as that would introduce bias.
> + */
> +static void
> +perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> +{
> + struct perf_event_pmu_context *pmu_ctx;
> +
> + /*
> + * only need to iterate over all events iff:
> + * - context have events in frequency mode (needs freq adjust)
> + * - there are events to unthrottle on this cpu
> + */
> + if (!(ctx->nr_freq || unthrottle))
> + return;
> +
> + raw_spin_lock(&ctx->lock);
> +
> + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> + if (!(pmu_ctx->nr_freq || unthrottle))
> + continue;
> + if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
> + continue;
> +
> + perf_pmu_disable(pmu_ctx->pmu);
> + perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
> + perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
> + perf_pmu_enable(pmu_ctx->pmu);
> }

I don't think this is correct for big.LITTLE/hybrid systems.

Imagine a system where CPUs 0-1 have pmu_a, CPUs 2-3 have pmu_b, and a task has
events for both pmu_a and pmu_b. The perf_event_context for that task will have
a perf_event_pmu_context for each PMU in its pmu_ctx_list.

Say that task is run on CPU0, and perf_event_task_tick() is called. That will
call perf_adjust_freq_unthr_context(), and it will iterate over the
pmu_ctx_list. Note that regardless of pmu_ctx->nr_freq, if 'unthrottle' is true,
we'll go ahead and call the following for all of the pmu contexts in the
pmu_ctx_list:

perf_pmu_disable(pmu_ctx->pmu);
perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
perf_pmu_enable(pmu_ctx->pmu);

... and that means we might call that for pmu_b, even though it's not
associated with CPU0. That could be fatal depending on what those callbacks do.

The old logic avoided that possibility implicitly, since the events for pmu_b
couldn't be active, and so the check at the start of the loop would skip all of
pmu_b's events:

if (event->state != PERF_EVENT_STATE_ACTIVE)
continue;

We could do similar by keeping track of how many active events each
perf_event_pmu_context has, which'd allow us to do something like:

if (pmu_ctx->nr_active == 0)
continue;
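
In context, that would make the loop look something like this (just a
sketch; 'nr_active' is a hypothetical counter here, maintained in
event_sched_in()/event_sched_out() alongside the existing nr_freq):

	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
		/* nothing from this PMU is scheduled on this CPU */
		if (pmu_ctx->nr_active == 0)
			continue;
		if (!(pmu_ctx->nr_freq || unthrottle))
			continue;
		if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
			continue;

		perf_pmu_disable(pmu_ctx->pmu);
		perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
		perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
		perf_pmu_enable(pmu_ctx->pmu);
	}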

How does that sound to you?

Mark.

2024-01-10 18:27:57

by Namhyung Kim

Subject: Re: [PATCH RESEND 1/2] perf/core: Update perf_adjust_freq_unthr_context()

Hi Mark,

On Wed, Jan 10, 2024 at 6:49 AM Mark Rutland <[email protected]> wrote:
>
> On Tue, Jan 09, 2024 at 01:36:22PM -0800, Namhyung Kim wrote:
> > It was unnecessarily disabling and enabling PMUs for each event. It
> > should be done at PMU level. Add pmu_ctx->nr_freq counter to check it
> > at each PMU. As pmu context has separate active lists for pinned group
> > and flexible group, factor out a new function to do the job.
> >
> > Another minor optimization is that it can skip PMUs w/ CAP_NO_INTERRUPT
> > even if it needs to unthrottle sampling events.
> >
> > Reviewed-by: Ian Rogers <[email protected]>
> > Reviewed-by: Kan Liang <[email protected]>
> > Tested-by: Mingwei Zhang <[email protected]>
> > Signed-off-by: Namhyung Kim <[email protected]>
>
> Hi,
>
> I've taken a quick look and I don't think this is quite right for
> hybrid/big.LITTLE, but I think that should be relatively simple to fix (more on
> that below).

Thanks for your review!

>
> This seems to be a bunch of optimizations; was that based on inspection alone,
> or have you found a workload where this has a measurable impact?

It's from a code inspection but I think Mingwei reported some excessive
MSR accesses for KVM use cases. Anyway it'd increase the interrupt
latency if you have slow (uncore) PMUs and lots of events on those PMUs.

>
> > ---
> > include/linux/perf_event.h | 1 +
> > kernel/events/core.c | 68 +++++++++++++++++++++++---------------
> > 2 files changed, 43 insertions(+), 26 deletions(-)
> >
> > diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> > index d2a15c0c6f8a..b2ff60fa487e 100644
> > --- a/include/linux/perf_event.h
> > +++ b/include/linux/perf_event.h
> > @@ -883,6 +883,7 @@ struct perf_event_pmu_context {
> >
> > unsigned int nr_events;
> > unsigned int nr_cgroups;
> > + unsigned int nr_freq;
> >
> > atomic_t refcount; /* event <-> epc */
> > struct rcu_head rcu_head;
> > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > index 59b332cce9e7..ce9db9dbfd4c 100644
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -2277,8 +2277,10 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
> >
> > if (!is_software_event(event))
> > cpc->active_oncpu--;
> > - if (event->attr.freq && event->attr.sample_freq)
> > + if (event->attr.freq && event->attr.sample_freq) {
> > ctx->nr_freq--;
> > + epc->nr_freq--;
> > + }
> > if (event->attr.exclusive || !cpc->active_oncpu)
> > cpc->exclusive = 0;
> >
> > @@ -2533,9 +2535,10 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
> >
> > if (!is_software_event(event))
> > cpc->active_oncpu++;
> > - if (event->attr.freq && event->attr.sample_freq)
> > + if (event->attr.freq && event->attr.sample_freq) {
> > ctx->nr_freq++;
> > -
> > + epc->nr_freq++;
> > + }
> > if (event->attr.exclusive)
> > cpc->exclusive = 1;
> >
> > @@ -4098,30 +4101,14 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
> > }
> > }
> >
> > -/*
> > - * combine freq adjustment with unthrottling to avoid two passes over the
> > - * events. At the same time, make sure, having freq events does not change
> > - * the rate of unthrottling as that would introduce bias.
> > - */
> > -static void
> > -perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> > +static void perf_adjust_freq_unthr_events(struct list_head *event_list)
> > {
> > struct perf_event *event;
> > struct hw_perf_event *hwc;
> > u64 now, period = TICK_NSEC;
> > s64 delta;
> >
> > - /*
> > - * only need to iterate over all events iff:
> > - * - context have events in frequency mode (needs freq adjust)
> > - * - there are events to unthrottle on this cpu
> > - */
> > - if (!(ctx->nr_freq || unthrottle))
> > - return;
> > -
> > - raw_spin_lock(&ctx->lock);
> > -
> > - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
> > + list_for_each_entry(event, event_list, active_list) {
> > if (event->state != PERF_EVENT_STATE_ACTIVE)
> > continue;
> >
> > @@ -4129,8 +4116,6 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> > if (!event_filter_match(event))
> > continue;
> >
> > - perf_pmu_disable(event->pmu);
> > -
> > hwc = &event->hw;
> >
> > if (hwc->interrupts == MAX_INTERRUPTS) {
> > @@ -4140,7 +4125,7 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> > }
> >
> > if (!event->attr.freq || !event->attr.sample_freq)
> > - goto next;
> > + continue;
> >
> > /*
> > * stop the event and update event->count
> > @@ -4162,8 +4147,39 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> > perf_adjust_period(event, period, delta, false);
> >
> > event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
> > - next:
> > - perf_pmu_enable(event->pmu);
> > + }
> > +}
> > +
> > +/*
> > + * combine freq adjustment with unthrottling to avoid two passes over the
> > + * events. At the same time, make sure, having freq events does not change
> > + * the rate of unthrottling as that would introduce bias.
> > + */
> > +static void
> > +perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> > +{
> > + struct perf_event_pmu_context *pmu_ctx;
> > +
> > + /*
> > + * only need to iterate over all events iff:
> > + * - context have events in frequency mode (needs freq adjust)
> > + * - there are events to unthrottle on this cpu
> > + */
> > + if (!(ctx->nr_freq || unthrottle))
> > + return;
> > +
> > + raw_spin_lock(&ctx->lock);
> > +
> > + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> > + if (!(pmu_ctx->nr_freq || unthrottle))
> > + continue;
> > + if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
> > + continue;
> > +
> > + perf_pmu_disable(pmu_ctx->pmu);
> > + perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
> > + perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
> > + perf_pmu_enable(pmu_ctx->pmu);
> > }
>
> I don't think this is correct for big.LITTLE/hybrid systems.
>
> Imagine a system where CPUs 0-1 have pmu_a, CPUs 2-3 have pmu_b, and a task has
> events for both pmu_a and pmu_b. The perf_event_context for that task will have
> a perf_event_pmu_context for each PMU in its pmu_ctx_list.
>
> Say that task is run on CPU0, and perf_event_task_tick() is called. That will
> call perf_adjust_freq_unthr_context(), and it will iterate over the
> pmu_ctx_list. Note that regardless of pmu_ctx->nr_freq, if 'unthrottle' is true,
> we'll go ahead and call the following for all of the pmu contexts in the
> pmu_ctx_list:
>
> perf_pmu_disable(pmu_ctx->pmu);
> perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
> perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
> perf_pmu_enable(pmu_ctx->pmu);
>
> ... and that means we might call that for pmu_b, even though it's not
> associated with CPU0. That could be fatal depending on what those callbacks do.

Thanks for pointing that out. I missed the hybrid cases.

>
> The old logic avoided that possibility implicitly, since the events for pmu_b
> couldn't be active, and so the check at the start of the loop would skip all of
> pmu_b's events:
>
> if (event->state != PERF_EVENT_STATE_ACTIVE)
> continue;
>
> We could do similar by keeping track of how many active events each
> perf_event_pmu_context has, which'd allow us to do something like:
>
> if (pmu_ctx->nr_active == 0)
> continue;
>
> How does that sound to you?

Sounds good. Maybe we can just test if both active lists are empty.
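
Something like this, I suppose (untested sketch, reusing the existing
pinned_active/flexible_active lists instead of adding a new counter):

	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
		/* no events from this PMU are active on this CPU */
		if (list_empty(&pmu_ctx->pinned_active) &&
		    list_empty(&pmu_ctx->flexible_active))
			continue;
		...
	}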

Thanks,
Namhyung

2024-01-10 18:45:32

by Mingwei Zhang

Subject: Re: [PATCH RESEND 1/2] perf/core: Update perf_adjust_freq_unthr_context()

On Wed, Jan 10, 2024 at 10:27 AM Namhyung Kim <[email protected]> wrote:
>
> Hi Mark,
>
> On Wed, Jan 10, 2024 at 6:49 AM Mark Rutland <[email protected]> wrote:
> >
> > On Tue, Jan 09, 2024 at 01:36:22PM -0800, Namhyung Kim wrote:
> > > It was unnecessarily disabling and enabling PMUs for each event. It
> > > should be done at PMU level. Add pmu_ctx->nr_freq counter to check it
> > > at each PMU. As pmu context has separate active lists for pinned group
> > > and flexible group, factor out a new function to do the job.
> > >
> > > Another minor optimization is that it can skip PMUs w/ CAP_NO_INTERRUPT
> > > even if it needs to unthrottle sampling events.
> > >
> > > Reviewed-by: Ian Rogers <[email protected]>
> > > Reviewed-by: Kan Liang <[email protected]>
> > > Tested-by: Mingwei Zhang <[email protected]>
> > > Signed-off-by: Namhyung Kim <[email protected]>
> >
> > Hi,
> >
> > I've taken a quick look and I don't think this is quite right for
> > hybrid/big.LITTLE, but I think that should be relatively simple to fix (more on
> > that below).
>
> Thanks for your review!
>
> >
> > This seems to be a bunch of optimizations; was that based on inspection alone,
> > or have you found a workload where this has a measurable impact?
>
> It's from a code inspection but I think Mingwei reported some excessive
> MSR accesses for KVM use cases. Anyway it'd increase the interrupt
> latency if you have slow (uncore) PMUs and lots of events on those PMUs.
>

Yes, we have observed a huge host-level overhead when guest PMU is
multiplexing events in frequency mode. The investigation finally
narrowed down to this code after profiling. We found that there are
excessive MSR writes to 0x38f, which are caused by PMU disable and
enable operations. These operations are pretty heavyweight in a
virtualized environment because all of the wrmsr to 0x38f will not
directly go to HW but get redirected via perf API to the host PMU. The
round trip overhead is very high. Note that this overhead is
implicitly visible to guests, i.e., when a guest is running a
CPU-saturating workload (eg., SPEC2017), they will see their vCPU
performance return to ancient times.

The interesting part that I have observed after applying this patch is
an increase in guest PMIs. This (or there might be another reason)
causes the host-level overhead to remain high.

But the patch is still quite useful as it does reduce the excessive
VMEXITs caused by the wrmsr to 0x38f.

Thanks.
-Mingwei
> >
> > > ---
> > > include/linux/perf_event.h | 1 +
> > > kernel/events/core.c | 68 +++++++++++++++++++++++---------------
> > > 2 files changed, 43 insertions(+), 26 deletions(-)
> > >
> > > diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> > > index d2a15c0c6f8a..b2ff60fa487e 100644
> > > --- a/include/linux/perf_event.h
> > > +++ b/include/linux/perf_event.h
> > > @@ -883,6 +883,7 @@ struct perf_event_pmu_context {
> > >
> > > unsigned int nr_events;
> > > unsigned int nr_cgroups;
> > > + unsigned int nr_freq;
> > >
> > > atomic_t refcount; /* event <-> epc */
> > > struct rcu_head rcu_head;
> > > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > > index 59b332cce9e7..ce9db9dbfd4c 100644
> > > --- a/kernel/events/core.c
> > > +++ b/kernel/events/core.c
> > > @@ -2277,8 +2277,10 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
> > >
> > > if (!is_software_event(event))
> > > cpc->active_oncpu--;
> > > - if (event->attr.freq && event->attr.sample_freq)
> > > + if (event->attr.freq && event->attr.sample_freq) {
> > > ctx->nr_freq--;
> > > + epc->nr_freq--;
> > > + }
> > > if (event->attr.exclusive || !cpc->active_oncpu)
> > > cpc->exclusive = 0;
> > >
> > > @@ -2533,9 +2535,10 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
> > >
> > > if (!is_software_event(event))
> > > cpc->active_oncpu++;
> > > - if (event->attr.freq && event->attr.sample_freq)
> > > + if (event->attr.freq && event->attr.sample_freq) {
> > > ctx->nr_freq++;
> > > -
> > > + epc->nr_freq++;
> > > + }
> > > if (event->attr.exclusive)
> > > cpc->exclusive = 1;
> > >
> > > @@ -4098,30 +4101,14 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
> > > }
> > > }
> > >
> > > -/*
> > > - * combine freq adjustment with unthrottling to avoid two passes over the
> > > - * events. At the same time, make sure, having freq events does not change
> > > - * the rate of unthrottling as that would introduce bias.
> > > - */
> > > -static void
> > > -perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> > > +static void perf_adjust_freq_unthr_events(struct list_head *event_list)
> > > {
> > > struct perf_event *event;
> > > struct hw_perf_event *hwc;
> > > u64 now, period = TICK_NSEC;
> > > s64 delta;
> > >
> > > - /*
> > > - * only need to iterate over all events iff:
> > > - * - context have events in frequency mode (needs freq adjust)
> > > - * - there are events to unthrottle on this cpu
> > > - */
> > > - if (!(ctx->nr_freq || unthrottle))
> > > - return;
> > > -
> > > - raw_spin_lock(&ctx->lock);
> > > -
> > > - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
> > > + list_for_each_entry(event, event_list, active_list) {
> > > if (event->state != PERF_EVENT_STATE_ACTIVE)
> > > continue;
> > >
> > > @@ -4129,8 +4116,6 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> > > if (!event_filter_match(event))
> > > continue;
> > >
> > > - perf_pmu_disable(event->pmu);
> > > -
> > > hwc = &event->hw;
> > >
> > > if (hwc->interrupts == MAX_INTERRUPTS) {
> > > @@ -4140,7 +4125,7 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> > > }
> > >
> > > if (!event->attr.freq || !event->attr.sample_freq)
> > > - goto next;
> > > + continue;
> > >
> > > /*
> > > * stop the event and update event->count
> > > @@ -4162,8 +4147,39 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> > > perf_adjust_period(event, period, delta, false);
> > >
> > > event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
> > > - next:
> > > - perf_pmu_enable(event->pmu);
> > > + }
> > > +}
> > > +
> > > +/*
> > > + * combine freq adjustment with unthrottling to avoid two passes over the
> > > + * events. At the same time, make sure, having freq events does not change
> > > + * the rate of unthrottling as that would introduce bias.
> > > + */
> > > +static void
> > > +perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> > > +{
> > > + struct perf_event_pmu_context *pmu_ctx;
> > > +
> > > + /*
> > > + * only need to iterate over all events iff:
> > > + * - context have events in frequency mode (needs freq adjust)
> > > + * - there are events to unthrottle on this cpu
> > > + */
> > > + if (!(ctx->nr_freq || unthrottle))
> > > + return;
> > > +
> > > + raw_spin_lock(&ctx->lock);
> > > +
> > > + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> > > + if (!(pmu_ctx->nr_freq || unthrottle))
> > > + continue;
> > > + if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
> > > + continue;
> > > +
> > > + perf_pmu_disable(pmu_ctx->pmu);
> > > + perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
> > > + perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
> > > + perf_pmu_enable(pmu_ctx->pmu);
> > > }
> >
> > I don't think this is correct for big.LITTLE/hybrid systems.
> >
> > Imagine a system where CPUs 0-1 have pmu_a, CPUs 2-3 have pmu_b, and a task has
> > events for both pmu_a and pmu_b. The perf_event_context for that task will have
> > a perf_event_pmu_context for each PMU in its pmu_ctx_list.
> >
> > Say that task is run on CPU0, and perf_event_task_tick() is called. That will
> > call perf_adjust_freq_unthr_context(), and it will iterate over the
> > pmu_ctx_list. Note that regardless of pmu_ctx->nr_freq, if 'unthrottle' is true,
> > we'll go ahead and call the following for all of the pmu contexts in the
> > pmu_ctx_list:
> >
> > perf_pmu_disable(pmu_ctx->pmu);
> > perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
> > perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
> > perf_pmu_enable(pmu_ctx->pmu);
> >
> > ... and that means we might call that for pmu_b, even though it's not
> > associated with CPU0. That could be fatal depending on what those callbacks do.
>
> Thanks for pointing that out. I missed the hybrid cases.
>
> >
> > The old logic avoided that possibility implicitly, since the events for pmu_b
> > couldn't be active, and so the check at the start of the loop would skip all of
> > pmu_b's events:
> >
> > if (event->state != PERF_EVENT_STATE_ACTIVE)
> > continue;
> >
> > We could do similar by keeping track of how many active events each
> > perf_event_pmu_context has, which'd allow us to do something like:
> >
> > if (pmu_ctx->nr_active == 0)
> > continue;
> >
> > How does that sound to you?
>
> Sounds good. Maybe we can just test if both active lists are empty.
>
> Thanks,
> Namhyung

2024-01-11 10:41:47

by Mark Rutland

Subject: Re: [PATCH RESEND 1/2] perf/core: Update perf_adjust_freq_unthr_context()

On Wed, Jan 10, 2024 at 10:27:27AM -0800, Namhyung Kim wrote:
> On Wed, Jan 10, 2024 at 6:49 AM Mark Rutland <[email protected]> wrote:
> > On Tue, Jan 09, 2024 at 01:36:22PM -0800, Namhyung Kim wrote:
> > > It was unnecessarily disabling and enabling PMUs for each event. It
> > > should be done at PMU level. Add pmu_ctx->nr_freq counter to check it
> > > at each PMU. As pmu context has separate active lists for pinned group
> > > and flexible group, factor out a new function to do the job.
> > >
> > > Another minor optimization is that it can skip PMUs w/ CAP_NO_INTERRUPT
> > > even if it needs to unthrottle sampling events.
> > >
> > > Reviewed-by: Ian Rogers <[email protected]>
> > > Reviewed-by: Kan Liang <[email protected]>
> > > Tested-by: Mingwei Zhang <[email protected]>
> > > Signed-off-by: Namhyung Kim <[email protected]>
> >
> > Hi,
> >
> > I've taken a quick look and I don't think this is quite right for
> > hybrid/big.LITTLE, but I think that should be relatively simple to fix (more on
> > that below).
>
> Thanks for your review!

No problem!

> > This seems to be a bunch of optimizations; was that based on inspection alone,
> > or have you found a workload where this has a measurable impact?
>
> It's from a code inspection but I think Mingwei reported some excessive
> MSR accesses for KVM use cases. Anyway it'd increase the interrupt
> latency if you have slow (uncore) PMUs and lots of events on those PMUs.

Makes sense; it would be good if we could put something in the commit message
mentioning that.

[...]

> > > +static void
> > > +perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> > > +{
> > > + struct perf_event_pmu_context *pmu_ctx;
> > > +
> > > + /*
> > > + * only need to iterate over all events iff:
> > > + * - context have events in frequency mode (needs freq adjust)
> > > + * - there are events to unthrottle on this cpu
> > > + */
> > > + if (!(ctx->nr_freq || unthrottle))
> > > + return;
> > > +
> > > + raw_spin_lock(&ctx->lock);
> > > +
> > > + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> > > + if (!(pmu_ctx->nr_freq || unthrottle))
> > > + continue;
> > > + if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
> > > + continue;
> > > +
> > > + perf_pmu_disable(pmu_ctx->pmu);
> > > + perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
> > > + perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
> > > + perf_pmu_enable(pmu_ctx->pmu);
> > > }
> >
> > I don't think this is correct for big.LITTLE/hybrid systems.
> >
> > Imagine a system where CPUs 0-1 have pmu_a, CPUs 2-3 have pmu_b, and a task has
> > events for both pmu_a and pmu_b. The perf_event_context for that task will have
> > a perf_event_pmu_context for each PMU in its pmu_ctx_list.
> >
> > Say that task is run on CPU0, and perf_event_task_tick() is called. That will
> > call perf_adjust_freq_unthr_context(), and it will iterate over the
> > pmu_ctx_list. Note that regardless of pmu_ctx->nr_freq, if 'unthrottle' is true,
> > we'll go ahead and call the following for all of the pmu contexts in the
> > pmu_ctx_list:
> >
> > perf_pmu_disable(pmu_ctx->pmu);
> > perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
> > perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
> > perf_pmu_enable(pmu_ctx->pmu);
> >
> > ... and that means we might call that for pmu_b, even though it's not
> > associated with CPU0. That could be fatal depending on what those callbacks do.
>
> Thanks for pointing that out. I missed the hybrid cases.
>
> > The old logic avoided that possibility implicitly, since the events for pmu_b
> > couldn't be active, and so the check at the start of the loop would skip all of
> > pmu_b's events:
> >
> > if (event->state != PERF_EVENT_STATE_ACTIVE)
> > continue;
> >
> > We could do similar by keeping track of how many active events each
> > perf_event_pmu_context has, which'd allow us to do something like:
> >
> > if (pmu_ctx->nr_active == 0)
> > continue;
> >
> > How does that sound to you?
>
> Sounds good. Maybe we can just test if both active lists are empty.

Good idea, I think that'd be simpler and less fragile.

Thanks,
Mark.