2022-03-14 08:00:05

by Wen Yang

Subject: [PATCH v2 1/3] perf/x86: extract code to assign perf events for both core and uncore

The following two patterns are duplicated in several places in the x86 perf
code:
- fast path, try to reuse the previous register
- slow path, assign a counter for each event

To improve code quality and to prepare for the following patches in this
series, which use the same patterns, extract this code into
perf_assign_events().

This commit doesn't change functionality.
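
For readers skimming the diff, here is a standalone, simplified sketch of the
fast-path/slow-path pattern being factored out. The toy_* names and types are
illustrative stand-ins, not the kernel's actual structures or helpers:

/*
 * Simplified model of the pattern factored into perf_assign_events().
 */
#include <stdint.h>
#include <stdio.h>

struct toy_constraint {
	uint64_t idxmsk;	/* bitmask of counters this event may use */
};

struct toy_event {
	int idx;		/* previously assigned counter, -1 if none */
};

/* Placeholder for the full constraint-based scheduler (slow path). */
static int toy_slow_path(struct toy_event **events,
			 struct toy_constraint **constraints,
			 int n, int *assign)
{
	(void)events; (void)constraints; (void)n; (void)assign;
	return 0;	/* pretend everything got scheduled */
}

static int toy_assign_events(struct toy_event **events,
			     struct toy_constraint **constraints,
			     int n, int *assign)
{
	uint64_t used_mask = 0;
	int i;

	/* fast path: keep each event on its previous counter if possible */
	for (i = 0; i < n; i++) {
		struct toy_event *e = events[i];
		struct toy_constraint *c = constraints[i];
		uint64_t mask;

		if (e->idx == -1)			/* never assigned */
			break;
		if (!(c->idxmsk & (1ULL << e->idx)))	/* constraint changed */
			break;

		mask = 1ULL << e->idx;
		if (used_mask & mask)			/* counter already taken */
			break;

		used_mask |= mask;
		assign[i] = e->idx;
	}

	/* slow path: some event could not keep its counter, redo everything */
	if (i != n)
		return toy_slow_path(events, constraints, n, assign);

	return 0;
}

int main(void)
{
	struct toy_event e0 = { .idx = 0 }, e1 = { .idx = 2 };
	struct toy_constraint c0 = { .idxmsk = 0x3 }, c1 = { .idxmsk = 0xf };
	struct toy_event *events[] = { &e0, &e1 };
	struct toy_constraint *constraints[] = { &c0, &c1 };
	int assign[2] = { -1, -1 };

	toy_assign_events(events, constraints, 2, assign);
	printf("assign: %d %d\n", assign[0], assign[1]);	/* 0 2 */
	return 0;
}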

Signed-off-by: Wen Yang <[email protected]>
Cc: Peter Zijlstra (Intel) <[email protected]>
Cc: Stephane Eranian <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Mark Rutland <[email protected]>
Cc: Alexander Shishkin <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: [email protected]
Cc: Wen Yang <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: [email protected]
Cc: [email protected]
---
arch/x86/events/core.c | 141 ++++++++++++++++++++++-------------------
arch/x86/events/intel/uncore.c | 31 +--------
arch/x86/events/perf_event.h | 6 +-
3 files changed, 82 insertions(+), 96 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index e686c5e..b14fb1b 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -950,10 +950,7 @@ static bool perf_sched_next_event(struct perf_sched *sched)
return true;
}

-/*
- * Assign a counter for each event.
- */
-int perf_assign_events(struct event_constraint **constraints, int n,
+static int __perf_assign_events(struct event_constraint **constraints, int n,
int wmin, int wmax, int gpmax, int *assign)
{
struct perf_sched sched;
@@ -969,16 +966,66 @@ int perf_assign_events(struct event_constraint **constraints, int n,

return sched.state.unassigned;
}
+
+/*
+ * Assign a counter for each event.
+ */
+int perf_assign_events(struct perf_event **event_list,
+ struct event_constraint **constraints, int n,
+ int wmin, int wmax, int gpmax, int *assign)
+{
+ struct event_constraint *c;
+ struct hw_perf_event *hwc;
+ u64 used_mask = 0;
+ int unsched = 0;
+ int i;
+
+ /*
+ * fastpath, try to reuse previous register
+ */
+ for (i = 0; i < n; i++) {
+ u64 mask;
+
+ hwc = &event_list[i]->hw;
+ c = constraints[i];
+
+ /* never assigned */
+ if (hwc->idx == -1)
+ break;
+
+ /* constraint still honored */
+ if (!test_bit(hwc->idx, c->idxmsk))
+ break;
+
+ mask = BIT_ULL(hwc->idx);
+ if (is_counter_pair(hwc))
+ mask |= mask << 1;
+
+ /* not already used */
+ if (used_mask & mask)
+ break;
+
+ used_mask |= mask;
+
+ if (assign)
+ assign[i] = hwc->idx;
+ }
+
+ /* slow path */
+ if (i != n)
+ unsched = __perf_assign_events(constraints, n,
+ wmin, wmax, gpmax, assign);
+
+ return unsched;
+}
EXPORT_SYMBOL_GPL(perf_assign_events);

int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
int num_counters = hybrid(cpuc->pmu, num_counters);
- struct event_constraint *c;
- struct perf_event *e;
int n0, i, wmin, wmax, unsched = 0;
- struct hw_perf_event *hwc;
- u64 used_mask = 0;
+ struct event_constraint *c;
+ int gpmax = num_counters;

/*
* Compute the number of events already present; see x86_pmu_add(),
@@ -1017,66 +1064,30 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
}

/*
- * fastpath, try to reuse previous register
+ * Do not allow scheduling of more than half the available
+ * generic counters.
+ *
+ * This helps avoid counter starvation of sibling thread by
+ * ensuring at most half the counters cannot be in exclusive
+ * mode. There is no designated counters for the limits. Any
+ * N/2 counters can be used. This helps with events with
+ * specific counter constraints.
*/
- for (i = 0; i < n; i++) {
- u64 mask;
-
- hwc = &cpuc->event_list[i]->hw;
- c = cpuc->event_constraint[i];
-
- /* never assigned */
- if (hwc->idx == -1)
- break;
-
- /* constraint still honored */
- if (!test_bit(hwc->idx, c->idxmsk))
- break;
-
- mask = BIT_ULL(hwc->idx);
- if (is_counter_pair(hwc))
- mask |= mask << 1;
-
- /* not already used */
- if (used_mask & mask)
- break;
+ if (is_ht_workaround_enabled() && !cpuc->is_fake &&
+ READ_ONCE(cpuc->excl_cntrs->exclusive_present))
+ gpmax /= 2;

- used_mask |= mask;
-
- if (assign)
- assign[i] = hwc->idx;
+ /*
+ * Reduce the amount of available counters to allow fitting
+ * the extra Merge events needed by large increment events.
+ */
+ if (x86_pmu.flags & PMU_FL_PAIR) {
+ gpmax = num_counters - cpuc->n_pair;
+ WARN_ON(gpmax <= 0);
}

- /* slow path */
- if (i != n) {
- int gpmax = num_counters;
-
- /*
- * Do not allow scheduling of more than half the available
- * generic counters.
- *
- * This helps avoid counter starvation of sibling thread by
- * ensuring at most half the counters cannot be in exclusive
- * mode. There is no designated counters for the limits. Any
- * N/2 counters can be used. This helps with events with
- * specific counter constraints.
- */
- if (is_ht_workaround_enabled() && !cpuc->is_fake &&
- READ_ONCE(cpuc->excl_cntrs->exclusive_present))
- gpmax /= 2;
-
- /*
- * Reduce the amount of available counters to allow fitting
- * the extra Merge events needed by large increment events.
- */
- if (x86_pmu.flags & PMU_FL_PAIR) {
- gpmax = num_counters - cpuc->n_pair;
- WARN_ON(gpmax <= 0);
- }
-
- unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
- wmax, gpmax, assign);
- }
+ unsched = perf_assign_events(cpuc->event_list, cpuc->event_constraint,
+ n, wmin, wmax, gpmax, assign);

/*
* In case of success (unsched = 0), mark events as committed,
@@ -1093,7 +1104,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
static_call_cond(x86_pmu_commit_scheduling)(cpuc, i, assign[i]);
} else {
for (i = n0; i < n; i++) {
- e = cpuc->event_list[i];
+ struct perf_event *e = cpuc->event_list[i];

/*
* release events that failed scheduling
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index e497da9..101358a 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -442,12 +442,8 @@ static void uncore_put_event_constraint(struct intel_uncore_box *box,

static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n)
{
- unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
struct event_constraint *c;
int i, wmin, wmax, ret = 0;
- struct hw_perf_event *hwc;
-
- bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX);

for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) {
c = uncore_get_event_constraint(box, box->event_list[i]);
@@ -456,31 +452,8 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int
wmax = max(wmax, c->weight);
}

- /* fastpath, try to reuse previous register */
- for (i = 0; i < n; i++) {
- hwc = &box->event_list[i]->hw;
- c = box->event_constraint[i];
-
- /* never assigned */
- if (hwc->idx == -1)
- break;
-
- /* constraint still honored */
- if (!test_bit(hwc->idx, c->idxmsk))
- break;
-
- /* not already used */
- if (test_bit(hwc->idx, used_mask))
- break;
-
- __set_bit(hwc->idx, used_mask);
- if (assign)
- assign[i] = hwc->idx;
- }
- /* slow path */
- if (i != n)
- ret = perf_assign_events(box->event_constraint, n,
- wmin, wmax, n, assign);
+ ret = perf_assign_events(box->event_list,
+ box->event_constraint, n, wmin, wmax, n, assign);

if (!assign || ret) {
for (i = 0; i < n; i++)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 150261d..f1acd1d 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1130,8 +1130,10 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,

void x86_pmu_enable_all(int added);

-int perf_assign_events(struct event_constraint **constraints, int n,
- int wmin, int wmax, int gpmax, int *assign);
+int perf_assign_events(struct perf_event **event_list,
+ struct event_constraint **constraints, int n,
+ int wmin, int wmax, int gpmax, int *assign);
+
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);

void x86_pmu_stop(struct perf_event *event, int flags);
--
1.8.3.1


2022-03-14 14:46:54

by Wen Yang

Subject: [PATCH v2 3/3] perf/x86: reuse scarce pmu counters

The NMI watchdog may permanently consume a fixed counter (*cycles*), so when
other programs also count *cycles*, they end up occupying a general-purpose
(GP) counter. This patch is a slight optimization: it spares a generic counter
for events that are non-sampling and able to run on a fixed counter.
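
To make the intent concrete, here is a simplified, standalone sketch of the
reuse rule this patch introduces. The toy_* names are illustrative stand-ins;
only the shape of the check mirrors is_pmc_reuseable() in the diff below:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_IDX_FIXED 32	/* stand-in for INTEL_PMC_IDX_FIXED */

struct toy_constraint {
	uint64_t idxmsk;	/* allowed counter indices for the event */
};

struct toy_event {
	bool sampling;		/* sampling events need a counter to themselves */
};

/* Mirrors the shape of is_pmc_reuseable() added by the patch. */
static bool toy_pmc_reuseable(const struct toy_event *e,
			      const struct toy_constraint *c)
{
	/* fixed-counter-capable constraint and a pure counting event */
	return (c->idxmsk & (~0ULL << TOY_IDX_FIXED)) && !e->sampling;
}

/*
 * In the fast path, a collision on an already-used counter is no longer
 * always fatal: a counting event that can live on a fixed counter may
 * share it rather than spill onto a generic counter.
 */
static bool toy_fast_path_keep(const struct toy_event *e,
			       const struct toy_constraint *c,
			       uint64_t used_mask, int idx)
{
	uint64_t mask = 1ULL << idx;

	if (!(used_mask & mask))
		return true;			/* counter is free */
	return toy_pmc_reuseable(e, c);		/* busy, but sharable */
}

int main(void)
{
	struct toy_event counting_cycles = { .sampling = false };
	struct toy_constraint fixed_capable = { .idxmsk = ~0ULL << TOY_IDX_FIXED };
	uint64_t used = 1ULL << TOY_IDX_FIXED;	/* fixed cycles counter busy */

	/* a second counting "cycles" event may share the busy fixed counter */
	printf("%d\n", toy_fast_path_keep(&counting_cycles, &fixed_capable,
					  used, TOY_IDX_FIXED));	/* 1 */
	return 0;
}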

Signed-off-by: Wen Yang <[email protected]>
Cc: Peter Zijlstra (Intel) <[email protected]>
Cc: Stephane Eranian <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Mark Rutland <[email protected]>
Cc: Alexander Shishkin <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: [email protected]
Cc: Wen Yang <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: [email protected]
Cc: [email protected]
---
arch/x86/events/core.c | 45 +++++++++++++++++++++++++++++++--------------
1 file changed, 31 insertions(+), 14 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index b7f5925..6ddddf1 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -799,6 +799,7 @@ struct perf_sched {
u64 msk_counters;
u64 msk_events;
struct event_constraint **constraints;
+ struct perf_event **events;
struct sched_state state;
struct sched_state saved[SCHED_STATES_MAX];
};
@@ -846,7 +847,8 @@ static int perf_sched_calc_event(struct event_constraint **constraints,
/*
* Initialize iterator that runs through all events and counters.
*/
-static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
+static void perf_sched_init(struct perf_sched *sched,
+ struct perf_event **events, struct event_constraint **constraints,
int num, int wmin, int wmax, int gpmax, u64 mevt, u64 mcnt)
{
memset(sched, 0, sizeof(*sched));
@@ -854,12 +856,13 @@ static void perf_sched_init(struct perf_sched *sched, struct event_constraint **
sched->max_weight = wmax;
sched->max_gp = gpmax;
sched->constraints = constraints;
+ sched->events = events;
sched->msk_events = mevt;
sched->msk_counters = mcnt;

sched->state.weight = perf_sched_calc_weight(constraints, num, wmin, wmax, mcnt);
sched->state.event = perf_sched_calc_event(constraints, num, sched->state.weight, mevt);
- sched->state.unassigned = num - hweight_long(sched->state.event);
+ sched->state.unassigned = num - hweight_long(mevt);
}

static void perf_sched_save_state(struct perf_sched *sched)
@@ -896,6 +899,7 @@ static bool perf_sched_restore_state(struct perf_sched *sched)
static bool __perf_sched_find_counter(struct perf_sched *sched)
{
struct event_constraint *c;
+ struct perf_event *e;
int idx;

if (!sched->state.unassigned)
@@ -905,16 +909,17 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
return false;

c = sched->constraints[sched->state.event];
+ e = sched->events[sched->state.event];
/* Prefer fixed purpose counters */
if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
idx = INTEL_PMC_IDX_FIXED;
for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
u64 mask = BIT_ULL(idx);

- if (sched->msk_counters & mask)
+ if ((sched->msk_counters & mask) && is_sampling_event(e))
continue;

- if (sched->state.used & mask)
+ if ((sched->state.used & mask) && is_sampling_event(e))
continue;

sched->state.used |= mask;
@@ -1016,14 +1021,15 @@ static void perf_sched_obtain_used_registers(int *assign, int n, u64 *events, u6
}
}

-static int __perf_assign_events(struct event_constraint **constraints, int n,
+static int __perf_assign_events(struct perf_event **events,
+ struct event_constraint **constraints, int n,
int wmin, int wmax, int gpmax, int *assign)
{
- u64 msk_events, msk_counters;
+ u64 mevt, mcnt;
struct perf_sched sched;

- perf_sched_obtain_used_registers(assign, n, &msk_events, &msk_counters);
- perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax, msk_events, msk_counters);
+ perf_sched_obtain_used_registers(assign, n, &mevt, &mcnt);
+ perf_sched_init(&sched, events, constraints, n, wmin, wmax, gpmax, mevt, mcnt);

do {
if (!perf_sched_find_counter(&sched))
@@ -1035,6 +1041,13 @@ static int __perf_assign_events(struct event_constraint **constraints, int n,
return sched.state.unassigned;
}

+static bool is_pmc_reuseable(struct perf_event *e,
+ struct event_constraint *c)
+{
+ return c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED) &&
+ !is_sampling_event(e);
+}
+
/*
* Assign a counter for each event.
*/
@@ -1043,12 +1056,13 @@ int perf_assign_events(struct perf_event **event_list,
int wmin, int wmax, int gpmax, int *assign)
{
struct event_constraint *c;
+ struct perf_event *e;
struct hw_perf_event *hwc;
u64 used_mask = 0;
int unsched = 0;
int i;

- memset(assign, -1, n);
+ memset(assign, -1, n * sizeof(int));

/*
* fastpath, try to reuse previous register
@@ -1058,6 +1072,7 @@ int perf_assign_events(struct perf_event **event_list,

hwc = &event_list[i]->hw;
c = constraints[i];
+ e = event_list[i];

/* never assigned */
if (hwc->idx == -1)
@@ -1072,8 +1087,10 @@ int perf_assign_events(struct perf_event **event_list,
mask |= mask << 1;

/* not already used */
- if (used_mask & mask)
- break;
+ if (used_mask & mask) {
+ if (!is_pmc_reuseable(e, c))
+ break;
+ }

used_mask |= mask;

@@ -1083,12 +1100,12 @@ int perf_assign_events(struct perf_event **event_list,

/* slow path */
if (i != n) {
- unsched = __perf_assign_events(constraints, n,
+ unsched = __perf_assign_events(event_list, constraints, n,
wmin, wmax, gpmax, assign);

if (unsched) {
- memset(assign, -1, n);
- unsched = __perf_assign_events(constraints, n,
+ memset(assign, -1, n * sizeof(int));
+ unsched = __perf_assign_events(event_list, constraints, n,
wmin, wmax, gpmax, assign);
}
}
--
1.8.3.1