2022-03-03 08:41:47

by Wen Yang

Subject: [PATCH 1/2] perf/x86: extract code to assign perf events for both core and uncore

The following two patterns are used in multiple places in the x86 perf code,
where similar code is duplicated:
- fast path: try to reuse the previous register
- slow path: assign a counter to each event

To reduce this duplication, and to prepare for the following patch that uses
the same patterns, extract the code into perf_assign_events().
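
In both call sites the duplicated logic looks roughly as follows (a simplified
sketch using the names from the extracted helper in the diff below; the core
version additionally widens the mask for counter pairs):

	u64 used_mask = 0;
	int unsched = 0;
	int i;

	/* fast path: try to keep every event on its previous counter */
	for (i = 0; i < n; i++) {
		struct hw_perf_event *hwc = &event_list[i]->hw;

		if (hwc->idx == -1)		/* never assigned */
			break;
		if (!test_bit(hwc->idx, constraints[i]->idxmsk))
			break;			/* constraint no longer honored */
		if (used_mask & BIT_ULL(hwc->idx))
			break;			/* counter already taken */

		used_mask |= BIT_ULL(hwc->idx);
		if (assign)
			assign[i] = hwc->idx;
	}

	/* slow path: run the full scheduler only when the fast path fails */
	if (i != n)
		unsched = _perf_assign_events(constraints, n, wmin, wmax,
					      gpmax, assign);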

No functional change intended.

Signed-off-by: Wen Yang <[email protected]>
Cc: Peter Zijlstra (Intel) <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Mark Rutland <[email protected]>
Cc: Alexander Shishkin <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: [email protected]
Cc: Wen Yang <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: [email protected]
Cc: [email protected]
---
arch/x86/events/core.c | 141 ++++++++++++++++++---------------
arch/x86/events/intel/uncore.c | 31 +-------
arch/x86/events/perf_event.h | 6 +-
3 files changed, 82 insertions(+), 96 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index eef816fc216d..9846d422f06d 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -950,10 +950,7 @@ static bool perf_sched_next_event(struct perf_sched *sched)
return true;
}

-/*
- * Assign a counter for each event.
- */
-int perf_assign_events(struct event_constraint **constraints, int n,
+int _perf_assign_events(struct event_constraint **constraints, int n,
int wmin, int wmax, int gpmax, int *assign)
{
struct perf_sched sched;
@@ -969,16 +966,66 @@ int perf_assign_events(struct event_constraint **constraints, int n,

return sched.state.unassigned;
}
+
+/*
+ * Assign a counter for each event.
+ */
+int perf_assign_events(struct perf_event **event_list,
+ struct event_constraint **constraints, int n,
+ int wmin, int wmax, int gpmax, int *assign)
+{
+ struct event_constraint *c;
+ struct hw_perf_event *hwc;
+ u64 used_mask = 0;
+ int unsched = 0;
+ int i;
+
+ /*
+ * fastpath, try to reuse previous register
+ */
+ for (i = 0; i < n; i++) {
+ u64 mask;
+
+ hwc = &event_list[i]->hw;
+ c = constraints[i];
+
+ /* never assigned */
+ if (hwc->idx == -1)
+ break;
+
+ /* constraint still honored */
+ if (!test_bit(hwc->idx, c->idxmsk))
+ break;
+
+ mask = BIT_ULL(hwc->idx);
+ if (is_counter_pair(hwc))
+ mask |= mask << 1;
+
+ /* not already used */
+ if (used_mask & mask)
+ break;
+
+ used_mask |= mask;
+
+ if (assign)
+ assign[i] = hwc->idx;
+ }
+
+ /* slow path */
+ if (i != n)
+ unsched = _perf_assign_events(constraints, n,
+ wmin, wmax, gpmax, assign);
+
+ return unsched;
+}
EXPORT_SYMBOL_GPL(perf_assign_events);

int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
int num_counters = hybrid(cpuc->pmu, num_counters);
- struct event_constraint *c;
- struct perf_event *e;
int n0, i, wmin, wmax, unsched = 0;
- struct hw_perf_event *hwc;
- u64 used_mask = 0;
+ struct event_constraint *c;
+ int gpmax = num_counters;

/*
* Compute the number of events already present; see x86_pmu_add(),
@@ -1017,66 +1064,30 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
}

/*
- * fastpath, try to reuse previous register
+ * Do not allow scheduling of more than half the available
+ * generic counters.
+ *
+ * This helps avoid counter starvation of sibling thread by
+ * ensuring at most half the counters cannot be in exclusive
+ * mode. There is no designated counters for the limits. Any
+ * N/2 counters can be used. This helps with events with
+ * specific counter constraints.
*/
- for (i = 0; i < n; i++) {
- u64 mask;
-
- hwc = &cpuc->event_list[i]->hw;
- c = cpuc->event_constraint[i];
-
- /* never assigned */
- if (hwc->idx == -1)
- break;
-
- /* constraint still honored */
- if (!test_bit(hwc->idx, c->idxmsk))
- break;
-
- mask = BIT_ULL(hwc->idx);
- if (is_counter_pair(hwc))
- mask |= mask << 1;
-
- /* not already used */
- if (used_mask & mask)
- break;
+ if (is_ht_workaround_enabled() && !cpuc->is_fake &&
+ READ_ONCE(cpuc->excl_cntrs->exclusive_present))
+ gpmax /= 2;

- used_mask |= mask;
-
- if (assign)
- assign[i] = hwc->idx;
+ /*
+ * Reduce the amount of available counters to allow fitting
+ * the extra Merge events needed by large increment events.
+ */
+ if (x86_pmu.flags & PMU_FL_PAIR) {
+ gpmax = num_counters - cpuc->n_pair;
+ WARN_ON(gpmax <= 0);
}

- /* slow path */
- if (i != n) {
- int gpmax = num_counters;
-
- /*
- * Do not allow scheduling of more than half the available
- * generic counters.
- *
- * This helps avoid counter starvation of sibling thread by
- * ensuring at most half the counters cannot be in exclusive
- * mode. There is no designated counters for the limits. Any
- * N/2 counters can be used. This helps with events with
- * specific counter constraints.
- */
- if (is_ht_workaround_enabled() && !cpuc->is_fake &&
- READ_ONCE(cpuc->excl_cntrs->exclusive_present))
- gpmax /= 2;
-
- /*
- * Reduce the amount of available counters to allow fitting
- * the extra Merge events needed by large increment events.
- */
- if (x86_pmu.flags & PMU_FL_PAIR) {
- gpmax = num_counters - cpuc->n_pair;
- WARN_ON(gpmax <= 0);
- }
-
- unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
- wmax, gpmax, assign);
- }
+ unsched = perf_assign_events(cpuc->event_list, cpuc->event_constraint,
+ n, wmin, wmax, gpmax, assign);

/*
* In case of success (unsched = 0), mark events as committed,
@@ -1093,7 +1104,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
static_call_cond(x86_pmu_commit_scheduling)(cpuc, i, assign[i]);
} else {
for (i = n0; i < n; i++) {
- e = cpuc->event_list[i];
+ struct perf_event *e = cpuc->event_list[i];

/*
* release events that failed scheduling
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index e497da9bf427..101358ae2814 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -442,12 +442,8 @@ static void uncore_put_event_constraint(struct intel_uncore_box *box,

static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n)
{
- unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
struct event_constraint *c;
int i, wmin, wmax, ret = 0;
- struct hw_perf_event *hwc;
-
- bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX);

for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) {
c = uncore_get_event_constraint(box, box->event_list[i]);
@@ -456,31 +452,8 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int
wmax = max(wmax, c->weight);
}

- /* fastpath, try to reuse previous register */
- for (i = 0; i < n; i++) {
- hwc = &box->event_list[i]->hw;
- c = box->event_constraint[i];
-
- /* never assigned */
- if (hwc->idx == -1)
- break;
-
- /* constraint still honored */
- if (!test_bit(hwc->idx, c->idxmsk))
- break;
-
- /* not already used */
- if (test_bit(hwc->idx, used_mask))
- break;
-
- __set_bit(hwc->idx, used_mask);
- if (assign)
- assign[i] = hwc->idx;
- }
- /* slow path */
- if (i != n)
- ret = perf_assign_events(box->event_constraint, n,
- wmin, wmax, n, assign);
+ ret = perf_assign_events(box->event_list,
+ box->event_constraint, n, wmin, wmax, n, assign);

if (!assign || ret) {
for (i = 0; i < n; i++)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 150261d929b9..f1acd1ded001 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1130,8 +1130,10 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,

void x86_pmu_enable_all(int added);

-int perf_assign_events(struct event_constraint **constraints, int n,
- int wmin, int wmax, int gpmax, int *assign);
+int perf_assign_events(struct perf_event **event_list,
+ struct event_constraint **constraints, int n,
+ int wmin, int wmax, int gpmax, int *assign);
+
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);

void x86_pmu_stop(struct perf_event *event, int flags);
--
2.19.1.6.gb485710b


2022-03-03 10:45:17

by Wen Yang

Subject: [PATCH 2/2] perf/x86: improve the event scheduling to avoid unnecessary pmu_stop/start

This issue has existed for a long time. It can be reproduced as follows:

1. Run a script that periodically collects perf data, e.g.:
while true
do
perf stat -e cache-misses,cache-misses,cache-misses -C 1 sleep 2
perf stat -e cache-misses -C 1 sleep 2
sleep 1
done

2. Run another one to capture the IPC, e.g.:
perf stat -e cycles:D,instructions:D -C 1 -I 1000

We can then observe that the counter used by cycles:D changes frequently:
crash> struct cpu_hw_events.n_events,assign,event_list,events 0xffff88bf7f44f420
n_events = 3
assign = {33, 1, 32, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
event_list = {0xffff88bf77b85000, 0xffff88b72db82000, 0xffff88b72db85800, 0xffff88ff6cfcb000, 0xffff88ff609f1800, 0xffff88ff609f1800, 0xffff88ff5f46a800, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
events = {0x0, 0xffff88b72db82000, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xffff88b72db85800, 0xffff88bf77b85000, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}

crash> struct cpu_hw_events.n_events,assign,event_list,events 0xffff88bf7f44f420
n_events = 6
assign = {33, 3, 32, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
event_list = {0xffff88bf77b85000, 0xffff88b72db82000, 0xffff88b72db85800, 0xffff88bf46c34000, 0xffff88bf46c35000, 0xffff88bf46c30000, 0xffff88ff5f46a800, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
events = {0xffff88bf46c34000, 0xffff88bf46c35000, 0xffff88bf46c30000, 0xffff88b72db82000, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xffff88b72db85800, 0xffff88bf77b85000, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}

The reason is that the NMI watchdog permanently consumes one fixed counter
(*cycles*). Therefore, when the above shell script requests *cycles* again,
only a general-purpose counter can actually be used, even though its
constraint weight is 5. Other events (like *cache-misses*) have a weight
of 4, and since the scheduler takes the events with the least weight first,
the counter used by *cycles* will often be taken away, as in the raw data
above [1]:

n_events = 3
assign = {33, 1, 32, ...}
-->
n_events = 6
assign = {33, 3, 32, 0, 1, 2, ...}

This causes unnecessary pmu_stop/start and also an abnormal CPI.

Cloud servers usually monitor the CPI data of some important services
continuously. This issue hurts performance and misleads the monitoring.

The current event scheduling algorithm is more than 10 years old:
https://lwn.net/Articles/357631/

We would like to optimize it a bit.
The msk_counters and msk_events fields are added to indicate the currently
used counters and events, so that the used ones can be skipped in
__perf_sched_find_counter() and perf_sched_next_event(), avoiding the
unnecessary pmu_stop/start.
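
The slow path in perf_assign_events() then becomes a two-stage fallback
(condensed from the diff below): first schedule only the not-yet-assigned
events around the counters that the fast path kept, and only if that fails
fall back to the original full reschedule:

	/* slow path */
	if (i != n) {
		/* try to schedule around the assignments kept by the fast path */
		unsched = _perf_assign_events(constraints, n, wmin, wmax, gpmax,
					      msk_counters, msk_event, assign);
		/* on failure, fall back to a full reschedule as before */
		if (unsched)
			unsched = _perf_assign_events(constraints, n, wmin, wmax,
						      gpmax, 0, 0, assign);
	}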

[1]:
32: INTEL_PMC_IDX_FIXED_INSTRUCTIONS
33: INTEL_PMC_IDX_FIXED_CPU_CYCLES
34: INTEL_PMC_IDX_FIXED_REF_CYCLES
0,1,2,3: GP (general-purpose) counters
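
To make the weights above concrete (assuming the standard Intel constraints
on this 4-GP-counter machine; the idxmsk contents are illustrative):

	cycles       : idxmsk = {0,1,2,3,33} -> weight 5 (4 GP + fixed 33)
	instructions : idxmsk = {0,1,2,3,32} -> weight 5 (4 GP + fixed 32)
	cache-misses : idxmsk = {0,1,2,3}    -> weight 4 (GP only)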

Signed-off-by: Wen Yang <[email protected]>
Cc: Peter Zijlstra (Intel) <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Mark Rutland <[email protected]>
Cc: Alexander Shishkin <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: [email protected]
Cc: Wen Yang <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: [email protected]
Cc: [email protected]
---
arch/x86/events/core.c | 93 +++++++++++++++++++++++++++++++++---------
1 file changed, 74 insertions(+), 19 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 9846d422f06d..88313d669756 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -796,33 +796,67 @@ struct perf_sched {
int max_events;
int max_gp;
int saved_states;
+ u64 msk_counters;
+ u64 msk_events;
struct event_constraint **constraints;
struct sched_state state;
struct sched_state saved[SCHED_STATES_MAX];
};

+static int perf_sched_calc_weight(struct event_constraint **constraints,
+ int num, int wmin, u64 msk_events)
+{
+ int tmp = wmin;
+ int idx;
+
+ if (!msk_events)
+ goto out;
+
+ for (idx = 0; idx < num; idx++) {
+ if (msk_events & BIT_ULL(idx))
+ continue;
+
+ tmp = min(tmp, constraints[idx]->weight);
+ }
+
+out:
+ return tmp;
+}
+
+static int perf_sched_calc_event(struct event_constraint **constraints,
+ int num, int weight, u64 msk_events)
+{
+ int idx;
+
+ for (idx = 0; idx < num; idx++) {
+ if (msk_events & BIT_ULL(idx))
+ continue;
+
+ if (constraints[idx]->weight == weight)
+ break;
+ }
+
+ /* start with min weight */
+ return idx;
+}
+
/*
* Initialize iterator that runs through all events and counters.
*/
static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
- int num, int wmin, int wmax, int gpmax)
+ int num, int wmin, int wmax, int gpmax, u64 cntm, u64 evtm)
{
- int idx;
-
memset(sched, 0, sizeof(*sched));
sched->max_events = num;
sched->max_weight = wmax;
sched->max_gp = gpmax;
sched->constraints = constraints;
+ sched->msk_counters = cntm;
+ sched->msk_events = evtm;

- for (idx = 0; idx < num; idx++) {
- if (constraints[idx]->weight == wmin)
- break;
- }
-
- sched->state.event = idx; /* start with min weight */
- sched->state.weight = wmin;
- sched->state.unassigned = num;
+ sched->state.weight = perf_sched_calc_weight(constraints, num, wmin, cntm);
+ sched->state.event = perf_sched_calc_event(constraints, num, sched->state.weight, evtm);
+ sched->state.unassigned = num - hweight_long(evtm);
}

static void perf_sched_save_state(struct perf_sched *sched)
@@ -874,6 +908,9 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
u64 mask = BIT_ULL(idx);

+ if (sched->msk_counters & mask)
+ continue;
+
if (sched->state.used & mask)
continue;

@@ -890,6 +927,9 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
if (c->flags & PERF_X86_EVENT_PAIR)
mask |= mask << 1;

+ if (sched->msk_counters & mask)
+ continue;
+
if (sched->state.used & mask)
continue;

@@ -921,6 +961,12 @@ static bool perf_sched_find_counter(struct perf_sched *sched)
return true;
}

+static void ignore_used_index(u64 mask, int *index)
+{
+ while (mask & BIT_ULL(*index))
+ ++*index;
+}
+
/*
* Go through all unassigned events and find the next one to schedule.
* Take events with the least weight first. Return true on success.
@@ -935,9 +981,12 @@ static bool perf_sched_next_event(struct perf_sched *sched)
do {
/* next event */
sched->state.event++;
+ ignore_used_index(sched->msk_events, &sched->state.event);
if (sched->state.event >= sched->max_events) {
/* next weight */
sched->state.event = 0;
+ ignore_used_index(sched->msk_events, &sched->state.event);
+
sched->state.weight++;
if (sched->state.weight > sched->max_weight)
return false;
@@ -951,11 +1000,11 @@ static bool perf_sched_next_event(struct perf_sched *sched)
}

int _perf_assign_events(struct event_constraint **constraints, int n,
- int wmin, int wmax, int gpmax, int *assign)
+ int wmin, int wmax, int gpmax, u64 cntm, u64 evtm, int *assign)
{
struct perf_sched sched;

- perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
+ perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax, cntm, evtm);

do {
if (!perf_sched_find_counter(&sched))
@@ -976,7 +1025,8 @@ int perf_assign_events(struct perf_event **event_list,
{
struct event_constraint *c;
struct hw_perf_event *hwc;
- u64 used_mask = 0;
+ u64 msk_counters = 0;
+ u64 msk_event = 0;
int unsched = 0;
int i;

@@ -1002,19 +1052,24 @@ int perf_assign_events(struct perf_event **event_list,
mask |= mask << 1;

/* not already used */
- if (used_mask & mask)
+ if (msk_counters & mask)
break;

- used_mask |= mask;
+ msk_counters |= mask;
+ msk_event |= BIT_ULL(i);

if (assign)
assign[i] = hwc->idx;
}

/* slow path */
- if (i != n)
- unsched = _perf_assign_events(constraints, n,
- wmin, wmax, gpmax, assign);
+ if (i != n) {
+ unsched = _perf_assign_events(constraints, n, wmin, wmax, gpmax,
+ msk_counters, msk_event, assign);
+ if (unsched)
+ unsched = _perf_assign_events(constraints, n, wmin, wmax, gpmax,
+ 0, 0, assign);
+ }

return unsched;
}
--
2.19.1.6.gb485710b