Sending some urgent fixes for the MBM(memory b/w monitoring) which is
upstreamed from 4.6-rc1. Patches apply on 4.6-rc1.
CQM and MBM counters reported some incorrect counts for different
scenarios like interval mode or for multiple perf instances.
An updated V2 as per Peter feedback: fixing a few indenting issues and
adding some better changelogs/comments, Removed the patch to send error
for some broken features - since these were broken anyways since 4.1.
[PATCH 1/3] perf/x86/cqm,mbm: Store cqm,mbm count for all events when
[PATCH 2/3] perf/x86/mbm: Store bytes counted for mbm during recycle
[PATCH 3/3] perf/x86/mbm: Fix mbm counting when RMIDs are reused
For MBM, since we report total bytes for the duration the perf counts,
we need to keep the total bytes counted every time we loose an RMID.
Introduce rc_count(recycle count) per event
keep this history count(all bytes counted before the current RMID).
If we do not keep this count separately then we may end up sending a
count that may be less than the previous count during -I perf stat
option which leads to negative numbers being reported in the perf. This
happens say when we counted a greater amount with RMID1 and then
counted lesser with RMID2, and if user checks counts in interval mode
after RMID1 and then again after RMID2.
Signed-off-by: Vikas Shivappa <[email protected]>
---
arch/x86/events/intel/cqm.c | 49 ++++++++++++++++++++++++++++++++++++++++-----
include/linux/perf_event.h | 1 +
2 files changed, 45 insertions(+), 5 deletions(-)
diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
index 5f2104a..320af26 100644
--- a/arch/x86/events/intel/cqm.c
+++ b/arch/x86/events/intel/cqm.c
@@ -479,6 +479,16 @@ static void cqm_mask_call(struct rmid_read *rr)
on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, rr, 1);
}
+static inline void
+mbm_set_rccount(struct perf_event *event, struct rmid_read *rr)
+{
+ u64 tmpval;
+
+ tmpval = local64_read(&event->hw.rc_count) + atomic64_read(&rr->value);
+ local64_set(&event->hw.rc_count, tmpval);
+ local64_set(&event->count, tmpval);
+}
+
/*
* Exchange the RMID of a group of events.
*/
@@ -493,12 +503,19 @@ static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
/*
* If our RMID is being deallocated, perform a read now.
+ * For mbm, we need to store the bytes that were counted till now
+ * separately.
*/
if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
rr = __init_rr(old_rmid, group->attr.config, 0);
cqm_mask_call(&rr);
- local64_set(&group->count, atomic64_read(&rr.value));
+
+ if (is_mbm_event(group->attr.config))
+ mbm_set_rccount(group, &rr);
+ else
+ local64_set(&group->count, atomic64_read(&rr.value));
+
list_for_each_entry(event, head, hw.cqm_group_entry) {
if (event->hw.is_group_event) {
@@ -506,8 +523,11 @@ static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
rr = __init_rr(old_rmid, evttype, 0);
cqm_mask_call(&rr);
- local64_set(&event->count,
- atomic64_read(&rr.value));
+ if (is_mbm_event(event->attr.config))
+ mbm_set_rccount(event, &rr);
+ else
+ local64_set(&event->count,
+ atomic64_read(&rr.value));
}
}
}
@@ -1194,6 +1214,7 @@ static u64 intel_cqm_event_count(struct perf_event *event)
{
unsigned long flags;
struct rmid_read rr = __init_rr(-1, event->attr.config, 0);
+ u64 tmpval;
/*
* We only need to worry about task events. System-wide events
@@ -1235,6 +1256,16 @@ static u64 intel_cqm_event_count(struct perf_event *event)
* busying performing the IPI calls. It's therefore necessary to
* check @event's RMID afterwards, and if it has changed,
* discard the result of the read.
+ *
+ * For MBM events, we are reading the total bytes and not
+ * a snapshot. Hence if RMIDs were recycled for the event, we need
+ * to add the counts of all RMIDs associated with the event together.
+ * Suppose RMID(1).. RMID(k) represent the total_bytes of the
+ * different RMIDs the event was associated with,
+ * count = RMID(1) + RMID(2) +...+ RMID(k-1)+ RMID(k).
+ * = rc_count + RMID(k).
+ * RMID(k) - is the count we read now via IPI
+ * rc_count = RMID(1) + RMID(2) +...+ RMID(k-1).
*/
rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);
@@ -1244,8 +1275,16 @@ static u64 intel_cqm_event_count(struct perf_event *event)
cqm_mask_call(&rr);
raw_spin_lock_irqsave(&cache_lock, flags);
- if (event->hw.cqm_rmid == rr.rmid)
- local64_set(&event->count, atomic64_read(&rr.value));
+ if (event->hw.cqm_rmid == rr.rmid) {
+ if (is_mbm_event(event->attr.config)) {
+ tmpval = atomic64_read(&rr.value) +
+ local64_read(&event->hw.rc_count);
+
+ local64_set(&event->count, tmpval);
+ } else {
+ local64_set(&event->count, atomic64_read(&rr.value));
+ }
+ }
raw_spin_unlock_irqrestore(&cache_lock, flags);
out:
return __perf_event_count(event);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f291275..ec7772a 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -122,6 +122,7 @@ struct hw_perf_event {
int cqm_state;
u32 cqm_rmid;
int is_group_event;
+ local64_t rc_count;
struct list_head cqm_events_entry;
struct list_head cqm_groups_entry;
struct list_head cqm_group_entry;
--
1.9.1
When multiple instances of perf reuse RMID for the same PID, then we
need to start counting from zero for each new event, rather than
reporting the current RMID count. This patch adds a st_count(start
count) 'per event' to track the same.
Note that this is different from the 'per rmid' start count. For the
first time an RMID is used we store the start_count for the 'per RMID'.
The 'per rmid' start_count is in the RMID data structure where as the
start count 'per event' is in the perf_event.
u64 read_sample(rmid) // for each rmid
{
start: // first call for the rmid
'per rmid' prev = read_hw_counter();
count:
cur_count = read_hw_counter();
delta = cur_count - prev;
prev = cur_count;
total_bytes += delta;
return total_bytes;
}
for each event -
start:
if rmid is reused
'per event' prev = read_sample(rmid);
else
prev = 0;
count:
tmp = read_sample(rmid);
count = tmp - prev;
For ex:
1.event1 gets RMID1. We read the hw_counter and set that as the RMID1's
start_count.
2.RMID1's total_bytes(this is current hw_count - start_count of the
RMID) is 100MB for event1(PID1)
3.another perf instance starts measuring the same PID1 with event2. We
reuse RMID1 as the PID1 is already counted.
4.event2 stores st_count as 100MB.
5.After some time, when user wants to count event2 and say RMID1's
current total_bytes 110MB, we report 110MB - 100MB = 10MB
Signed-off-by: Vikas Shivappa <[email protected]>
---
arch/x86/events/intel/cqm.c | 71 ++++++++++++++++++++++++++++++++++++++++++---
include/linux/perf_event.h | 1 +
2 files changed, 68 insertions(+), 4 deletions(-)
diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
index 320af26..8129959 100644
--- a/arch/x86/events/intel/cqm.c
+++ b/arch/x86/events/intel/cqm.c
@@ -484,8 +484,18 @@ mbm_set_rccount(struct perf_event *event, struct rmid_read *rr)
{
u64 tmpval;
- tmpval = local64_read(&event->hw.rc_count) + atomic64_read(&rr->value);
+ tmpval = local64_read(&event->hw.rc_count) + atomic64_read(&rr->value) -
+ local64_read(&event->hw.st_count);
+
local64_set(&event->hw.rc_count, tmpval);
+
+ /*
+ * The st_count(start count) is meant to store the starting bytes
+ * for an event which is reusing an RMID which already
+ * had bytes measured.Once we start using the rc_count
+ * to keep the history bytes, reset the start bytes.
+ */
+ local64_set(&event->hw.st_count, 0UL);
local64_set(&event->count, tmpval);
}
@@ -1025,6 +1035,58 @@ static void init_mbm_sample(u32 rmid, u32 evt_type)
on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1);
}
+static inline bool first_event_ingroup(struct perf_event *group,
+ struct perf_event *event)
+{
+ struct list_head *head = &group->hw.cqm_group_entry;
+ u32 evt_type = event->attr.config;
+
+ if (evt_type == group->attr.config)
+ return false;
+ list_for_each_entry(event, head, hw.cqm_group_entry) {
+ if (evt_type == event->attr.config)
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * mbm_setup_event - Does mbm specific count initialization
+ * when multiple events share RMID.
+ *
+ * If this is the first mbm event using the RMID, then initialize
+ * the total_bytes in the RMID and prev_count.
+ * else only initialize the start count of the event which is the current
+ * count of the RMID.
+ * In other words if the RMID has say counted 100MB till now because
+ * other event was already using it, we start
+ * from zero for our new event. Because after 1s if user checks the count,
+ * we need to report for the 1s duration and not the entire duration the
+ * RMID was being counted.
+*/
+static inline void mbm_setup_event(u32 rmid, struct perf_event *group,
+ struct perf_event *event)
+{
+ u32 evt_type = event->attr.config;
+ struct rmid_read rr;
+
+ if (first_event_ingroup(group, event)) {
+ init_mbm_sample(rmid, evt_type);
+ } else {
+ rr = __init_rr(rmid, evt_type, 0);
+ cqm_mask_call(&rr);
+ local64_set(&event->hw.st_count, atomic64_read(&rr.value));
+ }
+}
+
+static inline void mbm_setup_event_init(struct perf_event *event)
+{
+ event->hw.is_group_event = false;
+ local64_set(&event->hw.rc_count, 0UL);
+ local64_set(&event->hw.st_count, 0UL);
+}
+
/*
* Find a group and setup RMID.
*
@@ -1037,7 +1099,7 @@ static void intel_cqm_setup_event(struct perf_event *event,
bool conflict = false;
u32 rmid;
- event->hw.is_group_event = false;
+ mbm_setup_event_init(event);
list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
rmid = iter->hw.cqm_rmid;
@@ -1046,7 +1108,7 @@ static void intel_cqm_setup_event(struct perf_event *event,
event->hw.cqm_rmid = rmid;
*group = iter;
if (is_mbm_event(event->attr.config) && __rmid_valid(rmid))
- init_mbm_sample(rmid, event->attr.config);
+ mbm_setup_event(rmid, iter, event);
return;
}
@@ -1278,7 +1340,8 @@ static u64 intel_cqm_event_count(struct perf_event *event)
if (event->hw.cqm_rmid == rr.rmid) {
if (is_mbm_event(event->attr.config)) {
tmpval = atomic64_read(&rr.value) +
- local64_read(&event->hw.rc_count);
+ local64_read(&event->hw.rc_count) -
+ local64_read(&event->hw.st_count);
local64_set(&event->count, tmpval);
} else {
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index ec7772a..44a7f0c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -123,6 +123,7 @@ struct hw_perf_event {
u32 cqm_rmid;
int is_group_event;
local64_t rc_count;
+ local64_t st_count;
struct list_head cqm_events_entry;
struct list_head cqm_groups_entry;
struct list_head cqm_group_entry;
--
1.9.1
During RMID recycling, when an event loses the RMID we saved the counter
for group leader but it was not being saved for all the events in an
event group. This would lead to a situation where if 2 perf instances
are counting the same PID one of them would not see the updated count
which other perf instance is seeing. This patch tries to fix the issue
by saving the count for all the events in the same event group.
Signed-off-by: Vikas Shivappa <[email protected]>
---
arch/x86/events/intel/cqm.c | 39 ++++++++++++++++++++++++---------------
1 file changed, 24 insertions(+), 15 deletions(-)
diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
index 7b5fd81..5f2104a 100644
--- a/arch/x86/events/intel/cqm.c
+++ b/arch/x86/events/intel/cqm.c
@@ -14,6 +14,14 @@
#define MSR_IA32_QM_EVTSEL 0x0c8d
#define MBM_CNTR_WIDTH 24
+
+#define __init_rr(old_rmid, config, val) \
+((struct rmid_read) { \
+ .rmid = old_rmid, \
+ .evt_type = config, \
+ .value = ATOMIC64_INIT(val), \
+})
+
/*
* Guaranteed time in ms as per SDM where MBM counters will not overflow.
*/
@@ -478,7 +486,8 @@ static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
{
struct perf_event *event;
struct list_head *head = &group->hw.cqm_group_entry;
- u32 old_rmid = group->hw.cqm_rmid;
+ u32 old_rmid = group->hw.cqm_rmid, evttype;
+ struct rmid_read rr;
lockdep_assert_held(&cache_mutex);
@@ -486,14 +495,21 @@ static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
* If our RMID is being deallocated, perform a read now.
*/
if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
- struct rmid_read rr = {
- .rmid = old_rmid,
- .evt_type = group->attr.config,
- .value = ATOMIC64_INIT(0),
- };
+ rr = __init_rr(old_rmid, group->attr.config, 0);
cqm_mask_call(&rr);
local64_set(&group->count, atomic64_read(&rr.value));
+ list_for_each_entry(event, head, hw.cqm_group_entry) {
+ if (event->hw.is_group_event) {
+
+ evttype = event->attr.config;
+ rr = __init_rr(old_rmid, evttype, 0);
+
+ cqm_mask_call(&rr);
+ local64_set(&event->count,
+ atomic64_read(&rr.value));
+ }
+ }
}
raw_spin_lock_irq(&cache_lock);
@@ -983,11 +999,7 @@ static void __intel_mbm_event_init(void *info)
static void init_mbm_sample(u32 rmid, u32 evt_type)
{
- struct rmid_read rr = {
- .rmid = rmid,
- .evt_type = evt_type,
- .value = ATOMIC64_INIT(0),
- };
+ struct rmid_read rr = __init_rr(rmid, evt_type, 0);
/* on each socket, init sample */
on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1);
@@ -1181,10 +1193,7 @@ static void mbm_hrtimer_init(void)
static u64 intel_cqm_event_count(struct perf_event *event)
{
unsigned long flags;
- struct rmid_read rr = {
- .evt_type = event->attr.config,
- .value = ATOMIC64_INIT(0),
- };
+ struct rmid_read rr = __init_rr(-1, event->attr.config, 0);
/*
* We only need to worry about task events. System-wide events
--
1.9.1