2015-07-22 00:11:12

by Kanaka Juvva

Subject: [PATCH v2] perf,x86: add Intel Memory Bandwidth Monitoring (MBM) PMU

This patch adds Memory Bandwidth Monitoring events to the existing
intel_cqm PMU in the Linux kernel.

Intel MBM builds on Cache Monitoring Technology (CMT) infrastructure
to allow monitoring of bandwidth from one level of the cache hierarchy
to the next - in this case focusing on the L3 cache, which is typically
backed directly by system memory. As a result of this implementation,
memory bandwidth can be monitored.

MBM counters are available in Intel Xeon CPUs. The following events are
implemented in the kernel and expose two MBM stats via the perf_event
interface:

- llc_local_bw: bandwidth consumption of all cores on the socket
- llc_total_bw: bandwidth consumption of all cores on the socket, plus
  bandwidth for QPI accesses

At present, MBM events are checked at a one-second interval provided
by the event's hrtimer. Counters can overflow at most once per second
and thus must be read at least once per second; overflow is detected
and handled. A running average is calculated for each bandwidth type
whenever a new value is read from the MSR. A sliding window stores the
values read from the MSR; the window size is configurable.

Signed-off-by: Kanaka Juvva <[email protected]>
---
arch/x86/include/asm/cpufeature.h | 2 +
arch/x86/kernel/cpu/common.c | 4 +-
arch/x86/kernel/cpu/perf_event_intel_cqm.c | 624 +++++++++++++++++++++++++++--
3 files changed, 588 insertions(+), 42 deletions(-)

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 3d6606f..c36abdf 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -251,6 +251,8 @@

/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
+#define X86_FEATURE_CQM_MBM_TOTAL (12*32+1) /* LLC Total MBM monitoring */
+#define X86_FEATURE_CQM_MBM_LOCAL (12*32+2) /* LLC Local MBM monitoring */

/*
* BUG word(s)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 922c5e0..7dcc88c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -643,7 +643,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
/* QoS sub-leaf, EAX=0Fh, ECX=1 */
cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx);
c->x86_capability[12] = edx;
- if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) {
+ if ((cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC))
+ || ((cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL))
+ || (cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)))) {
c->x86_cache_max_rmid = ecx;
c->x86_cache_occ_scale = ebx;
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index 1880761..dc1ce58 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -12,10 +12,20 @@
#define MSR_IA32_PQR_ASSOC 0x0c8f
#define MSR_IA32_QM_CTR 0x0c8e
#define MSR_IA32_QM_EVTSEL 0x0c8d
+#define MAX_MBM_CNTR 0xffffff
+#define MBM_SOCKET_MAX 8
+#define MBM_TIME_DELTA_MAX 1000
+#define MBM_TIME_DELTA_MIN 1000
+#define MBM_SCALING_FACTOR 1000
+#define MBM_SCALING_HALF (MBM_SCALING_FACTOR/2)
+#define MBM_FIFO_SIZE_MIN 10
+#define MBM_FIFO_SIZE_MAX 300

-static u32 cqm_max_rmid = -1;
-static unsigned int cqm_l3_scale; /* supposedly cacheline size */

+static u32 cqm_max_rmid = -1;
+static unsigned int cqm_l3_scale;/* supposedly cacheline size */
+static unsigned mbm_window_size = MBM_FIFO_SIZE_MIN;
+static bool cqm_llc_occ, cqm_mbm;
/**
* struct intel_pqr_state - State cache for the PQR MSR
* @rmid: The cached Resource Monitoring ID
@@ -42,6 +52,34 @@ struct intel_pqr_state {
* interrupts disabled, which is sufficient for the protection.
*/
static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
+static DEFINE_PER_CPU(struct mbm_pmu *, mbm_pmu);
+static DEFINE_PER_CPU(struct mbm_pmu *, mbm_pmu_to_free);
+
+/*
+ * mbm pmu is used for storing mbm_local and mbm_total
+ * events
+ */
+struct mbm_pmu {
+ spinlock_t lock;
+ int n_active; /* number of active events */
+ struct list_head active_list;
+ struct pmu *pmu; /* pointer to mbm perf_event */
+ ktime_t timer_interval; /* in ktime_t unit */
+ struct hrtimer hrtimer;
+};
+
+struct sample {
+ u64 bytes; /* mbm counter value read*/
+ u64 cum_avg; /* current running average of bandwidth */
+ ktime_t prev_time;
+ u64 index; /* current sample no */
+ u32 mbmfifo[MBM_FIFO_SIZE_MAX]; /* window of last n bw values */
+ u32 fifoin; /* mbmfifo in counter for sliding window */
+ u32 fifoout; /* mbmfifo out counter for sliding window */
+};
+
+struct sample *mbm_total; /* curent stats for mbm_total events */
+struct sample *mbm_local; /* current stats for mbm_local events */

/*
* Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
@@ -66,6 +104,9 @@ static cpumask_t cqm_cpumask;
#define RMID_VAL_UNAVAIL (1ULL << 62)

#define QOS_L3_OCCUP_EVENT_ID (1 << 0)
+#define QOS_MBM_TOTAL_EVENT_ID (1 << 1)
+#define QOS_MBM_LOCAL_EVENT_ID1 0x3
+#define QOS_MBM_LOCAL_EVENT_ID (1 << 2)

#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID

@@ -90,7 +131,8 @@ static u32 intel_cqm_rotation_rmid;
*/
static inline bool __rmid_valid(u32 rmid)
{
- if (!rmid || rmid == INVALID_RMID)
+ WARN_ON_ONCE(rmid > cqm_max_rmid);
+ if (!rmid || (rmid == INVALID_RMID) || (rmid > cqm_max_rmid))
return false;

return true;
@@ -125,6 +167,7 @@ struct cqm_rmid_entry {
enum rmid_recycle_state state;
struct list_head list;
unsigned long queue_time;
+ bool config;
};

/*
@@ -176,6 +219,17 @@ static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid)
return entry;
}

+static void mbm_reset_stats(u32 rmid)
+{
+ u32 vrmid = topology_physical_package_id(smp_processor_id()) *
+ cqm_max_rmid + rmid;
+
+ if ((!cqm_mbm) || (!mbm_local) || (!mbm_total))
+ return;
+ memset(&mbm_local[vrmid], 0, sizeof(struct sample));
+ memset(&mbm_total[vrmid], 0, sizeof(struct sample));
+}
+
/*
* Returns < 0 on fail.
*
@@ -190,8 +244,10 @@ static u32 __get_rmid(void)
if (list_empty(&cqm_rmid_free_lru))
return INVALID_RMID;

- entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
+ entry = list_first_entry(&cqm_rmid_free_lru,
+ struct cqm_rmid_entry, list);
list_del(&entry->list);
+ mbm_reset_stats(entry->rmid);

return entry->rmid;
}
@@ -207,6 +263,7 @@ static void __put_rmid(u32 rmid)

entry->queue_time = jiffies;
entry->state = RMID_YOUNG;
+ mbm_reset_stats(rmid);

list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
}
@@ -232,6 +289,8 @@ static int intel_cqm_setup_rmid_cache(void)

INIT_LIST_HEAD(&entry->list);
entry->rmid = r;
+ entry->config = false;
+
cqm_rmid_ptrs[r] = entry;

list_add_tail(&entry->list, &cqm_rmid_free_lru);
@@ -254,6 +313,8 @@ fail:
kfree(cqm_rmid_ptrs[r]);

kfree(cqm_rmid_ptrs);
+ kfree(mbm_local);
+ kfree(mbm_total);
return -ENOMEM;
}

@@ -403,9 +464,11 @@ static void __intel_cqm_event_count(void *info);
static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
{
struct perf_event *event;
+
struct list_head *head = &group->hw.cqm_group_entry;
u32 old_rmid = group->hw.cqm_rmid;

+
lockdep_assert_held(&cache_mutex);

/*
@@ -494,6 +557,166 @@ static bool intel_cqm_sched_in_event(u32 rmid)
return false;
}

+static u32 mbm_fifo_sum_lastn_out(int rmid, bool is_localbw)
+{
+ u32 val = 0, i, j, index;
+
+ if (is_localbw) {
+ if (++mbm_local[rmid].fifoout >= mbm_window_size)
+ mbm_local[rmid].fifoout = 0;
+ index = mbm_local[rmid].fifoout;
+ for (i = 0; i < mbm_window_size - 1; i++) {
+ if (index + i >= mbm_window_size)
+ j = index + i - mbm_window_size;
+ else
+ j = index + i;
+ val += mbm_local[rmid].mbmfifo[j];
+ }
+ return val;
+ }
+
+ if (++mbm_total[rmid].fifoout >= mbm_window_size)
+ mbm_total[rmid].fifoout = 0;
+ index = mbm_total[rmid].fifoout;
+ for (i = 0; i < mbm_window_size - 1; i++) {
+ if (index + i >= mbm_window_size)
+ j = index + i - mbm_window_size;
+ else
+ j = index + i;
+ val += mbm_total[rmid].mbmfifo[j];
+ }
+ return val;
+}
+
+static int mbm_fifo_in(int rmid, u32 val, bool is_localbw)
+{
+ if (is_localbw) {
+ mbm_local[rmid].mbmfifo[mbm_local[rmid].fifoin] = val;
+ if (++mbm_local[rmid].fifoin >= mbm_window_size)
+ mbm_local[rmid].fifoin = 0;
+ } else {
+ mbm_total[rmid].mbmfifo[mbm_total[rmid].fifoin] = val;
+ if (++mbm_total[rmid].fifoin >= mbm_window_size)
+ mbm_total[rmid].fifoin = 0;
+ }
+ return 0;
+}
+
+/*
+ * __rmid_read_mbm checks whether it is LOCAL or GLOBAL MBM event and reads
+ * its MSR counter. if (MSR current value < MSR previous value) it is an
+ * overflow and overflow is handled. If MSR is read within last 100ms,
+ * then the value is ignored; this will suppress small deltas. We don't
+ * process MBM samples that are within 100ms. Bandwidth is calculated as:
+ * bandwidth = difference of last two msr counter values/time difference.
+ * cum_avg = Running Average bandwidth of last 'n' samples that are processed
+ * Sliding window is used to save the last 'n' samples. Hence,
+ * n = sliding_window_size
+ * cum_avg is scaled down by a factor MBM_SCALING_FACTOR and rounded to nearest
+ * integer. User interface reads the BW in MB/sec.
+ * Rounding algorithm : (X + 0.5):
+ * where X is scaled BW value. To avoid floating point arithmetic :
+ * BW- unscaled value
+ * (BW + MBM_SCALING_HALF)/MBM_SCALING_FACTOR is computed which gives the
+ * scaled bandwidth.
+ */
+static u64 __rmid_read_mbm(unsigned int rmid, bool read_mbm_local)
+{
+ u64 val, tmp, diff_time, cma, bytes, index;
+ bool overflow = false;
+ ktime_t cur_time;
+ u32 tmp32 = rmid;
+ u32 vrmid = topology_physical_package_id(smp_processor_id()) *
+ cqm_max_rmid + rmid;
+
+ rmid = vrmid;
+ cur_time = ktime_get_real();
+ if (read_mbm_local) {
+ cma = mbm_local[rmid].cum_avg;
+ diff_time = ktime_ms_delta(cur_time,
+ mbm_local[rmid].prev_time);
+ if (diff_time < 100)
+ return cma;
+ mbm_local[rmid].prev_time = ktime_set(0,
+ (unsigned long)ktime_to_ns(cur_time));
+ bytes = mbm_local[rmid].bytes;
+ index = mbm_local[rmid].index;
+ rmid = tmp32;
+ wrmsr(MSR_IA32_QM_EVTSEL, QOS_MBM_LOCAL_EVENT_ID1, rmid);
+ } else {
+ cma = mbm_total[rmid].cum_avg;
+ diff_time = ktime_ms_delta(cur_time,
+ mbm_total[rmid].prev_time);
+ if (diff_time < 100)
+ return cma;
+ mbm_total[rmid].prev_time = ktime_set(0,
+ (unsigned long)ktime_to_ns(cur_time));
+ bytes = mbm_total[rmid].bytes;
+ index = mbm_total[rmid].index;
+ rmid = tmp32;
+ wrmsr(MSR_IA32_QM_EVTSEL, QOS_MBM_TOTAL_EVENT_ID, rmid);
+ }
+ rdmsrl(MSR_IA32_QM_CTR, val);
+ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ return val;
+
+ tmp = val;
+ if (val < bytes) /* overflow handle it */ {
+ val = MAX_MBM_CNTR - bytes + val;
+ overflow = true;
+ } else
+ val = val - bytes;
+ if (diff_time < MBM_TIME_DELTA_MAX - MBM_TIME_DELTA_MIN)
+ val = (val * MBM_TIME_DELTA_MAX) / diff_time;
+
+ if ((diff_time > MBM_TIME_DELTA_MAX) && (!cma))
+ /* First sample, we can't use the time delta */
+ diff_time = MBM_TIME_DELTA_MAX;
+
+ rmid = vrmid;
+ if ((diff_time <= MBM_TIME_DELTA_MAX + MBM_TIME_DELTA_MIN) ||
+ overflow) {
+ int bw, ret;
+
+ if (index & (index < mbm_window_size))
+ val = cma * MBM_SCALING_FACTOR + val / index -
+ cma / index;
+ val = (val + MBM_SCALING_HALF) / MBM_SCALING_FACTOR;
+ if (index >= mbm_window_size) {
+ /*
+ * Compute the sum of recent n-1 samples and slide the
+ * window by 1
+ */
+ ret = mbm_fifo_sum_lastn_out(rmid, read_mbm_local);
+ /* recalculate the running average with current bw */
+ ret = (ret + val) / mbm_window_size;
+ if (ret < 0)
+ ret = 0;
+ bw = val;
+ val = ret;
+ } else
+ bw = val;
+ /* save the recent bw in fifo */
+ mbm_fifo_in(rmid, bw, read_mbm_local);
+
+ if (read_mbm_local) {
+ mbm_local[rmid].index++;
+ mbm_local[rmid].cum_avg = val;
+ mbm_local[rmid].bytes = tmp;
+ mbm_local[rmid].prev_time = ktime_set(0,
+ (unsigned long)ktime_to_ns(cur_time));
+ } else {
+ mbm_total[rmid].index++;
+ mbm_total[rmid].cum_avg = val;
+ mbm_total[rmid].bytes = tmp;
+ mbm_total[rmid].prev_time = ktime_set(0,
+ (unsigned long)ktime_to_ns(cur_time));
+ }
+ return val;
+ }
+ return cma;
+}
+
/*
* Initially use this constant for both the limbo queue time and the
* rotation timer interval, pmu::hrtimer_interval_ms.
@@ -568,7 +791,8 @@ static bool intel_cqm_rmid_stabilize(unsigned int *available)
/*
* Test whether an RMID is free for each package.
*/
- on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
+ if (entry->config)
+ on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);

list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
/*
@@ -846,8 +1070,9 @@ static void intel_cqm_setup_event(struct perf_event *event,
struct perf_event **group)
{
struct perf_event *iter;
- bool conflict = false;
+
u32 rmid;
+ bool conflict = false;

list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
rmid = iter->hw.cqm_rmid;
@@ -875,6 +1100,40 @@ static void intel_cqm_setup_event(struct perf_event *event,
event->hw.cqm_rmid = rmid;
}

+static void intel_cqm_event_update(struct perf_event *event)
+{
+ unsigned int rmid;
+ u64 val = 0;
+
+ /*
+ * Task events are handled by intel_cqm_event_count().
+ */
+ if (event->cpu == -1)
+ return;
+
+ rmid = event->hw.cqm_rmid;
+ if (!__rmid_valid(rmid))
+ return;
+
+ switch (event->attr.config) {
+ case QOS_MBM_TOTAL_EVENT_ID:
+ val = __rmid_read_mbm(rmid, false);
+ break;
+ case QOS_MBM_LOCAL_EVENT_ID:
+ val = __rmid_read_mbm(rmid, true);
+ break;
+ default:
+ break;
+ }
+ /*
+ * Ignore this reading on error states and do not update the value.
+ */
+ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ return;
+
+ local64_set(&event->count, val);
+}
+
static void intel_cqm_event_read(struct perf_event *event)
{
unsigned long flags;
@@ -887,6 +1146,12 @@ static void intel_cqm_event_read(struct perf_event *event)
if (event->cpu == -1)
return;

+ if ((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
+ (event->attr.config & QOS_MBM_LOCAL_EVENT_ID))
+ intel_cqm_event_update(event);
+
+ if (!(event->attr.config & QOS_L3_OCCUP_EVENT_ID))
+ return;
raw_spin_lock_irqsave(&cache_lock, flags);
rmid = event->hw.cqm_rmid;

@@ -906,6 +1171,28 @@ out:
raw_spin_unlock_irqrestore(&cache_lock, flags);
}

+static void __intel_cqm_event_total_bw_count(void *info)
+{
+ struct rmid_read *rr = info;
+ u64 val;
+
+ val = __rmid_read_mbm(rr->rmid, false);
+ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ return;
+ atomic64_add(val, &rr->value);
+}
+
+static void __intel_cqm_event_local_bw_count(void *info)
+{
+ struct rmid_read *rr = info;
+ u64 val;
+
+ val = __rmid_read_mbm(rr->rmid, true);
+ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ return;
+ atomic64_add(val, &rr->value);
+}
+
static void __intel_cqm_event_count(void *info)
{
struct rmid_read *rr = info;
@@ -967,7 +1254,21 @@ static u64 intel_cqm_event_count(struct perf_event *event)
if (!__rmid_valid(rr.rmid))
goto out;

- on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+ switch (event->attr.config) {
+ case QOS_L3_OCCUP_EVENT_ID:
+ on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+ break;
+ case QOS_MBM_TOTAL_EVENT_ID:
+ on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_total_bw_count,
+ &rr, 1);
+ break;
+ case QOS_MBM_LOCAL_EVENT_ID:
+ on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_local_bw_count,
+ &rr, 1);
+ break;
+ default:
+ break;
+ }

raw_spin_lock_irqsave(&cache_lock, flags);
if (event->hw.cqm_rmid == rr.rmid)
@@ -977,6 +1278,39 @@ out:
return __perf_event_count(event);
}

+static void mbm_start_hrtimer(struct mbm_pmu *pmu)
+{
+ hrtimer_start_range_ns(&(pmu->hrtimer),
+ pmu->timer_interval, 0,
+ HRTIMER_MODE_REL_PINNED);
+}
+
+static void mbm_stop_hrtimer(struct mbm_pmu *pmu)
+{
+ hrtimer_cancel(&pmu->hrtimer);
+}
+
+static enum hrtimer_restart mbm_hrtimer_handle(struct hrtimer *hrtimer)
+{
+ struct mbm_pmu *pmu = __this_cpu_read(mbm_pmu);
+ struct perf_event *event;
+
+ if (!pmu->n_active)
+ return HRTIMER_NORESTART;
+ list_for_each_entry(event, &pmu->active_list, active_entry)
+ intel_cqm_event_update(event);
+ hrtimer_forward_now(hrtimer, pmu->timer_interval);
+ return HRTIMER_RESTART;
+}
+
+static void mbm_hrtimer_init(struct mbm_pmu *pmu)
+{
+ struct hrtimer *hr = &pmu->hrtimer;
+
+ hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hr->function = mbm_hrtimer_handle;
+}
+
static void intel_cqm_event_start(struct perf_event *event, int mode)
{
struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
@@ -995,19 +1329,48 @@ static void intel_cqm_event_start(struct perf_event *event, int mode)
}

state->rmid = rmid;
+ if (event->attr.config & QOS_L3_OCCUP_EVENT_ID) {
+ struct cqm_rmid_entry *entry;
+
+ entry = __rmid_entry(rmid);
+ entry->config = true;
+ }
wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
+
+ if (((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
+ (event->attr.config & QOS_MBM_LOCAL_EVENT_ID)) && (cqm_mbm)) {
+ int cpu = get_cpu();
+ struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
+
+ if (pmu) {
+ if (pmu->n_active == 0)
+ mbm_hrtimer_init(pmu);
+ pmu->n_active++;
+ list_add_tail(&event->active_entry,
+ &pmu->active_list);
+ if (pmu->n_active == 1)
+ mbm_start_hrtimer(pmu);
+ }
+ put_cpu();
+ }
}

static void intel_cqm_event_stop(struct perf_event *event, int mode)
{
struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+ struct mbm_pmu *pmu = __this_cpu_read(mbm_pmu);

if (event->hw.cqm_state & PERF_HES_STOPPED)
return;

event->hw.cqm_state |= PERF_HES_STOPPED;

- intel_cqm_event_read(event);
+ if (event->attr.config & QOS_L3_OCCUP_EVENT_ID)
+ intel_cqm_event_read(event);
+
+ if ((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
+ (event->attr.config & QOS_MBM_LOCAL_EVENT_ID))
+ intel_cqm_event_update(event);

if (!--state->rmid_usecnt) {
state->rmid = 0;
@@ -1015,8 +1378,18 @@ static void intel_cqm_event_stop(struct perf_event *event, int mode)
} else {
WARN_ON_ONCE(!state->rmid);
}
+
+ if (pmu) {
+ WARN_ON_ONCE(pmu->n_active <= 0);
+ pmu->n_active--;
+ if (pmu->n_active == 0)
+ mbm_stop_hrtimer(pmu);
+ list_del(&event->active_entry);
+ }
+
}

+
static int intel_cqm_event_add(struct perf_event *event, int mode)
{
unsigned long flags;
@@ -1082,7 +1455,9 @@ static int intel_cqm_event_init(struct perf_event *event)
if (event->attr.type != intel_cqm_pmu.type)
return -ENOENT;

- if (event->attr.config & ~QOS_EVENT_MASK)
+ if (((event->attr.config & ~QOS_EVENT_MASK) &&
+ (event->attr.config & ~QOS_MBM_TOTAL_EVENT_ID)) &&
+ (event->attr.config & ~QOS_MBM_LOCAL_EVENT_ID))
return -EINVAL;

/* unsupported modes and filters */
@@ -1137,18 +1512,63 @@ EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");

-static struct attribute *intel_cqm_events_attr[] = {
+EVENT_ATTR_STR(llc_total_bw, intel_cqm_llc_total_bw, "event=0x02");
+EVENT_ATTR_STR(llc_total_bw.per-pkg, intel_cqm_llc_total_bw_pkg, "1");
+EVENT_ATTR_STR(llc_total_bw.unit, intel_cqm_llc_total_bw_unit, "MB/sec");
+EVENT_ATTR_STR(llc_total_bw.scale, intel_cqm_llc_total_bw_scale, NULL);
+EVENT_ATTR_STR(llc_total_bw.snapshot, intel_cqm_llc_total_bw_snapshot, "1");
+
+EVENT_ATTR_STR(llc_local_bw, intel_cqm_llc_local_bw, "event=0x04");
+EVENT_ATTR_STR(llc_local_bw.per-pkg, intel_cqm_llc_local_bw_pkg, "1");
+EVENT_ATTR_STR(llc_local_bw.unit, intel_cqm_llc_local_bw_unit, "MB/sec");
+EVENT_ATTR_STR(llc_local_bw.scale, intel_cqm_llc_local_bw_scale, NULL);
+EVENT_ATTR_STR(llc_local_bw.snapshot, intel_cqm_llc_local_bw_snapshot, "1");
+
+static struct attribute *intel_cmt_events_attr[] = {
+ EVENT_PTR(intel_cqm_llc),
+ EVENT_PTR(intel_cqm_llc_pkg),
+ EVENT_PTR(intel_cqm_llc_unit),
+ EVENT_PTR(intel_cqm_llc_scale),
+ EVENT_PTR(intel_cqm_llc_snapshot),
+ NULL,
+};
+
+static struct attribute *intel_mbm_events_attr[] = {
+ EVENT_PTR(intel_cqm_llc_total_bw),
+ EVENT_PTR(intel_cqm_llc_local_bw),
+ EVENT_PTR(intel_cqm_llc_total_bw_pkg),
+ EVENT_PTR(intel_cqm_llc_local_bw_pkg),
+ EVENT_PTR(intel_cqm_llc_total_bw_unit),
+ EVENT_PTR(intel_cqm_llc_local_bw_unit),
+ EVENT_PTR(intel_cqm_llc_total_bw_scale),
+ EVENT_PTR(intel_cqm_llc_local_bw_scale),
+ EVENT_PTR(intel_cqm_llc_total_bw_snapshot),
+ EVENT_PTR(intel_cqm_llc_local_bw_snapshot),
+ NULL,
+};
+
+static struct attribute *intel_cmt_mbm_events_attr[] = {
EVENT_PTR(intel_cqm_llc),
+ EVENT_PTR(intel_cqm_llc_total_bw),
+ EVENT_PTR(intel_cqm_llc_local_bw),
EVENT_PTR(intel_cqm_llc_pkg),
+ EVENT_PTR(intel_cqm_llc_total_bw_pkg),
+ EVENT_PTR(intel_cqm_llc_local_bw_pkg),
EVENT_PTR(intel_cqm_llc_unit),
+ EVENT_PTR(intel_cqm_llc_total_bw_unit),
+ EVENT_PTR(intel_cqm_llc_local_bw_unit),
EVENT_PTR(intel_cqm_llc_scale),
+ EVENT_PTR(intel_cqm_llc_total_bw_scale),
+ EVENT_PTR(intel_cqm_llc_local_bw_scale),
EVENT_PTR(intel_cqm_llc_snapshot),
+ EVENT_PTR(intel_cqm_llc_total_bw_snapshot),
+ EVENT_PTR(intel_cqm_llc_local_bw_snapshot),
NULL,
};

static struct attribute_group intel_cqm_events_group = {
.name = "events",
- .attrs = intel_cqm_events_attr,
+ .attrs = NULL,
};

PMU_FORMAT_ATTR(event, "config:0-7");
@@ -1176,6 +1596,19 @@ max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
}

static ssize_t
+sliding_window_size_show(struct device *dev, struct device_attribute *attr,
+ char *page)
+{
+ ssize_t rv;
+
+ mutex_lock(&cache_mutex);
+ rv = snprintf(page, PAGE_SIZE-1, "%u\n", mbm_window_size);
+ mutex_unlock(&cache_mutex);
+
+ return rv;
+}
+
+static ssize_t
max_recycle_threshold_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
@@ -1203,10 +1636,35 @@ max_recycle_threshold_store(struct device *dev,
return count;
}

+static ssize_t
+sliding_window_size_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned int bytes;
+ int ret;
+
+ ret = kstrtouint(buf, 0, &bytes);
+ if (ret)
+ return ret;
+
+ mutex_lock(&cache_mutex);
+ if (bytes > 0 && bytes <= MBM_FIFO_SIZE_MAX)
+ mbm_window_size = bytes;
+ else
+ bytes = MBM_FIFO_SIZE_MIN;
+
+ mutex_unlock(&cache_mutex);
+
+ return count;
+}
+
static DEVICE_ATTR_RW(max_recycle_threshold);
+static DEVICE_ATTR_RW(sliding_window_size);

static struct attribute *intel_cqm_attrs[] = {
&dev_attr_max_recycle_threshold.attr,
+ &dev_attr_sliding_window_size.attr,
NULL,
};

@@ -1241,16 +1699,17 @@ static inline void cqm_pick_event_reader(int cpu)

for_each_cpu(i, &cqm_cpumask) {
if (phys_id == topology_physical_package_id(i))
- return; /* already got reader for this socket */
+ return; /* already got reader for this socket */
}

cpumask_set_cpu(cpu, &cqm_cpumask);
}

-static void intel_cqm_cpu_prepare(unsigned int cpu)
+static int intel_cqm_cpu_prepare(unsigned int cpu)
{
struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);

state->rmid = 0;
state->closid = 0;
@@ -1258,12 +1717,27 @@ static void intel_cqm_cpu_prepare(unsigned int cpu)

WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
+
+ if ((!pmu) && (cqm_mbm)) {
+ pmu = kzalloc_node(sizeof(*mbm_pmu), GFP_KERNEL, NUMA_NO_NODE);
+ if (!pmu)
+ return -ENOMEM;
+ spin_lock_init(&pmu->lock);
+ INIT_LIST_HEAD(&pmu->active_list);
+ pmu->pmu = &intel_cqm_pmu;
+ pmu->n_active = 0;
+ pmu->timer_interval = ms_to_ktime(MBM_TIME_DELTA_MAX);
+ per_cpu(mbm_pmu, cpu) = pmu;
+ per_cpu(mbm_pmu_to_free, cpu) = NULL;
+ }
+ return 0;
}

static void intel_cqm_cpu_exit(unsigned int cpu)
{
int phys_id = topology_physical_package_id(cpu);
int i;
+ struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);

/*
* Is @cpu a designated cqm reader?
@@ -1280,6 +1754,13 @@ static void intel_cqm_cpu_exit(unsigned int cpu)
break;
}
}
+
+ /* cancel overflow polling timer for CPU */
+ if (pmu)
+ mbm_stop_hrtimer(pmu);
+ kfree(mbm_local);
+ kfree(mbm_total);
+
}

static int intel_cqm_cpu_notifier(struct notifier_block *nb,
@@ -1289,7 +1770,7 @@ static int intel_cqm_cpu_notifier(struct notifier_block *nb,

switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
- intel_cqm_cpu_prepare(cpu);
+ return intel_cqm_cpu_prepare(cpu);
break;
case CPU_DOWN_PREPARE:
intel_cqm_cpu_exit(cpu);
@@ -1305,17 +1786,74 @@ static int intel_cqm_cpu_notifier(struct notifier_block *nb,
static const struct x86_cpu_id intel_cqm_match[] = {
{ .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
{}
+ }, intel_mbm_match[] = {
+ { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL },
+ {}
};

static int __init intel_cqm_init(void)
{
char *str, scale[20];
- int i, cpu, ret;
+ int i = 0, cpu, ret;

- if (!x86_match_cpu(intel_cqm_match))
+ if (!x86_match_cpu(intel_cqm_match) &&
+ (!x86_match_cpu(intel_mbm_match)))
return -ENODEV;

cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
+ cqm_max_rmid = boot_cpu_data.x86_cache_max_rmid;
+
+ if (x86_match_cpu(intel_cqm_match)) {
+ cqm_llc_occ = true;
+ intel_cqm_events_group.attrs = intel_cmt_events_attr;
+ } else
+ cqm_llc_occ = false;
+
+ if (x86_match_cpu(intel_mbm_match)) {
+ u32 mbm_scale_rounded = 0;
+
+ cqm_mbm = true;
+ cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
+ /*
+ * MBM counter values are in bytes. To conver this to MB/sec,
+ * we scale the MBM scale factor by 1000. Another 1000 factor
+ * scaling down is done
+ * after reading the counter val i.e. in the function
+ * __rmid_read_mbm()
+ */
+ mbm_scale_rounded = (cqm_l3_scale + 500) / 1000;
+ cqm_max_rmid = boot_cpu_data.x86_cache_max_rmid;
+ snprintf(scale, sizeof(scale), "%u", mbm_scale_rounded);
+ str = kstrdup(scale, GFP_KERNEL);
+ if (!str) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (cqm_llc_occ)
+ intel_cqm_events_group.attrs =
+ intel_cmt_mbm_events_attr;
+ else
+ intel_cqm_events_group.attrs = intel_mbm_events_attr;
+
+ event_attr_intel_cqm_llc_local_bw_scale.event_str
+ = event_attr_intel_cqm_llc_total_bw_scale.event_str = str;
+ mbm_local = kzalloc_node(sizeof(struct sample) *
+ (cqm_max_rmid + 1) * MBM_SOCKET_MAX,
+ GFP_KERNEL, NUMA_NO_NODE);
+ if (!mbm_local) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ mbm_total = kzalloc_node(sizeof(struct sample) *
+ (cqm_max_rmid + 1) * MBM_SOCKET_MAX,
+ GFP_KERNEL, NUMA_NO_NODE);
+ if (!mbm_total) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ } else
+ cqm_mbm = false;

/*
* It's possible that not all resources support the same number
@@ -1328,44 +1866,48 @@ static int __init intel_cqm_init(void)
*/
cpu_notifier_register_begin();

- for_each_online_cpu(cpu) {
- struct cpuinfo_x86 *c = &cpu_data(cpu);
+ if (cqm_llc_occ) {
+ for_each_online_cpu(cpu) {
+ struct cpuinfo_x86 *c = &cpu_data(cpu);

- if (c->x86_cache_max_rmid < cqm_max_rmid)
- cqm_max_rmid = c->x86_cache_max_rmid;
+ if (c->x86_cache_max_rmid < cqm_max_rmid)
+ cqm_max_rmid = c->x86_cache_max_rmid;

- if (c->x86_cache_occ_scale != cqm_l3_scale) {
- pr_err("Multiple LLC scale values, disabling\n");
- ret = -EINVAL;
- goto out;
+ if (c->x86_cache_occ_scale != cqm_l3_scale) {
+ pr_err("Multiple LLC scale values, disabling\n");
+ ret = -EINVAL;
+ goto out;
+ }
}
- }

- /*
- * A reasonable upper limit on the max threshold is the number
- * of lines tagged per RMID if all RMIDs have the same number of
- * lines tagged in the LLC.
- *
- * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
- */
- __intel_cqm_max_threshold =
+ /*
+ * A reasonable upper limit on the max threshold is the number
+ * of lines tagged per RMID if all RMIDs have the same number of
+ * lines tagged in the LLC.
+ *
+ * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+ */
+ __intel_cqm_max_threshold =
boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);

- snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
- str = kstrdup(scale, GFP_KERNEL);
- if (!str) {
- ret = -ENOMEM;
- goto out;
- }
+ snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
+ str = kstrdup(scale, GFP_KERNEL);
+ if (!str) {
+ ret = -ENOMEM;
+ goto out;
+ }

- event_attr_intel_cqm_llc_scale.event_str = str;
+ event_attr_intel_cqm_llc_scale.event_str = str;
+ }

ret = intel_cqm_setup_rmid_cache();
if (ret)
goto out;

for_each_online_cpu(i) {
- intel_cqm_cpu_prepare(i);
+ ret = intel_cqm_cpu_prepare(i);
+ if (ret)
+ goto out;
cqm_pick_event_reader(i);
}

--
2.1.0


2015-07-28 07:59:35

by Thomas Gleixner

Subject: Re: [PATCH v2] perf,x86: add Intel Memory Bandwidth Monitoring (MBM) PMU

On Tue, 21 Jul 2015, Kanaka Juvva wrote:
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
> index 1880761..dc1ce58 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
> @@ -12,10 +12,20 @@
> #define MSR_IA32_PQR_ASSOC 0x0c8f
> #define MSR_IA32_QM_CTR 0x0c8e
> #define MSR_IA32_QM_EVTSEL 0x0c8d
> +#define MAX_MBM_CNTR 0xffffff

Can we please have a consistent name space MBM_... here?

> +#define MBM_SOCKET_MAX 8
> +#define MBM_TIME_DELTA_MAX 1000
> +#define MBM_TIME_DELTA_MIN 1000
> +#define MBM_SCALING_FACTOR 1000
> +#define MBM_SCALING_HALF (MBM_SCALING_FACTOR/2)
> +#define MBM_FIFO_SIZE_MIN 10
> +#define MBM_FIFO_SIZE_MAX 300
>
> -static u32 cqm_max_rmid = -1;
> -static unsigned int cqm_l3_scale; /* supposedly cacheline size */
>
> +static u32 cqm_max_rmid = -1;
> +static unsigned int cqm_l3_scale;/* supposedly cacheline size */

Pointless white space change

> +static unsigned mbm_window_size = MBM_FIFO_SIZE_MIN;
> +static bool cqm_llc_occ, cqm_mbm;
> /**
> * struct intel_pqr_state - State cache for the PQR MSR
> * @rmid: The cached Resource Monitoring ID
> @@ -42,6 +52,34 @@ struct intel_pqr_state {
> * interrupts disabled, which is sufficient for the protection.
> */
> static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
> +static DEFINE_PER_CPU(struct mbm_pmu *, mbm_pmu);
> +static DEFINE_PER_CPU(struct mbm_pmu *, mbm_pmu_to_free);
> +
> +/*
> + * mbm pmu is used for storing mbm_local and mbm_total
> + * events
> + */
> +struct mbm_pmu {
> + spinlock_t lock;

What is this lock protecting and how does it nest into perf locking?

> + int n_active; /* number of active events */

If you want to document your struct members, please use kerneldoc and
not these hard to read tail comments.
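
A minimal sketch of that kerneldoc style, reusing the member names from
the patch (the descriptions are guesses, not taken from the patch):

/**
 * struct mbm_pmu - per-cpu state for MBM bandwidth events
 * @lock:		what this protects still needs to be documented
 * @n_active:		number of active events on this cpu
 * @active_list:	list of active MBM events
 * @pmu:		pointer to the owning struct pmu
 * @timer_interval:	polling interval for the overflow hrtimer
 * @hrtimer:		overflow polling timer
 */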

> + struct list_head active_list;
> + struct pmu *pmu; /* pointer to mbm perf_event */
> + ktime_t timer_interval; /* in ktime_t unit */
> + struct hrtimer hrtimer;
> +};
> +
> +struct sample {
> + u64 bytes; /* mbm counter value read*/
> + u64 cum_avg; /* current running average of bandwidth */
> + ktime_t prev_time;
> + u64 index; /* current sample no */
> + u32 mbmfifo[MBM_FIFO_SIZE_MAX]; /* window of last n bw values */
> + u32 fifoin; /* mbmfifo in counter for sliding window */
> + u32 fifoout; /* mbmfifo out counter for sliding window */

So above you use a proper layout which is actually readable (aside from
the tail comments). Can you please stick with a consistent coding
style?

> +};
> +
> +struct sample *mbm_total; /* curent stats for mbm_total events */
> +struct sample *mbm_local; /* current stats for mbm_local events */

static ? And please get rid of these tail comments.

> /*
> * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
> @@ -66,6 +104,9 @@ static cpumask_t cqm_cpumask;
> #define RMID_VAL_UNAVAIL (1ULL << 62)
>
> #define QOS_L3_OCCUP_EVENT_ID (1 << 0)
> +#define QOS_MBM_TOTAL_EVENT_ID (1 << 1)

Please use tabs not spaces.

> +#define QOS_MBM_LOCAL_EVENT_ID1 0x3

What's ID1 for heaven's sake? Looks like

(QOS_L3_OCCUP_EVENT_ID | QOS_MBM_TOTAL_EVENT_ID)

So this wants a descriptive ID name and a comment.
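
If that interpretation is right, something along these lines (the name
is just a suggestion, not from the patch) would at least be
self-documenting:

/* EVTSEL encoding for local bandwidth: occupancy | total bits */
#define QOS_MBM_LOCAL_BW_EVENT_ID	(QOS_L3_OCCUP_EVENT_ID | \
					 QOS_MBM_TOTAL_EVENT_ID)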

> +#define QOS_MBM_LOCAL_EVENT_ID (1 << 2)
>
> #define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID
>
> @@ -90,7 +131,8 @@ static u32 intel_cqm_rotation_rmid;
> */
> static inline bool __rmid_valid(u32 rmid)
> {
> - if (!rmid || rmid == INVALID_RMID)
> + WARN_ON_ONCE(rmid > cqm_max_rmid);
> + if (!rmid || (rmid == INVALID_RMID) || (rmid > cqm_max_rmid))
> return false;

How is that related to this patch? Looks like a fix or prerequisite
change to the existing code.

> return true;
> @@ -125,6 +167,7 @@ struct cqm_rmid_entry {
> enum rmid_recycle_state state;
> struct list_head list;
> unsigned long queue_time;
> + bool config;
> };
>
> /*
> @@ -176,6 +219,17 @@ static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid)
> return entry;
> }
>
> +static void mbm_reset_stats(u32 rmid)
> +{
> + u32 vrmid = topology_physical_package_id(smp_processor_id()) *
> + cqm_max_rmid + rmid;

Can you please explain that magic calculation? It's far from obvious
why this would be correct.

> +
> + if ((!cqm_mbm) || (!mbm_local) || (!mbm_total))
> + return;
> + memset(&mbm_local[vrmid], 0, sizeof(struct sample));
> + memset(&mbm_total[vrmid], 0, sizeof(struct sample));
> +}
> +
> /*
> * Returns < 0 on fail.
> *
> @@ -190,8 +244,10 @@ static u32 __get_rmid(void)
> if (list_empty(&cqm_rmid_free_lru))
> return INVALID_RMID;
>
> - entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
> + entry = list_first_entry(&cqm_rmid_free_lru,
> + struct cqm_rmid_entry, list);

And the point of this change is?

> list_del(&entry->list);
> + mbm_reset_stats(entry->rmid);
>
> return entry->rmid;
> }
> @@ -207,6 +263,7 @@ static void __put_rmid(u32 rmid)
>
> entry->queue_time = jiffies;
> entry->state = RMID_YOUNG;
> + mbm_reset_stats(rmid);
>
> list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
> }
> @@ -232,6 +289,8 @@ static int intel_cqm_setup_rmid_cache(void)
>
> INIT_LIST_HEAD(&entry->list);
> entry->rmid = r;
> + entry->config = false;
> +
> cqm_rmid_ptrs[r] = entry;
>
> list_add_tail(&entry->list, &cqm_rmid_free_lru);
> @@ -254,6 +313,8 @@ fail:
> kfree(cqm_rmid_ptrs[r]);
>
> kfree(cqm_rmid_ptrs);
> + kfree(mbm_local);
> + kfree(mbm_total);
> return -ENOMEM;
> }
>
> @@ -403,9 +464,11 @@ static void __intel_cqm_event_count(void *info);
> static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
> {
> struct perf_event *event;
> +

Superfluous newline.

> struct list_head *head = &group->hw.cqm_group_entry;
> u32 old_rmid = group->hw.cqm_rmid;
>
> +

Another one. Sigh!

> lockdep_assert_held(&cache_mutex);
>
> /*
> @@ -494,6 +557,166 @@ static bool intel_cqm_sched_in_event(u32 rmid)
> return false;
> }
>
> +static u32 mbm_fifo_sum_lastn_out(int rmid, bool is_localbw)

Random space after u32

> +{
> + u32 val = 0, i, j, index;
> +
> + if (is_localbw) {
> + if (++mbm_local[rmid].fifoout >= mbm_window_size)
> + mbm_local[rmid].fifoout = 0;
> + index = mbm_local[rmid].fifoout;
> + for (i = 0; i < mbm_window_size - 1; i++) {
> + if (index + i >= mbm_window_size)
> + j = index + i - mbm_window_size;
> + else
> + j = index + i;
> + val += mbm_local[rmid].mbmfifo[j];
> + }
> + return val;
> + }
> +
> + if (++mbm_total[rmid].fifoout >= mbm_window_size)
> + mbm_total[rmid].fifoout = 0;
> + index = mbm_total[rmid].fifoout;
> + for (i = 0; i < mbm_window_size - 1; i++) {
> + if (index + i >= mbm_window_size)
> + j = index + i - mbm_window_size;
> + else
> + j = index + i;
> + val += mbm_total[rmid].mbmfifo[j];
> + }

The concept of helper functions to avoid code duplication is
definitely applicable here.

> + return val;
> +}
> +
> +static int mbm_fifo_in(int rmid, u32 val, bool is_localbw)

Random space after int.

> +{
> + if (is_localbw) {
> + mbm_local[rmid].mbmfifo[mbm_local[rmid].fifoin] = val;
> + if (++mbm_local[rmid].fifoin >= mbm_window_size)
> + mbm_local[rmid].fifoin = 0;
> + } else {
> + mbm_total[rmid].mbmfifo[mbm_total[rmid].fifoin] = val;
> + if (++mbm_total[rmid].fifoin >= mbm_window_size)
> + mbm_total[rmid].fifoin = 0;
> + }

static void mbm_fifo_in(struct sample *, u32 val)

would get rid of code duplication.
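
Roughly (untested sketch, using the struct sample fields from the
patch), and the same approach applies to the sum helper above:

static void mbm_fifo_in(struct sample *s, u32 val)
{
	s->mbmfifo[s->fifoin] = val;
	if (++s->fifoin >= mbm_window_size)
		s->fifoin = 0;
}

static u32 mbm_fifo_sum_lastn_out(struct sample *s)
{
	u32 val = 0, i, idx;

	if (++s->fifoout >= mbm_window_size)
		s->fifoout = 0;
	/* sum the last mbm_window_size - 1 entries, wrapping around */
	for (i = 0; i < mbm_window_size - 1; i++) {
		idx = (s->fifoout + i) % mbm_window_size;
		val += s->mbmfifo[idx];
	}
	return val;
}

Callers then simply hand in &mbm_local[vrmid] or &mbm_total[vrmid].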

> + return 0;

What's the purpose of having a return value here?

> +}
> +
> +/*
> + * __rmid_read_mbm checks whether it is LOCAL or GLOBAL MBM event and reads
> + * its MSR counter. if (MSR current value < MSR previous value) it is an
> + * overflow and overflow is handled. If MSR is read within last 100ms,
> + * then the value is ignored; this will suppress small deltas. We don't
> + * process MBM samples that are within 100ms. Bandwidth is calculated as:
> + * bandwidth = difference of last two msr counter values/time difference.
> + * cum_avg = Running Average bandwidth of last 'n' samples that are processed
> + * Sliding window is used to save the last 'n' samples. Hence,
> + * n = sliding_window_size
> + * cum_avg is scaled down by a factor MBM_SCALING_FACTOR and rounded to nearest
> + * integer. User interface reads the BW in MB/sec.
> + * Rounding algorithm : (X + 0.5):
> + * where X is scaled BW value. To avoid floating point arithmetic :
> + * BW- unscaled value
> + * (BW + MBM_SCALING_HALF)/MBM_SCALING_FACTOR is computed which gives the
> + * scaled bandwidth.

This is completely unreadable.

> + */
> +static u64 __rmid_read_mbm(unsigned int rmid, bool read_mbm_local)
> +{
> + u64 val, tmp, diff_time, cma, bytes, index;
> + bool overflow = false;
> + ktime_t cur_time;
> + u32 tmp32 = rmid;
> + u32 vrmid = topology_physical_package_id(smp_processor_id()) *
> + cqm_max_rmid + rmid;
> +
> + rmid = vrmid;

This is completely backwards.

tmp32 = rmid;
rmid = vrmid;
do_stuff(rmid);
rmid = tmp32;
do_other_stuff(rmid);

Why can't you use vrmid for do_stuff() and leave rmid alone? Just
because it would make the code simpler to read?

> + cur_time = ktime_get_real();

Why would this operate on wall time and not on clock monotonic time?

> + if (read_mbm_local) {
> + cma = mbm_local[rmid].cum_avg;
> + diff_time = ktime_ms_delta(cur_time,
> + mbm_local[rmid].prev_time);
> + if (diff_time < 100)
> + return cma;
> + mbm_local[rmid].prev_time = ktime_set(0,
> + (unsigned long)ktime_to_ns(cur_time));
> + bytes = mbm_local[rmid].bytes;
> + index = mbm_local[rmid].index;
> + rmid = tmp32;
> + wrmsr(MSR_IA32_QM_EVTSEL, QOS_MBM_LOCAL_EVENT_ID1, rmid);

Why is this using ID1?

> + } else {
> + cma = mbm_total[rmid].cum_avg;
> + diff_time = ktime_ms_delta(cur_time,
> + mbm_total[rmid].prev_time);
> + if (diff_time < 100)
> + return cma;
> + mbm_total[rmid].prev_time = ktime_set(0,
> + (unsigned long)ktime_to_ns(cur_time));
> + bytes = mbm_total[rmid].bytes;
> + index = mbm_total[rmid].index;
> + rmid = tmp32;
> + wrmsr(MSR_IA32_QM_EVTSEL, QOS_MBM_TOTAL_EVENT_ID, rmid);
> + }

Random space after tab.

> + rdmsrl(MSR_IA32_QM_CTR, val);
> + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> + return val;
> +
> + tmp = val;

You really have a faible for random storage. tmp gets assigned to
mbm_*[rmid].bytes further down. This makes no sense at all.

> + if (val < bytes) /* overflow handle it */ {

These tail comments are crap.

> + val = MAX_MBM_CNTR - bytes + val;
> + overflow = true;
> + } else
> + val = val - bytes;

That else clause is missing braces.

> + if (diff_time < MBM_TIME_DELTA_MAX - MBM_TIME_DELTA_MIN)
> + val = (val * MBM_TIME_DELTA_MAX) / diff_time;

That resolves to

if (diff_time < 0)

Which is always false because diff_time is u64.

What's the logic behind this?

> +
> + if ((diff_time > MBM_TIME_DELTA_MAX) && (!cma))
> + /* First sample, we can't use the time delta */
> + diff_time = MBM_TIME_DELTA_MAX;

And this?

> +
> + rmid = vrmid;

More obfuscation

> + if ((diff_time <= MBM_TIME_DELTA_MAX + MBM_TIME_DELTA_MIN) ||
> + overflow) {

Again, that resolves to

if (overflow)

And even if MBM_TIME_DELTA_MAX would be not equal MBM_TIME_DELTA_MIN
then this wants some explanation about

MBM_TIME_DELTA_MAX - MBM_TIME_DELTA_MIN

versus

MBM_TIME_DELTA_MAX + MBM_TIME_DELTA_MIN

and the whole logic behind this time delta magic, if there is any.

> + int bw, ret;
> +
> + if (index & (index < mbm_window_size))
> + val = cma * MBM_SCALING_FACTOR + val / index -
> + cma / index;
> +
> + val = (val + MBM_SCALING_HALF) / MBM_SCALING_FACTOR;
> + if (index >= mbm_window_size) {

These conditionals along with the magic math are undocumented.

> + /*
> + * Compute the sum of recent n-1 samples and slide the
> + * window by 1
> + */
> + ret = mbm_fifo_sum_lastn_out(rmid, read_mbm_local);
> + /* recalculate the running average with current bw */
> + ret = (ret + val) / mbm_window_size;
> + if (ret < 0)

The average of positive numbers becomes negative ?

> + ret = 0;
> + bw = val;
> + val = ret;

Your back and forth assignements of random variables is not making the
code any more readable.

> + } else
> + bw = val;
> + /* save the recent bw in fifo */
> + mbm_fifo_in(rmid, bw, read_mbm_local);
> +
> + if (read_mbm_local) {

Oh well. You have this read_mbm_local conditional 3 times in this
function. Why don't you make the function oblivious of this and hand
in a pointer to mbm_local or mbm_total? Would be too simple and not
obfuscated enough, right?
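
Something along these lines (names hypothetical, only meant to show the
shape of it):

static u64 rmid_read_mbm(u32 rmid, bool local)
{
	u32 vrmid = topology_physical_package_id(smp_processor_id()) *
		    cqm_max_rmid + rmid;
	struct sample *s = local ? &mbm_local[vrmid] : &mbm_total[vrmid];
	u32 evtsel = local ? QOS_MBM_LOCAL_EVENT_ID1 :
			     QOS_MBM_TOTAL_EVENT_ID;

	/* hypothetical helper holding the common body of __rmid_read_mbm() */
	return __mbm_update_sample(s, rmid, evtsel);
}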

> + mbm_local[rmid].index++;
> + mbm_local[rmid].cum_avg = val;
> + mbm_local[rmid].bytes = tmp;
> + mbm_local[rmid].prev_time = ktime_set(0,
> + (unsigned long)ktime_to_ns(cur_time));
> + } else {
> + mbm_total[rmid].index++;
> + mbm_total[rmid].cum_avg = val;
> + mbm_total[rmid].bytes = tmp;
> + mbm_total[rmid].prev_time = ktime_set(0,
> + (unsigned long)ktime_to_ns(cur_time));
> + }
> + return val;
> + }
> + return cma;
> +}
> +
> /*
> * Initially use this constant for both the limbo queue time and the
> * rotation timer interval, pmu::hrtimer_interval_ms.
> @@ -568,7 +791,8 @@ static bool intel_cqm_rmid_stabilize(unsigned int *available)
> /*
> * Test whether an RMID is free for each package.
> */
> - on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
> + if (entry->config)

This entry->config change wants to be a separate patch. It's
completely non-obvious how this changes the behaviour of the existing code.

> + on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
>
> list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
> /*
> @@ -846,8 +1070,9 @@ static void intel_cqm_setup_event(struct perf_event *event,
> struct perf_event **group)
> {
> struct perf_event *iter;
> - bool conflict = false;
> +
> u32 rmid;
> + bool conflict = false;

Fits the overall picture of this patch: Sloppy

> list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
> rmid = iter->hw.cqm_rmid;
> @@ -875,6 +1100,40 @@ static void intel_cqm_setup_event(struct perf_event *event,
> event->hw.cqm_rmid = rmid;
> }
>
> +static void intel_cqm_event_update(struct perf_event *event)
> +{
> + unsigned int rmid;
> + u64 val = 0;
> +
> + /*
> + * Task events are handled by intel_cqm_event_count().
> + */
> + if (event->cpu == -1)
> + return;
> +
> + rmid = event->hw.cqm_rmid;
> + if (!__rmid_valid(rmid))
> + return;
> +
> + switch (event->attr.config) {
> + case QOS_MBM_TOTAL_EVENT_ID:
> + val = __rmid_read_mbm(rmid, false);
> + break;
> + case QOS_MBM_LOCAL_EVENT_ID:
> + val = __rmid_read_mbm(rmid, true);
> + break;
> + default:
> + break;

return?

> + }
> + /*
> + * Ignore this reading on error states and do not update the value.
> + */
> + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> + return;
> +
> + local64_set(&event->count, val);
> +}
> +
> static void intel_cqm_event_read(struct perf_event *event)
> {
> unsigned long flags;
> @@ -887,6 +1146,12 @@ static void intel_cqm_event_read(struct perf_event *event)
> if (event->cpu == -1)
> return;
>
> + if ((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
> + (event->attr.config & QOS_MBM_LOCAL_EVENT_ID))
> + intel_cqm_event_update(event);
> +
> + if (!(event->attr.config & QOS_L3_OCCUP_EVENT_ID))
> + return;
> raw_spin_lock_irqsave(&cache_lock, flags);
> rmid = event->hw.cqm_rmid;
>
> @@ -906,6 +1171,28 @@ out:
> raw_spin_unlock_irqrestore(&cache_lock, flags);
> }
>
> +static void __intel_cqm_event_total_bw_count(void *info)
> +{
> + struct rmid_read *rr = info;
> + u64 val;
> +
> + val = __rmid_read_mbm(rr->rmid, false);
> + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> + return;
> + atomic64_add(val, &rr->value);
> +}
> +
> +static void __intel_cqm_event_local_bw_count(void *info)
> +{
> + struct rmid_read *rr = info;
> + u64 val;
> +
> + val = __rmid_read_mbm(rr->rmid, true);
> + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> + return;
> + atomic64_add(val, &rr->value);

You're really a fan of copy and paste.

> +}
> +
> static void __intel_cqm_event_count(void *info)
> {
> struct rmid_read *rr = info;
> @@ -967,7 +1254,21 @@ static u64 intel_cqm_event_count(struct perf_event *event)
> if (!__rmid_valid(rr.rmid))
> goto out;
>
> - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
> + switch (event->attr.config) {
> + case QOS_L3_OCCUP_EVENT_ID:
> + on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
> + break;
> + case QOS_MBM_TOTAL_EVENT_ID:
> + on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_total_bw_count,
> + &rr, 1);
> + break;
> + case QOS_MBM_LOCAL_EVENT_ID:
> + on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_local_bw_count,
> + &rr, 1);
> + break;
> + default:
> + break;

So there is nothing to do and you happily proceed?

> + }
>
> raw_spin_lock_irqsave(&cache_lock, flags);
> if (event->hw.cqm_rmid == rr.rmid)
> @@ -977,6 +1278,39 @@ out:
> return __perf_event_count(event);
> }

> static void intel_cqm_event_start(struct perf_event *event, int mode)
> {
> struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
> @@ -995,19 +1329,48 @@ static void intel_cqm_event_start(struct perf_event *event, int mode)
> }
>
> state->rmid = rmid;
> + if (event->attr.config & QOS_L3_OCCUP_EVENT_ID) {
> + struct cqm_rmid_entry *entry;
> +
> + entry = __rmid_entry(rmid);
> + entry->config = true;
> + }
> wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
> +
> + if (((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
> + (event->attr.config & QOS_MBM_LOCAL_EVENT_ID)) && (cqm_mbm)) {
> + int cpu = get_cpu();

pmu->start() is called with interrupts disabled. So why do you need
get_cpu() here?
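
With interrupts off this could simply be (sketch):

	struct mbm_pmu *pmu = __this_cpu_read(mbm_pmu);

	if (pmu) {
		pmu->n_active++;
		list_add_tail(&event->active_entry, &pmu->active_list);
		if (pmu->n_active == 1)
			mbm_start_hrtimer(pmu);
	}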

> + struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
> +
> + if (pmu) {
> + if (pmu->n_active == 0)
> + mbm_hrtimer_init(pmu);

That pmu timer wants to be initialized when the pmu is initialized.
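
I.e. do it once in intel_cqm_cpu_prepare() where the pmu is set up
(sketch):

	pmu->timer_interval = ms_to_ktime(MBM_TIME_DELTA_MAX);
	mbm_hrtimer_init(pmu);
	per_cpu(mbm_pmu, cpu) = pmu;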

> + pmu->n_active++;
> + list_add_tail(&event->active_entry,
> + &pmu->active_list);
> + if (pmu->n_active == 1)
> + mbm_start_hrtimer(pmu);
> + }
> + put_cpu();
> + }
> }
>
> static void intel_cqm_event_stop(struct perf_event *event, int mode)
> {
> struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
> + struct mbm_pmu *pmu = __this_cpu_read(mbm_pmu);
>
> if (event->hw.cqm_state & PERF_HES_STOPPED)
> return;
>
> event->hw.cqm_state |= PERF_HES_STOPPED;
>
> - intel_cqm_event_read(event);
> + if (event->attr.config & QOS_L3_OCCUP_EVENT_ID)
> + intel_cqm_event_read(event);
> +
> + if ((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
> + (event->attr.config & QOS_MBM_LOCAL_EVENT_ID))
> + intel_cqm_event_update(event);
>
> if (!--state->rmid_usecnt) {
> state->rmid = 0;
> @@ -1015,8 +1378,18 @@ static void intel_cqm_event_stop(struct perf_event *event, int mode)
> } else {
> WARN_ON_ONCE(!state->rmid);
> }
> +
> + if (pmu) {
> + WARN_ON_ONCE(pmu->n_active <= 0);
> + pmu->n_active--;
> + if (pmu->n_active == 0)
> + mbm_stop_hrtimer(pmu);
> + list_del(&event->active_entry);
> + }
> +
> }
>
> +

Sigh

> static int intel_cqm_event_add(struct perf_event *event, int mode)
> {

> +static ssize_t
> +sliding_window_size_store(struct device *dev,
> + struct device_attribute *attr,
> + const char *buf, size_t count)
> +{
> + unsigned int bytes;
> + int ret;
> +
> + ret = kstrtouint(buf, 0, &bytes);
> + if (ret)
> + return ret;
> +
> + mutex_lock(&cache_mutex);
> + if (bytes > 0 && bytes <= MBM_FIFO_SIZE_MAX)
> + mbm_window_size = bytes;
> + else
> + bytes = MBM_FIFO_SIZE_MIN;

What kind of twisted logic is that?
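
If out-of-range input is invalid, reject it instead of silently
clamping and overwriting the local variable, e.g. (whether
MBM_FIFO_SIZE_MIN is the right lower bound is a separate question):

	if (bytes < MBM_FIFO_SIZE_MIN || bytes > MBM_FIFO_SIZE_MAX)
		return -EINVAL;

	mutex_lock(&cache_mutex);
	mbm_window_size = bytes;
	mutex_unlock(&cache_mutex);

	return count;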

> +
> + mutex_unlock(&cache_mutex);
> +
> + return count;
> +}
> +
> static DEVICE_ATTR_RW(max_recycle_threshold);
> +static DEVICE_ATTR_RW(sliding_window_size);
>
> static struct attribute *intel_cqm_attrs[] = {
> &dev_attr_max_recycle_threshold.attr,
> + &dev_attr_sliding_window_size.attr,
> NULL,
> };
>
> @@ -1241,16 +1699,17 @@ static inline void cqm_pick_event_reader(int cpu)
>
> for_each_cpu(i, &cqm_cpumask) {
> if (phys_id == topology_physical_package_id(i))
> - return; /* already got reader for this socket */
> + return; /* already got reader for this socket */

More random crap.

> }
>
> cpumask_set_cpu(cpu, &cqm_cpumask);
> }
>
> -static void intel_cqm_cpu_prepare(unsigned int cpu)
> +static int intel_cqm_cpu_prepare(unsigned int cpu)
> {
> struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
> struct cpuinfo_x86 *c = &cpu_data(cpu);
> + struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
>
> state->rmid = 0;
> state->closid = 0;
> @@ -1258,12 +1717,27 @@ static void intel_cqm_cpu_prepare(unsigned int cpu)
>
> WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
> WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
> +
> + if ((!pmu) && (cqm_mbm)) {
> + pmu = kzalloc_node(sizeof(*mbm_pmu), GFP_KERNEL, NUMA_NO_NODE);
> + if (!pmu)
> + return -ENOMEM;
> + spin_lock_init(&pmu->lock);
> + INIT_LIST_HEAD(&pmu->active_list);
> + pmu->pmu = &intel_cqm_pmu;
> + pmu->n_active = 0;
> + pmu->timer_interval = ms_to_ktime(MBM_TIME_DELTA_MAX);
> + per_cpu(mbm_pmu, cpu) = pmu;
> + per_cpu(mbm_pmu_to_free, cpu) = NULL;

So that mbm_pmu_to_free per-cpu variable is NULL already and never
becomes anything other than NULL. What's the purpose of setting it to NULL
again? And what's the purpose of it in general?

> + }
> + return 0;
> }
>
> static void intel_cqm_cpu_exit(unsigned int cpu)
> {
> int phys_id = topology_physical_package_id(cpu);
> int i;
> + struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
>
> /*
> * Is @cpu a designated cqm reader?
> @@ -1280,6 +1754,13 @@ static void intel_cqm_cpu_exit(unsigned int cpu)
> break;
> }
> }
> +
> + /* cancel overflow polling timer for CPU */
> + if (pmu)
> + mbm_stop_hrtimer(pmu);
> + kfree(mbm_local);
> + kfree(mbm_total);

So this frees the module global storage if one cpu is unplugged and
leaves the pointer initialized so the rest of the code can happily
dereference it.

> +
> }
>
> static int intel_cqm_cpu_notifier(struct notifier_block *nb,
> @@ -1289,7 +1770,7 @@ static int intel_cqm_cpu_notifier(struct notifier_block *nb,
>
> switch (action & ~CPU_TASKS_FROZEN) {
> case CPU_UP_PREPARE:
> - intel_cqm_cpu_prepare(cpu);
> + return intel_cqm_cpu_prepare(cpu);
> break;
> case CPU_DOWN_PREPARE:
> intel_cqm_cpu_exit(cpu);
> @@ -1305,17 +1786,74 @@ static int intel_cqm_cpu_notifier(struct notifier_block *nb,
> static const struct x86_cpu_id intel_cqm_match[] = {
> { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
> {}
> + }, intel_mbm_match[] = {

What's wrong with having a separate

static const struct x86_cpu_id intel_mbm_match[] = {

> + { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL },
> + {}
> };

That again would make the change obvious, but that's not on your agenda.
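
I.e.:

static const struct x86_cpu_id intel_mbm_match[] = {
	{ .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL },
	{}
};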

>
> static int __init intel_cqm_init(void)
> {
> char *str, scale[20];
> - int i, cpu, ret;
> + int i = 0, cpu, ret;

Initializing i is required because?

> - if (!x86_match_cpu(intel_cqm_match))
> + if (!x86_match_cpu(intel_cqm_match) &&
> + (!x86_match_cpu(intel_mbm_match)))
> return -ENODEV;
>
> cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
> + cqm_max_rmid = boot_cpu_data.x86_cache_max_rmid;
> +
> + if (x86_match_cpu(intel_cqm_match)) {
> + cqm_llc_occ = true;
> + intel_cqm_events_group.attrs = intel_cmt_events_attr;
> + } else
> + cqm_llc_occ = false;

I see you do not trust the compiler to put cqm_llc_occ into the BSS
section, nor the kernel to zero it out properly.

> +
> + if (x86_match_cpu(intel_mbm_match)) {
> + u32 mbm_scale_rounded = 0;
> +
> + cqm_mbm = true;
> + cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
> + /*
> + * MBM counter values are in bytes. To conver this to MB/sec,
> + * we scale the MBM scale factor by 1000. Another 1000 factor
> + * scaling down is done
> + * after reading the counter val i.e. in the function
> + * __rmid_read_mbm()

So the event attribute string is in KB/sec and at some other place you
convert it to MB/sec? There seems to be some hidden logic for this,
but that's definitely not decodable for me.

> + */
> + mbm_scale_rounded = (cqm_l3_scale + 500) / 1000;

div_round_up
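
The kernel already has helpers for this, depending on which rounding is
actually intended (round to nearest vs. round up):

	mbm_scale_rounded = DIV_ROUND_CLOSEST(cqm_l3_scale, 1000);
	/* or */
	mbm_scale_rounded = DIV_ROUND_UP(cqm_l3_scale, 1000);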

> + cqm_max_rmid = boot_cpu_data.x86_cache_max_rmid;
> + snprintf(scale, sizeof(scale), "%u", mbm_scale_rounded);
> + str = kstrdup(scale, GFP_KERNEL);
> + if (!str) {
> + ret = -ENOMEM;
> + goto out;
> + }
> +
> + if (cqm_llc_occ)
> + intel_cqm_events_group.attrs =
> + intel_cmt_mbm_events_attr;

And this line break is required because?

> + else
> + intel_cqm_events_group.attrs = intel_mbm_events_attr;
> +
> + event_attr_intel_cqm_llc_local_bw_scale.event_str
> + = event_attr_intel_cqm_llc_total_bw_scale.event_str = str;

You really love to write unreadable crap. This is neither perl nor the
obfuscated c-code contest. What's wrong with two separate lines which
assign 'str' to each of the event attributes?
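
I.e.:

	event_attr_intel_cqm_llc_local_bw_scale.event_str = str;
	event_attr_intel_cqm_llc_total_bw_scale.event_str = str;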

> + mbm_local = kzalloc_node(sizeof(struct sample) *
> + (cqm_max_rmid + 1) * MBM_SOCKET_MAX,

Calculating the size in a separate line with a proper variable would
make it readable.
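
For example:

	size_t mbm_size = (cqm_max_rmid + 1) * MBM_SOCKET_MAX *
			  sizeof(struct sample);

	mbm_local = kzalloc_node(mbm_size, GFP_KERNEL, NUMA_NO_NODE);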

> + GFP_KERNEL, NUMA_NO_NODE);
> + if (!mbm_local) {

Leaks str

> + ret = -ENOMEM;
> + goto out;
> + }
> + mbm_total = kzalloc_node(sizeof(struct sample) *
> + (cqm_max_rmid + 1) * MBM_SOCKET_MAX,
> + GFP_KERNEL, NUMA_NO_NODE);
> + if (!mbm_total) {
> + ret = -ENOMEM;

Leaks str and mbm_local

> + goto out;
> + }

What about sticking that mess into a separate function?

> + } else
> + cqm_mbm = false;

Sigh.

> /*
> * It's possible that not all resources support the same number
> @@ -1328,44 +1866,48 @@ static int __init intel_cqm_init(void)
> */
> cpu_notifier_register_begin();
>
> - for_each_online_cpu(cpu) {
> - struct cpuinfo_x86 *c = &cpu_data(cpu);
> + if (cqm_llc_occ) {
> + for_each_online_cpu(cpu) {
> + struct cpuinfo_x86 *c = &cpu_data(cpu);
>
> - if (c->x86_cache_max_rmid < cqm_max_rmid)
> - cqm_max_rmid = c->x86_cache_max_rmid;
> + if (c->x86_cache_max_rmid < cqm_max_rmid)
> + cqm_max_rmid = c->x86_cache_max_rmid;
>
> - if (c->x86_cache_occ_scale != cqm_l3_scale) {
> - pr_err("Multiple LLC scale values, disabling\n");
> - ret = -EINVAL;
> - goto out;
> + if (c->x86_cache_occ_scale != cqm_l3_scale) {
> + pr_err("Multiple LLC scale values, disabling\n");
> + ret = -EINVAL;

More memory leaks

> + goto out;
> + }
> }
> - }
>
> - /*
> - * A reasonable upper limit on the max threshold is the number
> - * of lines tagged per RMID if all RMIDs have the same number of
> - * lines tagged in the LLC.
> - *
> - * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
> - */
> - __intel_cqm_max_threshold =
> + /*
> + * A reasonable upper limit on the max threshold is the number
> + * of lines tagged per RMID if all RMIDs have the same number of
> + * lines tagged in the LLC.
> + *
> + * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
> + */
> + __intel_cqm_max_threshold =
> boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
>
> - snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
> - str = kstrdup(scale, GFP_KERNEL);
> - if (!str) {
> - ret = -ENOMEM;
> - goto out;
> - }
> + snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
> + str = kstrdup(scale, GFP_KERNEL);
> + if (!str) {
> + ret = -ENOMEM;

Ditto.

> + goto out;
> + }
>
> - event_attr_intel_cqm_llc_scale.event_str = str;
> + event_attr_intel_cqm_llc_scale.event_str = str;
> + }
>
> ret = intel_cqm_setup_rmid_cache();
> if (ret)
> goto out;
>
> for_each_online_cpu(i) {
> - intel_cqm_cpu_prepare(i);
> + ret = intel_cqm_cpu_prepare(i);
> + if (ret)
> + goto out;

And that leaves even more half initialized stuff around.

> cqm_pick_event_reader(i);
> }

I fear your assessment in

http://lkml.kernel.org/r/06033C969873E840BDE9FC9B584F66B58BB89D@ORSMSX116.amr.corp.intel.com

is slightly wrong.

That's a completely convoluted mess. Aside of that its broken all over
the place.

This wants to be split up into preparatory patches which remodel the
existing code and move out cqm stuff into helper functions, so the mbm
stuff can be plugged in with its own helpers nicely.

Thanks,

tglx

2015-08-03 17:51:15

by Juvva, Kanaka D

Subject: RE: [PATCH v2] perf,x86: add Intel Memory Bandwidth Monitoring (MBM) PMU

Thomas,

Acknowledged. I'll update the patch and generate separate patch(es) wherever cqm changes
are required. One thing: checkpatch.pl gave errors on the current cqm code, so I had to break
lines because it complained about some line(s) exceeding 80 characters.

Regards.
-Kanaka

> -----Original Message-----
> From: Thomas Gleixner [mailto:[email protected]]
> Sent: Tuesday, July 28, 2015 12:59 AM
> To: Kanaka Juvva
> Cc: Juvva, Kanaka D; Williamson, Glenn P; Fleming, Matt; Auld, Will; Andi Kleen;
> LKML; Luck, Tony; Peter Zijlstra; Tejun Heo; Autee, Priya V; [email protected];
> [email protected]; H. Peter Anvin; Shivappa, Vikas
> Subject: Re: [PATCH v2] perf,x86: add Intel Memory Bandwidth Monitoring
> (MBM) PMU
>
> On Tue, 21 Jul 2015, Kanaka Juvva wrote:
> > diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
> > b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
> > index 1880761..dc1ce58 100644
> > --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
> > +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
> > @@ -12,10 +12,20 @@
> > #define MSR_IA32_PQR_ASSOC 0x0c8f
> > #define MSR_IA32_QM_CTR 0x0c8e
> > #define MSR_IA32_QM_EVTSEL 0x0c8d
> > +#define MAX_MBM_CNTR 0xffffff
>
> Can we please have a consistent name space MBM_... here?
>
> > +#define MBM_SOCKET_MAX 8
> > +#define MBM_TIME_DELTA_MAX 1000
> > +#define MBM_TIME_DELTA_MIN 1000
> > +#define MBM_SCALING_FACTOR 1000
> > +#define MBM_SCALING_HALF (MBM_SCALING_FACTOR/2)
> > +#define MBM_FIFO_SIZE_MIN 10
> > +#define MBM_FIFO_SIZE_MAX 300
> >
> > -static u32 cqm_max_rmid = -1;
> > -static unsigned int cqm_l3_scale; /* supposedly cacheline size */
> >
> > +static u32 cqm_max_rmid = -1;
> > +static unsigned int cqm_l3_scale;/* supposedly cacheline size */
>
> Pointless white space change
>
> > +static unsigned mbm_window_size = MBM_FIFO_SIZE_MIN; static bool
> > +cqm_llc_occ, cqm_mbm;
> > /**
> > * struct intel_pqr_state - State cache for the PQR MSR
> > * @rmid: The cached Resource Monitoring ID
> > @@ -42,6 +52,34 @@ struct intel_pqr_state {
> > * interrupts disabled, which is sufficient for the protection.
> > */
> > static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
> > +static DEFINE_PER_CPU(struct mbm_pmu *, mbm_pmu); static
> > +DEFINE_PER_CPU(struct mbm_pmu *, mbm_pmu_to_free);
> > +
> > +/*
> > + * mbm pmu is used for storing mbm_local and mbm_total
> > + * events
> > + */
> > +struct mbm_pmu {
> > + spinlock_t lock;
>
> What is this lock protecting and how does it nest into perf locking?
>
> > + int n_active; /* number of active events */
>
> If you want to document your struct members, please use kerneldoc and not
> these hard to read tail comments.
>
> > + struct list_head active_list;
> > + struct pmu *pmu; /* pointer to mbm perf_event */
> > + ktime_t timer_interval; /* in ktime_t unit */
> > + struct hrtimer hrtimer;
> > +};
> > +
> > +struct sample {
> > + u64 bytes; /* mbm counter value read*/
> > + u64 cum_avg; /* current running average of bandwidth */
> > + ktime_t prev_time;
> > + u64 index; /* current sample no */
> > + u32 mbmfifo[MBM_FIFO_SIZE_MAX]; /* window of last n bw values */
> > + u32 fifoin; /* mbmfifo in counter for sliding window */
> > + u32 fifoout; /* mbmfifo out counter for sliding window */
>
> So above you use a proper layout which is actually readable (aside from the tail
> comments). Can you please stick with a consistent coding style?
>
> > +};
> > +
> > +struct sample *mbm_total; /* curent stats for mbm_total events */
> > +struct sample *mbm_local; /* current stats for mbm_local events */
>
> static ? And please get rid of these tail comments.
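
For reference, the kerneldoc form asked for here would look roughly like this
for struct sample (a sketch only; the field list is taken from the quoted hunk
and the descriptions just restate the tail comments):

        /**
         * struct sample - per-RMID bookkeeping for one MBM event type
         * @bytes:      last raw counter value read from MSR_IA32_QM_CTR
         * @cum_avg:    current running average of the bandwidth
         * @prev_time:  timestamp of the previous counter read
         * @index:      number of samples processed so far
         * @mbmfifo:    sliding window of the last n bandwidth values
         * @fifoin:     write index into @mbmfifo
         * @fifoout:    read index into @mbmfifo
         */
        struct sample {
                u64     bytes;
                u64     cum_avg;
                ktime_t prev_time;
                u64     index;
                u32     mbmfifo[MBM_FIFO_SIZE_MAX];
                u32     fifoin;
                u32     fifoout;
        };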
>
> > /*
> > * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
> > @@ -66,6 +104,9 @@ static cpumask_t cqm_cpumask;
> > #define RMID_VAL_UNAVAIL (1ULL << 62)
> >
> > #define QOS_L3_OCCUP_EVENT_ID (1 << 0)
> > +#define QOS_MBM_TOTAL_EVENT_ID (1 << 1)
>
> Please use tabs not spaces.
>
> > +#define QOS_MBM_LOCAL_EVENT_ID1 0x3
>
> What's ID1, for heaven's sake? Looks like
>
> (QOS_L3_OCCUP_EVENT_ID | QOS_MBM_TOTAL_EVENT_ID)
>
> So this wants a descriptive ID name and a comment.
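
If the expansion really is (QOS_L3_OCCUP_EVENT_ID | QOS_MBM_TOTAL_EVENT_ID),
something along these lines would at least document it (the name is purely
illustrative):

        /* EVTSEL encoding 0x3: LLC occupancy and total MBM selected together */
        #define QOS_MBM_TOTAL_OCCUP_EVENT_ID \
                (QOS_L3_OCCUP_EVENT_ID | QOS_MBM_TOTAL_EVENT_ID)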
>
> > +#define QOS_MBM_LOCAL_EVENT_ID (1 << 2)
> >
> > #define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID
> >
> > @@ -90,7 +131,8 @@ static u32 intel_cqm_rotation_rmid;
> > */
> > static inline bool __rmid_valid(u32 rmid) {
> > - if (!rmid || rmid == INVALID_RMID)
> > + WARN_ON_ONCE(rmid > cqm_max_rmid);
> > + if (!rmid || (rmid == INVALID_RMID) || (rmid > cqm_max_rmid))
> > return false;
>
> How is that related to this patch? Looks like a fix or prerequisite change to the
> existing code.
>
> > return true;
> > @@ -125,6 +167,7 @@ struct cqm_rmid_entry {
> > enum rmid_recycle_state state;
> > struct list_head list;
> > unsigned long queue_time;
> > + bool config;
> > };
> >
> > /*
> > @@ -176,6 +219,17 @@ static inline struct cqm_rmid_entry
> *__rmid_entry(u32 rmid)
> > return entry;
> > }
> >
> > +static void mbm_reset_stats(u32 rmid) {
> > + u32 vrmid = topology_physical_package_id(smp_processor_id()) *
> > + cqm_max_rmid + rmid;
>
> Can you please explain that magic calculation? It's far from obvious why this
> would be correct.
>
> > +
> > + if ((!cqm_mbm) || (!mbm_local) || (!mbm_total))
> > + return;
> > + memset(&mbm_local[vrmid], 0, sizeof(struct sample));
> > + memset(&mbm_total[vrmid], 0, sizeof(struct sample)); }
> > +
> > /*
> > * Returns < 0 on fail.
> > *
> > @@ -190,8 +244,10 @@ static u32 __get_rmid(void)
> > if (list_empty(&cqm_rmid_free_lru))
> > return INVALID_RMID;
> >
> > - entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry,
> list);
> > + entry = list_first_entry(&cqm_rmid_free_lru,
> > + struct cqm_rmid_entry, list);
>
> And the point of this change is?
>
> > list_del(&entry->list);
> > + mbm_reset_stats(entry->rmid);
> >
> > return entry->rmid;
> > }
> > @@ -207,6 +263,7 @@ static void __put_rmid(u32 rmid)
> >
> > entry->queue_time = jiffies;
> > entry->state = RMID_YOUNG;
> > + mbm_reset_stats(rmid);
> >
> > list_add_tail(&entry->list, &cqm_rmid_limbo_lru); } @@ -232,6
> > +289,8 @@ static int intel_cqm_setup_rmid_cache(void)
> >
> > INIT_LIST_HEAD(&entry->list);
> > entry->rmid = r;
> > + entry->config = false;
> > +
> > cqm_rmid_ptrs[r] = entry;
> >
> > list_add_tail(&entry->list, &cqm_rmid_free_lru); @@ -254,6
> +313,8
> > @@ fail:
> > kfree(cqm_rmid_ptrs[r]);
> >
> > kfree(cqm_rmid_ptrs);
> > + kfree(mbm_local);
> > + kfree(mbm_total);
> > return -ENOMEM;
> > }
> >
> > @@ -403,9 +464,11 @@ static void __intel_cqm_event_count(void *info);
> > static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) {
> > struct perf_event *event;
> > +
>
> Superfluous newline.
>
> > struct list_head *head = &group->hw.cqm_group_entry;
> > u32 old_rmid = group->hw.cqm_rmid;
> >
> > +
>
> Another one. Sigh!
>
> > lockdep_assert_held(&cache_mutex);
> >
> > /*
> > @@ -494,6 +557,166 @@ static bool intel_cqm_sched_in_event(u32 rmid)
> > return false;
> > }
> >
> > +static u32 mbm_fifo_sum_lastn_out(int rmid, bool is_localbw)
>
> Random space after u32
>
> > +{
> > + u32 val = 0, i, j, index;
> > +
> > + if (is_localbw) {
> > + if (++mbm_local[rmid].fifoout >= mbm_window_size)
> > + mbm_local[rmid].fifoout = 0;
> > + index = mbm_local[rmid].fifoout;
> > + for (i = 0; i < mbm_window_size - 1; i++) {
> > + if (index + i >= mbm_window_size)
> > + j = index + i - mbm_window_size;
> > + else
> > + j = index + i;
> > + val += mbm_local[rmid].mbmfifo[j];
> > + }
> > + return val;
> > + }
> > +
> > + if (++mbm_total[rmid].fifoout >= mbm_window_size)
> > + mbm_total[rmid].fifoout = 0;
> > + index = mbm_total[rmid].fifoout;
> > + for (i = 0; i < mbm_window_size - 1; i++) {
> > + if (index + i >= mbm_window_size)
> > + j = index + i - mbm_window_size;
> > + else
> > + j = index + i;
> > + val += mbm_total[rmid].mbmfifo[j];
> > + }
>
> The concept of helper functions to avoid code duplication is
> definitely applicable here.
>
> > + return val;
> > +}
> > +
> > +static int mbm_fifo_in(int rmid, u32 val, bool is_localbw)
>
> Random space after int.
>
> > +{
> > + if (is_localbw) {
> > + mbm_local[rmid].mbmfifo[mbm_local[rmid].fifoin] = val;
> > + if (++mbm_local[rmid].fifoin >= mbm_window_size)
> > + mbm_local[rmid].fifoin = 0;
> > + } else {
> > + mbm_total[rmid].mbmfifo[mbm_total[rmid].fifoin] = val;
> > + if (++mbm_total[rmid].fifoin >= mbm_window_size)
> > + mbm_total[rmid].fifoin = 0;
> > + }
>
> static void mbm_fifo_in(struct sample *, u32 val)
>
> would get rid of code duplication.
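
A deduplicated pair of helpers operating on a struct sample pointer could look
like this (untested sketch; the index handling mirrors the quoted code):

        static u32 mbm_fifo_sum_lastn_out(struct sample *s)
        {
                u32 val = 0, i, idx;

                if (++s->fifoout >= mbm_window_size)
                        s->fifoout = 0;

                /* Sum the last n-1 entries, wrapping around the ring buffer */
                for (i = 0; i < mbm_window_size - 1; i++) {
                        idx = s->fifoout + i;
                        if (idx >= mbm_window_size)
                                idx -= mbm_window_size;
                        val += s->mbmfifo[idx];
                }
                return val;
        }

        static void mbm_fifo_in(struct sample *s, u32 val)
        {
                /* Store the new bandwidth value and advance the write index */
                s->mbmfifo[s->fifoin] = val;
                if (++s->fifoin >= mbm_window_size)
                        s->fifoin = 0;
        }

The callers then pass &mbm_local[vrmid] or &mbm_total[vrmid] and the bool
parameter disappears.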
>
> > + return 0;
>
> What's the purpose of having a return value here?
>
> > +}
> > +
> > +/*
> > + * __rmid_read_mbm checks whether it is LOCAL or GLOBAL MBM event and
> reads
> > + * its MSR counter. if (MSR current value < MSR previous value) it is an
> > + * overflow and overflow is handled. If MSR is read within last 100ms,
> > + * then the value is ignored; this will suppress small deltas. We don't
> > + * process MBM samples that are within 100ms. Bandwidth is calculated as:
> > + * bandwidth = difference of last two msr counter values/time difference.
> > + * cum_avg = Running Average bandwidth of last 'n' samples that are
> processed
> > + * Sliding window is used to save the last 'n' samples. Hence,
> > + * n = sliding_window_size
> > + * cum_avg is scaled down by a factor MBM_SCALING_FACTOR and rounded
> to nearest
> > + * integer. User interface reads the BW in MB/sec.
> > + * Rounding algorithm : (X + 0.5):
> > + * where X is scaled BW value. To avoid floating point arithmetic :
> > + * BW- unscaled value
> > + * (BW + MBM_SCALING_HALF)/MBM_SCALING_FACTOR is computed which
> gives the
> > + * scaled bandwidth.
>
> This is completely unreadable.
>
> > + */
> > +static u64 __rmid_read_mbm(unsigned int rmid, bool read_mbm_local)
> > +{
> > + u64 val, tmp, diff_time, cma, bytes, index;
> > + bool overflow = false;
> > + ktime_t cur_time;
> > + u32 tmp32 = rmid;
> > + u32 vrmid = topology_physical_package_id(smp_processor_id()) *
> > + cqm_max_rmid + rmid;
> > +
> > + rmid = vrmid;
>
> This is completely backwards.
>
> tmp32 = rmid;
> rmid = vrmid;
> do_stuff(rmid);
> rmid = tmp32;
> do_other_stuff(rmid);
>
> Why can't you use vrmid for do_stuff() and leave rmid alone? Just
> because it would make the code simpler to read?
>
> > + cur_time = ktime_get_real();
>
> Why would this operate on wall time and not on clock monotonic time?
>
> > + if (read_mbm_local) {
> > + cma = mbm_local[rmid].cum_avg;
> > + diff_time = ktime_ms_delta(cur_time,
> > + mbm_local[rmid].prev_time);
> > + if (diff_time < 100)
> > + return cma;
> > + mbm_local[rmid].prev_time = ktime_set(0,
> > + (unsigned long)ktime_to_ns(cur_time));
> > + bytes = mbm_local[rmid].bytes;
> > + index = mbm_local[rmid].index;
> > + rmid = tmp32;
> > + wrmsr(MSR_IA32_QM_EVTSEL, QOS_MBM_LOCAL_EVENT_ID1,
> rmid);
>
> Why is this using ID1?
>
> > + } else {
> > + cma = mbm_total[rmid].cum_avg;
> > + diff_time = ktime_ms_delta(cur_time,
> > + mbm_total[rmid].prev_time);
> > + if (diff_time < 100)
> > + return cma;
> > + mbm_total[rmid].prev_time = ktime_set(0,
> > + (unsigned long)ktime_to_ns(cur_time));
> > + bytes = mbm_total[rmid].bytes;
> > + index = mbm_total[rmid].index;
> > + rmid = tmp32;
> > + wrmsr(MSR_IA32_QM_EVTSEL, QOS_MBM_TOTAL_EVENT_ID,
> rmid);
> > + }
>
> Random space after tab.
>
> > + rdmsrl(MSR_IA32_QM_CTR, val);
> > + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> > + return val;
> > +
> > + tmp = val;
>
> You really have a penchant for random storage. tmp gets assigned to
> mbm_*[rmid].bytes further down. This makes no sense at all.
>
> > + if (val < bytes) /* overflow handle it */ {
>
> These tail comments are crap.
>
> > + val = MAX_MBM_CNTR - bytes + val;
> > + overflow = true;
> > + } else
> > + val = val - bytes;
>
> That else clause is missing braces.
>
> > + if (diff_time < MBM_TIME_DELTA_MAX - MBM_TIME_DELTA_MIN)
> > + val = (val * MBM_TIME_DELTA_MAX) / diff_time;
>
> That resolves to
>
> if (diff_time < 0)
>
> Which is always false because diff_time is u64.
>
> What's the logic behind this?
>
> > +
> > + if ((diff_time > MBM_TIME_DELTA_MAX) && (!cma))
> > + /* First sample, we can't use the time delta */
> > + diff_time = MBM_TIME_DELTA_MAX;
>
> And this?
>
> > +
> > + rmid = vrmid;
>
> More obfuscation
>
> > + if ((diff_time <= MBM_TIME_DELTA_MAX + MBM_TIME_DELTA_MIN)
> ||
> > + overflow) {
>
> Again, that resolves to
>
> if (overflow)
>
> And even if MBM_TIME_DELTA_MAX were not equal to MBM_TIME_DELTA_MIN,
> then this wants some explanation about
>
> MBM_TIME_DELTA_MAX - MBM_TIME_DELTA_MIN
>
> versus
>
> MBM_TIME_DELTA_MAX + MBM_TIME_DELTA_MIN
>
> and the whole logic behind this time delta magic, if there is any.
>
> > + int bw, ret;
> > +
> > + if (index & (index < mbm_window_size))
> > + val = cma * MBM_SCALING_FACTOR + val / index -
> > + cma / index;
> > +
> > + val = (val + MBM_SCALING_HALF) / MBM_SCALING_FACTOR;
> > + if (index >= mbm_window_size) {
>
> These conditionals along with the magic math are undocumented.
>
> > + /*
> > + * Compute the sum of recent n-1 samples and slide the
> > + * window by 1
> > + */
> > + ret = mbm_fifo_sum_lastn_out(rmid, read_mbm_local);
> > + /* recalculate the running average with current bw */
> > + ret = (ret + val) / mbm_window_size;
> > + if (ret < 0)
>
> The average of positive numbers becomes negative?
>
> > + ret = 0;
> > + bw = val;
> > + val = ret;
>
> Your back-and-forth assignments of random variables are not making the
> code any more readable.
>
> > + } else
> > + bw = val;
> > + /* save the recent bw in fifo */
> > + mbm_fifo_in(rmid, bw, read_mbm_local);
> > +
> > + if (read_mbm_local) {
>
> Oh well. You have this read_mbm_local conditional 3 times in this
> function. Why don't you make the function oblivious of this and hand
> in a pointer to mbm_local or mbm_total? Would be too simple and not
> obfuscated enough, right?
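
A small helper that selects the bookkeeping structure once (hypothetical name,
reusing the vrmid calculation quoted above) would make that possible:

        static struct sample *mbm_get_sample(u32 rmid, bool local)
        {
                /* Index into the per-socket arrays, as in the quoted code */
                u32 vrmid = topology_physical_package_id(smp_processor_id()) *
                            cqm_max_rmid + rmid;

                return local ? &mbm_local[vrmid] : &mbm_total[vrmid];
        }

__rmid_read_mbm() could then take the struct sample pointer plus the event id,
and all three read_mbm_local branches collapse.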
>
> > + mbm_local[rmid].index++;
> > + mbm_local[rmid].cum_avg = val;
> > + mbm_local[rmid].bytes = tmp;
> > + mbm_local[rmid].prev_time = ktime_set(0,
> > + (unsigned long)ktime_to_ns(cur_time));
> > + } else {
> > + mbm_total[rmid].index++;
> > + mbm_total[rmid].cum_avg = val;
> > + mbm_total[rmid].bytes = tmp;
> > + mbm_total[rmid].prev_time = ktime_set(0,
> > + (unsigned long)ktime_to_ns(cur_time));
> > + }
> > + return val;
> > + }
> > + return cma;
> > +}
> > +
> > /*
> > * Initially use this constant for both the limbo queue time and the
> > * rotation timer interval, pmu::hrtimer_interval_ms.
> > @@ -568,7 +791,8 @@ static bool intel_cqm_rmid_stabilize(unsigned int
> *available)
> > /*
> > * Test whether an RMID is free for each package.
> > */
> > - on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
> > + if (entry->config)
>
> This entry->config change wants to be a separate patch. It's
> completely non-obvious how this changes the behaviour of the existing code.
>
> > + on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL,
> true);
> >
> > list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
> > /*
> > @@ -846,8 +1070,9 @@ static void intel_cqm_setup_event(struct perf_event
> *event,
> > struct perf_event **group)
> > {
> > struct perf_event *iter;
> > - bool conflict = false;
> > +
> > u32 rmid;
> > + bool conflict = false;
>
> Fits the overall picture of this patch: Sloppy
>
> > list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
> > rmid = iter->hw.cqm_rmid;
> > @@ -875,6 +1100,40 @@ static void intel_cqm_setup_event(struct
> perf_event *event,
> > event->hw.cqm_rmid = rmid;
> > }
> >
> > +static void intel_cqm_event_update(struct perf_event *event)
> > +{
> > + unsigned int rmid;
> > + u64 val = 0;
> > +
> > + /*
> > + * Task events are handled by intel_cqm_event_count().
> > + */
> > + if (event->cpu == -1)
> > + return;
> > +
> > + rmid = event->hw.cqm_rmid;
> > + if (!__rmid_valid(rmid))
> > + return;
> > +
> > + switch (event->attr.config) {
> > + case QOS_MBM_TOTAL_EVENT_ID:
> > + val = __rmid_read_mbm(rmid, false);
> > + break;
> > + case QOS_MBM_LOCAL_EVENT_ID:
> > + val = __rmid_read_mbm(rmid, true);
> > + break;
> > + default:
> > + break;
>
> return?
>
> > + }
> > + /*
> > + * Ignore this reading on error states and do not update the value.
> > + */
> > + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> > + return;
> > +
> > + local64_set(&event->count, val);
> > +}
> > +
> > static void intel_cqm_event_read(struct perf_event *event)
> > {
> > unsigned long flags;
> > @@ -887,6 +1146,12 @@ static void intel_cqm_event_read(struct perf_event
> *event)
> > if (event->cpu == -1)
> > return;
> >
> > + if ((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
> > + (event->attr.config & QOS_MBM_LOCAL_EVENT_ID))
> > + intel_cqm_event_update(event);
> > +
> > + if (!(event->attr.config & QOS_L3_OCCUP_EVENT_ID))
> > + return;
> > raw_spin_lock_irqsave(&cache_lock, flags);
> > rmid = event->hw.cqm_rmid;
> >
> > @@ -906,6 +1171,28 @@ out:
> > raw_spin_unlock_irqrestore(&cache_lock, flags);
> > }
> >
> > +static void __intel_cqm_event_total_bw_count(void *info)
> > +{
> > + struct rmid_read *rr = info;
> > + u64 val;
> > +
> > + val = __rmid_read_mbm(rr->rmid, false);
> > + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> > + return;
> > + atomic64_add(val, &rr->value);
> > +}
> > +
> > +static void __intel_cqm_event_local_bw_count(void *info)
> > +{
> > + struct rmid_read *rr = info;
> > + u64 val;
> > +
> > + val = __rmid_read_mbm(rr->rmid, true);
> > + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> > + return;
> > + atomic64_add(val, &rr->value);
>
> You're really a fan of copy and paste.
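
The two callbacks only differ in the bool passed to __rmid_read_mbm(), so a
common helper (sketch) removes the duplication:

        static void __intel_cqm_event_mbm_count(void *info, bool local)
        {
                struct rmid_read *rr = info;
                u64 val = __rmid_read_mbm(rr->rmid, local);

                if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
                        return;
                atomic64_add(val, &rr->value);
        }

        static void __intel_cqm_event_total_bw_count(void *info)
        {
                __intel_cqm_event_mbm_count(info, false);
        }

        static void __intel_cqm_event_local_bw_count(void *info)
        {
                __intel_cqm_event_mbm_count(info, true);
        }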
>
> > +}
> > +
> > static void __intel_cqm_event_count(void *info)
> > {
> > struct rmid_read *rr = info;
> > @@ -967,7 +1254,21 @@ static u64 intel_cqm_event_count(struct perf_event
> *event)
> > if (!__rmid_valid(rr.rmid))
> > goto out;
> >
> > - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
> > + switch (event->attr.config) {
> > + case QOS_L3_OCCUP_EVENT_ID:
> > + on_each_cpu_mask(&cqm_cpumask,
> __intel_cqm_event_count, &rr, 1);
> > + break;
> > + case QOS_MBM_TOTAL_EVENT_ID:
> > + on_each_cpu_mask(&cqm_cpumask,
> __intel_cqm_event_total_bw_count,
> > + &rr, 1);
> > + break;
> > + case QOS_MBM_LOCAL_EVENT_ID:
> > + on_each_cpu_mask(&cqm_cpumask,
> __intel_cqm_event_local_bw_count,
> > + &rr, 1);
> > + break;
> > + default:
> > + break;
>
> So there is nothing to do and you happily proceed?
>
> > + }
> >
> > raw_spin_lock_irqsave(&cache_lock, flags);
> > if (event->hw.cqm_rmid == rr.rmid)
> > @@ -977,6 +1278,39 @@ out:
> > return __perf_event_count(event);
> > }
>
> > static void intel_cqm_event_start(struct perf_event *event, int mode)
> > {
> > struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
> > @@ -995,19 +1329,48 @@ static void intel_cqm_event_start(struct
> perf_event *event, int mode)
> > }
> >
> > state->rmid = rmid;
> > + if (event->attr.config & QOS_L3_OCCUP_EVENT_ID) {
> > + struct cqm_rmid_entry *entry;
> > +
> > + entry = __rmid_entry(rmid);
> > + entry->config = true;
> > + }
> > wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
> > +
> > + if (((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
> > + (event->attr.config & QOS_MBM_LOCAL_EVENT_ID)) &&
> (cqm_mbm)) {
> > + int cpu = get_cpu();
>
> pmu->start() is called with interrupts disabled. So why do you need
> get_cpu() here?
>
> > + struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
> > +
> > + if (pmu) {
> > + if (pmu->n_active == 0)
> > + mbm_hrtimer_init(pmu);
>
> That pmu timer wants to be initialized when the pmu is initialized.
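
I.e. the hrtimer setup belongs in intel_cqm_cpu_prepare() when the per-cpu pmu
is allocated, so that start()/stop() only arm and cancel it (sketch, fields as
in the quoted allocation hunk):

        spin_lock_init(&pmu->lock);
        INIT_LIST_HEAD(&pmu->active_list);
        pmu->pmu = &intel_cqm_pmu;
        pmu->timer_interval = ms_to_ktime(MBM_TIME_DELTA_MAX);
        /* Set the timer up once here; event start/stop only arm/cancel it */
        mbm_hrtimer_init(pmu);
        per_cpu(mbm_pmu, cpu) = pmu;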
>
> > + pmu->n_active++;
> > + list_add_tail(&event->active_entry,
> > + &pmu->active_list);
> > + if (pmu->n_active == 1)
> > + mbm_start_hrtimer(pmu);
> > + }
> > + put_cpu();
> > + }
> > }
> >
> > static void intel_cqm_event_stop(struct perf_event *event, int mode)
> > {
> > struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
> > + struct mbm_pmu *pmu = __this_cpu_read(mbm_pmu);
> >
> > if (event->hw.cqm_state & PERF_HES_STOPPED)
> > return;
> >
> > event->hw.cqm_state |= PERF_HES_STOPPED;
> >
> > - intel_cqm_event_read(event);
> > + if (event->attr.config & QOS_L3_OCCUP_EVENT_ID)
> > + intel_cqm_event_read(event);
> > +
> > + if ((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
> > + (event->attr.config & QOS_MBM_LOCAL_EVENT_ID))
> > + intel_cqm_event_update(event);
> >
> > if (!--state->rmid_usecnt) {
> > state->rmid = 0;
> > @@ -1015,8 +1378,18 @@ static void intel_cqm_event_stop(struct
> perf_event *event, int mode)
> > } else {
> > WARN_ON_ONCE(!state->rmid);
> > }
> > +
> > + if (pmu) {
> > + WARN_ON_ONCE(pmu->n_active <= 0);
> > + pmu->n_active--;
> > + if (pmu->n_active == 0)
> > + mbm_stop_hrtimer(pmu);
> > + list_del(&event->active_entry);
> > + }
> > +
> > }
> >
> > +
>
> Sigh
>
> > static int intel_cqm_event_add(struct perf_event *event, int mode)
> > {
>
> > +static ssize_t
> > +sliding_window_size_store(struct device *dev,
> > + struct device_attribute *attr,
> > + const char *buf, size_t count)
> > +{
> > + unsigned int bytes;
> > + int ret;
> > +
> > + ret = kstrtouint(buf, 0, &bytes);
> > + if (ret)
> > + return ret;
> > +
> > + mutex_lock(&cache_mutex);
> > + if (bytes > 0 && bytes <= MBM_FIFO_SIZE_MAX)
> > + mbm_window_size = bytes;
> > + else
> > + bytes = MBM_FIFO_SIZE_MIN;
>
> What kind of twisted logic is that?
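
One sane interpretation (sketch; taking MBM_FIFO_SIZE_MIN/MAX as the intended
bounds) is to reject out-of-range values instead of silently resetting a local
variable:

        ret = kstrtouint(buf, 0, &bytes);
        if (ret)
                return ret;

        if (bytes < MBM_FIFO_SIZE_MIN || bytes > MBM_FIFO_SIZE_MAX)
                return -EINVAL;

        mutex_lock(&cache_mutex);
        mbm_window_size = bytes;
        mutex_unlock(&cache_mutex);

        return count;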
>
> > +
> > + mutex_unlock(&cache_mutex);
> > +
> > + return count;
> > +}
> > +
> > static DEVICE_ATTR_RW(max_recycle_threshold);
> > +static DEVICE_ATTR_RW(sliding_window_size);
> >
> > static struct attribute *intel_cqm_attrs[] = {
> > &dev_attr_max_recycle_threshold.attr,
> > + &dev_attr_sliding_window_size.attr,
> > NULL,
> > };
> >
> > @@ -1241,16 +1699,17 @@ static inline void cqm_pick_event_reader(int cpu)
> >
> > for_each_cpu(i, &cqm_cpumask) {
> > if (phys_id == topology_physical_package_id(i))
> > - return; /* already got reader for this socket */
> > + return; /* already got reader for this socket */
>
> More random crap.
>
> > }
> >
> > cpumask_set_cpu(cpu, &cqm_cpumask);
> > }
> >
> > -static void intel_cqm_cpu_prepare(unsigned int cpu)
> > +static int intel_cqm_cpu_prepare(unsigned int cpu)
> > {
> > struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
> > struct cpuinfo_x86 *c = &cpu_data(cpu);
> > + struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
> >
> > state->rmid = 0;
> > state->closid = 0;
> > @@ -1258,12 +1717,27 @@ static void intel_cqm_cpu_prepare(unsigned int
> cpu)
> >
> > WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
> > WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
> > +
> > + if ((!pmu) && (cqm_mbm)) {
> > + pmu = kzalloc_node(sizeof(*mbm_pmu), GFP_KERNEL,
> NUMA_NO_NODE);
> > + if (!pmu)
> > + return -ENOMEM;
> > + spin_lock_init(&pmu->lock);
> > + INIT_LIST_HEAD(&pmu->active_list);
> > + pmu->pmu = &intel_cqm_pmu;
> > + pmu->n_active = 0;
> > + pmu->timer_interval = ms_to_ktime(MBM_TIME_DELTA_MAX);
> > + per_cpu(mbm_pmu, cpu) = pmu;
> > + per_cpu(mbm_pmu_to_free, cpu) = NULL;
>
> So that mbm_pmu_to_free per-cpu variable is NULL already and never
> becomes anything other than NULL. What's the purpose of setting it NULL
> again? And what's the purpose of it in general?
>
> > + }
> > + return 0;
> > }
> >
> > static void intel_cqm_cpu_exit(unsigned int cpu)
> > {
> > int phys_id = topology_physical_package_id(cpu);
> > int i;
> > + struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
> >
> > /*
> > * Is @cpu a designated cqm reader?
> > @@ -1280,6 +1754,13 @@ static void intel_cqm_cpu_exit(unsigned int cpu)
> > break;
> > }
> > }
> > +
> > + /* cancel overflow polling timer for CPU */
> > + if (pmu)
> > + mbm_stop_hrtimer(pmu);
> > + kfree(mbm_local);
> > + kfree(mbm_total);
>
> So this frees the module-global storage if one CPU is unplugged and
> leaves the pointers initialized so the rest of the code can happily
> dereference them.
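
A less destructive exit path (sketch) would only tear down the per-cpu state
and leave the global arrays alone until driver teardown:

        /* Cancel the polling timer and drop only the per-cpu state */
        if (pmu) {
                mbm_stop_hrtimer(pmu);
                per_cpu(mbm_pmu, cpu) = NULL;
                kfree(pmu);
        }
        /* mbm_local/mbm_total are global and must stay allocated here */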
>
> > +
> > }
> >
> > static int intel_cqm_cpu_notifier(struct notifier_block *nb,
> > @@ -1289,7 +1770,7 @@ static int intel_cqm_cpu_notifier(struct
> notifier_block *nb,
> >
> > switch (action & ~CPU_TASKS_FROZEN) {
> > case CPU_UP_PREPARE:
> > - intel_cqm_cpu_prepare(cpu);
> > + return intel_cqm_cpu_prepare(cpu);
> > break;
> > case CPU_DOWN_PREPARE:
> > intel_cqm_cpu_exit(cpu);
> > @@ -1305,17 +1786,74 @@ static int intel_cqm_cpu_notifier(struct
> notifier_block *nb,
> > static const struct x86_cpu_id intel_cqm_match[] = {
> > { .vendor = X86_VENDOR_INTEL, .feature =
> X86_FEATURE_CQM_OCCUP_LLC },
> > {}
> > + }, intel_mbm_match[] = {
>
> What's wrong with having a separate
>
> static const struct x86_cpu_id intel_mbm_match[] = {
>
> > + { .vendor = X86_VENDOR_INTEL, .feature =
> X86_FEATURE_CQM_MBM_LOCAL },
> > + {}
> > };
>
> That again would make the change obvious, but that's not on your agenda.
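
For the record, the obvious variant is simply two separate tables, each built
from the feature bits already used in this patch:

        static const struct x86_cpu_id intel_cqm_match[] = {
                { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
                {}
        };

        static const struct x86_cpu_id intel_mbm_match[] = {
                { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL },
                {}
        };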
>
> >
> > static int __init intel_cqm_init(void)
> > {
> > char *str, scale[20];
> > - int i, cpu, ret;
> > + int i = 0, cpu, ret;
>
> Initializing i is required because?
>
> > - if (!x86_match_cpu(intel_cqm_match))
> > + if (!x86_match_cpu(intel_cqm_match) &&
> > + (!x86_match_cpu(intel_mbm_match)))
> > return -ENODEV;
> >
> > cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
> > + cqm_max_rmid = boot_cpu_data.x86_cache_max_rmid;
> > +
> > + if (x86_match_cpu(intel_cqm_match)) {
> > + cqm_llc_occ = true;
> > + intel_cqm_events_group.attrs = intel_cmt_events_attr;
> > + } else
> > + cqm_llc_occ = false;
>
> I see you do not trust the compiler to put cqm_llc_occ into the BSS
> section, nor the kernel to zero it out properly.
>
> > +
> > + if (x86_match_cpu(intel_mbm_match)) {
> > + u32 mbm_scale_rounded = 0;
> > +
> > + cqm_mbm = true;
> > + cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
> > + /*
> > + * MBM counter values are in bytes. To conver this to MB/sec,
> > + * we scale the MBM scale factor by 1000. Another 1000 factor
> > + * scaling down is done
> > + * after reading the counter val i.e. in the function
> > + * __rmid_read_mbm()
>
> So the event attribute string is in KB/sec and at some other place you
> convert it to MB/sec? There seems to be some hidden logic for this,
> but that's definitely not decodable for me.
>
> > + */
> > + mbm_scale_rounded = (cqm_l3_scale + 500) / 1000;
>
> div_round_up
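
i.e. use the existing helper instead of open-coding the rounding; since the
original expression rounds to nearest, DIV_ROUND_CLOSEST is the drop-in form:

        mbm_scale_rounded = DIV_ROUND_CLOSEST(cqm_l3_scale, 1000);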
>
> > + cqm_max_rmid = boot_cpu_data.x86_cache_max_rmid;
> > + snprintf(scale, sizeof(scale), "%u", mbm_scale_rounded);
> > + str = kstrdup(scale, GFP_KERNEL);
> > + if (!str) {
> > + ret = -ENOMEM;
> > + goto out;
> > + }
> > +
> > + if (cqm_llc_occ)
> > + intel_cqm_events_group.attrs =
> > + intel_cmt_mbm_events_attr;
>
> And this line break is required because?
>
> > + else
> > + intel_cqm_events_group.attrs =
> intel_mbm_events_attr;
> > +
> > + event_attr_intel_cqm_llc_local_bw_scale.event_str
> > + = event_attr_intel_cqm_llc_total_bw_scale.event_str = str;
>
> You really love to write unreadable crap. This is neither Perl nor the
> obfuscated C code contest. What's wrong with two separate lines which
> assign 'str' to each of the event attributes?
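
I.e. simply:

        event_attr_intel_cqm_llc_local_bw_scale.event_str = str;
        event_attr_intel_cqm_llc_total_bw_scale.event_str = str;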
>
> > + mbm_local = kzalloc_node(sizeof(struct sample) *
> > + (cqm_max_rmid + 1) * MBM_SOCKET_MAX,
>
> Calculating the size in a separate line with a proper variable would
> make it readable.
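
For example (sketch, variable name illustrative):

        size_t size = sizeof(struct sample) * (cqm_max_rmid + 1) * MBM_SOCKET_MAX;

        mbm_local = kzalloc_node(size, GFP_KERNEL, NUMA_NO_NODE);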
>
> > + GFP_KERNEL, NUMA_NO_NODE);
> > + if (!mbm_local) {
>
> Leaks str
>
> > + ret = -ENOMEM;
> > + goto out;
> > + }
> > + mbm_total = kzalloc_node(sizeof(struct sample) *
> > + (cqm_max_rmid + 1) * MBM_SOCKET_MAX,
> > + GFP_KERNEL, NUMA_NO_NODE);
> > + if (!mbm_total) {
> > + ret = -ENOMEM;
>
> Leaks str and mbm_local
>
> > + goto out;
> > + }
>
> What about sticking that mess into a separate function?
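
A separate helper along these lines (untested sketch, name hypothetical; it
covers only the scale string and the allocations, and frees everything it
allocated on error):

        static int __init intel_mbm_init(void)
        {
                size_t size = sizeof(struct sample) * (cqm_max_rmid + 1) *
                              MBM_SOCKET_MAX;
                char scale[20], *str;

                snprintf(scale, sizeof(scale), "%u",
                         DIV_ROUND_CLOSEST(cqm_l3_scale, 1000));
                str = kstrdup(scale, GFP_KERNEL);
                if (!str)
                        return -ENOMEM;

                mbm_local = kzalloc_node(size, GFP_KERNEL, NUMA_NO_NODE);
                mbm_total = kzalloc_node(size, GFP_KERNEL, NUMA_NO_NODE);
                if (!mbm_local || !mbm_total) {
                        kfree(mbm_local);
                        kfree(mbm_total);
                        kfree(str);
                        return -ENOMEM;
                }

                event_attr_intel_cqm_llc_local_bw_scale.event_str = str;
                event_attr_intel_cqm_llc_total_bw_scale.event_str = str;
                return 0;
        }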
>
> > + } else
> > + cqm_mbm = false;
>
> Sigh.
>
> > /*
> > * It's possible that not all resources support the same number
> > @@ -1328,44 +1866,48 @@ static int __init intel_cqm_init(void)
> > */
> > cpu_notifier_register_begin();
> >
> > - for_each_online_cpu(cpu) {
> > - struct cpuinfo_x86 *c = &cpu_data(cpu);
> > + if (cqm_llc_occ) {
> > + for_each_online_cpu(cpu) {
> > + struct cpuinfo_x86 *c = &cpu_data(cpu);
> >
> > - if (c->x86_cache_max_rmid < cqm_max_rmid)
> > - cqm_max_rmid = c->x86_cache_max_rmid;
> > + if (c->x86_cache_max_rmid < cqm_max_rmid)
> > + cqm_max_rmid = c->x86_cache_max_rmid;
> >
> > - if (c->x86_cache_occ_scale != cqm_l3_scale) {
> > - pr_err("Multiple LLC scale values, disabling\n");
> > - ret = -EINVAL;
> > - goto out;
> > + if (c->x86_cache_occ_scale != cqm_l3_scale) {
> > + pr_err("Multiple LLC scale values, disabling\n");
> > + ret = -EINVAL;
>
> More memory leaks
>
> > + goto out;
> > + }
> > }
> > - }
> >
> > - /*
> > - * A reasonable upper limit on the max threshold is the number
> > - * of lines tagged per RMID if all RMIDs have the same number of
> > - * lines tagged in the LLC.
> > - *
> > - * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
> > - */
> > - __intel_cqm_max_threshold =
> > + /*
> > + * A reasonable upper limit on the max threshold is the number
> > + * of lines tagged per RMID if all RMIDs have the same number
> of
> > + * lines tagged in the LLC.
> > + *
> > + * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
> > + */
> > + __intel_cqm_max_threshold =
> > boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
> >
> > - snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
> > - str = kstrdup(scale, GFP_KERNEL);
> > - if (!str) {
> > - ret = -ENOMEM;
> > - goto out;
> > - }
> > + snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
> > + str = kstrdup(scale, GFP_KERNEL);
> > + if (!str) {
> > + ret = -ENOMEM;
>
> Ditto.
>
> > + goto out;
> > + }
> >
> > - event_attr_intel_cqm_llc_scale.event_str = str;
> > + event_attr_intel_cqm_llc_scale.event_str = str;
> > + }
> >
> > ret = intel_cqm_setup_rmid_cache();
> > if (ret)
> > goto out;
> >
> > for_each_online_cpu(i) {
> > - intel_cqm_cpu_prepare(i);
> > + ret = intel_cqm_cpu_prepare(i);
> > + if (ret)
> > + goto out;
>
> And that leaves even more half-initialized stuff around.
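
What this wants is a real unwind path, roughly (untested sketch, labels
hypothetical, success path abbreviated):

        for_each_online_cpu(i) {
                ret = intel_cqm_cpu_prepare(i);
                if (ret)
                        goto out_unwind;
                cqm_pick_event_reader(i);
        }

        __perf_cpu_notifier(intel_cqm_cpu_notifier);

        ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
        if (!ret)
                goto out;

out_unwind:
        /* Undo the per-cpu allocations and the MBM setup done earlier */
        for_each_online_cpu(i) {
                kfree(per_cpu(mbm_pmu, i));
                per_cpu(mbm_pmu, i) = NULL;
        }
        kfree(str);
        kfree(mbm_local);
        kfree(mbm_total);
out:
        cpu_notifier_register_done();
        return ret;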
>
> > cqm_pick_event_reader(i);
> > }
>
> I fear your assessment in
>
>
> http://lkml.kernel.org/r/06033C969873E840BDE9FC9B584F66B58BB89D@ORSMSX116.amr.corp.intel.com
>
> is slightly wrong.
>
> That's a completely convoluted mess. Aside from that, it's broken all over
> the place.
>
> This wants to be split up into preparatory patches which remodel the
> existing code and move out cqm stuff into helper functions, so the mbm
> stuff can be plugged in with its own helpers nicely.
>
> Thanks,
>
> tglx