From: Vikas Shivappa <vikas.shivappa@linux.intel.com>
To: vikas.shivappa@intel.com, vikas.shivappa@linux.intel.com
Cc: linux-kernel@vger.kernel.org, x86@kernel.org, hpa@zytor.com,
	tglx@linutronix.de, mingo@kernel.org, peterz@infradead.org,
	ravi.v.shankar@intel.com, tony.luck@intel.com, fenghua.yu@intel.com,
	h.peter.anvin@intel.com
Subject: [PATCH 4/6] x86/mbm: Memory bandwidth monitoring event management
Date: Tue, 1 Mar 2016 15:48:26 -0800
Message-Id: <1456876108-28770-5-git-send-email-vikas.shivappa@linux.intel.com>
X-Mailer: git-send-email 1.9.1
In-Reply-To: <1456876108-28770-1-git-send-email-vikas.shivappa@linux.intel.com>
References: <1456876108-28770-1-git-send-email-vikas.shivappa@linux.intel.com>

From: Tony Luck <tony.luck@intel.com>

Includes all the core infrastructure to measure total_bytes and
bandwidth.

We have per-socket counters for both total system-wide L3 external
bytes and local socket memory-controller bytes. The current bandwidth
is only recomputed once the diff time (time since the counter was last
read) is at least 100ms.

The OS writes MSR_IA32_QM_EVTSEL and reads MSR_IA32_QM_CTR to read the
counters, and uses the IA32_PQR_ASSOC MSR to associate the RMID with
the task. Tasks share a common RMID for cqm (cache quality of service
monitoring) and MBM, hence most of the scheduling code is reused from
cqm.

Most of the scheduling code in Tony's original patch was dropped, and
3-4 lines of change were added to intel_cqm_event_read instead, because
the timer is no longer armed on every context switch.

Reviewed-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Vikas Shivappa <vikas.shivappa@linux.intel.com>
---
 arch/x86/kernel/cpu/perf_event_intel_cqm.c | 158 ++++++++++++++++++++++++++++-
 1 file changed, 154 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index cf08a0f..6638dcc 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -13,6 +13,11 @@
 #define MSR_IA32_QM_CTR		0x0c8e
 #define MSR_IA32_QM_EVTSEL	0x0c8d
 
+/*
+ * The MBM counter is 24 bits wide. MBM_CNTR_MAX defines the max
+ * counter value.
+ */
+#define MBM_CNTR_MAX		0xffffff
 static u32 cqm_max_rmid = -1;
 static unsigned int cqm_l3_scale; /* supposedly cacheline size */
 static bool cqm_enabled, mbm_enabled;
@@ -68,6 +73,16 @@ static struct sample *mbm_total;
  */
 static struct sample *mbm_local;
 
+#define pkg_id	topology_physical_package_id(smp_processor_id())
+/*
+ * rmid_2_index returns the index for the rmid in the mbm_local/mbm_total
+ * arrays. mbm_total[] and mbm_local[] are linearly indexed by socket# *
+ * max number of rmids per socket, an example is given below:
+ * RMID1 of Socket0: vrmid = 1
+ * RMID1 of Socket1: vrmid = 1 * (cqm_max_rmid + 1) + 1
+ * RMID1 of Socket2: vrmid = 2 * (cqm_max_rmid + 1) + 1
+ */
+#define rmid_2_index(rmid)	((pkg_id * (cqm_max_rmid + 1)) + rmid)
 
 /*
  * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
  * Also protects event->hw.cqm_rmid
@@ -91,8 +106,19 @@ static cpumask_t cqm_cpumask;
 #define RMID_VAL_UNAVAIL	(1ULL << 62)
 
 #define QOS_L3_OCCUP_EVENT_ID	(1 << 0)
 
+/*
+ * MBM Event IDs as defined in the SDM, section 17.15.5.
+ * Event IDs are used to program the EVTSEL MSR before reading MBM event counters.
+ */
+enum mbm_evt_type {
+	QOS_MBM_TOTAL_EVENT_ID = 0x02,
+	QOS_MBM_LOCAL_EVENT_ID,
+	QOS_MBM_TOTAL_BW_EVENT_ID,
+	QOS_MBM_LOCAL_BW_EVENT_ID,
+};
 
-#define QOS_EVENT_MASK	QOS_L3_OCCUP_EVENT_ID
+#define QOS_MBM_BW_EVENT_MASK		0x04
+#define QOS_MBM_LOCAL_EVENT_MASK	0x01
 
 /*
  * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
@@ -422,9 +448,16 @@ static bool __conflict_event(struct perf_event *a, struct perf_event *b)
 struct rmid_read {
 	u32 rmid;
 	atomic64_t value;
+	enum mbm_evt_type evt_type;
 };
 
 static void __intel_cqm_event_count(void *info);
+static void init_mbm_sample(u32 rmid, enum mbm_evt_type evt_type);
+
+static bool is_mbm_event(int e)
+{
+	return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_BW_EVENT_ID);
+}
 
 /*
  * Exchange the RMID of a group of events.
@@ -866,6 +899,98 @@ static void intel_cqm_rmid_rotate(struct work_struct *work)
 	schedule_delayed_work(&intel_cqm_rmid_work, delay);
 }
 
+static struct sample *update_sample(unsigned int rmid,
+				    enum mbm_evt_type evt_type, int first)
+{
+	ktime_t cur_time;
+	struct sample *mbm_current;
+	u32 vrmid = rmid_2_index(rmid);
+	u64 val, bytes, diff_time;
+	u32 eventid;
+
+	if (evt_type & QOS_MBM_LOCAL_EVENT_MASK) {
+		mbm_current = &mbm_local[vrmid];
+		eventid = QOS_MBM_LOCAL_EVENT_ID;
+	} else {
+		mbm_current = &mbm_total[vrmid];
+		eventid = QOS_MBM_TOTAL_EVENT_ID;
+	}
+
+	cur_time = ktime_get();
+	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
+	rdmsrl(MSR_IA32_QM_CTR, val);
+	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+		return mbm_current;
+	val &= MBM_CNTR_MAX;
+
+	if (first) {
+		mbm_current->interval_start = cur_time;
+		mbm_current->prev_msr = val;
+		mbm_current->total_bytes = 0;
+		mbm_current->interval_bytes = 0;
+		mbm_current->bandwidth = 0;
+		return mbm_current;
+	}
+
+	if (val < mbm_current->prev_msr)
+		bytes = MBM_CNTR_MAX - mbm_current->prev_msr + val + 1;
+	else
+		bytes = val - mbm_current->prev_msr;
+	bytes *= cqm_l3_scale;
+
+	mbm_current->total_bytes += bytes;
+	mbm_current->interval_bytes += bytes;
+	mbm_current->prev_msr = val;
+	diff_time = ktime_ms_delta(cur_time, mbm_current->interval_start);
+
+	/*
+	 * The b/w measured is really the most recent/current b/w.
+	 * We wait till enough time has passed to avoid
+	 * arithmetic rounding problems. Having it at >=100ms,
+	 * such errors would be <=1%.
+	 */
+	if (diff_time > 100) {
+		bytes = mbm_current->interval_bytes * MSEC_PER_SEC;
+		do_div(bytes, diff_time);
+		mbm_current->bandwidth = bytes;
+		mbm_current->interval_bytes = 0;
+		mbm_current->interval_start = cur_time;
+	}
+
+	return mbm_current;
+}
+
+static u64 rmid_read_mbm(unsigned int rmid, enum mbm_evt_type evt_type)
+{
+	struct sample *mbm_current;
+
+	mbm_current = update_sample(rmid, evt_type, 0);
+
+	if (evt_type & QOS_MBM_BW_EVENT_MASK)
+		return mbm_current->bandwidth;
+	else
+		return mbm_current->total_bytes;
+}
+
+static void __intel_mbm_event_init(void *info)
+{
+	struct rmid_read *rr = info;
+
+	update_sample(rr->rmid, rr->evt_type, 1);
+}
+
+static void init_mbm_sample(u32 rmid, enum mbm_evt_type evt_type)
+{
+	struct rmid_read rr = {
+		.value = ATOMIC64_INIT(0),
+	};
+
+	rr.rmid = rmid;
+	rr.evt_type = evt_type;
+	/* on each socket, init sample */
+	on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1);
+}
+
 /*
  * Find a group and setup RMID.
  *
@@ -886,6 +1011,8 @@ static void intel_cqm_setup_event(struct perf_event *event,
 			/* All tasks in a group share an RMID */
 			event->hw.cqm_rmid = rmid;
 			*group = iter;
+			if (is_mbm_event(event->attr.config))
+				init_mbm_sample(rmid, event->attr.config);
 			return;
 		}
 
@@ -902,6 +1029,9 @@ static void intel_cqm_setup_event(struct perf_event *event,
 	else
 		rmid = __get_rmid();
 
+	if (is_mbm_event(event->attr.config))
+		init_mbm_sample(rmid, event->attr.config);
+
 	event->hw.cqm_rmid = rmid;
 }
 
@@ -923,7 +1053,10 @@ static void intel_cqm_event_read(struct perf_event *event)
 	if (!__rmid_valid(rmid))
 		goto out;
 
-	val = __rmid_read(rmid);
+	if (is_mbm_event(event->attr.config))
+		val = rmid_read_mbm(rmid, event->attr.config);
+	else
+		val = __rmid_read(rmid);
 
 	/*
 	 * Ignore this reading on error states and do not update the value.
@@ -954,6 +1087,17 @@ static inline bool cqm_group_leader(struct perf_event *event)
 	return !list_empty(&event->hw.cqm_groups_entry);
 }
 
+static void __intel_mbm_event_count(void *info)
+{
+	struct rmid_read *rr = info;
+	u64 val;
+
+	val = rmid_read_mbm(rr->rmid, rr->evt_type);
+	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+		return;
+	atomic64_add(val, &rr->value);
+}
+
 static u64 intel_cqm_event_count(struct perf_event *event)
 {
 	unsigned long flags;
@@ -1007,7 +1151,12 @@ static u64 intel_cqm_event_count(struct perf_event *event)
 	if (!__rmid_valid(rr.rmid))
 		goto out;
 
-	on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+	if (is_mbm_event(event->attr.config)) {
+		rr.evt_type = event->attr.config;
+		on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, &rr, 1);
+	} else {
+		on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+	}
 
 	raw_spin_lock_irqsave(&cache_lock, flags);
 	if (event->hw.cqm_rmid == rr.rmid)
@@ -1122,7 +1271,8 @@ static int intel_cqm_event_init(struct perf_event *event)
 	if (event->attr.type != intel_cqm_pmu.type)
 		return -ENOENT;
 
-	if (event->attr.config & ~QOS_EVENT_MASK)
+	if ((event->attr.config < QOS_L3_OCCUP_EVENT_ID) ||
+	    (event->attr.config > QOS_MBM_LOCAL_BW_EVENT_ID))
 		return -EINVAL;
 
 	/* unsupported modes and filters */
-- 
1.9.1
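
P.S. For reviewers who want to sanity-check the arithmetic, below is a
minimal user-space sketch (not part of the patch) of the delta and
bandwidth computation that update_sample() performs: a 24-bit counter
reading is turned into a byte count (handling wraparound), scaled by the
L3 scale factor, and converted to bytes/sec once at least 100ms have
elapsed. The names mbm_sample and mbm_update are made up for
illustration; the kernel code above uses the real MSR accessors and
ktime instead of a caller-supplied elapsed time.

#include <stdint.h>
#include <stdio.h>

#define MBM_CNTR_MAX	0xffffffULL	/* counter is 24 bits wide */

struct mbm_sample {
	uint64_t prev_msr;	/* last raw counter value (24 bits) */
	uint64_t total_bytes;	/* running total since first read */
	uint64_t interval_bytes;/* bytes accumulated in current interval */
	uint64_t interval_ms;	/* ms accumulated in current interval */
	uint64_t bandwidth;	/* last computed bytes/sec */
};

/* Fold one new counter reading into the sample. */
static void mbm_update(struct mbm_sample *s, uint64_t msr_val,
		       uint64_t elapsed_ms, unsigned int l3_scale)
{
	uint64_t bytes;

	msr_val &= MBM_CNTR_MAX;

	/* Counter is 24 bits: on wraparound add the missing span. */
	if (msr_val < s->prev_msr)
		bytes = MBM_CNTR_MAX - s->prev_msr + msr_val + 1;
	else
		bytes = msr_val - s->prev_msr;

	bytes *= l3_scale;		/* counter ticks -> bytes */

	s->prev_msr = msr_val;
	s->total_bytes += bytes;
	s->interval_bytes += bytes;
	s->interval_ms += elapsed_ms;

	/*
	 * Only recompute bandwidth once >100ms have passed, so that
	 * rounding errors stay at or below roughly 1%.
	 */
	if (s->interval_ms > 100) {
		s->bandwidth = s->interval_bytes * 1000 / s->interval_ms;
		s->interval_bytes = 0;
		s->interval_ms = 0;
	}
}

int main(void)
{
	struct mbm_sample s = { .prev_msr = 0xfffff0 };

	/* 0x20 ticks with wraparound, then a 150ms quiet period. */
	mbm_update(&s, 0x000010, 50, 64);
	mbm_update(&s, 0x000010, 150, 64);

	printf("total=%llu bytes, bw=%llu bytes/sec\n",
	       (unsigned long long)s.total_bytes,
	       (unsigned long long)s.bandwidth);
	return 0;
}

With an assumed scale factor of 64 this prints total=2048 bytes and
bw=10240 bytes/sec, i.e. 2048 bytes spread over the 200ms interval.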