From: Vikas Shivappa
To: vikas.shivappa@intel.com, vikas.shivappa@linux.intel.com
Cc: linux-kernel@vger.kernel.org, x86@kernel.org, tglx@linutronix.de,
	peterz@infradead.org, ravi.v.shankar@intel.com, tony.luck@intel.com,
	fenghua.yu@intel.com, andi.kleen@intel.com, davidcc@google.com,
	eranian@google.com, hpa@zytor.com
Subject: [PATCH 07/14] x86/rdt,cqm: Scheduling support update
Date: Fri, 16 Dec 2016 15:13:01 -0800
Message-Id: <1481929988-31569-8-git-send-email-vikas.shivappa@linux.intel.com>
In-Reply-To: <1481929988-31569-1-git-send-email-vikas.shivappa@linux.intel.com>
References: <1481929988-31569-1-git-send-email-vikas.shivappa@linux.intel.com>

Introduce a scheduling hook finish_arch_pre_lock_switch(), which is
called just after perf sched_in during a context switch. This hook
handles both the CAT and CQM sched_in scenarios. The IA32_PQR_ASSOC
MSR is used by both CAT (cache allocation) and CQM (cache monitoring),
and this patch combines the two MSR writes into one. The common
sched_in path checks whether the per-CPU cache holds a different RMID
or CLOSid than the incoming task and only then does the MSR write.
During sched_in, the task's own RMID is used if the task is monitored;
otherwise the task's cgroup RMID is used.

This patch is based on David Carrillo-Cisneros' patches in the cqm2
series.

Signed-off-by: Vikas Shivappa
---
 arch/x86/events/intel/cqm.c              | 46 +++++++++------
 arch/x86/include/asm/intel_pqr_common.h  | 38 +++++++++++++
 arch/x86/include/asm/intel_rdt.h         | 39 -------
 arch/x86/include/asm/intel_rdt_common.h  | 13 +++++
 arch/x86/include/asm/processor.h         |  4 ++
 arch/x86/kernel/cpu/Makefile             |  1 +
 arch/x86/kernel/cpu/intel_rdt_common.c   | 97 ++++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c |  4 +-
 arch/x86/kernel/process_32.c             |  4 --
 arch/x86/kernel/process_64.c             |  4 --
 kernel/sched/core.c                      |  1 +
 kernel/sched/sched.h                     |  3 +
 12 files changed, 189 insertions(+), 65 deletions(-)
 create mode 100644 arch/x86/include/asm/intel_pqr_common.h
 create mode 100644 arch/x86/kernel/cpu/intel_rdt_common.c

diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
index 57edbfc..506e187 100644
--- a/arch/x86/events/intel/cqm.c
+++ b/arch/x86/events/intel/cqm.c
@@ -28,13 +28,6 @@
 static bool cqm_enabled, mbm_enabled;
 unsigned int cqm_socket_max;
 
-/*
- * The cached intel_pqr_state is strictly per CPU and can never be
- * updated from a remote CPU. Both functions which modify the state
- * (intel_cqm_event_start and intel_cqm_event_stop) are called with
- * interrupts disabled, which is sufficient for the protection.
- */
-DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
 static struct hrtimer *mbm_timers;
 /**
  * struct sample - mbm event's (local or total) data
@@ -55,7 +48,6 @@ struct sample {
  */
 static struct sample *mbm_local;
 
-#define pkg_id	topology_physical_package_id(smp_processor_id())
 /*
  * rmid_2_index returns the index for the rmid in mbm_local/mbm_total array.
 * mbm_total[] and mbm_local[] are linearly indexed by socket# * max number of
@@ -74,6 +66,8 @@ struct sample {
 static DEFINE_MUTEX(cache_mutex);
 static DEFINE_RAW_SPINLOCK(cache_lock);
 
+DEFINE_STATIC_KEY_FALSE(cqm_enable_key);
+
 /*
  * Groups of events that have the same target(s), one RMID per group.
  */
@@ -108,7 +102,7 @@ struct sample {
  * Likewise, an rmid value of -1 is used to indicate "no rmid currently
  * assigned" and is used as part of the rotation code.
  */
-static inline bool __rmid_valid(u32 rmid)
+bool __rmid_valid(u32 rmid)
 {
 	if (!rmid || rmid > cqm_max_rmid)
 		return false;
@@ -161,7 +155,7 @@ static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid, int domain)
  *
  * We expect to be called with cache_mutex held.
  */
-static u32 __get_rmid(int domain)
+u32 __get_rmid(int domain)
 {
 	struct list_head *cqm_flist;
 	struct cqm_rmid_entry *entry;
@@ -368,6 +362,23 @@ static void init_mbm_sample(u32 *rmid, u32 evt_type)
 	on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1);
 }
 
+#ifdef CONFIG_CGROUP_PERF
+struct cgrp_cqm_info *cqminfo_from_tsk(struct task_struct *tsk)
+{
+	struct cgrp_cqm_info *ccinfo = NULL;
+	struct perf_cgroup *pcgrp;
+
+	pcgrp = perf_cgroup_from_task(tsk, NULL);
+
+	if (!pcgrp)
+		return NULL;
+	else
+		ccinfo = cgrp_to_cqm_info(pcgrp);
+
+	return ccinfo;
+}
+#endif
+
 static inline void cqm_enable_mon(struct cgrp_cqm_info *cqm_info, u32 *rmid)
 {
 	if (rmid != NULL) {
@@ -713,26 +724,27 @@ void alloc_needed_pkg_rmid(u32 *cqm_rmid)
 static void intel_cqm_event_start(struct perf_event *event, int mode)
 {
 	struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
-	u32 rmid;
 
 	if (!(event->hw.cqm_state & PERF_HES_STOPPED))
 		return;
 
 	event->hw.cqm_state &= ~PERF_HES_STOPPED;
 
-	alloc_needed_pkg_rmid(event->hw.cqm_rmid);
-
-	rmid = event->hw.cqm_rmid[pkg_id];
-	state->rmid = rmid;
-	wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
+	if (is_task_event(event)) {
+		alloc_needed_pkg_rmid(event->hw.cqm_rmid);
+		state->next_task_rmid = event->hw.cqm_rmid[pkg_id];
+	}
 }
 
 static void intel_cqm_event_stop(struct perf_event *event, int mode)
 {
+	struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+
 	if (event->hw.cqm_state & PERF_HES_STOPPED)
 		return;
 
 	event->hw.cqm_state |= PERF_HES_STOPPED;
+	state->next_task_rmid = 0;
 }
 
 static int intel_cqm_event_add(struct perf_event *event, int mode)
@@ -1366,6 +1378,8 @@ static int __init intel_cqm_init(void)
 	if (mbm_enabled)
 		pr_info("Intel MBM enabled\n");
 
+	static_branch_enable(&cqm_enable_key);
+
 	/*
 	 * Setup the hot cpu notifier once we are sure cqm
 	 * is enabled to avoid notifier leak.
diff --git a/arch/x86/include/asm/intel_pqr_common.h b/arch/x86/include/asm/intel_pqr_common.h
new file mode 100644
index 0000000..8fe9d8e
--- /dev/null
+++ b/arch/x86/include/asm/intel_pqr_common.h
@@ -0,0 +1,38 @@
+#ifndef _ASM_X86_INTEL_PQR_COMMON_H
+#define _ASM_X86_INTEL_PQR_COMMON_H
+
+#ifdef CONFIG_INTEL_RDT
+
+#include
+#include
+#include
+#include
+#include
+
+void __intel_rdt_sched_in(void);
+
+/*
+ * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR
+ *
+ * Following considerations are made so that this has minimal impact
+ * on scheduler hot path:
+ * - This will stay as no-op unless we are running on an Intel SKU
+ *   which supports resource control and we enable by mounting the
+ *   resctrl file system.
+ * - Caches the per cpu CLOSid values and does the MSR write only
+ *   when a task with a different CLOSid is scheduled in.
+ */
+static inline void intel_rdt_sched_in(void)
+{
+	if (static_branch_likely(&rdt_enable_key) ||
+		static_branch_unlikely(&cqm_enable_key)) {
+		__intel_rdt_sched_in();
+	}
+}
+
+#else
+
+static inline void intel_rdt_sched_in(void) {}
+
+#endif
+#endif
diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
index 95ce5c8..3b4a099 100644
--- a/arch/x86/include/asm/intel_rdt.h
+++ b/arch/x86/include/asm/intel_rdt.h
@@ -5,7 +5,6 @@
 #include
 #include
-
 #include
 
 #define IA32_L3_QOS_CFG		0xc81
@@ -182,43 +181,5 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
 int rdtgroup_schemata_show(struct kernfs_open_file *of,
			   struct seq_file *s, void *v);
 
-/*
- * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR
- *
- * Following considerations are made so that this has minimal impact
- * on scheduler hot path:
- * - This will stay as no-op unless we are running on an Intel SKU
- *   which supports resource control and we enable by mounting the
- *   resctrl file system.
- * - Caches the per cpu CLOSid values and does the MSR write only
- *   when a task with a different CLOSid is scheduled in.
- *
- * Must be called with preemption disabled.
- */
-static inline void intel_rdt_sched_in(void)
-{
-	if (static_branch_likely(&rdt_enable_key)) {
-		struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
-		int closid;
-
-		/*
-		 * If this task has a closid assigned, use it.
-		 * Else use the closid assigned to this cpu.
-		 */
-		closid = current->closid;
-		if (closid == 0)
-			closid = this_cpu_read(cpu_closid);
-
-		if (closid != state->closid) {
-			state->closid = closid;
-			wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid);
-		}
-	}
-}
-
-#else
-
-static inline void intel_rdt_sched_in(void) {}
-
 #endif /* CONFIG_INTEL_RDT_A */
 #endif /* _ASM_X86_INTEL_RDT_H */
diff --git a/arch/x86/include/asm/intel_rdt_common.h b/arch/x86/include/asm/intel_rdt_common.h
index e11ed5e..6424322 100644
--- a/arch/x86/include/asm/intel_rdt_common.h
+++ b/arch/x86/include/asm/intel_rdt_common.h
@@ -18,12 +18,25 @@
  */
 struct intel_pqr_state {
	u32			rmid;
+	u32			next_task_rmid;
	u32			closid;
	int			rmid_usecnt;
 };
 
 DECLARE_PER_CPU(struct intel_pqr_state, pqr_state);
 
+#define pkg_id	topology_physical_package_id(smp_processor_id())
+
+u32 __get_rmid(int domain);
+bool __rmid_valid(u32 rmid);
+void alloc_needed_pkg_rmid(u32 *cqm_rmid);
+struct cgrp_cqm_info *cqminfo_from_tsk(struct task_struct *tsk);
+
+extern struct cgrp_cqm_info cqm_rootcginfo;
+
+DECLARE_STATIC_KEY_FALSE(cqm_enable_key);
+DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
+
 /**
  * struct cgrp_cqm_info - perf_event cgroup metadata for cqm
  * @cont_mon     Continuous monitoring flag
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index e7f8c62..b0ce5cc 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -870,4 +871,7 @@ static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
 
 void stop_this_cpu(void *dummy);
 void df_debug(struct pt_regs *regs, long error_code);
+
+#define finish_arch_pre_lock_switch	intel_rdt_sched_in
+
 #endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c9f8c81..1035c97 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR)		+= centaur.o
 obj-$(CONFIG_CPU_SUP_TRANSMETA_32)	+= transmeta.o
 obj-$(CONFIG_CPU_SUP_UMC_32)		+= umc.o
 
+obj-$(CONFIG_INTEL_RDT)			+= intel_rdt_common.o
 obj-$(CONFIG_INTEL_RDT_A)		+= intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o
 
 obj-$(CONFIG_X86_MCE)			+= mcheck/
diff --git a/arch/x86/kernel/cpu/intel_rdt_common.c b/arch/x86/kernel/cpu/intel_rdt_common.c
new file mode 100644
index 0000000..83c8c00
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_common.c
@@ -0,0 +1,97 @@
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+/*
+ * The cached intel_pqr_state is strictly per CPU and can never be
+ * updated from a remote CPU. Both functions which modify the state
+ * (intel_cqm_event_start and intel_cqm_event_stop) are called with
+ * interrupts disabled, which is sufficient for the protection.
+ */
+DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
+
+#ifdef CONFIG_INTEL_RDT_M
+static inline int get_cgroup_sched_rmid(void)
+{
+#ifdef CONFIG_CGROUP_PERF
+	struct cgrp_cqm_info *ccinfo = NULL;
+
+	ccinfo = cqminfo_from_tsk(current);
+
+	if (!ccinfo)
+		return 0;
+
+	/*
+	 * A cgroup is always monitoring for itself or
+	 * for an ancestor(default is root).
+	 */
+	if (ccinfo->mon_enabled) {
+		alloc_needed_pkg_rmid(ccinfo->rmid);
+		return ccinfo->rmid[pkg_id];
+	} else {
+		alloc_needed_pkg_rmid(ccinfo->mfa->rmid);
+		return ccinfo->mfa->rmid[pkg_id];
+	}
+#endif
+
+	return 0;
+}
+
+static inline int get_sched_in_rmid(void)
+{
+	struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+	u32 rmid = 0;
+
+	rmid = state->next_task_rmid;
+
+	return rmid ? rmid : get_cgroup_sched_rmid();
+}
+#endif
+
+/*
+ * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR
+ *
+ * Following considerations are made so that this has minimal impact
+ * on scheduler hot path:
+ * - This will stay as no-op unless we are running on an Intel SKU
+ *   which supports resource control and we enable by mounting the
+ *   resctrl file system or it supports resource monitoring.
+ * - Caches the per cpu CLOSid/RMID values and does the MSR write only
+ *   when a task with a different CLOSid/RMID is scheduled in.
+ */
+void __intel_rdt_sched_in(void)
+{
+	struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+	int closid = 0;
+	u32 rmid = 0;
+
+#ifdef CONFIG_INTEL_RDT_A
+	if (static_branch_likely(&rdt_enable_key)) {
+		/*
+		 * If this task has a closid assigned, use it.
+		 * Else use the closid assigned to this cpu.
+		 */
+		closid = current->closid;
+		if (closid == 0)
+			closid = this_cpu_read(cpu_closid);
+	}
+#endif
+
+#ifdef CONFIG_INTEL_RDT_M
+	if (static_branch_unlikely(&cqm_enable_key))
+		rmid = get_sched_in_rmid();
+#endif
+
+	if (closid != state->closid || rmid != state->rmid) {
+
+		state->closid = closid;
+		state->rmid = rmid;
+		wrmsr(MSR_IA32_PQR_ASSOC, rmid, closid);
+	}
+}
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index 8af04af..8b6b429 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -206,7 +206,7 @@ static void rdt_update_cpu_closid(void *closid)
	 * executing task might have its own closid selected. Just reuse
	 * the context switch code.
	 */
-	intel_rdt_sched_in();
+	__intel_rdt_sched_in();
 }
 
 /*
@@ -328,7 +328,7 @@ static void move_myself(struct callback_head *head)
 
	preempt_disable();
	/* update PQR_ASSOC MSR to make resource group go into effect */
-	intel_rdt_sched_in();
+	__intel_rdt_sched_in();
	preempt_enable();
 
	kfree(callback);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index efe7f9f..bd7be8e 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -54,7 +54,6 @@
 #include
 #include
 #include
-#include
 
 void __show_regs(struct pt_regs *regs, int all)
 {
@@ -300,8 +299,5 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
 
	this_cpu_write(current_task, next_p);
 
-	/* Load the Intel cache allocation PQR MSR. */
-	intel_rdt_sched_in();
-
	return prev_p;
 }
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index acd7d6f..b3760b3 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -50,7 +50,6 @@
 #include
 #include
 #include
-#include
 
 __visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
 
@@ -474,9 +473,6 @@ void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
		loadsegment(ss, __KERNEL_DS);
	}
 
-	/* Load the Intel cache allocation PQR MSR. */
-	intel_rdt_sched_in();
-
	return prev_p;
 }
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 154fd68..b2c9106 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2766,6 +2766,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
	prev_state = prev->state;
	vtime_task_switch(prev);
	perf_event_task_sched_in(prev, current);
+	finish_arch_pre_lock_switch();
	finish_lock_switch(rq, prev);
	finish_arch_post_lock_switch();
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 055f935..0a0208e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1112,6 +1112,9 @@ static inline int task_on_rq_migrating(struct task_struct *p)
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)	do { } while (0)
 #endif
+#ifndef finish_arch_pre_lock_switch
+# define finish_arch_pre_lock_switch()	do { } while (0)
+#endif
 #ifndef finish_arch_post_lock_switch
 # define finish_arch_post_lock_switch()	do { } while (0)
 #endif
-- 
1.9.1
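
For reference, the gist of the sched_in fast path above can be summarized
outside the kernel tree as follows. This is a minimal, self-contained C
sketch of the same caching pattern: resolve the incoming task's CLOSid and
RMID, compare them against the per-CPU cached pair, and touch IA32_PQR_ASSOC
only when one of them changed. The wrmsr() stand-in and all other names below
are illustrative only, not the kernel's APIs.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the kernel's per-CPU intel_pqr_state. */
struct pqr_state {
	uint32_t rmid;
	uint32_t closid;
};

static struct pqr_state this_cpu_pqr;	/* one instance per CPU in the kernel */

/* Stub for wrmsr(MSR_IA32_PQR_ASSOC, rmid, closid). */
static void write_pqr_assoc(uint32_t rmid, uint32_t closid)
{
	printf("wrmsr(PQR_ASSOC, rmid=%u, closid=%u)\n", rmid, closid);
}

/*
 * sched_in hot path: write the MSR only when the incoming task's
 * CLOSid or RMID differs from the cached per-CPU values.
 */
static void sched_in(uint32_t closid, uint32_t rmid)
{
	if (closid != this_cpu_pqr.closid || rmid != this_cpu_pqr.rmid) {
		this_cpu_pqr.closid = closid;
		this_cpu_pqr.rmid = rmid;
		write_pqr_assoc(rmid, closid);
	}
}

int main(void)
{
	sched_in(1, 4);		/* MSR written: state changed */
	sched_in(1, 4);		/* no MSR write: same CLOSid/RMID */
	sched_in(0, 4);		/* MSR written: CLOSid changed */
	return 0;
}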