From: Vikas Shivappa
To: vikas.shivappa@intel.com
Cc: linux-kernel@vger.kernel.org, x86@kernel.org, hpa@zytor.com,
	tglx@linutronix.de, mingo@kernel.org, tj@kernel.org, peterz@infradead.org,
	matt.fleming@intel.com, will.auld@intel.com, glenn.p.williamson@intel.com,
	kanaka.d.juvva@intel.com, vikas.shivappa@linux.intel.com
Subject: [PATCH 6/9] x86/intel_rdt: Add support for cache bit mask management
Date: Thu, 6 Aug 2015 14:55:14 -0700
Message-Id: <1438898117-3692-7-git-send-email-vikas.shivappa@linux.intel.com>
X-Mailer: git-send-email 1.9.1
In-Reply-To: <1438898117-3692-1-git-send-email-vikas.shivappa@linux.intel.com>
References: <1438898117-3692-1-git-send-email-vikas.shivappa@linux.intel.com>

Add a file, l3_cbm, to the intel_rdt cgroup which represents the cache
capacity bit mask for the cgroup. Tasks in the cgroup are allowed to fill
the portion of the L3 cache represented by the cgroup's l3_cbm file. The
bit mask typically maps to ways in the cache, but the exact mapping is
hardware implementation specific.

The l3_cbm is backed by one of the IA32_L3_MASK_n MSRs, so any update to
l3_cbm results in an MSR write to the appropriate IA32_L3_MASK_n. The
IA32_L3_MASK_n MSRs are per package, but l3_cbm represents the global
value of that MSR across all packages.

When a child cgroup is created it inherits the CLOSid and the l3_cbm from
its parent. When a user changes the default l3_cbm for a cgroup, a new
CLOSid may be allocated if that l3_cbm is not already in use. If the new
l3_cbm is already in use, the reference count for that CLOSid <-> l3_cbm
pairing is incremented instead. Changing 'l3_cbm' may fail with -ENOSPC
once the kernel runs out of CLOSids. Users can create as many cgroups as
they want, but the number of different l3_cbm values in use at the same
time is limited by the maximum number of CLOSids. The kernel maintains a
CLOSid <-> l3_cbm table which counts the cgroups using each CLOSid.

Reusing CLOSids for cgroups with the same bitmask also has the following
advantages:
- It makes optimal use of the scarce CLOSids.
- During context switch, the PQR MSR is written only when a task with a
  different bitmask is scheduled in.
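For illustration, here is a rough usage sketch of the interface described
above. The mount point, the group name and the 'intel_rdt.' file prefix
under cgroupfs are assumptions for this example, not something mandated by
this patch:

	# Mount the intel_rdt cgroup controller (mount point is assumed).
	mount -t cgroup -o intel_rdt intel_rdt /sys/fs/cgroup/intel_rdt

	# Create a group and give it a contiguous four-bit capacity mask.
	mkdir /sys/fs/cgroup/intel_rdt/group1
	echo 0xf > /sys/fs/cgroup/intel_rdt/group1/intel_rdt.l3_cbm

	# A non-contiguous mask such as 0x5 is rejected with -EINVAL, and a
	# mask not used by any other group may fail with -ENOSPC once all
	# CLOSids are in use.
	echo 0x5 > /sys/fs/cgroup/intel_rdt/group1/intel_rdt.l3_cbm

Because groups with identical masks share a CLOSid, only a write that
introduces a new mask consumes a CLOSid and triggers IA32_L3_MASK_n writes
on each package.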
Signed-off-by: Vikas Shivappa
---
 arch/x86/include/asm/intel_rdt.h |   3 +
 arch/x86/kernel/cpu/intel_rdt.c  | 202 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 204 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
index a887004..58bac91 100644
--- a/arch/x86/include/asm/intel_rdt.h
+++ b/arch/x86/include/asm/intel_rdt.h
@@ -4,6 +4,9 @@
 #ifdef CONFIG_CGROUP_RDT
 
 #include <linux/cgroup.h>
+#define MAX_CBM_LENGTH			32
+#define IA32_L3_CBM_BASE		0xc90
+#define CBM_FROM_INDEX(x)		(IA32_L3_CBM_BASE + x)
 
 struct rdt_subsys_info {
 	unsigned long *closmap;
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 52e1fd6..115f136 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -34,6 +34,13 @@ static struct clos_cbm_table *cctable;
 static struct rdt_subsys_info rdtss_info;
 static DEFINE_MUTEX(rdt_group_mutex);
 struct intel_rdt rdt_root_group;
+/*
+ * Mask of CPUs for writing CBM values. We only need one CPU per-socket.
+ */
+static cpumask_t rdt_cpumask;
+
+#define rdt_for_each_child(pos_css, parent_ir)		\
+	css_for_each_child((pos_css), &(parent_ir)->css)
 
 static inline void closid_get(u32 closid)
 {
@@ -117,12 +124,192 @@ static void intel_rdt_css_free(struct cgroup_subsys_state *css)
 	mutex_unlock(&rdt_group_mutex);
 }
 
+static int intel_cache_alloc_cbm_read(struct seq_file *m, void *v)
+{
+	struct intel_rdt *ir = css_rdt(seq_css(m));
+
+	seq_printf(m, "%08lx\n", cctable[ir->closid].l3_cbm);
+
+	return 0;
+}
+
+static inline bool cbm_is_contiguous(unsigned long var)
+{
+	unsigned long maxcbm = MAX_CBM_LENGTH;
+	unsigned long first_bit, zero_bit;
+
+	if (!var)
+		return false;
+
+	first_bit = find_first_bit(&var, maxcbm);
+	zero_bit = find_next_zero_bit(&var, maxcbm, first_bit);
+
+	if (find_next_bit(&var, maxcbm, zero_bit) < maxcbm)
+		return false;
+
+	return true;
+}
+
+static int cbm_validate(struct intel_rdt *ir, unsigned long cbmvalue)
+{
+	struct cgroup_subsys_state *css;
+	struct intel_rdt *par, *c;
+	unsigned long *cbm_tmp;
+	int err = 0;
+
+	if (!cbm_is_contiguous(cbmvalue)) {
+		err = -EINVAL;
+		goto out_err;
+	}
+
+	par = parent_rdt(ir);
+	cbm_tmp = &cctable[par->closid].l3_cbm;
+	if (!bitmap_subset(&cbmvalue, cbm_tmp, MAX_CBM_LENGTH)) {
+		err = -EINVAL;
+		goto out_err;
+	}
+
+	rcu_read_lock();
+	rdt_for_each_child(css, ir) {
+		c = css_rdt(css);
+		cbm_tmp = &cctable[c->closid].l3_cbm;
+		if (!bitmap_subset(cbm_tmp, &cbmvalue, MAX_CBM_LENGTH)) {
+			rcu_read_unlock();
+			err = -EINVAL;
+			goto out_err;
+		}
+	}
+	rcu_read_unlock();
out_err:
+
+	return err;
+}
+
+static bool cbm_search(unsigned long cbm, u32 *closid)
+{
+	u32 maxid = boot_cpu_data.x86_cache_max_closid;
+	u32 i;
+
+	for (i = 0; i < maxid; i++) {
+		if (bitmap_equal(&cbm, &cctable[i].l3_cbm, MAX_CBM_LENGTH)) {
+			*closid = i;
+			return true;
+		}
+	}
+
+	return false;
+}
+
+static void closcbm_map_dump(void)
+{
+	u32 i;
+
+	pr_debug("CBMMAP\n");
+	for (i = 0; i < boot_cpu_data.x86_cache_max_closid; i++) {
+		pr_debug("l3_cbm: 0x%x,clos_refcnt: %u\n",
+			 (unsigned int)cctable[i].l3_cbm, cctable[i].clos_refcnt);
+	}
+}
+
+static void cbm_cpu_update(void *info)
+{
+	u32 closid = (u32) info;
+
+	wrmsrl(CBM_FROM_INDEX(closid), cctable[closid].l3_cbm);
+}
+
+/*
+ * cbm_update_all() - Update the cache bit mask for all packages.
+ */
+static inline void cbm_update_all(u32 closid)
+{
+	on_each_cpu_mask(&rdt_cpumask, cbm_cpu_update, (void *)closid, 1);
+}
+
+/*
+ * intel_cache_alloc_cbm_write() - Validates and writes the
+ * cache bit mask(cbm) to the IA32_L3_MASK_n
+ * and also store the same in the cctable.
+ *
+ * CLOSids are reused for cgroups which have same bitmask.
+ * This helps to use the scant CLOSids optimally. This also
+ * implies that at context switch write to PQR-MSR is done
+ * only when a task with a different bitmask is scheduled in.
+ */
+static int intel_cache_alloc_cbm_write(struct cgroup_subsys_state *css,
+				       struct cftype *cft, u64 cbmvalue)
+{
+	u32 max_cbm = boot_cpu_data.x86_cache_max_cbm_len;
+	struct intel_rdt *ir = css_rdt(css);
+	u64 max_mask;
+	int err = 0;
+	u32 closid;
+
+	if (ir == &rdt_root_group)
+		return -EPERM;
+
+	/*
+	 * Need global mutex as cbm write may allocate a closid.
+	 */
+	mutex_lock(&rdt_group_mutex);
+
+	max_mask = (1ULL << max_cbm) - 1;
+	if (cbmvalue & ~max_mask) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (cbmvalue == cctable[ir->closid].l3_cbm)
+		goto out;
+
+	err = cbm_validate(ir, cbmvalue);
+	if (err)
+		goto out;
+
+	/*
+	 * Try to get a reference for a different CLOSid and release the
+	 * reference to the current CLOSid.
+	 * Need to put down the reference here and get it back in case we
+	 * run out of closids. Otherwise we run into a problem when
+	 * we could be using the last closid that could have been available.
+	 */
+	closid_put(ir->closid);
+	if (cbm_search(cbmvalue, &closid)) {
+		ir->closid = closid;
+		closid_get(closid);
+	} else {
+		closid = ir->closid;
+		err = closid_alloc(ir);
+		if (err) {
+			closid_get(ir->closid);
+			goto out;
+		}
+
+		cctable[ir->closid].l3_cbm = cbmvalue;
+		cbm_update_all(ir->closid);
+	}
+	closcbm_map_dump();
+out:
+	mutex_unlock(&rdt_group_mutex);
+
+	return err;
+}
+
+static inline void rdt_cpumask_update(int cpu)
+{
+	static cpumask_t tmp;
+
+	cpumask_and(&tmp, &rdt_cpumask, topology_core_cpumask(cpu));
+	if (cpumask_empty(&tmp))
+		cpumask_set_cpu(cpu, &rdt_cpumask);
+}
+
 static int __init intel_rdt_late_init(void)
 {
 	struct cpuinfo_x86 *c = &boot_cpu_data;
 	static struct clos_cbm_table *cct;
 	u32 maxid, max_cbm_len;
-	int err = 0, size;
+	int err = 0, size, i;
 
 	if (!cpu_has(c, X86_FEATURE_CAT_L3)) {
 		rdt_root_group.css.ss->disabled = 1;
@@ -152,6 +339,9 @@ static int __init intel_rdt_late_init(void)
 	cct->l3_cbm = (1ULL << max_cbm_len) - 1;
 	cct->clos_refcnt = 1;
 
+	for_each_online_cpu(i)
+		rdt_cpumask_update(i);
+
 	pr_info("Intel cache allocation enabled\n");
 out_err:
 
@@ -160,8 +350,18 @@ out_err:
 
 late_initcall(intel_rdt_late_init);
 
+static struct cftype rdt_files[] = {
+	{
+		.name		= "l3_cbm",
+		.seq_show	= intel_cache_alloc_cbm_read,
+		.write_u64	= intel_cache_alloc_cbm_write,
+	},
+	{ }	/* terminate */
+};
+
 struct cgroup_subsys intel_rdt_cgrp_subsys = {
 	.css_alloc		= intel_rdt_css_alloc,
 	.css_free		= intel_rdt_css_free,
+	.legacy_cftypes		= rdt_files,
 	.early_init		= 0,
 };
-- 
1.9.1