From: Vikas Shivappa <vikas.shivappa@linux.intel.com>
To: linux-kernel@vger.kernel.org
Cc: x86@kernel.org, hpa@zytor.com, tglx@linutronix.de, mingo@kernel.org,
        tj@kernel.org, peterz@infradead.org, matt.fleming@intel.com,
        will.auld@intel.com, kanaka.d.juvva@intel.com,
        glenn.p.williamson@intel.com, vikas.shivappa@linux.intel.com,
        vikas.shivappa@intel.com
Subject: [PATCH 6/9] x86/intel_rdt: Add support for cache bit mask management
Date: Thu, 25 Jun 2015 12:25:21 -0700
Message-Id: <1435260324-18125-7-git-send-email-vikas.shivappa@linux.intel.com>
In-Reply-To: <1435260324-18125-1-git-send-email-vikas.shivappa@linux.intel.com>
References: <1435260324-18125-1-git-send-email-vikas.shivappa@linux.intel.com>
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 8248
Lines: 298

The change adds a file cache_mask to the RDT cgroup which represents the
cache bit mask(CBM) for the cgroup. cache_mask is specific to the Cache
allocation sub-feature of RDT. The tasks in the RDT cgroup would get to
fill the L3 cache represented by the cgroup's cache_mask file.

Update to the CBM is done by writing to the IA32_L3_MASK_n.  The RDT
cgroup follows cgroup hierarchy ,mkdir and adding tasks to the cgroup
never fails.  When a child cgroup is created it inherits the CLOSid and
the cache_mask from its parent.  When a user changes the default CBM for
a cgroup, a new CLOSid may be allocated if the cache_mask was not used
before. If the new CBM is the one that is already used, the count for
that CLOSid<->CBM is incremented. The changing of 'cache_mask' may fail
with -ENOSPC once the kernel runs out of maximum CLOSids it can support.

User can create as many cgroups as he wants but having different CBMs at
the same time is restricted by the maximum number of CLOSids .Kernel
maintains a CLOSid<->cbm mapping which keeps count of cgroups using a
CLOSid.

Reuse of CLOSids for cgroups with same bitmask also has following
advantages:
 - This helps to use the scant CLOSids optimally.
 - This also implies that during context switch, write to PQR-MSR is
 done only when a task with a different bitmask is scheduled in.

Signed-off-by: Vikas Shivappa <vikas.shivappa@linux.intel.com>
---
 arch/x86/include/asm/intel_rdt.h |   3 +
 arch/x86/kernel/cpu/intel_rdt.c  | 205 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 207 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
index 2ce3e2c..3ad426c 100644
--- a/arch/x86/include/asm/intel_rdt.h
+++ b/arch/x86/include/asm/intel_rdt.h
@@ -4,6 +4,9 @@
 #ifdef CONFIG_CGROUP_RDT
 
 #include <linux/cgroup.h>
+#define MAX_CBM_LENGTH			32
+#define IA32_L3_CBM_BASE		0xc90
+#define CBM_FROM_INDEX(x)		(IA32_L3_CBM_BASE + x)
 
 struct rdt_subsys_info {
 	unsigned long *closmap;
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index b13dd57..750b02a 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -34,6 +34,13 @@ static struct clos_cbm_map *ccmap;
 static struct rdt_subsys_info rdtss_info;
 static DEFINE_MUTEX(rdt_group_mutex);
 struct intel_rdt rdt_root_group;
+/*
+ * Mask of CPUs for writing CBM values. We only need one CPU per-socket.
+ */
+static cpumask_t rdt_cpumask;
+
+#define rdt_for_each_child(pos_css, parent_ir)		\
+	css_for_each_child((pos_css), &(parent_ir)->css)
 
 static inline void closid_get(u32 closid)
 {
@@ -117,13 +124,195 @@ static void intel_rdt_css_free(struct cgroup_subsys_state *css)
 	mutex_unlock(&rdt_group_mutex);
 }
 
+static int intel_cache_alloc_cbm_read(struct seq_file *m, void *v)
+{
+	struct intel_rdt *ir = css_rdt(seq_css(m));
+
+	seq_printf(m, "%08lx\n", ccmap[ir->closid].cache_mask);
+
+	return 0;
+}
+
+static inline bool cbm_is_contiguous(unsigned long var)
+{
+	unsigned long maxcbm = MAX_CBM_LENGTH;
+	unsigned long first_bit, zero_bit;
+
+	if (!var)
+		return false;
+
+	first_bit = find_next_bit(&var, maxcbm, 0);
+	zero_bit = find_next_zero_bit(&var, maxcbm, first_bit);
+
+	if (find_next_bit(&var, maxcbm, zero_bit) < maxcbm)
+		return false;
+
+	return true;
+}
+
+static int cbm_validate(struct intel_rdt *ir, unsigned long cbmvalue)
+{
+	struct cgroup_subsys_state *css;
+	struct intel_rdt *par, *c;
+	unsigned long *cbm_tmp;
+	int err = 0;
+
+	if (!cbm_is_contiguous(cbmvalue)) {
+		pr_err("bitmask should have >= 1 bit and be contiguous\n");
+		err = -EINVAL;
+		goto out_err;
+	}
+
+	par = parent_rdt(ir);
+	cbm_tmp = &ccmap[par->closid].cache_mask;
+	if (!bitmap_subset(&cbmvalue, cbm_tmp, MAX_CBM_LENGTH)) {
+		err = -EINVAL;
+		goto out_err;
+	}
+
+	rcu_read_lock();
+	rdt_for_each_child(css, ir) {
+		c = css_rdt(css);
+		cbm_tmp = &ccmap[c->closid].cache_mask;
+		if (!bitmap_subset(cbm_tmp, &cbmvalue, MAX_CBM_LENGTH)) {
+			rcu_read_unlock();
+			pr_err("Children's mask not a subset\n");
+			err = -EINVAL;
+			goto out_err;
+		}
+	}
+	rcu_read_unlock();
+out_err:
+
+	return err;
+}
+
+static bool cbm_search(unsigned long cbm, u32 *closid)
+{
+	u32 maxid = boot_cpu_data.x86_cache_max_closid;
+	u32 i;
+
+	for (i = 0; i < maxid; i++) {
+		if (bitmap_equal(&cbm, &ccmap[i].cache_mask, MAX_CBM_LENGTH)) {
+			*closid = i;
+			return true;
+		}
+	}
+
+	return false;
+}
+
+static void closcbm_map_dump(void)
+{
+	u32 i;
+
+	pr_debug("CBMMAP\n");
+	for (i = 0; i < boot_cpu_data.x86_cache_max_closid; i++) {
+		pr_debug("cache_mask: 0x%x,clos_refcnt: %u\n",
+		 (unsigned int)ccmap[i].cache_mask, ccmap[i].clos_refcnt);
+	}
+}
+
+static void cbm_cpu_update(void *info)
+{
+	u32 closid = (u32) info;
+
+	wrmsrl(CBM_FROM_INDEX(closid), ccmap[closid].cache_mask);
+}
+
+/*
+ * cbm_update_all() - Update the cache bit mask for all packages.
+ */
+static inline void cbm_update_all(u32 closid)
+{
+	on_each_cpu_mask(&rdt_cpumask, cbm_cpu_update, (void *)closid, 1);
+}
+
+/*
+ * intel_cache_alloc_cbm_write() - Validates and writes the
+ * cache bit mask(cbm) to the IA32_L3_MASK_n
+ * and also store the same in the ccmap.
+ *
+ * CLOSids are reused for cgroups which have same bitmask.
+ * This helps to use the scant CLOSids optimally. This also
+ * implies that at context switch write to PQR-MSR is done
+ * only when a task with a different bitmask is scheduled in.
+ */
+static int intel_cache_alloc_cbm_write(struct cgroup_subsys_state *css,
+				 struct cftype *cft, u64 cbmvalue)
+{
+	u32 max_cbm = boot_cpu_data.x86_cache_max_cbm_len;
+	struct intel_rdt *ir = css_rdt(css);
+	ssize_t err = 0;
+	u64 max_mask;
+	u32 closid;
+
+	if (ir == &rdt_root_group)
+		return -EPERM;
+
+	/*
+	 * Need global mutex as cbm write may allocate a closid.
+	 */
+	mutex_lock(&rdt_group_mutex);
+
+	max_mask = (1ULL << max_cbm) - 1;
+	if (cbmvalue & ~max_mask) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (cbmvalue == ccmap[ir->closid].cache_mask)
+		goto out;
+
+	err = cbm_validate(ir, cbmvalue);
+	if (err)
+		goto out;
+
+	/*
+	 * Try to get a reference for a different CLOSid and release the
+	 * reference to the current CLOSid.
+	 * Need to put down the reference here and get it back in case we
+	 * run out of closids. Otherwise we run into a problem when
+	 * we could be using the last closid that could have been available.
+	 */
+	closid_put(ir->closid);
+	if (cbm_search(cbmvalue, &closid)) {
+		ir->closid = closid;
+		closid_get(closid);
+	} else {
+		closid = ir->closid;
+		err = closid_alloc(ir);
+		if (err) {
+			closid_get(ir->closid);
+			goto out;
+		}
+
+		ccmap[ir->closid].cache_mask = cbmvalue;
+		cbm_update_all(ir->closid);
+	}
+	closcbm_map_dump();
+out:
+	mutex_unlock(&rdt_group_mutex);
+
+	return err;
+}
+
+static inline void rdt_cpumask_update(int cpu)
+{
+	static cpumask_t tmp;
+
+	cpumask_and(&tmp, &rdt_cpumask, topology_core_cpumask(cpu));
+	if (cpumask_empty(&tmp))
+		cpumask_set_cpu(cpu, &rdt_cpumask);
+}
+
 static int __init intel_rdt_late_init(void)
 {
 	struct cpuinfo_x86 *c = &boot_cpu_data;
 	static struct clos_cbm_map *ccm;
 	u32 maxid, max_cbm_len;
+	int err = 0, i;
 	size_t sizeb;
-	int err = 0;
 
 	if (!cpu_has(c, X86_FEATURE_CAT_L3)) {
 		rdt_root_group.css.ss->disabled = 1;
@@ -153,6 +342,9 @@ static int __init intel_rdt_late_init(void)
 	ccm->cache_mask = (1ULL << max_cbm_len) - 1;
 	ccm->clos_refcnt = 1;
 
+	for_each_online_cpu(i)
+		rdt_cpumask_update(i);
+
 	pr_info("Intel cache allocation enabled\n");
 out_err:
 
@@ -161,8 +353,19 @@ out_err:
 
 late_initcall(intel_rdt_late_init);
 
+static struct cftype rdt_files[] = {
+	{
+		.name		= "l3_cache_mask",
+		.seq_show	= intel_cache_alloc_cbm_read,
+		.write_u64	= intel_cache_alloc_cbm_write,
+		.mode		= 0666,
+	},
+	{ }	/* terminate */
+};
+
 struct cgroup_subsys intel_rdt_cgrp_subsys = {
 	.css_alloc		= intel_rdt_css_alloc,
 	.css_free		= intel_rdt_css_free,
+	.legacy_cftypes		= rdt_files,
 	.early_init		= 0,
 };
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/