This patch adds a new cgroup subsystem to support the new Cache Allocation
Technology (CAT) feature found in future Intel Xeon processors. CAT is
part of Resource Director Technology(RDT) or Platform Shared resource control
which provides support to control Platform shared resources like cache.
Cache Allocation Technology(CAT) provides a way for the Software
(OS/VMM) to restrict cache allocation to a defined 'subset' of cache
which may be overlapping with other 'subsets'. This feature is used
when allocating a line in the cache, i.e. when pulling new data into the cache.
This patch series is dependent on the V5 patches for Intel Cache QOS Monitoring
from Matt since the series also implements a common software cache for the
IA32_PQR_MSR :
https://lkml.kernel.org/r/[email protected]
It will apply on the CMT patch series(based on 3.19-rc4) in the link above.
Changes in V4:
- Integrated with the latest V5 CMT patches.
- Changed naming of cgroup to rdt(resource director technology) from cat(cache
allocation technology). This was done as the RDT is the umbrella term
for platform shared resources allocation. Hence in future it would be easier
to add resource allocation to the same cgroup
- Naming changes also applied to a lot of other data structures/APIs.
- Added documentation on cgroup usage for cache allocation to address many
questions from academia and industry regarding cache allocation
usage.
Changes in V3:
- Implements a common software cache for IA32_PQR_MSR
- Implements support for hsw CAT enumeration. This does not use the brand
strings like earlier version but does a probe test. The probe test is done
only on hsw family of processors
- Made a few coding convention, name changes
- Check for lock being held when ClosID manipulation happens
Changes in V2:
- Removed HSW specific enumeration changes. Plan to include it later as a
separate patch.
- Fixed the code in prep_arch_switch to be specific for x86 and removed
x86 defines.
- Fixed cbm_write to not write all 1s when a cgroup is freed.
- Fixed one possible memory leak in init.
- Changed some of manual bitmap
manipulation to use the predefined bitmap APIs to make code more readable
- Changed name in sources from cqe to cat
- Global cat enable flag changed to static_key and disabled cgroup early_init
This patch adds support for the new Cache Allocation Technology (CAT)
feature found in future Intel Xeon processors. CAT is part of Intel
Resource Director Technology(RDT) which enables sharing of processor
resources. This patch includes CPUID enumeration routines for CAT and
new values to track CAT resources to the cpuinfo_x86 structure.
Cache Allocation Technology(CAT) provides a way for the Software
(OS/VMM) to restrict cache allocation to a defined 'subset' of cache
which may be overlapping with other 'subsets'. This feature is used
when allocating a line in the cache, i.e. when pulling new data into the cache.
The programming of the h/w is done via programming MSRs.
More information about CAT can be found in the Intel (R) x86 Architecture
Software Developer Manual, section 17.15.
Signed-off-by: Vikas Shivappa <[email protected]>
---
arch/x86/include/asm/cpufeature.h | 6 ++++-
arch/x86/include/asm/processor.h | 3 +++
arch/x86/kernel/cpu/Makefile | 1 +
arch/x86/kernel/cpu/common.c | 15 ++++++++++++
arch/x86/kernel/cpu/intel_rdt.c | 51 +++++++++++++++++++++++++++++++++++++++
init/Kconfig | 11 +++++++++
6 files changed, 86 insertions(+), 1 deletion(-)
create mode 100644 arch/x86/kernel/cpu/intel_rdt.c
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 54fd8eb..d97b785 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -12,7 +12,7 @@
#include <asm/disabled-features.h>
#endif
-#define NCAPINTS 13 /* N 32-bit words worth of info */
+#define NCAPINTS 14 /* N 32-bit words worth of info */
#define NBUGINTS 1 /* N 32-bit bug flags */
/*
@@ -227,6 +227,7 @@
#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */
#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */
#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */
+#define X86_FEATURE_RDT ( 9*32+15) /* Resource Allocation */
#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */
#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */
@@ -248,6 +249,9 @@
/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
+/* Intel-defined CPU features, CPUID level 0x00000010:0 (ebx), word 13 */
+#define X86_FEATURE_CAT_L3 (13*32 + 1) /*Cache QOS Enforcement L3*/
+
/*
* BUG word(s)
*/
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 242ceed..81d95ac 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -114,6 +114,9 @@ struct cpuinfo_x86 {
int x86_cache_occ_scale; /* scale to bytes */
int x86_power;
unsigned long loops_per_jiffy;
+ /* Cache Allocation Technology values */
+ int x86_cat_cbmlength;
+ int x86_cat_closs;
/* cpuid returned max cores value: */
u16 x86_max_cores;
u16 apicid;
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 6c1ca13..6c91e39 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \
perf_event_intel_uncore_nhmex.o
endif
+obj-$(CONFIG_CGROUP_RDT) +=intel_rdt.o
obj-$(CONFIG_X86_MCE) += mcheck/
obj-$(CONFIG_MTRR) += mtrr/
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 9b0fb70..c5ea1dd 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -668,6 +668,21 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
}
}
+ /* Additional Intel-defined flags: level 0x00000010 */
+ if (c->cpuid_level >= 0x00000010) {
+ u32 eax, ebx, ecx, edx;
+
+ cpuid_count(0x00000010, 0, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[13] = ebx;
+
+ if (cpu_has(c, X86_FEATURE_CAT_L3)) {
+
+ cpuid_count(0x00000010, 1, &eax, &ebx, &ecx, &edx);
+ c->x86_cat_closs = (edx & 0xffff) + 1;
+ c->x86_cat_cbmlength = (eax & 0xf) + 1;
+ }
+ }
+
/* AMD-defined flags: level 0x80000001 */
xlvl = cpuid_eax(0x80000000);
c->extended_cpuid_level = xlvl;
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
new file mode 100644
index 0000000..46ce449
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -0,0 +1,51 @@
+/*
+ * Resource Director Technology(RDT) code
+ *
+ * Copyright (C) 2014 Intel Corporation
+ *
+ * 2014-09-10 Written by Vikas Shivappa
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual, section 17.15.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+
+static inline bool rdt_supported(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_RDT))
+ return true;
+
+ return false;
+}
+
+static int __init rdt_late_init(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ int maxid, cbm_len;
+
+ if (!rdt_supported(c))
+ return -ENODEV;
+
+ maxid = c->x86_cat_closs;
+ cbm_len = c->x86_cat_cbmlength;
+
+ pr_info("cbmlength:%u,Closs: %u\n", cbm_len, maxid);
+
+ return 0;
+}
+
+late_initcall(rdt_late_init);
diff --git a/init/Kconfig b/init/Kconfig
index 9afb971..c5004b3 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -961,6 +961,17 @@ config CPUSETS
Say N if unsure.
+config CGROUP_RDT
+ bool "Resource Director Technology cgroup subsystem"
+ depends on X86_64
+ help
+ This option provides framework to allocate resources like
+ cache lines when applications fill cache.
+ This can be used by users to configure how much cache
+ that can be allocated to different applications.
+
+ Say N if unsure.
+
config PROC_PID_CPUSET
bool "Include legacy /proc/<pid>/cpuset file"
depends on CPUSETS
--
1.9.1
This patch adds a cgroup subsystem to support Intel Resource Director
Technology(RDT) or Platform Shared Resource Control. The resource that
is currently supported for sharing is the last-level cache
(Cache Allocation Technology or CAT).
When a RDT cgroup is created it has a CLOSid and CBM associated with it
which are inherited from its parent. A Class of service(CLOS) in Cache
Allocation is represented by a CLOSid. CLOSid is internal to the kernel
and not exposed to user. Cache bitmask(CBM) represents one cache
'subset'. Root cgroup would have all available bits set for its CBM and
would be assigned the CLOSid 0.
CLOSid allocation is tracked using a separate bitmap. The maximum number
of CLOSids is specified by the h/w during CPUID enumeration and the
kernel simply throws an -ENOSPC when it runs out of CLOSids.
Each CBM has an associated CLOSid. If multiple cgroups have the same CBM
they would also have the same CLOSid. The reference count parameter in
CLOSid-CBM map keeps track of how many cgroups are using each
CLOSid<->CBM mapping.
Signed-off-by: Vikas Shivappa <[email protected]>
---
arch/x86/include/asm/intel_rdt.h | 40 +++++++++++++++
arch/x86/kernel/cpu/intel_rdt.c | 103 ++++++++++++++++++++++++++++++++++++---
include/linux/cgroup_subsys.h | 4 ++
3 files changed, 141 insertions(+), 6 deletions(-)
create mode 100644 arch/x86/include/asm/intel_rdt.h
diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
new file mode 100644
index 0000000..ecd9664
--- /dev/null
+++ b/arch/x86/include/asm/intel_rdt.h
@@ -0,0 +1,40 @@
+#ifndef _RDT_H_
+#define _RDT_H_
+
+#ifdef CONFIG_CGROUP_RDT
+
+#include <linux/cgroup.h>
+
+struct rdt_subsys_info {
+ /* Clos Bitmap to keep track of available CLOSids.*/
+ unsigned long *closmap;
+};
+
+struct intel_rdt {
+ struct cgroup_subsys_state css;
+ /* Class of service for the cgroup.*/
+ unsigned int clos;
+ /* Corresponding cache bit mask.*/
+ unsigned long *cbm;
+};
+
+struct clos_cbm_map {
+ unsigned long cbm;
+ unsigned int cgrp_count;
+};
+
+/*
+ * Return rdt group corresponding to this container.
+ */
+static inline struct intel_rdt *css_rdt(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct intel_rdt, css) : NULL;
+}
+
+static inline struct intel_rdt *parent_rdt(struct intel_rdt *ir)
+{
+ return css_rdt(ir->css.parent);
+}
+
+#endif
+#endif
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 46ce449..6cf1a16 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -23,10 +23,19 @@
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/spinlock.h>
+#include <asm/intel_rdt.h>
-static inline bool rdt_supported(struct cpuinfo_x86 *c)
+/*
+ * ccmap maintains 1:1 mapping between CLOSid and cbm.
+ */
+static struct clos_cbm_map *ccmap;
+static struct rdt_subsys_info rdtss_info;
+static DEFINE_MUTEX(rdt_group_mutex);
+struct intel_rdt rdt_root_group;
+
+static inline bool cat_supported(struct cpuinfo_x86 *c)
{
- if (cpu_has(c, X86_FEATURE_RDT))
+ if (cpu_has(c, X86_FEATURE_CAT_L3))
return true;
return false;
@@ -35,17 +44,99 @@ static inline bool rdt_supported(struct cpuinfo_x86 *c)
static int __init rdt_late_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
+ static struct clos_cbm_map *ccm;
+ size_t sizeb;
int maxid, cbm_len;
- if (!rdt_supported(c))
+ if (!cat_supported(c)) {
+ rdt_root_group.css.ss->disabled = 1;
return -ENODEV;
+ } else {
+ maxid = c->x86_cat_closs;
+ cbm_len = c->x86_cat_cbmlength;
+ sizeb = BITS_TO_LONGS(maxid) * sizeof(long);
+
+ rdtss_info.closmap = kzalloc(sizeb, GFP_KERNEL);
+ if (!rdtss_info.closmap)
+ return -ENOMEM;
- maxid = c->x86_cat_closs;
- cbm_len = c->x86_cat_cbmlength;
+ sizeb = maxid * sizeof(struct clos_cbm_map);
+ ccmap = kzalloc(sizeb, GFP_KERNEL);
+ if (!ccmap) {
+ kfree(rdtss_info.closmap);
+ return -ENOMEM;
+ }
- pr_info("cbmlength:%u,Closs: %u\n", cbm_len, maxid);
+ set_bit(0, rdtss_info.closmap);
+ rdt_root_group.clos = 0;
+
+ ccm = &ccmap[0];
+ ccm->cbm = (u32)((u64)(1 << cbm_len) - 1);
+ rdt_root_group.cbm = &(ccm->cbm);
+ ccm->cgrp_count++;
+
+ pr_info("cbmlength:%u,Closs: %u\n", cbm_len, maxid);
+ }
return 0;
}
late_initcall(rdt_late_init);
+
+/*
+* Called with the rdt_group_mutex held.
+*/
+static int rdt_free_closid(struct intel_rdt *ir)
+{
+
+ lockdep_assert_held(&rdt_group_mutex);
+
+ WARN_ON(!ccmap[ir->clos].cgrp_count);
+ ccmap[ir->clos].cgrp_count--;
+ if (!ccmap[ir->clos].cgrp_count)
+ clear_bit(ir->clos, rdtss_info.closmap);
+
+ return 0;
+}
+
+static struct cgroup_subsys_state *
+rdt_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct intel_rdt *parent = css_rdt(parent_css);
+ struct intel_rdt *ir;
+
+ /*
+ * Cannot return failure on systems with no Cache Allocation
+ * as the cgroup_init does not handle failures gracefully.
+ */
+ if (!parent)
+ return &rdt_root_group.css;
+
+ ir = kzalloc(sizeof(struct intel_rdt), GFP_KERNEL);
+ if (!ir)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_lock(&rdt_group_mutex);
+ ir->clos = parent->clos;
+ ccmap[parent->clos].cgrp_count++;
+ mutex_unlock(&rdt_group_mutex);
+
+ ir->cbm = parent->cbm;
+ return &ir->css;
+}
+
+static void rdt_css_free(struct cgroup_subsys_state *css)
+{
+ struct intel_rdt *ir = css_rdt(css);
+
+ mutex_lock(&rdt_group_mutex);
+ rdt_free_closid(ir);
+ kfree(ir);
+ mutex_unlock(&rdt_group_mutex);
+}
+
+struct cgroup_subsys rdt_cgrp_subsys = {
+ .css_alloc = rdt_css_alloc,
+ .css_free = rdt_css_free,
+ .early_init = 0,
+};
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 98c4f9b..925c370 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -47,6 +47,10 @@ SUBSYS(net_prio)
SUBSYS(hugetlb)
#endif
+#if IS_ENABLED(CONFIG_CGROUP_RDT)
+SUBSYS(rdt)
+#endif
+
/*
* The following subsystems are not supported on the default hierarchy.
*/
--
1.9.1
Add support for cache bit mask manipulation. The change adds a file to
the RDT cgroup which represents the CBM(cache bit mask) for the cgroup.
The RDT cgroup follows the cgroup hierarchy; mkdir and adding tasks to the
cgroup never fails. When a child cgroup is created it inherits the
CLOSid and the CBM from its parent. When a user changes the default
CBM for a cgroup, a new CLOSid may be allocated if the CBM was not
used before. If the new CBM is the one that is already used, the
count for that CLOSid<->CBM is incremented. The changing of 'cbm'
may fail with -ENOSPC once the kernel runs out of maximum CLOSids it
can support.
User can create as many cgroups as he wants but having different CBMs
at the same time is restricted by the maximum number of CLOSids
(multiple cgroups can have the same CBM).
Kernel maintains a CLOSid<->cbm mapping which keeps count
of cgroups using a CLOSid.
The tasks in the CAT cgroup would get to fill the LLC cache represented
by the cgroup's 'cbm' file.
Reuse of CLOSids for cgroups with same bitmask also has following
advantages:
- This helps to use the scant CLOSids optimally.
- This also implies that during context switch, write to PQR-MSR is done
only when a task with a different bitmask is scheduled in.
Signed-off-by: Vikas Shivappa <[email protected]>
---
arch/x86/include/asm/intel_rdt.h | 3 +
arch/x86/kernel/cpu/intel_rdt.c | 179 +++++++++++++++++++++++++++++++++++++++
2 files changed, 182 insertions(+)
diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
index ecd9664..a414771 100644
--- a/arch/x86/include/asm/intel_rdt.h
+++ b/arch/x86/include/asm/intel_rdt.h
@@ -4,6 +4,9 @@
#ifdef CONFIG_CGROUP_RDT
#include <linux/cgroup.h>
+#define MAX_CBM_LENGTH 32
+#define IA32_L3_CBM_BASE 0xc90
+#define CBM_FROM_INDEX(x) (IA32_L3_CBM_BASE + x)
struct rdt_subsys_info {
/* Clos Bitmap to keep track of available CLOSids.*/
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 6cf1a16..dd090a7 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -33,6 +33,9 @@ static struct rdt_subsys_info rdtss_info;
static DEFINE_MUTEX(rdt_group_mutex);
struct intel_rdt rdt_root_group;
+#define rdt_for_each_child(pos_css, parent_ir) \
+ css_for_each_child((pos_css), &(parent_ir)->css)
+
static inline bool cat_supported(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_CAT_L3))
@@ -84,6 +87,30 @@ static int __init rdt_late_init(void)
late_initcall(rdt_late_init);
/*
+ * Allocates a new closid from unused closids.
+ * Called with the rdt_group_mutex held.
+ */
+
+static int rdt_alloc_closid(struct intel_rdt *ir)
+{
+ unsigned int id;
+ unsigned int maxid;
+
+ lockdep_assert_held(&rdt_group_mutex);
+
+ maxid = boot_cpu_data.x86_cat_closs;
+ id = find_next_zero_bit(rdtss_info.closmap, maxid, 0);
+ if (id == maxid)
+ return -ENOSPC;
+
+ set_bit(id, rdtss_info.closmap);
+ ccmap[id].cgrp_count++;
+ ir->clos = id;
+
+ return 0;
+}
+
+/*
* Called with the rdt_group_mutex held.
*/
static int rdt_free_closid(struct intel_rdt *ir)
@@ -135,8 +162,160 @@ static void rdt_css_free(struct cgroup_subsys_state *css)
mutex_unlock(&rdt_group_mutex);
}
+/*
+ * Tests if atleast two contiguous bits are set.
+ */
+
+static inline bool cbm_is_contiguous(unsigned long var)
+{
+ unsigned long first_bit, zero_bit;
+ unsigned long maxcbm = MAX_CBM_LENGTH;
+
+ if (bitmap_weight(&var, maxcbm) < 2)
+ return false;
+
+ first_bit = find_next_bit(&var, maxcbm, 0);
+ zero_bit = find_next_zero_bit(&var, maxcbm, first_bit);
+
+ if (find_next_bit(&var, maxcbm, zero_bit) < maxcbm)
+ return false;
+
+ return true;
+}
+
+static int cat_cbm_read(struct seq_file *m, void *v)
+{
+ struct intel_rdt *ir = css_rdt(seq_css(m));
+
+ seq_bitmap(m, ir->cbm, MAX_CBM_LENGTH);
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static int validate_cbm(struct intel_rdt *ir, unsigned long cbmvalue)
+{
+ struct intel_rdt *par, *c;
+ struct cgroup_subsys_state *css;
+
+ if (!cbm_is_contiguous(cbmvalue)) {
+ pr_info("cbm should have >= 2 bits and be contiguous\n");
+ return -EINVAL;
+ }
+
+ par = parent_rdt(ir);
+ if (!bitmap_subset(&cbmvalue, par->cbm, MAX_CBM_LENGTH))
+ return -EINVAL;
+
+ rcu_read_lock();
+ rdt_for_each_child(css, ir) {
+ c = css_rdt(css);
+ if (!bitmap_subset(c->cbm, &cbmvalue, MAX_CBM_LENGTH)) {
+ pr_info("Children's mask not a subset\n");
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ }
+
+ rcu_read_unlock();
+ return 0;
+}
+
+static bool cbm_search(unsigned long cbm, int *closid)
+{
+ int maxid = boot_cpu_data.x86_cat_closs;
+ unsigned int i;
+
+ for (i = 0; i < maxid; i++)
+ if (bitmap_equal(&cbm, &ccmap[i].cbm, MAX_CBM_LENGTH)) {
+ *closid = i;
+ return true;
+ }
+
+ return false;
+}
+
+static void cbmmap_dump(void)
+{
+ int i;
+
+ pr_debug("CBMMAP\n");
+ for (i = 0; i < boot_cpu_data.x86_cat_closs; i++)
+ pr_debug("cbm: 0x%x,cgrp_count: %u\n",
+ (unsigned int)ccmap[i].cbm, ccmap[i].cgrp_count);
+}
+
+/*
+ * rdt_cbm_write() - Validates and writes the cache bit mask(cbm)
+ * to the IA32_L3_MASK_n and also store the same in the ccmap.
+ *
+ * CLOSids are reused for cgroups which have same bitmask.
+ * - This helps to use the scant CLOSids optimally.
+ * - This also implies that at context switch write
+ * to PQR-MSR is done only when a task with a
+ * different bitmask is scheduled in.
+ */
+
+static int cat_cbm_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 cbmvalue)
+{
+ struct intel_rdt *ir = css_rdt(css);
+ ssize_t err = 0;
+ unsigned long cbm;
+ unsigned int closid;
+ u32 cbm_mask =
+ (u32)((u64)(1 << boot_cpu_data.x86_cat_cbmlength) - 1);
+
+ if (ir == &rdt_root_group)
+ return -EPERM;
+
+ /*
+ * Need global mutex as cbm write may allocate a closid.
+ */
+ mutex_lock(&rdt_group_mutex);
+ cbm = cbmvalue & cbm_mask;
+
+ if (bitmap_equal(&cbm, ir->cbm, MAX_CBM_LENGTH))
+ goto out;
+
+ err = validate_cbm(ir, cbm);
+ if (err)
+ goto out;
+
+ rdt_free_closid(ir);
+ if (cbm_search(cbm, &closid)) {
+ ir->clos = closid;
+ ccmap[ir->clos].cgrp_count++;
+ } else {
+ err = rdt_alloc_closid(ir);
+ if (err)
+ goto out;
+
+ wrmsrl(CBM_FROM_INDEX(ir->clos), cbm);
+ }
+
+ ccmap[ir->clos].cbm = cbm;
+ ir->cbm = &ccmap[ir->clos].cbm;
+ cbmmap_dump();
+
+out:
+
+ mutex_unlock(&rdt_group_mutex);
+ return err;
+}
+
+static struct cftype rdt_files[] = {
+ {
+ .name = "cbm",
+ .seq_show = cat_cbm_read,
+ .write_u64 = cat_cbm_write,
+ .mode = 0666,
+ },
+ { } /* terminate */
+};
+
struct cgroup_subsys rdt_cgrp_subsys = {
.css_alloc = rdt_css_alloc,
.css_free = rdt_css_free,
+ .legacy_cftypes = rdt_files,
.early_init = 0,
};
--
1.9.1
Adds support for IA32_PQR_ASSOC MSR writes during task scheduling.
The high 32 bits in the per processor MSR IA32_PQR_ASSOC represents the
CLOSid. During context switch kernel implements this by writing the
CLOSid of the cgroup to which the task belongs to the CPU's
IA32_PQR_ASSOC MSR.
For Cache Allocation, this would let the task fill in the cache 'subset'
represented by the cgroup's Cache bit mask(CBM).
Signed-off-by: Vikas Shivappa <[email protected]>
---
arch/x86/include/asm/intel_rdt.h | 55 ++++++++++++++++++++++++++++++++++++++++
arch/x86/include/asm/switch_to.h | 3 +++
arch/x86/kernel/cpu/intel_rdt.c | 4 ++-
kernel/sched/core.c | 1 +
kernel/sched/sched.h | 3 +++
5 files changed, 65 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
index a414771..bc57b56 100644
--- a/arch/x86/include/asm/intel_rdt.h
+++ b/arch/x86/include/asm/intel_rdt.h
@@ -4,9 +4,13 @@
#ifdef CONFIG_CGROUP_RDT
#include <linux/cgroup.h>
+
+#define MSR_IA32_PQR_ASSOC 0xc8f
#define MAX_CBM_LENGTH 32
#define IA32_L3_CBM_BASE 0xc90
#define CBM_FROM_INDEX(x) (IA32_L3_CBM_BASE + x)
+DECLARE_PER_CPU(unsigned int, x86_cpu_clos);
+extern struct static_key rdt_enable_key;
struct rdt_subsys_info {
/* Clos Bitmap to keep track of available CLOSids.*/
@@ -26,6 +30,11 @@ struct clos_cbm_map {
unsigned int cgrp_count;
};
+static inline bool rdt_enabled(void)
+{
+ return static_key_false(&rdt_enable_key);
+}
+
/*
* Return rdt group corresponding to this container.
*/
@@ -39,5 +48,51 @@ static inline struct intel_rdt *parent_rdt(struct intel_rdt *ir)
return css_rdt(ir->css.parent);
}
+/*
+ * Return rdt group to which this task belongs.
+ */
+static inline struct intel_rdt *task_rdt(struct task_struct *task)
+{
+ return css_rdt(task_css(task, rdt_cgrp_id));
+}
+
+/*
+ * rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR
+ * if the current Closid is different than the new one.
+ */
+
+static inline void rdt_sched_in(struct task_struct *task)
+{
+ struct intel_rdt *ir;
+ unsigned int clos;
+
+ if (!rdt_enabled())
+ return;
+
+ /*
+ * This needs to be fixed after CQM code stabilizes
+ * to cache the whole PQR instead of just CLOSid.
+ * PQR has closid in high 32 bits and CQM-RMID in low 10 bits.
+ * Should not write a 0 to the low 10 bits of PQR
+ * and corrupt RMID.
+ */
+ clos = this_cpu_read(x86_cpu_clos);
+
+ rcu_read_lock();
+ ir = task_rdt(task);
+ if (ir->clos == clos) {
+ rcu_read_unlock();
+ return;
+ }
+
+ wrmsr(MSR_IA32_PQR_ASSOC, 0, ir->clos);
+ this_cpu_write(x86_cpu_clos, ir->clos);
+ rcu_read_unlock();
+}
+
+#else
+
+static inline void rdt_sched_in(struct task_struct *task) {}
+
#endif
#endif
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 751bf4b..82ef4b3 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -8,6 +8,9 @@ struct tss_struct;
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
struct tss_struct *tss);
+#include <asm/intel_rdt.h>
+#define post_arch_switch(current) rdt_sched_in(current)
+
#ifdef CONFIG_X86_32
#ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index dd090a7..602c580 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -32,6 +32,8 @@ static struct clos_cbm_map *ccmap;
static struct rdt_subsys_info rdtss_info;
static DEFINE_MUTEX(rdt_group_mutex);
struct intel_rdt rdt_root_group;
+struct static_key __read_mostly rdt_enable_key = STATIC_KEY_INIT_FALSE;
+DEFINE_PER_CPU(unsigned int, x86_cpu_clos);
#define rdt_for_each_child(pos_css, parent_ir) \
css_for_each_child((pos_css), &(parent_ir)->css)
@@ -77,7 +79,7 @@ static int __init rdt_late_init(void)
ccm->cbm = (u32)((u64)(1 << cbm_len) - 1);
rdt_root_group.cbm = &(ccm->cbm);
ccm->cgrp_count++;
-
+ static_key_slow_inc(&rdt_enable_key);
pr_info("cbmlength:%u,Closs: %u\n", cbm_len, maxid);
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d22fb16..a5c4d87 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2249,6 +2249,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
prev_state = prev->state;
vtime_task_switch(prev);
finish_arch_switch(prev);
+ post_arch_switch(current);
perf_event_task_sched_in(prev, current);
finish_lock_switch(rq, prev);
finish_arch_post_lock_switch();
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9a2a45c..49e77d7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1008,6 +1008,9 @@ static inline int task_on_rq_migrating(struct task_struct *p)
#ifndef finish_arch_switch
# define finish_arch_switch(prev) do { } while (0)
#endif
+#ifndef post_arch_switch
+# define post_arch_switch(current) do { } while (0)
+#endif
#ifndef finish_arch_post_lock_switch
# define finish_arch_post_lock_switch() do { } while (0)
#endif
--
1.9.1
This patch implements a common software cache for IA32_PQR_MSR(RMID 0:9,
CLOSId 32:63) to be used by both CMT and CAT. CMT updates the RMID
where as CAT updates the CLOSid in the software cache. When the new
RMID/CLOSid value is different from the cached values, IA32_PQR_MSR is
updated. Since the measured rdmsr latency for IA32_PQR_MSR is very
high(~250 cycles) this software cache is necessary to avoid reading the
MSR to compare the current CLOSid value.
Signed-off-by: Vikas Shivappa <[email protected]>
---
arch/x86/include/asm/intel_rdt.h | 31 +++++++++++++++---------------
arch/x86/include/asm/rdt_common.h | 13 +++++++++++++
arch/x86/kernel/cpu/perf_event_intel_cqm.c | 20 +++++++------------
3 files changed, 36 insertions(+), 28 deletions(-)
create mode 100644 arch/x86/include/asm/rdt_common.h
diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
index bc57b56..27621c8 100644
--- a/arch/x86/include/asm/intel_rdt.h
+++ b/arch/x86/include/asm/intel_rdt.h
@@ -4,12 +4,13 @@
#ifdef CONFIG_CGROUP_RDT
#include <linux/cgroup.h>
+#include <asm/rdt_common.h>
-#define MSR_IA32_PQR_ASSOC 0xc8f
#define MAX_CBM_LENGTH 32
#define IA32_L3_CBM_BASE 0xc90
#define CBM_FROM_INDEX(x) (IA32_L3_CBM_BASE + x)
-DECLARE_PER_CPU(unsigned int, x86_cpu_clos);
+
+DECLARE_PER_CPU(struct intel_pqr_state, pqr_state);
extern struct static_key rdt_enable_key;
struct rdt_subsys_info {
@@ -64,30 +65,30 @@ static inline struct intel_rdt *task_rdt(struct task_struct *task)
static inline void rdt_sched_in(struct task_struct *task)
{
struct intel_rdt *ir;
- unsigned int clos;
+ struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+ unsigned long flags;
if (!rdt_enabled())
return;
- /*
- * This needs to be fixed after CQM code stabilizes
- * to cache the whole PQR instead of just CLOSid.
- * PQR has closid in high 32 bits and CQM-RMID in low 10 bits.
- * Should not write a 0 to the low 10 bits of PQR
- * and corrupt RMID.
- */
- clos = this_cpu_read(x86_cpu_clos);
-
+ raw_spin_lock_irqsave(&state->lock, flags);
rcu_read_lock();
ir = task_rdt(task);
- if (ir->clos == clos) {
+ if (ir->clos == state->clos) {
rcu_read_unlock();
+ raw_spin_unlock_irqrestore(&state->lock, flags);
return;
}
- wrmsr(MSR_IA32_PQR_ASSOC, 0, ir->clos);
- this_cpu_write(x86_cpu_clos, ir->clos);
+ /*
+ * PQR has closid in high 32 bits and CQM-RMID
+ * in low 10 bits. Rewrite the exsting rmid from
+ * software cache.
+ */
+ wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, ir->clos);
+ state->clos = ir->clos;
rcu_read_unlock();
+ raw_spin_unlock_irqrestore(&state->lock, flags);
}
#else
diff --git a/arch/x86/include/asm/rdt_common.h b/arch/x86/include/asm/rdt_common.h
new file mode 100644
index 0000000..c87f908
--- /dev/null
+++ b/arch/x86/include/asm/rdt_common.h
@@ -0,0 +1,13 @@
+#ifndef _X86_RDT_H_
+#define _X86_RDT_H_
+
+#define MSR_IA32_PQR_ASSOC 0x0c8f
+
+struct intel_pqr_state {
+ raw_spinlock_t lock;
+ int rmid;
+ int clos;
+ int cnt;
+};
+
+#endif
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index 596d1ec..63c52e0 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -7,22 +7,16 @@
#include <linux/perf_event.h>
#include <linux/slab.h>
#include <asm/cpu_device_id.h>
+#include <asm/rdt_common.h>
#include "perf_event.h"
-#define MSR_IA32_PQR_ASSOC 0x0c8f
#define MSR_IA32_QM_CTR 0x0c8e
#define MSR_IA32_QM_EVTSEL 0x0c8d
static unsigned int cqm_max_rmid = -1;
static unsigned int cqm_l3_scale; /* supposedly cacheline size */
-struct intel_cqm_state {
- raw_spinlock_t lock;
- int rmid;
- int cnt;
-};
-
-static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state);
+DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
/*
* Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
@@ -931,7 +925,7 @@ out:
static void intel_cqm_event_start(struct perf_event *event, int mode)
{
- struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
+ struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
unsigned int rmid = event->hw.cqm_rmid;
unsigned long flags;
@@ -948,14 +942,14 @@ static void intel_cqm_event_start(struct perf_event *event, int mode)
WARN_ON_ONCE(state->rmid);
state->rmid = rmid;
- wrmsrl(MSR_IA32_PQR_ASSOC, state->rmid);
+ wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, state->clos);
raw_spin_unlock_irqrestore(&state->lock, flags);
}
static void intel_cqm_event_stop(struct perf_event *event, int mode)
{
- struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
+ struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
unsigned long flags;
if (event->hw.cqm_state & PERF_HES_STOPPED)
@@ -968,7 +962,7 @@ static void intel_cqm_event_stop(struct perf_event *event, int mode)
if (!--state->cnt) {
state->rmid = 0;
- wrmsrl(MSR_IA32_PQR_ASSOC, 0);
+ wrmsr(MSR_IA32_PQR_ASSOC, 0, state->clos);
} else {
WARN_ON_ONCE(!state->rmid);
}
@@ -1213,7 +1207,7 @@ static inline void cqm_pick_event_reader(int cpu)
static void intel_cqm_cpu_prepare(unsigned int cpu)
{
- struct intel_cqm_state *state = &per_cpu(cqm_state, cpu);
+ struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
struct cpuinfo_x86 *c = &cpu_data(cpu);
raw_spin_lock_init(&state->lock);
--
1.9.1
CAT(Cache Allocation Technology) on hsw needs to be enumerated
separately. CAT is only supported on certain HSW SKUs. This patch does
a probe test for hsw CPUs by writing a CLOSid into high 32 bits of
IA32_PQR_MSR and seeing if the bits stick. The probe test is only done
after confirming that the CPU is HSW.
Signed-off-by: Vikas Shivappa <[email protected]>
---
arch/x86/kernel/cpu/intel_rdt.c | 42 +++++++++++++++++++++++++++++++++++++++++
1 file changed, 42 insertions(+)
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 602c580..d61be19 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -38,11 +38,53 @@ DEFINE_PER_CPU(unsigned int, x86_cpu_clos);
#define rdt_for_each_child(pos_css, parent_ir) \
css_for_each_child((pos_css), &(parent_ir)->css)
+/*
+ * hsw_probetest() - Have to do probe
+ * test for Intel haswell CPUs as it does not have
+ * CPUID enumeration support for CAT.
+ *
+ * Probes by writing to the high 32 bits(CLOSid)
+ * of the IA32_PQR_MSR and testing if the bits stick.
+ * Then hardcode the max CLOS and max bitmask length on hsw.
+ */
+
+static inline bool hsw_probetest(void)
+{
+ u32 l, h_old, h_new, h_tmp;
+
+ if (rdmsr_safe(MSR_IA32_PQR_ASSOC, &l, &h_old))
+ return false;
+
+ /*
+ * Default value is always 0 if feature is present.
+ */
+ h_tmp = h_old ^ 0x1U;
+ if (wrmsr_safe(MSR_IA32_PQR_ASSOC, l, h_tmp) ||
+ rdmsr_safe(MSR_IA32_PQR_ASSOC, &l, &h_new))
+ return false;
+
+ if (h_tmp != h_new)
+ return false;
+
+ wrmsr_safe(MSR_IA32_PQR_ASSOC, l, h_old);
+
+ boot_cpu_data.x86_cat_closs = 4;
+ boot_cpu_data.x86_cat_cbmlength = 20;
+
+ return true;
+}
+
static inline bool cat_supported(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_CAT_L3))
return true;
+ /*
+ * Probe test for Haswell CPUs.
+ */
+ if (c->x86 == 6 && c->x86_model == 0x3f)
+ return hsw_probetest();
+
return false;
}
--
1.9.1
This patch adds a description of Cache allocation technology, overview
of kernel implementation and usage of CAT cgroup interface.
Signed-off-by: Vikas Shivappa <[email protected]>
---
Documentation/cgroups/rdt.txt | 184 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 184 insertions(+)
create mode 100644 Documentation/cgroups/rdt.txt
diff --git a/Documentation/cgroups/rdt.txt b/Documentation/cgroups/rdt.txt
new file mode 100644
index 0000000..9f9e445
--- /dev/null
+++ b/Documentation/cgroups/rdt.txt
@@ -0,0 +1,184 @@
+ RDT
+ ---
+
+Copyright (C) 2014 Intel Corporation
+Written by [email protected]
+(based on contents and format from cpusets.txt)
+
+CONTENTS:
+=========
+
+1. Cache Allocation Technology
+ 1.1 What is RDT and CAT ?
+ 1.2 Why is CAT needed ?
+ 1.3 CAT implementation overview
+ 1.4 Assignment of CBM and CLOS
+ 1.5 Scheduling and Context Switch
+2. Usage Examples and Syntax
+
+1. Cache Allocation Technology(CAT)
+===================================
+
+1.1 What is RDT and CAT
+-----------------------
+
+CAT is a part of Resource Director Technology(RDT) or Platform Shared
+resource control which provides support to control Platform shared
+resources like cache. Currently Cache is the only resource that is
+supported in RDT.
+
+Cache Allocation Technology provides a way for the Software (OS/VMM)
+to restrict cache allocation to a defined 'subset' of cache which may
+be overlapping with other 'subsets'. This feature is used when
+allocating a line in cache ie when pulling new data into the cache.
+The programming of the h/w is done via programming MSRs.
+
+The different cache subsets are identified by CLOS identifier (class
+of service) and each CLOS has a CBM (cache bit mask). The CBM is a
+contiguous set of bits which defines the amount of cache resource that
+is available for each 'subset'.
+
+1.2 Why is CAT needed
+---------------------
+
+The CAT enables more cache resources to be made available for higher
+priority applications based on guidance from the execution
+environment.
+
+The architecture also allows dynamically changing these subsets during
+runtime to further optimize the performance of the higher priority
+application with minimal degradation to the low priority app.
+Additionally, resources can be rebalanced for system throughput
+benefit. (Refer to Section 17.15 in the Intel SDM)
+
+This technique may be useful in managing large computer systems with
+large LLC. Examples may be large servers running instances of
+webservers or database servers. In such complex systems, these subsets
+can be used for more careful placing of the available cache
+resources.
+
+The CAT kernel patch would provide a basic kernel framework for users
+to be able to implement such cache subsets.
+
+1.3 CAT implementation Overview
+-------------------------------
+
+Kernel implements a cgroup subsystem to support cache allocation
+
+Each cgroup has a CLOSid <-> CBM(cache bit mask) mapping.
+A CLOS(Class of service) is represented by a CLOSid. CLOSid is internal
+to the kernel and not exposed to user. Each cgroup would have one CBM
+and would just represent one cache 'subset'.
+
+The cgroup follows the cgroup hierarchy; mkdir and adding tasks to the
+cgroup never fails. When a child cgroup is created it inherits the
+CLOSid and the CBM from its parent. When a user changes the default
+CBM for a cgroup, a new CLOSid may be allocated if the CBM was not
+used before. The changing of 'cbm' may fail with -ENOSPC once the
+kernel runs out of maximum CLOSids it can support.
+User can create as many cgroups as he wants but having different CBMs
+at the same time is restricted by the maximum number of CLOSids
+(multiple cgroups can have the same CBM).
+Kernel maintains a CLOSid<->cbm mapping which keeps reference counter
+for each cgroup using a CLOSid.
+
+The tasks in the cgroup would get to fill the LLC cache represented by
+the cgroup's 'cbm' file.
+
+Root directory would have all available bits set in 'cbm' file by
+default.
+
+1.4 Assignment of CBM,CLOS
+--------------------------
+
+The 'cbm' needs to be a subset of the parent node's 'cbm'.
+Any contiguous subset of these bits(with a minimum of 2 bits) may be
+set to indicate the cache mapping desired. The 'cbm' between 2
+directories can overlap. The 'cbm' would represent the cache 'subset'
+of the CAT cgroup.
+For ex: on a system with 16 bits of max cbm bits,
+if the directory has the least significant 4 bits set in its 'cbm'
+file(meaning the 'cbm' is just 0xf), it
+would be allocated the right quarter of the Last level cache which
+means the tasks belonging to this CAT cgroup can use the right quarter
+of the cache to fill. If it has the most significant 8 bits set, it
+would be allocated the left half of the cache(8 bits out of 16
+represents 50%).
+
+The cache portion defined in the CBM file is available to all tasks
+within the cgroup to fill and these task are not allowed to allocate
+space in other parts of the cache.
+
+1.5 Scheduling and Context Switch
+---------------------------------
+
+During context switch kernel implements this by writing the
+CLOSid (internally maintained by kernel) of the cgroup to which the task
+belongs to the CPU's IA32_PQR_ASSOC MSR. The MSR is only written when
+there is a change in the CLOSid for the CPU in order to minimize the
+latency incurred during context switch.
+
+2. Usage examples and syntax
+============================
+
+To check if CAT was enabled on your system
+
+dmesg | grep -i intel_rdt
+should output : intel_rdt: cbmlength:xx, Closs:xx
+the length of cbm and CLOS should depend on the system you use.
+
+
+Following would mount the cache allocation cgroup subsystem and create
+2 directories. Please refer to Documentation/cgroups/cgroups.txt on
+details about how to use cgroups.
+
+ cd /sys/fs/cgroup
+ mkdir rdt
+ mount -t cgroup -ordt rdt /sys/fs/cgroup/rdt
+ cd rdt
+
+Create 2 rdt cgroups
+
+ mkdir group1
+ mkdir group2
+
+Following are some of the Files in the directory
+
+ ls
+ rdt.cbm
+ tasks
+
+Say if the cache is 2MB and cbm supports 16 bits, then setting the
+below allocates the 'right 1/4th(512KB)' of the cache to group2
+
+Edit the CBM for group2 to set the least significant 4 bits. This
+allocates 'right quarter' of the cache.
+
+ cd group2
+ /bin/echo 0xf > rdt.cbm
+
+
+Edit the CBM for group2 to set the least significant 8 bits. This
+allocates the right half of the cache to 'group2'.
+
+ cd group2
+ /bin/echo 0xff > rdt.cbm
+
+Assign tasks to the group2
+
+ /bin/echo PID1 > tasks
+ /bin/echo PID2 > tasks
+
+ Meaning now threads
+ PID1 and PID2 get to fill the 'right half' of
+ the cache as they belong to cgroup group2.
+
+Create a group under group2
+
+ cd group2
+ mkdir group21
+ cat rdt.cbm
+ 0xff - inherits parents mask.
+
+ /bin/echo 0xfff > rdt.cbm - throws error as the mask has to be a subset of the parent's mask
+
--
1.9.1
On Tue, Feb 24, 2015 at 03:16:38PM -0800, Vikas Shivappa wrote:
> This patch adds support for the new Cache Allocation Technology (CAT)
> feature found in future Intel Xeon processors. CAT is part of Intel
> Resource Director Technology(RDT) which enables sharing of processor
> resources. This patch includes CPUID enumeration routines for CAT and
> new values to track CAT resources to the cpuinfo_x86 structure.
>
> Cache Allocation Technology(CAT) provides a way for the Software
> (OS/VMM) to restrict cache allocation to a defined 'subset' of cache
> which may be overlapping with other 'subsets'. This feature is used
> when allocating a line in cache ie when pulling new data into the cache.
> The programming of the h/w is done via programming MSRs.
>
> More information about CAT be found in the Intel (R) x86 Architecture
> Software Developer Manual, section 17.15.
>
> Signed-off-by: Vikas Shivappa <[email protected]>
> ---
> arch/x86/include/asm/cpufeature.h | 6 ++++-
> arch/x86/include/asm/processor.h | 3 +++
> arch/x86/kernel/cpu/Makefile | 1 +
> arch/x86/kernel/cpu/common.c | 15 ++++++++++++
> arch/x86/kernel/cpu/intel_rdt.c | 51 +++++++++++++++++++++++++++++++++++++++
> init/Kconfig | 11 +++++++++
> 6 files changed, 86 insertions(+), 1 deletion(-)
> create mode 100644 arch/x86/kernel/cpu/intel_rdt.c
>
> diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
> index 54fd8eb..d97b785 100644
> --- a/arch/x86/include/asm/cpufeature.h
> +++ b/arch/x86/include/asm/cpufeature.h
> @@ -12,7 +12,7 @@
> #include <asm/disabled-features.h>
> #endif
>
> -#define NCAPINTS 13 /* N 32-bit words worth of info */
> +#define NCAPINTS 14 /* N 32-bit words worth of info */
> #define NBUGINTS 1 /* N 32-bit bug flags */
>
> /*
> @@ -227,6 +227,7 @@
> #define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */
> #define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */
> #define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */
> +#define X86_FEATURE_RDT ( 9*32+15) /* Resource Allocation */
> #define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */
> #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
> #define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */
> @@ -248,6 +249,9 @@
> /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
> #define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
>
> +/* Intel-defined CPU features, CPUID level 0x00000010:0 (ebx), word 13 */
> +#define X86_FEATURE_CAT_L3 (13*32 + 1) /*Cache QOS Enforcement L3*/
^^^^
Spaces between comment markers and text please.
> +
> /*
> * BUG word(s)
> */
> diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
> index 242ceed..81d95ac 100644
> --- a/arch/x86/include/asm/processor.h
> +++ b/arch/x86/include/asm/processor.h
> @@ -114,6 +114,9 @@ struct cpuinfo_x86 {
> int x86_cache_occ_scale; /* scale to bytes */
> int x86_power;
> unsigned long loops_per_jiffy;
> + /* Cache Allocation Technology values */
> + int x86_cat_cbmlength;
> + int x86_cat_closs;
Do I see it correctly, those two can be u16 each?
> /* cpuid returned max cores value: */
> u16 x86_max_cores;
> u16 apicid;
> diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
> index 6c1ca13..6c91e39 100644
> --- a/arch/x86/kernel/cpu/Makefile
> +++ b/arch/x86/kernel/cpu/Makefile
> @@ -47,6 +47,7 @@ obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \
> perf_event_intel_uncore_nhmex.o
> endif
>
> +obj-$(CONFIG_CGROUP_RDT) +=intel_rdt.o
>
> obj-$(CONFIG_X86_MCE) += mcheck/
> obj-$(CONFIG_MTRR) += mtrr/
> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
> index 9b0fb70..c5ea1dd 100644
> --- a/arch/x86/kernel/cpu/common.c
> +++ b/arch/x86/kernel/cpu/common.c
> @@ -668,6 +668,21 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
> }
> }
>
> + /* Additional Intel-defined flags: level 0x00000010 */
> + if (c->cpuid_level >= 0x00000010) {
> + u32 eax, ebx, ecx, edx;
> +
> + cpuid_count(0x00000010, 0, &eax, &ebx, &ecx, &edx);
> + c->x86_capability[13] = ebx;
> +
> + if (cpu_has(c, X86_FEATURE_CAT_L3)) {
> +
> + cpuid_count(0x00000010, 1, &eax, &ebx, &ecx, &edx);
> + c->x86_cat_closs = (edx & 0xffff) + 1;
> + c->x86_cat_cbmlength = (eax & 0xf) + 1;
> + }
> + }
> +
> /* AMD-defined flags: level 0x80000001 */
> xlvl = cpuid_eax(0x80000000);
> c->extended_cpuid_level = xlvl;
> diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
> new file mode 100644
> index 0000000..46ce449
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/intel_rdt.c
> @@ -0,0 +1,51 @@
> +/*
> + * Resource Director Technology(RDT) code
> + *
> + * Copyright (C) 2014 Intel Corporation
> + *
> + * 2014-09-10 Written by Vikas Shivappa
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
> + * more details.
> + *
> + * More information about RDT be found in the Intel (R) x86 Architecture
> + * Software Developer Manual, section 17.15.
> + */
> +
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +
> +#include <linux/slab.h>
> +#include <linux/err.h>
> +#include <linux/spinlock.h>
> +
> +static inline bool rdt_supported(struct cpuinfo_x86 *c)
> +{
> + if (cpu_has(c, X86_FEATURE_RDT))
> + return true;
> +
> + return false;
> +}
> +
> +static int __init rdt_late_init(void)
> +{
> + struct cpuinfo_x86 *c = &boot_cpu_data;
> + int maxid, cbm_len;
> +
> + if (!rdt_supported(c))
you can do cpu_has() directly here instead of the custom wrapper and
drop that rdt_supported() thing.
> + return -ENODEV;
> +
> + maxid = c->x86_cat_closs;
> + cbm_len = c->x86_cat_cbmlength;
No need for those local variables, just use c->...
> +
> + pr_info("cbmlength:%u,Closs: %u\n", cbm_len, maxid);
This text message needs to be much more user-friendly if it is going out
to the console unconditionally.
> +
> + return 0;
> +}
> +
> +late_initcall(rdt_late_init);
Btw, this could all fit nicely in arch/x86/kernel/cpu/intel.c AFAICT
instead of adding a separate file and you probably don't even need the
late_initcall() even...
> diff --git a/init/Kconfig b/init/Kconfig
> index 9afb971..c5004b3 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -961,6 +961,17 @@ config CPUSETS
>
> Say N if unsure.
>
> +config CGROUP_RDT
> + bool "Resource Director Technology cgroup subsystem"
> + depends on X86_64
depends on X86_64 && CPU_SUP_INTEL
Also, this should probably also depend on CGROUP-something or so
AFAICT...
> + help
> + This option provides framework to allocate resources like
> + cache lines when applications fill cache.
> + This can be used by users to configure how much cache
> + that can be allocated to different applications.
This help text doesn't really help me if I'm Joe User.
--
Regards/Gruss,
Boris.
ECO tip #101: Trim your mails when you reply.
--
On Tue, 24 Feb 2015, Borislav Petkov wrote:
> On Tue, Feb 24, 2015 at 03:16:38PM -0800, Vikas Shivappa wrote:
>> -#define NCAPINTS 13 /* N 32-bit words worth of info */
>> +#define NCAPINTS 14 /* N 32-bit words worth of info */
>> #define NBUGINTS 1 /* N 32-bit bug flags */
>>
>> /*
>> @@ -227,6 +227,7 @@
>> #define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */
>> #define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */
>> #define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */
>> +#define X86_FEATURE_RDT ( 9*32+15) /* Resource Allocation */
>> #define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */
>> #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
>> #define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */
>> @@ -248,6 +249,9 @@
>> /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
>> #define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
>>
>> +/* Intel-defined CPU features, CPUID level 0x00000010:0 (ebx), word 13 */
>> +#define X86_FEATURE_CAT_L3 (13*32 + 1) /*Cache QOS Enforcement L3*/
> ^^^^
> Spaces between comment markers and text please.
Will fix.
>
>> +
>> /*
>> * BUG word(s)
>> */
>> diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
>> index 242ceed..81d95ac 100644
>> --- a/arch/x86/include/asm/processor.h
>> +++ b/arch/x86/include/asm/processor.h
>> @@ -114,6 +114,9 @@ struct cpuinfo_x86 {
>> int x86_cache_occ_scale; /* scale to bytes */
>> int x86_power;
>> unsigned long loops_per_jiffy;
>> + /* Cache Allocation Technology values */
>> + int x86_cat_cbmlength;
>> + int x86_cat_closs;
>
> Do I see it correctly, those two can be u16 each?
Yes , this can be u16 as the cbmlength and the number of clos are 4 and 16 bits
only. Will make the change
>
>> /* cpuid returned max cores value: */
>> u16 x86_max_cores;
>> u16 apicid;
>> diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
>> index 6c1ca13..6c91e39 100644
>> --- a/arch/x86/kernel/cpu/Makefile
>> +++ b/arch/x86/kernel/cpu/Makefile
>> @@ -47,6 +47,7 @@ obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \
>> perf_event_intel_uncore_nhmex.o
>> endif
>>
>> +obj-$(CONFIG_CGROUP_RDT) +=intel_rdt.o
>>
>> obj-$(CONFIG_X86_MCE) += mcheck/
>> obj-$(CONFIG_MTRR) += mtrr/
>> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
>> index 9b0fb70..c5ea1dd 100644
>> --- a/arch/x86/kernel/cpu/common.c
>> +++ b/arch/x86/kernel/cpu/common.c
>> @@ -668,6 +668,21 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
>> }
>> }
>>
>> + /* Additional Intel-defined flags: level 0x00000010 */
>> + if (c->cpuid_level >= 0x00000010) {
>> + u32 eax, ebx, ecx, edx;
>> +
>> + cpuid_count(0x00000010, 0, &eax, &ebx, &ecx, &edx);
>> + c->x86_capability[13] = ebx;
>> +
>> + if (cpu_has(c, X86_FEATURE_CAT_L3)) {
>> +
>> + cpuid_count(0x00000010, 1, &eax, &ebx, &ecx, &edx);
>> + c->x86_cat_closs = (edx & 0xffff) + 1;
>> + c->x86_cat_cbmlength = (eax & 0xf) + 1;
>> + }
>> + }
>> +
>> /* AMD-defined flags: level 0x80000001 */
>> xlvl = cpuid_eax(0x80000000);
>> c->extended_cpuid_level = xlvl;
>> diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
>> new file mode 100644
>> index 0000000..46ce449
>> --- /dev/null
>> +++ b/arch/x86/kernel/cpu/intel_rdt.c
>> @@ -0,0 +1,51 @@
>> +/*
>> + * Resource Director Technology(RDT) code
>> + *
>> + * Copyright (C) 2014 Intel Corporation
>> + *
>> + * 2014-09-10 Written by Vikas Shivappa
>> + *
>> + * This program is free software; you can redistribute it and/or modify it
>> + * under the terms and conditions of the GNU General Public License,
>> + * version 2, as published by the Free Software Foundation.
>> + *
>> + * This program is distributed in the hope it will be useful, but WITHOUT
>> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
>> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
>> + * more details.
>> + *
>> + * More information about RDT be found in the Intel (R) x86 Architecture
>> + * Software Developer Manual, section 17.15.
>> + */
>> +
>> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
>> +
>> +#include <linux/slab.h>
>> +#include <linux/err.h>
>> +#include <linux/spinlock.h>
>> +
>> +static inline bool rdt_supported(struct cpuinfo_x86 *c)
>> +{
>> + if (cpu_has(c, X86_FEATURE_RDT))
>> + return true;
>> +
>> + return false;
>> +}
>> +
>> +static int __init rdt_late_init(void)
>> +{
>> + struct cpuinfo_x86 *c = &boot_cpu_data;
>> + int maxid, cbm_len;
>> +
>> + if (!rdt_supported(c))
>
> you can do cpu_has() directly here instead of the custom wrapper and
> drop that rdt_supported() thing.
>
>> + return -ENODEV;
>> +
>> + maxid = c->x86_cat_closs;
>> + cbm_len = c->x86_cat_cbmlength;
>
> No need for those local variables, just use c->...
The above two are due to my bad habit of splitting the patches from the full
implementation :) - these functions and variables come to use in the next patches
in the series. But this can be changed if needed..
>
>> +
>> + pr_info("cbmlength:%u,Closs: %u\n", cbm_len, maxid);
>
> This text message needs to be much more user-friendly if it is going out
> to the console unconditionally.
>
bit mask length: number of CLOSids: ? . it should print with the module name as
well which should help understand what it is for.
>> +
>> + return 0;
>> +}
>> +
>> +late_initcall(rdt_late_init);
>
> Btw, this could all fit nicely in arch/x86/kernel/cpu/intel.c AFAICT
> instead of adding a separate file and you probably don't even need the
> late_initcall() even...
RDT would be the common shared resource control support in intel architecture
and currently only has cache resource supported but can be easily extended to
include more resources as documented in the SDM. We have a cgroup subsystem for
rdt and donot want this to be enabled by default.
lateinit call is to ensure that this is called after the cgroup init and hence
can be disabled if the rdt h/w support is not present - This is related to
another caveat that cgroup_init cant handle failure gracefully , so we have to
return success during the first css_alloc and then fail the late_initcall ..
>
>> diff --git a/init/Kconfig b/init/Kconfig
>> index 9afb971..c5004b3 100644
>> --- a/init/Kconfig
>> +++ b/init/Kconfig
>> @@ -961,6 +961,17 @@ config CPUSETS
>>
>> Say N if unsure.
>>
>> +config CGROUP_RDT
>> + bool "Resource Director Technology cgroup subsystem"
>> + depends on X86_64
>
> depends on X86_64 && CPU_SUP_INTEL
>
> Also, this should probably also depend on CGROUP-something or so
> AFAICT...
This is with in the if CGROUPS
>
>> + help
>> + This option provides framework to allocate resources like
>> + cache lines when applications fill cache.
>> + This can be used by users to configure how much cache
>> + that can be allocated to different applications.
>
> This help text doesn't really help me if I'm Joe User.
>
ok,Will try to make it more readable.
Regards,
Vikas
> --
> Regards/Gruss,
> Boris.
>
> ECO tip #101: Trim your mails when you reply.
> --
>
On Tue, Feb 24, 2015 at 03:16:37PM -0800, Vikas Shivappa wrote:
> This patch adds a new cgroup subsystem to support the new Cache Allocation
> Technology (CAT) feature found in future Intel Xeon Intel processors. CAT is
> part of Resource Director Technology(RDT) or Platform Shared resource control
> which provides support to control Platform shared resources like cache.
Totally insane naming all of that. Note that if you google for "intel
rdt" it suggests you search for "intel rst" and this patch is the only
link it finds that's somewhat close.
The CAT thing was annoying already, but at least one can find that in
the SDM, this RDT thing, not a single mention.
On Tue, Feb 24, 2015 at 04:42:10PM -0800, Vikas Shivappa wrote:
> >>+
> >>+ pr_info("cbmlength:%u,Closs: %u\n", cbm_len, maxid);
> >
> >This text message needs to be much more user-friendly if it is going out
> >to the console unconditionally.
> >
>
> bit mask lengh: number of CLOSids: ? . it should print with the module name
> as well which should help understand what it is for.
Right, if I haven't read the SDM on RDT, I still don't understand what
those mean. What is the need for that message at all, what is it telling
me?
Can you show an example from a machine with RDT and explain what it is
good for?
> >>+config CGROUP_RDT
> >>+ bool "Resource Director Technology cgroup subsystem"
> >>+ depends on X86_64
> >
> >depends on X86_64 && CPU_SUP_INTEL
> >
> >Also, this should probably also depend on CGROUP-something or so
> >AFAICT...
>
> This is with in the if CGROUPS
Right, but you still need the CPU_SUP_INTEL dependency as !Intel x86
doesn't need that code built.
Thanks.
--
Regards/Gruss,
Boris.
ECO tip #101: Trim your mails when you reply.
--
> The CAT thing was annoying already, but at least one can find that in
> the SDM, this RDT thing, not a single mention.
The problems of development at the bleeding edge. Would you rather Linux
sat on the sidelines until there are enough Google hits from other users of
new features?
I did get one hit for a search for: intel "resource director technology" ... but
it only has one line in a table with a 50,000' view of what it does.
http://www.intel.com/content/dam/www/public/us/en/documents/guides/xeon-intel-server-processor-comparison-guide.pdf
Technology: Intel Resource Director Technology
Description: Allows the hypervisor to monitor Last Level Cache usage at the application
and VM levels.
Benefit: Helps to improve performance and efficiency by providing better
information for scheduling, load balancing, and workload migration
Which isn't any help in evaluating this patch series :-(
-Tony
* Luck, Tony <[email protected]> wrote:
> > The CAT thing was annoying already, but at least one
> > can find that in the SDM, this RDT thing, not a single
> > mention.
>
> The problems of development at the bleeding edge. Would
> you rather Linux sat on the sidelines until there are
> enough Google hits from other users of new features?
Well, we'd prefer there to be A) published documentation,
or, lacking published documentation, there be B) a coherent
technical description within the code itself what the
purpose is and how it all works conceptually (minus the
buzzwords), so that we have a common starting point when
reviewing it.
> Technology: Intel Resource Director Technology
>
> Description: Allows the hypervisor to monitor Last Level Cache usage at the application
> and VM levels.
>
> Benefit: Helps to improve performance and efficiency by providing better
> information for scheduling, load balancing, and workload migration
>
> Which isn't any help in evaluating this patch series :-(
No, but it already tells us more than the 0/7 description
of the patch series did! It should be possible to improve
on that.
Maintainers reverse engineering the implementation is an
inefficient approach.
Thanks,
Ingo
On Thu, 26 Feb 2015, Ingo Molnar wrote:
>
> * Luck, Tony <[email protected]> wrote:
>
>>> The CAT thing was annoying already, but at least one
>>> can find that in the SDM, this RDT thing, not a single
>>> mention.
>>
>> The problems of development at the bleeding edge. Would
>> you rather Linux sat on the sidelines until there are
>> enough Google hits from other users of new features?
>
> Well, we'd prefer there to be A) published documentation,
> or, lacking published documentation, there be B) a coherent
> technical description within the code itself what the
> purpose is and how it all works conceptually (minus the
> buzzwords), so that we have a common starting point when
> reviewing it.
Will add a description with in the code if that helps.
Thanks,
Vikas
On Thu, 26 Feb 2015, Vikas Shivappa wrote:
>
>
> On Thu, 26 Feb 2015, Ingo Molnar wrote:
>
>>
>> * Luck, Tony <[email protected]> wrote:
>>
>>>> The CAT thing was annoying already, but at least one
>>>> can find that in the SDM, this RDT thing, not a single
>>>> mention.
>>>
>>> The problems of development at the bleeding edge. Would
>>> you rather Linux sat on the sidelines until there are
>>> enough Google hits from other users of new features?
>>
>> Well, we'd prefer there to be A) published documentation,
>> or, lacking published documentation, there be B) a coherent
>> technical description within the code itself what the
>> purpose is and how it all works conceptually (minus the
>> buzzwords), so that we have a common starting point when
>> reviewing it.
>
> Will add a description with in the code if that helps.
There is a reference to the relevant Intel SDM section mentioned in the
code
" * More information about RDT be found in the Intel (R) x86 Architecture
* Software Developer Manual, section 17.15."
>
> Thanks,
> Vikas
>
>
On Wed, 25 Feb 2015, Borislav Petkov wrote:
> On Tue, Feb 24, 2015 at 04:42:10PM -0800, Vikas Shivappa wrote:
>>>> +
>>>> + pr_info("cbmlength:%u,Closs: %u\n", cbm_len, maxid);
>>>
>>> This text message needs to be much more user-friendly if it is going out
>>> to the console unconditionally.
>>>
>>
>> bit mask lengh: number of CLOSids: ? . it should print with the module name
>> as well which should help understand what it is for.
>
> Right, if I haven't read the SDM on RDT, I still don't understand what
> those mean. What is the need for that message at all, what is it telling
> me?
>
> Can you show an example from a machine with RDT and explain what it is
> good for?
This would be an indication that the system supports RDT. On a system with RDT
you would see a print.
intel_rdt: cbmlength: xx , CLOss:xx
This is documented in the RDT documentation that is added in the patch and the
code also mentions the Intel SDM section which details the feature. The RDT
is expected to be used by advanced users, at least the ones who would use the
cgroup RDT interface , knows about the class of service , bit mask etc.. The use
cases are also documented in the RDT document in cgroups/rdt.txt (the last
patch in this series)
>
>>>> +config CGROUP_RDT
>>>> + bool "Resource Director Technology cgroup subsystem"
>>>> + depends on X86_64
>>>
>>> depends on X86_64 && CPU_SUP_INTEL
>>>
>>> Also, this should probably also depend on CGROUP-something or so
>>> AFAICT...
>>
>> This is with in the if CGROUPS
>
> Right, but you still need the CPU_SUP_INTEL dependency as !Intel x86
> doesn't need that code built.
Will add this dependency..
>
> Thanks.
>
> --
> Regards/Gruss,
> Boris.
>
> ECO tip #101: Trim your mails when you reply.
> --
>
On Thu, Feb 26, 2015 at 10:19:42AM -0800, Vikas Shivappa wrote:
> This would be an indication that the System support RDT. On a system with
> RDT would see a print.
>
> intel_rdt: cbmlength: xx , CLOss:xx
Ok, so I have a capacity bitmask of length xx and yy classes of service.
And?
Are you expecting for tools or experienced users to grep dmesg to find
that information?
Uh, but what happens on a machine which has a small log buffer and which
has wrapped around and that information has been overwritten?
See what I mean?
If you really want to communicate this information to someone, you
should use more robust methods like make userspace use CPUID directly or
expose that information in sysfs if CPUID is not an option (but I can't
imagine why it wouldn't be).
This flaky message which can get overwritten and gets used only by a
small percentage of people(?) (I haven't reached the part which tells
me the use cases for that resource management yet) is purely useless in
dmesg.
Even /proc/cpuinfo, which will have "rdt" et all in there according to
the defines you're adding, would be a much better way to detect what's
supported quickly than the message.
HTH.
--
Regards/Gruss,
Boris.
ECO tip #101: Trim your mails when you reply.
--
On Thu, 26 Feb 2015, Borislav Petkov wrote:
> On Thu, Feb 26, 2015 at 10:19:42AM -0800, Vikas Shivappa wrote:
>> This would be an indication that the System support RDT. On a system with
>> RDT would see a print.
>>
>> intel_rdt: cbmlength: xx , CLOss:xx
>
> Ok, so I have a capacity bitmask of length xx and yy classes of service.
> And?
>
> Are you expecting for tools or experienced users to grep dmesg to find
> that information?
>
> Uh, but what happens on a machine which has a small log buffer and which
> has wrapped around and that information has been overwritten?
Yes, this is not the only way to see if the feature is enabled. It can be seen
in cpuinfo like you mention below.
The root's cbm mask represents the max cbm length already -
that can be seen by the user as defined in the documentation.
It is under consideration to add max closids or something like the number of clos ids
available to be shown in the root cgroup once there are more resources and more
such parameters required to be exposed to the user. It would be easier to view the
resources like CPUID availability through the cgroup interface itself rather than
add another interface for the same.
>
> See what I mean?
>
> If you really want to communicate this information to someone, you
> should use more robust methods like make userspace use CPUID directly or
> expose that information in sysfs if CPUID is not an option (but I can't
> imagine why it wouldn't be).
>
> This flaky message which can get overwritten and gets used only by a
> small percentage of people(?) (I haven't reached the part which tells
> me the use cases for that resource management yet) is purely useless in
> dmesg.
>
> Even /proc/cpuinfo, which will have "rdt" et all in there according to
> the defines you're adding, would be a much better way to detect what's
> supported quickly than the message.
>
> HTH.
>
> --
> Regards/Gruss,
> Boris.
>
> ECO tip #101: Trim your mails when you reply.
> --
>
On Thu, Feb 26, 2015 at 11:12:28AM -0800, Vikas Shivappa wrote:
> It would be easier to view the resources like CPUID availability
> through cgroup interface itself rather than add an other interface for
> the same.
Right, exposing that info in the same place where it is being
used/controlled makes most sense to me.
--
Regards/Gruss,
Boris.
ECO tip #101: Trim your mails when you reply.
--
On 25 February 2015 at 00:16, Vikas Shivappa
<[email protected]> wrote:
> +1.2 Why is CAT needed
> +---------------------
> +
> +The CAT enables more cache resources to be made available for higher
> +priority applications based on guidance from the execution
> +environment.
> +
> +The architecture also allows dynamically changing these subsets during
> +runtime to further optimize the performance of the higher priority
> +application with minimal degradation to the low priority app.
> +Additionally, resources can be rebalanced for system throughput
> +benefit. (Refer to Section 17.15 in the Intel SDM)
> +
> +This technique may be useful in managing large computer systems which
> +large LLC. Examples may be large servers running instances of
> +webservers or database servers. In such complex systems, these subsets
> +can be used for more careful placing of the available cache
> +resources.
> +
> +The CAT kernel patch would provide a basic kernel framework for users
> +to be able to implement such cache subsets.
Last paragraph can be deleted. If functionality is merged it is not a
patch anymore.
Hello,
On Tue, Feb 24, 2015 at 03:16:40PM -0800, Vikas Shivappa wrote:
> Add support for cache bit mask manipulation. The change adds a file to
> the RDT cgroup which represents the CBM(cache bit mask) for the cgroup.
>
> The RDT cgroup follows cgroup hierarchy ,mkdir and adding tasks to the
> cgroup never fails. When a child cgroup is created it inherits the
> CLOSid and the CBM from its parent. When a user changes the default
> CBM for a cgroup, a new CLOSid may be allocated if the CBM was not
> used before. If the new CBM is the one that is already used, the
> count for that CLOSid<->CBM is incremented. The changing of 'cbm'
> may fail with -ENOSPC once the kernel runs out of maximum CLOSids it
> can support.
> User can create as many cgroups as he wants but having different CBMs
> at the same time is restricted by the maximum number of CLOSids
> (multiple cgroups can have the same CBM).
> Kernel maintains a CLOSid<->cbm mapping which keeps count
> of cgroups using a CLOSid.
>
> The tasks in the CAT cgroup would get to fill the LLC cache represented
> by the cgroup's 'cbm' file.
>
> Reuse of CLOSids for cgroups with same bitmask also has following
> advantages:
> - This helps to use the scant CLOSids optimally.
> - This also implies that during context switch, write to PQR-MSR is done
> only when a task with a different bitmask is scheduled in.
I feel a bit underwhelmed about this new controller and its interface.
It is evidently at a lot lower level and way more niche than what
other controllers are doing, even cpuset. At the same time, as long
as it's well isolated, it piggybacking on cgroup should be okay. I
take it that the feature implemented is too coarse to allow for weight
based distribution?
Thanks.
--
tejun
On Fri, Feb 27, 2015 at 07:12:22AM -0500, Tejun Heo wrote:
> I feel a bit underwhelmed about this new controller and its interface.
> It is evidently at a lot lower level and way more niche than what
> other controllers are doing, even cpuset. At the same time, as long
> as it's well isolated, it piggybacking on cgroup should be okay. I
> take it that the feature implemented is too coarse to allow for weight
> based distribution?
And, Ingo, Peter, are you guys in general agreeing with this addition?
As Tony said, we don't wanna be left way behind but that doesn't mean
we wanna jump on everything giving off the faintest sign of movement,
which sadly has happened often enough in the storage area at least.
Thanks.
--
tejun
Hello Tejun,
On Fri, 27 Feb 2015, Tejun Heo wrote:
> Hello,
>
> On Tue, Feb 24, 2015 at 03:16:40PM -0800, Vikas Shivappa wrote:
>> Add support for cache bit mask manipulation. The change adds a file to
>> the RDT cgroup which represents the CBM(cache bit mask) for the cgroup.
>>
>> The RDT cgroup follows cgroup hierarchy ,mkdir and adding tasks to the
>> cgroup never fails. When a child cgroup is created it inherits the
>> CLOSid and the CBM from its parent. When a user changes the default
>> CBM for a cgroup, a new CLOSid may be allocated if the CBM was not
>> used before. If the new CBM is the one that is already used, the
>> count for that CLOSid<->CBM is incremented. The changing of 'cbm'
>> may fail with -ENOSPC once the kernel runs out of maximum CLOSids it
>> can support.
>> User can create as many cgroups as he wants but having different CBMs
>> at the same time is restricted by the maximum number of CLOSids
>> (multiple cgroups can have the same CBM).
>> Kernel maintains a CLOSid<->cbm mapping which keeps count
>> of cgroups using a CLOSid.
>>
>> The tasks in the CAT cgroup would get to fill the LLC cache represented
>> by the cgroup's 'cbm' file.
>>
>> Reuse of CLOSids for cgroups with same bitmask also has following
>> advantages:
>> - This helps to use the scant CLOSids optimally.
>> - This also implies that during context switch, write to PQR-MSR is done
>> only when a task with a different bitmask is scheduled in.
>
> I feel a bit underwhelmed about this new controller and its interface.
> It is evidently at a lot lower level and way more niche than what
> other controllers are doing, even cpuset. At the same time, as long
> as it's well isolated, it piggybacking on cgroup should be okay.
This cgroup subsystem would basically let the user partition one of the platform
shared resources, the LLC cache. This could be extended in the future to partition
more shared resources when there is hardware support; that way we may eventually have more
files in the cgroup. RDT is a generic term for platform resource sharing.
For more information you can refer to section 17.15 of Intel SDM.
We did go through quite a bit of discussion on lkml regarding adding
the cgroup interface for CAT, and the patches were posted only after that.
This cgroup would not interact with other cgroups in the sense that it would not modify
or add any elements to existing cgroups - there was such a proposal but it was
removed as we did not get agreement on lkml.
the original lkml thread is here from 10/2014 for your reference -
https://lkml.org/lkml/2014/10/16/568
I
> take it that the feature implemented is too coarse to allow for weight
> based distribution?
>
Could you please clarify more on this? However, there is a limitation from
hardware that there have to be a minimum of 2 bits in the cbm, if that's what you
referred to. Otherwise the bits in the cbm directly map to the number of cache
ways and hence the cache capacity.
Thanks,
Vikas
> Thanks.
>
> --
> tejun
>
Hello, Vikas.
On Fri, Feb 27, 2015 at 11:34:16AM -0800, Vikas Shivappa wrote:
> This cgroup subsystem would basically let the user partition one of the
> Platform shared resource , the LLC cache. This could be extended in future
I suppose LLC means last level cache? It'd be great if you can spell
out the full term when the abbreviation is first referenced in the
comments or documentation.
> to partition more shared resources when there is hardware support that way
> we may eventually have more files in the cgroup. RDT is a generic term for
> platform resource sharing.
> For more information you can refer to section 17.15 of Intel SDM.
> We did go through quite a bit of discussion on lkml regarding adding the
> cgroup interface for CAT and the patches were posted only after that.
> This cgroup would not interact with other cgroups in the sense would not
> modify or add any elements to existing cgroups - there was such a proposal
> but was removed as we did not get agreement on lkml.
>
> the original lkml thread is here from 10/2014 for your reference -
> https://lkml.org/lkml/2014/10/16/568
Yeap, I followed that thread and this being a separate controller
definitely makes a lot more sense.
> I
> >take it that the feature implemented is too coarse to allow for weight
> >based distribution?
> >
> Could you please clarify more on this ? However there is a limitation from
> hardware that there have to be a minimum of 2 bits in the cbm if thats what
> you referred to. Otherwise the bits in the cbm directly map to the number of
> cache ways and hence the cache capacity ..
Right, so the granularity is fairly coarse and specifying things like
"distribute cache in 4:2:1 (or even in absolute bytes) to these three
cgroups" wouldn't work at all.
Thanks.
--
tejun
On Fri, 27 Feb 2015, Tejun Heo wrote:
> Hello, Vikas.
>
> On Fri, Feb 27, 2015 at 11:34:16AM -0800, Vikas Shivappa wrote:
>> This cgroup subsystem would basically let the user partition one of the
>> Platform shared resource , the LLC cache. This could be extended in future
>
> I suppose LLC means last level cache? It'd be great if you can spell
> out the full term when the abbreviation is first referenced in the
> comments or documentation.
>
Yes that's last level cache. Will update documentation/comments if any.
>> to partition more shared resources when there is hardware support that way
>> we may eventually have more files in the cgroup. RDT is a generic term for
>> platform resource sharing.
>
>> For more information you can refer to section 17.15 of Intel SDM.
>> We did go through quite a bit of discussion on lkml regarding adding the
>> cgroup interface for CAT and the patches were posted only after that.
>> This cgroup would not interact with other cgroups in the sense would not
>> modify or add any elements to existing cgroups - there was such a proposal
>> but was removed as we did not get agreement on lkml.
>>
>> the original lkml thread is here from 10/2014 for your reference -
>> https://lkml.org/lkml/2014/10/16/568
>
> Yeap, I followed that thread and this being a separate controller
> definitely makes a lot more sense.
>
>> I
>>> take it that the feature implemented is too coarse to allow for weight
>>> based distribution?
>>>
>> Could you please clarify more on this ? However there is a limitation from
>> hardware that there have to be a minimum of 2 bits in the cbm if thats what
>> you referred to. Otherwise the bits in the cbm directly map to the number of
>> cache ways and hence the cache capacity ..
>
> Right, so the granularity is fairly coarse and specifying things like
> "distribute cache in 4:2:1 (or even in absolute bytes) to these three
> cgroups" wouldn't work at all.
Specifying any amount of cache in bytes would not be possible because the minimum
granularity has to be at least one cache way, since the entire memory can be
indexed into one cache way.
Providing the bit mask granularity helps users to not worry about how many bytes
a cache way is, and they can specify in terms of the bitmask. If we want to
provide such an interface in the cgroups where users can specify the size in
bytes then we need to show the user the
minimum granularity in bytes as well. Also note that these
bit masks are overlapping and hence the users have a way to specify overlapped
regions in cache which may be very useful in lots of scenarios where multiple
cgroups want to share the capacity.
The minimum granularity is 2 bits in the pre-production SKUs and it does
put limitations on the scenarios you describe. We will issue a patch update once it
hopefully gets updated in later SKUs. But note that the SDM also recommends
using 2 bits from a performance aspect, because an application using only one
cache way would have a lot more conflicts.
Say if the max cbm is 20 bits then the granularity is 10% of total cache.
>
> Thanks.
>
> --
> tejun
>
On Tue, 31 Mar 2015, Marcelo Tosatti wrote:
> On Tue, Mar 31, 2015 at 10:27:32AM -0700, Vikas Shivappa wrote:
>>
>>
>> On Thu, 26 Mar 2015, Marcelo Tosatti wrote:
>>
>>>
>>> I can't find any discussion relating to exposing the CBM interface
>>> directly to userspace in that thread ?
>>>
>>> Cpu.shares is written in ratio form, which is much more natural.
>>> Do you see any advantage in maintaining the
>>>
>>> (ratio -> cbm bitmasks)
>>>
>>> translation in userspace rather than in the kernel ?
>>>
>>> What about something like:
>>>
>>>
>>> root cgroup
>>> / \
>>> / \
>>> / \
>>> cgroupA-80 cgroupB-30
>>>
>>>
>>> So that whatever exceeds 100% is the ratio of cache
>>> shared at that level (cgroup A and B share 10% of cache
>>> at that level).
>>
>> But this also means the 2 groups share all of the cache ?
>>
>> Specifying the amount of bits to be shared lets you specify the
>> exact cache area where you want to share and also when your total
>> occupancy does not cover all of the cache. For ex: it gets more
>> complex when you want to share say only the left quarter of the
>> cache. cgroupA gets left half and cgroup gets left quarter. The
>> bitmask aligns with how the h/w is designed to share the cache which
>> gives you flexibility to define any specific overlapping areas of
>> the cache.
>
>>> https://access.redhat.com/documentation/en-US/Red_Hat_Enterprise_Linux/6/html/Resource_Management_Guide/sec-cpu_and_memory-use_case.html
>>>
>>> cpu — the cpu.shares parameter determines the share of CPU resources
>>> available to each process in all cgroups. Setting the parameter to 250,
>>> 250, and 500 in the finance, sales, and engineering cgroups respectively
>>> means that processes started in these groups will split the resources
>>> with a 1:1:2 ratio. Note that when a single process is running, it
>>> consumes as much CPU as necessary no matter which cgroup it is placed
>>> in. The CPU limitation only comes into effect when two or more processes
>>> compete for CPU resources.
>>>
>>>
>>
>> These are more defined in terms of how many cache lines (or how many
>> cache ways) they can use and would be difficult to define them in
>> terms of percentage. In contrast the cpu share is a time shared
>> thing and is much more granular where as here its not , its
>> occupancy in terms of cache lines/ways.. (however this is not really
>> defined as a restriction but thats the way it is now).
>> Also note that the granularity of the bitmasks define the
>> granularity of the percentages and in some SKUs the granularity is
>> 2b and not 1b.. So technically you wont be able to even allocate
>> percentage of cache even in 10% granularity for most of the cases
>> (if there are 30MB and 25 ways like in one of hsw SKU) and this will
>> vary for different SKUs which makes it more complicated for users.
>> However the user library is free to define own interface based on
>> the underlying cgroup interface say for example you never care about
>> the overlapping and using it for a specific SKU etc.. The underlying
>> cgroup framework is meant to be generic for all SKus and used for
>> most of the use cases.
>>
>> Also at this point I see a lot of enterprise and and other users
>> already using the cgroup interface or shown interest in the same.
>> However I see your point where you indicate the ease with which user
>> can specify in size/percentage which he might be used to doing for
>> other resources rather than bits where he needs to get an idea size
>> by calculating it seperately - But again note that you may not be
>> able to define percentages in many scenarios like the one above. And
>> another question would be we would need to convince the users to
>> adapt to the modified percentage user model (ex: like the one you
>> say above where percentage - 100 is the one thats shared)
>> I can review this requirements and others I have received and get
>> back to see the closest that can be done if possible.
>>
>> Thanks,
>> Vikas
>
> Vikas,
>
> I see. Don't have anything against performing the translation in userspace
> (i agree userspace should be able to allow ratios and specific
> minimum/maximum counts). Can you please export the relevant information
> in files in /sys or cgroups itself rather than requiring userspace to
> parse CPUID etc? Including the EBX register from CPUID(EAX=10H, ECX=1),
> which is necessary to implement "reserved LLC" properly.
>
> The current interface is unable to handle the cross CPU case, though.
> It would be necessary to expose per-socket masks.
>
>
Marcelo,
The current patch supports per-socket updates to masks, although the CLOSids
are allocated globally just like in CMT and not per package.
The maximum bitmask is the root node's bitmask, which is exposed already. The
number of CLOSids is not exposed as the kernel internally optimizes its usage and
exposing it could end up giving a wrong picture to the user. For example, if the
number of CLOSids available is say 4, the kernel could actually allocate them
to more than just 4 cgroups, and this logic may change based on other
features that may be added in the cgroup or depending on features available in
the SKUs. However, with CAT cgroups an error is
returned once the kernel runs out of CLOSids. I am still reviewing this requirement
with respect to the closids and will send an update soon.
Thanks,
Vikas
On Thu, Mar 12, 2015 at 04:16:03PM -0700, Vikas Shivappa wrote:
> Add support for cache bit mask manipulation. The change adds a file to
> the RDT cgroup which represents the CBM(cache bit mask) for the cgroup.
>
> The RDT cgroup follows cgroup hierarchy ,mkdir and adding tasks to the
> cgroup never fails. When a child cgroup is created it inherits the
> CLOSid and the CBM from its parent. When a user changes the default
> CBM for a cgroup, a new CLOSid may be allocated if the CBM was not
> used before. If the new CBM is the one that is already used, the
> count for that CLOSid<->CBM is incremented. The changing of 'cbm'
> may fail with -ENOSPC once the kernel runs out of maximum CLOSids it
> can support.
> User can create as many cgroups as he wants but having different CBMs
> at the same time is restricted by the maximum number of CLOSids
> (multiple cgroups can have the same CBM).
> Kernel maintains a CLOSid<->cbm mapping which keeps count
> of cgroups using a CLOSid.
>
> The tasks in the CAT cgroup would get to fill the LLC cache represented
> by the cgroup's 'cbm' file.
>
> Reuse of CLOSids for cgroups with same bitmask also has following
> advantages:
> - This helps to use the scant CLOSids optimally.
> - This also implies that during context switch, write to PQR-MSR is done
> only when a task with a different bitmask is scheduled in.
>
> Signed-off-by: Vikas Shivappa <[email protected]>
> ---
> arch/x86/include/asm/intel_rdt.h | 3 +
> arch/x86/kernel/cpu/intel_rdt.c | 205 +++++++++++++++++++++++++++++++++++++++
> 2 files changed, 208 insertions(+)
>
> diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
> index 87af1a5..0ed28d9 100644
> --- a/arch/x86/include/asm/intel_rdt.h
> +++ b/arch/x86/include/asm/intel_rdt.h
> @@ -4,6 +4,9 @@
> #ifdef CONFIG_CGROUP_RDT
>
> #include <linux/cgroup.h>
> +#define MAX_CBM_LENGTH 32
> +#define IA32_L3_CBM_BASE 0xc90
> +#define CBM_FROM_INDEX(x) (IA32_L3_CBM_BASE + x)
>
> struct rdt_subsys_info {
> /* Clos Bitmap to keep track of available CLOSids.*/
> diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
> index 3726f41..495497a 100644
> --- a/arch/x86/kernel/cpu/intel_rdt.c
> +++ b/arch/x86/kernel/cpu/intel_rdt.c
> @@ -33,6 +33,9 @@ static struct rdt_subsys_info rdtss_info;
> static DEFINE_MUTEX(rdt_group_mutex);
> struct intel_rdt rdt_root_group;
>
> +#define rdt_for_each_child(pos_css, parent_ir) \
> + css_for_each_child((pos_css), &(parent_ir)->css)
> +
> static inline bool cat_supported(struct cpuinfo_x86 *c)
> {
> if (cpu_has(c, X86_FEATURE_CAT_L3))
> @@ -83,6 +86,31 @@ static int __init rdt_late_init(void)
> late_initcall(rdt_late_init);
>
> /*
> + * Allocates a new closid from unused closids.
> + * Called with the rdt_group_mutex held.
> + */
> +
> +static int rdt_alloc_closid(struct intel_rdt *ir)
> +{
> + unsigned int id;
> + unsigned int maxid;
> +
> + lockdep_assert_held(&rdt_group_mutex);
> +
> + maxid = boot_cpu_data.x86_cat_closs;
> + id = find_next_zero_bit(rdtss_info.closmap, maxid, 0);
> + if (id == maxid)
> + return -ENOSPC;
> +
> + set_bit(id, rdtss_info.closmap);
> + WARN_ON(ccmap[id].cgrp_count);
> + ccmap[id].cgrp_count++;
> + ir->clos = id;
> +
> + return 0;
> +}
> +
> +/*
> * Called with the rdt_group_mutex held.
> */
> static int rdt_free_closid(struct intel_rdt *ir)
> @@ -133,8 +161,185 @@ static void rdt_css_free(struct cgroup_subsys_state *css)
> mutex_unlock(&rdt_group_mutex);
> }
>
> +/*
> + * Tests if atleast two contiguous bits are set.
> + */
> +
> +static inline bool cbm_is_contiguous(unsigned long var)
> +{
> + unsigned long first_bit, zero_bit;
> + unsigned long maxcbm = MAX_CBM_LENGTH;
> +
> + if (bitmap_weight(&var, maxcbm) < 2)
> + return false;
> +
> + first_bit = find_next_bit(&var, maxcbm, 0);
> + zero_bit = find_next_zero_bit(&var, maxcbm, first_bit);
> +
> + if (find_next_bit(&var, maxcbm, zero_bit) < maxcbm)
> + return false;
> +
> + return true;
> +}
> +
> +static int cat_cbm_read(struct seq_file *m, void *v)
> +{
> + struct intel_rdt *ir = css_rdt(seq_css(m));
> +
> + seq_printf(m, "%08lx\n", ccmap[ir->clos].cbm);
> + return 0;
> +}
> +
> +static int validate_cbm(struct intel_rdt *ir, unsigned long cbmvalue)
> +{
> + struct intel_rdt *par, *c;
> + struct cgroup_subsys_state *css;
> + unsigned long *cbm_tmp;
> +
> + if (!cbm_is_contiguous(cbmvalue)) {
> + pr_info("cbm should have >= 2 bits and be contiguous\n");
> + return -EINVAL;
> + }
> +
> + par = parent_rdt(ir);
> + cbm_tmp = &ccmap[par->clos].cbm;
> + if (!bitmap_subset(&cbmvalue, cbm_tmp, MAX_CBM_LENGTH))
> + return -EINVAL;
Can you have different errors for the different cases?
> + rcu_read_lock();
> + rdt_for_each_child(css, ir) {
> + c = css_rdt(css);
> + cbm_tmp = &ccmap[c->clos].cbm;
> + if (!bitmap_subset(cbm_tmp, &cbmvalue, MAX_CBM_LENGTH)) {
> + pr_info("Children's mask not a subset\n");
> + rcu_read_unlock();
> + return -EINVAL;
> + }
> + }
> +
> + rcu_read_unlock();
> + return 0;
> +}
> +
> +static bool cbm_search(unsigned long cbm, int *closid)
> +{
> + int maxid = boot_cpu_data.x86_cat_closs;
> + unsigned int i;
> +
> + for (i = 0; i < maxid; i++)
> + if (bitmap_equal(&cbm, &ccmap[i].cbm, MAX_CBM_LENGTH)) {
> + *closid = i;
> + return true;
> + }
> +
> + return false;
> +}
> +
> +static void cbmmap_dump(void)
> +{
> + int i;
> +
> + pr_debug("CBMMAP\n");
> + for (i = 0; i < boot_cpu_data.x86_cat_closs; i++)
> + pr_debug("cbm: 0x%x,cgrp_count: %u\n",
> + (unsigned int)ccmap[i].cbm, ccmap[i].cgrp_count);
> +}
> +
> +static void cpu_cbm_update(void *info)
> +{
> + unsigned int closid = *((unsigned int *)info);
> +
> + wrmsrl(CBM_FROM_INDEX(closid), ccmap[closid].cbm);
> +}
> +
> +static inline void cbm_update(unsigned int closid)
> +{
> + int pkg_id = -1;
> + int cpu;
> +
> + for_each_online_cpu(cpu) {
> + if (pkg_id == topology_physical_package_id(cpu))
> + continue;
> + smp_call_function_single(cpu, cpu_cbm_update, &closid, 1);
> + pkg_id = topology_physical_package_id(cpu);
> +
> +
Can use smp_call_function_many, once, more efficient.
Can this race with CPU hotplug? BTW, on CPU hotplug, where are
the IA32_L3_MASK_n initialized for the new CPU ?
On Thu, 9 Apr 2015, Marcelo Tosatti wrote:
> On Thu, Mar 12, 2015 at 04:16:03PM -0700, Vikas Shivappa wrote:
>> Add support for cache bit mask manipulation. The change adds a file to
>> the RDT cgroup which represents the CBM(cache bit mask) for the cgroup.
>>
>> The RDT cgroup follows cgroup hierarchy ,mkdir and adding tasks to the
>> cgroup never fails. When a child cgroup is created it inherits the
>> CLOSid and the CBM from its parent. When a user changes the default
>> CBM for a cgroup, a new CLOSid may be allocated if the CBM was not
>> used before. If the new CBM is the one that is already used, the
>> count for that CLOSid<->CBM is incremented. The changing of 'cbm'
>> may fail with -ENOSPC once the kernel runs out of maximum CLOSids it
>> can support.
>> User can create as many cgroups as he wants but having different CBMs
>> at the same time is restricted by the maximum number of CLOSids
>> (multiple cgroups can have the same CBM).
>> Kernel maintains a CLOSid<->cbm mapping which keeps count
>> of cgroups using a CLOSid.
>>
>> The tasks in the CAT cgroup would get to fill the LLC cache represented
>> by the cgroup's 'cbm' file.
>>
>> Reuse of CLOSids for cgroups with same bitmask also has following
>> advantages:
>> - This helps to use the scant CLOSids optimally.
>> - This also implies that during context switch, write to PQR-MSR is done
>> only when a task with a different bitmask is scheduled in.
>>
>> Signed-off-by: Vikas Shivappa <[email protected]>
>> ---
>> arch/x86/include/asm/intel_rdt.h | 3 +
>> arch/x86/kernel/cpu/intel_rdt.c | 205 +++++++++++++++++++++++++++++++++++++++
>> 2 files changed, 208 insertions(+)
>>
>> diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
>> index 87af1a5..0ed28d9 100644
>> --- a/arch/x86/include/asm/intel_rdt.h
>> +++ b/arch/x86/include/asm/intel_rdt.h
>> @@ -4,6 +4,9 @@
>> #ifdef CONFIG_CGROUP_RDT
>>
>> #include <linux/cgroup.h>
>> +#define MAX_CBM_LENGTH 32
>> +#define IA32_L3_CBM_BASE 0xc90
>> +#define CBM_FROM_INDEX(x) (IA32_L3_CBM_BASE + x)
>>
>> struct rdt_subsys_info {
>> /* Clos Bitmap to keep track of available CLOSids.*/
>> diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
>> index 3726f41..495497a 100644
>> --- a/arch/x86/kernel/cpu/intel_rdt.c
>> +++ b/arch/x86/kernel/cpu/intel_rdt.c
>> @@ -33,6 +33,9 @@ static struct rdt_subsys_info rdtss_info;
>> static DEFINE_MUTEX(rdt_group_mutex);
>> struct intel_rdt rdt_root_group;
>>
>> +#define rdt_for_each_child(pos_css, parent_ir) \
>> + css_for_each_child((pos_css), &(parent_ir)->css)
>> +
>> static inline bool cat_supported(struct cpuinfo_x86 *c)
>> {
>> if (cpu_has(c, X86_FEATURE_CAT_L3))
>> @@ -83,6 +86,31 @@ static int __init rdt_late_init(void)
>> late_initcall(rdt_late_init);
>>
>> /*
>> + * Allocates a new closid from unused closids.
>> + * Called with the rdt_group_mutex held.
>> + */
>> +
>> +static int rdt_alloc_closid(struct intel_rdt *ir)
>> +{
>> + unsigned int id;
>> + unsigned int maxid;
>> +
>> + lockdep_assert_held(&rdt_group_mutex);
>> +
>> + maxid = boot_cpu_data.x86_cat_closs;
>> + id = find_next_zero_bit(rdtss_info.closmap, maxid, 0);
>> + if (id == maxid)
>> + return -ENOSPC;
>> +
>> + set_bit(id, rdtss_info.closmap);
>> + WARN_ON(ccmap[id].cgrp_count);
>> + ccmap[id].cgrp_count++;
>> + ir->clos = id;
>> +
>> + return 0;
>> +}
>> +
>> +/*
>> * Called with the rdt_group_mutex held.
>> */
>> static int rdt_free_closid(struct intel_rdt *ir)
>> @@ -133,8 +161,185 @@ static void rdt_css_free(struct cgroup_subsys_state *css)
>> mutex_unlock(&rdt_group_mutex);
>> }
>>
>> +/*
>> + * Tests if atleast two contiguous bits are set.
>> + */
>> +
>> +static inline bool cbm_is_contiguous(unsigned long var)
>> +{
>> + unsigned long first_bit, zero_bit;
>> + unsigned long maxcbm = MAX_CBM_LENGTH;
>> +
>> + if (bitmap_weight(&var, maxcbm) < 2)
>> + return false;
>> +
>> + first_bit = find_next_bit(&var, maxcbm, 0);
>> + zero_bit = find_next_zero_bit(&var, maxcbm, first_bit);
>> +
>> + if (find_next_bit(&var, maxcbm, zero_bit) < maxcbm)
>> + return false;
>> +
>> + return true;
>> +}
>> +
>> +static int cat_cbm_read(struct seq_file *m, void *v)
>> +{
>> + struct intel_rdt *ir = css_rdt(seq_css(m));
>> +
>> + seq_printf(m, "%08lx\n", ccmap[ir->clos].cbm);
>> + return 0;
>> +}
>> +
>> +static int validate_cbm(struct intel_rdt *ir, unsigned long cbmvalue)
>> +{
>> + struct intel_rdt *par, *c;
>> + struct cgroup_subsys_state *css;
>> + unsigned long *cbm_tmp;
>> +
>> + if (!cbm_is_contiguous(cbmvalue)) {
>> + pr_info("cbm should have >= 2 bits and be contiguous\n");
>> + return -EINVAL;
>> + }
>> +
>> + par = parent_rdt(ir);
>> + cbm_tmp = &ccmap[par->clos].cbm;
>> + if (!bitmap_subset(&cbmvalue, cbm_tmp, MAX_CBM_LENGTH))
>> + return -EINVAL;
>
> Can you have different errors for the different cases?
Could use -EPERM.
>
>> + rcu_read_lock();
>> + rdt_for_each_child(css, ir) {
>> + c = css_rdt(css);
>> + cbm_tmp = &ccmap[c->clos].cbm;
>> + if (!bitmap_subset(cbm_tmp, &cbmvalue, MAX_CBM_LENGTH)) {
>> + pr_info("Children's mask not a subset\n");
>> + rcu_read_unlock();
>> + return -EINVAL;
>> + }
>> + }
>> +
>> + rcu_read_unlock();
>> + return 0;
>> +}
>> +
>> +static bool cbm_search(unsigned long cbm, int *closid)
>> +{
>> + int maxid = boot_cpu_data.x86_cat_closs;
>> + unsigned int i;
>> +
>> + for (i = 0; i < maxid; i++)
>> + if (bitmap_equal(&cbm, &ccmap[i].cbm, MAX_CBM_LENGTH)) {
>> + *closid = i;
>> + return true;
>> + }
>> +
>> + return false;
>> +}
>> +
>> +static void cbmmap_dump(void)
>> +{
>> + int i;
>> +
>> + pr_debug("CBMMAP\n");
>> + for (i = 0; i < boot_cpu_data.x86_cat_closs; i++)
>> + pr_debug("cbm: 0x%x,cgrp_count: %u\n",
>> + (unsigned int)ccmap[i].cbm, ccmap[i].cgrp_count);
>> +}
>> +
>> +static void cpu_cbm_update(void *info)
>> +{
>> + unsigned int closid = *((unsigned int *)info);
>> +
>> + wrmsrl(CBM_FROM_INDEX(closid), ccmap[closid].cbm);
>> +}
>> +
>> +static inline void cbm_update(unsigned int closid)
>> +{
>> + int pkg_id = -1;
>> + int cpu;
>> +
>> + for_each_online_cpu(cpu) {
>> + if (pkg_id == topology_physical_package_id(cpu))
>> + continue;
>> + smp_call_function_single(cpu, cpu_cbm_update, &closid, 1);
>> + pkg_id = topology_physical_package_id(cpu);
>> +
>> +
>
> Can use smp_call_function_many, once, more efficient.
>
> Can this race with CPU hotplug? BTW, on CPU hotplug, where are
> the IA32_L3_MASK_n initialized for the new CPU ?
>
Thanks for pointing out; will fix this. I think when I changed the design to not
use cpuset I missed updating the hot cpu path. I also remembered another similar
update needed: the S3 resume needs a fix to the software cache, as we used the MSR before.
Thanks,
Vikas
>