2023-06-20 03:52:41

by Tony Luck

Subject: [RFC PATCH 2/2] resctrl2: Arch x86 modules for most of the legacy control/monitor functions

Missing:
1) "mba_MBps" feedback from MBM
2) Pseudo-locking
3) AMD SMBA/BMEC
4) Fixes and additions after v6.4-rc6
5) Other stuff I haven't noticed

N.B. This is proof-of-concept code ... the locking (or lack thereof)
for most of the files controlled by modules needs to be addressed.

Signed-off-by: Tony Luck <[email protected]>
---
arch/x86/include/asm/resctrl.h | 38 ++
fs/resctrl2/arch/x86/rdt.h | 22 +
arch/x86/kernel/cpu/amd.c | 3 +
arch/x86/kernel/cpu/intel.c | 3 +
arch/x86/kernel/process_32.c | 1 +
arch/x86/kernel/process_64.c | 3 +
fs/resctrl2/arch/x86/alloc.c | 119 +++++
fs/resctrl2/arch/x86/rdt_l2_cat.c | 1 +
fs/resctrl2/arch/x86/rdt_l2_cdp.c | 1 +
fs/resctrl2/arch/x86/rdt_l3_cat.c | 349 +++++++++++++++
fs/resctrl2/arch/x86/rdt_l3_cdp.c | 1 +
fs/resctrl2/arch/x86/rdt_l3_mba.c | 251 +++++++++++
fs/resctrl2/arch/x86/rdt_llc_occupancy.c | 100 +++++
fs/resctrl2/arch/x86/rdt_mbm_adjust.c | 91 ++++
fs/resctrl2/arch/x86/rdt_mbm_local_bytes.c | 1 +
fs/resctrl2/arch/x86/rdt_mbm_local_rate.c | 1 +
fs/resctrl2/arch/x86/rdt_mbm_total_bytes.c | 1 +
fs/resctrl2/arch/x86/rdt_mbm_total_rate.c | 1 +
fs/resctrl2/arch/x86/rdt_monitor.c | 491 +++++++++++++++++++++
arch/x86/Kconfig | 81 +++-
fs/resctrl2/arch/x86/Makefile | 29 ++
21 files changed, 1586 insertions(+), 2 deletions(-)
create mode 100644 fs/resctrl2/arch/x86/rdt.h
create mode 100644 fs/resctrl2/arch/x86/alloc.c
create mode 120000 fs/resctrl2/arch/x86/rdt_l2_cat.c
create mode 120000 fs/resctrl2/arch/x86/rdt_l2_cdp.c
create mode 100644 fs/resctrl2/arch/x86/rdt_l3_cat.c
create mode 120000 fs/resctrl2/arch/x86/rdt_l3_cdp.c
create mode 100644 fs/resctrl2/arch/x86/rdt_l3_mba.c
create mode 100644 fs/resctrl2/arch/x86/rdt_llc_occupancy.c
create mode 100644 fs/resctrl2/arch/x86/rdt_mbm_adjust.c
create mode 120000 fs/resctrl2/arch/x86/rdt_mbm_local_bytes.c
create mode 120000 fs/resctrl2/arch/x86/rdt_mbm_local_rate.c
create mode 120000 fs/resctrl2/arch/x86/rdt_mbm_total_bytes.c
create mode 120000 fs/resctrl2/arch/x86/rdt_mbm_total_rate.c
create mode 100644 fs/resctrl2/arch/x86/rdt_monitor.c
create mode 100644 fs/resctrl2/arch/x86/Makefile

diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h
index 255a78d9d906..efbad3175680 100644
--- a/arch/x86/include/asm/resctrl.h
+++ b/arch/x86/include/asm/resctrl.h
@@ -96,6 +96,44 @@ static inline void resctrl_sched_in(struct task_struct *tsk)

void resctrl_cpu_detect(struct cpuinfo_x86 *c);

+#elif defined(CONFIG_X86_CPU_RESCTRL2)
+
+bool arch_alloc_resctrl_ids(struct resctrl_group *rg);
+void arch_free_resctrl_ids(struct resctrl_group *rg);
+bool arch_init_alloc_ids(struct resctrl_resource *r);
+int rmid_alloc(int prmid);
+void rmid_free(int rmid);
+void arch_add_monitor(int mon_event);
+void arch_del_monitor(int mon_event);
+u64 rdt_rmid_read(int domain_id, int rmid, int event);
+void rdt_mbm_apply_quirk(int num_rmids);
+u64 get_corrected_mbm_count(u32 rmid, unsigned long val);
+
+static inline bool is_closid_match(struct task_struct *t, struct resctrl_group *rg)
+{
+ return (t->resctrl_ids >> 32) == (rg->resctrl_ids >> 32);
+}
+
+static inline bool arch_is_resctrl_id_match(struct task_struct *t, struct resctrl_group *rg)
+{
+ if (rg->type == DIR_MON)
+ return t->resctrl_ids == rg->resctrl_ids;
+ return is_closid_match(t, rg);
+}
+
+static inline bool arch_set_task_ids(struct task_struct *t, struct resctrl_group *rg)
+{
+ if (rg->type == DIR_MON) {
+ if (!is_closid_match(t, rg)) {
+ //rdt_last_cmd_puts("Can't move task to different control group\n");
+ return false;
+ }
+ }
+
+ WRITE_ONCE(t->resctrl_ids, rg->resctrl_ids);
+
+ return true;
+}
#else

static inline void resctrl_sched_in(struct task_struct *tsk) {}
diff --git a/fs/resctrl2/arch/x86/rdt.h b/fs/resctrl2/arch/x86/rdt.h
new file mode 100644
index 000000000000..98c8147bac8e
--- /dev/null
+++ b/fs/resctrl2/arch/x86/rdt.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2023 Intel Corporation. */
+
+/* H/W supported RDT monitor events */
+#define EV_LLC 1
+#define EV_TOT 2
+#define EV_LOC 3
+#define EV_MAX 4
+
+/* S/W events */
+#define EV_TOTRATE 4
+#define EV_LOCRATE 5
+
+#define RESCTRL_FILE_DEF(X, fmt) \
+static int X##_show(struct seq_file *sf, void *v) \
+{ \
+ seq_printf(sf, fmt, X); \
+ return 0; \
+} \
+static struct kernfs_ops X##_ops = { \
+ .seq_show = X##_show \
+};
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 571abf808ea3..fbd95edcd75b 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -9,6 +9,7 @@
#include <linux/sched/clock.h>
#include <linux/random.h>
#include <linux/topology.h>
+#include <linux/resctrl.h>
#include <asm/processor.h>
#include <asm/apic.h>
#include <asm/cacheinfo.h>
@@ -19,7 +20,9 @@
#include <asm/pci-direct.h>
#include <asm/delay.h>
#include <asm/debugreg.h>
+#ifndef CONFIG_RESCTRL2_FS
#include <asm/resctrl.h>
+#endif

#ifdef CONFIG_X86_64
# include <asm/mmconfig.h>
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 1c4639588ff9..66829e06b01f 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -14,6 +14,7 @@
#include <linux/workqueue.h>
#include <linux/delay.h>
#include <linux/cpuhotplug.h>
+#include <linux/resctrl.h>

#include <asm/cpufeature.h>
#include <asm/msr.h>
@@ -26,7 +27,9 @@
#include <asm/cpu_device_id.h>
#include <asm/cmdline.h>
#include <asm/traps.h>
+#ifndef CONFIG_RESCTRL2_FS
#include <asm/resctrl.h>
+#endif
#include <asm/numa.h>
#include <asm/thermal.h>

diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 708c87b88cc1..8cf70973a331 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -38,6 +38,7 @@
#include <linux/io.h>
#include <linux/kdebug.h>
#include <linux/syscalls.h>
+#include <linux/resctrl.h>

#include <asm/ldt.h>
#include <asm/processor.h>
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3d181c16a2f6..7df3cf1dbcaa 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -40,6 +40,7 @@
#include <linux/ftrace.h>
#include <linux/syscalls.h>
#include <linux/iommu.h>
+#include <linux/resctrl.h>

#include <asm/processor.h>
#include <asm/pkru.h>
@@ -53,7 +54,9 @@
#include <asm/switch_to.h>
#include <asm/xen/hypervisor.h>
#include <asm/vdso.h>
+#ifndef CONFIG_RESCTRL2_FS
#include <asm/resctrl.h>
+#endif
#include <asm/unistd.h>
#include <asm/fsgsbase.h>
#ifdef CONFIG_IA32_EMULATION
diff --git a/fs/resctrl2/arch/x86/alloc.c b/fs/resctrl2/arch/x86/alloc.c
new file mode 100644
index 000000000000..d39f93150180
--- /dev/null
+++ b/fs/resctrl2/arch/x86/alloc.c
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2023 Intel Corporation. */
+
+#include <asm/cpufeatures.h>
+
+#include "../../internal.h"
+
+DEFINE_STATIC_KEY_FALSE(resctrl_enable_key);
+DEFINE_PER_CPU(struct resctrl_per_cpu_state, resctrl_per_cpu_state);
+
+/*
+ * Trivial allocator for CLOSIDs. Since h/w only supports a small number,
+ * we can keep a bitmap of free CLOSIDs in a single integer.
+ *
+ * Using a global CLOSID across all resources has some advantages and
+ * some drawbacks:
+ * + We can simply use a field in the task structure to assign a task to a resource
+ * group.
+ * + Context switch code can avoid extra memory references deciding which
+ * CLOSID to load into the PQR_ASSOC MSR
+ * - We give up some options in configuring resource groups across multi-socket
+ * systems.
+ * - Our choices on how to configure each resource become progressively more
+ * limited as the number of resources grows.
+ */
+static int arch_ids;
+static int closid_free_map;
+u64 arch_resctrl_default_ids;
+
+void arch_resctrl_apply_ids(u64 resctrl_ids)
+{
+ wrmsrl(MSR_IA32_PQR_ASSOC, resctrl_ids);
+}
+
+static void closid_init(void)
+{
+ closid_free_map = BIT_MASK(arch_ids) - 1;
+
+ /* CLOSID 0 is always reserved for the default group */
+ closid_free_map &= ~1;
+}
+
+static int closid_alloc(void)
+{
+ u32 closid = ffs(closid_free_map);
+
+ if (closid == 0)
+ return -ENOSPC;
+ closid--;
+ closid_free_map &= ~(1 << closid);
+
+ return closid;
+}
+
+void closid_free(int closid)
+{
+ closid_free_map |= 1 << closid;
+}
+
+#define RESCTRL_ID(c, r) (((u64)(c) << 32) | (r))
+
+bool arch_check_resctrl_support(void)
+{
+ return boot_cpu_has(X86_FEATURE_CQM) || boot_cpu_has(X86_FEATURE_RDT_A);
+}
+
+bool arch_init_alloc_ids(struct resctrl_resource *r)
+{
+ if (r->num_alloc_ids < arch_ids)
+ return false;
+ if (arch_ids != 0) {
+ if (r->num_alloc_ids > arch_ids)
+ r->num_alloc_ids = arch_ids;
+ return true;
+ }
+ arch_ids = r->num_alloc_ids;
+
+ closid_init();
+
+ return true;
+}
+
+bool arch_alloc_resctrl_ids(struct resctrl_group *rg)
+{
+ int c, r;
+
+ switch (rg->type) {
+ case DIR_CTRL_MON:
+ c = closid_alloc();
+ if (c < 0)
+ return false;
+ r = rmid_alloc(-1);
+ if (r < 0) {
+ closid_free(c);
+ return false;
+ }
+ rg->resctrl_ids = RESCTRL_ID(c, r);
+ return true;
+
+ case DIR_MON:
+ /* monitor groups have same CLOSID as parent */
+ c = rg->parent->resctrl_ids >> 32;
+ r = rmid_alloc(rg->parent->resctrl_ids & 0xffff);
+ if (r < 0)
+ return false;
+ rg->resctrl_ids = RESCTRL_ID(c, r);
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+void arch_free_resctrl_ids(struct resctrl_group *rg)
+{
+ closid_free(rg->resctrl_ids >> 32);
+
+ rmid_free(rg->resctrl_ids & 0xffff);
+}
diff --git a/fs/resctrl2/arch/x86/rdt_l2_cat.c b/fs/resctrl2/arch/x86/rdt_l2_cat.c
new file mode 120000
index 000000000000..d25679af550e
--- /dev/null
+++ b/fs/resctrl2/arch/x86/rdt_l2_cat.c
@@ -0,0 +1 @@
+rdt_l3_cat.c
\ No newline at end of file
diff --git a/fs/resctrl2/arch/x86/rdt_l2_cdp.c b/fs/resctrl2/arch/x86/rdt_l2_cdp.c
new file mode 120000
index 000000000000..d25679af550e
--- /dev/null
+++ b/fs/resctrl2/arch/x86/rdt_l2_cdp.c
@@ -0,0 +1 @@
+rdt_l3_cat.c
\ No newline at end of file
diff --git a/fs/resctrl2/arch/x86/rdt_l3_cat.c b/fs/resctrl2/arch/x86/rdt_l3_cat.c
new file mode 100644
index 000000000000..1cb9a99e93b4
--- /dev/null
+++ b/fs/resctrl2/arch/x86/rdt_l3_cat.c
@@ -0,0 +1,349 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2023 Intel Corporation. */
+
+/*
+ * X86 Resource Control Driver For L2 and L3 cache allocation
+ */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/resctrl.h>
+#include <linux/seq_file.h>
+
+#include "rdt.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#ifdef CDP
+#define SUFFIX_D "DATA"
+#define SUFFIX_C "CODE"
+#define MULDIV 2
+#else
+#define SUFFIX_D ""
+#define SUFFIX_C ""
+#define MULDIV 1
+#endif
+
+#if CACHE_LEVEL == 3
+#define NAME "L3"
+#define MSR MSR_IA32_L3_CBM_BASE
+#define MSRCDP MSR_IA32_L3_QOS_CFG
+#define LEAF_BIT 1
+#define SCOPE RESCTRL_L3CACHE
+#elif CACHE_LEVEL == 2
+#define NAME "L2"
+#define MSR MSR_IA32_L2_CBM_BASE
+#define MSRCDP MSR_IA32_L2_QOS_CFG
+#define LEAF_BIT 2
+#define SCOPE RESCTRL_L2CACHE
+#else
+#error "CACHE_LEVEL not defined"
+#endif
+
+struct cbm_masks {
+ u64 now;
+ u64 staged;
+ bool need_update;
+};
+
+struct mydomain {
+ int cbm_len;
+ struct cbm_masks cbm_masks[];
+};
+#define get_mydomain(d) ((struct mydomain *)&d[1])
+
+static struct resctrl_resource cat;
+
+static u32 cbm_mask;
+static int min_cbm_bits = 1;
+static int num_closids;
+static u32 shareable_bits;
+static bool arch_has_sparse_bitmaps;
+
+static void show(struct resctrl_resource *r, struct seq_file *m, u64 resctrl_ids)
+{
+ int closid = (resctrl_ids >> 32);
+ struct resctrl_domain *d;
+ struct cbm_masks *cbm;
+ char *sep = "";
+
+ list_for_each_entry(d, &r->domains, list) {
+ cbm = get_mydomain(d)->cbm_masks;
+ seq_printf(m, "%s%d=%llx", sep, d->id, cbm[closid].now);
+ sep = ";";
+ }
+ seq_puts(m, "\n");
+}
+
+static void resetstaging(struct resctrl_resource *r, u64 resctrl_ids)
+{
+ int closid = (resctrl_ids >> 32);
+ struct resctrl_domain *d;
+ struct cbm_masks *cbm;
+
+ list_for_each_entry(d, &r->domains, list) {
+ cbm = get_mydomain(d)->cbm_masks;
+ cbm[closid].need_update = false;
+ }
+}
+
+static bool validate_mask(struct resctrl_domain *d, char *buf, struct cbm_masks *c)
+{
+ unsigned long first_bit, last_bit, val;
+ struct mydomain *m = get_mydomain(d);
+ int ret;
+
+ ret = kstrtoul(buf, 16, &val);
+ if (ret) {
+ // rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf);
+ return false;
+ }
+
+ /* User didn't change this value */
+ if (val == c->now)
+ return true;
+
+ if ((min_cbm_bits > 0 && val == 0) || val > (1u << (m->cbm_len + 1)) - 1) {
+ // rdt_last_cmd_puts("Mask out of range\n");
+ return false;
+ }
+ if (val == 0)
+ goto ok;
+ first_bit = __ffs(val);
+ last_bit = __fls(val);
+ if ((last_bit - first_bit) + 1 < min_cbm_bits) {
+ // rdt_last_cmd_printf("Need at least %d bits in the mask\n", min_cbm_bits);
+ return false;
+ }
+ if (!arch_has_sparse_bitmaps && val != (((1u << (last_bit + 1)) - 1) & ~((1u << first_bit) - 1))) {
+ // rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val);
+ return false;
+ }
+
+ok:
+ c->need_update = true;
+ c->staged = val;
+
+ return true;
+}
+
+static int parse(struct resctrl_resource *r, char *line, u64 resctrl_ids)
+{
+ int closid = (resctrl_ids >> 32);
+ struct cbm_masks *cbm;
+ char *dom = NULL, *id;
+ struct resctrl_domain *d;
+ unsigned long dom_id;
+
+next:
+ if (!line || line[0] == '\0')
+ return 0;
+ dom = strsep(&line, ";");
+ id = strsep(&dom, "=");
+ id = strim(id);
+ if (!dom || kstrtoul(id, 10, &dom_id)) {
+ // rdt_last_cmd_puts("Missing '=' or non-numeric domain\n");
+ return -EINVAL;
+ }
+ dom = strim(dom);
+ list_for_each_entry(d, &r->domains, list) {
+ if (d->id != dom_id)
+ continue;
+ cbm = get_mydomain(d)->cbm_masks;
+ if (!validate_mask(d, dom, cbm + closid))
+ return -EINVAL;
+ goto next;
+ }
+ return -EINVAL;
+}
+
+struct rdt_msr_info {
+ int msr_base;
+ struct cbm_masks *cbm;
+};
+
+static void update_msrs(void *info)
+{
+ struct rdt_msr_info *mi = info;
+
+ for (int i = 0; i < cat.num_alloc_ids; i++) {
+ if (mi->cbm[i].need_update) {
+ mi->cbm[i].now = mi->cbm[i].staged;
+ mi->cbm[i].need_update = false;
+ wrmsrl(mi->msr_base + i * MULDIV, mi->cbm[i].now);
+ }
+ }
+}
+
+static void applychanges(struct resctrl_resource *r, u64 resctrl_ids)
+{
+ int closid = (resctrl_ids >> 32);
+ struct resctrl_domain *d;
+ struct cbm_masks *cbm;
+ struct rdt_msr_info mi;
+
+ list_for_each_entry(d, &r->domains, list) {
+ cbm = get_mydomain(d)->cbm_masks;
+ if (!cbm[closid].need_update)
+ continue;
+ mi.msr_base = r->archtag;
+ mi.cbm = cbm;
+ smp_call_function_single(cpumask_first(&d->cpu_mask), update_msrs, &mi, 1);
+ }
+}
+
+#ifdef CDP
+static void update_cdp(void *info)
+{
+ u64 val;
+
+ rdmsrl(MSRCDP, val);
+ if (info)
+ val |= BIT(0);
+ else
+ val &= ~BIT(0);
+ wrmsrl(MSRCDP, val);
+}
+#endif
+
+/*
+ * On domain discovery (during module load, or CPU hotplug) set
+ * all controls to allow full access to all of cache. Ditto on
+ * module unload or domain removal.
+ */
+static void domain_update(struct resctrl_resource *r, int what, int cpu, struct resctrl_domain *d)
+{
+ struct mydomain *m = get_mydomain(d);
+ unsigned int eax, ebx, ecx, edx;
+ struct rdt_msr_info mi;
+ struct cbm_masks *cbm;
+
+ cbm = (struct cbm_masks *)(m + 1);
+ if (what == RESCTRL_DOMAIN_ADD || what == RESCTRL_DOMAIN_DELETE) {
+ cpuid_count(0x10, LEAF_BIT, &eax, &ebx, &ecx, &edx);
+ shareable_bits = ebx;
+ m->cbm_len = eax & 0x1f;
+ cbm_mask = (1u << (m->cbm_len + 1)) - 1;
+ for (int i = 0; i < cat.num_alloc_ids; i++) {
+ cbm[i].staged = cbm_mask;
+ cbm[i].need_update = true;
+ }
+ mi.msr_base = r->archtag;
+ mi.cbm = cbm;
+ smp_call_function_single(cpu, update_msrs, &mi, 1);
+ }
+#ifdef CDP
+ if (what == RESCTRL_DOMAIN_ADD)
+ smp_call_function_single(cpu, update_cdp, (void *)1, 1);
+ else if (what == RESCTRL_DOMAIN_DELETE)
+ smp_call_function_single(cpu, update_cdp, NULL, 1);
+#endif
+}
+
+RESCTRL_FILE_DEF(cbm_mask, "%x\n")
+RESCTRL_FILE_DEF(min_cbm_bits, "%d\n")
+RESCTRL_FILE_DEF(num_closids, "%d\n")
+RESCTRL_FILE_DEF(shareable_bits, "%x\n")
+
+static struct resctrl_fileinfo cat_files[] = {
+ { .name = "cbm_mask", .ops = &cbm_mask_ops },
+ { .name = "min_cbm_bits", .ops = &min_cbm_bits_ops },
+ { .name = "num_closids", .ops = &num_closids_ops },
+ { .name = "shareable_bits", .ops = &shareable_bits_ops },
+ { }
+};
+
+static struct resctrl_resource cat = {
+ .name = NAME SUFFIX_D,
+ .archtag = MSR,
+ .type = RESCTRL_CONTROL,
+ .show = show,
+ .resetstaging = resetstaging,
+ .parse = parse,
+ .applychanges = applychanges,
+ .scope = SCOPE,
+ .domain_size = sizeof(struct resctrl_domain) + sizeof(struct mydomain),
+ .domains = LIST_HEAD_INIT(cat.domains),
+ .domain_update = domain_update,
+ .infodir = "L3",
+ .infofiles = cat_files,
+};
+
+#ifdef CDP
+static struct resctrl_resource cat_code = {
+ .name = NAME SUFFIX_C,
+ .archtag = MSR + 1,
+ .type = RESCTRL_CONTROL,
+ .show = show,
+ .resetstaging = resetstaging,
+ .parse = parse,
+ .applychanges = applychanges,
+ .scope = SCOPE,
+ .domain_size = sizeof(struct resctrl_domain) + sizeof(struct mydomain),
+ .domains = LIST_HEAD_INIT(cat_code.domains),
+ .domain_update = domain_update,
+};
+#endif
+
+static int __init cat_init(void)
+{
+ unsigned int eax, ebx, ecx, edx, cat_features;
+ int ret;
+
+ if (!boot_cpu_has(X86_FEATURE_RDT_A)) {
+ pr_debug("No RDT allocation support\n");
+ return -ENODEV;
+ }
+
+ cat_features = cpuid_ebx(0x10);
+
+ if (!(cat_features & BIT(LEAF_BIT))) {
+ pr_debug("No RDT allocation for L%d cache\n", CACHE_LEVEL);
+ return -ENODEV;
+ }
+
+ cpuid_count(0x10, LEAF_BIT, &eax, &ebx, &ecx, &edx);
+#ifdef CDP
+ if (!(ecx & BIT(2))) {
+ pr_debug("No CDP mode for L%d cache\n", CACHE_LEVEL);
+ return -ENODEV;
+ }
+#endif
+ num_closids = (edx + 1) / MULDIV;
+
+ cat.domain_size += num_closids * sizeof(struct cbm_masks);
+ cat.num_alloc_ids = num_closids;
+#ifdef CDP
+ cat_code.domain_size += num_closids * sizeof(struct cbm_masks);
+ cat_code.num_alloc_ids = num_closids;
+#endif
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+ min_cbm_bits = 0;
+ arch_has_sparse_bitmaps = true;
+ }
+
+ ret = resctrl_register_ctrl_resource(&cat);
+#ifdef CDP
+ if (!ret)
+ ret = resctrl_register_ctrl_resource(&cat_code);
+ if (ret)
+ resctrl_unregister_ctrl_resource(&cat);
+#endif
+ return ret;
+}
+
+static void __exit cat_cleanup(void)
+{
+ resctrl_unregister_ctrl_resource(&cat);
+#ifdef CDP
+ resctrl_unregister_ctrl_resource(&cat_code);
+#endif
+}
+
+module_init(cat_init);
+module_exit(cat_cleanup);
+
+MODULE_LICENSE("GPL");
diff --git a/fs/resctrl2/arch/x86/rdt_l3_cdp.c b/fs/resctrl2/arch/x86/rdt_l3_cdp.c
new file mode 120000
index 000000000000..d25679af550e
--- /dev/null
+++ b/fs/resctrl2/arch/x86/rdt_l3_cdp.c
@@ -0,0 +1 @@
+rdt_l3_cat.c
\ No newline at end of file
diff --git a/fs/resctrl2/arch/x86/rdt_l3_mba.c b/fs/resctrl2/arch/x86/rdt_l3_mba.c
new file mode 100644
index 000000000000..f937fb02697e
--- /dev/null
+++ b/fs/resctrl2/arch/x86/rdt_l3_mba.c
@@ -0,0 +1,251 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2023 Intel Corporation. */
+
+/*
+ * X86 Resource Control Driver for memory bandwidth allocation
+ */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/resctrl.h>
+#include <linux/seq_file.h>
+
+#include "rdt.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+struct throttle_values {
+ u64 now;
+ u64 staged;
+ bool need_update;
+};
+
+struct mydomain {
+ int max_throttle;
+ struct throttle_values throttle_values[];
+};
+#define get_mydomain(d) ((struct mydomain *)(&d[1]))
+
+static struct resctrl_resource mba;
+
+static int bandwidth_gran, delay_linear, min_bandwidth, num_closids;
+
+static void show(struct resctrl_resource *r, struct seq_file *m, u64 resctrl_ids)
+{
+ int closid = (resctrl_ids >> 32);
+ struct resctrl_domain *d;
+ struct throttle_values *tvalues;
+ char *sep = "";
+
+ list_for_each_entry(d, &r->domains, list) {
+ tvalues = get_mydomain(d)->throttle_values;
+ seq_printf(m, "%s%d=%lld", sep, d->id, tvalues[closid].now);
+ sep = ";";
+ }
+ seq_puts(m, "\n");
+}
+
+static void resetstaging(struct resctrl_resource *r, u64 resctrl_ids)
+{
+ int closid = (resctrl_ids >> 32);
+ struct resctrl_domain *d;
+ struct throttle_values *tvalues;
+
+ list_for_each_entry(d, &r->domains, list) {
+ tvalues = get_mydomain(d)->throttle_values;
+ tvalues[closid].need_update = false;
+ }
+}
+
+static bool validate_throttle(struct resctrl_domain *d, char *buf, struct throttle_values *c)
+{
+ unsigned long val;
+ struct mydomain *m = get_mydomain(d);
+ int ret;
+
+ ret = kstrtoul(buf, 10, &val);
+ if (ret) {
+ // rdt_last_cmd_printf("Non-decimal character in the value %s\n", buf);
+ return false;
+ }
+
+ /* User didn't change this value */
+ if (val == c->now)
+ return true;
+
+ if (val > m->max_throttle) {
+ // rdt_last_cmd_puts("Throttle value out of range\n");
+ return false;
+ }
+ if (val % bandwidth_gran) {
+ // rdt_last_cmd_printf("Throttle must be multiple of %lld\n", bandwidth_gran);
+ return false;
+ }
+
+ c->need_update = true;
+ c->staged = val;
+
+ return true;
+}
+
+static int parse(struct resctrl_resource *r, char *line, u64 resctrl_ids)
+{
+ int closid = (resctrl_ids >> 32);
+ struct throttle_values *tvalues;
+ char *dom = NULL, *id;
+ struct resctrl_domain *d;
+ unsigned long dom_id;
+
+next:
+ if (!line || line[0] == '\0')
+ return 0;
+ dom = strsep(&line, ";");
+ id = strsep(&dom, "=");
+ id = strim(id);
+ if (!dom || kstrtoul(id, 10, &dom_id)) {
+ // rdt_last_cmd_puts("Missing '=' or non-numeric domain\n");
+ return -EINVAL;
+ }
+ dom = strim(dom);
+ list_for_each_entry(d, &r->domains, list) {
+ if (d->id != dom_id)
+ continue;
+ tvalues = get_mydomain(d)->throttle_values;
+ if (!validate_throttle(d, dom, tvalues + closid))
+ return -EINVAL;
+ goto next;
+ }
+ return -EINVAL;
+}
+
+struct rdt_msr_info {
+ int msr_base;
+ struct throttle_values *tvalues;
+};
+
+static void update_msrs(void *info)
+{
+ struct rdt_msr_info *mi = info;
+
+ for (int i = 0; i < mba.num_alloc_ids; i++) {
+ if (mi->tvalues[i].need_update) {
+ mi->tvalues[i].now = mi->tvalues[i].staged;
+ mi->tvalues[i].need_update = false;
+ wrmsrl(mi->msr_base + i, mi->tvalues[i].now);
+ }
+ }
+}
+
+static void applychanges(struct resctrl_resource *r, u64 resctrl_ids)
+{
+ int closid = (resctrl_ids >> 32);
+ struct resctrl_domain *d;
+ struct throttle_values *tvalues;
+ struct rdt_msr_info mi;
+
+ list_for_each_entry(d, &r->domains, list) {
+ tvalues = get_mydomain(d)->throttle_values;
+ if (!tvalues[closid].need_update)
+ continue;
+ mi.msr_base = r->archtag;
+ mi.tvalues = tvalues;
+ smp_call_function_single(cpumask_first(&d->cpu_mask), update_msrs, &mi, 1);
+ }
+}
+
+/*
+ * On domain discovery (during module load, or CPU hotplug) set
+ * all controls to allow full memory bandwidth. Ditto on
+ * module unload or domain removal.
+ */
+static void domain_update(struct resctrl_resource *r, int what, int cpu, struct resctrl_domain *d)
+{
+ struct mydomain *m = get_mydomain(d);
+ unsigned int eax, ebx, ecx, edx;
+ struct rdt_msr_info mi;
+ struct throttle_values *tvalues;
+
+ tvalues = (struct throttle_values *)(m + 1);
+ if (what == RESCTRL_DOMAIN_ADD || what == RESCTRL_DOMAIN_DELETE) {
+ cpuid_count(0x10, 3, &eax, &ebx, &ecx, &edx);
+ m->max_throttle = (eax & 0xfff) + 1;
+ bandwidth_gran = 100 - m->max_throttle;
+ min_bandwidth = 100 - m->max_throttle;
+ for (int i = 0; i < mba.num_alloc_ids; i++) {
+ tvalues[i].staged = 0;
+ tvalues[i].need_update = true;
+ }
+ mi.msr_base = r->archtag;
+ mi.tvalues = tvalues;
+ smp_call_function_single(cpu, update_msrs, &mi, 1);
+ }
+}
+
+RESCTRL_FILE_DEF(bandwidth_gran, "%d\n")
+RESCTRL_FILE_DEF(delay_linear, "%d\n")
+RESCTRL_FILE_DEF(min_bandwidth, "%d\n")
+RESCTRL_FILE_DEF(num_closids, "%d\n")
+
+static struct resctrl_fileinfo mb_files[] = {
+ { .name = "bandwidth_gran", .ops = &bandwidth_gran_ops },
+ { .name = "delay_linear", .ops = &delay_linear_ops },
+ { .name = "min_bandwidth", .ops = &min_bandwidth_ops },
+ { .name = "num_closids", .ops = &num_closids_ops },
+ { }
+};
+
+static struct resctrl_resource mba = {
+ .name = "MB",
+ .archtag = MSR_IA32_MBA_THRTL_BASE,
+ .type = RESCTRL_CONTROL,
+ .show = show,
+ .resetstaging = resetstaging,
+ .parse = parse,
+ .applychanges = applychanges,
+ .scope = RESCTRL_L3CACHE,
+ .domain_size = sizeof(struct resctrl_domain) + sizeof(struct mydomain),
+ .domains = LIST_HEAD_INIT(mba.domains),
+ .domain_update = domain_update,
+ .infodir = "MB",
+ .infofiles = mb_files,
+};
+
+static int __init mba_init(void)
+{
+ unsigned int eax, ebx, ecx, edx, mba_features;
+ int ret;
+
+ if (!boot_cpu_has(X86_FEATURE_RDT_A)) {
+ pr_debug("No RDT allocation support\n");
+ return -ENODEV;
+ }
+
+ mba_features = cpuid_ebx(0x10);
+
+ if (!(mba_features & BIT(3))) {
+ pr_debug("No RDT MBA allocation\n");
+ return -ENODEV;
+ }
+
+ cpuid_count(0x10, 3, &eax, &ebx, &ecx, &edx);
+ num_closids = edx + 1;
+ delay_linear = !!(ecx & BIT(2));
+
+ mba.domain_size += num_closids * sizeof(struct throttle_values);
+ mba.num_alloc_ids = num_closids;
+
+ ret = resctrl_register_ctrl_resource(&mba);
+ return ret;
+}
+
+static void __exit mba_cleanup(void)
+{
+ resctrl_unregister_ctrl_resource(&mba);
+}
+
+module_init(mba_init);
+module_exit(mba_cleanup);
+
+MODULE_LICENSE("GPL");
diff --git a/fs/resctrl2/arch/x86/rdt_llc_occupancy.c b/fs/resctrl2/arch/x86/rdt_llc_occupancy.c
new file mode 100644
index 000000000000..e2f96776c2f2
--- /dev/null
+++ b/fs/resctrl2/arch/x86/rdt_llc_occupancy.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2023 Intel Corporation. */
+
+#include <asm/cpufeatures.h>
+
+#include "../../internal.h"
+
+#include "rdt.h"
+
+#ifndef EVENT
+#error "Need definition of which EVENT this module tracks"
+#endif
+
+static int mon_show(struct seq_file *sf, void *v)
+{
+ struct kernfs_open_file *of = sf->private;
+ struct kernfs_node *kn = of->kn;
+ long resctrl_ids = (long)kn->priv;
+ long domain_id = (long)kn->parent->priv;
+
+ seq_printf(sf, "%llu\n", rdt_rmid_read(domain_id, resctrl_ids & 0xffff, EVENT));
+
+ return 0;
+}
+
+static void domain_update(struct resctrl_resource *r, int what, int cpu, struct resctrl_domain *d)
+{
+}
+
+static struct kernfs_ops ops = {
+ .seq_show = mon_show,
+};
+
+static struct resctrl_resource mon = {
+ .name = "L3",
+ .archtag = MSR_IA32_QM_EVTSEL,
+ .type = RESCTRL_MONITOR,
+ .scope = RESCTRL_L3CACHE,
+ .domain_size = sizeof(struct resctrl_domain),
+ .domains = LIST_HEAD_INIT(mon.domains),
+ .domain_update = domain_update,
+ .mon_domain_dir = "mon_L3_%02d",
+#if EVENT == EV_LLC
+ .mon_domain_file= "llc_occupancy",
+#elif EVENT == EV_TOT
+ .mon_domain_file= "mbm_total_bytes",
+#elif EVENT == EV_LOC
+ .mon_domain_file= "mbm_local_bytes",
+#elif EVENT == EV_TOTRATE
+ .mon_domain_file= "mbm_total_rate",
+#elif EVENT == EV_LOCRATE
+ .mon_domain_file= "mbm_local_rate",
+#else
+#error "Unknown EVENT type"
+#endif
+ .mod_domain_ops = &ops,
+ .mon_event = EVENT,
+};
+
+static int rdt_monitor_init(void)
+{
+ u32 eax, ebx, ecx, edx;
+ int bit;
+
+ switch (EVENT) {
+ case EV_LLC: case EV_TOT: case EV_LOC:
+ bit = EVENT - 1;
+ break;
+ case EV_TOTRATE:
+ bit = EV_TOT - 1;
+ break;
+ case EV_LOCRATE:
+ bit = EV_LOC - 1;
+ break;
+ }
+ if (!boot_cpu_has(X86_FEATURE_CQM))
+ return -ENODEV;
+
+ cpuid_count(0xf, 0, &eax, &ebx, &ecx, &edx);
+ if (!(edx & BIT(1)))
+ return -ENODEV;
+
+ cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx);
+ if (!(edx & BIT(bit)))
+ return -ENODEV;
+
+ resctrl_register_ctrl_resource(&mon);
+
+ return 0;
+}
+
+static void rdt_monitor_exit(void)
+{
+ resctrl_unregister_ctrl_resource(&mon);
+}
+
+module_init(rdt_monitor_init);
+module_exit(rdt_monitor_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/fs/resctrl2/arch/x86/rdt_mbm_adjust.c b/fs/resctrl2/arch/x86/rdt_mbm_adjust.c
new file mode 100644
index 000000000000..430cce257f32
--- /dev/null
+++ b/fs/resctrl2/arch/x86/rdt_mbm_adjust.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2023 Intel Corporation. */
+
+#include <asm/cpufeatures.h>
+#include <asm/intel-family.h>
+
+#include "../../internal.h"
+
+#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5))
+
+/*
+ * The correction factor table is documented in Documentation/arch/x86/resctrl.rst.
+ * If rmid > rmid threshold, MBM total and local values should be multiplied
+ * by the correction factor.
+ *
+ * The original table is modified for better code:
+ *
+ * 1. The threshold 0 is changed to rmid count - 1 so don't do correction
+ * for the case.
+ * 2. MBM total and local correction table indexed by core counter which is
+ * equal to (x86_cache_max_rmid + 1) / 8 - 1 and is from 0 up to 27.
+ * 3. The correction factor is normalized to 2^20 (1048576) so it's faster
+ * to calculate corrected value by shifting:
+ * corrected_value = (original_value * correction_factor) >> 20
+ */
+static const struct mbm_correction_factor_table {
+ u32 rmidthreshold;
+ u64 cf;
+} mbm_cf_table[] __initconst = {
+ {7, CF(1.000000)},
+ {15, CF(1.000000)},
+ {15, CF(0.969650)},
+ {31, CF(1.000000)},
+ {31, CF(1.066667)},
+ {31, CF(0.969650)},
+ {47, CF(1.142857)},
+ {63, CF(1.000000)},
+ {63, CF(1.185115)},
+ {63, CF(1.066553)},
+ {79, CF(1.454545)},
+ {95, CF(1.000000)},
+ {95, CF(1.230769)},
+ {95, CF(1.142857)},
+ {95, CF(1.066667)},
+ {127, CF(1.000000)},
+ {127, CF(1.254863)},
+ {127, CF(1.185255)},
+ {151, CF(1.000000)},
+ {127, CF(1.066667)},
+ {167, CF(1.000000)},
+ {159, CF(1.454334)},
+ {183, CF(1.000000)},
+ {127, CF(0.969744)},
+ {191, CF(1.280246)},
+ {191, CF(1.230921)},
+ {215, CF(1.000000)},
+ {191, CF(1.143118)},
+};
+
+static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;
+static u64 mbm_cf __read_mostly;
+
+u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
+{
+ /* Correct MBM value. */
+ if (rmid > mbm_cf_rmidthreshold)
+ val = (val * mbm_cf) >> 20;
+
+ return val;
+}
+
+void __init rdt_mbm_apply_quirk(int num_rmids)
+{
+ int cf_index;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+ boot_cpu_data.x86 != 6)
+ return;
+ if (boot_cpu_data.x86_model != INTEL_FAM6_BROADWELL_X &&
+ boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X)
+ return;
+
+ cf_index = num_rmids / 8 - 1;
+ if (cf_index >= ARRAY_SIZE(mbm_cf_table)) {
+ pr_info("No MBM correction factor available\n");
+ return;
+ }
+
+ mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold;
+ mbm_cf = mbm_cf_table[cf_index].cf;
+}
diff --git a/fs/resctrl2/arch/x86/rdt_mbm_local_bytes.c b/fs/resctrl2/arch/x86/rdt_mbm_local_bytes.c
new file mode 120000
index 000000000000..e15bfdd4143e
--- /dev/null
+++ b/fs/resctrl2/arch/x86/rdt_mbm_local_bytes.c
@@ -0,0 +1 @@
+rdt_llc_occupancy.c
\ No newline at end of file
diff --git a/fs/resctrl2/arch/x86/rdt_mbm_local_rate.c b/fs/resctrl2/arch/x86/rdt_mbm_local_rate.c
new file mode 120000
index 000000000000..e15bfdd4143e
--- /dev/null
+++ b/fs/resctrl2/arch/x86/rdt_mbm_local_rate.c
@@ -0,0 +1 @@
+rdt_llc_occupancy.c
\ No newline at end of file
diff --git a/fs/resctrl2/arch/x86/rdt_mbm_total_bytes.c b/fs/resctrl2/arch/x86/rdt_mbm_total_bytes.c
new file mode 120000
index 000000000000..e15bfdd4143e
--- /dev/null
+++ b/fs/resctrl2/arch/x86/rdt_mbm_total_bytes.c
@@ -0,0 +1 @@
+rdt_llc_occupancy.c
\ No newline at end of file
diff --git a/fs/resctrl2/arch/x86/rdt_mbm_total_rate.c b/fs/resctrl2/arch/x86/rdt_mbm_total_rate.c
new file mode 120000
index 000000000000..e15bfdd4143e
--- /dev/null
+++ b/fs/resctrl2/arch/x86/rdt_mbm_total_rate.c
@@ -0,0 +1 @@
+rdt_llc_occupancy.c
\ No newline at end of file
diff --git a/fs/resctrl2/arch/x86/rdt_monitor.c b/fs/resctrl2/arch/x86/rdt_monitor.c
new file mode 100644
index 000000000000..66523bcdffc8
--- /dev/null
+++ b/fs/resctrl2/arch/x86/rdt_monitor.c
@@ -0,0 +1,491 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2023 Intel Corporation. */
+
+#include <asm/cpufeatures.h>
+
+#include "../../internal.h"
+#include "rdt.h"
+
+#define MBM_POLL_DELAY 1000 // milliseconds
+
+char *stpcpy(char *__restrict__ dest, const char *__restrict__ src);
+
+struct rmid {
+ struct list_head list;
+ struct list_head child_list;
+ bool is_parent;
+ u64 llc_busy_domains;
+};
+
+struct mbm_event_state {
+ u64 chunks;
+ u64 prev_msr;
+ u64 prev_jiffies;
+ u64 rate;
+};
+
+struct arch_mbm_state {
+ struct mbm_event_state state[2];
+};
+
+struct mydomain {
+ int cpu;
+ spinlock_t msr_lock;
+ struct delayed_work worker;
+ struct arch_mbm_state state[];
+};
+#define get_mydomain(d) ((struct mydomain *)&d[1])
+
+struct rmid_info {
+ struct mydomain *mydomain;
+ u32 eventmap;
+ bool init;
+};
+
+static LIST_HEAD(active_rmids);
+static LIST_HEAD(free_rmids);
+static LIST_HEAD(limbo_rmids);
+
+static struct rmid *rmid_array;
+static int num_rmids;
+static int upscale;
+static int max_threshold_occupancy;
+static int mbm_width = 24;
+static char mon_features[64];
+static struct resctrl_resource monitor;
+static int active_events[EV_MAX];
+
+static void init_rmids(int mon_event);
+static void update_rmids(void *info);
+static bool rmid_polling;
+static u64 llc_busy_threshold;
+unsigned int resctrl_rmid_realloc_limit;
+
+static void check_limbo(struct resctrl_domain *d)
+{
+ struct rmid *r, *tmp;
+
+ list_for_each_entry_safe(r, tmp, &limbo_rmids, list) {
+ u64 rmid = r - rmid_array;
+ u64 chunks;
+
+ if (!(r->llc_busy_domains & BIT(d->id)))
+ continue;
+ wrmsrl(MSR_IA32_QM_EVTSEL, (rmid << 32) | EV_LLC);
+ rdmsrl(MSR_IA32_QM_CTR, chunks);
+
+ if (chunks <= llc_busy_threshold) {
+ r->llc_busy_domains &= ~BIT(d->id);
+ if (!r->llc_busy_domains)
+ list_move_tail(&r->list, &free_rmids);
+ }
+ }
+}
+
+static bool mbm_is_active(void)
+{
+ return (active_events[EV_TOT] + active_events[EV_LOC]) > 0;
+}
+
+static void mbm_poll(struct work_struct *work)
+{
+ struct resctrl_domain *d;
+ struct rmid_info ri;
+ unsigned long flags;
+ struct mydomain *m;
+
+ m = container_of(work, struct mydomain, worker.work);
+ d = (struct resctrl_domain *)m - 1;
+ ri.mydomain = m;
+ ri.eventmap = 0;
+ if (active_events[EV_TOT])
+ ri.eventmap |= BIT(EV_TOT);
+ if (active_events[EV_LOC])
+ ri.eventmap |= BIT(EV_LOC);
+ ri.init = false;
+
+ spin_lock_irqsave(&m->msr_lock, flags);
+ update_rmids(&ri);
+
+ if (!list_empty(&limbo_rmids))
+ check_limbo(d);
+
+ if (!list_empty(&limbo_rmids) || mbm_is_active())
+ schedule_delayed_work_on(m->cpu, &m->worker, msecs_to_jiffies(MBM_POLL_DELAY));
+ else
+ rmid_polling = false;
+ spin_unlock_irqrestore(&m->msr_lock, flags);
+}
+
+static void init_rmid_polling(void)
+{
+ struct resctrl_domain *d;
+ struct mydomain *m;
+
+ rmid_polling = true;
+ list_for_each_entry(d, &monitor.domains, list) {
+ m = get_mydomain(d);
+ INIT_DELAYED_WORK(&m->worker, mbm_poll);
+ m->cpu = cpumask_any(&d->cpu_mask);
+ schedule_delayed_work_on(m->cpu, &m->worker, msecs_to_jiffies(MBM_POLL_DELAY));
+ }
+}
+
+void arch_add_monitor(int mon_event)
+{
+ switch (mon_event) {
+ case EV_LOCRATE:
+ mon_event = EV_LOC;
+ break;
+ case EV_TOTRATE:
+ mon_event = EV_TOT;
+ break;
+ }
+
+ active_events[mon_event]++;
+
+ if (mon_event == EV_TOT || mon_event == EV_LOC) {
+ if (active_events[mon_event] == 1)
+ init_rmids(mon_event);
+ if (!rmid_polling && mbm_is_active())
+ init_rmid_polling();
+ }
+}
+
+void arch_del_monitor(int mon_event)
+{
+ switch (mon_event) {
+ case EV_LOCRATE:
+ mon_event = EV_LOC;
+ break;
+ case EV_TOTRATE:
+ mon_event = EV_TOT;
+ break;
+ }
+
+ active_events[mon_event]--;
+}
+
+int rmid_alloc(int prmid)
+{
+ struct rmid *r;
+
+ if (!num_rmids)
+ return 0;
+
+ if (list_empty(&free_rmids))
+ return list_empty(&limbo_rmids) ? -ENOSPC : -EBUSY;
+
+ r = list_first_entry(&free_rmids, struct rmid, list);
+
+ if (prmid < 0) {
+ r->is_parent = true;
+ INIT_LIST_HEAD(&r->child_list);
+ } else {
+ r->is_parent = false;
+ list_add(&r->child_list, &rmid_array[prmid].child_list);
+ }
+
+ list_move(&r->list, &active_rmids);
+
+ return r - rmid_array;
+}
+
+void rmid_free(int rmid)
+{
+ struct rmid *r = &rmid_array[rmid];
+ struct resctrl_domain *d;
+
+ if (active_events[EV_LLC]) {
+ list_for_each_entry(d, &monitor.domains, list)
+ r->llc_busy_domains |= BIT(d->id);
+ list_move_tail(&r->list, &limbo_rmids);
+ if (!rmid_polling)
+ init_rmid_polling();
+ } else {
+ list_move_tail(&r->list, &free_rmids);
+ }
+ if (r->is_parent)
+ WARN_ON(!list_empty(&r->child_list));
+ else
+ list_del(&r->child_list);
+}
+
+static u64 wrap(u64 old, u64 new)
+{
+ u64 shift = 64 - mbm_width, chunks;
+
+ chunks = (new << shift) - (old << shift);
+
+ return chunks >> shift;
+}
+
+static u64 adjust(struct mydomain *m, u64 rmid, u64 event, u64 chunks)
+{
+ struct mbm_event_state *s;
+ u64 rawchunks;
+
+
+ switch (event) {
+ case EV_LLC:
+ rawchunks = chunks;
+ break;
+ case EV_TOT:
+ s = &m->state[rmid].state[0];
+ rawchunks = get_corrected_mbm_count(rmid, s->chunks + wrap(s->prev_msr, chunks));
+ break;
+ case EV_LOC:
+ s = &m->state[rmid].state[1];
+ rawchunks = get_corrected_mbm_count(rmid, s->chunks + wrap(s->prev_msr, chunks));
+ break;
+ case EV_TOTRATE:
+ s = &m->state[rmid].state[0];
+ rawchunks = get_corrected_mbm_count(rmid, s->rate);
+ break;
+ case EV_LOCRATE:
+ s = &m->state[rmid].state[1];
+ rawchunks = get_corrected_mbm_count(rmid, s->rate);
+ break;
+ }
+ return rawchunks;
+}
+
+struct rrmid_info {
+ struct resctrl_domain *domain;
+ u64 rmid;
+ u64 event;
+ u64 chunks;
+};
+
+static void __rdt_rmid_read(void *info)
+{
+ struct rrmid_info *rr = info;
+ unsigned long flags;
+ struct rmid *cr, *r;
+ struct mydomain *m;
+ u64 chunks;
+
+ m = get_mydomain(rr->domain);
+
+ if (rr->event <= EV_LOC) {
+ spin_lock_irqsave(&m->msr_lock, flags);
+ wrmsrl(MSR_IA32_QM_EVTSEL, (rr->rmid << 32) | rr->event);
+ rdmsrl(MSR_IA32_QM_CTR, chunks);
+ } else {
+ chunks = 0;
+ }
+
+ rr->chunks = adjust(m, rr->rmid, rr->event, chunks);
+
+ r = &rmid_array[rr->rmid];
+ if (r->is_parent && !list_empty(&r->child_list)) {
+ list_for_each_entry(cr, &r->child_list, child_list) {
+ u64 crmid = cr - rmid_array;
+
+ if (rr->event <= EV_LOC) {
+ wrmsrl(MSR_IA32_QM_EVTSEL, (crmid << 32) | rr->event);
+ rdmsrl(MSR_IA32_QM_CTR, chunks);
+ } else {
+ chunks = 0;
+ }
+
+ rr->chunks += adjust(m, crmid, rr->event, chunks);
+ }
+ }
+
+ if (rr->event <= EV_LOC)
+ spin_unlock_irqrestore(&m->msr_lock, flags);
+}
+
+u64 rdt_rmid_read(int domain_id, int rmid, int event)
+{
+ struct resctrl_domain *d;
+ struct rrmid_info rr;
+ struct mydomain *m;
+
+ list_for_each_entry(d, &monitor.domains, list)
+ if (d->id == domain_id)
+ goto found;
+ return ~0ull;
+found:
+ m = get_mydomain(d);
+
+ rr.domain = d;
+ rr.rmid = rmid;
+ rr.event = event;
+
+ if (event <= EV_LOC)
+ smp_call_function_any(&d->cpu_mask, __rdt_rmid_read, &rr, 1);
+ else
+ __rdt_rmid_read(&rr);
+
+ return rr.chunks * upscale;
+}
+EXPORT_SYMBOL_GPL(rdt_rmid_read);
+
+static void update_rmids(void *info)
+{
+ struct rmid_info *ri = info;
+ struct mbm_event_state *s;
+ u64 addchunks, now;
+ u32 map, event;
+ struct rmid *r;
+
+ list_for_each_entry(r, &active_rmids, list) {
+ u64 msr, rmid = r - rmid_array;
+
+ for (map = ri->eventmap; map; map &= ~BIT(event)) {
+ event = __ffs(map);
+
+ if (event == EV_TOT)
+ s = &ri->mydomain->state[rmid].state[0];
+ else
+ s = &ri->mydomain->state[rmid].state[1];
+ wrmsrl(MSR_IA32_QM_EVTSEL, (rmid << 32) | event);
+ rdmsrl(MSR_IA32_QM_CTR, msr);
+ now = jiffies;
+ addchunks = wrap(s->prev_msr, msr);
+ if (ri->init) {
+ s->chunks = 0;
+ s->rate = 0;
+ } else {
+ s->chunks += addchunks;
+ s->rate = addchunks * HZ / (now - s->prev_jiffies);
+ }
+ s->prev_jiffies = now;
+ s->prev_msr = msr;
+ }
+ }
+}
+
+static void init_rmids(int mon_event)
+{
+ struct resctrl_domain *d;
+ struct rmid_info ri;
+
+ ri.init = true;
+
+ list_for_each_entry(d, &monitor.domains, list) {
+ ri.mydomain = get_mydomain(d);
+ ri.eventmap = BIT(mon_event);
+ smp_call_function_any(&d->cpu_mask, update_rmids, &ri, 1);
+ }
+}
+
+static void domain_update(struct resctrl_resource *r, int what, int cpu, struct resctrl_domain *d)
+{
+ struct mydomain *m = get_mydomain(d);
+
+ if (what == RESCTRL_DOMAIN_ADD ||
+ (what == RESCTRL_DOMAIN_DELETE_CPU && cpu == m->cpu)) {
+ if (what == RESCTRL_DOMAIN_DELETE_CPU)
+ cancel_delayed_work(&m->worker);
+ spin_lock_init(&m->msr_lock);
+ INIT_DELAYED_WORK(&m->worker, mbm_poll);
+ m->cpu = cpumask_any(&d->cpu_mask);
+ schedule_delayed_work_on(m->cpu, &m->worker, msecs_to_jiffies(MBM_POLL_DELAY));
+ }
+}
+
+static ssize_t max_threshold_occupancy_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ unsigned int bytes;
+ int ret;
+
+ ret = kstrtouint(buf, 0, &bytes);
+ if (ret)
+ return ret;
+
+ if (bytes > resctrl_rmid_realloc_limit)
+ return -EINVAL;
+
+ llc_busy_threshold = bytes / upscale;
+ max_threshold_occupancy = llc_busy_threshold * upscale;
+
+ return nbytes;
+}
+
+RESCTRL_FILE_DEF(max_threshold_occupancy, "%d\n")
+RESCTRL_FILE_DEF(mon_features, "%s")
+RESCTRL_FILE_DEF(num_rmids, "%d\n")
+
+static struct resctrl_fileinfo monitor_files[] = {
+ { .name = "max_threshold_occupancy", .ops = &max_threshold_occupancy_ops },
+ { .name = "mon_features", .ops = &mon_features_ops },
+ { .name = "num_rmids", .ops = &num_rmids_ops },
+ { }
+};
+
+static struct resctrl_resource monitor = {
+ .name = "L3",
+ .archtag = MSR_IA32_QM_EVTSEL,
+ .type = RESCTRL_MONITOR,
+ .scope = RESCTRL_L3CACHE,
+ .domain_size = sizeof(struct resctrl_domain),
+ .domains = LIST_HEAD_INIT(monitor.domains),
+ .domain_update = domain_update,
+ .infodir = "L3_MON",
+ .infofiles = monitor_files,
+};
+
+static int __init rdt_monitor_init(void)
+{
+ u32 eax, ebx, ecx, edx;
+ char *s;
+
+ if (!boot_cpu_has(X86_FEATURE_CQM))
+ return -ENODEV;
+
+ cpuid_count(0xf, 0, &eax, &ebx, &ecx, &edx);
+ if (!(edx & BIT(1)))
+ return -ENODEV;
+
+ cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx);
+ mbm_width += eax & 0xff;
+ upscale = ebx;
+ num_rmids = ecx + 1;
+ rdt_mbm_apply_quirk(num_rmids);
+
+ monitor.domain_size += num_rmids * sizeof(struct arch_mbm_state);
+
+ max_threshold_occupancy_ops.write = max_threshold_occupancy_write;
+
+ /*
+ * A reasonable upper limit on the max threshold is the number
+ * of lines tagged per RMID if all RMIDs have the same number of
+ * lines tagged in the LLC.
+ *
+ * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+ */
+ resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
+ llc_busy_threshold = (resctrl_rmid_realloc_limit / num_rmids) / upscale;
+ max_threshold_occupancy = llc_busy_threshold * upscale;
+
+ s = mon_features;
+ if (edx & BIT(0))
+ s = stpcpy(s, "llc_occupancy\n");
+ if (edx & BIT(1))
+ s = stpcpy(s, "mbm_total_bytes\n");
+ if (edx & BIT(2))
+ s = stpcpy(s, "mbm_local_bytes\n");
+
+ rmid_array = kzalloc(sizeof *rmid_array * num_rmids, GFP_KERNEL);
+ if (!rmid_array)
+ return -ENOMEM;
+
+ rmid_array[0].is_parent = true;
+ INIT_LIST_HEAD(&rmid_array[0].child_list);
+ list_add(&rmid_array[0].list, &active_rmids);
+
+ for (int i = 1; i < num_rmids; i++)
+ list_add_tail(&rmid_array[i].list, &free_rmids);
+
+ resctrl_register_ctrl_resource(&monitor);
+
+ return 0;
+}
+
+late_initcall(rdt_monitor_init);
+
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 53bab123a8ee..1b7bea469334 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -480,9 +480,16 @@ config GOLDFISH
def_bool y
depends on X86_GOLDFISH

+choice
+ prompt "Resource Control"
+ default X86_CPU_RESCTRL_OFF
+ depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD)
+
+config X86_CPU_RESCTRL_OFF
+ bool "No support for x86 CPU resource control"
+
config X86_CPU_RESCTRL
bool "x86 CPU resource control support"
- depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD)
select KERNFS
select PROC_CPU_RESCTRL if PROC_FS
help
@@ -499,7 +506,77 @@ config X86_CPU_RESCTRL
More information about AMD QoS can be found in the AMD64 Technology
Platform Quality of Service Extensions manual.

- Say N if unsure.
+config X86_CPU_RESCTRL2
+ bool "resctrl2 filesystem support"
+ select KERNFS
+ select RESCTRL2_FS
+ help
+ Enable x86 CPU resource control support.
+
+ Experimental, loadable-module driven version of resource
+ control support.
+
+endchoice
+
+config X86_RDT_L3_CAT
+ tristate "RDT L3 cache control"
+ depends on X86_CPU_RESCTRL2
+ help
+ Loadable module for RDT L3 cache allocation
+
+config X86_RDT_L2_CAT
+ tristate "RDT L2 cache control"
+ depends on X86_CPU_RESCTRL2
+ help
+ Loadable module for RDT L2 cache allocation
+
+config X86_RDT_L3_CDP
+ tristate "RDT L3 CDP cache control"
+ depends on X86_CPU_RESCTRL2
+ help
+ Loadable module for RDT L3 CDP cache allocation
+
+config X86_RDT_L2_CDP
+ tristate "RDT L2 CDP cache control"
+ depends on X86_CPU_RESCTRL2
+ help
+ Loadable module for RDT L2 CDP cache allocation
+
+config X86_RDT_L3_MBA
+ tristate "RDT L3 Memory bandwidth allocation"
+ depends on X86_CPU_RESCTRL2
+ help
+ Loadable module for RDT L3 memory bandwidth allocation
+
+config X86_RDT_LLC_OCCUPANCY
+ tristate "RDT L3 cache occupancy monitor"
+ depends on X86_CPU_RESCTRL2
+ help
+ Loadable module for RDT L3 cache occupancy monitor
+
+config X86_RDT_MBM_TOTAL_BYTES
+ tristate "RDT L3 total memory bandwidth monitor"
+ depends on X86_CPU_RESCTRL2
+ help
+ Loadable module for RDT L3 total memory bandwidth monitor
+
+config X86_RDT_MBM_LOCAL_BYTES
+ tristate "RDT L3 local memory bandwidth monitor"
+ depends on X86_CPU_RESCTRL2
+ help
+ Loadable module for RDT L3 local memory bandwidth monitor
+
+config X86_RDT_MBM_TOTAL_RATE
+ tristate "RDT L3 total memory bandwidth rate monitor"
+ depends on X86_CPU_RESCTRL2
+ help
+ Loadable module for RDT L3 total memory bandwidth rate monitor
+
+config X86_RDT_MBM_LOCAL_RATE
+ tristate "RDT L3 local memory bandwidth rate monitor"
+ depends on X86_CPU_RESCTRL2
+ help
+ Loadable module for RDT L3 local memory bandwidth rate monitor

if X86_32
config X86_BIGSMP
diff --git a/fs/resctrl2/arch/x86/Makefile b/fs/resctrl2/arch/x86/Makefile
new file mode 100644
index 000000000000..393a9412f151
--- /dev/null
+++ b/fs/resctrl2/arch/x86/Makefile
@@ -0,0 +1,29 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_X86_CPU_RESCTRL2) += alloc.o
+obj-$(CONFIG_X86_CPU_RESCTRL2) += rdt_monitor.o
+obj-$(CONFIG_X86_CPU_RESCTRL2) += rdt_mbm_adjust.o
+
+CFLAGS_rdt_l3_cat.o += -DCACHE_LEVEL=3
+CFLAGS_rdt_l2_cat.o += -DCACHE_LEVEL=2
+CFLAGS_rdt_l3_cdp.o += -DCACHE_LEVEL=3 -DCDP
+CFLAGS_rdt_l2_cdp.o += -DCACHE_LEVEL=2 -DCDP
+
+obj-$(CONFIG_X86_RDT_L3_CAT) += rdt_l3_cat.o
+obj-$(CONFIG_X86_RDT_L2_CAT) += rdt_l2_cat.o
+obj-$(CONFIG_X86_RDT_L3_CDP) += rdt_l3_cdp.o
+obj-$(CONFIG_X86_RDT_L2_CDP) += rdt_l2_cdp.o
+
+CFLAGS_rdt_llc_occupancy.o += -DEVENT=1
+CFLAGS_rdt_mbm_total_bytes.o += -DEVENT=2
+CFLAGS_rdt_mbm_local_bytes.o += -DEVENT=3
+CFLAGS_rdt_mbm_total_rate.o += -DEVENT=4
+CFLAGS_rdt_mbm_local_rate.o += -DEVENT=5
+
+obj-$(CONFIG_X86_RDT_LLC_OCCUPANCY) += rdt_llc_occupancy.o
+obj-$(CONFIG_X86_RDT_MBM_TOTAL_BYTES) += rdt_mbm_total_bytes.o
+obj-$(CONFIG_X86_RDT_MBM_LOCAL_BYTES) += rdt_mbm_local_bytes.o
+obj-$(CONFIG_X86_RDT_MBM_TOTAL_RATE) += rdt_mbm_total_rate.o
+obj-$(CONFIG_X86_RDT_MBM_LOCAL_RATE) += rdt_mbm_local_rate.o
+
+obj-$(CONFIG_X86_RDT_L3_MBA) += rdt_l3_mba.o
--
2.40.1



2023-07-04 12:57:43

by Peter Newman

Subject: Re: [RFC PATCH 2/2] resctrl2: Arch x86 modules for most of the legacy control/monitor functions

Hi Tony,

On Tue, Jun 20, 2023 at 5:37 AM Tony Luck <[email protected]> wrote:
> +struct rmid {
> + struct list_head list;
> + struct list_head child_list;
> + bool is_parent;
> +static void __rdt_rmid_read(void *info)
> +{
> + struct rrmid_info *rr = info;
> + unsigned long flags;
> + struct rmid *cr, *r;
> + struct mydomain *m;
> + u64 chunks;
> +
> + m = get_mydomain(rr->domain);
> +
> + if (rr->event <= EV_LOC) {
> + spin_lock_irqsave(&m->msr_lock, flags);

Will there ultimately be any locking at the filesystem layer? I recall
from feedback on my change adding a spinlock here[1] before that the
filesystem-layer locking took care of this.

> + wrmsrl(MSR_IA32_QM_EVTSEL, (rr->rmid << 32) | rr->event);
> + rdmsrl(MSR_IA32_QM_CTR, chunks);
> + } else {
> + chunks = 0;
> + }
> +
> + rr->chunks = adjust(m, rr->rmid, rr->event, chunks);
> +
> + r = &rmid_array[rr->rmid];
> + if (r->is_parent && !list_empty(&r->child_list)) {
> + list_for_each_entry(cr, &r->child_list, child_list) {
> + u64 crmid = cr - rmid_array;
> +
> + if (rr->event <= EV_LOC) {
> + wrmsrl(MSR_IA32_QM_EVTSEL, (crmid << 32) | rr->event);
> + rdmsrl(MSR_IA32_QM_CTR, chunks);
> + } else {
> + chunks = 0;
> + }
> +
> + rr->chunks += adjust(m, crmid, rr->event, chunks);
> + }
> + }
> +
> + if (rr->event <= EV_LOC)
> + spin_unlock_irqrestore(&m->msr_lock, flags);
> +}
> +
> +u64 rdt_rmid_read(int domain_id, int rmid, int event)
> +{
> + struct resctrl_domain *d;
> + struct rrmid_info rr;
> + struct mydomain *m;
> +
> + list_for_each_entry(d, &monitor.domains, list)
> + if (d->id == domain_id)
> + goto found;
> + return ~0ull;
> +found:
> + m = get_mydomain(d);
> +
> + rr.domain = d;
> + rr.rmid = rmid;
> + rr.event = event;
> +
> + if (event <= EV_LOC)
> + smp_call_function_any(&d->cpu_mask, __rdt_rmid_read, &rr, 1);
> + else
> + __rdt_rmid_read(&rr);

I like that the driver is responsible for deciding where IPIs need to
be sent, but it looks like the consequence is that RDT-level code
wants to add in the child monitors' event counts once executing within
the correct domain. The one-per-domain IPI assumption from the current
resctrl code being wrong is probably harder to overcome than needing
to figure out what additional RMIDs to read, but I'd really need to
know the synchronization requirements for __rdt_rmid_read() to inspect
the monitoring group hierarchy.

Would you continue to promise that the FS structure won't change
during a monitor read? To us, the biggest priority for
parallelization is reading all the domain-group combinations in the
system, because we have a lot of them and want the tightest possible
snapshot of bandwidth usage, broken down by group.

Thanks!
-Peter

[1] https://lore.kernel.org/all/[email protected]/

2023-07-05 05:24:42

by Tony Luck

Subject: RE: [RFC PATCH 2/2] resctrl2: Arch x86 modules for most of the legacy control/monitor functions

>> + if (rr->event <= EV_LOC) {
>> + spin_lock_irqsave(&m->msr_lock, flags);
>
> Will there ultimately be any locking at the filesystem layer? I recall
> from feedback on my change adding a spinlock here[1] before that the
> filesystem-layer locking took care of this.

Yes. Working on that now. The spinlock will go away when everything is protected
by resctrl_mutex.

> > + if (event <= EV_LOC)
> > + smp_call_function_any(&d->cpu_mask, __rdt_rmid_read, &rr, 1);
> > + else
> > + __rdt_rmid_read(&rr);
>
> I like that the driver is responsible for deciding where IPIs need to
> be sent, but it looks like the consequence is that RDT-level code
> wants to add in the child monitors' event counts once executing within
> the correct domain. The one-per-domain IPI assumption from the current
> resctrl code being wrong is probably harder to overcome than needing
> to figure out what additional RMIDs to read, but I'd really need to
> know the synchronization requirements for __rdt_rmid_read() to inspect
> the monitoring group hierarchy.

There are likely some gaps in current code. But I think they should be
fixable.

> Would you continue to promise that the FS structure won't change
> during a monitor read? To us, the biggest priority for
> parallelization is reading all the domain-group combinations in the
> system, because we have a lot of them and want the tightest possible
> snapshot of bandwidth usage, broken down by group.

The mbm_poll() code that makes sure that counters don't wrap is
doing all the expensive wrmsr(QM_EVTSEL);rdmsr(QM_COUNT)
once per second to give you the data you want. But existing resctrl
filesystem doesn't let you do a bulk read. I have some ideas on how
to provide something better. One question: do you really need that
snapshot to be system-wide? Or can you live with separate L3-scoped
snapshots that aren't tightly synchronized with each other?

-Tony

2023-07-06 10:39:29

by Peter Newman

Subject: Re: [RFC PATCH 2/2] resctrl2: Arch x86 modules for most of the legacy control/monitor functions

Hi Tony,

On Wed, Jul 5, 2023 at 6:46 AM Luck, Tony <[email protected]> wrote:
> The mbm_poll() code that makes sure that counters don't wrap is
> doing all the expensive wrmsr(QM_EVTSEL);rdmsr(QM_COUNT)
> once per second to give you the data you want.

I was doing that in the soft RMID series I posted earlier because it
simplified things, but then I had some realizations about how much
error +/- 1 second on the sampling point could result in[1]. We
usually measure the bandwidth rate with a 5-second window, so a
reading that's up to one second old would mean a 20% error in the
bandwidth calculation.
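(To spell out the arithmetic: a delta taken over a nominal 5-second
window may actually cover anywhere from 4 to 6 seconds of traffic if
either endpoint sample is up to a second stale, yet it still gets
divided by 5, i.e. up to 1/5 = 20% error in either direction.)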

> But existing resctrl
> filesystem doesn't let you do a bulk read. I have some ideas on how
> to provide something better. One question: do you really need that
> snapshot to be system-wide? Or can you live with separate L3-scoped
> snapshots that aren't tightly synchronized with each other?

The most demanding use case is responding to a bandwidth saturation
emergency, where the admin wants a snapshot of all jobs' bandwidth
rates to identify which one(s) to kill. Jobs aren't confined to L3
domains, so I expect it's more important to get a tight snapshot for
an RMID-at-a-time. Going domain-at-a-time could create more races
where a high-bandwidth job migrates at the right time and avoids
detection.

One workaround we've been using to cut down on IPIs is providing a
usermode threshold knob for limiting how old a reported event count
can be, but this is more useful for soft RMIDs where reading a single
CPU's counters makes all RMIDs' stored counts current, and IPIs
resulting from userspace reads of different groups in the same domain
are all redundant.

We could probably work with second-old event counts if we knew the
exact time they were recorded so we'd know the exact time delta for
the bandwidth count delta, but that seems like it would be awkward in
the current interface.

-Peter

[1] https://lore.kernel.org/lkml/CALPaoCi_N8JHbP4zFD=ijBh5-=4Q0Ec-LrspYNGGnj4G6A6U0g@mail.gmail.com/

2023-07-11 00:45:57

by Tony Luck

Subject: Re: [RFC PATCH 2/2] resctrl2: Arch x86 modules for most of the legacy control/monitor functions

On Thu, Jul 06, 2023 at 12:22:03PM +0200, Peter Newman wrote:
> Hi Tony,
>
> On Wed, Jul 5, 2023 at 6:46 AM Luck, Tony <[email protected]> wrote:
> > The mbm_poll() code that makes sure that counters don't wrap is
> > doing all the expensive wrmsr(QM_EVTSEL);rdmsr(QM_COUNT)
> > once per second to give you the data you want.
>
> I was doing that in the soft RMID series I posted earlier because it
> simplified things, but then I had some realizations about how much
> error +/- 1 second on the sampling point could result in[1]. We
> usually measure the bandwidth rate with a 5-second window, so a
> reading that's up to one second old would mean a 20% error in the
> bandwidth calculation.

I just pushed the latest version of the resctrl2 patches to the
resctrl2_v65rc1 branch of git://git.kernel.org/pub/scm/linux/kernel/git/aegl/linux.git

As well as locking, bug fixes, and general updates, it includes an
experimental feature to provide summary MBM information for each
node; both "total" and "local" rates are provided (example below). Note
that you have to load modules rdt_mbm_local_bytes and
rdt_mbm_total_bytes so that the MBM overflow threads are
running. I should fix the code to print "n/a" instead of
"0" if they are not.

$ cat /sys/fs/resctrl/mon_data/mon_L3_00/mbm_summary
3638 3638 /g2
3087 3087 /g2/m2
3267 3267 /g2/m1
3443 3443 /g1
3629 3629 /g1/m2
3588 3587 /g1/m1
3999 3993 /
3370 3369 /m2
3432 3432 /m1

The rates are produced once per second by the MBM overflow
code. They compute MBytes/sec as "chunks since last poll"
divided by (now - then). I'm using jiffies for the times,
which may be good enough. "now - then" is one second (maybe
more if the kernel thread doing the MBM polling is delayed
from running).
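
In code terms the per-RMID calculation in update_rmids() boils down to
roughly this ("upscale" is the bytes-per-chunk scaling factor reported
by CPUID leaf 0xf; the final MByte conversion is just shorthand for
what the summary file prints):

	now = jiffies;
	addchunks = wrap(s->prev_msr, msr);	/* chunks since last poll */
	s->rate = addchunks * HZ / (now - s->prev_jiffies);
	s->prev_jiffies = now;
	s->prev_msr = msr;

	/* reported as MBytes/sec, approximately */
	mbytes_per_sec = (s->rate * upscale) >> 20;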

I should fix the summarization code to work the same as the
regular MBM files (i.e. make the parent control directory
report the sum of all its children).

The code also attempts (but fails) to make these mbm_summary
files poll(2)-able, with the wakeup dependent on the aggregate
measured bandwidth compared against a configurable threshold:

$ cat /sys/fs/resctrl/info/L3_MON/mbm_poll_threshold
10000000

There's something wrong though. Poll(2) always says there is
data to be read. I only see one other piece of kernel code
implementing poll on kernfs (in the cgroup code). Perhaps
my problem is an inability to write an application that uses
poll(2) correctly.
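
For reference, the pattern kernfs notification seems to expect from
userspace (judging by kernfs_generic_poll() and the cgroup code) is:
wait for POLLPRI rather than POLLIN (kernfs reports the file as
readable at all times), then seek back and re-read the file after each
wakeup so the open file's event count catches up. A rough, untested
sketch of such a reader (the path is just the mbm_summary example
above):

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd = open("/sys/fs/resctrl/mon_data/mon_L3_00/mbm_summary", O_RDONLY);
	struct pollfd pfd = { .fd = fd, .events = POLLPRI };

	if (fd < 0)
		return 1;

	/* baseline read of the current contents */
	while (read(fd, buf, sizeof(buf)) > 0)
		;

	while (poll(&pfd, 1, -1) > 0) {
		if (!(pfd.revents & (POLLPRI | POLLERR)))
			continue;
		lseek(fd, 0, SEEK_SET);
		while ((n = read(fd, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, n, stdout);
	}
	close(fd);
	return 0;
}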

Let me know if this all seems like a useful direction. Maybe
the polling part is overkill and it is sufficient to just
have a cheap way to get all the bandwidths even if the values
seen might be up to one second old.

-Tony