Attached is an early version of the ioctl based CAT interface we
have been working on.
NOTE: it does not compile, there is no locking, but should
be sufficient for interested people to comment.
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index db3622f..293726b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -757,6 +757,14 @@ config HPET_EMULATE_RTC
def_bool y
depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
+config CACHE_RESERVATION
+ tristate "Cache Reservation Support"
+ default n
+ ---help---
+ This feature makes use of Intel's Cache Allocation Technology to allow the
+ reservation of portions of the L3 cache to specific tasks. Please, see
+ Documentation/x86/cache-reservation.txt for more information.
+
config APB_TIMER
def_bool y if X86_INTEL_MID
prompt "Intel MID APB Timer Support" if X86_INTEL_MID
diff --git a/arch/x86/include/uapi/asm/cache_reservation.h b/arch/x86/include/uapi/asm/cache_reservation.h
new file mode 100644
index 0000000..c4dcc95
--- /dev/null
+++ b/arch/x86/include/uapi/asm/cache_reservation.h
@@ -0,0 +1,64 @@
+enum cache_rsvt_flags {
+ CACHE_RSVT_ROUND_DOWN = (1 << 0), /* round kbytes down */
+};
+
+enum cache_rsvt_type {
+ CACHE_RSVT_TYPE_CODE = 0, /* cache reservation is for code */
+ CACHE_RSVT_TYPE_DATA, /* cache reservation is for data */
+ CACHE_RSVT_TYPE_BOTH, /* cache reservation is for both */
+};
+
+struct cat_reservation {
+ __u64 kbytes;
+ __u32 type;
+ __u32 flags;
+ __u32 tcrid;
+ __u32 pad[11];
+};
+
+struct cat_reservation_cpumask {
+ size_t cpusetsize;
+ cpu_set_t *mask;
+ struct cat_reservation res;
+};
+
+struct pid_cat_reservation {
+ __u32 tcrid;
+ __s32 pid;
+ __u32 pad[8];
+};
+
+struct cat_tcrid {
+ __u32 tcrid;
+ __u32 pad[7];
+};
+
+struct cat_reservation_list {
+ /* -- input -- */
+ struct cat_reservation *res;
+ /* how many bytes allocated for list */
+ size_t cat_res_size;
+ cpu_set_t *mask;
+ /* how many bytes allocated for mask */
+ size_t cpusetsize;
+
+ /* -- output -- */
+ /* size of each cpu_set_t entry copied to
+ * cpu_set_t *mask
+ */
+ size_t cpumask_size;
+ __u32 pad[11];
+};
+
+struct cat_tcrid_tasks {
+ __u32 tcrid;
+ size_t nr_entries;
+ struct pid_t *list;
+};
+
+#define CAT_CREATE_RESERVATION _IOW(CATIO, 0x00, struct cat_reservation_cpumask)
+#define CAT_DELETE_RESERVATION _IOR(CATIO, 0x01, struct cat_tcrid)
+#define CAT_ATTACH_RESERVATION _IOW(CATIO, 0x02, struct pid_cat_reservation)
+#define CAT_DETACH_RESERVATION _IOW(CATIO, 0x03, struct pid_cat_reservation)
+#define CAT_GET_RESERVATIONS _IOW(CATIO, 0x04, struct cat_reservation_list)
+#define CAT_GET_TCRID_TASKS _IOW(CATIO, 0x05, struct)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index b1b78ff..57129d6 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -110,6 +110,8 @@ obj-$(CONFIG_EFI) += sysfb_efi.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o
+obj-$(CONFIG_CACHE_RESERVATION) += cat/
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/cat/Makefile b/arch/x86/kernel/cat/Makefile
new file mode 100644
index 0000000..031fd64
--- /dev/null
+++ b/arch/x86/kernel/cat/Makefile
@@ -0,0 +1 @@
+obj-y += cache_reservation.o
diff --git a/arch/x86/kernel/cat/cache_reservation.c b/arch/x86/kernel/cat/cache_reservation.c
new file mode 100644
index 0000000..4187a57
--- /dev/null
+++ b/arch/x86/kernel/cat/cache_reservation.c
@@ -0,0 +1,1244 @@
+
+#include <linux/list.h>
+#include <linux/bitmap.h>
+#include <linux/kernel.h>
+#include <linux/cacheinfo.h>
+#include <linux/cpumask.h>
+#include <linux/topology.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include "cache_reservation.h"
+#include <uapi/asm/cache_reservation.h>
+#include <asm/uaccess.h>
+
+/*
+ *
+ * There are two main data structures: tcrid entries, and tcrid lists.
+ * A tcrid entry contains size,type information and is used
+ * to identify a cache allocation reservation.
+ * One task should not allocate more than one tcrid per type
+ * unless that tcrid is to be shared with a different task.
+ * A tcrid list is a set of tcrid entries, and is mapped to (used by)
+ * one or more tasks.
+ * Each task is mapped to only one tcrid list.
+ * A tcrid entry can be in one or more tcrid lists at the same time.
+ *
+ * Mapping to Intel CAT:
+ * * tcrid list maps one-to-one to a COS-ID.
+ * * tcrid entry represents a range of bits
+ * in a number of (one or more) Cache Capacity Bitmasks,
+ * which are specified in HW via IA32_L3_MASK_n MSRs.
+ * * one tcrid entry can be in different locations
+ * in different sockets.
+ * * tcrid entries of a tcrid list must be mapped contiguously
+ * in hardware.
+ *
+ */
+
+unsigned long *closmap;
+
+LIST_HEAD(tcr_global_list);
+DEFINE_MUTEX(tcr_list_mutex);
+
+DECLARE_BITMAP(tcrid_used_bitmap, CBM_LEN);
+struct tcr_entry *tcrid_table;
+static unsigned int total_tcrentry_bits;
+
+static unsigned int l3_cache_size;
+//static u32 max_closid;
+static u32 max_cbm_len;
+static unsigned int kbytes_per_cbm_bit;
+static unsigned int l3_nr_cbm_bits;
+
+static unsigned int max_sockets;
+
+struct cache_layout {
+ unsigned long *closmap;
+ u32 hw_shared_bitmask;
+ int id;
+ struct list_head link;
+ int nr_users;
+};
+
+LIST_HEAD(layout_list);
+
+struct per_socket_data {
+ /* start, end of shared region with HW */
+ u32 hw_shared_bitmask;
+ int initialized;
+ unsigned long *cosidzeromask;
+ struct cache_layout *layout;
+ unsigned int occupied_cbm_bits;
+};
+
+struct per_socket_data *psd;
+static unsigned int psd_size;
+
+/*
+ * CDP capable hardware: CDP-on by default.
+ * Use intel_cat_mode=cat kernel parameter to switch to cat.
+ */
+static bool __read_mostly enable_cdp = 1;
+module_param_named(ept, enable_cdp, bool, S_IRUGO);
+
+// protects addition to layout_list
+static DEFINE_RAW_SPINLOCK(cache_layout_lock);
+
+DECLARE_BITMAP(cache_layout_ids, MAX_LAYOUTS);
+
+struct cache_layout *find_create_layout(u32 hw_shared_bitmask)
+{
+ struct cache_layout *l;
+
+ raw_spin_lock(&cache_layout_lock);
+
+ list_for_each_entry(l, &layout_list, link) {
+ if (l->hw_shared_bitmask == hw_shared_bitmask)
+ l->nr_users++;
+ raw_spin_unlock(&cache_layout_lock);
+ return l;
+ }
+
+ l = kzalloc(GFP_ATOMIC, sizeof(struct cache_layout));
+ if (!l) {
+ panic("%s alloc failed", __func__);
+ }
+ l->hw_shared_bitmask = hw_shared_bitmask;
+ l->id = find_first_zero_bit(cache_layout_ids, MAX_LAYOUTS);
+ if (l->id == MAX_LAYOUTS) {
+ printk(KERN_ERR "intel_cat: MAX_LAYOUTS exceeded\n");
+ /* reuse id 0 */
+ l = list_first_entry(&layout_list, struct cache_layout, link);
+ l->nr_users++;
+ raw_spin_unlock(&cache_layout_lock);
+ return l;
+ }
+ set_bit(l->id, cache_layout_ids);
+ l->nr_users++;
+ INIT_LIST_HEAD(&l->link);
+ list_add(&l->link, &layout_list);
+ raw_spin_unlock(&cache_layout_lock);
+ return l;
+}
+
+u32 maxtcrlist_id;
+
+int alloc_tcrid_table(void)
+{
+ struct tcr_entry *e;
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ int i;
+
+ maxtcrlist_id = c->x86_cache_max_closid;
+
+ tcrid_table = kzalloc(GFP_KERNEL, CBM_LEN);
+ if (!tcrid_table)
+ return -ENOMEM;
+
+ for (i = 0; i < CBM_LEN; i++) {
+ unsigned int size;
+ e = &tcrid_table[i];
+ e->tcrid = i;
+ size = BITS_TO_LONGS(maxtcrlist_id) *
+ sizeof(unsigned long);
+ e->tcrlist_bmap = kzalloc(GFP_KERNEL, size);
+ if (!e->tcrlist_bmap) {
+ goto out_err;
+ }
+ }
+
+ return 0;
+out_err:
+ for (i = 0; i < CBM_LEN; i++) {
+ e = &tcrid_table[i];
+ kfree(e->tcrlist_bmap);
+ }
+ kfree(tcrid_table);
+ return -ENOMEM;
+}
+
+
+#define reserved_cbm_bits 2
+int account_cbm_bits(struct cat_reservation_cpumask *crmask,
+ unsigned int cbm_bits)
+{
+ unsigned int cpu;
+
+
+ // const struct cpumask *cpumask
+ for_each_cpu(cpu, crmask->mask) {
+ unsigned int socket, free_cbm_bits;
+ struct per_socket_data *psd;
+
+ if (!cpu_online(cpu))
+ return 1;
+
+ socket = topology_physical_package_id(cpu);
+ psd = get_socket_data(socket);
+ free_cbm_bits = l3_nr_cbm_bits - psd->occupied_cbm_bits;
+ if (cbm_bits > free_cbm_bits)
+ return 1;
+ }
+
+ for_each_cpu(cpu, crmask->mask) {
+ unsigned int socket, free_cbm_bits;
+ struct per_socket_data *psd;
+
+ socket = topology_physical_package_id(cpu);
+ psd = get_socket_data(socket);
+ psd->occupied_cbm_bits += cbm_bits;
+ }
+ return 0;
+}
+
+int deaccount_cbm_bits(struct tcr_entry *e)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, e->mask) {
+ unsigned int socket;
+ struct per_socket_data *psd;
+
+ /* FIXME:
+ *
+ * 1) alloc reservation
+ * 2) cpu offline
+ * 3) dealloc reservation
+ * 4) cpu online
+ */
+ if (!cpu_online(cpu))
+ return 1;
+
+ socket = topology_physical_package_id(cpu);
+ psd = get_socket_data(socket);
+ psd->occupied_cbm_bits -= e->cbm_bits;
+ }
+ return 0;
+}
+
+struct tcr_entry *alloc_tcr_entry(struct cat_reservation_cpumask *crmask,
+ unsigned int cbm_bits)
+{
+ struct tcr_entry *e;
+ int i;
+
+ i = find_first_zero_bit(tcrid_used_bitmap, CBM_LEN);
+ if (i >= CBM_LEN) {
+ return ERR_PTR(-ENOMEM);
+ }
+
+ if (account_cbm_bits(cpumask, cbm_bits))
+ return ERR_PTR(-ENOMEM);
+
+ set_bit(i, tcrid_used_bitmap);
+ e = &tcrid_table[i];
+
+ return e;
+}
+
+struct tcr_entry *find_tcr_entry(u32 tcrid)
+{
+ struct tcr_entry *e;
+
+ if (tcrid >= CBM_LEN) {
+ return ERR_PTR(-EINVAL);
+ }
+ if (!test_bit(tcrid, tcrid_used_bitmap)) {
+ return ERR_PTR(-EINVAL);
+ }
+
+ e = &tcrid_table[tcrid];
+ return e;
+}
+
+void free_tcr_entry(struct tcr_entry *e)
+{
+ clear_bit(e->tcrid, tcrid_used_bitmap);
+ WARN_ON(!bitmap_empty(e->tcrlist_bmap, maxtcrlist_id));
+ deaccount_cbm_bits(e);
+ if (e->cpumask)
+ free_cpumask_var(e->cpumask);
+ e->cpumask = NULL;
+}
+
+int tcrentry_in_tcrlist(struct tcr_entry *e, struct tcr_list *l)
+{
+ return test_bit(l->id, e->tcrlist_bmap);
+}
+
+
+#if 0
+void tcrlist_changed(struct tcr_list *l)
+{
+ unsigned int size = BITS_TO_LONGS(max_sockets * NR_CPUS) * sizeof(long);
+ bitmap_clear(l->synced_to_socket, size);
+}
+#endif
+
+int add_tcrentry_to_tcrlist(struct tcr_entry *e, struct tcr_list *l)
+{
+ set_bit(l->id, e->tcrlist_bmap);
+ set_bit(e->tcrid, l->tcrentry_bmap);
+ return 0;
+}
+
+int remove_tcrentry_from_tcrlist(struct tcr_entry *e, struct tcr_list *l)
+{
+ clear_bit(l->id, e->tcrlist_bmap);
+ clear_bit(e->tcrid, l->tcrentry_bmap);
+ /* no more tcrlists referencing this tcrentry: undo allocation
+ on the cache layouts */
+ if (bitmap_empty(&e->tcrlist_bmap, maxtcrlist_id))
+ dealloc_contiguous_regions(e, l);
+ /* no more tcrentries on this tcrlist: unlink it from task */
+ if (bitmap_empty(&l->tcrentry_bmap, CBM_LEN))
+ unlink_tcrlist_from_tasks(l);
+
+ return 0;
+}
+
+/*
+ * returns -ENOMEM if not enough space, -EPERM if no permission.
+ * returns 0 if reservation has been successful, copying actual
+ * number of kbytes reserved to "kbytes", type to type, and tcrid.
+ *
+ */
+int __create_cache_reservation(struct cat_reservation_cpumask *crmask,
+ unsigned long argp)
+{
+ struct tcr_entry *e;
+ unsigned int cbm_bits;
+ unsigned int kbytes;
+ struct cat_reservation *cr = &crmask->res;
+ int ret;
+
+ if (cr->type != CACHE_RSVT_TYPE_BOTH && !enable_cdp)
+ return -ENOTSUPP;
+
+ if (cr->type & CACHE_RSVT_ROUND_DOWN)
+ kbytes = round_down(cr->kbytes, kbytes_per_cbm_bit);
+ else
+ kbytes = round_up(cr->kbytes, kbytes_per_cbm_bit);
+
+ if (kbytes > l3_cache_size)
+ return -ENOSPC;
+
+ cbm_bits = kbytes / kbytes_per_cbm_bit;
+
+ e = alloc_tcr_entry(crmask, cbm_bits);
+ if (IS_ERR(e))
+ return PTR_ERR(e);
+
+ /* fix up the cr with the info we got and copy to user */
+ cr->kbytes = kbytes;
+ cr->type = CACHE_RSVT_TYPE_BOTH;
+ cr->flags = 0;
+ cr->tcrid = e->tcrid;
+ ret = -EFAULT;
+ if (copy_to_user(argp, cr, sizeof(*cr)))
+ goto out_release_tcrid;
+
+ e->user_kbytes = cr->kbytes;
+ e->rounded_kbytes = kbytes;
+ e->cbm_bits = kbytes / kbytes_per_cbm_bit;
+ e->type = cr->type;
+
+ return 0;
+out_release_tcrid:
+ free_tcr_entry(e);
+ return ret;
+}
+
+int create_cache_reservation(struct cat_reservation_cpumask *crmask,
+ unsigned long arg)
+{
+ cpumask_var_t new_mask;
+ int ret;
+ struct cat_reservation *cr = crmask->cr;
+
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = get_user_cpu_mask(crmask->mask, crmask->cpusetsize,
+ new_mask);
+ if (ret == 0)
+ ret = __create_cache_reservation(crmask, arg);
+
+ if (ret == 0) {
+ int len = crmask->cpusetsize;
+
+ size_t retlen = min_t(size_t, len, cpumask_size());
+
+ if (copy_to_user(crmask->mask, new_mask, retlen))
+ ret = -EFAULT;
+ else
+ ret = retlen;
+ }
+ if (ret > 0)
+ cr->cpumask = new_mask;
+ else
+ free_cpumask_var(new_mask);
+ return retval;
+}
+
+/*
+ * TCRentry -> TCRlist mapping:
+ * Each TCRlist is assigned an id from [0, ..., maxclosid]
+ * The id_to_tcrlist[maxclosid] structure contains pointers
+ * to tcrlist structures.
+ * TCRentries contains a bitmap[0, ..., maxclosid]. A bit
+ * set in this bitmap represents the fact that particular
+ * tcrlist references the tcrentry.
+ */
+struct tcr_list *id_to_tcrlist;
+#define TCRLIST_ID_SZ 128
+DECLARE_BITMAP(tcrlist_ids, TCRLIST_ID_SZ);
+
+static unsigned int alloc_tcrlist_id(void)
+{
+ unsigned int id;
+ id = find_first_zero_bit(&tcrlist_ids, TCRLIST_ID_SZ);
+ if (id < TCRLIST_ID_SZ)
+ set_bit(id, &tcrlist_ids);
+ return id;
+}
+
+static void free_tcrlist_id(unsigned int id)
+{
+ clear_bit(id, &tcrlist_ids);
+ id_to_tcrlist[id] = NULL;
+}
+
+
+struct tcr_list *alloc_tcrlist(void)
+{
+ unsigned int cpus_per_socket;
+ struct tcr_list *l;
+ unsigned int id;
+ u32 size;
+
+ l = kzalloc(sizeof(struct tcr_list), GFP_KERNEL);
+ if (!l) {
+ return ERR_PTR(-ENOMEM);
+ }
+ INIT_LIST_HEAD(&l->global_link);
+ INIT_LIST_HEAD(&l->tcr_list);
+ size = BITS_TO_LONGS(max_sockets * NR_CPUS) * sizeof(long);
+ l->synced_to_socket = kzalloc(GFP_KERNEL, size);
+ if (!l->synced_to_socket) {
+ kfree(l);
+ return ERR_PTR(-ENOMEM);
+ }
+ mutex_lock(&tcr_list_mutex);
+ id = alloc_tcrlist_id();
+ if (id >= TCRLIST_ID_SZ) {
+ kfree(l);
+ mutex_unlock(&tcr_list_mutex);
+ return ERR_PTR(-ENOMEM);
+ }
+ l->id = id;
+ id_to_tcrlist[id] = l;
+ list_add(&l->global_link, &tcr_global_list);
+
+ mutex_unlock(&tcr_list_mutex);
+ return l;
+}
+
+struct tcr_list *find_tcrlist(unsigned long *cmp_bmap)
+{
+ struct tcrlist *l;
+
+ list_for_each_entry(l, &tcr_global_list, global_link) {
+ if (bitmap_equal(l->tcrentry_bmap, &tcrentry_bmap, CBM_LEN))
+ return l;
+ }
+ return NULL;
+}
+
+void free_tcrlist(struct tcr_list *l)
+{
+ mutex_lock(&tcr_list_mutex);
+ free_tcrlist_id(l->id);
+ mutex_unlock(&tcr_list_mutex);
+ kfree(l);
+}
+
+/*
+ * tcrlist is created when attaching a tcrentry to a task.
+ *
+ * destroyed when either task count goes to zero,
+ * or tcrentry count goes to zero.
+ *
+ */
+static void inc_use_count(struct tcr_list *l)
+{
+ l->nr_tasks++;
+}
+
+static void dec_use_count(struct tcr_list *l)
+{
+ l->nr_tasks--;
+ if (l->nr_tasks == 0)
+ free_tcrlist(l);
+}
+
+int link_tcrlist_to_task(struct task_struct *t, struct tcr_list *l)
+{
+ inc_use_count(l);
+ rcu_assign_pointer(t->tcrlist, l);
+#if 0
+ #ifdef CONFIG_INTEL_CAT
+ struct list_head tcrlist_link;
+ #endif
+#endif
+
+ list_add(&t->tcrlist_link, &l->tasks);
+}
+
+int unlink_tcrlist_from_task(struct task_struct *t, struct tcr_list *l)
+{
+ rcu_assign_pointer(t->tcrlist, NULL);
+ rcu_synchronize();
+ list_del(&t->tcrlist_link);
+ dec_use_count(l);
+}
+
+void unlink_tcrlist_from_tasks(struct tcr_list *l)
+{
+ struct task_struct *tsk, *tsk2;
+
+ list_for_each_entry_safe(tsk, tsk2, &l->tasks, tcrlist_link) {
+ rcu_assign_pointer(tsk->tcrlist, NULL);
+ kick_task(tsk);
+ }
+ rcu_synchronize();
+
+ list_for_each_entry_safe(tsk, tsk2, &l->tasks, tcrlist_link) {
+ list_del(&t->tcrlist_link);
+ dec_use_count(l);
+ }
+}
+
+int delete_cache_reservation(struct cat_tcrid *i)
+{
+ struct tcr_entry *e;
+ int bit;
+
+ e = find_tcr_entry(i->tcrid);
+ if (IS_ERR(e)) {
+ return PTR_ERR(e);
+ }
+
+ for_each_set_bit(bit, &e->tcrlist_bmap, maxtcrlist_id) {
+ struct tcr_list *l;
+
+ l = id_to_tcrlist[id];
+ if (!l) {
+ BUG_ON();
+ return 0;
+ }
+ remove_tcrentry_from_tcrlist(e, l);
+ kick_tasks(l);
+ }
+ free_tcr_entry(e);
+ return 0;
+}
+
+
+int check_contiguous_region(struct tcr_entry *e, struct tcr_list *l,
+ struct cache_layout *layout, int *size_p)
+{
+ unsigned long *temp_closmap;
+ u32 size = BITS_TO_LONGS(max_cbm_len) * sizeof(unsigned long);
+ struct tcr_list_per_socket *psd = l->psd[layout->id];
+ u32 cbm_bits;
+
+ temp_closmap = kzalloc(GFP_KERNEL, size);
+ if (!temp_closmap) {
+ return -ENOMEM;
+ }
+
+ memcpy(temp_closmap, layout->closmap, size);
+ /* mark cache ways shared with hw as busy */
+ bitmap_or(temp_closmap, &layout->hw_shared_bitmask, min(max_cbm_len, 32));
+ cbm_bits = 0;
+ if (psd->cbm_end_bit) {
+ cbm_bits = psd->cbm_end_bit - psd->cbm_start_bit + 1;
+ bitmap_clear(temp_closmap, psd->cbm_start_bit, cbm_bits);
+ }
+
+ cbm_bits += e->cbm_bits;
+ s = bitmap_find_next_zero_area(temp_closmap, max_cbm_len, 0,
+ cbm_bits, 0);
+ if (s >= max_cbm_len) {
+ kfree(temp_closmap);
+ return -EBUSY;
+ }
+ *size_p = cbm_bits;
+ return s;
+}
+
+int alloc_contiguous_region(struct tcr_entry *e, struct tcr_list *l,
+ struct cache_layout *layout)
+{
+ int size_p, r;
+ struct tcr_list_per_socket *psd = l->psd[layout->id];
+
+ r = check_contiguous_region(e, l, clayout, &size_p);
+ if (r < 0)
+ return r;
+
+ psd->cbm_start_bit = r;
+ psd->cbm_end_bit = r + size_p;
+
+ for (bit = psd->cbm_start_bit; bit < psd->cbm_end_bit;
+ bit++) {
+ __set_bit(bit, layout->closmap);
+ }
+ return 0;
+}
+
+int alloc_contiguous_regions(struct tcr_entry *e, struct tcr_list *l)
+{
+ struct cache_layout *clayout;
+
+ list_for_each_entry(clayout, &layout_list, link) {
+ int size_p, r;
+
+ r = check_contiguous_region(e, l, clayout, &size_p);
+ if (r < 0)
+ return error;
+ r = alloc_contiguous_region(e, l, clayout);
+ if (r) {
+ WARN_ON(1);
+ }
+ }
+}
+
+int dealloc_contiguous_regions(struct tcr_entry *e, struct tcr_list *l)
+{
+ struct cache_layout *clayout;
+
+ list_for_each_entry(clayout, &layout_list, link) {
+ struct tcr_list_per_socket *psd = l->psd[clayout->id];
+ int bit;
+
+ for (bit = psd->cbm_start_bit; bit < psd->cbm_end_bit;
+ bit++) {
+ __clear_bit(bit, layout->closmap);
+ }
+ }
+}
+
+void kick_task(struct task_struct *tsk)
+{
+ set_tsk_need_resched(tsk);
+ kick_process(tsk);
+}
+
+/* When attach returns, any task attached to the tcrlist
+ * which has been modified must:
+ * Task Running) sync_to_msr.
+ * Task Not Running) nothing, as long as sync_to_msr is performed
+ * when its scheduled in.
+ */
+void kick_tasks(struct tcr_list *l)
+{
+ struct task_struct *tsk;
+
+ list_for_each_entry(tsk, &l->tasks, tcrlist_link) {
+ set_tsk_need_resched(tsk);
+ kick_process(tsk);
+ }
+}
+
+int attach_cache_reservation(struct pid_cat_reservation *pcr)
+{
+ struct pid *pid;
+ struct task_struct *task;
+ struct tcr_list *l, *undo;
+ struct tcr_entry *e;
+
+ e = find_tcr_entry(pcr->tcrid);
+ if (IS_ERR(e)) {
+ return PTR_ERR(e);
+ }
+
+ pid = find_get_pid(pcr);
+ if (!pid) {
+ return -ENOSYS;
+ }
+
+ task = get_pid_task(task);
+ if (!task) {
+ put_pid(pid;
+ return -EINVAL;
+ }
+
+ if (!task->tcrlist) {
+ u64 b = 1UL << e->tcrid;
+
+ l = find_tcrlist(&b);
+ if (l) {
+ link_tcrlist_to_task(task,l);
+ return 0;
+ }
+ l = alloc_tcrlist();
+ if (IS_ERR(l)) {
+ put_pid(pid);
+ put_task_struct(task);
+ return PTR_ERR(l);
+ }
+ undo = l;
+ } else {
+ l = task->tcrlist;
+ }
+
+ if (tcrentry_in_tcrlist(e, l))
+ return -EINVAL;
+
+ if (l->nr_tasks > 1) {
+ struct tcrlist_entry *lnew;
+ u64 b = l->tcrentry_bmap;
+
+ set_bit(e->tcrid, &b);
+
+ lnew = find_tcrlist(&b);
+ if (lnew) {
+ unlink_tcrlist_from_task(task, l);
+ link_tcrlist_to_task(task, lnew);
+ goto out;
+ }
+
+ lnew = alloc_tcrlist();
+ if (IS_ERR(lnew)) {
+ put_pid(pid);
+ put_task_struct(task);
+ return PTR_ERR(lnew);
+ }
+
+ if (alloc_contiguous_regions(e, lnew) == -ENOSPC) {
+ free_tcrlist(lnew);
+ return -ENOSPC;
+ }
+ for_each_set_bit(bit, &l->tcrentry_bmap, CBM_LEN) {
+ struct tcr_entry *et;
+
+ et = &tcrid_table[bit];
+ add_tcrentry_to_tcrlist(et, lnew);
+ }
+ unlink_tcrlist_from_task(task, l);
+ link_tcrlist_to_task(task, lnew);
+ l = lnew;
+ } else {
+ if (alloc_contiguous_regions(e, l) == -ENOSPC) {
+ if (undo)
+ free_tcrlist(undo);
+ return -ENOSPC;
+ }
+ }
+
+ add_tcrentry_to_tcrlist(e, l);
+ kick_tasks(l);
+out:
+ put_pid(pid);
+ put_task_struct(task);
+ return 0;
+}
+
+int detach_cache_reservation(struct pid_cat_reservation *pcr)
+{
+ struct pid *pid;
+ struct task_struct *task;
+ struct tcr_list *l, *undo;
+ struct tcr_entry *e;
+ int err;
+
+ e = find_tcr_entry(pcr->tcrid);
+ if (IS_ERR(e)) {
+ return PTR_ERR(e);
+ }
+
+ pid = find_get_pid(pcr);
+ if (!pid) {
+ return -ENOSYS;
+ }
+
+ task = get_pid_task(task);
+ if (!task) {
+ put_pid(pid);
+ return -EINVAL;
+ }
+
+ l = task->tcrlist;
+ if (!l) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (!tcrentry_in_tcrlist(e, l))
+ return -EINVAL;
+
+ if (l->nr_tasks > 1) {
+ struct tcrlist_entry *lnew;
+ u64 b = l->tcrentry_bmap;
+
+ clear_bit(e->tcrid, &b);
+
+ lnew = find_tcrlist(&b);
+ if (lnew) {
+ unlink_tcrlist_from_task(task, l);
+ link_tcrlist_to_task(task, lnew);
+ kick_task(task);
+ goto out;
+ }
+
+ lnew = alloc_tcrlist();
+ if (IS_ERR(lnew)) {
+ put_pid(pid);
+ put_task_struct(task);
+ return PTR_ERR(lnew);
+ }
+ for_each_set_bit(bit, &l->tcrentry_bmap, CBM_LEN) {
+ struct tcr_entry *et;
+
+ if (bit == e->tcrid)
+ continue;
+
+ et = &tcrid_table[bit];
+ add_tcrentry_to_tcrlist(et, lnew);
+ }
+ unlink_tcrlist_from_task(task, l);
+ link_tcrlist_to_task(task, lnew);
+ l = lnew;
+ kick_task(task);
+ } else {
+ remove_tcrentry_from_tcrlist(e, l);
+ }
+
+ err = 0;
+out:
+ put_pid(pid);
+ put_task_struct(task);
+ return err;
+}
+
+void sync_to_msr(struct task_struct *task, struct tcr_list *l,
+ unsigned int start, unsigned int end)
+{
+ u64 msr;
+ unsigned long bitmask = -1;
+ int len = end - start + 1;
+
+ bitmask = bitmask << (sizeof(unsigned long)*8 - len);
+ bitmask = bitmask >> (sizeof(unsigned long)*8 - end -1);
+
+ /* check and enforce cosidzero has [s,e] == 0 */
+ rdmsrl(CBM_FROM_INDEX(0), msr);
+ if (msr & bitmask)
+ wrmsrl(CBM_FROM_INDEX(0), msr & ~bitmask);
+
+ /* check and enforce this cosid has [s,e] == 1. */
+ rdmsrl(CBM_FROM_INDEX(l->id), msr);
+ if ((msr & bitmask) != bitmask)
+ wrmsrl(CBM_FROM_INDEX(l->id), msr | bitmask);
+
+ set_bit(this_socket, task->tcrlist->synced_to_socket);
+}
+
+void __intel_rdt_sched_in(void)
+{
+ struct task_struct *task = current;
+ unsigned int cpu = smp_processor_id();
+ unsigned int this_socket = topology_physical_package_id(cpu);
+ unsigned int start, end;
+ struct per_socket_data *psd = get_socket_data(this_socket);
+
+ /*
+ * The CBM bitmask for a particular task is enforced
+ * on sched-in to a given processor, and only for the
+ * range (cbm_start_bit,cbm_end_bit) which the
+ * tcr_list (COSid) owns.
+ * This way we allow COSid0 (global task pool) to use
+ * reserved L3 cache on sockets where the tasks that
+ * reserve the cache have not been scheduled.
+ *
+ * Since reading the MSRs is slow, it is necessary to
+ * cache the MSR CBM map on each socket.
+ *
+ */
+
+ if (task->tcrlist == NULL) {
+ wrmsrl(CBM_FROM_INDEX(0), psd->cosidzeromask);
+ }
+ else if (test_bit(this_socket,
+ task->tcrlist->synced_to_socket) == 0) {
+ spin_lock(&this_socket->msr_cbm_lock);
+ unsigned int start;
+ struct per_socket_data *psd = get_socket_data(this_socket);
+ struct cache_layout *layout = psd->layout;
+
+ start = task->tcrlist->psd[layout->id].cbm_start;
+ end = task->tcrlist->psd[layout->id].cbm_end;
+ sync_to_msr(task, tcrlist, start, end);
+ // barrier
+ spin_unlock(&this_socket->msr_cbm_lock);
+ }
+
+}
+
+static int get_reservations(struct cat_reservation_list *in,
+ unsigned long arg)
+{
+ int r, bit;
+ struct cat_reservation *cr;
+ void *res_user_ptr, *cpumask_user_ptr;
+ unsigned int copied_entries;
+ unsigned int x, coffset, uoffset;
+ size_t cpumasksz;
+
+ cpumasksz = cpumask_size()*bitmap_weight(&tcrid_used_bitmap, CBM_LEN);
+ cpumasksz = min_t(size_t, cpumasksz);
+
+ x = sizeof(*cr)*cpumasksz;
+ if (x > in->cat_res_size)
+ return -ENOSPC;
+ if (cpumasksz > in->cpumask_size)
+ return -ENOSPC;
+
+ cr = kzalloc(GFP_KERNEL, sizeof(*cr));
+ if (!cr)
+ return -ENOMEM;
+
+ res_user_ptr = in->list;
+ cpumask_user_ptr = in->mask;
+
+ in->cpumask_size = cpumasksz;
+ r = -EFAULT;
+ if (copy_to_user(argp, &in, sizeof(*in)))
+ goto out;
+
+ uoffset = coffset = copied_entries = 0;
+
+ for_each_set_bit(bit, &tcrid_used_bitmap, CBM_LEN) {
+ struct tcr_entry *e = &tcrid_table[bit];
+
+ cr->kbytes = e->rounded_kbytes;
+ cr->type = e->type;
+ cr->flags = 0;
+ cr->tcrid = tcrid;
+
+ if (copy_to_user(user_ptr + uoffset, &cr, sizeof(*cr))) {
+ r = -EFAULT;
+ goto out;
+ }
+ uoffset += sizeof(*cr);
+
+ if (copy_to_user(cpumask_user_ptr + coffset, e->cpumask, cpumasksz)) {
+ r = -EFAULT;
+ goto out;
+ }
+ coffset += cpumasksz;
+ copied_entries++;
+
+ memset(cr, 0, sizeof(*cr));
+ }
+
+ copied_entries = r;
+
+out:
+ kfree(cr);
+ return r;
+}
+
+static int basic_cr_checks(struct cat_reservation *cr)
+{
+ int r;
+
+ r = -EINVAL;
+ if (cr->type != CACHE_RSVT_TYPE_CODE &&
+ cr->type != CACHE_RSVT_TYPE_DATA &&
+ cr->type != CACHE_RSVT_TYPE_BOTH)
+ return r;
+
+ if (cr->flags != 0 && cr->flags != CACHE_RSVT_ROUND_DOWN)
+ return r;
+
+ r = 0;
+ return r;
+}
+
+static long intelcat_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ long r = -EINVAL;
+ switch (ioctl) {
+ case CAT_CREATE_RESERVATION:
+ struct cat_reservation_cpumask crmask;
+
+ r = -EFAULT;
+ if (copy_from_user(&crmask, argp, sizeof(crmask)))
+ goto out;
+
+ r = basic_cr_checks(&crmask.res);
+ if (r)
+ goto out;
+
+ r = create_cache_reservation(&crmask, arg);
+
+ break;
+ case CAT_DELETE_RESERVATION:
+ struct cat_tcrid tcrid;
+
+ r = -EFAULT;
+ if (copy_from_user(&tcrid, argp, sizeof(cr)))
+ goto out;
+
+ r = delete_cache_reservation(&tcrid);
+
+ break;
+ case CAT_ATTACH_RESERVATION:
+ struct pid_cat_reservation pcr;
+ r = -EFAULT;
+
+ if (copy_from_user(&pcr, argp, sizeof(pcr)))
+ goto out;
+ r = attach_cache_reservation(&pcr);
+ break;
+ case CAT_DETACH_RESERVATION:
+ struct pid_cat_reservation pcr;
+ r = -EFAULT;
+
+ if (copy_from_user(&pcr, argp, sizeof(pcr)))
+ goto out;
+ r = detach_cache_reservation(&pcr);
+ break;
+ case CAT_GET_RESERVATIONS:
+ struct cat_reservation_list *in;
+ r = -EFAULT;
+
+ if (copy_from_user(&pcr, argp, sizeof(pcr)))
+ goto out;
+
+ r = get_reservations(in, argp);
+ return r;
+ default:
+ break;
+ }
+
+out:
+ return r;
+}
+
+static struct file_operations intelcat_chardev_ops = {
+ .unlocked_ioctl = intelcat_ioctl,
+ .compat_ioctl = intelcat_ioctl,
+ .llseek = noop_llseek,
+};
+
+static struct miscdevice intel_cat_misc =
+{
+ INTEL_CAT_MINOR,
+ "intel_cat",
+ &intelcat_chardev_ops,
+};
+
+static int get_l3_cache_size(void)
+{
+ struct cpu_cacheinfo *cinfo;
+ struct cacheinfo *ci;
+
+ cinfo = get_cpu_cacheinfo(0);
+
+ if (cinfo && cinfo->num_levels >= 3) {
+ ci = cinfo->info_list[3];
+ l3_cache_size = ci->size;
+ return 0;
+ }
+ return -EINVAL;
+}
+
+static struct per_socket_data *get_socket_data(int socket)
+{
+ struct per_socket_data *data;
+
+ if (socket >= psd_size) {
+ BUG_ON();
+ return NULL;
+ }
+ return &psd[socket];
+}
+
+static int __init alloc_init_per_socket_data(void)
+{
+ psd = kzalloc(max_sockets * sizeof(struct per_socket_data));
+ if (!psd)
+ return -ENOMEM;
+ psd_size = max_sockets;
+ return 0;
+}
+
+static void percpu_init_hw_shared_zone(void)
+{
+ unsigned int cpu, this_socket;
+ struct cpuinfo_x86 *c;
+ uint32_t eax, ebx, ecx, edx;
+ struct per_socket_data *psd;
+ u32 size;
+
+ cpu = smp_processor_id();
+ this_socket = topology_physical_package_id(cpu);
+ psd = get_socket_data(this_socket);
+ c = &cpu_data(cpu);
+
+ cpuid_count(0x00000010, 1, &eax, &ebx, &ecx, &edx);
+ if (atomic_test_and_set(&psd->initialized))
+ return 0;
+ psd->hw_shared_bitmask = ebx;
+ // reserve 10% of cache ways for host
+ psd->reserved_for_host = c->x86_cache_max_cbm_len/10;
+ psd->reserved_for_host = max(psd->reserved_for_host,
+ bitmap_weight(&psd->hw_shared_bitmask));
+ psd->layout = find_create_layout(psd->hw_shared_bitmask);
+
+ size = BITS_TO_LONGS(c->x86_cache_max_cbm_len) * sizeof(unsigned long);
+ if (cdp_enabled)
+ size = 2*size;
+ psd->cosidzeromask = kzalloc(size, GFP_ATOMIC);
+ if (!closmap)
+ panic("%s allocation failed\n", __func__);
+
+ memset(psd->cosidzeromask, 1, size);
+}
+
+static int cat_cpu_notifier(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+
+ switch (action) {
+ case CPU_ONLINE:
+ percpu_init_hw_shared_zone();
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block cat_cpu_notifier_block = {
+ .notifier_call = cat_cpu_notifier,
+ .priority = -INT_MAX
+};
+
+static int init_hw_shared_zone(void)
+{
+ cpumask_t cpumask;
+ int cpu;
+ unsigned long *topology_bmap;
+ int size = BITS_TO_LONGS(max_sockets * NR_CPUS) * sizeof(long);
+
+ topology_bmap = kzalloc(size, GFP_KERNEL);
+ if (!topology_bmap)
+ return -ENOMEM;
+
+ cpumask_zero(&cpumask);
+
+ for_each_online_cpu(cpu) {
+ phys_id = topology_physical_package_id(cpu);
+ if (test_and_set_bit(phys_id, topology_bmap))
+ continue;
+ cpumask_set_cpu(cpu, &cpumask);
+ }
+
+ smp_call_function_many(&cpumask,
+ percpu_init_hw_shared_zone, 0, 1);
+
+ kfree(topology_bmap);
+
+ return 0;
+}
+
+
+static int __init intel_cat_mem_init(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ u32 maxid;
+
+ err = -ENOMEM;
+
+ max_cbm_len = c->x86_cache_max_cbm_len;
+ maxid = max_closid = c->x86_cache_max_closid;
+ //maxid = max_closid = c->x86_cache_max_closid;
+ size = BITS_TO_LONGS(maxid) * sizeof(long);
+ closmap = kzalloc(size, GFP_KERNEL);
+ if (!closmap)
+ goto err_out;
+
+ size = maxid * sizeof(struct tcr_list *);
+ id_to_tcrlist = kzalloc(size, GFP_KERNEL);
+ if (!id_to_tcrlist)
+ goto err_out;
+
+ err = alloc_tcrid_table();
+ if (err)
+ goto err_out;
+
+ err = get_l3_cache_size();
+ if (err)
+ goto err_out;
+
+ /* kbytes per cbm bit =
+ * L3 cache size in kbytes / capacity bitmask length.
+ */
+ kbytes_per_cbm_bit = (l3_cache_size >> 10) / max_cbm_len;
+
+ /* L3 cache size in kbytes / kbytes per cbm bit =
+ * cbm bits in L3 cache.
+ */
+ l3_nr_cbm_bits = (l3_cache_size >> 10) / kbytes_per_cbm_bit;
+
+ err = alloc_init_per_socket_data();
+ if (err)
+ goto err_out;
+
+ init_hw_shared_zone();
+
+ /* bit 0 is reserved for global task pool */
+ set_bit(0, &tcrlist_ids);
+
+ return 0;
+err_out:
+ kfree(id_to_tcrlist);
+ kfree(closmap);
+ return err;
+}
+
+static int __init intel_cat_init(void)
+{
+ int r;
+ int cpu;
+
+ preempt_disable();
+ cpu = smp_processor_id();
+ cpus_per_socket = cpumask_weight(topology_core_cpumask(cpu));
+ max_sockets = NR_CPUS/cpus_per_socket;
+ preempt_enable();
+
+ r = misc_register(&intel_cat_misc);
+ if (r) {
+ printk(KERN_ERR "intel_cat: misc_register error = %d\n",r);
+ return r;
+ }
+
+ r = intel_cat_mem_init();
+ if (r) {
+ misc_unregister(&intel_cat_misc);
+ }
+
+ cpu_notifier_register_begin();
+ __register_hotcpu_notifier(&cat_cpu_notifier_block);
+ cpu_notifier_register_done();
+
+ return r;
+}
+
diff --git a/arch/x86/kernel/cat/cache_reservation.h b/arch/x86/kernel/cat/cache_reservation.h
new file mode 100644
index 0000000..e8146a0
--- /dev/null
+++ b/arch/x86/kernel/cat/cache_reservation.h
@@ -0,0 +1,47 @@
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+
+struct tcr_entry {
+ unsigned int tcrid;
+
+ unsigned long *tcrlist_bmap;
+
+ u64 user_kbytes;
+ u64 rounded_kbytes;
+ unsigned int cbm_bits;
+
+ u32 type;
+
+ cpumask_var_t *cpumask;
+};
+
+#define CBM_LEN 64
+#define MAX_LAYOUTS 10
+
+struct tcr_list_per_socket {
+ int cbm_start_bit, cbm_end_bit;
+};
+
+struct tcr_list {
+ /* cache allocation */
+ struct tcr_list_per_socket psd[MAX_LAYOUTS];
+
+ /* bitmap indicating whether cap_bitmask is synced to a given socket */
+ unsigned long *synced_to_socket;
+
+ /* TCRlist id */
+ unsigned int id;
+
+ // One bit per tcrentry.
+ DECLARE_BITMAP(tcrentry_bmap, CBM_LEN);
+
+ // link in global tcrlist list
+ struct list_head global_link;
+ // list of tasks referencing this tcr_list
+ struct list_head tasks;
+ // nr of tasks referencing this tcr_list
+ unsigned int nr_tasks;
+};
+
On Fri, Nov 13, 2015 at 02:39:33PM -0200, Marcelo Tosatti wrote:
> + * * one tcrid entry can be in different locations
> + * in different sockets.
NAK on that without cpuset integration.
I do not want freely migratable tasks having radically different
performance profiles depending on which CPU they land.
On Fri, Nov 13, 2015 at 05:51:00PM +0100, Peter Zijlstra wrote:
> On Fri, Nov 13, 2015 at 02:39:33PM -0200, Marcelo Tosatti wrote:
> > + * * one tcrid entry can be in different locations
> > + * in different sockets.
>
> NAK on that without cpuset integration.
>
> I do not want freely migratable tasks having radically different
> performance profiles depending on which CPU they land.
Please expand on what "cpuset integration" means, operationally.
I hope it does not mean "i prefer cgroups as an interface",
because that does not mean much to me.
So you are saying this should be based on cgroups? Have you seen the
cgroups proposal and the issues with it, that have been posted?
On Fri, Nov 13, 2015 at 05:51:00PM +0100, Peter Zijlstra wrote:
> On Fri, Nov 13, 2015 at 02:39:33PM -0200, Marcelo Tosatti wrote:
> > + * * one tcrid entry can be in different locations
> > + * in different sockets.
>
> NAK on that without cpuset integration.
>
> I do not want freely migratable tasks having radically different
> performance profiles depending on which CPU they land.
Ok, so, configuration:
Socket-1 Socket-2
pinned thread-A with 100% L3 free
80% of L3
reserved
So it is a problem if a thread running on socket-2 is scheduled to
socket-1 because performance is radically different, fine.
Then one way to avoid that is to not allow freely migratable tasks
to move to Socket-1. Fine.
Then you want to use cpusets for that.
Can you fill in the blanks what is missing here?
On Fri, Nov 13, 2015 at 03:27:40PM -0200, Marcelo Tosatti wrote:
> On Fri, Nov 13, 2015 at 05:51:00PM +0100, Peter Zijlstra wrote:
> > On Fri, Nov 13, 2015 at 02:39:33PM -0200, Marcelo Tosatti wrote:
> > > + * * one tcrid entry can be in different locations
> > > + * in different sockets.
> >
> > NAK on that without cpuset integration.
> >
> > I do not want freely migratable tasks having radically different
> > performance profiles depending on which CPU they land.
>
> Please expand on what "cpuset integration" means, operationally.
> I hope it does not mean "i prefer cgroups as an interface",
> because that does not mean much to me.
>
> So you are saying this should be based on cgroups? Have you seen the
> cgroups proposal and the issues with it, that have been posted?
Subject: cat cgroup interface proposal (non hierarchical) was Re: [PATCH
V15 00/11] x86: Intel Cache
Allocation Technology Support
https://lkml.org/lkml/2015/11/2/700
On Fri, Nov 13, 2015 at 05:51:00PM +0100, Peter Zijlstra wrote:
> On Fri, Nov 13, 2015 at 02:39:33PM -0200, Marcelo Tosatti wrote:
> > + * * one tcrid entry can be in different locations
> > + * in different sockets.
>
> NAK on that without cpuset integration.
>
> I do not want freely migratable tasks having radically different
> performance profiles depending on which CPU they land.
OK, probably not cgroups interface (which can't be done unless
someone solves the issue of one task on multiple cgroups, or
explains why it is not an issue).
So you come with a cpuset configuration:
A cpuset defines a list of CPUs and memory nodes. The CPUs of
a system include all the logical processing units on which a process
can execute, including, if present, multiple processor cores within a
package and Hyper-Threads within a processor core. Memory nodes include
all distinct banks of main memory; small and SMP systems typi‐ cally
have just one memory node that contains all the system's main memory,
while NUMA (non-uniform memory access) systems have multiple memory
nodes.
----
Then for each task in the cpuset, you can configure via priorities
the percentage of time each task is allowed to run on the CPUs of the
cpuset.
You want something to automatically remove CPUs from cpusets
if a given amount of L3 cache is reserved?
Can do that in userspace, no problem.
On Fri, 13 Nov 2015 15:27:40 -0200
Marcelo Tosatti <[email protected]> wrote:
> On Fri, Nov 13, 2015 at 05:51:00PM +0100, Peter Zijlstra wrote:
> > On Fri, Nov 13, 2015 at 02:39:33PM -0200, Marcelo Tosatti wrote:
> > > + * * one tcrid entry can be in different locations
> > > + * in different sockets.
> >
> > NAK on that without cpuset integration.
> >
> > I do not want freely migratable tasks having radically different
> > performance profiles depending on which CPU they land.
>
> Please expand on what "cpuset integration" means, operationally.
> I hope it does not mean "i prefer cgroups as an interface",
> because that does not mean much to me.
I guess that what Peter is saying is that we don't want tasks
attached to a reservation landing on a CPU where the reservation
might be different or not existent at all.
Peter, what about integrating this with affinity masks instead
of cpusets (I have no idea how cpusets are implemented, but I
guess they are a superset of affinity masks).
This way, the ATTACH_RESERVATION command would fail if any
of the CPUs in the cpumask are not part of the reservation.
And then our code would have to be notified any time the process'
affinity mask is changed (we either fail the affinity change
or detach the process automatically from the reservation). Does
this sound like a good solution?
>
> So you are saying this should be based on cgroups? Have you seen the
> cgroups proposal and the issues with it, that have been posted?
>
>
On Fri, 13 Nov 2015 14:39:33 -0200
Marcelo Tosatti <[email protected]> wrote:
> Attached is an early version of the ioctl based CAT interface we
> have been working on.
We're also writing a user-space app that can manage reservations
from user-space. It's integrated with util-linux. I'll setup a repo
shortly to share the source code, but here are some examples:
Create a global 2MB reservation:
# cacheset --create 2M
Create a 2MB reservation on CPUs 0 and 1:
# cacheset --create 2M --cpu-list 0,1
# cacheset --create 2M 0x3
Attach pid 4901 to reservation ID 1
# cacheset --attach 4901 1
Detach pid 4901 to reservation ID 1
# cacheset --detach 4901 1
Delete reservation ID 1
# cacheset --delete 1
On Fri, Nov 13, 2015 at 02:04:38PM -0500, Luiz Capitulino wrote:
> On Fri, 13 Nov 2015 15:27:40 -0200
> Marcelo Tosatti <[email protected]> wrote:
>
> > On Fri, Nov 13, 2015 at 05:51:00PM +0100, Peter Zijlstra wrote:
> > > On Fri, Nov 13, 2015 at 02:39:33PM -0200, Marcelo Tosatti wrote:
> > > > + * * one tcrid entry can be in different locations
> > > > + * in different sockets.
> > >
> > > NAK on that without cpuset integration.
> > >
> > > I do not want freely migratable tasks having radically different
> > > performance profiles depending on which CPU they land.
> >
> > Please expand on what "cpuset integration" means, operationally.
> > I hope it does not mean "i prefer cgroups as an interface",
> > because that does not mean much to me.
>
> I guess that what Peter is saying is that we don't want tasks
> attached to a reservation landing on a CPU where the reservation
> might be different or not existent at all.
Then you pin the tasks to not land on those CPUs, explicitly.
If you create a reservation on the CPU, then there is a reservation
there. If you don't, then there will be no reservation on that CPU.
The amount of L3 cache you reserve is the same, on each CPU, just the
location of L3 cache is different, but the quantity is the same.
(this in case the region shared with HW is different). If the region
shared with HW is the same on each socket, then the location (as
specified in the capacity bitmask) is the same on every socket. HW
limitation.
> Peter, what about integrating this with affinity masks instead
> of cpusets (I have no idea how cpusets are implemented, but I
> guess they are a superset of affinity masks).
>
> This way, the ATTACH_RESERVATION command would fail if any
> of the CPUs in the cpumask are not part of the reservation.
> And then our code would have to be notified any time the process'
> affinity mask is changed (we either fail the affinity change
> or detach the process automatically from the reservation). Does
> this sound like a good solution?
Yes, perhaps notifications to cpusets is what Peter refers to?
On Fri, Nov 13, 2015 at 03:43:02PM -0200, Marcelo Tosatti wrote:
> Subject: cat cgroup interface proposal (non hierarchical) was Re: [PATCH
> V15 00/11] x86: Intel Cache
> Allocation Technology Support
>
>
> https://lkml.org/lkml/2015/11/2/700
I've really no idea what you're trying to say there. That just doesn't
parse.
On Fri, Nov 13, 2015 at 02:04:38PM -0500, Luiz Capitulino wrote:
> I guess that what Peter is saying is that we don't want tasks
> attached to a reservation landing on a CPU where the reservation
> might be different or not existent at all.
Correct.
> This way, the ATTACH_RESERVATION command would fail if any
> of the CPUs in the cpumask are not part of the reservation.
> And then our code would have to be notified any time the process'
> affinity mask is changed (we either fail the affinity change
> or detach the process automatically from the reservation). Does
> this sound like a good solution?
No. We're not going to have random drivers muck about with affinity
masks, and most certainly not some manky ioctl.
On Fri, Nov 13, 2015 at 03:33:04PM -0200, Marcelo Tosatti wrote:
> On Fri, Nov 13, 2015 at 05:51:00PM +0100, Peter Zijlstra wrote:
> > On Fri, Nov 13, 2015 at 02:39:33PM -0200, Marcelo Tosatti wrote:
> > > + * * one tcrid entry can be in different locations
> > > + * in different sockets.
> >
> > NAK on that without cpuset integration.
> >
> > I do not want freely migratable tasks having radically different
> > performance profiles depending on which CPU they land.
>
> Ok, so, configuration:
>
>
> Socket-1 Socket-2
>
> pinned thread-A with 100% L3 free
> 80% of L3
> reserved
>
>
> So it is a problem if a thread running on socket-2 is scheduled to
> socket-1 because performance is radically different, fine.
>
> Then one way to avoid that is to not allow freely migratable tasks
> to move to Socket-1. Fine.
>
> Then you want to use cpusets for that.
>
> Can you fill in the blanks what is missing here?
I'm still not seeing what the problem with CAT-cgroup is.
/cgroups/cpuset/
socket-1/cpus = $socket-1
tasks = $thread-A
socket-2/cpus = $socket-2
tasks = $thread-B
/cgroups/cat/
group-A/bitmap = 0x3F / 0xFF
group-A/tasks = $thread-A
group-B/bitmap = 0xFF / 0xFF
group-B/tasks = $thread-B
That gets you thread-A on socket-1 with 6/8 of the L3 and thread-B on
socket-2 with 8/8 of the L3.
On Fri, Nov 13, 2015 at 04:01:18PM -0200, Marcelo Tosatti wrote:
> OK, probably not cgroups interface (which can't be done unless
> someone solves the issue of one task on multiple cgroups, or
> explains why it is not an issue).
A task can be part of every controller once, but there is no reason it
needs to be the 'same' cgroup for each controller.
That's how cgroups work, always have.
Why would that be a problem? Just don't co-mount them.
On Mon, Nov 16, 2015 at 09:59:26AM +0100, Peter Zijlstra wrote:
> On Fri, Nov 13, 2015 at 03:43:02PM -0200, Marcelo Tosatti wrote:
> > Subject: cat cgroup interface proposal (non hierarchical) was Re: [PATCH
> > V15 00/11] x86: Intel Cache
> > Allocation Technology Support
> >
> >
> > https://lkml.org/lkml/2015/11/2/700
>
> I've really no idea what you're trying to say there. That just doesn't
> parse.
I've posted this internally at people understood and commented on it.
What part you don't understand?
Its a proposal for a cgroups based interface. The description
starts from the main mount point of the cgroup. So you would:
# mount -t cgroup -ointel_cat intel_cat /sys/fs/cgroup/cat/
# cd /sys/fs/cgroup/cat/
# ls
cat_hw_info
# mkdir cache_reservation_for_forwarding_app
# ls
cat_hw_info cache_reservation_for_forwarding_app
# cd cache_reservation_for_forwarding_app
# ls
type size socketmask enable_reservation
# ...
Yes?
cgroup CAT interface (non hierarchical):
---------------------------------------
0) Main directory files:
cat_hw_info
-----------
CAT HW information: CBM length, CDP supported, etc.
Information reported per-socket, as sockets can have
different configurations. Perhaps should be inside
sysfs.
1) Sub-directories represent cache reservations (size,type).
mkdir cache_reservation_for_forwarding_app
cd cache_reservation_for_forwarding_app
echo "80KB" > size
echo "data_and_code" > type
echo "socketmask=0xfffff" > socketmask (optional)
echo "1" > enable_reservation
echo "pid-of-forwarding-main-thread pid-of-helper-thread ..." > tasks
Files:
type
----------------
{data_and_code, only_code, only_data}. Type of
L3 CAT cache allocation to use. only_code,only_data only
supported on CDP capable processors.
size
----
size of L3 cache reservation.
rounding
--------
{round_down,round_up} whether to round up / round down
allocation size in kbytes, to cache-way size.
Default: round_up
socketmask
----------
Mask of sockets where the reservation is in effect.
A zero bit means the task will not have the L3 cache
portion that the cgroup references reserved on that socket.
Default: all sockets set.
enable
------
Allocate reservation with parameters set above.
When a reservation is enabled, it reserves L3 cache
space on any socket thats specified in "socketmask".
After cgroup has been enabled by a write of "1" to
"enable_reservation" file, only the "tasks" file can be modified.
To change the size of a cgroup reservation, recreate the directory.
tasks
-----
Contains the list of tasks which use this cache reservation.
Error reporting
---------------
Errors are reported in response to write as appropriate:
for example, write 1 > enable when there is not enough space
for "socketmask" would return -ENOSPC, etc.
Write to "enable" without size being set would return -EINVAL, etc.
Listing
-------
To list which reservations are in place, search for subdirectories
where "enabled" file has value 1.
Semantics: A task has guaranteed cache reservation on any CPU where its
scheduled in, for the lifetime of the cgroup, as long as that task is
not attached to further cgroups.
That is, a task belonging to cgroup-A can have its cache reservation
invalidated when attached to cgroup-B, (reasoning: it might be necessary
to reallocate the CBMs to respect contiguous bits in cache, a
restriction of the CAT HW interface).
-------
BLOCKER
-------
Can't use cgroups for CAT because:
"Control Groups extends the kernel as follows:
- Each task in the system has a reference-counted pointer to a
css_set.
- A css_set contains a set of reference-counted pointers to
cgroup_subsys_state objects, one for each cgroup subsystem
registered in the system."
You need a task to be part of two cgroups at one time,
to support the following configuration:
Task-A: 70% of cache reserved exclusively (reservation-0).
20% of cache reserved (reservation-1).
Task-B: 20% of cache reserved (reservation-1).
Unless reservations are created separately, then added to cgroups:
mount -t cgroup ... /../catcgroup/
cd /../catcgroup/
# create reservations
cd reservations
mkdir reservation-1
echo "80K" > size
echo "socketmask" > ...
echo "1" > enable
mkdir reservation-2
echo "160K" > size
echo "socketmask" > ...
echo "1" > enable
# attach reservation to cgroups
cd /../catcgroup/
mkdir cgroup-for-threaded-app
echo reservation-1 reservation-2 > reservations
echo "mainthread" > tasks
cd ..
mkdir cgroup-for-helper-thread
echo reservation-1 > reservations
echo "helperthread" > tasks
cd ..
This way mainthread and helperthread can share "reservation-1".
On Mon, Nov 16, 2015 at 10:07:56AM +0100, Peter Zijlstra wrote:
> On Fri, Nov 13, 2015 at 03:33:04PM -0200, Marcelo Tosatti wrote:
> > On Fri, Nov 13, 2015 at 05:51:00PM +0100, Peter Zijlstra wrote:
> > > On Fri, Nov 13, 2015 at 02:39:33PM -0200, Marcelo Tosatti wrote:
> > > > + * * one tcrid entry can be in different locations
> > > > + * in different sockets.
> > >
> > > NAK on that without cpuset integration.
> > >
> > > I do not want freely migratable tasks having radically different
> > > performance profiles depending on which CPU they land.
> >
> > Ok, so, configuration:
> >
> >
> > Socket-1 Socket-2
> >
> > pinned thread-A with 100% L3 free
> > 80% of L3
> > reserved
> >
> >
> > So it is a problem if a thread running on socket-2 is scheduled to
> > socket-1 because performance is radically different, fine.
> >
> > Then one way to avoid that is to not allow freely migratable tasks
> > to move to Socket-1. Fine.
> >
> > Then you want to use cpusets for that.
> >
> > Can you fill in the blanks what is missing here?
>
> I'm still not seeing what the problem with CAT-cgroup is.
>
> /cgroups/cpuset/
> socket-1/cpus = $socket-1
> tasks = $thread-A
>
> socket-2/cpus = $socket-2
> tasks = $thread-B
>
> /cgroups/cat/
> group-A/bitmap = 0x3F / 0xFF
> group-A/tasks = $thread-A
>
> group-B/bitmap = 0xFF / 0xFF
> group-B/tasks = $thread-B
>
>
> That gets you thread-A on socket-1 with 6/8 of the L3 and thread-B on
> socket-2 with 8/8 of the L3.
- need bitmasks per socket (optionally).
- format kept in kernel is not universal (have to convert every time
L3 cache size changes).
- need to specify type (i-cache or d-cache, differentiation supported on newer processors),
ok can add more bitmaps.
- position in bitmask represents nothing other than identification of
reservation and size, so:
group-A = 0x3F, group-B = 0xFF
is the same as
group-A = 0xFC, group-B = 0xFF
- have to locate a free region every time in the bitmasks.
So userspace has to do:
# lock write access to /cgroups/cat/
create group-C, taking into account bitmasks of
group-A and group-B.
# unlock write access to /cgroups/cat.
But OK, it works, lets use that.
On Mon, 16 Nov 2015, Marcelo Tosatti wrote:
> You need a task to be part of two cgroups at one time,
> to support the following configuration:
>
> Task-A: 70% of cache reserved exclusively (reservation-0).
> 20% of cache reserved (reservation-1).
>
> Task-B: 20% of cache reserved (reservation-1).
Why would such a configuration be desired?
Thanks,
tglx
On Mon, Nov 16, 2015 at 11:03:57AM -0200, Marcelo Tosatti wrote:
> cgroup CAT interface (non hierarchical):
That's a fail right there. Cgroup thingies must be hierarchical.
On Mon, Nov 16, 2015 at 12:37:08PM -0200, Marcelo Tosatti wrote:
> - position in bitmask represents nothing other than identification of
> reservation and size, so:
>
> group-A = 0x3F, group-B = 0xFF
> is the same as
> group-A = 0xFC, group-B = 0xFF
No, the position very much matters; maybe not in this example, but it
does the moment you get overlapping bitmaps.
Picking which bits overlap determines which other groups are affected.
This is why a bitmap is more expressive than a single percentage.
On Mon, 16 Nov 2015 10:07:56 +0100
Peter Zijlstra <[email protected]> wrote:
> I'm still not seeing what the problem with CAT-cgroup is.
>
> /cgroups/cpuset/
> socket-1/cpus = $socket-1
> tasks = $thread-A
>
> socket-2/cpus = $socket-2
> tasks = $thread-B
>
> /cgroups/cat/
> group-A/bitmap = 0x3F / 0xFF
> group-A/tasks = $thread-A
>
> group-B/bitmap = 0xFF / 0xFF
> group-B/tasks = $thread-B
>
>
> That gets you thread-A on socket-1 with 6/8 of the L3 and thread-B on
> socket-2 with 8/8 of the L3.
Peter, I'm giving a serious try on the cgroups patches and would be
glad to be enlightened if I'm missing something. But I don't see how
what you're proposing would solve the problem.
My understanding of CAT is that if I want to reserve 80% of the cache
in socket-1 to $thread-A I also have to:
1. Create another mask reserving 20% of the cache in socket-1
2. Assign that mask to all other threads that may run in socket-1
If I'm right about this, then when a task with 20% reservation migrates
to socket-2 it will only access 20% of the cache there even though there
should be no restrictions in socket-2's cache.
If the solution you're proposing means that I should assign all other
tasks to $thread-B, then what you're actually doing is pinning all
tasks but $thread-A to socket-2. You can do this today without CAT.
On Mon, Nov 16, 2015 at 11:18:42AM -0500, Luiz Capitulino wrote:
> Peter, I'm giving a serious try on the cgroups patches and would be
> glad to be enlightened if I'm missing something. But I don't see how
> what you're proposing would solve the problem.
>
> My understanding of CAT is that if I want to reserve 80% of the cache
> in socket-1 to $thread-A I also have to:
>
> 1. Create another mask reserving 20% of the cache in socket-1
> 2. Assign that mask to all other threads that may run in socket-1
>
> If I'm right about this, then when a task with 20% reservation migrates
> to socket-2 it will only access 20% of the cache there even though there
> should be no restrictions in socket-2's cache.
Uh what? Task-A was bound to socket-1, it will never get to socket-2.
Clearly I'm not getting these examples you're throwing around.
Also, I explicitly do not want tasks that can migrate between sockets to
have different performance profiles across those sockets.
On Mon, Nov 16, 2015 at 10:07:56AM +0100, Peter Zijlstra wrote:
> On Fri, Nov 13, 2015 at 03:33:04PM -0200, Marcelo Tosatti wrote:
> > On Fri, Nov 13, 2015 at 05:51:00PM +0100, Peter Zijlstra wrote:
> > > On Fri, Nov 13, 2015 at 02:39:33PM -0200, Marcelo Tosatti wrote:
> > > > + * * one tcrid entry can be in different locations
> > > > + * in different sockets.
> > >
> > > NAK on that without cpuset integration.
> > >
> > > I do not want freely migratable tasks having radically different
> > > performance profiles depending on which CPU they land.
> >
> > Ok, so, configuration:
> >
> >
> > Socket-1 Socket-2
> >
> > pinned thread-A with 100% L3 free
> > 80% of L3
> > reserved
> >
> >
> > So it is a problem if a thread running on socket-2 is scheduled to
> > socket-1 because performance is radically different, fine.
> >
> > Then one way to avoid that is to not allow freely migratable tasks
> > to move to Socket-1. Fine.
> >
> > Then you want to use cpusets for that.
> >
> > Can you fill in the blanks what is missing here?
>
> I'm still not seeing what the problem with CAT-cgroup is.
>
> /cgroups/cpuset/
> socket-1/cpus = $socket-1
> tasks = $thread-A
>
> socket-2/cpus = $socket-2
> tasks = $thread-B
>
> /cgroups/cat/
> group-A/bitmap = 0x3F / 0xFF
> group-A/tasks = $thread-A
>
> group-B/bitmap = 0xFF / 0xFF
> group-B/tasks = $thread-B
>
>
> That gets you thread-A on socket-1 with 6/8 of the L3 and thread-B on
> socket-2 with 8/8 of the L3.
Going that route, might as well expose the region shared with HW
to userspace and let userspace handle the problem of contiguous free regions,
which means the cgroups bitmask maps one-to-one to HW bitmap.
All is necessary then is to modify the Intel patches to
1) Support bitmaps per socket.
2) Remove hierarchical support.
3) Lazy enforcement (which can be done later as an improvement).
On Mon, 16 Nov 2015 17:26:57 +0100
Peter Zijlstra <[email protected]> wrote:
> On Mon, Nov 16, 2015 at 11:18:42AM -0500, Luiz Capitulino wrote:
> > Peter, I'm giving a serious try on the cgroups patches and would be
> > glad to be enlightened if I'm missing something. But I don't see how
> > what you're proposing would solve the problem.
> >
> > My understanding of CAT is that if I want to reserve 80% of the cache
> > in socket-1 to $thread-A I also have to:
> >
> > 1. Create another mask reserving 20% of the cache in socket-1
> > 2. Assign that mask to all other threads that may run in socket-1
> >
> > If I'm right about this, then when a task with 20% reservation migrates
> > to socket-2 it will only access 20% of the cache there even though there
> > should be no restrictions in socket-2's cache.
>
> Uh what? Task-A was bound to socket-1, it will never get to socket-2.
Sure, but you're going to allow other threads besides Task-A to execute
in socket-1 too, right? In this case, my understanding of CAT is that
those threads will require a mask to restrict them to only 20% of the
cache.
> Clearly I'm not getting these examples you're throwing around.
It's the "Isolated Bitmask" example from the Intel Manual p. 616
(You'll find the excerpt below, but Figure 17-27 is easier to understand).
My understanding of this example is that in order to isolate a portion
of the cache to Task-A I also have to create a second mask which excludes
bits reserved to Task-A. This second mask is assigned to all other tasks
that will share the L3 cache with Task-A.
"""
Figure 17-27 also shows three examples of sets of Cache Capacity Bitmasks. For simplicity these are represented
as 8-bit vectors, though this may vary depending on the implementation and how the mask is mapped to the avail-
able cache capacity. The first example shows the default case where all 4 Classes of Service (the total number of
COS are implementation-dependent) have full access to the cache. The second case shows an overlapped case,
which would allow some lower-priority threads share cache space with the highest priority threads. The third case
shows various non-overlapped partitioning schemes. As a matter of software policy for extensibility COS0 should
typically be considered and configured as the highest priority COS, followed by COS1, and so on, though there is
no hardware restriction enforcing this mapping. When the system boots all threads are initialized to COS0, which
has full access to the cache by default.
"""
> Also, I explicitly do not want tasks that can migrate between sockets to
> have different performance profiles across those sockets.
I think we can solve this problem with the ioctl interface, if that's
what you mean.
On Mon, Nov 16, 2015 at 03:42:08PM +0100, Thomas Gleixner wrote:
> On Mon, 16 Nov 2015, Marcelo Tosatti wrote:
> > You need a task to be part of two cgroups at one time,
> > to support the following configuration:
> >
> > Task-A: 70% of cache reserved exclusively (reservation-0).
> > 20% of cache reserved (reservation-1).
> >
> > Task-B: 20% of cache reserved (reservation-1).
>
> Why would such a configuration be desired?
>
> Thanks,
>
> tglx
The HW supports cache sharing, its a HW feature.
Makes sense to share reserved cache between threads of an application
that for example share a library.
On Mon, Nov 16, 2015 at 04:01:33PM +0100, Peter Zijlstra wrote:
> On Mon, Nov 16, 2015 at 11:03:57AM -0200, Marcelo Tosatti wrote:
> > cgroup CAT interface (non hierarchical):
>
> That's a fail right there. Cgroup thingies must be hierarchical.
Why?
On Mon, Nov 16, 2015 at 04:01:33PM +0100, Peter Zijlstra wrote:
> On Mon, Nov 16, 2015 at 11:03:57AM -0200, Marcelo Tosatti wrote:
> > cgroup CAT interface (non hierarchical):
>
> That's a fail right there. Cgroup thingies must be hierarchical.
What is the problem with ioctls again? (haven't seen any technical
argument).
If the ioctls were replaced by say, syscalls you would be happy?
If the ioctls were replaced by attach/detach/delete/unassign commands
via directories, would you be happy?
On Mon, Nov 16, 2015 at 02:39:03PM -0200, Marcelo Tosatti wrote:
> On Mon, Nov 16, 2015 at 10:07:56AM +0100, Peter Zijlstra wrote:
> > On Fri, Nov 13, 2015 at 03:33:04PM -0200, Marcelo Tosatti wrote:
> > > On Fri, Nov 13, 2015 at 05:51:00PM +0100, Peter Zijlstra wrote:
> > > > On Fri, Nov 13, 2015 at 02:39:33PM -0200, Marcelo Tosatti wrote:
> > > > > + * * one tcrid entry can be in different locations
> > > > > + * in different sockets.
> > > >
> > > > NAK on that without cpuset integration.
> > > >
> > > > I do not want freely migratable tasks having radically different
> > > > performance profiles depending on which CPU they land.
> > >
> > > Ok, so, configuration:
> > >
> > >
> > > Socket-1 Socket-2
> > >
> > > pinned thread-A with 100% L3 free
> > > 80% of L3
> > > reserved
> > >
> > >
> > > So it is a problem if a thread running on socket-2 is scheduled to
> > > socket-1 because performance is radically different, fine.
> > >
> > > Then one way to avoid that is to not allow freely migratable tasks
> > > to move to Socket-1. Fine.
> > >
> > > Then you want to use cpusets for that.
> > >
> > > Can you fill in the blanks what is missing here?
> >
> > I'm still not seeing what the problem with CAT-cgroup is.
> >
> > /cgroups/cpuset/
> > socket-1/cpus = $socket-1
> > tasks = $thread-A
> >
> > socket-2/cpus = $socket-2
> > tasks = $thread-B
> >
> > /cgroups/cat/
> > group-A/bitmap = 0x3F / 0xFF
> > group-A/tasks = $thread-A
> >
> > group-B/bitmap = 0xFF / 0xFF
> > group-B/tasks = $thread-B
> >
> >
> > That gets you thread-A on socket-1 with 6/8 of the L3 and thread-B on
> > socket-2 with 8/8 of the L3.
>
> Going that route, might as well expose the region shared with HW
> to userspace and let userspace handle the problem of contiguous free regions,
> which means the cgroups bitmask maps one-to-one to HW bitmap.
>
> All is necessary then is to modify the Intel patches to
>
> 1) Support bitmaps per socket.
Consider the following scenario, one server with two sockets:
socket-1 socket-2
[ [***] ] [ [***] ]
L3 cache bitmap L3 cache bitmap
[*] refers to the region shared with HW, as reported by CPUID (read the
Intel documentation).
socket-1.shared_region_with_hw = [bit 2, bit 5]
socket-2.shared_region_with_hw = [bit 16, bit 18]
Given that your application is critical, you do not want it to share any
reservation with HW. I was informed that there is no guarantee these
regions end up in the same location for different sockets. Lets say you
need 15 bits of reservation, and the total is 20 bits. One possibility would be:
socket-1.reservation = [bit 5, bit 15]
socket-2.reservation = [bit 1, bit 15]
For the current Intel CAT patchset, this restriction exists:
static int cbm_validate_rdt_cgroup(struct intel_rdt *ir, unsigned long
cbmvalue)
{
struct cgroup_subsys_state *css;
struct intel_rdt *par, *c;
unsigned long cbm_tmp = 0;
int err = 0;
if (!cbm_validate(cbmvalue)) {
err = -EINVAL;
goto out_err;
}
par = parent_rdt(ir);
clos_cbm_table_read(par->closid, &cbm_tmp);
if (!bitmap_subset(&cbmvalue, &cbm_tmp, MAX_CBM_LENGTH)) {
err = -EINVAL;
goto out_err;
}
Do you (or the author of the patch), can explain why is this
restriction here?
If the restriction has to be maintained, than one
hierarchy per-socket will be necessary to support different
bitmaps per socket.
If the restriction can be removed, then non hierarchical support
could look like:
/cgroups/cat/group-A/tasks = $thread-A
/cgroups/cat/group-A/socket-1/bitmap = 0x3F / 0xFF
/cgroups/cat/group-A/socket-2/bitmap = 0x... / 0xFF
Or one l3_cbm file containing one mask per socket,
separated by commas, similar to
/sys/devices/system/node/node0/cpumap
> 2) Remove hierarchical support.
There is nothing hierarchical in CAT, its flat.
Each set of tasks is associated with a number of bits
in each socket's L3 CBM mask.
> 3) Lazy enforcement (which can be done later as an improvement).
>
Hi!
>
> Attached is an early version of the ioctl based CAT interface we
> have been working on.
>
> NOTE: it does not compile, there is no locking, but should
> be sufficient for interested people to comment.
>
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index db3622f..293726b 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -757,6 +757,14 @@ config HPET_EMULATE_RTC
> def_bool y
> depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
>
> +config CACHE_RESERVATION
> + tristate "Cache Reservation Support"
> + default n
> + ---help---
> + This feature makes use of Intel's Cache Allocation Technology to allow the
> + reservation of portions of the L3 cache to specific tasks. Please, see
> + Documentation/x86/cache-reservation.txt for more
Including the Documentation/x86/cache-reservation.txt file in the
patch would be nice ;-).
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html