Date: Fri, 13 Nov 2015 14:39:33 -0200
From: Marcelo Tosatti
To: Luiz Capitulino
Cc: Peter Zijlstra, Thomas Gleixner, Vikas Shivappa, Tejun Heo,
    Yu Fenghua, linux-kernel@vger.kernel.org
Subject: [PATCH RFC] ioctl based CAT interface
Message-ID: <20151113163933.GA10222@amt.cnet>

Attached is an early version of the ioctl-based CAT interface we have
been working on.

NOTE: it does not compile and there is no locking yet, but it should be
sufficient for interested people to comment on. An illustrative
userspace sketch follows the patch.

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index db3622f..293726b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -757,6 +757,14 @@ config HPET_EMULATE_RTC
 	def_bool y
 	depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)

+config CACHE_RESERVATION
+	tristate "Cache Reservation Support"
+	default n
+	---help---
+	  This feature makes use of Intel's Cache Allocation Technology to
+	  allow the reservation of portions of the L3 cache to specific
+	  tasks. Please see Documentation/x86/cache-reservation.txt for
+	  more information.
+
 config APB_TIMER
 	def_bool y if X86_INTEL_MID
 	prompt "Intel MID APB Timer Support" if X86_INTEL_MID
diff --git a/arch/x86/include/uapi/asm/cache_reservation.h b/arch/x86/include/uapi/asm/cache_reservation.h
new file mode 100644
index 0000000..c4dcc95
--- /dev/null
+++ b/arch/x86/include/uapi/asm/cache_reservation.h
@@ -0,0 +1,64 @@
+#include <linux/types.h>
+
+enum cache_rsvt_flags {
+	CACHE_RSVT_ROUND_DOWN	= (1 << 0),	/* round kbytes down */
+};
+
+enum cache_rsvt_type {
+	CACHE_RSVT_TYPE_CODE = 0,	/* cache reservation is for code */
+	CACHE_RSVT_TYPE_DATA,		/* cache reservation is for data */
+	CACHE_RSVT_TYPE_BOTH,		/* cache reservation is for both */
+};
+
+struct cat_reservation {
+	__u64 kbytes;
+	__u32 type;
+	__u32 flags;
+	__u32 tcrid;
+	__u32 pad[11];
+};
+
+struct cat_reservation_cpumask {
+	size_t cpusetsize;
+	cpu_set_t *mask;
+	struct cat_reservation res;
+};
+
+struct pid_cat_reservation {
+	__u32 tcrid;
+	__s32 pid;
+	__u32 pad[8];
+};
+
+struct cat_tcrid {
+	__u32 tcrid;
+	__u32 pad[7];
+};
+
+struct cat_reservation_list {
+	/* -- input -- */
+	struct cat_reservation *res;
+	/* how many bytes allocated for list */
+	size_t cat_res_size;
+	cpu_set_t *mask;
+	/* how many bytes allocated for mask */
+	size_t cpusetsize;
+
+	/* -- output -- */
+	/* size of each cpu_set_t entry copied to
+	 * cpu_set_t *mask
+	 */
+	size_t cpumask_size;
+	__u32 pad[11];
+};
+
+struct cat_tcrid_tasks {
+	__u32 tcrid;
+	size_t nr_entries;
+	pid_t *list;
+};
+
+#define CAT_CREATE_RESERVATION	_IOW(CATIO, 0x00, struct cat_reservation_cpumask)
+#define CAT_DELETE_RESERVATION	_IOR(CATIO, 0x01, struct cat_tcrid)
+#define CAT_ATTACH_RESERVATION	_IOW(CATIO, 0x02, struct pid_cat_reservation)
+#define CAT_DETACH_RESERVATION	_IOW(CATIO, 0x03, struct pid_cat_reservation)
+#define CAT_GET_RESERVATIONS	_IOW(CATIO, 0x04, struct cat_reservation_list)
+#define CAT_GET_TCRID_TASKS	_IOW(CATIO, 0x05, struct cat_tcrid_tasks)
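+
+/*
+ * Typical call sequence (see the userspace sketch at the end of this
+ * mail):
+ *
+ *	ioctl(fd, CAT_CREATE_RESERVATION, &crm);
+ *		- on success, crm.res.kbytes holds the size rounded to
+ *		  the per-CBM-bit granularity (rounded up unless
+ *		  CACHE_RSVT_ROUND_DOWN is set) and crm.res.tcrid the
+ *		  reservation id;
+ *	ioctl(fd, CAT_ATTACH_RESERVATION, &pcr);
+ *		- binds the tcrid in pcr to the task pcr.pid;
+ *	ioctl(fd, CAT_DELETE_RESERVATION, &tcrid);
+ *		- detaches the reservation from all tasks and frees it.
+ */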
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index b1b78ff..57129d6 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -110,6 +110,8 @@ obj-$(CONFIG_EFI)			+= sysfb_efi.o
 obj-$(CONFIG_PERF_EVENTS)		+= perf_regs.o
 obj-$(CONFIG_TRACING)			+= tracepoint.o

+obj-$(CONFIG_CACHE_RESERVATION)		+= cat/
+
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/cat/Makefile b/arch/x86/kernel/cat/Makefile
new file mode 100644
index 0000000..031fd64
--- /dev/null
+++ b/arch/x86/kernel/cat/Makefile
@@ -0,0 +1 @@
+obj-y += cache_reservation.o
diff --git a/arch/x86/kernel/cat/cache_reservation.c b/arch/x86/kernel/cat/cache_reservation.c
new file mode 100644
index 0000000..4187a57
--- /dev/null
+++ b/arch/x86/kernel/cat/cache_reservation.c
@@ -0,0 +1,1244 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/bitmap.h>
+#include <linux/cpumask.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/miscdevice.h>
+#include "cache_reservation.h"
+#include <linux/cacheinfo.h>
+#include <linux/cpu.h>
+
+/*
+ * There are two main data structures: tcrid entries, and tcrid lists.
+ *
+ * A tcrid entry contains size and type information and is used
+ * to identify a cache allocation reservation.
+ * One task should not allocate more than one tcrid per type,
+ * unless that tcrid is to be shared with a different task.
+ *
+ * A tcrid list is a set of tcrid entries, and is mapped to (used by)
+ * one or more tasks.
+ * Each task is mapped to only one tcrid list.
+ * A tcrid entry can be in one or more tcrid lists at the same time.
+ *
+ * Mapping to Intel CAT:
+ *  - a tcrid list maps one-to-one to a COS-ID.
+ *  - a tcrid entry represents a range of bits
+ *    in a number of (one or more) Cache Capacity Bitmasks,
+ *    which are specified in HW via the IA32_L3_MASK_n MSRs.
+ *  - one tcrid entry can be at different locations
+ *    in different sockets.
+ *  - tcrid entries of a tcrid list must be mapped contiguously
+ *    in hardware.
+ */
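+
+/*
+ * Example: task A holds reservations {tcrid 0 (code), tcrid 1 (data)}
+ * and task B holds {tcrid 0} only. This yields two tcrid lists, hence
+ * two COS-IDs: COS 1 = {0, 1} (task A) and COS 2 = {0} (task B). On
+ * each socket, the bits backing tcrid 0 and tcrid 1 must form one
+ * contiguous range within COS 1's capacity bitmask, although tcrid 0
+ * may sit at different bit positions on different sockets.
+ */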
+
+unsigned long *closmap;
+
+LIST_HEAD(tcr_global_list);
+DEFINE_MUTEX(tcr_list_mutex);
+
+DECLARE_BITMAP(tcrid_used_bitmap, CBM_LEN);
+struct tcr_entry *tcrid_table;
+static unsigned int total_tcrentry_bits;
+
+static unsigned int l3_cache_size;
+static u32 max_closid;
+static u32 max_cbm_len;
+static unsigned int kbytes_per_cbm_bit;
+static unsigned int l3_nr_cbm_bits;
+
+static unsigned int max_sockets;
+
+struct cache_layout {
+	unsigned long *closmap;
+	u32 hw_shared_bitmask;
+	int id;
+	struct list_head link;
+	int nr_users;
+};
+
+LIST_HEAD(layout_list);
+
+struct per_socket_data {
+	/* start, end of shared region with HW */
+	u32 hw_shared_bitmask;
+	int initialized;
+	unsigned int reserved_for_host;
+	spinlock_t msr_cbm_lock;
+	unsigned long *cosidzeromask;
+	struct cache_layout *layout;
+	unsigned int occupied_cbm_bits;
+};
+
+struct per_socket_data *psd;
+static unsigned int psd_size;
+
+static struct per_socket_data *get_socket_data(int socket);
+void kick_task(struct task_struct *tsk);
+void kick_tasks(struct tcr_list *l);
+void unlink_tcrlist_from_tasks(struct tcr_list *l);
+void dealloc_contiguous_regions(struct tcr_entry *e, struct tcr_list *l);
+
+/*
+ * CDP capable hardware: CDP-on by default.
+ * Use the intel_cat_mode=cat kernel parameter to switch to plain CAT.
+ */
+static bool __read_mostly enable_cdp = true;
+module_param_named(cdp, enable_cdp, bool, S_IRUGO);
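+
+/*
+ * Note: with CDP enabled the hardware splits each COS into a pair of
+ * capacity bitmasks (one for data, one for code), which halves the
+ * number of usable COS-IDs; without CDP only CACHE_RSVT_TYPE_BOTH
+ * reservations can be honored (see __create_cache_reservation()).
+ */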
+
+/* protects addition to layout_list */
+static DEFINE_RAW_SPINLOCK(cache_layout_lock);
+
+DECLARE_BITMAP(cache_layout_ids, MAX_LAYOUTS);
+
+struct cache_layout *find_create_layout(u32 hw_shared_bitmask)
+{
+	u32 size = BITS_TO_LONGS(max_cbm_len) * sizeof(unsigned long);
+	struct cache_layout *l;
+
+	raw_spin_lock(&cache_layout_lock);
+
+	list_for_each_entry(l, &layout_list, link) {
+		if (l->hw_shared_bitmask == hw_shared_bitmask) {
+			l->nr_users++;
+			raw_spin_unlock(&cache_layout_lock);
+			return l;
+		}
+	}
+
+	l = kzalloc(sizeof(struct cache_layout), GFP_ATOMIC);
+	if (!l)
+		panic("%s alloc failed", __func__);
+	l->closmap = kzalloc(size, GFP_ATOMIC);
+	if (!l->closmap)
+		panic("%s alloc failed", __func__);
+	l->hw_shared_bitmask = hw_shared_bitmask;
+	l->id = find_first_zero_bit(cache_layout_ids, MAX_LAYOUTS);
+	if (l->id >= MAX_LAYOUTS) {
+		printk(KERN_ERR "intel_cat: MAX_LAYOUTS exceeded\n");
+		kfree(l->closmap);
+		kfree(l);
+		/* reuse id 0 */
+		l = list_first_entry(&layout_list, struct cache_layout, link);
+		l->nr_users++;
+		raw_spin_unlock(&cache_layout_lock);
+		return l;
+	}
+	set_bit(l->id, cache_layout_ids);
+	l->nr_users++;
+	list_add(&l->link, &layout_list);
+	raw_spin_unlock(&cache_layout_lock);
+	return l;
+}
+
+u32 maxtcrlist_id;
+
+int alloc_tcrid_table(void)
+{
+	struct tcr_entry *e;
+	struct cpuinfo_x86 *c = &boot_cpu_data;
+	int i;
+
+	maxtcrlist_id = c->x86_cache_max_closid;
+
+	tcrid_table = kzalloc(CBM_LEN * sizeof(struct tcr_entry), GFP_KERNEL);
+	if (!tcrid_table)
+		return -ENOMEM;
+
+	for (i = 0; i < CBM_LEN; i++) {
+		unsigned int size;
+
+		e = &tcrid_table[i];
+		e->tcrid = i;
+		size = BITS_TO_LONGS(maxtcrlist_id) * sizeof(unsigned long);
+		e->tcrlist_bmap = kzalloc(size, GFP_KERNEL);
+		if (!e->tcrlist_bmap)
+			goto out_err;
+	}
+
+	return 0;
+out_err:
+	for (i = 0; i < CBM_LEN; i++) {
+		e = &tcrid_table[i];
+		kfree(e->tcrlist_bmap);
+	}
+	kfree(tcrid_table);
+	return -ENOMEM;
+}
+
+#define reserved_cbm_bits 2
+
+int account_cbm_bits(const struct cpumask *mask, unsigned int cbm_bits)
+{
+	unsigned int cpu;
+
+	for_each_cpu(cpu, mask) {
+		unsigned int socket, free_cbm_bits;
+		struct per_socket_data *psd;
+
+		if (!cpu_online(cpu))
+			return 1;
+
+		socket = topology_physical_package_id(cpu);
+		psd = get_socket_data(socket);
+		free_cbm_bits = l3_nr_cbm_bits - psd->occupied_cbm_bits;
+		if (cbm_bits > free_cbm_bits)
+			return 1;
+	}
+
+	for_each_cpu(cpu, mask) {
+		unsigned int socket;
+		struct per_socket_data *psd;
+
+		socket = topology_physical_package_id(cpu);
+		psd = get_socket_data(socket);
+		psd->occupied_cbm_bits += cbm_bits;
+	}
+	return 0;
+}
+
+int deaccount_cbm_bits(struct tcr_entry *e)
+{
+	unsigned int cpu;
+
+	for_each_cpu(cpu, e->cpumask) {
+		unsigned int socket;
+		struct per_socket_data *psd;
+
+		/* FIXME:
+		 *
+		 * 1) alloc reservation
+		 * 2) cpu offline
+		 * 3) dealloc reservation
+		 * 4) cpu online
+		 */
+		if (!cpu_online(cpu))
+			return 1;
+
+		socket = topology_physical_package_id(cpu);
+		psd = get_socket_data(socket);
+		psd->occupied_cbm_bits -= e->cbm_bits;
+	}
+	return 0;
+}
+
+struct tcr_entry *alloc_tcr_entry(const struct cpumask *mask,
+				  unsigned int cbm_bits)
+{
+	struct tcr_entry *e;
+	int i;
+
+	i = find_first_zero_bit(tcrid_used_bitmap, CBM_LEN);
+	if (i >= CBM_LEN)
+		return ERR_PTR(-ENOMEM);
+
+	if (account_cbm_bits(mask, cbm_bits))
+		return ERR_PTR(-ENOMEM);
+
+	set_bit(i, tcrid_used_bitmap);
+	e = &tcrid_table[i];
+
+	return e;
+}
+
+struct tcr_entry *find_tcr_entry(u32 tcrid)
+{
+	if (tcrid >= CBM_LEN)
+		return ERR_PTR(-EINVAL);
+	if (!test_bit(tcrid, tcrid_used_bitmap))
+		return ERR_PTR(-EINVAL);
+
+	return &tcrid_table[tcrid];
+}
+
+void free_tcr_entry(struct tcr_entry *e)
+{
+	clear_bit(e->tcrid, tcrid_used_bitmap);
+	WARN_ON(!bitmap_empty(e->tcrlist_bmap, maxtcrlist_id));
+	deaccount_cbm_bits(e);
+	if (e->cpumask)
+		free_cpumask_var(e->cpumask);
+	e->cpumask = NULL;
+}
+
+int tcrentry_in_tcrlist(struct tcr_entry *e, struct tcr_list *l)
+{
+	return test_bit(l->id, e->tcrlist_bmap);
+}
+
+#if 0
+void tcrlist_changed(struct tcr_list *l)
+{
+	bitmap_clear(l->synced_to_socket, 0, max_sockets * NR_CPUS);
+}
+#endif
+
+int add_tcrentry_to_tcrlist(struct tcr_entry *e, struct tcr_list *l)
+{
+	set_bit(l->id, e->tcrlist_bmap);
+	set_bit(e->tcrid, l->tcrentry_bmap);
+	return 0;
+}
+
+int remove_tcrentry_from_tcrlist(struct tcr_entry *e, struct tcr_list *l)
+{
+	clear_bit(l->id, e->tcrlist_bmap);
+	clear_bit(e->tcrid, l->tcrentry_bmap);
+	/* no more tcrlists referencing this tcrentry: undo the allocation
+	   on the cache layouts */
+	if (bitmap_empty(e->tcrlist_bmap, maxtcrlist_id))
+		dealloc_contiguous_regions(e, l);
+	/* no more tcrentries on this tcrlist: unlink it from its tasks */
+	if (bitmap_empty(l->tcrentry_bmap, CBM_LEN))
+		unlink_tcrlist_from_tasks(l);
+
+	return 0;
+}
+
+/*
+ * returns -ENOMEM if not enough space, -EPERM if no permission.
+ * returns 0 if the reservation has been successful, copying the actual
+ * number of kbytes reserved to "kbytes", the type to "type", and the
+ * reservation id to "tcrid".
+ */
+int __create_cache_reservation(struct cat_reservation_cpumask *crmask,
+			       cpumask_var_t mask, unsigned long argp)
+{
+	struct cat_reservation *cr = &crmask->res;
+	u64 req_kbytes = cr->kbytes;
+	struct tcr_entry *e;
+	unsigned int cbm_bits;
+	unsigned int kbytes;
+	int ret;
+
+	if (cr->type != CACHE_RSVT_TYPE_BOTH && !enable_cdp)
+		return -ENOTSUPP;
+
+	if (cr->flags & CACHE_RSVT_ROUND_DOWN)
+		kbytes = round_down(cr->kbytes, kbytes_per_cbm_bit);
+	else
+		kbytes = round_up(cr->kbytes, kbytes_per_cbm_bit);
+
+	if (kbytes > (l3_cache_size >> 10))
+		return -ENOSPC;
+
+	cbm_bits = kbytes / kbytes_per_cbm_bit;
+
+	e = alloc_tcr_entry(mask, cbm_bits);
+	if (IS_ERR(e))
+		return PTR_ERR(e);
+
+	/* fix up the cr with the info we got and copy it to userspace */
+	cr->kbytes = kbytes;
+	cr->type = CACHE_RSVT_TYPE_BOTH;
+	cr->flags = 0;
+	cr->tcrid = e->tcrid;
+	ret = -EFAULT;
+	if (copy_to_user((void __user *)(argp +
+			 offsetof(struct cat_reservation_cpumask, res)),
+			 cr, sizeof(*cr)))
+		goto out_release_tcrid;
+
+	e->user_kbytes = req_kbytes;
+	e->rounded_kbytes = kbytes;
+	e->cbm_bits = cbm_bits;
+	e->type = cr->type;
+	e->cpumask = mask;
+
+	return 0;
+out_release_tcrid:
+	free_tcr_entry(e);
+	return ret;
+}
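+
+/*
+ * Rounding example: on a 20480 kbytes L3 with a 20 bit capacity
+ * bitmask, kbytes_per_cbm_bit = 20480 / 20 = 1024. A request for
+ * 2500 kbytes is rounded up to 3072 kbytes (3 cbm bits), or down to
+ * 2048 kbytes (2 cbm bits) if CACHE_RSVT_ROUND_DOWN is set.
+ */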
+
+int create_cache_reservation(struct cat_reservation_cpumask *crmask,
+			     unsigned long arg)
+{
+	cpumask_var_t new_mask;
+	size_t retlen;
+	int ret;
+
+	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	ret = get_user_cpu_mask(crmask->mask, crmask->cpusetsize, new_mask);
+	if (ret)
+		goto out_free;
+
+	ret = __create_cache_reservation(crmask, new_mask, arg);
+	if (ret)
+		goto out_free;
+
+	/* the new tcr entry now owns new_mask */
+	retlen = min_t(size_t, crmask->cpusetsize, cpumask_size());
+	if (copy_to_user(crmask->mask, new_mask, retlen))
+		return -EFAULT;
+	return retlen;
+
+out_free:
+	free_cpumask_var(new_mask);
+	return ret;
+}
+
+/*
+ * TCRentry -> TCRlist mapping:
+ * Each TCRlist is assigned an id from [0, ..., maxclosid].
+ * The id_to_tcrlist[maxclosid] array contains pointers
+ * to the tcrlist structures.
+ * Each TCRentry contains a bitmap[0, ..., maxclosid]. A bit
+ * set in this bitmap indicates that that particular
+ * tcrlist references the tcrentry.
+ */
+struct tcr_list **id_to_tcrlist;
+#define TCRLIST_ID_SZ 128
+DECLARE_BITMAP(tcrlist_ids, TCRLIST_ID_SZ);
+
+static unsigned int alloc_tcrlist_id(void)
+{
+	unsigned int id;
+
+	id = find_first_zero_bit(tcrlist_ids, TCRLIST_ID_SZ);
+	if (id < TCRLIST_ID_SZ)
+		set_bit(id, tcrlist_ids);
+	return id;
+}
+
+static void free_tcrlist_id(unsigned int id)
+{
+	clear_bit(id, tcrlist_ids);
+	id_to_tcrlist[id] = NULL;
+}
+
+struct tcr_list *alloc_tcrlist(void)
+{
+	struct tcr_list *l;
+	unsigned int id;
+	u32 size;
+
+	l = kzalloc(sizeof(struct tcr_list), GFP_KERNEL);
+	if (!l)
+		return ERR_PTR(-ENOMEM);
+	INIT_LIST_HEAD(&l->global_link);
+	INIT_LIST_HEAD(&l->tasks);
+	size = BITS_TO_LONGS(max_sockets * NR_CPUS) * sizeof(long);
+	l->synced_to_socket = kzalloc(size, GFP_KERNEL);
+	if (!l->synced_to_socket) {
+		kfree(l);
+		return ERR_PTR(-ENOMEM);
+	}
+	mutex_lock(&tcr_list_mutex);
+	id = alloc_tcrlist_id();
+	if (id >= TCRLIST_ID_SZ) {
+		mutex_unlock(&tcr_list_mutex);
+		kfree(l->synced_to_socket);
+		kfree(l);
+		return ERR_PTR(-ENOMEM);
+	}
+	l->id = id;
+	id_to_tcrlist[id] = l;
+	list_add(&l->global_link, &tcr_global_list);
+	mutex_unlock(&tcr_list_mutex);
+	return l;
+}
+
+struct tcr_list *find_tcrlist(unsigned long *cmp_bmap)
+{
+	struct tcr_list *l;
+
+	list_for_each_entry(l, &tcr_global_list, global_link) {
+		if (bitmap_equal(l->tcrentry_bmap, cmp_bmap, CBM_LEN))
+			return l;
+	}
+	return NULL;
+}
+
+void free_tcrlist(struct tcr_list *l)
+{
+	mutex_lock(&tcr_list_mutex);
+	free_tcrlist_id(l->id);
+	list_del(&l->global_link);
+	mutex_unlock(&tcr_list_mutex);
+	kfree(l->synced_to_socket);
+	kfree(l);
+}
+
+/*
+ * A tcrlist is created when attaching a tcrentry to a task.
+ *
+ * It is destroyed when either its task count goes to zero,
+ * or its tcrentry count goes to zero.
+ */
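+
+/*
+ * Deduplication example: if tasks T1 and T2 are both attached to
+ * exactly {tcrid 3}, find_tcrlist() makes them share a single tcrlist
+ * (a single COS-ID) instead of consuming two. COS-IDs are a scarce
+ * hardware resource (x86_cache_max_closid), so identical reservation
+ * sets must share one.
+ */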
+
+static void inc_use_count(struct tcr_list *l)
+{
+	l->nr_tasks++;
+}
+
+static void dec_use_count(struct tcr_list *l)
+{
+	l->nr_tasks--;
+	if (l->nr_tasks == 0)
+		free_tcrlist(l);
+}
+
+void link_tcrlist_to_task(struct task_struct *t, struct tcr_list *l)
+{
+	inc_use_count(l);
+	rcu_assign_pointer(t->tcrlist, l);
+	/* NOTE: requires the following addition to struct task_struct:
+	 *
+	 * #ifdef CONFIG_CACHE_RESERVATION
+	 *	struct tcr_list *tcrlist;
+	 *	struct list_head tcrlist_link;
+	 * #endif
+	 */
+	list_add(&t->tcrlist_link, &l->tasks);
+}
+
+void unlink_tcrlist_from_task(struct task_struct *t, struct tcr_list *l)
+{
+	rcu_assign_pointer(t->tcrlist, NULL);
+	synchronize_rcu();
+	list_del(&t->tcrlist_link);
+	dec_use_count(l);
+}
+
+void unlink_tcrlist_from_tasks(struct tcr_list *l)
+{
+	struct task_struct *tsk, *tsk2;
+
+	list_for_each_entry(tsk, &l->tasks, tcrlist_link) {
+		rcu_assign_pointer(tsk->tcrlist, NULL);
+		kick_task(tsk);
+	}
+	synchronize_rcu();
+
+	list_for_each_entry_safe(tsk, tsk2, &l->tasks, tcrlist_link) {
+		list_del(&tsk->tcrlist_link);
+		dec_use_count(l);
+	}
+}
+
+int delete_cache_reservation(struct cat_tcrid *i)
+{
+	struct tcr_entry *e;
+	int bit;
+
+	e = find_tcr_entry(i->tcrid);
+	if (IS_ERR(e))
+		return PTR_ERR(e);
+
+	for_each_set_bit(bit, e->tcrlist_bmap, maxtcrlist_id) {
+		struct tcr_list *l;
+
+		l = id_to_tcrlist[bit];
+		BUG_ON(!l);
+		remove_tcrentry_from_tcrlist(e, l);
+		kick_tasks(l);
+	}
+	free_tcr_entry(e);
+	return 0;
+}
+
+int check_contiguous_region(struct tcr_entry *e, struct tcr_list *l,
+			    struct cache_layout *layout, int *size_p)
+{
+	unsigned long *temp_closmap;
+	unsigned long hw_shared = layout->hw_shared_bitmask;
+	u32 size = BITS_TO_LONGS(max_cbm_len) * sizeof(unsigned long);
+	struct tcr_list_per_socket *psd = &l->psd[layout->id];
+	u32 cbm_bits;
+	int s;
+
+	temp_closmap = kzalloc(size, GFP_KERNEL);
+	if (!temp_closmap)
+		return -ENOMEM;
+
+	memcpy(temp_closmap, layout->closmap, size);
+	/* mark cache ways shared with hw as busy */
+	bitmap_or(temp_closmap, temp_closmap, &hw_shared,
+		  min_t(u32, max_cbm_len, 32));
+	cbm_bits = 0;
+	if (psd->cbm_end_bit) {
+		cbm_bits = psd->cbm_end_bit - psd->cbm_start_bit + 1;
+		bitmap_clear(temp_closmap, psd->cbm_start_bit, cbm_bits);
+	}
+
+	cbm_bits += e->cbm_bits;
+	s = bitmap_find_next_zero_area(temp_closmap, max_cbm_len, 0,
+				       cbm_bits, 0);
+	kfree(temp_closmap);
+	if (s >= max_cbm_len)
+		return -EBUSY;
+	*size_p = cbm_bits;
+	return s;
+}
+
+int alloc_contiguous_region(struct tcr_entry *e, struct tcr_list *l,
+			    struct cache_layout *layout)
+{
+	struct tcr_list_per_socket *psd = &l->psd[layout->id];
+	int size_p, r, bit;
+
+	r = check_contiguous_region(e, l, layout, &size_p);
+	if (r < 0)
+		return r;
+
+	psd->cbm_start_bit = r;
+	psd->cbm_end_bit = r + size_p - 1;
+
+	for (bit = psd->cbm_start_bit; bit <= psd->cbm_end_bit; bit++)
+		__set_bit(bit, layout->closmap);
+	return 0;
+}
+
+int alloc_contiguous_regions(struct tcr_entry *e, struct tcr_list *l)
+{
+	struct cache_layout *clayout;
+
+	list_for_each_entry(clayout, &layout_list, link) {
+		int size_p, r;
+
+		r = check_contiguous_region(e, l, clayout, &size_p);
+		if (r < 0)
+			return r;
+		r = alloc_contiguous_region(e, l, clayout);
+		if (r)
+			WARN_ON(1);
+	}
+	return 0;
+}
+
+void dealloc_contiguous_regions(struct tcr_entry *e, struct tcr_list *l)
+{
+	struct cache_layout *clayout;
+
+	list_for_each_entry(clayout, &layout_list, link) {
+		struct tcr_list_per_socket *psd = &l->psd[clayout->id];
+		int bit;
+
+		for (bit = psd->cbm_start_bit; bit <= psd->cbm_end_bit; bit++)
+			__clear_bit(bit, clayout->closmap);
+
+		psd->cbm_start_bit = psd->cbm_end_bit = 0;
+	}
+}
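+
+/*
+ * Contiguity example: a tcrlist holding a 3 bit tcrentry that gains a
+ * 2 bit tcrentry needs a 5 bit contiguous run of free ways, e.g. bits
+ * [4..8] of the socket's capacity bitmask (0x1f0). The old [s,e] range
+ * is released and the combined range is searched for as a whole, since
+ * hardware requires each COS bitmask to be contiguous.
+ */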
+
+void kick_task(struct task_struct *tsk)
+{
+	set_tsk_need_resched(tsk);
+	kick_process(tsk);
+}
+
+/*
+ * When attach returns, any task attached to the tcrlist
+ * which has been modified must:
+ *
+ * Task Running) sync_to_msr.
+ * Task Not Running) nothing, as long as sync_to_msr is performed
+ * when it is scheduled in.
+ */
+void kick_tasks(struct tcr_list *l)
+{
+	struct task_struct *tsk;
+
+	list_for_each_entry(tsk, &l->tasks, tcrlist_link) {
+		set_tsk_need_resched(tsk);
+		kick_process(tsk);
+	}
+}
+
+int attach_cache_reservation(struct pid_cat_reservation *pcr)
+{
+	struct pid *pid;
+	struct task_struct *task;
+	struct tcr_list *l, *undo = NULL;
+	struct tcr_entry *e;
+	int bit, err;
+
+	e = find_tcr_entry(pcr->tcrid);
+	if (IS_ERR(e))
+		return PTR_ERR(e);
+
+	pid = find_get_pid(pcr->pid);
+	if (!pid)
+		return -ESRCH;
+
+	task = get_pid_task(pid, PIDTYPE_PID);
+	if (!task) {
+		put_pid(pid);
+		return -EINVAL;
+	}
+
+	if (!task->tcrlist) {
+		unsigned long b = 1UL << e->tcrid;
+
+		l = find_tcrlist(&b);
+		if (l) {
+			link_tcrlist_to_task(task, l);
+			err = 0;
+			goto out;
+		}
+		l = alloc_tcrlist();
+		if (IS_ERR(l)) {
+			err = PTR_ERR(l);
+			goto out;
+		}
+		undo = l;
+	} else {
+		l = task->tcrlist;
+	}
+
+	if (tcrentry_in_tcrlist(e, l)) {
+		err = -EINVAL;
+		goto out_undo;
+	}
+
+	if (l->nr_tasks > 1) {
+		struct tcr_list *lnew;
+		unsigned long b = l->tcrentry_bmap[0];
+
+		set_bit(e->tcrid, &b);
+
+		lnew = find_tcrlist(&b);
+		if (lnew) {
+			unlink_tcrlist_from_task(task, l);
+			link_tcrlist_to_task(task, lnew);
+			err = 0;
+			goto out;
+		}
+
+		lnew = alloc_tcrlist();
+		if (IS_ERR(lnew)) {
+			err = PTR_ERR(lnew);
+			goto out;
+		}
+
+		if (alloc_contiguous_regions(e, lnew) == -ENOSPC) {
+			free_tcrlist(lnew);
+			err = -ENOSPC;
+			goto out;
+		}
+		for_each_set_bit(bit, l->tcrentry_bmap, CBM_LEN) {
+			struct tcr_entry *et;
+
+			et = &tcrid_table[bit];
+			add_tcrentry_to_tcrlist(et, lnew);
+		}
+		unlink_tcrlist_from_task(task, l);
+		link_tcrlist_to_task(task, lnew);
+		l = lnew;
+	} else {
+		if (alloc_contiguous_regions(e, l) == -ENOSPC) {
+			err = -ENOSPC;
+			goto out_undo;
+		}
+	}
+
+	add_tcrentry_to_tcrlist(e, l);
+	if (undo)
+		link_tcrlist_to_task(task, l);
+	kick_tasks(l);
+	err = 0;
+out:
+	put_pid(pid);
+	put_task_struct(task);
+	return err;
+out_undo:
+	if (undo)
+		free_tcrlist(undo);
+	goto out;
+}
+
+int detach_cache_reservation(struct pid_cat_reservation *pcr)
+{
+	struct pid *pid;
+	struct task_struct *task;
+	struct tcr_list *l;
+	struct tcr_entry *e;
+	int bit, err;
+
+	e = find_tcr_entry(pcr->tcrid);
+	if (IS_ERR(e))
+		return PTR_ERR(e);
+
+	pid = find_get_pid(pcr->pid);
+	if (!pid)
+		return -ESRCH;
+
+	task = get_pid_task(pid, PIDTYPE_PID);
+	if (!task) {
+		put_pid(pid);
+		return -EINVAL;
+	}
+
+	l = task->tcrlist;
+	if (!l) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (!tcrentry_in_tcrlist(e, l)) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (l->nr_tasks > 1) {
+		struct tcr_list *lnew;
+		unsigned long b = l->tcrentry_bmap[0];
+
+		clear_bit(e->tcrid, &b);
+
+		lnew = find_tcrlist(&b);
+		if (lnew) {
+			unlink_tcrlist_from_task(task, l);
+			link_tcrlist_to_task(task, lnew);
+			kick_task(task);
+			err = 0;
+			goto out;
+		}
+
+		lnew = alloc_tcrlist();
+		if (IS_ERR(lnew)) {
+			err = PTR_ERR(lnew);
+			goto out;
+		}
+		for_each_set_bit(bit, l->tcrentry_bmap, CBM_LEN) {
+			struct tcr_entry *et;
+
+			if (bit == e->tcrid)
+				continue;
+
+			et = &tcrid_table[bit];
+			add_tcrentry_to_tcrlist(et, lnew);
+		}
+		unlink_tcrlist_from_task(task, l);
+		link_tcrlist_to_task(task, lnew);
+		kick_task(task);
+	} else {
+		remove_tcrentry_from_tcrlist(e, l);
+	}
+
+	err = 0;
+out:
+	put_pid(pid);
+	put_task_struct(task);
+	return err;
+}
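+
+/*
+ * Attach/detach are copy-on-write on shared tcrlists: if tasks A and B
+ * share the tcrlist {e1} and e2 is attached to A only, A is moved to a
+ * (possibly pre-existing) tcrlist {e1, e2} while B stays on {e1}, so
+ * a task never changes another task's reservations.
+ */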
+
+void sync_to_msr(struct task_struct *task, struct tcr_list *l,
+		 unsigned int start, unsigned int end)
+{
+	int this_socket = topology_physical_package_id(smp_processor_id());
+	u64 msr;
+	unsigned long bitmask = ~0UL;
+	int len = end - start + 1;
+
+	bitmask = bitmask << (sizeof(unsigned long)*8 - len);
+	bitmask = bitmask >> (sizeof(unsigned long)*8 - end - 1);
+
+	/* check and enforce that cosidzero has [s,e] == 0 */
+	rdmsrl(CBM_FROM_INDEX(0), msr);
+	if (msr & bitmask)
+		wrmsrl(CBM_FROM_INDEX(0), msr & ~bitmask);
+
+	/* check and enforce that this cosid has [s,e] == 1 */
+	rdmsrl(CBM_FROM_INDEX(l->id), msr);
+	if ((msr & bitmask) != bitmask)
+		wrmsrl(CBM_FROM_INDEX(l->id), msr | bitmask);
+
+	set_bit(this_socket, l->synced_to_socket);
+}
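+
+/*
+ * Bitmask construction example (64-bit longs): start = 2, end = 4
+ * gives len = 3; ~0UL << 61 sets bits [61..63], and shifting right by
+ * 64 - 4 - 1 = 59 moves them down to [2..4], i.e. bitmask = 0x1c.
+ */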
+
+void __intel_rdt_sched_in(void)
+{
+	struct task_struct *task = current;
+	unsigned int cpu = smp_processor_id();
+	unsigned int this_socket = topology_physical_package_id(cpu);
+	struct per_socket_data *psd = get_socket_data(this_socket);
+	unsigned int start, end;
+
+	/*
+	 * The CBM bitmask for a particular task is enforced
+	 * on sched-in to a given processor, and only for the
+	 * range (cbm_start_bit, cbm_end_bit) which the
+	 * tcr_list (COSid) owns.
+	 * This way we allow COSid0 (the global task pool) to use
+	 * reserved L3 cache on sockets where the tasks that
+	 * reserve the cache have not been scheduled.
+	 *
+	 * Since reading the MSRs is slow, it is necessary to
+	 * cache the MSR CBM map on each socket.
+	 */
+
+	if (task->tcrlist == NULL) {
+		wrmsrl(CBM_FROM_INDEX(0), psd->cosidzeromask[0]);
+	} else if (test_bit(this_socket,
+			    task->tcrlist->synced_to_socket) == 0) {
+		struct cache_layout *layout = psd->layout;
+
+		spin_lock(&psd->msr_cbm_lock);
+		start = task->tcrlist->psd[layout->id].cbm_start_bit;
+		end = task->tcrlist->psd[layout->id].cbm_end_bit;
+		sync_to_msr(task, task->tcrlist, start, end);
+		/* barrier */
+		spin_unlock(&psd->msr_cbm_lock);
+	}
+}
+
+static int get_reservations(struct cat_reservation_list *in,
+			    unsigned long arg)
+{
+	struct cat_reservation *cr;
+	void __user *res_user_ptr, *cpumask_user_ptr;
+	unsigned int copied_entries = 0;
+	unsigned int coffset = 0, uoffset = 0;
+	size_t cpumasksz, nr_entries;
+	int r, bit;
+
+	nr_entries = bitmap_weight(tcrid_used_bitmap, CBM_LEN);
+	cpumasksz = cpumask_size();
+
+	if (sizeof(*cr) * nr_entries > in->cat_res_size)
+		return -ENOSPC;
+	if (cpumasksz * nr_entries > in->cpusetsize)
+		return -ENOSPC;
+
+	cr = kzalloc(sizeof(*cr), GFP_KERNEL);
+	if (!cr)
+		return -ENOMEM;
+
+	res_user_ptr = (void __user *)in->res;
+	cpumask_user_ptr = (void __user *)in->mask;
+
+	in->cpumask_size = cpumasksz;
+	r = -EFAULT;
+	if (copy_to_user((void __user *)arg, in, sizeof(*in)))
+		goto out;
+
+	for_each_set_bit(bit, tcrid_used_bitmap, CBM_LEN) {
+		struct tcr_entry *e = &tcrid_table[bit];
+
+		cr->kbytes = e->rounded_kbytes;
+		cr->type = e->type;
+		cr->flags = 0;
+		cr->tcrid = e->tcrid;
+
+		if (copy_to_user(res_user_ptr + uoffset, cr, sizeof(*cr)))
+			goto out;
+		uoffset += sizeof(*cr);
+
+		if (copy_to_user(cpumask_user_ptr + coffset, e->cpumask,
+				 cpumasksz))
+			goto out;
+		coffset += cpumasksz;
+		copied_entries++;
+
+		memset(cr, 0, sizeof(*cr));
+	}
+
+	r = copied_entries;
+out:
+	kfree(cr);
+	return r;
+}
+
+static int basic_cr_checks(struct cat_reservation *cr)
+{
+	if (cr->type != CACHE_RSVT_TYPE_CODE &&
+	    cr->type != CACHE_RSVT_TYPE_DATA &&
+	    cr->type != CACHE_RSVT_TYPE_BOTH)
+		return -EINVAL;
+
+	if (cr->flags != 0 && cr->flags != CACHE_RSVT_ROUND_DOWN)
+		return -EINVAL;
+
+	return 0;
+}
+
+static long intelcat_ioctl(struct file *filp,
+			   unsigned int ioctl, unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	long r = -EINVAL;
+
+	switch (ioctl) {
+	case CAT_CREATE_RESERVATION: {
+		struct cat_reservation_cpumask crmask;
+
+		r = -EFAULT;
+		if (copy_from_user(&crmask, argp, sizeof(crmask)))
+			goto out;
+
+		r = basic_cr_checks(&crmask.res);
+		if (r)
+			goto out;
+
+		r = create_cache_reservation(&crmask, arg);
+		break;
+	}
+	case CAT_DELETE_RESERVATION: {
+		struct cat_tcrid tcrid;
+
+		r = -EFAULT;
+		if (copy_from_user(&tcrid, argp, sizeof(tcrid)))
+			goto out;
+
+		r = delete_cache_reservation(&tcrid);
+		break;
+	}
+	case CAT_ATTACH_RESERVATION: {
+		struct pid_cat_reservation pcr;
+
+		r = -EFAULT;
+		if (copy_from_user(&pcr, argp, sizeof(pcr)))
+			goto out;
+
+		r = attach_cache_reservation(&pcr);
+		break;
+	}
+	case CAT_DETACH_RESERVATION: {
+		struct pid_cat_reservation pcr;
+
+		r = -EFAULT;
+		if (copy_from_user(&pcr, argp, sizeof(pcr)))
+			goto out;
+
+		r = detach_cache_reservation(&pcr);
+		break;
+	}
+	case CAT_GET_RESERVATIONS: {
+		struct cat_reservation_list in;
+
+		r = -EFAULT;
+		if (copy_from_user(&in, argp, sizeof(in)))
+			goto out;
+
+		r = get_reservations(&in, arg);
+		break;
+	}
+	default:
+		break;
+	}
+out:
+	return r;
+}
+
+static struct file_operations intelcat_chardev_ops = {
+	.unlocked_ioctl	= intelcat_ioctl,
+	.compat_ioctl	= intelcat_ioctl,
+	.llseek		= noop_llseek,
+};
+
+static struct miscdevice intel_cat_misc = {
+	INTEL_CAT_MINOR,
+	"intel_cat",
+	&intelcat_chardev_ops,
+};
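+
+/*
+ * misc_register() exposes the interface to userspace as a character
+ * device; with devtmpfs/udev it shows up as /dev/intel_cat.
+ * (INTEL_CAT_MINOR still needs to be reserved in miscdevice.h.)
+ */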
+
+static int get_l3_cache_size(void)
+{
+	struct cpu_cacheinfo *cinfo;
+	struct cacheinfo *ci;
+	int i;
+
+	cinfo = get_cpu_cacheinfo(0);
+	if (!cinfo)
+		return -EINVAL;
+
+	for (i = 0; i < cinfo->num_leaves; i++) {
+		ci = &cinfo->info_list[i];
+		if (ci->level == 3) {
+			l3_cache_size = ci->size;
+			return 0;
+		}
+	}
+	return -EINVAL;
+}
+
+static struct per_socket_data *get_socket_data(int socket)
+{
+	BUG_ON(socket >= psd_size);
+	return &psd[socket];
+}
+
+static int __init alloc_init_per_socket_data(void)
+{
+	psd = kzalloc(max_sockets * sizeof(struct per_socket_data),
+		      GFP_KERNEL);
+	if (!psd)
+		return -ENOMEM;
+	psd_size = max_sockets;
+	return 0;
+}
+
+static void percpu_init_hw_shared_zone(void *unused)
+{
+	unsigned int cpu, this_socket;
+	struct cpuinfo_x86 *c;
+	uint32_t eax, ebx, ecx, edx;
+	struct per_socket_data *psd;
+	u32 size;
+
+	cpu = smp_processor_id();
+	this_socket = topology_physical_package_id(cpu);
+	psd = get_socket_data(this_socket);
+	c = &cpu_data(cpu);
+
+	cpuid_count(0x00000010, 1, &eax, &ebx, &ecx, &edx);
+	if (xchg(&psd->initialized, 1))
+		return;
+	psd->hw_shared_bitmask = ebx;
+	/* reserve 10% of cache ways for the host */
+	psd->reserved_for_host = c->x86_cache_max_cbm_len / 10;
+	psd->reserved_for_host = max_t(unsigned int, psd->reserved_for_host,
+				       hweight32(psd->hw_shared_bitmask));
+	psd->layout = find_create_layout(psd->hw_shared_bitmask);
+
+	size = BITS_TO_LONGS(c->x86_cache_max_cbm_len) *
+	       sizeof(unsigned long);
+	if (enable_cdp)
+		size = 2*size;
+	psd->cosidzeromask = kzalloc(size, GFP_ATOMIC);
+	if (!psd->cosidzeromask)
+		panic("%s allocation failed\n", __func__);
+
+	/* COSid0 starts out with all cache ways set */
+	memset(psd->cosidzeromask, 0xff, size);
+	spin_lock_init(&psd->msr_cbm_lock);
+}
+
+static int cat_cpu_notifier(struct notifier_block *nfb,
+			    unsigned long action, void *hcpu)
+{
+	switch (action) {
+	case CPU_ONLINE:
+		percpu_init_hw_shared_zone(NULL);
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block cat_cpu_notifier_block = {
+	.notifier_call = cat_cpu_notifier,
+	.priority = -INT_MAX
+};
+
+static int init_hw_shared_zone(void)
+{
+	cpumask_t cpumask;
+	int cpu, phys_id;
+	unsigned long *topology_bmap;
+	int size = BITS_TO_LONGS(max_sockets * NR_CPUS) * sizeof(long);
+
+	topology_bmap = kzalloc(size, GFP_KERNEL);
+	if (!topology_bmap)
+		return -ENOMEM;
+
+	cpumask_zero(&cpumask);
+
+	/* pick one cpu per socket */
+	for_each_online_cpu(cpu) {
+		phys_id = topology_physical_package_id(cpu);
+		if (test_and_set_bit(phys_id, topology_bmap))
+			continue;
+		cpumask_set_cpu(cpu, &cpumask);
+	}
+
+	smp_call_function_many(&cpumask, percpu_init_hw_shared_zone,
+			       NULL, 1);
+
+	kfree(topology_bmap);
+
+	return 0;
+}
+
+static int __init intel_cat_mem_init(void)
+{
+	struct cpuinfo_x86 *c = &boot_cpu_data;
+	u32 maxid, size;
+	int err = -ENOMEM;
+
+	max_cbm_len = c->x86_cache_max_cbm_len;
+	maxid = max_closid = c->x86_cache_max_closid;
+	size = BITS_TO_LONGS(maxid) * sizeof(long);
+	closmap = kzalloc(size, GFP_KERNEL);
+	if (!closmap)
+		goto err_out;
+
+	size = maxid * sizeof(struct tcr_list *);
+	id_to_tcrlist = kzalloc(size, GFP_KERNEL);
+	if (!id_to_tcrlist)
+		goto err_out;
+
+	err = alloc_tcrid_table();
+	if (err)
+		goto err_out;
+
+	err = get_l3_cache_size();
+	if (err)
+		goto err_out;
+
+	/* kbytes per cbm bit =
+	 * L3 cache size in kbytes / capacity bitmask length.
+	 */
+	kbytes_per_cbm_bit = (l3_cache_size >> 10) / max_cbm_len;
+
+	/* L3 cache size in kbytes / kbytes per cbm bit =
+	 * number of cbm bits in the L3 cache.
+	 */
+	l3_nr_cbm_bits = (l3_cache_size >> 10) / kbytes_per_cbm_bit;
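+
+	/*
+	 * Example: a 20 MB (20480 kbytes) L3 with max_cbm_len = 20
+	 * gives kbytes_per_cbm_bit = 1024 and l3_nr_cbm_bits = 20,
+	 * i.e. each capacity bitmask bit covers one 1024 kbytes way.
+	 */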
+
+	err = alloc_init_per_socket_data();
+	if (err)
+		goto err_out;
+
+	init_hw_shared_zone();
+
+	/* bit 0 is reserved for the global task pool */
+	set_bit(0, tcrlist_ids);
+
+	return 0;
+err_out:
+	kfree(id_to_tcrlist);
+	kfree(closmap);
+	return err;
+}
+
+static int __init intel_cat_init(void)
+{
+	unsigned int cpus_per_socket;
+	int r;
+	int cpu;
+
+	preempt_disable();
+	cpu = smp_processor_id();
+	cpus_per_socket = cpumask_weight(topology_core_cpumask(cpu));
+	max_sockets = NR_CPUS/cpus_per_socket;
+	preempt_enable();
+
+	r = misc_register(&intel_cat_misc);
+	if (r) {
+		printk(KERN_ERR "intel_cat: misc_register error = %d\n", r);
+		return r;
+	}
+
+	r = intel_cat_mem_init();
+	if (r) {
+		misc_unregister(&intel_cat_misc);
+		return r;
+	}
+
+	cpu_notifier_register_begin();
+	__register_hotcpu_notifier(&cat_cpu_notifier_block);
+	cpu_notifier_register_done();
+
+	return 0;
+}
+module_init(intel_cat_init);
diff --git a/arch/x86/kernel/cat/cache_reservation.h b/arch/x86/kernel/cat/cache_reservation.h
new file mode 100644
index 0000000..e8146a0
--- /dev/null
+++ b/arch/x86/kernel/cat/cache_reservation.h
@@ -0,0 +1,47 @@
+#include <linux/types.h>
+#include <linux/cpumask.h>
+#include <linux/list.h>
+
+struct tcr_entry {
+	unsigned int tcrid;
+
+	unsigned long *tcrlist_bmap;
+
+	u64 user_kbytes;
+	u64 rounded_kbytes;
+	unsigned int cbm_bits;
+
+	u32 type;
+
+	cpumask_var_t cpumask;
+};
+
+#define CBM_LEN 64
+#define MAX_LAYOUTS 10
+
+struct tcr_list_per_socket {
+	int cbm_start_bit, cbm_end_bit;
+};
+
+struct tcr_list {
+	/* cache allocation */
+	struct tcr_list_per_socket psd[MAX_LAYOUTS];
+
+	/* bitmap indicating whether cap_bitmask is synced to a given socket */
+	unsigned long *synced_to_socket;
+
+	/* TCRlist id */
+	unsigned int id;
+
+	/* one bit per tcrentry */
+	DECLARE_BITMAP(tcrentry_bmap, CBM_LEN);
+
+	/* link in the global tcrlist list */
+	struct list_head global_link;
+	/* list of tasks referencing this tcr_list */
+	struct list_head tasks;
+	/* nr of tasks referencing this tcr_list */
+	unsigned int nr_tasks;
+};
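
For illustration, a minimal userspace sketch of the intended usage.
Assumptions: the misc device shows up as /dev/intel_cat, and the CATIO
ioctl magic plus the uapi header above are available to userspace;
error handling is trimmed:

	#include <sched.h>
	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <asm/cache_reservation.h>

	int main(void)
	{
		struct cat_reservation_cpumask crm = { 0 };
		struct pid_cat_reservation pcr = { 0 };
		cpu_set_t cpus;
		int fd, ret;

		fd = open("/dev/intel_cat", O_RDWR);
		if (fd < 0)
			return 1;

		CPU_ZERO(&cpus);
		CPU_SET(0, &cpus);		/* reserve on the socket of cpu 0 */

		crm.res.kbytes = 2048;		/* request 2MB of L3 */
		crm.res.type = CACHE_RSVT_TYPE_BOTH;
		crm.cpusetsize = sizeof(cpu_set_t);
		crm.mask = &cpus;

		ret = ioctl(fd, CAT_CREATE_RESERVATION, &crm);
		if (ret < 0)
			return 1;
		printf("got tcrid %u, rounded to %llu kbytes\n",
		       crm.res.tcrid, (unsigned long long)crm.res.kbytes);

		/* pin the reservation to this task */
		pcr.tcrid = crm.res.tcrid;
		pcr.pid = getpid();
		ret = ioctl(fd, CAT_ATTACH_RESERVATION, &pcr);

		close(fd);
		return ret < 0;
	}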