diff options
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- | arch/x86/kernel/cpu/Makefile | 2 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/intel_rdt.c | 375 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/intel_rdt.h | 440 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c (renamed from arch/x86/kernel/cpu/intel_rdt_schemata.c) | 67 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/intel_rdt_monitor.c | 499 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 1117 | ||||
-rw-r--r-- | arch/x86/kernel/process_32.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/process_64.c | 2 |
8 files changed, 2235 insertions, 269 deletions
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index cdf82492b770..e17942c131c8 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -33,7 +33,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_INTEL_RDT_A) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o +obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o intel_rdt_ctrlmondata.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 5b366462f579..cd5fc61ba450 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -30,7 +30,8 @@ #include <linux/cpuhotplug.h> #include <asm/intel-family.h> -#include <asm/intel_rdt.h> +#include <asm/intel_rdt_sched.h> +#include "intel_rdt.h" #define MAX_MBA_BW 100u #define MBA_IS_LINEAR 0x4 @@ -38,7 +39,13 @@ /* Mutex to protect rdtgroup access. */ DEFINE_MUTEX(rdtgroup_mutex); -DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); +/* + * The cached intel_pqr_state is strictly per CPU and can never be + * updated from a remote CPU. Functions which modify the state + * are called with interrupts disabled and no preemption, which + * is sufficient for the protection. + */ +DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); /* * Used to store the max resource name width and max resource data width @@ -46,6 +53,12 @@ DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); */ int max_name_width, max_data_width; +/* + * Global boolean for rdt_alloc which is true if any + * resource allocation is enabled. + */ +bool rdt_alloc_capable; + static void mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); static void @@ -54,7 +67,9 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); #define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) struct rdt_resource rdt_resources_all[] = { + [RDT_RESOURCE_L3] = { + .rid = RDT_RESOURCE_L3, .name = "L3", .domains = domain_init(RDT_RESOURCE_L3), .msr_base = IA32_L3_CBM_BASE, @@ -67,8 +82,11 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_L3DATA] = { + .rid = RDT_RESOURCE_L3DATA, .name = "L3DATA", .domains = domain_init(RDT_RESOURCE_L3DATA), .msr_base = IA32_L3_CBM_BASE, @@ -81,8 +99,11 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_L3CODE] = { + .rid = RDT_RESOURCE_L3CODE, .name = "L3CODE", .domains = domain_init(RDT_RESOURCE_L3CODE), .msr_base = IA32_L3_CBM_BASE, @@ -95,8 +116,11 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_L2] = { + .rid = RDT_RESOURCE_L2, .name = "L2", .domains = domain_init(RDT_RESOURCE_L2), .msr_base = IA32_L2_CBM_BASE, @@ -109,8 +133,11 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_MBA] = { + .rid = RDT_RESOURCE_MBA, .name = "MB", .domains = domain_init(RDT_RESOURCE_MBA), .msr_base = IA32_MBA_THRTL_BASE, @@ -118,6 +145,7 @@ struct rdt_resource rdt_resources_all[] = { .cache_level = 3, .parse_ctrlval = parse_bw, .format_str = "%d=%*d", + .fflags = RFTYPE_RES_MB, }, }; @@ -144,33 +172,28 @@ static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid) * is always 20 on hsw server parts. The minimum cache bitmask length * allowed for HSW server is always 2 bits. Hardcode all of them. */ -static inline bool cache_alloc_hsw_probe(void) +static inline void cache_alloc_hsw_probe(void) { - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86 == 6 && - boot_cpu_data.x86_model == INTEL_FAM6_HASWELL_X) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; - u32 l, h, max_cbm = BIT_MASK(20) - 1; - - if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0)) - return false; - rdmsr(IA32_L3_CBM_BASE, l, h); + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; + u32 l, h, max_cbm = BIT_MASK(20) - 1; - /* If all the bits were set in MSR, return success */ - if (l != max_cbm) - return false; + if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0)) + return; + rdmsr(IA32_L3_CBM_BASE, l, h); - r->num_closid = 4; - r->default_ctrl = max_cbm; - r->cache.cbm_len = 20; - r->cache.min_cbm_bits = 2; - r->capable = true; - r->enabled = true; + /* If all the bits were set in MSR, return success */ + if (l != max_cbm) + return; - return true; - } + r->num_closid = 4; + r->default_ctrl = max_cbm; + r->cache.cbm_len = 20; + r->cache.shareable_bits = 0xc0000; + r->cache.min_cbm_bits = 2; + r->alloc_capable = true; + r->alloc_enabled = true; - return false; + rdt_alloc_capable = true; } /* @@ -213,15 +236,14 @@ static bool rdt_get_mem_config(struct rdt_resource *r) return false; } r->data_width = 3; - rdt_get_mba_infofile(r); - r->capable = true; - r->enabled = true; + r->alloc_capable = true; + r->alloc_enabled = true; return true; } -static void rdt_get_cache_config(int idx, struct rdt_resource *r) +static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) { union cpuid_0x10_1_eax eax; union cpuid_0x10_x_edx edx; @@ -231,10 +253,10 @@ static void rdt_get_cache_config(int idx, struct rdt_resource *r) r->num_closid = edx.split.cos_max + 1; r->cache.cbm_len = eax.split.cbm_len + 1; r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; + r->cache.shareable_bits = ebx & r->default_ctrl; r->data_width = (r->cache.cbm_len + 3) / 4; - rdt_get_cache_infofile(r); - r->capable = true; - r->enabled = true; + r->alloc_capable = true; + r->alloc_enabled = true; } static void rdt_get_cdp_l3_config(int type) @@ -246,12 +268,12 @@ static void rdt_get_cdp_l3_config(int type) r->cache.cbm_len = r_l3->cache.cbm_len; r->default_ctrl = r_l3->default_ctrl; r->data_width = (r->cache.cbm_len + 3) / 4; - r->capable = true; + r->alloc_capable = true; /* * By default, CDP is disabled. CDP can be enabled by mount parameter * "cdp" during resctrl file system mount time. */ - r->enabled = false; + r->alloc_enabled = false; } static int get_cache_id(int cpu, int level) @@ -300,6 +322,19 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]); } +struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) +{ + struct rdt_domain *d; + + list_for_each_entry(d, &r->domains, list) { + /* Find the domain that contains this CPU */ + if (cpumask_test_cpu(cpu, &d->cpu_mask)) + return d; + } + + return NULL; +} + void rdt_ctrl_update(void *arg) { struct msr_param *m = arg; @@ -307,12 +342,10 @@ void rdt_ctrl_update(void *arg) int cpu = smp_processor_id(); struct rdt_domain *d; - list_for_each_entry(d, &r->domains, list) { - /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->cpu_mask)) { - r->msr_update(d, m, r); - return; - } + d = get_domain_from_cpu(cpu, r); + if (d) { + r->msr_update(d, m, r); + return; } pr_warn_once("cpu %d not found in any domain for resource %s\n", cpu, r->name); @@ -326,8 +359,8 @@ void rdt_ctrl_update(void *arg) * caller, return the first domain whose id is bigger than the input id. * The domain list is sorted by id in ascending order. */ -static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, - struct list_head **pos) +struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, + struct list_head **pos) { struct rdt_domain *d; struct list_head *l; @@ -377,6 +410,44 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) return 0; } +static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) +{ + size_t tsize; + + if (is_llc_occupancy_enabled()) { + d->rmid_busy_llc = kcalloc(BITS_TO_LONGS(r->num_rmid), + sizeof(unsigned long), + GFP_KERNEL); + if (!d->rmid_busy_llc) + return -ENOMEM; + INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); + } + if (is_mbm_total_enabled()) { + tsize = sizeof(*d->mbm_total); + d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_total) { + kfree(d->rmid_busy_llc); + return -ENOMEM; + } + } + if (is_mbm_local_enabled()) { + tsize = sizeof(*d->mbm_local); + d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_local) { + kfree(d->rmid_busy_llc); + kfree(d->mbm_total); + return -ENOMEM; + } + } + + if (is_mbm_enabled()) { + INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); + mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL); + } + + return 0; +} + /* * domain_add_cpu - Add a cpu to a resource's domain list. * @@ -412,14 +483,26 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) return; d->id = id; + cpumask_set_cpu(cpu, &d->cpu_mask); - if (domain_setup_ctrlval(r, d)) { + if (r->alloc_capable && domain_setup_ctrlval(r, d)) { + kfree(d); + return; + } + + if (r->mon_capable && domain_setup_mon_state(r, d)) { kfree(d); return; } - cpumask_set_cpu(cpu, &d->cpu_mask); list_add_tail(&d->list, add_pos); + + /* + * If resctrl is mounted, add + * per domain monitor data directories. + */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + mkdir_mondata_subdir_allrdtgrp(r, d); } static void domain_remove_cpu(int cpu, struct rdt_resource *r) @@ -435,19 +518,58 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cpumask_clear_cpu(cpu, &d->cpu_mask); if (cpumask_empty(&d->cpu_mask)) { + /* + * If resctrl is mounted, remove all the + * per domain monitor data directories. + */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + rmdir_mondata_subdir_allrdtgrp(r, d->id); kfree(d->ctrl_val); + kfree(d->rmid_busy_llc); + kfree(d->mbm_total); + kfree(d->mbm_local); list_del(&d->list); + if (is_mbm_enabled()) + cancel_delayed_work(&d->mbm_over); + if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { + /* + * When a package is going down, forcefully + * decrement rmid->ebusy. There is no way to know + * that the L3 was flushed and hence may lead to + * incorrect counts in rare scenarios, but leaving + * the RMID as busy creates RMID leaks if the + * package never comes back. + */ + __check_limbo(d, true); + cancel_delayed_work(&d->cqm_limbo); + } + kfree(d); + return; + } + + if (r == &rdt_resources_all[RDT_RESOURCE_L3]) { + if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { + cancel_delayed_work(&d->mbm_over); + mbm_setup_overflow_handler(d, 0); + } + if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu && + has_busy_rmid(r, d)) { + cancel_delayed_work(&d->cqm_limbo); + cqm_setup_limbo_handler(d, 0); + } } } -static void clear_closid(int cpu) +static void clear_closid_rmid(int cpu) { struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - per_cpu(cpu_closid, cpu) = 0; - state->closid = 0; - wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0); + state->default_closid = 0; + state->default_rmid = 0; + state->cur_closid = 0; + state->cur_rmid = 0; + wrmsr(IA32_PQR_ASSOC, 0, 0); } static int intel_rdt_online_cpu(unsigned int cpu) @@ -459,12 +581,23 @@ static int intel_rdt_online_cpu(unsigned int cpu) domain_add_cpu(cpu, r); /* The cpu is set in default rdtgroup after online. */ cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); - clear_closid(cpu); + clear_closid_rmid(cpu); mutex_unlock(&rdtgroup_mutex); return 0; } +static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) +{ + struct rdtgroup *cr; + + list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { + if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) { + break; + } + } +} + static int intel_rdt_offline_cpu(unsigned int cpu) { struct rdtgroup *rdtgrp; @@ -474,10 +607,12 @@ static int intel_rdt_offline_cpu(unsigned int cpu) for_each_capable_rdt_resource(r) domain_remove_cpu(cpu, r); list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) + if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { + clear_childcpus(rdtgrp, cpu); break; + } } - clear_closid(cpu); + clear_closid_rmid(cpu); mutex_unlock(&rdtgroup_mutex); return 0; @@ -492,7 +627,7 @@ static __init void rdt_init_padding(void) struct rdt_resource *r; int cl; - for_each_capable_rdt_resource(r) { + for_each_alloc_capable_rdt_resource(r) { cl = strlen(r->name); if (cl > max_name_width) max_name_width = cl; @@ -502,38 +637,153 @@ static __init void rdt_init_padding(void) } } -static __init bool get_rdt_resources(void) +enum { + RDT_FLAG_CMT, + RDT_FLAG_MBM_TOTAL, + RDT_FLAG_MBM_LOCAL, + RDT_FLAG_L3_CAT, + RDT_FLAG_L3_CDP, + RDT_FLAG_L2_CAT, + RDT_FLAG_MBA, +}; + +#define RDT_OPT(idx, n, f) \ +[idx] = { \ + .name = n, \ + .flag = f \ +} + +struct rdt_options { + char *name; + int flag; + bool force_off, force_on; +}; + +static struct rdt_options rdt_options[] __initdata = { + RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC), + RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL), + RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL), + RDT_OPT(RDT_FLAG_L3_CAT, "l3cat", X86_FEATURE_CAT_L3), + RDT_OPT(RDT_FLAG_L3_CDP, "l3cdp", X86_FEATURE_CDP_L3), + RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2), + RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA), +}; +#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options) + +static int __init set_rdt_options(char *str) +{ + struct rdt_options *o; + bool force_off; + char *tok; + + if (*str == '=') + str++; + while ((tok = strsep(&str, ",")) != NULL) { + force_off = *tok == '!'; + if (force_off) + tok++; + for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { + if (strcmp(tok, o->name) == 0) { + if (force_off) + o->force_off = true; + else + o->force_on = true; + break; + } + } + } + return 1; +} +__setup("rdt", set_rdt_options); + +static bool __init rdt_cpu_has(int flag) +{ + bool ret = boot_cpu_has(flag); + struct rdt_options *o; + + if (!ret) + return ret; + + for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { + if (flag == o->flag) { + if (o->force_off) + ret = false; + if (o->force_on) + ret = true; + break; + } + } + return ret; +} + +static __init bool get_rdt_alloc_resources(void) { bool ret = false; - if (cache_alloc_hsw_probe()) + if (rdt_alloc_capable) return true; if (!boot_cpu_has(X86_FEATURE_RDT_A)) return false; - if (boot_cpu_has(X86_FEATURE_CAT_L3)) { - rdt_get_cache_config(1, &rdt_resources_all[RDT_RESOURCE_L3]); - if (boot_cpu_has(X86_FEATURE_CDP_L3)) { + if (rdt_cpu_has(X86_FEATURE_CAT_L3)) { + rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]); + if (rdt_cpu_has(X86_FEATURE_CDP_L3)) { rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); } ret = true; } - if (boot_cpu_has(X86_FEATURE_CAT_L2)) { + if (rdt_cpu_has(X86_FEATURE_CAT_L2)) { /* CPUID 0x10.2 fields are same format at 0x10.1 */ - rdt_get_cache_config(2, &rdt_resources_all[RDT_RESOURCE_L2]); + rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]); ret = true; } - if (boot_cpu_has(X86_FEATURE_MBA)) { + if (rdt_cpu_has(X86_FEATURE_MBA)) { if (rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA])) ret = true; } - return ret; } +static __init bool get_rdt_mon_resources(void) +{ + if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) + rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID); + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) + rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID); + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) + rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID); + + if (!rdt_mon_features) + return false; + + return !rdt_get_mon_l3_config(&rdt_resources_all[RDT_RESOURCE_L3]); +} + +static __init void rdt_quirks(void) +{ + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_HASWELL_X: + if (!rdt_options[RDT_FLAG_L3_CAT].force_off) + cache_alloc_hsw_probe(); + break; + case INTEL_FAM6_SKYLAKE_X: + if (boot_cpu_data.x86_mask <= 4) + set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat"); + } +} + +static __init bool get_rdt_resources(void) +{ + rdt_quirks(); + rdt_alloc_capable = get_rdt_alloc_resources(); + rdt_mon_capable = get_rdt_mon_resources(); + + return (rdt_mon_capable || rdt_alloc_capable); +} + static int __init intel_rdt_late_init(void) { struct rdt_resource *r; @@ -556,9 +806,12 @@ static int __init intel_rdt_late_init(void) return ret; } - for_each_capable_rdt_resource(r) + for_each_alloc_capable_rdt_resource(r) pr_info("Intel RDT %s allocation detected\n", r->name); + for_each_mon_capable_rdt_resource(r) + pr_info("Intel RDT %s monitoring detected\n", r->name); + return 0; } diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h new file mode 100644 index 000000000000..ebaddaeef023 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -0,0 +1,440 @@ +#ifndef _ASM_X86_INTEL_RDT_H +#define _ASM_X86_INTEL_RDT_H + +#include <linux/sched.h> +#include <linux/kernfs.h> +#include <linux/jump_label.h> + +#define IA32_L3_QOS_CFG 0xc81 +#define IA32_L3_CBM_BASE 0xc90 +#define IA32_L2_CBM_BASE 0xd10 +#define IA32_MBA_THRTL_BASE 0xd50 + +#define L3_QOS_CDP_ENABLE 0x01ULL + +/* + * Event IDs are used to program IA32_QM_EVTSEL before reading event + * counter from IA32_QM_CTR + */ +#define QOS_L3_OCCUP_EVENT_ID 0x01 +#define QOS_L3_MBM_TOTAL_EVENT_ID 0x02 +#define QOS_L3_MBM_LOCAL_EVENT_ID 0x03 + +#define CQM_LIMBOCHECK_INTERVAL 1000 + +#define MBM_CNTR_WIDTH 24 +#define MBM_OVERFLOW_INTERVAL 1000 + +#define RMID_VAL_ERROR BIT_ULL(63) +#define RMID_VAL_UNAVAIL BIT_ULL(62) + +DECLARE_STATIC_KEY_FALSE(rdt_enable_key); + +/** + * struct mon_evt - Entry in the event list of a resource + * @evtid: event id + * @name: name of the event + */ +struct mon_evt { + u32 evtid; + char *name; + struct list_head list; +}; + +/** + * struct mon_data_bits - Monitoring details for each event file + * @rid: Resource id associated with the event file. + * @evtid: Event id associated with the event file + * @domid: The domain to which the event file belongs + */ +union mon_data_bits { + void *priv; + struct { + unsigned int rid : 10; + unsigned int evtid : 8; + unsigned int domid : 14; + } u; +}; + +struct rmid_read { + struct rdtgroup *rgrp; + struct rdt_domain *d; + int evtid; + bool first; + u64 val; +}; + +extern unsigned int intel_cqm_threshold; +extern bool rdt_alloc_capable; +extern bool rdt_mon_capable; +extern unsigned int rdt_mon_features; + +enum rdt_group_type { + RDTCTRL_GROUP = 0, + RDTMON_GROUP, + RDT_NUM_GROUP, +}; + +/** + * struct mongroup - store mon group's data in resctrl fs. + * @mon_data_kn kernlfs node for the mon_data directory + * @parent: parent rdtgrp + * @crdtgrp_list: child rdtgroup node list + * @rmid: rmid for this rdtgroup + */ +struct mongroup { + struct kernfs_node *mon_data_kn; + struct rdtgroup *parent; + struct list_head crdtgrp_list; + u32 rmid; +}; + +/** + * struct rdtgroup - store rdtgroup's data in resctrl file system. + * @kn: kernfs node + * @rdtgroup_list: linked list for all rdtgroups + * @closid: closid for this rdtgroup + * @cpu_mask: CPUs assigned to this rdtgroup + * @flags: status bits + * @waitcount: how many cpus expect to find this + * group when they acquire rdtgroup_mutex + * @type: indicates type of this rdtgroup - either + * monitor only or ctrl_mon group + * @mon: mongroup related data + */ +struct rdtgroup { + struct kernfs_node *kn; + struct list_head rdtgroup_list; + u32 closid; + struct cpumask cpu_mask; + int flags; + atomic_t waitcount; + enum rdt_group_type type; + struct mongroup mon; +}; + +/* rdtgroup.flags */ +#define RDT_DELETED 1 + +/* rftype.flags */ +#define RFTYPE_FLAGS_CPUS_LIST 1 + +/* + * Define the file type flags for base and info directories. + */ +#define RFTYPE_INFO BIT(0) +#define RFTYPE_BASE BIT(1) +#define RF_CTRLSHIFT 4 +#define RF_MONSHIFT 5 +#define RFTYPE_CTRL BIT(RF_CTRLSHIFT) +#define RFTYPE_MON BIT(RF_MONSHIFT) +#define RFTYPE_RES_CACHE BIT(8) +#define RFTYPE_RES_MB BIT(9) +#define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) +#define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON) +#define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) + +/* List of all resource groups */ +extern struct list_head rdt_all_groups; + +extern int max_name_width, max_data_width; + +int __init rdtgroup_init(void); + +/** + * struct rftype - describe each file in the resctrl file system + * @name: File name + * @mode: Access mode + * @kf_ops: File operations + * @flags: File specific RFTYPE_FLAGS_* flags + * @fflags: File specific RF_* or RFTYPE_* flags + * @seq_show: Show content of the file + * @write: Write to the file + */ +struct rftype { + char *name; + umode_t mode; + struct kernfs_ops *kf_ops; + unsigned long flags; + unsigned long fflags; + + int (*seq_show)(struct kernfs_open_file *of, + struct seq_file *sf, void *v); + /* + * write() is the generic write callback which maps directly to + * kernfs write operation and overrides all other operations. + * Maximum write size is determined by ->max_write_len. + */ + ssize_t (*write)(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +}; + +/** + * struct mbm_state - status for each MBM counter in each domain + * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) + * @prev_msr Value of IA32_QM_CTR for this RMID last time we read it + */ +struct mbm_state { + u64 chunks; + u64 prev_msr; +}; + +/** + * struct rdt_domain - group of cpus sharing an RDT resource + * @list: all instances of this resource + * @id: unique id for this instance + * @cpu_mask: which cpus share this resource + * @rmid_busy_llc: + * bitmap of which limbo RMIDs are above threshold + * @mbm_total: saved state for MBM total bandwidth + * @mbm_local: saved state for MBM local bandwidth + * @mbm_over: worker to periodically read MBM h/w counters + * @cqm_limbo: worker to periodically read CQM h/w counters + * @mbm_work_cpu: + * worker cpu for MBM h/w counters + * @cqm_work_cpu: + * worker cpu for CQM h/w counters + * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) + * @new_ctrl: new ctrl value to be loaded + * @have_new_ctrl: did user provide new_ctrl for this domain + */ +struct rdt_domain { + struct list_head list; + int id; + struct cpumask cpu_mask; + unsigned long *rmid_busy_llc; + struct mbm_state *mbm_total; + struct mbm_state *mbm_local; + struct delayed_work mbm_over; + struct delayed_work cqm_limbo; + int mbm_work_cpu; + int cqm_work_cpu; + u32 *ctrl_val; + u32 new_ctrl; + bool have_new_ctrl; +}; + +/** + * struct msr_param - set a range of MSRs from a domain + * @res: The resource to use + * @low: Beginning index from base MSR + * @high: End index + */ +struct msr_param { + struct rdt_resource *res; + int low; + int high; +}; + +/** + * struct rdt_cache - Cache allocation related data + * @cbm_len: Length of the cache bit mask + * @min_cbm_bits: Minimum number of consecutive bits to be set + * @cbm_idx_mult: Multiplier of CBM index + * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: + * closid * cbm_idx_multi + cbm_idx_offset + * in a cache bit mask + * @shareable_bits: Bitmask of shareable resource with other + * executing entities + */ +struct rdt_cache { + unsigned int cbm_len; + unsigned int min_cbm_bits; + unsigned int cbm_idx_mult; + unsigned int cbm_idx_offset; + unsigned int shareable_bits; +}; + +/** + * struct rdt_membw - Memory bandwidth allocation related data + * @max_delay: Max throttle delay. Delay is the hardware + * representation for memory bandwidth. + * @min_bw: Minimum memory bandwidth percentage user can request + * @bw_gran: Granularity at which the memory bandwidth is allocated + * @delay_linear: True if memory B/W delay is in linear scale + * @mb_map: Mapping of memory B/W percentage to memory B/W delay + */ +struct rdt_membw { + u32 max_delay; + u32 min_bw; + u32 bw_gran; + u32 delay_linear; + u32 *mb_map; +}; + +static inline bool is_llc_occupancy_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID)); +} + +static inline bool is_mbm_total_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID)); +} + +static inline bool is_mbm_local_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID)); +} + +static inline bool is_mbm_enabled(void) +{ + return (is_mbm_total_enabled() || is_mbm_local_enabled()); +} + +static inline bool is_mbm_event(int e) +{ + return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && + e <= QOS_L3_MBM_LOCAL_EVENT_ID); +} + +/** + * struct rdt_resource - attributes of an RDT resource + * @rid: The index of the resource + * @alloc_enabled: Is allocation enabled on this machine + * @mon_enabled: Is monitoring enabled for this feature + * @alloc_capable: Is allocation available on this machine + * @mon_capable: Is monitor feature available on this machine + * @name: Name to use in "schemata" file + * @num_closid: Number of CLOSIDs available + * @cache_level: Which cache level defines scope of this resource + * @default_ctrl: Specifies default cache cbm or memory B/W percent. + * @msr_base: Base MSR address for CBMs + * @msr_update: Function pointer to update QOS MSRs + * @data_width: Character width of data when displaying + * @domains: All domains for this resource + * @cache: Cache allocation related data + * @format_str: Per resource format string to show domain value + * @parse_ctrlval: Per resource function pointer to parse control values + * @evt_list: List of monitoring events + * @num_rmid: Number of RMIDs available + * @mon_scale: cqm counter * mon_scale = occupancy in bytes + * @fflags: flags to choose base and info files + */ +struct rdt_resource { + int rid; + bool alloc_enabled; + bool mon_enabled; + bool alloc_capable; + bool mon_capable; + char *name; + int num_closid; + int cache_level; + u32 default_ctrl; + unsigned int msr_base; + void (*msr_update) (struct rdt_domain *d, struct msr_param *m, + struct rdt_resource *r); + int data_width; + struct list_head domains; + struct rdt_cache cache; + struct rdt_membw membw; + const char *format_str; + int (*parse_ctrlval) (char *buf, struct rdt_resource *r, + struct rdt_domain *d); + struct list_head evt_list; + int num_rmid; + unsigned int mon_scale; + unsigned long fflags; +}; + +int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d); +int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d); + +extern struct mutex rdtgroup_mutex; + +extern struct rdt_resource rdt_resources_all[]; +extern struct rdtgroup rdtgroup_default; +DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); + +int __init rdtgroup_init(void); + +enum { + RDT_RESOURCE_L3, + RDT_RESOURCE_L3DATA, + RDT_RESOURCE_L3CODE, + RDT_RESOURCE_L2, + RDT_RESOURCE_MBA, + + /* Must be the last */ + RDT_NUM_RESOURCES, +}; + +#define for_each_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->alloc_capable || r->mon_capable) + +#define for_each_alloc_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->alloc_capable) + +#define for_each_mon_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->mon_capable) + +#define for_each_alloc_enabled_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->alloc_enabled) + +#define for_each_mon_enabled_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->mon_enabled) + +/* CPUID.(EAX=10H, ECX=ResID=1).EAX */ +union cpuid_0x10_1_eax { + struct { + unsigned int cbm_len:5; + } split; + unsigned int full; +}; + +/* CPUID.(EAX=10H, ECX=ResID=3).EAX */ +union cpuid_0x10_3_eax { + struct { + unsigned int max_delay:12; + } split; + unsigned int full; +}; + +/* CPUID.(EAX=10H, ECX=ResID).EDX */ +union cpuid_0x10_x_edx { + struct { + unsigned int cos_max:16; + } split; + unsigned int full; +}; + +void rdt_ctrl_update(void *arg); +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); +void rdtgroup_kn_unlock(struct kernfs_node *kn); +struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, + struct list_head **pos); +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); +struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); +int alloc_rmid(void); +void free_rmid(u32 rmid); +int rdt_get_mon_l3_config(struct rdt_resource *r); +void mon_event_count(void *info); +int rdtgroup_mondata_show(struct seq_file *m, void *arg); +void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + unsigned int dom_id); +void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_domain *d); +void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, + struct rdtgroup *rdtgrp, int evtid, int first); +void mbm_setup_overflow_handler(struct rdt_domain *dom, + unsigned long delay_ms); +void mbm_handle_overflow(struct work_struct *work); +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); +void cqm_handle_limbo(struct work_struct *work); +bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); +void __check_limbo(struct rdt_domain *d, bool force_free); + +#endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index 406d7a6532f9..f6ea94f8954a 100644 --- a/arch/x86/kernel/cpu/intel_rdt_schemata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -26,7 +26,7 @@ #include <linux/kernfs.h> #include <linux/seq_file.h> #include <linux/slab.h> -#include <asm/intel_rdt.h> +#include "intel_rdt.h" /* * Check whether MBA bandwidth percentage value is correct. The value is @@ -192,7 +192,7 @@ static int rdtgroup_parse_resource(char *resname, char *tok, int closid) { struct rdt_resource *r; - for_each_enabled_rdt_resource(r) { + for_each_alloc_enabled_rdt_resource(r) { if (!strcmp(resname, r->name) && closid < r->num_closid) return parse_line(tok, r); } @@ -221,7 +221,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, closid = rdtgrp->closid; - for_each_enabled_rdt_resource(r) { + for_each_alloc_enabled_rdt_resource(r) { list_for_each_entry(dom, &r->domains, list) dom->have_new_ctrl = false; } @@ -237,7 +237,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, goto out; } - for_each_enabled_rdt_resource(r) { + for_each_alloc_enabled_rdt_resource(r) { ret = update_domains(r, closid); if (ret) goto out; @@ -269,12 +269,13 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, { struct rdtgroup *rdtgrp; struct rdt_resource *r; - int closid, ret = 0; + int ret = 0; + u32 closid; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (rdtgrp) { closid = rdtgrp->closid; - for_each_enabled_rdt_resource(r) { + for_each_alloc_enabled_rdt_resource(r) { if (closid < r->num_closid) show_doms(s, r, closid); } @@ -284,3 +285,57 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, rdtgroup_kn_unlock(of->kn); return ret; } + +void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, + struct rdtgroup *rdtgrp, int evtid, int first) +{ + /* + * setup the parameters to send to the IPI to read the data. + */ + rr->rgrp = rdtgrp; + rr->evtid = evtid; + rr->d = d; + rr->val = 0; + rr->first = first; + + smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); +} + +int rdtgroup_mondata_show(struct seq_file *m, void *arg) +{ + struct kernfs_open_file *of = m->private; + u32 resid, evtid, domid; + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + union mon_data_bits md; + struct rdt_domain *d; + struct rmid_read rr; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + + md.priv = of->kn->priv; + resid = md.u.rid; + domid = md.u.domid; + evtid = md.u.evtid; + + r = &rdt_resources_all[resid]; + d = rdt_find_domain(r, domid, NULL); + if (!d) { + ret = -ENOENT; + goto out; + } + + mon_event_read(&rr, d, rdtgrp, evtid, false); + + if (rr.val & RMID_VAL_ERROR) + seq_puts(m, "Error\n"); + else if (rr.val & RMID_VAL_UNAVAIL) + seq_puts(m, "Unavailable\n"); + else + seq_printf(m, "%llu\n", rr.val * r->mon_scale); + +out: + rdtgroup_kn_unlock(of->kn); + return ret; +} diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c new file mode 100644 index 000000000000..30827510094b --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c @@ -0,0 +1,499 @@ +/* + * Resource Director Technology(RDT) + * - Monitoring code + * + * Copyright (C) 2017 Intel Corporation + * + * Author: + * Vikas Shivappa <vikas.shivappa@intel.com> + * + * This replaces the cqm.c based on perf but we reuse a lot of + * code and datastructures originally from Peter Zijlstra and Matt Fleming. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <asm/cpu_device_id.h> +#include "intel_rdt.h" + +#define MSR_IA32_QM_CTR 0x0c8e +#define MSR_IA32_QM_EVTSEL 0x0c8d + +struct rmid_entry { + u32 rmid; + int busy; + struct list_head list; +}; + +/** + * @rmid_free_lru A least recently used list of free RMIDs + * These RMIDs are guaranteed to have an occupancy less than the + * threshold occupancy + */ +static LIST_HEAD(rmid_free_lru); + +/** + * @rmid_limbo_count count of currently unused but (potentially) + * dirty RMIDs. + * This counts RMIDs that no one is currently using but that + * may have a occupancy value > intel_cqm_threshold. User can change + * the threshold occupancy value. + */ +unsigned int rmid_limbo_count; + +/** + * @rmid_entry - The entry in the limbo and free lists. + */ +static struct rmid_entry *rmid_ptrs; + +/* + * Global boolean for rdt_monitor which is true if any + * resource monitoring is enabled. + */ +bool rdt_mon_capable; + +/* + * Global to indicate which monitoring events are enabled. + */ +unsigned int rdt_mon_features; + +/* + * This is the threshold cache occupancy at which we will consider an + * RMID available for re-allocation. + */ +unsigned int intel_cqm_threshold; + +static inline struct rmid_entry *__rmid_entry(u32 rmid) +{ + struct rmid_entry *entry; + + entry = &rmid_ptrs[rmid]; + WARN_ON(entry->rmid != rmid); + + return entry; +} + +static u64 __rmid_read(u32 rmid, u32 eventid) +{ + u64 val; + + /* + * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured + * with a valid event code for supported resource type and the bits + * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID, + * IA32_QM_CTR.data (bits 61:0) reports the monitored data. + * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62) + * are error bits. + */ + wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); + rdmsrl(MSR_IA32_QM_CTR, val); + + return val; +} + +static bool rmid_dirty(struct rmid_entry *entry) +{ + u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); + + return val >= intel_cqm_threshold; +} + +/* + * Check the RMIDs that are marked as busy for this domain. If the + * reported LLC occupancy is below the threshold clear the busy bit and + * decrement the count. If the busy count gets to zero on an RMID, we + * free the RMID + */ +void __check_limbo(struct rdt_domain *d, bool force_free) +{ + struct rmid_entry *entry; + struct rdt_resource *r; + u32 crmid = 1, nrmid; + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + /* + * Skip RMID 0 and start from RMID 1 and check all the RMIDs that + * are marked as busy for occupancy < threshold. If the occupancy + * is less than the threshold decrement the busy counter of the + * RMID and move it to the free list when the counter reaches 0. + */ + for (;;) { + nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid); + if (nrmid >= r->num_rmid) + break; + + entry = __rmid_entry(nrmid); + if (force_free || !rmid_dirty(entry)) { + clear_bit(entry->rmid, d->rmid_busy_llc); + if (!--entry->busy) { + rmid_limbo_count--; + list_add_tail(&entry->list, &rmid_free_lru); + } + } + crmid = nrmid + 1; + } +} + +bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d) +{ + return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid; +} + +/* + * As of now the RMIDs allocation is global. + * However we keep track of which packages the RMIDs + * are used to optimize the limbo list management. + */ +int alloc_rmid(void) +{ + struct rmid_entry *entry; + + lockdep_assert_held(&rdtgroup_mutex); + + if (list_empty(&rmid_free_lru)) + return rmid_limbo_count ? -EBUSY : -ENOSPC; + + entry = list_first_entry(&rmid_free_lru, + struct rmid_entry, list); + list_del(&entry->list); + + return entry->rmid; +} + +static void add_rmid_to_limbo(struct rmid_entry *entry) +{ + struct rdt_resource *r; + struct rdt_domain *d; + int cpu; + u64 val; + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + entry->busy = 0; + cpu = get_cpu(); + list_for_each_entry(d, &r->domains, list) { + if (cpumask_test_cpu(cpu, &d->cpu_mask)) { + val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); + if (val <= intel_cqm_threshold) + continue; + } + + /* + * For the first limbo RMID in the domain, + * setup up the limbo worker. + */ + if (!has_busy_rmid(r, d)) + cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL); + set_bit(entry->rmid, d->rmid_busy_llc); + entry->busy++; + } + put_cpu(); + + if (entry->busy) + rmid_limbo_count++; + else + list_add_tail(&entry->list, &rmid_free_lru); +} + +void free_rmid(u32 rmid) +{ + struct rmid_entry *entry; + + if (!rmid) + return; + + lockdep_assert_held(&rdtgroup_mutex); + + entry = __rmid_entry(rmid); + + if (is_llc_occupancy_enabled()) + add_rmid_to_limbo(entry); + else + list_add_tail(&entry->list, &rmid_free_lru); +} + +static int __mon_event_count(u32 rmid, struct rmid_read *rr) +{ + u64 chunks, shift, tval; + struct mbm_state *m; + + tval = __rmid_read(rmid, rr->evtid); + if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) { + rr->val = tval; + return -EINVAL; + } + switch (rr->evtid) { + case QOS_L3_OCCUP_EVENT_ID: + rr->val += tval; + return 0; + case QOS_L3_MBM_TOTAL_EVENT_ID: + m = &rr->d->mbm_total[rmid]; + break; + case QOS_L3_MBM_LOCAL_EVENT_ID: + m = &rr->d->mbm_local[rmid]; + break; + default: + /* + * Code would never reach here because + * an invalid event id would fail the __rmid_read. + */ + return -EINVAL; + } + + if (rr->first) { + m->prev_msr = tval; + m->chunks = 0; + return 0; + } + + shift = 64 - MBM_CNTR_WIDTH; + chunks = (tval << shift) - (m->prev_msr << shift); + chunks >>= shift; + m->chunks += chunks; + m->prev_msr = tval; + + rr->val += m->chunks; + return 0; +} + +/* + * This is called via IPI to read the CQM/MBM counters + * on a domain. + */ +void mon_event_count(void *info) +{ + struct rdtgroup *rdtgrp, *entry; + struct rmid_read *rr = info; + struct list_head *head; + + rdtgrp = rr->rgrp; + + if (__mon_event_count(rdtgrp->mon.rmid, rr)) + return; + + /* + * For Ctrl groups read data from child monitor groups. + */ + head = &rdtgrp->mon.crdtgrp_list; + + if (rdtgrp->type == RDTCTRL_GROUP) { + list_for_each_entry(entry, head, mon.crdtgrp_list) { + if (__mon_event_count(entry->mon.rmid, rr)) + return; + } + } +} + +static void mbm_update(struct rdt_domain *d, int rmid) +{ + struct rmid_read rr; + + rr.first = false; + rr.d = d; + + /* + * This is protected from concurrent reads from user + * as both the user and we hold the global mutex. + */ + if (is_mbm_total_enabled()) { + rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; + __mon_event_count(rmid, &rr); + } + if (is_mbm_local_enabled()) { + rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; + __mon_event_count(rmid, &rr); + } +} + +/* + * Handler to scan the limbo list and move the RMIDs + * to free list whose occupancy < threshold_occupancy. + */ +void cqm_handle_limbo(struct work_struct *work) +{ + unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); + int cpu = smp_processor_id(); + struct rdt_resource *r; + struct rdt_domain *d; + + mutex_lock(&rdtgroup_mutex); + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + d = get_domain_from_cpu(cpu, r); + + if (!d) { + pr_warn_once("Failure to get domain for limbo worker\n"); + goto out_unlock; + } + + __check_limbo(d, false); + + if (has_busy_rmid(r, d)) + schedule_delayed_work_on(cpu, &d->cqm_limbo, delay); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); +} + +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms) +{ + unsigned long delay = msecs_to_jiffies(delay_ms); + struct rdt_resource *r; + int cpu; + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + cpu = cpumask_any(&dom->cpu_mask); + dom->cqm_work_cpu = cpu; + + schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); +} + +void mbm_handle_overflow(struct work_struct *work) +{ + unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); + struct rdtgroup *prgrp, *crgrp; + int cpu = smp_processor_id(); + struct list_head *head; + struct rdt_domain *d; + + mutex_lock(&rdtgroup_mutex); + + if (!static_branch_likely(&rdt_enable_key)) + goto out_unlock; + + d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]); + if (!d) + goto out_unlock; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + mbm_update(d, prgrp->mon.rmid); + + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) + mbm_update(d, crgrp->mon.rmid); + } + + schedule_delayed_work_on(cpu, &d->mbm_over, delay); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); +} + +void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) +{ + unsigned long delay = msecs_to_jiffies(delay_ms); + int cpu; + + if (!static_branch_likely(&rdt_enable_key)) + return; + cpu = cpumask_any(&dom->cpu_mask); + dom->mbm_work_cpu = cpu; + schedule_delayed_work_on(cpu, &dom->mbm_over, delay); +} + +static int dom_data_init(struct rdt_resource *r) +{ + struct rmid_entry *entry = NULL; + int i, nr_rmids; + + nr_rmids = r->num_rmid; + rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL); + if (!rmid_ptrs) + return -ENOMEM; + + for (i = 0; i < nr_rmids; i++) { + entry = &rmid_ptrs[i]; + INIT_LIST_HEAD(&entry->list); + + entry->rmid = i; + list_add_tail(&entry->list, &rmid_free_lru); + } + + /* + * RMID 0 is special and is always allocated. It's used for all + * tasks that are not monitored. + */ + entry = __rmid_entry(0); + list_del(&entry->list); + + return 0; +} + +static struct mon_evt llc_occupancy_event = { + .name = "llc_occupancy", + .evtid = QOS_L3_OCCUP_EVENT_ID, +}; + +static struct mon_evt mbm_total_event = { + .name = "mbm_total_bytes", + .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, +}; + +static struct mon_evt mbm_local_event = { + .name = "mbm_local_bytes", + .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, +}; + +/* + * Initialize the event list for the resource. + * + * Note that MBM events are also part of RDT_RESOURCE_L3 resource + * because as per the SDM the total and local memory bandwidth + * are enumerated as part of L3 monitoring. + */ +static void l3_mon_evt_init(struct rdt_resource *r) +{ + INIT_LIST_HEAD(&r->evt_list); + + if (is_llc_occupancy_enabled()) + list_add_tail(&llc_occupancy_event.list, &r->evt_list); + if (is_mbm_total_enabled()) + list_add_tail(&mbm_total_event.list, &r->evt_list); + if (is_mbm_local_enabled()) + list_add_tail(&mbm_local_event.list, &r->evt_list); +} + +int rdt_get_mon_l3_config(struct rdt_resource *r) +{ + int ret; + + r->mon_scale = boot_cpu_data.x86_cache_occ_scale; + r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; + + /* + * A reasonable upper limit on the max threshold is the number + * of lines tagged per RMID if all RMIDs have the same number of + * lines tagged in the LLC. + * + * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. + */ + intel_cqm_threshold = boot_cpu_data.x86_cache_size * 1024 / r->num_rmid; + + /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */ + intel_cqm_threshold /= r->mon_scale; + + ret = dom_data_init(r); + if (ret) + return ret; + + l3_mon_evt_init(r); + + r->mon_capable = true; + r->mon_enabled = true; + + return 0; +} diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 9257bd9dc664..a869d4a073c5 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -32,17 +32,25 @@ #include <uapi/linux/magic.h> -#include <asm/intel_rdt.h> -#include <asm/intel_rdt_common.h> +#include <asm/intel_rdt_sched.h> +#include "intel_rdt.h" DEFINE_STATIC_KEY_FALSE(rdt_enable_key); -struct kernfs_root *rdt_root; +DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); +DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); +static struct kernfs_root *rdt_root; struct rdtgroup rdtgroup_default; LIST_HEAD(rdt_all_groups); /* Kernel fs node for "info" directory under root */ static struct kernfs_node *kn_info; +/* Kernel fs node for "mon_groups" directory under root */ +static struct kernfs_node *kn_mongrp; + +/* Kernel fs node for "mon_data" directory under root */ +static struct kernfs_node *kn_mondata; + /* * Trivial allocator for CLOSIDs. Since h/w only supports a small number, * we can keep a bitmap of free CLOSIDs in a single integer. @@ -66,7 +74,7 @@ static void closid_init(void) int rdt_min_closid = 32; /* Compute rdt_min_closid across all resources */ - for_each_enabled_rdt_resource(r) + for_each_alloc_enabled_rdt_resource(r) rdt_min_closid = min(rdt_min_closid, r->num_closid); closid_free_map = BIT_MASK(rdt_min_closid) - 1; @@ -75,9 +83,9 @@ static void closid_init(void) closid_free_map &= ~1; } -int closid_alloc(void) +static int closid_alloc(void) { - int closid = ffs(closid_free_map); + u32 closid = ffs(closid_free_map); if (closid == 0) return -ENOSPC; @@ -125,28 +133,6 @@ static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) return 0; } -static int rdtgroup_add_files(struct kernfs_node *kn, struct rftype *rfts, - int len) -{ - struct rftype *rft; - int ret; - - lockdep_assert_held(&rdtgroup_mutex); - - for (rft = rfts; rft < rfts + len; rft++) { - ret = rdtgroup_add_file(kn, rft); - if (ret) - goto error; - } - - return 0; -error: - pr_warn("Failed to add %s, err=%d\n", rft->name, ret); - while (--rft >= rfts) - kernfs_remove_by_name(kn, rft->name); - return ret; -} - static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; @@ -174,6 +160,11 @@ static struct kernfs_ops rdtgroup_kf_single_ops = { .seq_show = rdtgroup_seqfile_show, }; +static struct kernfs_ops kf_mondata_ops = { + .atomic_write_len = PAGE_SIZE, + .seq_show = rdtgroup_mondata_show, +}; + static bool is_cpu_list(struct kernfs_open_file *of) { struct rftype *rft = of->kn->priv; @@ -203,13 +194,18 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, /* * This is safe against intel_rdt_sched_in() called from __switch_to() * because __switch_to() is executed with interrupts disabled. A local call - * from rdt_update_closid() is proteced against __switch_to() because + * from update_closid_rmid() is proteced against __switch_to() because * preemption is disabled. */ -static void rdt_update_cpu_closid(void *closid) +static void update_cpu_closid_rmid(void *info) { - if (closid) - this_cpu_write(cpu_closid, *(int *)closid); + struct rdtgroup *r = info; + + if (r) { + this_cpu_write(pqr_state.default_closid, r->closid); + this_cpu_write(pqr_state.default_rmid, r->mon.rmid); + } + /* * We cannot unconditionally write the MSR because the current * executing task might have its own closid selected. Just reuse @@ -221,28 +217,128 @@ static void rdt_update_cpu_closid(void *closid) /* * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, * - * Per task closids must have been set up before calling this function. - * - * The per cpu closids are updated with the smp function call, when @closid - * is not NULL. If @closid is NULL then all affected percpu closids must - * have been set up before calling this function. + * Per task closids/rmids must have been set up before calling this function. */ static void -rdt_update_closid(const struct cpumask *cpu_mask, int *closid) +update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) { int cpu = get_cpu(); if (cpumask_test_cpu(cpu, cpu_mask)) - rdt_update_cpu_closid(closid); - smp_call_function_many(cpu_mask, rdt_update_cpu_closid, closid, 1); + update_cpu_closid_rmid(r); + smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1); put_cpu(); } +static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, + cpumask_var_t tmpmask) +{ + struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp; + struct list_head *head; + + /* Check whether cpus belong to parent ctrl group */ + cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); + if (cpumask_weight(tmpmask)) + return -EINVAL; + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (cpumask_weight(tmpmask)) { + /* Give any dropped cpus to parent rdtgroup */ + cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); + update_closid_rmid(tmpmask, prgrp); + } + + /* + * If we added cpus, remove them from previous group that owned them + * and update per-cpu rmid + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (cpumask_weight(tmpmask)) { + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + if (crgrp == rdtgrp) + continue; + cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask, + tmpmask); + } + update_closid_rmid(tmpmask, rdtgrp); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + + return 0; +} + +static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m) +{ + struct rdtgroup *crgrp; + + cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m); + /* update the child mon group masks as well*/ + list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list) + cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask); +} + +static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, + cpumask_var_t tmpmask, cpumask_var_t tmpmask1) +{ + struct rdtgroup *r, *crgrp; + struct list_head *head; + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (cpumask_weight(tmpmask)) { + /* Can't drop from default group */ + if (rdtgrp == &rdtgroup_default) + return -EINVAL; + + /* Give any dropped cpus to rdtgroup_default */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, tmpmask); + update_closid_rmid(tmpmask, &rdtgroup_default); + } + + /* + * If we added cpus, remove them from previous group and + * the prev group's child groups that owned them + * and update per-cpu closid/rmid. + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (cpumask_weight(tmpmask)) { + list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { + if (r == rdtgrp) + continue; + cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); + if (cpumask_weight(tmpmask1)) + cpumask_rdtgrp_clear(r, tmpmask1); + } + update_closid_rmid(tmpmask, rdtgrp); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + + /* + * Clear child mon group masks since there is a new parent mask + * now and update the rmid for the cpus the child lost. + */ + head = &rdtgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask); + update_closid_rmid(tmpmask, rdtgrp); + cpumask_clear(&crgrp->cpu_mask); + } + + return 0; +} + static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - cpumask_var_t tmpmask, newmask; - struct rdtgroup *rdtgrp, *r; + cpumask_var_t tmpmask, newmask, tmpmask1; + struct rdtgroup *rdtgrp; int ret; if (!buf) @@ -254,6 +350,11 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, free_cpumask_var(tmpmask); return -ENOMEM; } + if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) { + free_cpumask_var(tmpmask); + free_cpumask_var(newmask); + return -ENOMEM; + } rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { @@ -276,41 +377,18 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, goto unlock; } - /* Check whether cpus are dropped from this group */ - cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (cpumask_weight(tmpmask)) { - /* Can't drop from default group */ - if (rdtgrp == &rdtgroup_default) { - ret = -EINVAL; - goto unlock; - } - /* Give any dropped cpus to rdtgroup_default */ - cpumask_or(&rdtgroup_default.cpu_mask, - &rdtgroup_default.cpu_mask, tmpmask); - rdt_update_closid(tmpmask, &rdtgroup_default.closid); - } - - /* - * If we added cpus, remove them from previous group that owned them - * and update per-cpu closid - */ - cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (cpumask_weight(tmpmask)) { - list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { - if (r == rdtgrp) - continue; - cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask); - } - rdt_update_closid(tmpmask, &rdtgrp->closid); - } - - /* Done pushing/pulling - update this group with new mask */ - cpumask_copy(&rdtgrp->cpu_mask, newmask); + if (rdtgrp->type == RDTCTRL_GROUP) + ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1); + else if (rdtgrp->type == RDTMON_GROUP) + ret = cpus_mon_write(rdtgrp, newmask, tmpmask); + else + ret = -EINVAL; unlock: rdtgroup_kn_unlock(of->kn); free_cpumask_var(tmpmask); free_cpumask_var(newmask); + free_cpumask_var(tmpmask1); return ret ?: nbytes; } @@ -336,6 +414,7 @@ static void move_myself(struct callback_head *head) if (atomic_dec_and_test(&rdtgrp->waitcount) && (rdtgrp->flags & RDT_DELETED)) { current->closid = 0; + current->rmid = 0; kfree(rdtgrp); } @@ -374,7 +453,20 @@ static int __rdtgroup_move_task(struct task_struct *tsk, atomic_dec(&rdtgrp->waitcount); kfree(callback); } else { - tsk->closid = rdtgrp->closid; + /* + * For ctrl_mon groups move both closid and rmid. + * For monitor groups, can move the tasks only from + * their parent CTRL group. + */ + if (rdtgrp->type == RDTCTRL_GROUP) { + tsk->closid = rdtgrp->closid; + tsk->rmid = rdtgrp->mon.rmid; + } else if (rdtgrp->type == RDTMON_GROUP) { + if (rdtgrp->mon.parent->closid == tsk->closid) + tsk->rmid = rdtgrp->mon.rmid; + else + ret = -EINVAL; + } } return ret; } @@ -454,7 +546,8 @@ static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) rcu_read_lock(); for_each_process_thread(p, t) { - if (t->closid == r->closid) + if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || + (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) seq_printf(s, "%d\n", t->pid); } rcu_read_unlock(); @@ -476,39 +569,6 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of, return ret; } -/* Files in each rdtgroup */ -static struct rftype rdtgroup_base_files[] = { - { - .name = "cpus", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - }, - { - .name = "cpus_list", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - .flags = RFTYPE_FLAGS_CPUS_LIST, - }, - { - .name = "tasks", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_tasks_write, - .seq_show = rdtgroup_tasks_show, - }, - { - .name = "schemata", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_schemata_write, - .seq_show = rdtgroup_schemata_show, - }, -}; - static int rdt_num_closids_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -536,6 +596,15 @@ static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, return 0; } +static int rdt_shareable_bits_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%x\n", r->cache.shareable_bits); + return 0; +} + static int rdt_min_bw_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -545,6 +614,28 @@ static int rdt_min_bw_show(struct kernfs_open_file *of, return 0; } +static int rdt_num_rmids_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%d\n", r->num_rmid); + + return 0; +} + +static int rdt_mon_features_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + struct mon_evt *mevt; + + list_for_each_entry(mevt, &r->evt_list, list) + seq_printf(seq, "%s\n", mevt->name); + + return 0; +} + static int rdt_bw_gran_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -563,74 +654,200 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of, return 0; } +static int max_threshold_occ_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%u\n", intel_cqm_threshold * r->mon_scale); + + return 0; +} + +static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdt_resource *r = of->kn->parent->priv; + unsigned int bytes; + int ret; + + ret = kstrtouint(buf, 0, &bytes); + if (ret) + return ret; + + if (bytes > (boot_cpu_data.x86_cache_size * 1024)) + return -EINVAL; + + intel_cqm_threshold = bytes / r->mon_scale; + + return nbytes; +} + /* rdtgroup information files for one cache resource. */ -static struct rftype res_cache_info_files[] = { +static struct rftype res_common_files[] = { { .name = "num_closids", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_num_closids_show, + .fflags = RF_CTRL_INFO, + }, + { + .name = "mon_features", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_mon_features_show, + .fflags = RF_MON_INFO, + }, + { + .name = "num_rmids", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_num_rmids_show, + .fflags = RF_MON_INFO, }, { .name = "cbm_mask", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_default_ctrl_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "min_cbm_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_min_cbm_bits_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, }, -}; - -/* rdtgroup information files for memory bandwidth. */ -static struct rftype res_mba_info_files[] = { { - .name = "num_closids", + .name = "shareable_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_num_closids_show, + .seq_show = rdt_shareable_bits_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "min_bandwidth", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_min_bw_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, }, { .name = "bandwidth_gran", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_bw_gran_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, }, { .name = "delay_linear", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_delay_linear_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + }, + { + .name = "max_threshold_occupancy", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = max_threshold_occ_write, + .seq_show = max_threshold_occ_show, + .fflags = RF_MON_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "cpus", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .fflags = RFTYPE_BASE, + }, + { + .name = "cpus_list", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .flags = RFTYPE_FLAGS_CPUS_LIST, + .fflags = RFTYPE_BASE, + }, + { + .name = "tasks", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_tasks_write, + .seq_show = rdtgroup_tasks_show, + .fflags = RFTYPE_BASE, + }, + { + .name = "schemata", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_schemata_write, + .seq_show = rdtgroup_schemata_show, + .fflags = RF_CTRL_BASE, }, }; -void rdt_get_mba_infofile(struct rdt_resource *r) +static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) { - r->info_files = res_mba_info_files; - r->nr_info_files = ARRAY_SIZE(res_mba_info_files); + struct rftype *rfts, *rft; + int ret, len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + lockdep_assert_held(&rdtgroup_mutex); + + for (rft = rfts; rft < rfts + len; rft++) { + if ((fflags & rft->fflags) == rft->fflags) { + ret = rdtgroup_add_file(kn, rft); + if (ret) + goto error; + } + } + + return 0; +error: + pr_warn("Failed to add %s, err=%d\n", rft->name, ret); + while (--rft >= rfts) { + if ((fflags & rft->fflags) == rft->fflags) + kernfs_remove_by_name(kn, rft->name); + } + return ret; } -void rdt_get_cache_infofile(struct rdt_resource *r) +static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name, + unsigned long fflags) { - r->info_files = res_cache_info_files; - r->nr_info_files = ARRAY_SIZE(res_cache_info_files); + struct kernfs_node *kn_subdir; + int ret; + + kn_subdir = kernfs_create_dir(kn_info, name, + kn_info->mode, r); + if (IS_ERR(kn_subdir)) + return PTR_ERR(kn_subdir); + + kernfs_get(kn_subdir); + ret = rdtgroup_kn_set_ugid(kn_subdir); + if (ret) + return ret; + + ret = rdtgroup_add_files(kn_subdir, fflags); + if (!ret) + kernfs_activate(kn_subdir); + + return ret; } static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) { - struct kernfs_node *kn_subdir; - struct rftype *res_info_files; struct rdt_resource *r; - int ret, len; + unsigned long fflags; + char name[32]; + int ret; /* create the directory */ kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); @@ -638,25 +855,19 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) return PTR_ERR(kn_info); kernfs_get(kn_info); - for_each_enabled_rdt_resource(r) { - kn_subdir = kernfs_create_dir(kn_info, r->name, - kn_info->mode, r); - if (IS_ERR(kn_subdir)) { - ret = PTR_ERR(kn_subdir); - goto out_destroy; - } - kernfs_get(kn_subdir); - ret = rdtgroup_kn_set_ugid(kn_subdir); + for_each_alloc_enabled_rdt_resource(r) { + fflags = r->fflags | RF_CTRL_INFO; + ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags); if (ret) goto out_destroy; + } - res_info_files = r->info_files; - len = r->nr_info_files; - - ret = rdtgroup_add_files(kn_subdir, res_info_files, len); + for_each_mon_enabled_rdt_resource(r) { + fflags = r->fflags | RF_MON_INFO; + sprintf(name, "%s_MON", r->name); + ret = rdtgroup_mkdir_info_resdir(r, name, fflags); if (ret) goto out_destroy; - kernfs_activate(kn_subdir); } /* @@ -678,6 +889,39 @@ out_destroy: return ret; } +static int +mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, + char *name, struct kernfs_node **dest_kn) +{ + struct kernfs_node *kn; + int ret; + + /* create the directory */ + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + if (dest_kn) + *dest_kn = kn; + + /* + * This extra ref will be put in kernfs_remove() and guarantees + * that @rdtgrp->kn is always accessible. + */ + kernfs_get(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + kernfs_activate(kn); + + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} static void l3_qos_cfg_update(void *arg) { bool *enable = arg; @@ -718,14 +962,15 @@ static int cdp_enable(void) struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; int ret; - if (!r_l3->capable || !r_l3data->capable || !r_l3code->capable) + if (!r_l3->alloc_capable || !r_l3data->alloc_capable || + !r_l3code->alloc_capable) return -EINVAL; ret = set_l3_qos_cfg(r_l3, true); if (!ret) { - r_l3->enabled = false; - r_l3data->enabled = true; - r_l3code->enabled = true; + r_l3->alloc_enabled = false; + r_l3data->alloc_enabled = true; + r_l3code->alloc_enabled = true; } return ret; } @@ -734,11 +979,11 @@ static void cdp_disable(void) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; - r->enabled = r->capable; + r->alloc_enabled = r->alloc_capable; - if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) { - rdt_resources_all[RDT_RESOURCE_L3DATA].enabled = false; - rdt_resources_all[RDT_RESOURCE_L3CODE].enabled = false; + if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) { + rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled = false; + rdt_resources_all[RDT_RESOURCE_L3CODE].alloc_enabled = false; set_l3_qos_cfg(r, false); } } @@ -823,10 +1068,16 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) } } +static int mkdir_mondata_all(struct kernfs_node *parent_kn, + struct rdtgroup *prgrp, + struct kernfs_node **mon_data_kn); + static struct dentry *rdt_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + struct rdt_domain *dom; + struct rdt_resource *r; struct dentry *dentry; int ret; @@ -853,15 +1104,54 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, goto out_cdp; } + if (rdt_mon_capable) { + ret = mongroup_create_dir(rdtgroup_default.kn, + NULL, "mon_groups", + &kn_mongrp); + if (ret) { + dentry = ERR_PTR(ret); + goto out_info; + } + kernfs_get(kn_mongrp); + + ret = mkdir_mondata_all(rdtgroup_default.kn, + &rdtgroup_default, &kn_mondata); + if (ret) { + dentry = ERR_PTR(ret); + goto out_mongrp; + } + kernfs_get(kn_mondata); + rdtgroup_default.mon.mon_data_kn = kn_mondata; + } + dentry = kernfs_mount(fs_type, flags, rdt_root, RDTGROUP_SUPER_MAGIC, NULL); if (IS_ERR(dentry)) - goto out_destroy; + goto out_mondata; + + if (rdt_alloc_capable) + static_branch_enable(&rdt_alloc_enable_key); + if (rdt_mon_capable) + static_branch_enable(&rdt_mon_enable_key); + + if (rdt_alloc_capable || rdt_mon_capable) + static_branch_enable(&rdt_enable_key); + + if (is_mbm_enabled()) { + r = &rdt_resources_all[RDT_RESOURCE_L3]; + list_for_each_entry(dom, &r->domains, list) + mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL); + } - static_branch_enable(&rdt_enable_key); goto out; -out_destroy: +out_mondata: + if (rdt_mon_capable) + kernfs_remove(kn_mondata); +out_mongrp: + if (rdt_mon_capable) + kernfs_remove(kn_mongrp); +out_info: kernfs_remove(kn_info); out_cdp: cdp_disable(); @@ -909,6 +1199,18 @@ static int reset_all_ctrls(struct rdt_resource *r) return 0; } +static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_alloc_capable && + (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); +} + +static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_mon_capable && + (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); +} + /* * Move tasks from one to the other group. If @from is NULL, then all tasks * in the systems are moved unconditionally (used for teardown). @@ -924,8 +1226,11 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, read_lock(&tasklist_lock); for_each_process_thread(p, t) { - if (!from || t->closid == from->closid) { + if (!from || is_closid_match(t, from) || + is_rmid_match(t, from)) { t->closid = to->closid; + t->rmid = to->mon.rmid; + #ifdef CONFIG_SMP /* * This is safe on x86 w/o barriers as the ordering @@ -944,6 +1249,19 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, read_unlock(&tasklist_lock); } +static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) +{ + struct rdtgroup *sentry, *stmp; + struct list_head *head; + + head = &rdtgrp->mon.crdtgrp_list; + list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { + free_rmid(sentry->mon.rmid); + list_del(&sentry->mon.crdtgrp_list); + kfree(sentry); + } +} + /* * Forcibly remove all of subdirectories under root. */ @@ -955,6 +1273,9 @@ static void rmdir_all_sub(void) rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { + /* Free any child rmids */ + free_all_child_rdtgrp(rdtgrp); + /* Remove each rdtgroup other than root */ if (rdtgrp == &rdtgroup_default) continue; @@ -967,16 +1288,20 @@ static void rmdir_all_sub(void) cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + free_rmid(rdtgrp->mon.rmid); + kernfs_remove(rdtgrp->kn); list_del(&rdtgrp->rdtgroup_list); kfree(rdtgrp); } /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ get_online_cpus(); - rdt_update_closid(cpu_online_mask, &rdtgroup_default.closid); + update_closid_rmid(cpu_online_mask, &rdtgroup_default); put_online_cpus(); kernfs_remove(kn_info); + kernfs_remove(kn_mongrp); + kernfs_remove(kn_mondata); } static void rdt_kill_sb(struct super_block *sb) @@ -986,10 +1311,12 @@ static void rdt_kill_sb(struct super_block *sb) mutex_lock(&rdtgroup_mutex); /*Put everything back to default values. */ - for_each_enabled_rdt_resource(r) + for_each_alloc_enabled_rdt_resource(r) reset_all_ctrls(r); cdp_disable(); rmdir_all_sub(); + static_branch_disable(&rdt_alloc_enable_key); + static_branch_disable(&rdt_mon_enable_key); static_branch_disable(&rdt_enable_key); kernfs_kill_sb(sb); mutex_unlock(&rdtgroup_mutex); @@ -1001,46 +1328,223 @@ static struct file_system_type rdt_fs_type = { .kill_sb = rdt_kill_sb, }; -static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) +static int mon_addfile(struct kernfs_node *parent_kn, const char *name, + void *priv) { - struct rdtgroup *parent, *rdtgrp; struct kernfs_node *kn; - int ret, closid; + int ret = 0; - /* Only allow mkdir in the root directory */ - if (parent_kn != rdtgroup_default.kn) - return -EPERM; + kn = __kernfs_create_file(parent_kn, name, 0444, 0, + &kf_mondata_ops, priv, NULL, NULL); + if (IS_ERR(kn)) + return PTR_ERR(kn); - /* Do not accept '\n' to avoid unparsable situation. */ - if (strchr(name, '\n')) - return -EINVAL; + ret = rdtgroup_kn_set_ugid(kn); + if (ret) { + kernfs_remove(kn); + return ret; + } - parent = rdtgroup_kn_lock_live(parent_kn); - if (!parent) { - ret = -ENODEV; - goto out_unlock; + return ret; +} + +/* + * Remove all subdirectories of mon_data of ctrl_mon groups + * and monitor groups with given domain id. + */ +void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id) +{ + struct rdtgroup *prgrp, *crgrp; + char name[32]; + + if (!r->mon_enabled) + return; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + sprintf(name, "mon_%s_%02d", r->name, dom_id); + kernfs_remove_by_name(prgrp->mon.mon_data_kn, name); + + list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) + kernfs_remove_by_name(crgrp->mon.mon_data_kn, name); } +} - ret = closid_alloc(); - if (ret < 0) +static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, + struct rdt_domain *d, + struct rdt_resource *r, struct rdtgroup *prgrp) +{ + union mon_data_bits priv; + struct kernfs_node *kn; + struct mon_evt *mevt; + struct rmid_read rr; + char name[32]; + int ret; + + sprintf(name, "mon_%s_%02d", r->name, d->id); + /* create the directory */ + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + /* + * This extra ref will be put in kernfs_remove() and guarantees + * that kn is always accessible. + */ + kernfs_get(kn); + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + if (WARN_ON(list_empty(&r->evt_list))) { + ret = -EPERM; + goto out_destroy; + } + + priv.u.rid = r->rid; + priv.u.domid = d->id; + list_for_each_entry(mevt, &r->evt_list, list) { + priv.u.evtid = mevt->evtid; + ret = mon_addfile(kn, mevt->name, priv.priv); + if (ret) + goto out_destroy; + + if (is_mbm_event(mevt->evtid)) + mon_event_read(&rr, d, prgrp, mevt->evtid, true); + } + kernfs_activate(kn); + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +/* + * Add all subdirectories of mon_data for "ctrl_mon" groups + * and "monitor" groups with given domain id. + */ +void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_domain *d) +{ + struct kernfs_node *parent_kn; + struct rdtgroup *prgrp, *crgrp; + struct list_head *head; + + if (!r->mon_enabled) + return; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + parent_kn = prgrp->mon.mon_data_kn; + mkdir_mondata_subdir(parent_kn, d, r, prgrp); + + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + parent_kn = crgrp->mon.mon_data_kn; + mkdir_mondata_subdir(parent_kn, d, r, crgrp); + } + } +} + +static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, + struct rdt_resource *r, + struct rdtgroup *prgrp) +{ + struct rdt_domain *dom; + int ret; + + list_for_each_entry(dom, &r->domains, list) { + ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); + if (ret) + return ret; + } + + return 0; +} + +/* + * This creates a directory mon_data which contains the monitored data. + * + * mon_data has one directory for each domain whic are named + * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data + * with L3 domain looks as below: + * ./mon_data: + * mon_L3_00 + * mon_L3_01 + * mon_L3_02 + * ... + * + * Each domain directory has one file per event: + * ./mon_L3_00/: + * llc_occupancy + * + */ +static int mkdir_mondata_all(struct kernfs_node *parent_kn, + struct rdtgroup *prgrp, + struct kernfs_node **dest_kn) +{ + struct rdt_resource *r; + struct kernfs_node *kn; + int ret; + + /* + * Create the mon_data directory first. + */ + ret = mongroup_create_dir(parent_kn, NULL, "mon_data", &kn); + if (ret) + return ret; + + if (dest_kn) + *dest_kn = kn; + + /* + * Create the subdirectories for each domain. Note that all events + * in a domain like L3 are grouped into a resource whose domain is L3 + */ + for_each_mon_enabled_rdt_resource(r) { + ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); + if (ret) + goto out_destroy; + } + + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, + struct kernfs_node *prgrp_kn, + const char *name, umode_t mode, + enum rdt_group_type rtype, struct rdtgroup **r) +{ + struct rdtgroup *prdtgrp, *rdtgrp; + struct kernfs_node *kn; + uint files = 0; + int ret; + + prdtgrp = rdtgroup_kn_lock_live(prgrp_kn); + if (!prdtgrp) { + ret = -ENODEV; goto out_unlock; - closid = ret; + } /* allocate the rdtgroup. */ rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); if (!rdtgrp) { ret = -ENOSPC; - goto out_closid_free; + goto out_unlock; } - rdtgrp->closid = closid; - list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); + *r = rdtgrp; + rdtgrp->mon.parent = prdtgrp; + rdtgrp->type = rtype; + INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list); /* kernfs creates the directory for rdtgrp */ - kn = kernfs_create_dir(parent->kn, name, mode, rdtgrp); + kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); if (IS_ERR(kn)) { ret = PTR_ERR(kn); - goto out_cancel_ref; + goto out_free_rgrp; } rdtgrp->kn = kn; @@ -1056,43 +1560,211 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; - ret = rdtgroup_add_files(kn, rdtgroup_base_files, - ARRAY_SIZE(rdtgroup_base_files)); + files = RFTYPE_BASE | RFTYPE_CTRL; + files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype); + ret = rdtgroup_add_files(kn, files); if (ret) goto out_destroy; + if (rdt_mon_capable) { + ret = alloc_rmid(); + if (ret < 0) + goto out_destroy; + rdtgrp->mon.rmid = ret; + + ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn); + if (ret) + goto out_idfree; + } kernfs_activate(kn); - ret = 0; - goto out_unlock; + /* + * The caller unlocks the prgrp_kn upon success. + */ + return 0; +out_idfree: + free_rmid(rdtgrp->mon.rmid); out_destroy: kernfs_remove(rdtgrp->kn); -out_cancel_ref: - list_del(&rdtgrp->rdtgroup_list); +out_free_rgrp: kfree(rdtgrp); -out_closid_free: - closid_free(closid); out_unlock: - rdtgroup_kn_unlock(parent_kn); + rdtgroup_kn_unlock(prgrp_kn); return ret; } -static int rdtgroup_rmdir(struct kernfs_node *kn) +static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) +{ + kernfs_remove(rgrp->kn); + free_rmid(rgrp->mon.rmid); + kfree(rgrp); +} + +/* + * Create a monitor group under "mon_groups" directory of a control + * and monitor group(ctrl_mon). This is a resource group + * to monitor a subset of tasks and cpus in its parent ctrl_mon group. + */ +static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, + struct kernfs_node *prgrp_kn, + const char *name, + umode_t mode) +{ + struct rdtgroup *rdtgrp, *prgrp; + int ret; + + ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP, + &rdtgrp); + if (ret) + return ret; + + prgrp = rdtgrp->mon.parent; + rdtgrp->closid = prgrp->closid; + + /* + * Add the rdtgrp to the list of rdtgrps the parent + * ctrl_mon group has to track. + */ + list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); + + rdtgroup_kn_unlock(prgrp_kn); + return ret; +} + +/* + * These are rdtgroups created under the root directory. Can be used + * to allocate and monitor resources. + */ +static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, + struct kernfs_node *prgrp_kn, + const char *name, umode_t mode) { - int ret, cpu, closid = rdtgroup_default.closid; struct rdtgroup *rdtgrp; - cpumask_var_t tmpmask; + struct kernfs_node *kn; + u32 closid; + int ret; - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) - return -ENOMEM; + ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP, + &rdtgrp); + if (ret) + return ret; - rdtgrp = rdtgroup_kn_lock_live(kn); - if (!rdtgrp) { - ret = -EPERM; - goto out; + kn = rdtgrp->kn; + ret = closid_alloc(); + if (ret < 0) + goto out_common_fail; + closid = ret; + + rdtgrp->closid = closid; + list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); + + if (rdt_mon_capable) { + /* + * Create an empty mon_groups directory to hold the subset + * of tasks and cpus to monitor. + */ + ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL); + if (ret) + goto out_id_free; } + goto out_unlock; + +out_id_free: + closid_free(closid); + list_del(&rdtgrp->rdtgroup_list); +out_common_fail: + mkdir_rdt_prepare_clean(rdtgrp); +out_unlock: + rdtgroup_kn_unlock(prgrp_kn); + return ret; +} + +/* + * We allow creating mon groups only with in a directory called "mon_groups" + * which is present in every ctrl_mon group. Check if this is a valid + * "mon_groups" directory. + * + * 1. The directory should be named "mon_groups". + * 2. The mon group itself should "not" be named "mon_groups". + * This makes sure "mon_groups" directory always has a ctrl_mon group + * as parent. + */ +static bool is_mon_groups(struct kernfs_node *kn, const char *name) +{ + return (!strcmp(kn->name, "mon_groups") && + strcmp(name, "mon_groups")); +} + +static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) +{ + /* Do not accept '\n' to avoid unparsable situation. */ + if (strchr(name, '\n')) + return -EINVAL; + + /* + * If the parent directory is the root directory and RDT + * allocation is supported, add a control and monitoring + * subdirectory + */ + if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn) + return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode); + + /* + * If RDT monitoring is supported and the parent directory is a valid + * "mon_groups" directory, add a monitoring subdirectory. + */ + if (rdt_mon_capable && is_mon_groups(parent_kn, name)) + return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode); + + return -EPERM; +} + +static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, + cpumask_var_t tmpmask) +{ + struct rdtgroup *prdtgrp = rdtgrp->mon.parent; + int cpu; + + /* Give any tasks back to the parent group */ + rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask); + + /* Update per cpu rmid of the moved CPUs first */ + for_each_cpu(cpu, &rdtgrp->cpu_mask) + per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid; + /* + * Update the MSR on moved CPUs and CPUs which have moved + * task running on them. + */ + cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); + update_closid_rmid(tmpmask, NULL); + + rdtgrp->flags = RDT_DELETED; + free_rmid(rdtgrp->mon.rmid); + + /* + * Remove the rdtgrp from the parent ctrl_mon group's list + */ + WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); + list_del(&rdtgrp->mon.crdtgrp_list); + + /* + * one extra hold on this, will drop when we kfree(rdtgrp) + * in rdtgroup_kn_unlock() + */ + kernfs_get(kn); + kernfs_remove(rdtgrp->kn); + + return 0; +} + +static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, + cpumask_var_t tmpmask) +{ + int cpu; + /* Give any tasks back to the default group */ rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); @@ -1100,18 +1772,28 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - /* Update per cpu closid of the moved CPUs first */ - for_each_cpu(cpu, &rdtgrp->cpu_mask) - per_cpu(cpu_closid, cpu) = closid; + /* Update per cpu closid and rmid of the moved CPUs first */ + for_each_cpu(cpu, &rdtgrp->cpu_mask) { + per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid; + per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid; + } + /* * Update the MSR on moved CPUs and CPUs which have moved * task running on them. */ cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); - rdt_update_closid(tmpmask, NULL); + update_closid_rmid(tmpmask, NULL); rdtgrp->flags = RDT_DELETED; closid_free(rdtgrp->closid); + free_rmid(rdtgrp->mon.rmid); + + /* + * Free all the child monitor group rmids. + */ + free_all_child_rdtgrp(rdtgrp); + list_del(&rdtgrp->rdtgroup_list); /* @@ -1120,7 +1802,41 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) */ kernfs_get(kn); kernfs_remove(rdtgrp->kn); - ret = 0; + + return 0; +} + +static int rdtgroup_rmdir(struct kernfs_node *kn) +{ + struct kernfs_node *parent_kn = kn->parent; + struct rdtgroup *rdtgrp; + cpumask_var_t tmpmask; + int ret = 0; + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + + rdtgrp = rdtgroup_kn_lock_live(kn); + if (!rdtgrp) { + ret = -EPERM; + goto out; + } + + /* + * If the rdtgroup is a ctrl_mon group and parent directory + * is the root directory, remove the ctrl_mon group. + * + * If the rdtgroup is a mon group and parent directory + * is a valid "mon_groups" directory, remove the mon group. + */ + if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn) + ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask); + else if (rdtgrp->type == RDTMON_GROUP && + is_mon_groups(parent_kn, kn->name)) + ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask); + else + ret = -EPERM; + out: rdtgroup_kn_unlock(kn); free_cpumask_var(tmpmask); @@ -1129,7 +1845,7 @@ out: static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) { - if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) + if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) seq_puts(seq, ",cdp"); return 0; } @@ -1153,10 +1869,13 @@ static int __init rdtgroup_setup_root(void) mutex_lock(&rdtgroup_mutex); rdtgroup_default.closid = 0; + rdtgroup_default.mon.rmid = 0; + rdtgroup_default.type = RDTCTRL_GROUP; + INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); + list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); - ret = rdtgroup_add_files(rdt_root->kn, rdtgroup_base_files, - ARRAY_SIZE(rdtgroup_base_files)); + ret = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE); if (ret) { kernfs_destroy_root(rdt_root); goto out; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index efc5eeb58292..11966251cd42 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -56,7 +56,7 @@ #include <asm/debugreg.h> #include <asm/switch_to.h> #include <asm/vm86.h> -#include <asm/intel_rdt.h> +#include <asm/intel_rdt_sched.h> #include <asm/proto.h> void __show_regs(struct pt_regs *regs, int all) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c85269a76511..302e7b2572d1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -52,7 +52,7 @@ #include <asm/switch_to.h> #include <asm/xen/hypervisor.h> #include <asm/vdso.h> -#include <asm/intel_rdt.h> +#include <asm/intel_rdt_sched.h> #include <asm/unistd.h> #ifdef CONFIG_IA32_EMULATION /* Not included via unistd.h */ |