Diffstat (limited to 'arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c')
-rw-r--r--  arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c | 385
1 file changed, 230 insertions(+), 155 deletions(-)
diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c
index f8c260d522ca..815b4e92522c 100644
--- a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c
+++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c
@@ -17,6 +17,7 @@
#include <linux/debugfs.h>
#include <linux/kthread.h>
#include <linux/mman.h>
+#include <linux/perf_event.h>
#include <linux/pm_qos.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
@@ -26,6 +27,7 @@
#include <asm/intel_rdt_sched.h>
#include <asm/perf_event.h>
+#include "../../events/perf_event.h" /* For X86_CONFIG() */
#include "intel_rdt.h"
#define CREATE_TRACE_POINTS
@@ -91,7 +93,7 @@ static u64 get_prefetch_disable_bits(void)
*/
return 0xF;
case INTEL_FAM6_ATOM_GOLDMONT:
- case INTEL_FAM6_ATOM_GEMINI_LAKE:
+ case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
/*
* SDM defines bits of MSR_MISC_FEATURE_CONTROL register
* as:
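For reference, the mask assembled by get_prefetch_disable_bits() is later written to MSR_MISC_FEATURE_CONTROL to park the hardware prefetchers while a measurement runs, and cleared again afterwards. A minimal sketch of that pattern; the wrapper name measure_with_prefetchers_off() is hypothetical and not part of this patch:

/* Illustrative sketch only: how the prefetch-disable mask is applied. */
static void measure_with_prefetchers_off(struct pseudo_lock_region *plr)
{
	u64 bits = get_prefetch_disable_bits();

	local_irq_disable();
	/* Low 32 bits carry the disable mask; the high 32 bits stay zero. */
	wrmsr(MSR_MISC_FEATURE_CONTROL, bits, 0x0);

	/* ... walk plr->kmem and take the measurement here ... */

	/* Restore the prefetchers before re-enabling interrupts. */
	wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
	local_irq_enable();
}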
@@ -106,16 +108,6 @@ static u64 get_prefetch_disable_bits(void)
return 0;
}
-/*
- * Helper to write 64bit value to MSR without tracing. Used when
- * use of the cache should be restricted and use of registers used
- * for local variables avoided.
- */
-static inline void pseudo_wrmsrl_notrace(unsigned int msr, u64 val)
-{
- __wrmsr(msr, (u32)(val & 0xffffffffULL), (u32)(val >> 32));
-}
-
/**
* pseudo_lock_minor_get - Obtain available minor number
* @minor: Pointer to where new minor number will be stored
@@ -888,31 +880,14 @@ static int measure_cycles_lat_fn(void *_plr)
struct pseudo_lock_region *plr = _plr;
unsigned long i;
u64 start, end;
-#ifdef CONFIG_KASAN
- /*
- * The registers used for local register variables are also used
- * when KASAN is active. When KASAN is active we use a regular
- * variable to ensure we always use a valid pointer to access memory.
- * The cost is that accessing this pointer, which could be in
- * cache, will be included in the measurement of memory read latency.
- */
void *mem_r;
-#else
-#ifdef CONFIG_X86_64
- register void *mem_r asm("rbx");
-#else
- register void *mem_r asm("ebx");
-#endif /* CONFIG_X86_64 */
-#endif /* CONFIG_KASAN */
local_irq_disable();
/*
- * The wrmsr call may be reordered with the assignment below it.
- * Call wrmsr as directly as possible to avoid tracing clobbering
- * local register variable used for memory pointer.
+ * Disable hardware prefetchers.
*/
- __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
- mem_r = plr->kmem;
+ wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
+ mem_r = READ_ONCE(plr->kmem);
/*
* Dummy execute of the time measurement to load the needed
* instructions into the L1 instruction cache.
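The rest of measure_cycles_lat_fn() (outside this hunk) then walks the pseudo-locked buffer and reports how many cycles each read took through the pseudo_lock_mem_latency tracepoint. A rough sketch of that loop — not the verbatim code, and the 32-byte stride is an assumption here:

/* Sketch of the per-access latency measurement using the variables above. */
for (i = 0; i < plr->size; i += 32) {
	start = rdtsc_ordered();
	asm volatile("mov (%0,%1,1), %%eax\n\t"
		     :
		     : "r" (mem_r), "r" (i)
		     : "%eax", "memory");
	end = rdtsc_ordered();
	trace_pseudo_lock_mem_latency((u32)(end - start));
}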
@@ -934,157 +909,240 @@ static int measure_cycles_lat_fn(void *_plr)
return 0;
}
-static int measure_cycles_perf_fn(void *_plr)
+/*
+ * Create a perf_event_attr for the hit and miss perf events that will
+ * be used during the performance measurement. A perf_event maintains
+ * a pointer to its perf_event_attr so a unique attribute structure is
+ * created for each perf_event.
+ *
+ * The actual configuration of the event is set right before use in order
+ * to use the X86_CONFIG macro.
+ */
+static struct perf_event_attr perf_miss_attr = {
+ .type = PERF_TYPE_RAW,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = 1,
+ .disabled = 0,
+ .exclude_user = 1,
+};
+
+static struct perf_event_attr perf_hit_attr = {
+ .type = PERF_TYPE_RAW,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = 1,
+ .disabled = 0,
+ .exclude_user = 1,
+};
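As the comment above notes, .config is filled in right before use with X86_CONFIG() from ../../events/perf_event.h, which packs the architectural PERFEVTSEL fields into a raw value: event select in bits 0-7, unit mask in bits 8-15. For the Goldmont events configured later in this patch, the two assignments therefore correspond to the raw encodings noted below:

/* For illustration only: the X86_CONFIG() values used below, as raw configs. */
perf_hit_attr.config  = X86_CONFIG(.event = 0xd1, .umask = 0x02);  /* raw 0x02d1: L2_HIT  */
perf_miss_attr.config = X86_CONFIG(.event = 0xd1, .umask = 0x10);  /* raw 0x10d1: L2_MISS */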
+
+struct residency_counts {
+ u64 miss_before, hits_before;
+ u64 miss_after, hits_after;
+};
+
+static int measure_residency_fn(struct perf_event_attr *miss_attr,
+ struct perf_event_attr *hit_attr,
+ struct pseudo_lock_region *plr,
+ struct residency_counts *counts)
{
- unsigned long long l3_hits = 0, l3_miss = 0;
- u64 l3_hit_bits = 0, l3_miss_bits = 0;
- struct pseudo_lock_region *plr = _plr;
- unsigned long long l2_hits, l2_miss;
- u64 l2_hit_bits, l2_miss_bits;
- unsigned long i;
-#ifdef CONFIG_KASAN
- /*
- * The registers used for local register variables are also used
- * when KASAN is active. When KASAN is active we use regular variables
- * at the cost of including cache access latency to these variables
- * in the measurements.
- */
+ u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0;
+ struct perf_event *miss_event, *hit_event;
+ int hit_pmcnum, miss_pmcnum;
unsigned int line_size;
unsigned int size;
+ unsigned long i;
void *mem_r;
-#else
- register unsigned int line_size asm("esi");
- register unsigned int size asm("edi");
-#ifdef CONFIG_X86_64
- register void *mem_r asm("rbx");
-#else
- register void *mem_r asm("ebx");
-#endif /* CONFIG_X86_64 */
-#endif /* CONFIG_KASAN */
+ u64 tmp;
+
+ miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu,
+ NULL, NULL, NULL);
+ if (IS_ERR(miss_event))
+ goto out;
+
+ hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu,
+ NULL, NULL, NULL);
+ if (IS_ERR(hit_event))
+ goto out_miss;
+
+ local_irq_disable();
+ /*
+ * Check any possible error state of events used by performing
+ * one local read.
+ */
+ if (perf_event_read_local(miss_event, &tmp, NULL, NULL)) {
+ local_irq_enable();
+ goto out_hit;
+ }
+ if (perf_event_read_local(hit_event, &tmp, NULL, NULL)) {
+ local_irq_enable();
+ goto out_hit;
+ }
+
+ /*
+ * Disable hardware prefetchers.
+ */
+ wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
+
+ /* Initialize rest of local variables */
+ /*
+ * Performance event has been validated right before this with
+ * interrupts disabled - it is thus safe to read the counter index.
+ */
+ miss_pmcnum = x86_perf_rdpmc_index(miss_event);
+ hit_pmcnum = x86_perf_rdpmc_index(hit_event);
+ line_size = READ_ONCE(plr->line_size);
+ mem_r = READ_ONCE(plr->kmem);
+ size = READ_ONCE(plr->size);
+
+ /*
+ * Read counter variables twice - first to load the instructions
+ * used in L1 cache, second to capture accurate value that does not
+ * include cache misses incurred because of instruction loads.
+ */
+ rdpmcl(hit_pmcnum, hits_before);
+ rdpmcl(miss_pmcnum, miss_before);
+ /*
+ * From SDM: Performing back-to-back fast reads are not guaranteed
+ * to be monotonic.
+ * Use LFENCE to ensure all previous instructions are retired
+ * before proceeding.
+ */
+ rmb();
+ rdpmcl(hit_pmcnum, hits_before);
+ rdpmcl(miss_pmcnum, miss_before);
+ /*
+ * Use LFENCE to ensure all previous instructions are retired
+ * before proceeding.
+ */
+ rmb();
+ for (i = 0; i < size; i += line_size) {
+ /*
+ * Add a barrier to prevent speculative execution of this
+ * loop reading beyond the end of the buffer.
+ */
+ rmb();
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ }
+ /*
+ * Use LFENCE to ensure all previous instructions are retired
+ * before proceeding.
+ */
+ rmb();
+ rdpmcl(hit_pmcnum, hits_after);
+ rdpmcl(miss_pmcnum, miss_after);
+ /*
+ * Use LFENCE to ensure all previous instructions are retired
+ * before proceeding.
+ */
+ rmb();
+ /* Re-enable hardware prefetchers */
+ wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
+ local_irq_enable();
+out_hit:
+ perf_event_release_kernel(hit_event);
+out_miss:
+ perf_event_release_kernel(miss_event);
+out:
+ /*
+ * All counts will be zero on failure.
+ */
+ counts->miss_before = miss_before;
+ counts->hits_before = hits_before;
+ counts->miss_after = miss_after;
+ counts->hits_after = hits_after;
+ return 0;
+}
+
+static int measure_l2_residency(void *_plr)
+{
+ struct pseudo_lock_region *plr = _plr;
+ struct residency_counts counts = {0};
/*
* Non-architectural event for the Goldmont Microarchitecture
* from Intel x86 Architecture Software Developer Manual (SDM):
* MEM_LOAD_UOPS_RETIRED D1H (event number)
* Umask values:
- * L1_HIT 01H
* L2_HIT 02H
- * L1_MISS 08H
* L2_MISS 10H
- *
- * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event
- * has two "no fix" errata associated with it: BDM35 and BDM100. On
- * this platform we use the following events instead:
- * L2_RQSTS 24H (Documented in https://download.01.org/perfmon/BDW/)
- * REFERENCES FFH
- * MISS 3FH
- * LONGEST_LAT_CACHE 2EH (Documented in SDM)
- * REFERENCE 4FH
- * MISS 41H
*/
-
- /*
- * Start by setting flags for IA32_PERFEVTSELx:
- * OS (Operating system mode) 0x2
- * INT (APIC interrupt enable) 0x10
- * EN (Enable counter) 0x40
- *
- * Then add the Umask value and event number to select performance
- * event.
- */
-
switch (boot_cpu_data.x86_model) {
case INTEL_FAM6_ATOM_GOLDMONT:
- case INTEL_FAM6_ATOM_GEMINI_LAKE:
- l2_hit_bits = (0x52ULL << 16) | (0x2 << 8) | 0xd1;
- l2_miss_bits = (0x52ULL << 16) | (0x10 << 8) | 0xd1;
- break;
- case INTEL_FAM6_BROADWELL_X:
- /* On BDW the l2_hit_bits count references, not hits */
- l2_hit_bits = (0x52ULL << 16) | (0xff << 8) | 0x24;
- l2_miss_bits = (0x52ULL << 16) | (0x3f << 8) | 0x24;
- /* On BDW the l3_hit_bits count references, not hits */
- l3_hit_bits = (0x52ULL << 16) | (0x4f << 8) | 0x2e;
- l3_miss_bits = (0x52ULL << 16) | (0x41 << 8) | 0x2e;
+ case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
+ perf_miss_attr.config = X86_CONFIG(.event = 0xd1,
+ .umask = 0x10);
+ perf_hit_attr.config = X86_CONFIG(.event = 0xd1,
+ .umask = 0x2);
break;
default:
goto out;
}
- local_irq_disable();
+ measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
/*
- * Call wrmsr direcly to avoid the local register variables from
- * being overwritten due to reordering of their assignment with
- * the wrmsr calls.
+ * If a failure prevented the measurements from succeeding
+ * tracepoints will still be written and all counts will be zero.
*/
- __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
- /* Disable events and reset counters */
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, 0x0);
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x0);
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0, 0x0);
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 1, 0x0);
- if (l3_hit_bits > 0) {
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x0);
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3, 0x0);
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 2, 0x0);
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 3, 0x0);
- }
- /* Set and enable the L2 counters */
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, l2_hit_bits);
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, l2_miss_bits);
- if (l3_hit_bits > 0) {
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2,
- l3_hit_bits);
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3,
- l3_miss_bits);
- }
- mem_r = plr->kmem;
- size = plr->size;
- line_size = plr->line_size;
- for (i = 0; i < size; i += line_size) {
- asm volatile("mov (%0,%1,1), %%eax\n\t"
- :
- : "r" (mem_r), "r" (i)
- : "%eax", "memory");
- }
+ trace_pseudo_lock_l2(counts.hits_after - counts.hits_before,
+ counts.miss_after - counts.miss_before);
+out:
+ plr->thread_done = 1;
+ wake_up_interruptible(&plr->lock_thread_wq);
+ return 0;
+}
+
+static int measure_l3_residency(void *_plr)
+{
+ struct pseudo_lock_region *plr = _plr;
+ struct residency_counts counts = {0};
+
/*
- * Call wrmsr directly (no tracing) to not influence
- * the cache access counters as they are disabled.
+ * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event
+ * has two "no fix" errata associated with it: BDM35 and BDM100. On
+ * this platform the following events are used instead:
+ * LONGEST_LAT_CACHE 2EH (Documented in SDM)
+ * REFERENCE 4FH
+ * MISS 41H
*/
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0,
- l2_hit_bits & ~(0x40ULL << 16));
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1,
- l2_miss_bits & ~(0x40ULL << 16));
- if (l3_hit_bits > 0) {
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2,
- l3_hit_bits & ~(0x40ULL << 16));
- pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3,
- l3_miss_bits & ~(0x40ULL << 16));
- }
- l2_hits = native_read_pmc(0);
- l2_miss = native_read_pmc(1);
- if (l3_hit_bits > 0) {
- l3_hits = native_read_pmc(2);
- l3_miss = native_read_pmc(3);
+
+ switch (boot_cpu_data.x86_model) {
+ case INTEL_FAM6_BROADWELL_X:
+ /* On BDW the hit event counts references, not hits */
+ perf_hit_attr.config = X86_CONFIG(.event = 0x2e,
+ .umask = 0x4f);
+ perf_miss_attr.config = X86_CONFIG(.event = 0x2e,
+ .umask = 0x41);
+ break;
+ default:
+ goto out;
}
- wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
- local_irq_enable();
+
+ measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
/*
- * On BDW we count references and misses, need to adjust. Sometimes
- * the "hits" counter is a bit more than the references, for
- * example, x references but x + 1 hits. To not report invalid
- * hit values in this case we treat that as misses eaqual to
- * references.
+ * If a failure prevented the measurements from succeeding
+ * tracepoints will still be written and all counts will be zero.
*/
- if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X)
- l2_hits -= (l2_miss > l2_hits ? l2_hits : l2_miss);
- trace_pseudo_lock_l2(l2_hits, l2_miss);
- if (l3_hit_bits > 0) {
- if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X)
- l3_hits -= (l3_miss > l3_hits ? l3_hits : l3_miss);
- trace_pseudo_lock_l3(l3_hits, l3_miss);
+
+ counts.miss_after -= counts.miss_before;
+ if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) {
+ /*
+ * On BDW references and misses are counted, need to adjust.
+ * Sometimes the "hits" counter is a bit more than the
+ * references, for example, x references but x + 1 hits.
+ * To not report invalid hit values in this case we treat
+ * that as misses equal to references.
+ */
+ /* First compute the number of cache references measured */
+ counts.hits_after -= counts.hits_before;
+ /* Next convert references to cache hits */
+ counts.hits_after -= min(counts.miss_after, counts.hits_after);
+ } else {
+ counts.hits_after -= counts.hits_before;
}
+ trace_pseudo_lock_l3(counts.hits_after, counts.miss_after);
out:
plr->thread_done = 1;
wake_up_interruptible(&plr->lock_thread_wq);
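To put the Broadwell conversion above into made-up numbers: with a LONGEST_LAT_CACHE.REFERENCE delta of 1000 and a MISS delta of 40 across the loop, the reported hits are 1000 - min(40, 1000) = 960. With the anomaly described in the comment (a miss delta of 1001 against a reference delta of 1000), min() clamps the subtraction and zero hits are reported rather than letting the count wrap.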
@@ -1116,6 +1174,11 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
goto out;
}
+ if (!plr->d) {
+ ret = -ENODEV;
+ goto out;
+ }
+
plr->thread_done = 0;
cpu = cpumask_first(&plr->d->cpu_mask);
if (!cpu_online(cpu)) {
@@ -1123,13 +1186,20 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
goto out;
}
+ plr->cpu = cpu;
+
if (sel == 1)
thread = kthread_create_on_node(measure_cycles_lat_fn, plr,
cpu_to_node(cpu),
"pseudo_lock_measure/%u",
cpu);
else if (sel == 2)
- thread = kthread_create_on_node(measure_cycles_perf_fn, plr,
+ thread = kthread_create_on_node(measure_l2_residency, plr,
+ cpu_to_node(cpu),
+ "pseudo_lock_measure/%u",
+ cpu);
+ else if (sel == 3)
+ thread = kthread_create_on_node(measure_l3_residency, plr,
cpu_to_node(cpu),
"pseudo_lock_measure/%u",
cpu);
@@ -1173,7 +1243,7 @@ static ssize_t pseudo_lock_measure_trigger(struct file *file,
buf[buf_size] = '\0';
ret = kstrtoint(buf, 10, &sel);
if (ret == 0) {
- if (sel != 1)
+ if (sel != 1 && sel != 2 && sel != 3)
return -EINVAL;
ret = debugfs_file_get(file->f_path.dentry);
if (ret)
@@ -1429,6 +1499,11 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
plr = rdtgrp->plr;
+ if (!plr->d) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -ENODEV;
+ }
+
/*
* Task is required to run with affinity to the cpus associated
* with the pseudo-locked region. If this is not the case the task
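The check that follows this comment in the full file rejects the mapping when the calling task is not affined to the pseudo-locked region's CPUs. A sketch of that kind of check, under the assumption that the task's allowed mask must be a subset of the region's cpu_mask:

/* Sketch: refuse the mapping unless the task is pinned to the region's CPUs. */
if (!cpumask_subset(&current->cpus_allowed, &plr->d->cpu_mask)) {
	mutex_unlock(&rdtgroup_mutex);
	return -EINVAL;
}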