summaryrefslogtreecommitdiff
path: root/arch/x86/events/intel/ds.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-07-16 17:13:31 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2024-07-16 17:13:31 -0700
commit576a997c6315ee482519e7cc080f341b07638808 (patch)
tree625cff628b8db0638ed496f560d38eda0375cacb /arch/x86/events/intel/ds.c
parent4a996d90b9e046c6d59845acf00a54d464c34ff3 (diff)
parentfa0c1c9d283b37fdb7fc1dcccbb88fc8f48a4aa4 (diff)
Merge tag 'perf-core-2024-07-16' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull performance events updates from Ingo Molnar: - Intel PT support enhancements & fixes - Fix leaked SIGTRAP events - Improve and fix the Intel uncore driver - Add support for Intel HBM and CXL uncore counters - Add Intel Lake and Arrow Lake support - AMD uncore driver fixes - Make SIGTRAP and __perf_pending_irq() work on RT - Micro-optimizations - Misc cleanups and fixes * tag 'perf-core-2024-07-16' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (44 commits) perf/x86/intel: Add a distinct name for Granite Rapids perf/x86/intel/ds: Fix non 0 retire latency on Raptorlake perf/x86/intel: Hide Topdown metrics events if the feature is not enumerated perf/x86/intel/uncore: Fix the bits of the CHA extended umask for SPR perf: Split __perf_pending_irq() out of perf_pending_irq() perf: Don't disable preemption in perf_pending_task(). perf: Move swevent_htable::recursion into task_struct. perf: Shrink the size of the recursion counter. perf: Enqueue SIGTRAP always via task_work. task_work: Add TWA_NMI_CURRENT as an additional notify mode. perf: Move irq_work_queue() where the event is prepared. perf: Fix event leak upon exec and file release perf: Fix event leak upon exit task_work: Introduce task_work_cancel() again task_work: s/task_work_cancel()/task_work_cancel_func()/ perf/x86/amd/uncore: Fix DF and UMC domain identification perf/x86/amd/uncore: Avoid PMU registration if counters are unavailable perf/x86/intel: Support Perfmon MSRs aliasing perf/x86/intel: Support PERFEVTSEL extension perf/x86: Add config_mask to represent EVENTSEL bitmask ...
Diffstat (limited to 'arch/x86/events/intel/ds.c')
-rw-r--r--arch/x86/events/intel/ds.c180
1 files changed, 148 insertions, 32 deletions
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index e010bfed8417..fa5ea65de0d0 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -63,6 +63,15 @@ union intel_x86_pebs_dse {
unsigned int mtl_fwd_blk:1;
unsigned int ld_reserved4:24;
};
+ struct {
+ unsigned int lnc_dse:8;
+ unsigned int ld_reserved5:2;
+ unsigned int lnc_stlb_miss:1;
+ unsigned int lnc_locked:1;
+ unsigned int lnc_data_blk:1;
+ unsigned int lnc_addr_blk:1;
+ unsigned int ld_reserved6:18;
+ };
};
@@ -77,7 +86,7 @@ union intel_x86_pebs_dse {
#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS))
/* Version for Sandy Bridge and later */
-static u64 pebs_data_source[] = {
+static u64 pebs_data_source[PERF_PEBS_DATA_SOURCE_MAX] = {
P(OP, LOAD) | P(LVL, MISS) | LEVEL(L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x01: L1 local */
OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */
@@ -173,6 +182,40 @@ void __init intel_pmu_pebs_data_source_cmt(void)
__intel_pmu_pebs_data_source_cmt(pebs_data_source);
}
+/* Version for Lion Cove and later */
+static u64 lnc_pebs_data_source[PERF_PEBS_DATA_SOURCE_MAX] = {
+ P(OP, LOAD) | P(LVL, MISS) | LEVEL(L3) | P(SNOOP, NA), /* 0x00: ukn L3 */
+ OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x01: L1 hit */
+ OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x02: L1 hit */
+ OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x03: LFB/L1 Miss Handling Buffer hit */
+ 0, /* 0x04: Reserved */
+ OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x05: L2 Hit */
+ OP_LH | LEVEL(L2_MHB) | P(SNOOP, NONE), /* 0x06: L2 Miss Handling Buffer Hit */
+ 0, /* 0x07: Reserved */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, NONE), /* 0x08: L3 Hit */
+ 0, /* 0x09: Reserved */
+ 0, /* 0x0a: Reserved */
+ 0, /* 0x0b: Reserved */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOPX, FWD), /* 0x0c: L3 Hit Snoop Fwd */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM), /* 0x0d: L3 Hit Snoop HitM */
+ 0, /* 0x0e: Reserved */
+ P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM), /* 0x0f: L3 Miss Snoop HitM */
+ OP_LH | LEVEL(MSC) | P(SNOOP, NONE), /* 0x10: Memory-side Cache Hit */
+ OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | P(SNOOP, NONE), /* 0x11: Local Memory Hit */
+};
+
+void __init intel_pmu_pebs_data_source_lnl(void)
+{
+ u64 *data_source;
+
+ data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX].pebs_data_source;
+ memcpy(data_source, lnc_pebs_data_source, sizeof(lnc_pebs_data_source));
+
+ data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX].pebs_data_source;
+ memcpy(data_source, pebs_data_source, sizeof(pebs_data_source));
+ __intel_pmu_pebs_data_source_cmt(data_source);
+}
+
static u64 precise_store_data(u64 status)
{
union intel_x86_pebs_dse dse;
@@ -257,14 +300,14 @@ static inline void pebs_set_tlb_lock(u64 *val, bool tlb, bool lock)
}
/* Retrieve the latency data for e-core of ADL */
-static u64 __adl_latency_data_small(struct perf_event *event, u64 status,
- u8 dse, bool tlb, bool lock, bool blk)
+static u64 __grt_latency_data(struct perf_event *event, u64 status,
+ u8 dse, bool tlb, bool lock, bool blk)
{
u64 val;
WARN_ON_ONCE(hybrid_pmu(event->pmu)->pmu_type == hybrid_big);
- dse &= PERF_PEBS_DATA_SOURCE_MASK;
+ dse &= PERF_PEBS_DATA_SOURCE_GRT_MASK;
val = hybrid_var(event->pmu, pebs_data_source)[dse];
pebs_set_tlb_lock(&val, tlb, lock);
@@ -277,27 +320,72 @@ static u64 __adl_latency_data_small(struct perf_event *event, u64 status,
return val;
}
-u64 adl_latency_data_small(struct perf_event *event, u64 status)
+u64 grt_latency_data(struct perf_event *event, u64 status)
{
union intel_x86_pebs_dse dse;
dse.val = status;
- return __adl_latency_data_small(event, status, dse.ld_dse,
- dse.ld_locked, dse.ld_stlb_miss,
- dse.ld_data_blk);
+ return __grt_latency_data(event, status, dse.ld_dse,
+ dse.ld_locked, dse.ld_stlb_miss,
+ dse.ld_data_blk);
}
/* Retrieve the latency data for e-core of MTL */
-u64 mtl_latency_data_small(struct perf_event *event, u64 status)
+u64 cmt_latency_data(struct perf_event *event, u64 status)
{
union intel_x86_pebs_dse dse;
dse.val = status;
- return __adl_latency_data_small(event, status, dse.mtl_dse,
- dse.mtl_stlb_miss, dse.mtl_locked,
- dse.mtl_fwd_blk);
+ return __grt_latency_data(event, status, dse.mtl_dse,
+ dse.mtl_stlb_miss, dse.mtl_locked,
+ dse.mtl_fwd_blk);
+}
+
+static u64 lnc_latency_data(struct perf_event *event, u64 status)
+{
+ union intel_x86_pebs_dse dse;
+ union perf_mem_data_src src;
+ u64 val;
+
+ dse.val = status;
+
+ /* LNC core latency data */
+ val = hybrid_var(event->pmu, pebs_data_source)[status & PERF_PEBS_DATA_SOURCE_MASK];
+ if (!val)
+ val = P(OP, LOAD) | LEVEL(NA) | P(SNOOP, NA);
+
+ if (dse.lnc_stlb_miss)
+ val |= P(TLB, MISS) | P(TLB, L2);
+ else
+ val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
+
+ if (dse.lnc_locked)
+ val |= P(LOCK, LOCKED);
+
+ if (dse.lnc_data_blk)
+ val |= P(BLK, DATA);
+ if (dse.lnc_addr_blk)
+ val |= P(BLK, ADDR);
+ if (!dse.lnc_data_blk && !dse.lnc_addr_blk)
+ val |= P(BLK, NA);
+
+ src.val = val;
+ if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
+ src.mem_op = P(OP, STORE);
+
+ return src.val;
+}
+
+u64 lnl_latency_data(struct perf_event *event, u64 status)
+{
+ struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+ if (pmu->pmu_type == hybrid_small)
+ return cmt_latency_data(event, status);
+
+ return lnc_latency_data(event, status);
}
static u64 load_latency_data(struct perf_event *event, u64 status)
@@ -1086,6 +1174,32 @@ struct event_constraint intel_glc_pebs_event_constraints[] = {
EVENT_CONSTRAINT_END
};
+struct event_constraint intel_lnc_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL),
+
+ INTEL_HYBRID_LDLAT_CONSTRAINT(0x1cd, 0x3ff),
+ INTEL_HYBRID_STLAT_CONSTRAINT(0x2cd, 0x3),
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */
+
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf),
+
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xd0, 0xf),
+
+ /*
+ * Everything else is handled by PMU_FL_PEBS_ALL, because we
+ * need the full constraints from the main table.
+ */
+
+ EVENT_CONSTRAINT_END
+};
+
struct event_constraint *intel_pebs_constraints(struct perf_event *event)
{
struct event_constraint *pebs_constraints = hybrid(event->pmu, pebs_constraints);
@@ -1137,8 +1251,7 @@ void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sche
static inline void pebs_update_threshold(struct cpu_hw_events *cpuc)
{
struct debug_store *ds = cpuc->ds;
- int max_pebs_events = hybrid(cpuc->pmu, max_pebs_events);
- int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
+ int max_pebs_events = intel_pmu_max_num_pebs(cpuc->pmu);
u64 threshold;
int reserved;
@@ -1146,7 +1259,7 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc)
return;
if (x86_pmu.flags & PMU_FL_PEBS_ALL)
- reserved = max_pebs_events + num_counters_fixed;
+ reserved = max_pebs_events + x86_pmu_max_num_counters_fixed(cpuc->pmu);
else
reserved = max_pebs_events;
@@ -1831,8 +1944,12 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
set_linear_ip(regs, basic->ip);
regs->flags = PERF_EFLAGS_EXACT;
- if ((sample_type & PERF_SAMPLE_WEIGHT_STRUCT) && (x86_pmu.flags & PMU_FL_RETIRE_LATENCY))
- data->weight.var3_w = format_size >> PEBS_RETIRE_LATENCY_OFFSET & PEBS_LATENCY_MASK;
+ if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) {
+ if (x86_pmu.flags & PMU_FL_RETIRE_LATENCY)
+ data->weight.var3_w = format_size >> PEBS_RETIRE_LATENCY_OFFSET & PEBS_LATENCY_MASK;
+ else
+ data->weight.var3_w = 0;
+ }
/*
* The record for MEMINFO is in front of GP
@@ -2157,6 +2274,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
void *base, *at, *top;
short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
short error[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
+ int max_pebs_events = intel_pmu_max_num_pebs(NULL);
int bit, i, size;
u64 mask;
@@ -2168,11 +2286,11 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
ds->pebs_index = ds->pebs_buffer_base;
- mask = (1ULL << x86_pmu.max_pebs_events) - 1;
- size = x86_pmu.max_pebs_events;
+ mask = x86_pmu.pebs_events_mask;
+ size = max_pebs_events;
if (x86_pmu.flags & PMU_FL_PEBS_ALL) {
- mask |= ((1ULL << x86_pmu.num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED;
- size = INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed;
+ mask |= x86_pmu.fixed_cntr_mask64 << INTEL_PMC_IDX_FIXED;
+ size = INTEL_PMC_IDX_FIXED + x86_pmu_max_num_counters_fixed(NULL);
}
if (unlikely(base >= top)) {
@@ -2208,8 +2326,9 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
pebs_status = p->status = cpuc->pebs_enabled;
bit = find_first_bit((unsigned long *)&pebs_status,
- x86_pmu.max_pebs_events);
- if (bit >= x86_pmu.max_pebs_events)
+ max_pebs_events);
+
+ if (!(x86_pmu.pebs_events_mask & (1 << bit)))
continue;
/*
@@ -2267,12 +2386,10 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
{
short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- int max_pebs_events = hybrid(cpuc->pmu, max_pebs_events);
- int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
struct debug_store *ds = cpuc->ds;
struct perf_event *event;
void *base, *at, *top;
- int bit, size;
+ int bit;
u64 mask;
if (!x86_pmu.pebs_active)
@@ -2283,12 +2400,11 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
ds->pebs_index = ds->pebs_buffer_base;
- mask = ((1ULL << max_pebs_events) - 1) |
- (((1ULL << num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED);
- size = INTEL_PMC_IDX_FIXED + num_counters_fixed;
+ mask = hybrid(cpuc->pmu, pebs_events_mask) |
+ (hybrid(cpuc->pmu, fixed_cntr_mask64) << INTEL_PMC_IDX_FIXED);
if (unlikely(base >= top)) {
- intel_pmu_pebs_event_update_no_drain(cpuc, size);
+ intel_pmu_pebs_event_update_no_drain(cpuc, X86_PMC_IDX_MAX);
return;
}
@@ -2298,11 +2414,11 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
pebs_status = get_pebs_status(at) & cpuc->pebs_enabled;
pebs_status &= mask;
- for_each_set_bit(bit, (unsigned long *)&pebs_status, size)
+ for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX)
counts[bit]++;
}
- for_each_set_bit(bit, (unsigned long *)&mask, size) {
+ for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) {
if (counts[bit] == 0)
continue;