 arch/arm64/include/asm/pgtable.h              |   4
 arch/powerpc/include/asm/nohash/32/pte-8xx.h  |  23
 arch/powerpc/perf/core-book3s.c               |   6
 arch/sparc/include/asm/pgtable_64.h           |  13
 arch/sparc/mm/hugetlbpage.c                   |  19
 arch/x86/events/core.c                        |   2
 arch/x86/events/intel/core.c                  |  20
 arch/x86/events/intel/cstate.c                |  19
 arch/x86/events/intel/ds.c                    |  11
 arch/x86/events/intel/lbr.c                   |   2
 arch/x86/events/intel/uncore.c                |   6
 arch/x86/events/intel/uncore_snb.c            |  20
 arch/x86/events/msr.c                         |   1
 arch/x86/events/perf_event.h                  |   2
 arch/x86/kernel/kprobes/core.c                |   6
 arch/x86/kernel/uprobes.c                     |   2
 include/linux/perf_event.h                    |   6
 include/linux/pgtable.h                       |  71
 include/uapi/linux/perf_event.h               |   6
 kernel/events/core.c                          | 160
 mm/gup.c                                      |  58
 21 files changed, 349 insertions(+), 108 deletions(-)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 005eb035fbfa..88a18d7f4e59 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -415,6 +415,7 @@ static inline int pmd_trans_huge(pmd_t pmd)
 #define pmd_dirty(pmd)		pte_dirty(pmd_pte(pmd))
 #define pmd_young(pmd)		pte_young(pmd_pte(pmd))
 #define pmd_valid(pmd)		pte_valid(pmd_pte(pmd))
+#define pmd_cont(pmd)		pte_cont(pmd_pte(pmd))
 #define pmd_wrprotect(pmd)	pte_pmd(pte_wrprotect(pmd_pte(pmd)))
 #define pmd_mkold(pmd)		pte_pmd(pte_mkold(pmd_pte(pmd)))
 #define pmd_mkwrite(pmd)	pte_pmd(pte_mkwrite(pmd_pte(pmd)))
@@ -511,6 +512,9 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 				 PMD_TYPE_SECT)
 #define pmd_leaf(pmd)		pmd_sect(pmd)
 
+#define pmd_leaf_size(pmd)	(pmd_cont(pmd) ? CONT_PMD_SIZE : PMD_SIZE)
+#define pte_leaf_size(pte)	(pte_cont(pte) ? CONT_PTE_SIZE : PAGE_SIZE)
+
 #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3
 static inline bool pud_sect(pud_t pud) { return false; }
 static inline bool pud_table(pud_t pud) { return true; }
diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
index 1581204467e1..fcc48d590d88 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
@@ -135,6 +135,29 @@ static inline pte_t pte_mkhuge(pte_t pte)
 }
 
 #define pte_mkhuge pte_mkhuge
+
+static inline unsigned long pgd_leaf_size(pgd_t pgd)
+{
+	if (pgd_val(pgd) & _PMD_PAGE_8M)
+		return SZ_8M;
+	return SZ_4M;
+}
+
+#define pgd_leaf_size pgd_leaf_size
+
+static inline unsigned long pte_leaf_size(pte_t pte)
+{
+	pte_basic_t val = pte_val(pte);
+
+	if (val & _PAGE_HUGE)
+		return SZ_512K;
+	if (val & _PAGE_SPS)
+		return SZ_16K;
+	return SZ_4K;
+}
+
+#define pte_leaf_size pte_leaf_size
+
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 08643cba1494..6586f7e71cfb 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2074,6 +2074,9 @@ static struct pmu power_pmu = {
 	.sched_task	= power_pmu_sched_task,
 };
 
+#define PERF_SAMPLE_ADDR_TYPE  (PERF_SAMPLE_ADDR |		\
+				PERF_SAMPLE_PHYS_ADDR |		\
+				PERF_SAMPLE_DATA_PAGE_SIZE)
 /*
  * A counter has overflowed; update its count and record
  * things if requested.  Note that interrupts are hard-disabled
@@ -2129,8 +2132,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 
 		perf_sample_data_init(&data, ~0ULL, event->hw.last_period);
 
-		if (event->attr.sample_type &
-		    (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR))
+		if (event->attr.sample_type & PERF_SAMPLE_ADDR_TYPE)
 			perf_get_data_addr(event, regs, &data.addr);
 
 		if (event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK) {
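The pXX_leaf_size() helpers introduced above let a lockless page-table walker report the real mapping size of a leaf entry, including arm64's contiguous-bit mappings and the 8xx's mixed page sizes. A rough illustration, not part of the diff, assuming arm64 with 4K base pages where a contiguous hint spans 16 adjacent entries:

/*
 * Illustration only (not from the patch): what the arm64 helpers report
 * with 4K base pages, where the contiguous hint covers 16 entries.
 */
static void show_leaf_sizes(pte_t pte, pmd_t pmd)
{
	/* 4K normally; CONT_PTE_SIZE (16 * 4K = 64K) when pte_cont() is set */
	pr_info("pte maps %lu bytes\n", (unsigned long)pte_leaf_size(pte));

	/* 2M normally; CONT_PMD_SIZE (16 * 2M = 32M) when pmd_cont() is set */
	pr_info("pmd maps %lu bytes\n", (unsigned long)pmd_leaf_size(pmd));
}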
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 7ef6affa105e..550d3904de65 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -1121,6 +1121,19 @@ extern unsigned long cmdline_memory_size;
 
 asmlinkage void do_sparc64_fault(struct pt_regs *regs);
 
+#ifdef CONFIG_HUGETLB_PAGE
+
+#define pud_leaf_size pud_leaf_size
+extern unsigned long pud_leaf_size(pud_t pud);
+
+#define pmd_leaf_size pmd_leaf_size
+extern unsigned long pmd_leaf_size(pmd_t pmd);
+
+#define pte_leaf_size pte_leaf_size
+extern unsigned long pte_leaf_size(pte_t pte);
+
+#endif /* CONFIG_HUGETLB_PAGE */
+
 #endif /* !(__ASSEMBLY__) */
 
 #endif /* !(_SPARC64_PGTABLE_H) */
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index ec423b5f17dd..ad4b42f04988 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -247,14 +247,17 @@ static unsigned int sun4u_huge_tte_to_shift(pte_t entry)
 	return shift;
 }
 
-static unsigned int huge_tte_to_shift(pte_t entry)
+static unsigned long tte_to_shift(pte_t entry)
 {
-	unsigned long shift;
-
 	if (tlb_type == hypervisor)
-		shift = sun4v_huge_tte_to_shift(entry);
-	else
-		shift = sun4u_huge_tte_to_shift(entry);
+		return sun4v_huge_tte_to_shift(entry);
+
+	return sun4u_huge_tte_to_shift(entry);
+}
+
+static unsigned int huge_tte_to_shift(pte_t entry)
+{
+	unsigned long shift = tte_to_shift(entry);
 
 	if (shift == PAGE_SHIFT)
 		WARN_ONCE(1, "tto_to_shift: invalid hugepage tte=0x%lx\n",
@@ -272,6 +275,10 @@ static unsigned long huge_tte_to_size(pte_t pte)
 	return size;
 }
 
+unsigned long pud_leaf_size(pud_t pud) { return 1UL << tte_to_shift(*(pte_t *)&pud); }
+unsigned long pmd_leaf_size(pmd_t pmd) { return 1UL << tte_to_shift(*(pte_t *)&pmd); }
+unsigned long pte_leaf_size(pte_t pte) { return 1UL << tte_to_shift(pte); }
+
 pte_t *huge_pte_alloc(struct mm_struct *mm,
 			unsigned long addr, unsigned long sz)
 {
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 77b963e5e70a..e37de298a495 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1174,7 +1174,7 @@ static inline void x86_assign_hw_event(struct perf_event *event,
 	case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END:
 		/* All the metric events are mapped onto the fixed counter 3. */
 		idx = INTEL_PMC_IDX_FIXED_SLOTS;
-		/* fall through */
+		fallthrough;
 
 	case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS-1:
 		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
 		hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 +
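The /* fall through */ comment above becomes the fallthrough pseudo-keyword, which the compiler can actually verify. Roughly how it is defined (simplified sketch of include/linux/compiler_attributes.h, not part of this diff):

/*
 * Simplified sketch (not part of this diff): "fallthrough" becomes a real
 * statement attribute where the compiler supports it, so that
 * -Wimplicit-fallthrough can check every case-label drop-through.
 */
#if __has_attribute(__fallthrough__)
# define fallthrough	__attribute__((__fallthrough__))
#else
# define fallthrough	do {} while (0)	/* fallthrough */
#endif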
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index af457f8cb29d..d4569bfa83e3 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -257,7 +257,8 @@ static struct event_constraint intel_icl_event_constraints[] = {
 	INTEL_EVENT_CONSTRAINT_RANGE(0x48, 0x54, 0xf),
 	INTEL_EVENT_CONSTRAINT_RANGE(0x60, 0x8b, 0xf),
 	INTEL_UEVENT_CONSTRAINT(0x04a3, 0xff),  /* CYCLE_ACTIVITY.STALLS_TOTAL */
-	INTEL_UEVENT_CONSTRAINT(0x10a3, 0xff),  /* CYCLE_ACTIVITY.STALLS_MEM_ANY */
+	INTEL_UEVENT_CONSTRAINT(0x10a3, 0xff),  /* CYCLE_ACTIVITY.CYCLES_MEM_ANY */
+	INTEL_UEVENT_CONSTRAINT(0x14a3, 0xff),  /* CYCLE_ACTIVITY.STALLS_MEM_ANY */
 	INTEL_EVENT_CONSTRAINT(0xa3, 0xf),      /* CYCLE_ACTIVITY.* */
 	INTEL_EVENT_CONSTRAINT_RANGE(0xa8, 0xb0, 0xf),
 	INTEL_EVENT_CONSTRAINT_RANGE(0xb7, 0xbd, 0xf),
@@ -1900,6 +1901,19 @@ static __initconst const u64 tnt_hw_cache_extra_regs
 },
 };
 
+EVENT_ATTR_STR(topdown-fe-bound,	td_fe_bound_tnt,	"event=0x71,umask=0x0");
+EVENT_ATTR_STR(topdown-retiring,	td_retiring_tnt,	"event=0xc2,umask=0x0");
+EVENT_ATTR_STR(topdown-bad-spec,	td_bad_spec_tnt,	"event=0x73,umask=0x6");
+EVENT_ATTR_STR(topdown-be-bound,	td_be_bound_tnt,	"event=0x74,umask=0x0");
+
+static struct attribute *tnt_events_attrs[] = {
+	EVENT_PTR(td_fe_bound_tnt),
+	EVENT_PTR(td_retiring_tnt),
+	EVENT_PTR(td_bad_spec_tnt),
+	EVENT_PTR(td_be_bound_tnt),
+	NULL,
+};
+
 static struct extra_reg intel_tnt_extra_regs[] __read_mostly = {
 	/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
 	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x800ff0ffffff9fffull, RSP_0),
@@ -5173,6 +5187,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.lbr_pt_coexist = true;
 		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
 		x86_pmu.get_event_constraints = tnt_get_event_constraints;
+		td_attr = tnt_events_attrs;
 		extra_attr = slm_format_attr;
 		pr_cont("Tremont events, ");
 		name = "Tremont";
@@ -5442,6 +5457,7 @@ __init int intel_pmu_init(void)
 	case INTEL_FAM6_ICELAKE:
 	case INTEL_FAM6_TIGERLAKE_L:
 	case INTEL_FAM6_TIGERLAKE:
+	case INTEL_FAM6_ROCKETLAKE:
 		x86_pmu.late_ack = true;
 		memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
 		memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
@@ -5464,7 +5480,7 @@ __init int intel_pmu_init(void)
 		mem_attr = icl_events_attrs;
 		td_attr = icl_td_events_attrs;
 		tsx_attr = icl_tsx_events_attrs;
-		x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xca, .umask=0x02);
+		x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04);
 		x86_pmu.lbr_pt_coexist = true;
 		intel_pmu_pebs_data_source_skl(pmem);
 		x86_pmu.update_topdown_event = icl_update_topdown_event;
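Tremont has no fixed metrics counters, so the topdown events added above are ordinary hardware events, and their sysfs strings map straight to raw configs where the low byte is the event select and the next byte the umask. A hypothetical userspace encoding of topdown-fe-bound, for illustration only:

/*
 * Hypothetical illustration: the sysfs string "event=0x71,umask=0x0"
 * corresponds to a raw config of (umask << 8) | event.
 */
struct perf_event_attr attr = {
	.type	= PERF_TYPE_RAW,
	.size	= sizeof(struct perf_event_attr),
	.config	= (0x0 << 8) | 0x71,	/* topdown-fe-bound on Tremont */
};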
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 4eb7ee5fed72..407eee5f6f95 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -51,46 +51,46 @@
  *			       perf code: 0x02
  *			       Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
  *						SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL,
- *						TNT
+ *						TNT,RKL
  *			       Scope: Core
  * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
  *			       perf code: 0x03
  *			       Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML,
- *						ICL,TGL
+ *						ICL,TGL,RKL
  *			       Scope: Core
  * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter.
  *			       perf code: 0x00
  *			       Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL,
- *						KBL,CML,ICL,TGL,TNT
+ *						KBL,CML,ICL,TGL,TNT,RKL
 *			       Scope: Package (physical package)
 * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter.
  *			       perf code: 0x01
  *			       Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL,
- *						GLM,CNL,KBL,CML,ICL,TGL,TNT
+ *						GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL
  *			       Scope: Package (physical package)
  * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter.
  *			       perf code: 0x02
  *			       Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
  *						SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL,
- *						TNT
+ *						TNT,RKL
  *			       Scope: Package (physical package)
  * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter.
  *			       perf code: 0x03
  *			       Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL,
- *						KBL,CML,ICL,TGL
+ *						KBL,CML,ICL,TGL,RKL
  *			       Scope: Package (physical package)
  * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter.
  *			       perf code: 0x04
- *			       Available model: HSW ULT,KBL,CNL,CML,ICL,TGL
+ *			       Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL
  *			       Scope: Package (physical package)
  * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter.
  *			       perf code: 0x05
- *			       Available model: HSW ULT,KBL,CNL,CML,ICL,TGL
+ *			       Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL
  *			       Scope: Package (physical package)
  * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter.
  *			       perf code: 0x06
  *			       Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL,
- *						TNT
+ *						TNT,RKL
  *			       Scope: Package (physical package)
  *
  */
@@ -649,6 +649,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
 	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE,		&icl_cstates),
 	X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L,		&icl_cstates),
 	X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE,		&icl_cstates),
+	X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE,		&icl_cstates),
 	{ },
 };
 MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index a5c6ff5f1b4c..67dbc91bccfe 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -961,7 +961,8 @@ static void adaptive_pebs_record_size_update(void)
 
 #define PERF_PEBS_MEMINFO_TYPE	(PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC |   \
 				PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_WEIGHT | \
-				PERF_SAMPLE_TRANSACTION)
+				PERF_SAMPLE_TRANSACTION |		     \
+				PERF_SAMPLE_DATA_PAGE_SIZE)
 
 static u64 pebs_update_adaptive_cfg(struct perf_event *event)
 {
@@ -1337,6 +1338,10 @@ static u64 get_data_src(struct perf_event *event, u64 aux)
 	return val;
 }
 
+#define PERF_SAMPLE_ADDR_TYPE	(PERF_SAMPLE_ADDR |		\
+				 PERF_SAMPLE_PHYS_ADDR |	\
+				 PERF_SAMPLE_DATA_PAGE_SIZE)
+
 static void setup_pebs_fixed_sample_data(struct perf_event *event,
 				   struct pt_regs *iregs, void *__pebs,
 				   struct perf_sample_data *data,
@@ -1451,7 +1456,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
 	}
 
-	if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) &&
+	if ((sample_type & PERF_SAMPLE_ADDR_TYPE) &&
 	    x86_pmu.intel_cap.pebs_format >= 1)
 		data->addr = pebs->dla;
 
@@ -1579,7 +1584,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
 	if (sample_type & PERF_SAMPLE_DATA_SRC)
 		data->data_src.val = get_data_src(event, meminfo->aux);
 
-	if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR))
+	if (sample_type & PERF_SAMPLE_ADDR_TYPE)
 		data->addr = meminfo->address;
 
 	if (sample_type & PERF_SAMPLE_TRANSACTION)
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 1aadb253d296..21890dacfcfe 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -919,7 +919,7 @@ static __always_inline bool get_lbr_predicted(u64 info)
 	return !(info & LBR_INFO_MISPRED);
 }
 
-static __always_inline bool get_lbr_cycles(u64 info)
+static __always_inline u16 get_lbr_cycles(u64 info)
 {
 	if (static_cpu_has(X86_FEATURE_ARCH_LBR) &&
 	    !(x86_pmu.lbr_timed_lbr &&
 	      info & LBR_INFO_CYC_CNT_VALID))
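With PERF_SAMPLE_DATA_PAGE_SIZE added to PERF_PEBS_MEMINFO_TYPE, requesting the new sample flag steers adaptive PEBS into recording the memory-info group, from which setup_pebs_adaptive_sample_data() later fills data->addr. A condensed sketch of that propagation (abridged from pebs_update_adaptive_cfg(); the names are real, the function here is a hypothetical simplification):

/*
 * Condensed sketch of pebs_update_adaptive_cfg() (not the full function):
 * any meminfo-class sample bit -- now including PERF_SAMPLE_DATA_PAGE_SIZE
 * -- makes the adaptive PEBS record carry the memory-info group.
 */
static u64 pebs_data_cfg_for(u64 sample_type)
{
	u64 pebs_data_cfg = 0;

	if (sample_type & PERF_PEBS_MEMINFO_TYPE)
		pebs_data_cfg |= PEBS_DATACFG_MEMINFO;

	return pebs_data_cfg;
}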
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 80d52cbe2fde..357258f82dc8 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1636,6 +1636,11 @@ static const struct intel_uncore_init_fun tgl_l_uncore_init __initconst = {
 	.mmio_init = tgl_l_uncore_mmio_init,
 };
 
+static const struct intel_uncore_init_fun rkl_uncore_init __initconst = {
+	.cpu_init = tgl_uncore_cpu_init,
+	.pci_init = skl_uncore_pci_init,
+};
+
 static const struct intel_uncore_init_fun icx_uncore_init __initconst = {
 	.cpu_init = icx_uncore_cpu_init,
 	.pci_init = icx_uncore_pci_init,
@@ -1683,6 +1688,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
 	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,		&icx_uncore_init),
 	X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L,		&tgl_l_uncore_init),
 	X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE,		&tgl_uncore_init),
+	X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE,		&rkl_uncore_init),
 	X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D,	&snr_uncore_init),
 	{},
 };
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index bbd1120ae161..098f893e2e22 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -60,7 +60,8 @@
 #define PCI_DEVICE_ID_INTEL_TGL_U3_IMC	0x9a12
 #define PCI_DEVICE_ID_INTEL_TGL_U4_IMC	0x9a14
 #define PCI_DEVICE_ID_INTEL_TGL_H_IMC	0x9a36
-
+#define PCI_DEVICE_ID_INTEL_RKL_1_IMC	0x4c43
+#define PCI_DEVICE_ID_INTEL_RKL_2_IMC	0x4c53
 
 /* SNB event control */
 #define SNB_UNC_CTL_EV_SEL_MASK			0x000000ff
@@ -405,6 +406,12 @@ static struct intel_uncore_type *tgl_msr_uncores[] = {
 	NULL,
 };
 
+static void rkl_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+	if (box->pmu->pmu_idx == 0)
+		wrmsrl(SKL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN);
+}
+
 void tgl_uncore_cpu_init(void)
 {
 	uncore_msr_uncores = tgl_msr_uncores;
@@ -412,6 +419,7 @@ void tgl_uncore_cpu_init(void)
 	icl_uncore_cbox.ops = &skl_uncore_msr_ops;
 	icl_uncore_clockbox.ops = &skl_uncore_msr_ops;
 	snb_uncore_arb.ops = &skl_uncore_msr_ops;
+	skl_uncore_msr_ops.init_box = rkl_uncore_msr_init_box;
 }
 
 enum {
@@ -926,6 +934,14 @@ static const struct pci_device_id icl_uncore_pci_ids[] = {
 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICL_U2_IMC),
 		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
 	},
+	{ /* IMC */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RKL_1_IMC),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+	},
+	{ /* IMC */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RKL_2_IMC),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+	},
 	{ /* end: all zeroes */ },
 };
 
@@ -1019,6 +1035,8 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
 	IMC_DEV(CML_S5_IMC, &skl_uncore_pci_driver),
 	IMC_DEV(ICL_U_IMC, &icl_uncore_pci_driver),	/* 10th Gen Core Mobile */
 	IMC_DEV(ICL_U2_IMC, &icl_uncore_pci_driver),	/* 10th Gen Core Mobile */
+	IMC_DEV(RKL_1_IMC, &icl_uncore_pci_driver),
+	IMC_DEV(RKL_2_IMC, &icl_uncore_pci_driver),
 	{  /* end marker */ }
 };
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index 4be8f9cabd07..680404c58cb1 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -99,6 +99,7 @@ static bool test_intel(int idx, void *data)
 	case INTEL_FAM6_ICELAKE_D:
 	case INTEL_FAM6_TIGERLAKE_L:
 	case INTEL_FAM6_TIGERLAKE:
+	case INTEL_FAM6_ROCKETLAKE:
 		if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
 			return true;
 		break;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 6a8edfe59b09..7895cf4c59a7 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -132,7 +132,7 @@ struct amd_nb {
 	PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
 	PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
 	PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER | \
-	PERF_SAMPLE_PERIOD)
+	PERF_SAMPLE_PERIOD | PERF_SAMPLE_CODE_PAGE_SIZE)
 
 #define PEBS_GP_REGS			\
 	((1ULL << PERF_REG_X86_AX)    | \
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 547c7abb39f5..a65e9e97857f 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -864,6 +864,7 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs,
 			p->ainsn.boostable = true;
 			goto no_change;
 		}
+		break;
 	default:
 		break;
 	}
@@ -937,6 +938,11 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 		 * So clear it by resetting the current kprobe:
 		 */
 		regs->flags &= ~X86_EFLAGS_TF;
+		/*
+		 * Since the single step (trap) has been cancelled,
+		 * we need to restore BTF here.
+		 */
+		restore_btf();
 
 		/*
 		 * If the TF flag was set before the kprobe hit,
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 138bdb1fd136..a2b413394917 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -1017,6 +1017,8 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val,
 		if (uprobe_post_sstep_notifier(regs))
 			ret = NOTIFY_STOP;
 
+		break;
+
 	default:
 		break;
 	}
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 96450f6fb1de..9a38f579bc76 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1028,6 +1028,8 @@ struct perf_sample_data {
 
 	u64				phys_addr;
 	u64				cgroup;
+	u64				data_page_size;
+	u64				code_page_size;
 } ____cacheline_aligned;
 
 /* default value for data source */
@@ -1585,4 +1587,8 @@ extern void __weak arch_perf_update_userpage(struct perf_event *event,
 					     struct perf_event_mmap_page *userpg,
 					     u64 now);
 
+#ifdef CONFIG_MMU
+extern __weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr);
+#endif
+
 #endif /* _LINUX_PERF_EVENT_H */
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index e237004d498d..8fcdfa52eb4b 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -258,6 +258,61 @@ static inline pte_t ptep_get(pte_t *ptep)
 }
 #endif
 
+#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
+/*
+ * WARNING: only to be used in the get_user_pages_fast()
+ * implementation.
+ *
+ * With get_user_pages_fast(), we walk down the pagetables without taking any
+ * locks.  For this we would like to load the pointers atomically, but sometimes
+ * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE).  What
+ * we do have is the guarantee that a PTE will only either go from not present
+ * to present, or present to not present or both -- it will not switch to a
+ * completely different present page without a TLB flush in between; something
+ * that we are blocking by holding interrupts off.
+ *
+ * Setting ptes from not present to present goes:
+ *
+ *   ptep->pte_high = h;
+ *   smp_wmb();
+ *   ptep->pte_low = l;
+ *
+ * And present to not present goes:
+ *
+ *   ptep->pte_low = 0;
+ *   smp_wmb();
+ *   ptep->pte_high = 0;
+ *
+ * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'.
+ * We load pte_high *after* loading pte_low, which ensures we don't see an older
+ * value of pte_high.  *Then* we recheck pte_low, which ensures that we haven't
+ * picked up a changed pte high.  We might have gotten rubbish values from
+ * pte_low and pte_high, but we are guaranteed that pte_low will not have the
+ * present bit set *unless* it is 'l'.  Because get_user_pages_fast() only
+ * operates on present ptes we're safe.
+ */
+static inline pte_t ptep_get_lockless(pte_t *ptep)
+{
+	pte_t pte;
+
+	do {
+		pte.pte_low = ptep->pte_low;
+		smp_rmb();
+		pte.pte_high = ptep->pte_high;
+		smp_rmb();
+	} while (unlikely(pte.pte_low != ptep->pte_low));
+
+	return pte;
+}
+#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */
+/*
+ * We require that the PTE can be read atomically.
+ */
+static inline pte_t ptep_get_lockless(pte_t *ptep)
+{
+	return ptep_get(ptep);
+}
+#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
 static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
@@ -1494,4 +1549,20 @@ typedef unsigned int pgtbl_mod_mask;
 #define pmd_leaf(x)	0
 #endif
 
+#ifndef pgd_leaf_size
+#define pgd_leaf_size(x) (1ULL << PGDIR_SHIFT)
+#endif
+#ifndef p4d_leaf_size
+#define p4d_leaf_size(x) P4D_SIZE
+#endif
+#ifndef pud_leaf_size
+#define pud_leaf_size(x) PUD_SIZE
+#endif
+#ifndef pmd_leaf_size
+#define pmd_leaf_size(x) PMD_SIZE
+#endif
+#ifndef pte_leaf_size
+#define pte_leaf_size(x) PAGE_SIZE
+#endif
+
 #endif /* _LINUX_PGTABLE_H */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index b95d3c485d27..b15e3447cd9f 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -143,8 +143,10 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
 	PERF_SAMPLE_AUX				= 1U << 20,
 	PERF_SAMPLE_CGROUP			= 1U << 21,
+	PERF_SAMPLE_DATA_PAGE_SIZE		= 1U << 22,
+	PERF_SAMPLE_CODE_PAGE_SIZE		= 1U << 23,
 
-	PERF_SAMPLE_MAX = 1U << 22,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 24,		/* non-ABI */
 
 	__PERF_SAMPLE_CALLCHAIN_EARLY		= 1ULL << 63, /* non-ABI; internal use */
 };
@@ -896,6 +898,8 @@ enum perf_event_type {
 	 *	{ u64			phys_addr;} && PERF_SAMPLE_PHYS_ADDR
 	 *	{ u64			size;
 	 *	  char			data[size]; } && PERF_SAMPLE_AUX
+	 *	{ u64			data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE
+	 *	{ u64			code_page_size;} && PERF_SAMPLE_CODE_PAGE_SIZE
 	 * };
 	 */
	 PERF_RECORD_SAMPLE			= 9,
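A hedged userspace sketch of the new ABI bits: open a self-monitoring sampling event that asks for the sampled data address and both page sizes. This assumes a kernel with the uapi additions above; error handling is trimmed and the event choice is arbitrary.

/* Hypothetical userspace usage of the new sample bits (error checks trimmed). */
#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_mem_sampling_event(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_HARDWARE;
	attr.size = sizeof(attr);
	attr.config = PERF_COUNT_HW_CACHE_MISSES;
	attr.sample_period = 10000;
	/* data_page_size is derived from the sampled data address. */
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR |
			   PERF_SAMPLE_DATA_PAGE_SIZE |
			   PERF_SAMPLE_CODE_PAGE_SIZE;
	attr.disabled = 1;
	attr.exclude_kernel = 1;

	/* pid = 0 (self), cpu = -1 (any), group_fd = -1, flags = 0 */
	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}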
diff --git a/kernel/events/core.c b/kernel/events/core.c
index dc568ca295bd..19ae6c931c52 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -51,6 +51,8 @@
 #include <linux/proc_ns.h>
 #include <linux/mount.h>
 #include <linux/min_heap.h>
+#include <linux/highmem.h>
+#include <linux/pgtable.h>
 
 #include "internal.h"
 
@@ -1895,6 +1897,12 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
 	if (sample_type & PERF_SAMPLE_CGROUP)
 		size += sizeof(data->cgroup);
 
+	if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+		size += sizeof(data->data_page_size);
+
+	if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
+		size += sizeof(data->code_page_size);
+
 	event->header_size = size;
 }
 
@@ -6931,6 +6939,12 @@ void perf_output_sample(struct perf_output_handle *handle,
 	if (sample_type & PERF_SAMPLE_CGROUP)
 		perf_output_put(handle, data->cgroup);
 
+	if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+		perf_output_put(handle, data->data_page_size);
+
+	if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
+		perf_output_put(handle, data->code_page_size);
+
 	if (sample_type & PERF_SAMPLE_AUX) {
 		perf_output_put(handle, data->aux_size);
 
@@ -6988,6 +7002,93 @@ static u64 perf_virt_to_phys(u64 virt)
 	return phys_addr;
 }
 
+/*
+ * Return the pagetable size of a given virtual address.
+ */
+static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
+{
+	u64 size = 0;
+
+#ifdef CONFIG_HAVE_FAST_GUP
+	pgd_t *pgdp, pgd;
+	p4d_t *p4dp, p4d;
+	pud_t *pudp, pud;
+	pmd_t *pmdp, pmd;
+	pte_t *ptep, pte;
+
+	pgdp = pgd_offset(mm, addr);
+	pgd = READ_ONCE(*pgdp);
+	if (pgd_none(pgd))
+		return 0;
+
+	if (pgd_leaf(pgd))
+		return pgd_leaf_size(pgd);
+
+	p4dp = p4d_offset_lockless(pgdp, pgd, addr);
+	p4d = READ_ONCE(*p4dp);
+	if (!p4d_present(p4d))
+		return 0;
+
+	if (p4d_leaf(p4d))
+		return p4d_leaf_size(p4d);
+
+	pudp = pud_offset_lockless(p4dp, p4d, addr);
+	pud = READ_ONCE(*pudp);
+	if (!pud_present(pud))
+		return 0;
+
+	if (pud_leaf(pud))
+		return pud_leaf_size(pud);
+
+	pmdp = pmd_offset_lockless(pudp, pud, addr);
+	pmd = READ_ONCE(*pmdp);
+	if (!pmd_present(pmd))
+		return 0;
+
+	if (pmd_leaf(pmd))
+		return pmd_leaf_size(pmd);
+
+	ptep = pte_offset_map(&pmd, addr);
+	pte = ptep_get_lockless(ptep);
+	if (pte_present(pte))
+		size = pte_leaf_size(pte);
+	pte_unmap(ptep);
+#endif /* CONFIG_HAVE_FAST_GUP */
+
+	return size;
+}
+
+static u64 perf_get_page_size(unsigned long addr)
+{
+	struct mm_struct *mm;
+	unsigned long flags;
+	u64 size;
+
+	if (!addr)
+		return 0;
+
+	/*
+	 * Software page-table walkers must disable IRQs,
+	 * which prevents any tear down of the page tables.
+	 */
+	local_irq_save(flags);
+
+	mm = current->mm;
+	if (!mm) {
+		/*
+		 * For kernel threads and the like, use init_mm so that
+		 * we can find kernel memory.
+		 */
+		mm = &init_mm;
+	}
+
+	size = perf_get_pgtable_size(mm, addr);
+
+	local_irq_restore(flags);
+
+	return size;
+}
+
 static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
 
 struct perf_callchain_entry *
@@ -7023,7 +7124,7 @@ void perf_prepare_sample(struct perf_event_header *header,
 
 	__perf_event_header__init_id(header, data, event);
 
-	if (sample_type & PERF_SAMPLE_IP)
+	if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE))
 		data->ip = perf_instruction_pointer(regs);
 
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
@@ -7142,6 +7243,17 @@ void perf_prepare_sample(struct perf_event_header *header,
 	}
 #endif
 
+	/*
+	 * PERF_SAMPLE_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user
+	 * doesn't ask for PERF_SAMPLE_ADDR, the kernel implicitly retrieves
+	 * data->addr, but the value is not dumped to userspace.
+	 */
+	if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+		data->data_page_size = perf_get_page_size(data->addr);
+
+	if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
+		data->code_page_size = perf_get_page_size(data->ip);
+
 	if (sample_type & PERF_SAMPLE_AUX) {
 		u64 size;
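Concretely, the walk above stops at the first leaf entry it meets and reports that entry's size; an unmapped or paged-out address yields 0 rather than a guess. Hypothetical values, assuming x86_64 with 4-level paging:

/*
 * Worked illustration (hypothetical addresses, x86_64, 4-level paging):
 *
 *   perf_get_page_size(addr in a regular 4K mapping)     == 4096
 *   perf_get_page_size(addr in a 2M THP/hugetlb mapping) == SZ_2M  (PMD leaf)
 *   perf_get_page_size(addr in a 1G hugetlb mapping)     == SZ_1G  (PUD leaf)
 *   perf_get_page_size(unmapped or paged-out addr)       == 0
 */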
@@ -11720,24 +11832,6 @@ SYSCALL_DEFINE5(perf_event_open,
 		goto err_task;
 	}
 
-	if (task) {
-		err = mutex_lock_interruptible(&task->signal->exec_update_mutex);
-		if (err)
-			goto err_task;
-
-		/*
-		 * Preserve ptrace permission check for backwards compatibility.
-		 *
-		 * We must hold exec_update_mutex across this and any potential
-		 * perf_install_in_context() call for this new event to
-		 * serialize against exec() altering our credentials (and the
-		 * perf_event_exit_task() that could imply).
-		 */
-		err = -EACCES;
-		if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
-			goto err_cred;
-	}
-
 	if (flags & PERF_FLAG_PID_CGROUP)
 		cgroup_fd = pid;
 
@@ -11745,7 +11839,7 @@ SYSCALL_DEFINE5(perf_event_open,
 				 NULL, NULL, cgroup_fd);
 	if (IS_ERR(event)) {
 		err = PTR_ERR(event);
-		goto err_cred;
+		goto err_task;
 	}
 
 	if (is_sampling_event(event)) {
@@ -11864,6 +11958,24 @@ SYSCALL_DEFINE5(perf_event_open,
 		goto err_context;
 	}
 
+	if (task) {
+		err = mutex_lock_interruptible(&task->signal->exec_update_mutex);
+		if (err)
+			goto err_file;
+
+		/*
+		 * Preserve ptrace permission check for backwards compatibility.
+		 *
+		 * We must hold exec_update_mutex across this and any potential
+		 * perf_install_in_context() call for this new event to
+		 * serialize against exec() altering our credentials (and the
+		 * perf_event_exit_task() that could imply).
+		 */
+		err = -EACCES;
+		if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
+			goto err_cred;
+	}
+
 	if (move_group) {
 		gctx = __perf_event_ctx_lock_double(group_leader, ctx);
 
@@ -12039,7 +12151,10 @@ err_locked:
 	if (move_group)
 		perf_event_ctx_unlock(group_leader, gctx);
 	mutex_unlock(&ctx->mutex);
-/* err_file: */
+err_cred:
+	if (task)
+		mutex_unlock(&task->signal->exec_update_mutex);
+err_file:
 	fput(event_file);
 err_context:
 	perf_unpin_context(ctx);
@@ -12051,9 +12166,6 @@ err_alloc:
 	 */
 	if (!event_file)
 		free_event(event);
-err_cred:
-	if (task)
-		mutex_unlock(&task->signal->exec_update_mutex);
 err_task:
 	if (task)
 		put_task_struct(task);
diff --git a/mm/gup.c b/mm/gup.c
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2085,62 +2085,6 @@ static void put_compound_head(struct page *page, int refs, unsigned int flags)
 	put_page(page);
 }
 
-#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
-
-/*
- * WARNING: only to be used in the get_user_pages_fast()
- * implementation.
- *
- * With get_user_pages_fast(), we walk down the pagetables without taking any
- * locks.  For this we would like to load the pointers atomically, but sometimes
- * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE).  What
- * we do have is the guarantee that a PTE will only either go from not present
- * to present, or present to not present or both -- it will not switch to a
- * completely different present page without a TLB flush in between; something
- * that we are blocking by holding interrupts off.
- *
- * Setting ptes from not present to present goes:
- *
- *   ptep->pte_high = h;
- *   smp_wmb();
- *   ptep->pte_low = l;
- *
- * And present to not present goes:
- *
- *   ptep->pte_low = 0;
- *   smp_wmb();
- *   ptep->pte_high = 0;
- *
- * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'.
- * We load pte_high *after* loading pte_low, which ensures we don't see an older
- * value of pte_high.  *Then* we recheck pte_low, which ensures that we haven't
- * picked up a changed pte high.  We might have gotten rubbish values from
- * pte_low and pte_high, but we are guaranteed that pte_low will not have the
- * present bit set *unless* it is 'l'.  Because get_user_pages_fast() only
- * operates on present ptes we're safe.
- */
-static inline pte_t gup_get_pte(pte_t *ptep)
-{
-	pte_t pte;
-
-	do {
-		pte.pte_low = ptep->pte_low;
-		smp_rmb();
-		pte.pte_high = ptep->pte_high;
-		smp_rmb();
-	} while (unlikely(pte.pte_low != ptep->pte_low));
-
-	return pte;
-}
-#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */
-/*
- * We require that the PTE can be read atomically.
- */
-static inline pte_t gup_get_pte(pte_t *ptep)
-{
-	return ptep_get(ptep);
-}
-#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
-
 static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
 					    unsigned int flags,
 					    struct page **pages)
@@ -2166,7 +2110,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 
 	ptem = ptep = pte_offset_map(&pmd, addr);
 	do {
-		pte_t pte = ptep_get_lockless(ptep);
+		pte_t pte = ptep_get_lockless(ptep);
 		struct page *head, *page;
 
 		/*
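With gup_get_pte() hoisted into include/linux/pgtable.h as ptep_get_lockless(), any IRQs-off page-table walker can share the tear-free PTE read instead of duplicating it. A minimal hypothetical consumer following the same rules as GUP-fast (illustrative only; the helper name is invented):

/*
 * Hypothetical ptep_get_lockless() consumer (illustrative only).
 * The GUP-fast rules apply: IRQs must be off so a concurrent
 * munmap()/TLB flush cannot free the page table under us.
 */
static bool pte_mapped_lockless(pte_t *ptep)
{
	unsigned long flags;
	bool present;

	local_irq_save(flags);
	present = pte_present(ptep_get_lockless(ptep));	/* tear-free even on x86_32 PAE */
	local_irq_restore(flags);

	return present;
}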