diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-07-16 17:13:31 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-07-16 17:13:31 -0700 |
commit | 576a997c6315ee482519e7cc080f341b07638808 (patch) | |
tree | 625cff628b8db0638ed496f560d38eda0375cacb /kernel | |
parent | 4a996d90b9e046c6d59845acf00a54d464c34ff3 (diff) | |
parent | fa0c1c9d283b37fdb7fc1dcccbb88fc8f48a4aa4 (diff) |
Merge tag 'perf-core-2024-07-16' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull performance events updates from Ingo Molnar:
- Intel PT support enhancements & fixes
- Fix leaked SIGTRAP events
- Improve and fix the Intel uncore driver
- Add support for Intel HBM and CXL uncore counters
- Add Intel Lake and Arrow Lake support
- AMD uncore driver fixes
- Make SIGTRAP and __perf_pending_irq() work on RT
- Micro-optimizations
- Misc cleanups and fixes
* tag 'perf-core-2024-07-16' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (44 commits)
perf/x86/intel: Add a distinct name for Granite Rapids
perf/x86/intel/ds: Fix non 0 retire latency on Raptorlake
perf/x86/intel: Hide Topdown metrics events if the feature is not enumerated
perf/x86/intel/uncore: Fix the bits of the CHA extended umask for SPR
perf: Split __perf_pending_irq() out of perf_pending_irq()
perf: Don't disable preemption in perf_pending_task().
perf: Move swevent_htable::recursion into task_struct.
perf: Shrink the size of the recursion counter.
perf: Enqueue SIGTRAP always via task_work.
task_work: Add TWA_NMI_CURRENT as an additional notify mode.
perf: Move irq_work_queue() where the event is prepared.
perf: Fix event leak upon exec and file release
perf: Fix event leak upon exit
task_work: Introduce task_work_cancel() again
task_work: s/task_work_cancel()/task_work_cancel_func()/
perf/x86/amd/uncore: Fix DF and UMC domain identification
perf/x86/amd/uncore: Avoid PMU registration if counters are unavailable
perf/x86/intel: Support Perfmon MSRs aliasing
perf/x86/intel: Support PERFEVTSEL extension
perf/x86: Add config_mask to represent EVENTSEL bitmask
...
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/events/callchain.c | 2 | ||||
-rw-r--r-- | kernel/events/core.c | 125 | ||||
-rw-r--r-- | kernel/events/internal.h | 6 | ||||
-rw-r--r-- | kernel/events/ring_buffer.c | 7 | ||||
-rw-r--r-- | kernel/irq/manage.c | 2 | ||||
-rw-r--r-- | kernel/task_work.c | 58 |
6 files changed, 136 insertions, 64 deletions
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 1273be84392c..ad57944b6c40 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -29,7 +29,7 @@ static inline size_t perf_callchain_entry__sizeof(void) sysctl_perf_event_max_contexts_per_stack)); } -static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); +static DEFINE_PER_CPU(u8, callchain_recursion[PERF_NR_CONTEXTS]); static atomic_t nr_callchain_events; static DEFINE_MUTEX(callchain_mutex); static struct callchain_cpus_entries *callchain_cpus_entries; diff --git a/kernel/events/core.c b/kernel/events/core.c index 8f908f077935..ab6c4c942f79 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2283,21 +2283,6 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx) state = PERF_EVENT_STATE_OFF; } - if (event->pending_sigtrap) { - bool dec = true; - - event->pending_sigtrap = 0; - if (state != PERF_EVENT_STATE_OFF && - !event->pending_work) { - event->pending_work = 1; - dec = false; - WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount)); - task_work_add(current, &event->pending_task, TWA_RESUME); - } - if (dec) - local_dec(&event->ctx->nr_pending); - } - perf_event_set_state(event, state); if (!is_software_event(event)) @@ -2466,7 +2451,7 @@ static void __perf_event_disable(struct perf_event *event, * hold the top-level event's child_mutex, so any descendant that * goes to exit will block in perf_event_exit_event(). * - * When called from perf_pending_irq it's OK because event->ctx + * When called from perf_pending_disable it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. */ @@ -2506,7 +2491,7 @@ EXPORT_SYMBOL_GPL(perf_event_disable); void perf_event_disable_inatomic(struct perf_event *event) { event->pending_disable = 1; - irq_work_queue(&event->pending_irq); + irq_work_queue(&event->pending_disable_irq); } #define MAX_INTERRUPTS (~0ULL) @@ -5206,9 +5191,35 @@ static bool exclusive_event_installable(struct perf_event *event, static void perf_addr_filters_splice(struct perf_event *event, struct list_head *head); +static void perf_pending_task_sync(struct perf_event *event) +{ + struct callback_head *head = &event->pending_task; + + if (!event->pending_work) + return; + /* + * If the task is queued to the current task's queue, we + * obviously can't wait for it to complete. Simply cancel it. + */ + if (task_work_cancel(current, head)) { + event->pending_work = 0; + local_dec(&event->ctx->nr_pending); + return; + } + + /* + * All accesses related to the event are within the same RCU section in + * perf_pending_task(). The RCU grace period before the event is freed + * will make sure all those accesses are complete by then. + */ + rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE); +} + static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending_irq); + irq_work_sync(&event->pending_disable_irq); + perf_pending_task_sync(event); unaccount_event(event); @@ -6509,6 +6520,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; nr_pages = vma_size / PAGE_SIZE; + if (nr_pages > INT_MAX) + return -ENOMEM; mutex_lock(&event->mmap_mutex); ret = -EINVAL; @@ -6750,7 +6763,7 @@ static void perf_sigtrap(struct perf_event *event) /* * Deliver the pending work in-event-context or follow the context. */ -static void __perf_pending_irq(struct perf_event *event) +static void __perf_pending_disable(struct perf_event *event) { int cpu = READ_ONCE(event->oncpu); @@ -6765,11 +6778,6 @@ static void __perf_pending_irq(struct perf_event *event) * Yay, we hit home and are in the context of the event. */ if (cpu == smp_processor_id()) { - if (event->pending_sigtrap) { - event->pending_sigtrap = 0; - perf_sigtrap(event); - local_dec(&event->ctx->nr_pending); - } if (event->pending_disable) { event->pending_disable = 0; perf_event_disable_local(event); @@ -6793,11 +6801,26 @@ static void __perf_pending_irq(struct perf_event *event) * irq_work_queue(); // FAILS * * irq_work_run() - * perf_pending_irq() + * perf_pending_disable() * * But the event runs on CPU-B and wants disabling there. */ - irq_work_queue_on(&event->pending_irq, cpu); + irq_work_queue_on(&event->pending_disable_irq, cpu); +} + +static void perf_pending_disable(struct irq_work *entry) +{ + struct perf_event *event = container_of(entry, struct perf_event, pending_disable_irq); + int rctx; + + /* + * If we 'fail' here, that's OK, it means recursion is already disabled + * and we won't recurse 'further'. + */ + rctx = perf_swevent_get_recursion_context(); + __perf_pending_disable(event); + if (rctx >= 0) + perf_swevent_put_recursion_context(rctx); } static void perf_pending_irq(struct irq_work *entry) @@ -6820,8 +6843,6 @@ static void perf_pending_irq(struct irq_work *entry) perf_event_wakeup(event); } - __perf_pending_irq(event); - if (rctx >= 0) perf_swevent_put_recursion_context(rctx); } @@ -6832,23 +6853,27 @@ static void perf_pending_task(struct callback_head *head) int rctx; /* + * All accesses to the event must belong to the same implicit RCU read-side + * critical section as the ->pending_work reset. See comment in + * perf_pending_task_sync(). + */ + rcu_read_lock(); + /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. */ - preempt_disable_notrace(); rctx = perf_swevent_get_recursion_context(); if (event->pending_work) { event->pending_work = 0; perf_sigtrap(event); local_dec(&event->ctx->nr_pending); + rcuwait_wake_up(&event->pending_work_wait); } + rcu_read_unlock(); if (rctx >= 0) perf_swevent_put_recursion_context(rctx); - preempt_enable_notrace(); - - put_event(event); } #ifdef CONFIG_GUEST_PERF_EVENTS @@ -9706,16 +9731,26 @@ static int __perf_event_overflow(struct perf_event *event, */ bool valid_sample = sample_is_allowed(event, regs); unsigned int pending_id = 1; + enum task_work_notify_mode notify_mode; if (regs) pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1; - if (!event->pending_sigtrap) { - event->pending_sigtrap = pending_id; + + notify_mode = in_nmi() ? TWA_NMI_CURRENT : TWA_RESUME; + + if (!event->pending_work && + !task_work_add(current, &event->pending_task, notify_mode)) { + event->pending_work = pending_id; local_inc(&event->ctx->nr_pending); + + event->pending_addr = 0; + if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) + event->pending_addr = data->addr; + } else if (event->attr.exclude_kernel && valid_sample) { /* * Should not be able to return to user space without - * consuming pending_sigtrap; with exceptions: + * consuming pending_work; with exceptions: * * 1. Where !exclude_kernel, events can overflow again * in the kernel without returning to user space. @@ -9725,13 +9760,8 @@ static int __perf_event_overflow(struct perf_event *event, * To approximate progress (with false negatives), * check 32-bit hash of the current IP. */ - WARN_ON_ONCE(event->pending_sigtrap != pending_id); + WARN_ON_ONCE(event->pending_work != pending_id); } - - event->pending_addr = 0; - if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) - event->pending_addr = data->addr; - irq_work_queue(&event->pending_irq); } READ_ONCE(event->overflow_handler)(event, data, regs); @@ -9759,11 +9789,7 @@ struct swevent_htable { struct swevent_hlist *swevent_hlist; struct mutex hlist_mutex; int hlist_refcount; - - /* Recursion avoidance in each contexts */ - int recursion[PERF_NR_CONTEXTS]; }; - static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); /* @@ -9961,17 +9987,13 @@ DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]); int perf_swevent_get_recursion_context(void) { - struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); - - return get_recursion_context(swhash->recursion); + return get_recursion_context(current->perf_recursion); } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); void perf_swevent_put_recursion_context(int rctx) { - struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); - - put_recursion_context(swhash->recursion, rctx); + put_recursion_context(current->perf_recursion, rctx); } void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) @@ -11961,7 +11983,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, init_waitqueue_head(&event->waitq); init_irq_work(&event->pending_irq, perf_pending_irq); + event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable); init_task_work(&event->pending_task, perf_pending_task); + rcuwait_init(&event->pending_work_wait); mutex_init(&event->mmap_mutex); raw_spin_lock_init(&event->addr_filters.lock); @@ -13637,6 +13661,7 @@ int perf_event_init_task(struct task_struct *child, u64 clone_flags) { int ret; + memset(child->perf_recursion, 0, sizeof(child->perf_recursion)); child->perf_event_ctxp = NULL; mutex_init(&child->perf_event_mutex); INIT_LIST_HEAD(&child->perf_event_list); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 5150d5f84c03..451514442a1b 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -128,7 +128,7 @@ static inline unsigned long perf_data_size(struct perf_buffer *rb) static inline unsigned long perf_aux_size(struct perf_buffer *rb) { - return rb->aux_nr_pages << PAGE_SHIFT; + return (unsigned long)rb->aux_nr_pages << PAGE_SHIFT; } #define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...) \ @@ -208,7 +208,7 @@ arch_perf_out_copy_user(void *dst, const void *src, unsigned long n) DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) -static inline int get_recursion_context(int *recursion) +static inline int get_recursion_context(u8 *recursion) { unsigned char rctx = interrupt_context_level(); @@ -221,7 +221,7 @@ static inline int get_recursion_context(int *recursion) return rctx; } -static inline void put_recursion_context(int *recursion, int rctx) +static inline void put_recursion_context(u8 *recursion, unsigned char rctx) { barrier(); recursion[rctx]--; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 4013408ce012..8cadf97bc290 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -682,13 +682,18 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, if (!has_aux(event)) return -EOPNOTSUPP; + if (nr_pages <= 0) + return -EINVAL; + if (!overwrite) { /* * Watermark defaults to half the buffer, and so does the * max_order, to aid PMU drivers in double buffering. */ if (!watermark) - watermark = nr_pages << (PAGE_SHIFT - 1); + watermark = min_t(unsigned long, + U32_MAX, + (unsigned long)nr_pages << (PAGE_SHIFT - 1)); /* * Use aux_watermark as the basis for chunking to diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 71b0fc2d0aea..dd53298ef1a5 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1337,7 +1337,7 @@ static int irq_thread(void *data) * synchronize_hardirq(). So neither IRQTF_RUNTHREAD nor the * oneshot mask bit can be set. */ - task_work_cancel(current, irq_thread_dtor); + task_work_cancel_func(current, irq_thread_dtor); return 0; } diff --git a/kernel/task_work.c b/kernel/task_work.c index 95a7e1b7f1da..5c2daa7ad3f9 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -1,10 +1,18 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/irq_work.h> #include <linux/spinlock.h> #include <linux/task_work.h> #include <linux/resume_user_mode.h> static struct callback_head work_exited; /* all we need is ->next == NULL */ +static void task_work_set_notify_irq(struct irq_work *entry) +{ + test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); +} +static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) = + IRQ_WORK_INIT_HARD(task_work_set_notify_irq); + /** * task_work_add - ask the @task to execute @work->func() * @task: the task which should run the callback @@ -12,7 +20,7 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ * @notify: how to notify the targeted task * * Queue @work for task_work_run() below and notify the @task if @notify - * is @TWA_RESUME, @TWA_SIGNAL, or @TWA_SIGNAL_NO_IPI. + * is @TWA_RESUME, @TWA_SIGNAL, @TWA_SIGNAL_NO_IPI or @TWA_NMI_CURRENT. * * @TWA_SIGNAL works like signals, in that the it will interrupt the targeted * task and run the task_work, regardless of whether the task is currently @@ -24,6 +32,8 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ * kernel anyway. * @TWA_RESUME work is run only when the task exits the kernel and returns to * user mode, or before entering guest mode. + * @TWA_NMI_CURRENT works like @TWA_RESUME, except it can only be used for the + * current @task and if the current context is NMI. * * Fails if the @task is exiting/exited and thus it can't process this @work. * Otherwise @work->func() will be called when the @task goes through one of @@ -44,8 +54,13 @@ int task_work_add(struct task_struct *task, struct callback_head *work, { struct callback_head *head; - /* record the work call stack in order to print it in KASAN reports */ - kasan_record_aux_stack(work); + if (notify == TWA_NMI_CURRENT) { + if (WARN_ON_ONCE(task != current)) + return -EINVAL; + } else { + /* record the work call stack in order to print it in KASAN reports */ + kasan_record_aux_stack(work); + } head = READ_ONCE(task->task_works); do { @@ -66,6 +81,9 @@ int task_work_add(struct task_struct *task, struct callback_head *work, case TWA_SIGNAL_NO_IPI: __set_notify_signal(task); break; + case TWA_NMI_CURRENT: + irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume)); + break; default: WARN_ON_ONCE(1); break; @@ -120,9 +138,9 @@ static bool task_work_func_match(struct callback_head *cb, void *data) } /** - * task_work_cancel - cancel a pending work added by task_work_add() - * @task: the task which should execute the work - * @func: identifies the work to remove + * task_work_cancel_func - cancel a pending work matching a function added by task_work_add() + * @task: the task which should execute the func's work + * @func: identifies the func to match with a work to remove * * Find the last queued pending work with ->func == @func and remove * it from queue. @@ -131,11 +149,35 @@ static bool task_work_func_match(struct callback_head *cb, void *data) * The found work or NULL if not found. */ struct callback_head * -task_work_cancel(struct task_struct *task, task_work_func_t func) +task_work_cancel_func(struct task_struct *task, task_work_func_t func) { return task_work_cancel_match(task, task_work_func_match, func); } +static bool task_work_match(struct callback_head *cb, void *data) +{ + return cb == data; +} + +/** + * task_work_cancel - cancel a pending work added by task_work_add() + * @task: the task which should execute the work + * @cb: the callback to remove if queued + * + * Remove a callback from a task's queue if queued. + * + * RETURNS: + * True if the callback was queued and got cancelled, false otherwise. + */ +bool task_work_cancel(struct task_struct *task, struct callback_head *cb) +{ + struct callback_head *ret; + + ret = task_work_cancel_match(task, task_work_match, cb); + + return ret == cb; +} + /** * task_work_run - execute the works added by task_work_add() * @@ -168,7 +210,7 @@ void task_work_run(void) if (!work) break; /* - * Synchronize with task_work_cancel(). It can not remove + * Synchronize with task_work_cancel_match(). It can not remove * the first entry == work, cmpxchg(task_works) must fail. * But it can remove another entry from the ->next list. */ |