From 79bd233010859463e46a0a4b3926eaaba25a6110 Mon Sep 17 00:00:00 2001 From: Ben Gainey Date: Tue, 30 Jul 2024 09:44:14 +0100 Subject: perf: Rename perf_event_context.nr_pending to nr_no_switch_fast. nr_pending counts the number of events in the context that either pending_sigtrap or pending_work, but it is used to prevent taking the fast path in perf_event_context_sched_out. Renamed to reflect what it is used for, rather than what it counts. This change allows using the field to track other event properties that also require skipping the fast path without possible confusion over the name. Signed-off-by: Ben Gainey Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20240730084417.7693-2-ben.gainey@arm.com --- kernel/events/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index aa3450bdc227..e6cc354a3cee 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3516,9 +3516,9 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) perf_ctx_disable(ctx, false); - /* PMIs are disabled; ctx->nr_pending is stable. */ - if (local_read(&ctx->nr_pending) || - local_read(&next_ctx->nr_pending)) { + /* PMIs are disabled; ctx->nr_no_switch_fast is stable. */ + if (local_read(&ctx->nr_no_switch_fast) || + local_read(&next_ctx->nr_no_switch_fast)) { /* * Must not swap out ctx when there's pending * events that rely on the ctx->task relation. @@ -5204,7 +5204,7 @@ static void perf_pending_task_sync(struct perf_event *event) */ if (task_work_cancel(current, head)) { event->pending_work = 0; - local_dec(&event->ctx->nr_pending); + local_dec(&event->ctx->nr_no_switch_fast); return; } @@ -6868,7 +6868,7 @@ static void perf_pending_task(struct callback_head *head) if (event->pending_work) { event->pending_work = 0; perf_sigtrap(event); - local_dec(&event->ctx->nr_pending); + local_dec(&event->ctx->nr_no_switch_fast); rcuwait_wake_up(&event->pending_work_wait); } rcu_read_unlock(); @@ -9740,7 +9740,7 @@ static int __perf_event_overflow(struct perf_event *event, if (!event->pending_work && !task_work_add(current, &event->pending_task, notify_mode)) { event->pending_work = pending_id; - local_inc(&event->ctx->nr_pending); + local_inc(&event->ctx->nr_no_switch_fast); event->pending_addr = 0; if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) -- cgit v1.2.3-58-ga151 From 7e8b255650fcfa1d05a57e4093d8405e6d8dd488 Mon Sep 17 00:00:00 2001 From: Ben Gainey Date: Tue, 30 Jul 2024 09:44:15 +0100 Subject: perf: Support PERF_SAMPLE_READ with inherit This change allows events to use PERF_SAMPLE_READ with inherit so long as PERF_SAMPLE_TID is also set. This enables sample based profiling of a group of counters over a hierarchy of processes or threads. This is useful, for example, for collecting per-thread counters/metrics, event based sampling of multiple counters as a unit, access to the enabled and running time when using multiplexing and so on. Prior to this, users were restricted to either collecting aggregate statistics for a multi-threaded/-process application (e.g. with "perf stat"), or to sample individual threads, or to profile the entire system (which requires root or CAP_PERFMON, and may produce much more data than is required). Theoretically a tool could poll for or otherwise monitor thread/process creation and construct whatever events the user is interested in using perf_event_open, for each new thread or process, but this is racy, can lead to file-descriptor exhaustion, and ultimately just replicates the behaviour of inherit, but in userspace. This configuration differs from inherit without PERF_SAMPLE_READ in that the accumulated event count, and consequently any sample (such as if triggered by overflow of sample_period) will be on a per-thread rather than on an aggregate basis. The meaning of read_format::value field of both PERF_RECORD_READ and PERF_RECORD_SAMPLE is changed such that if the sampled event uses this new configuration then the values reported will be per-thread rather than the global aggregate value. This is a change from the existing semantics of read_format (where PERF_SAMPLE_READ is used without inherit), but it is necessary to expose the per-thread counter values, and it avoids reinventing a separate "read_format_thread" field that otherwise replicates the same behaviour. This change should not break existing tools, since this configuration was not previously valid and was rejected by the kernel. Tools that opt into this new mode will need to account for this when calculating the counter delta for a given sample. Tools that wish to have both the per-thread and aggregate value can perform the global aggregation themselves from the per-thread values. The change to read_format::value does not affect existing valid perf_event_attr configurations, nor does it change the behaviour of calls to "read" on an event descriptor. Both continue to report the aggregate value for the entire thread/process hierarchy. The difference between the results reported by "read" and PERF_RECORD_SAMPLE in this new configuration is justified on the basis that it is not (easily) possible for "read" to target a specific thread (the caller only has the fd for the original parent event). Signed-off-by: Ben Gainey Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20240730084417.7693-3-ben.gainey@arm.com --- include/linux/perf_event.h | 3 +++ kernel/events/core.c | 55 ++++++++++++++++++++++++++++++++++------------ 2 files changed, 44 insertions(+), 14 deletions(-) (limited to 'kernel/events') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 655f66b18418..701549967c18 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -969,6 +969,9 @@ struct perf_event_context { * The count of events for which using the switch-out fast path * should be avoided. * + * Sum (event->pending_work + events with + * (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))) + * * The SIGTRAP is targeted at ctx->task, as such it won't do changing * that until the signal is delivered. */ diff --git a/kernel/events/core.c b/kernel/events/core.c index e6cc354a3cee..c01a32687dad 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1767,6 +1767,14 @@ perf_event_groups_next(struct perf_event *event, struct pmu *pmu) event = rb_entry_safe(rb_next(&event->group_node), \ typeof(*event), group_node)) +/* + * Does the event attribute request inherit with PERF_SAMPLE_READ + */ +static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr) +{ + return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ); +} + /* * Add an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. @@ -1797,6 +1805,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) ctx->nr_user++; if (event->attr.inherit_stat) ctx->nr_stat++; + if (has_inherit_and_sample_read(&event->attr)) + local_inc(&ctx->nr_no_switch_fast); if (event->state > PERF_EVENT_STATE_OFF) perf_cgroup_event_enable(event, ctx); @@ -2021,6 +2031,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) ctx->nr_user--; if (event->attr.inherit_stat) ctx->nr_stat--; + if (has_inherit_and_sample_read(&event->attr)) + local_dec(&ctx->nr_no_switch_fast); list_del_rcu(&event->event_entry); @@ -3522,6 +3534,11 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) /* * Must not swap out ctx when there's pending * events that rely on the ctx->task relation. + * + * Likewise, when a context contains inherit + + * SAMPLE_READ events they should be switched + * out using the slow path so that they are + * treated as if they were distinct contexts. */ raw_spin_unlock(&next_ctx->lock); rcu_read_unlock(); @@ -4538,8 +4555,11 @@ unlock: raw_spin_unlock(&ctx->lock); } -static inline u64 perf_event_count(struct perf_event *event) +static inline u64 perf_event_count(struct perf_event *event, bool self) { + if (self) + return local64_read(&event->count); + return local64_read(&event->count) + atomic64_read(&event->child_count); } @@ -5498,7 +5518,7 @@ static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 * mutex_lock(&event->child_mutex); (void)perf_event_read(event, false); - total += perf_event_count(event); + total += perf_event_count(event, false); *enabled += event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); @@ -5507,7 +5527,7 @@ static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 * list_for_each_entry(child, &event->child_list, child_list) { (void)perf_event_read(child, false); - total += perf_event_count(child); + total += perf_event_count(child, false); *enabled += child->total_time_enabled; *running += child->total_time_running; } @@ -5589,14 +5609,14 @@ static int __perf_read_group_add(struct perf_event *leader, /* * Write {count,id} tuples for every sibling. */ - values[n++] += perf_event_count(leader); + values[n++] += perf_event_count(leader, false); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&leader->lost_samples); for_each_sibling_event(sub, leader) { - values[n++] += perf_event_count(sub); + values[n++] += perf_event_count(sub, false); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); if (read_format & PERF_FORMAT_LOST) @@ -6176,7 +6196,7 @@ void perf_event_update_userpage(struct perf_event *event) ++userpg->lock; barrier(); userpg->index = perf_event_index(event); - userpg->offset = perf_event_count(event); + userpg->offset = perf_event_count(event, false); if (userpg->index) userpg->offset -= local64_read(&event->hw.prev_count); @@ -7250,7 +7270,7 @@ static void perf_output_read_one(struct perf_output_handle *handle, u64 values[5]; int n = 0; - values[n++] = perf_event_count(event); + values[n++] = perf_event_count(event, has_inherit_and_sample_read(&event->attr)); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { values[n++] = enabled + atomic64_read(&event->child_total_time_enabled); @@ -7268,14 +7288,15 @@ static void perf_output_read_one(struct perf_output_handle *handle, } static void perf_output_read_group(struct perf_output_handle *handle, - struct perf_event *event, - u64 enabled, u64 running) + struct perf_event *event, + u64 enabled, u64 running) { struct perf_event *leader = event->group_leader, *sub; u64 read_format = event->attr.read_format; unsigned long flags; u64 values[6]; int n = 0; + bool self = has_inherit_and_sample_read(&event->attr); /* * Disabling interrupts avoids all counter scheduling @@ -7295,7 +7316,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, (leader->state == PERF_EVENT_STATE_ACTIVE)) leader->pmu->read(leader); - values[n++] = perf_event_count(leader); + values[n++] = perf_event_count(leader, self); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); if (read_format & PERF_FORMAT_LOST) @@ -7310,7 +7331,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, (sub->state == PERF_EVENT_STATE_ACTIVE)) sub->pmu->read(sub); - values[n++] = perf_event_count(sub); + values[n++] = perf_event_count(sub, self); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); if (read_format & PERF_FORMAT_LOST) @@ -7331,6 +7352,10 @@ static void perf_output_read_group(struct perf_output_handle *handle, * The problem is that its both hard and excessively expensive to iterate the * child list, not to mention that its impossible to IPI the children running * on another CPU, from interrupt/NMI context. + * + * Instead the combination of PERF_SAMPLE_READ and inherit will track per-thread + * counts rather than attempting to accumulate some value across all children on + * all cores. */ static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) @@ -12057,10 +12082,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, local64_set(&hwc->period_left, hwc->sample_period); /* - * We currently do not support PERF_SAMPLE_READ on inherited events. + * We do not support PERF_SAMPLE_READ on inherited events unless + * PERF_SAMPLE_TID is also selected, which allows inherited events to + * collect per-thread samples. * See perf_output_read(). */ - if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ)) + if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID)) goto err_ns; if (!has_branch_stack(event)) @@ -13084,7 +13111,7 @@ static void sync_child_event(struct perf_event *child_event) perf_event_read_event(child_event, task); } - child_val = perf_event_count(child_event); + child_val = perf_event_count(child_event, false); /* * Add back the child's count to the parent's count: -- cgit v1.2.3-58-ga151 From cfa7f3d2c526c224a6271cc78a4a27a0de06f4f0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 29 Jul 2024 10:52:23 -0700 Subject: perf,x86: avoid missing caller address in stack traces captured in uprobe When tracing user functions with uprobe functionality, it's common to install the probe (e.g., a BPF program) at the first instruction of the function. This is often going to be `push %rbp` instruction in function preamble, which means that within that function frame pointer hasn't been established yet. This leads to consistently missing an actual caller of the traced function, because perf_callchain_user() only records current IP (capturing traced function) and then following frame pointer chain (which would be caller's frame, containing the address of caller's caller). So when we have target_1 -> target_2 -> target_3 call chain and we are tracing an entry to target_3, captured stack trace will report target_1 -> target_3 call chain, which is wrong and confusing. This patch proposes a x86-64-specific heuristic to detect `push %rbp` (`push %ebp` on 32-bit architecture) instruction being traced. Given entire kernel implementation of user space stack trace capturing works under assumption that user space code was compiled with frame pointer register (%rbp/%ebp) preservation, it seems pretty reasonable to use this instruction as a strong indicator that this is the entry to the function. In that case, return address is still pointed to by %rsp/%esp, so we fetch it and add to stack trace before proceeding to unwind the rest using frame pointer-based logic. We also check for `endbr64` (for 64-bit modes) as another common pattern for function entry, as suggested by Josh Poimboeuf. Even if we get this wrong sometimes for uprobes attached not at the function entry, it's OK because stack trace will still be overall meaningful, just with one extra bogus entry. If we don't detect this, we end up with guaranteed to be missing caller function entry in the stack trace, which is worse overall. Signed-off-by: Andrii Nakryiko Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20240729175223.23914-1-andrii@kernel.org --- arch/x86/events/core.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/uprobes.h | 2 ++ kernel/events/uprobes.c | 2 ++ 3 files changed, 67 insertions(+) (limited to 'kernel/events') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 12f2a0c14d33..e6d133dda70f 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -41,6 +41,8 @@ #include #include #include +#include +#include #include "perf_event.h" @@ -2814,6 +2816,46 @@ static unsigned long get_segment_base(unsigned int segment) return get_desc_base(desc); } +#ifdef CONFIG_UPROBES +/* + * Heuristic-based check if uprobe is installed at the function entry. + * + * Under assumption of user code being compiled with frame pointers, + * `push %rbp/%ebp` is a good indicator that we indeed are. + * + * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern. + * If we get this wrong, captured stack trace might have one extra bogus + * entry, but the rest of stack trace will still be meaningful. + */ +static bool is_uprobe_at_func_entry(struct pt_regs *regs) +{ + struct arch_uprobe *auprobe; + + if (!current->utask) + return false; + + auprobe = current->utask->auprobe; + if (!auprobe) + return false; + + /* push %rbp/%ebp */ + if (auprobe->insn[0] == 0x55) + return true; + + /* endbr64 (64-bit only) */ + if (user_64bit_mode(regs) && is_endbr(*(u32 *)auprobe->insn)) + return true; + + return false; +} + +#else +static bool is_uprobe_at_func_entry(struct pt_regs *regs) +{ + return false; +} +#endif /* CONFIG_UPROBES */ + #ifdef CONFIG_IA32_EMULATION #include @@ -2825,6 +2867,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent unsigned long ss_base, cs_base; struct stack_frame_ia32 frame; const struct stack_frame_ia32 __user *fp; + u32 ret_addr; if (user_64bit_mode(regs)) return 0; @@ -2834,6 +2877,12 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent fp = compat_ptr(ss_base + regs->bp); pagefault_disable(); + + /* see perf_callchain_user() below for why we do this */ + if (is_uprobe_at_func_entry(regs) && + !get_user(ret_addr, (const u32 __user *)regs->sp)) + perf_callchain_store(entry, ret_addr); + while (entry->nr < entry->max_stack) { if (!valid_user_frame(fp, sizeof(frame))) break; @@ -2862,6 +2911,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs { struct stack_frame frame; const struct stack_frame __user *fp; + unsigned long ret_addr; if (perf_guest_state()) { /* TODO: We don't support guest os callchain now */ @@ -2885,6 +2935,19 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs return; pagefault_disable(); + + /* + * If we are called from uprobe handler, and we are indeed at the very + * entry to user function (which is normally a `push %rbp` instruction, + * under assumption of application being compiled with frame pointers), + * we should read return address from *regs->sp before proceeding + * to follow frame pointers, otherwise we'll skip immediate caller + * as %rbp is not yet setup. + */ + if (is_uprobe_at_func_entry(regs) && + !get_user(ret_addr, (const unsigned long __user *)regs->sp)) + perf_callchain_store(entry, ret_addr); + while (entry->nr < entry->max_stack) { if (!valid_user_frame(fp, sizeof(frame))) break; diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index b503fafb7fb3..a270a5892ab4 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -76,6 +76,8 @@ struct uprobe_task { struct uprobe *active_uprobe; unsigned long xol_vaddr; + struct arch_uprobe *auprobe; + struct return_instance *return_instances; unsigned int depth; }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 73cc47708679..f69ecd39b1a8 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2082,6 +2082,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) bool need_prep = false; /* prepare return uprobe, when needed */ down_read(&uprobe->register_rwsem); + current->utask->auprobe = &uprobe->arch; for (uc = uprobe->consumers; uc; uc = uc->next) { int rc = 0; @@ -2096,6 +2097,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) remove &= rc; } + current->utask->auprobe = NULL; if (need_prep && !remove) prepare_uretprobe(uprobe, regs); /* put bp at return */ -- cgit v1.2.3-58-ga151 From 84455e6923c79a37812930787aaa141e82afe315 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 1 Aug 2024 15:27:09 +0200 Subject: uprobes: document the usage of mm->mmap_lock The comment above uprobe_write_opcode() is wrong, unapply_uprobe() calls it under mmap_read_lock() and this is correct. And it is completely unclear why register_for_each_vma() takes mmap_lock for writing, add a comment to explain that mmap_write_lock() is needed to avoid the following race: - A task T hits the bp installed by uprobe and calls find_active_uprobe() - uprobe_unregister() removes this uprobe/bp - T calls find_uprobe() which returns NULL - another uprobe_register() installs the bp at the same address - T calls is_trap_at_addr() which returns true - T returns to handle_swbp() and gets SIGTRAP. Reported-by: Andrii Nakryiko Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiri Olsa Acked-by: Andrii Nakryiko Acked-by: "Masami Hiramatsu (Google)" Link: https://lore.kernel.org/r/20240801132709.GA8780@redhat.com --- kernel/events/uprobes.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f69ecd39b1a8..2d1457eee965 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -453,7 +453,7 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, * @vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @vaddr. * - * Called with mm->mmap_lock held for write. + * Called with mm->mmap_lock held for read or write. * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, @@ -1046,7 +1046,13 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) if (err && is_register) goto free; - + /* + * We take mmap_lock for writing to avoid the race with + * find_active_uprobe() which takes mmap_lock for reading. + * Thus this install_breakpoint() can not make + * is_trap_at_addr() true right after find_uprobe() + * returns NULL in find_active_uprobe(). + */ mmap_write_lock(mm); vma = find_vma(mm, info->vaddr); if (!vma || !valid_vma(vma, is_register) || -- cgit v1.2.3-58-ga151 From 300b05621a3f85621130e356ca6ae90f6a4eec0e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 1 Aug 2024 15:27:14 +0200 Subject: uprobes: is_trap_at_addr: don't use get_user_pages_remote() get_user_pages_remote() and the comment above it make no sense. There is no task_struct passed into get_user_pages_remote() anymore, and nowadays mm_account_fault() increments the current->min/maj_flt counters regardless of FAULT_FLAG_REMOTE. Reported-by: Andrii Nakryiko Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiri Olsa Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240801132714.GA8783@redhat.com --- kernel/events/uprobes.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2d1457eee965..698bb22f7102 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2035,13 +2035,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) if (likely(result == 0)) goto out; - /* - * The NULL 'tsk' here ensures that any faults that occur here - * will not be accounted to the task. 'mm' *is* current->mm, - * but we treat this as a 'remote' access since it is - * essentially a kernel access to the memory. - */ - result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page, NULL); + result = get_user_pages(vaddr, 1, FOLL_FORCE, &page); if (result < 0) return result; -- cgit v1.2.3-58-ga151 From 7c2bae2d9c27a89280b63ff3567d2dac2d89db28 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 1 Aug 2024 15:27:19 +0200 Subject: uprobes: simplify error handling for alloc_uprobe() Return -ENOMEM instead of NULL, which makes caller's error handling just a touch simpler. Signed-off-by: Andrii Nakryiko Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: "Masami Hiramatsu (Google)" Reviewed-by: Jiri Olsa Link: https://lore.kernel.org/r/20240801132719.GA8788@redhat.com --- kernel/events/uprobes.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 698bb22f7102..e9b092acc71b 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -725,7 +725,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL); if (!uprobe) - return NULL; + return ERR_PTR(-ENOMEM); uprobe->inode = inode; uprobe->offset = offset; @@ -1167,8 +1167,6 @@ static int __uprobe_register(struct inode *inode, loff_t offset, retry: uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); - if (!uprobe) - return -ENOMEM; if (IS_ERR(uprobe)) return PTR_ERR(uprobe); -- cgit v1.2.3-58-ga151 From e04332ebc8ac128fa551e83f1161ab1c094d13a9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 1 Aug 2024 15:27:28 +0200 Subject: uprobes: kill uprobe_register_refctr() It doesn't make any sense to have 2 versions of _register(). Note that trace_uprobe_enable(), the only user of uprobe_register(), doesn't need to check tu->ref_ctr_offset to decide which one should be used, it could safely pass ref_ctr_offset == 0 to uprobe_register_refctr(). Add this argument to uprobe_register(), update the callers, and kill uprobe_register_refctr(). Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiri Olsa Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240801132728.GA8800@redhat.com --- include/linux/uprobes.h | 9 ++------ kernel/events/uprobes.c | 24 ++++++---------------- kernel/trace/bpf_trace.c | 8 ++++---- kernel/trace/trace_uprobe.c | 7 +------ .../selftests/bpf/bpf_testmod/bpf_testmod.c | 4 ++-- 5 files changed, 15 insertions(+), 37 deletions(-) (limited to 'kernel/events') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index a270a5892ab4..788813c0b8fc 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -112,8 +112,7 @@ extern bool is_trap_insn(uprobe_opcode_t *insn); extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); -extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); -extern int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); +extern int uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern int uprobe_mmap(struct vm_area_struct *vma); @@ -154,11 +153,7 @@ static inline void uprobes_init(void) #define uprobe_get_trap_addr(regs) instruction_pointer(regs) static inline int -uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) -{ - return -ENOSYS; -} -static inline int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) +uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) { return -ENOSYS; } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e9b092acc71b..3a80154bc4c7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1121,25 +1121,26 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume EXPORT_SYMBOL_GPL(uprobe_unregister); /* - * __uprobe_register - register a probe + * uprobe_register - register a probe * @inode: the file in which the probe has to be placed. * @offset: offset from the start of the file. + * @ref_ctr_offset: offset of SDT marker / reference counter * @uc: information on howto handle the probe.. * - * Apart from the access refcount, __uprobe_register() takes a creation + * Apart from the access refcount, uprobe_register() takes a creation * refcount (thro alloc_uprobe) if and only if this @uprobe is getting * inserted into the rbtree (i.e first consumer for a @inode:@offset * tuple). Creation refcount stops uprobe_unregister from freeing the * @uprobe even before the register operation is complete. Creation * refcount is released when the last @uc for the @uprobe - * unregisters. Caller of __uprobe_register() is required to keep @inode + * unregisters. Caller of uprobe_register() is required to keep @inode * (and the containing mount) referenced. * * Return errno if it cannot successully install probes * else return 0 (success) */ -static int __uprobe_register(struct inode *inode, loff_t offset, - loff_t ref_ctr_offset, struct uprobe_consumer *uc) +int uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, + struct uprobe_consumer *uc) { struct uprobe *uprobe; int ret; @@ -1189,21 +1190,8 @@ static int __uprobe_register(struct inode *inode, loff_t offset, goto retry; return ret; } - -int uprobe_register(struct inode *inode, loff_t offset, - struct uprobe_consumer *uc) -{ - return __uprobe_register(inode, offset, 0, uc); -} EXPORT_SYMBOL_GPL(uprobe_register); -int uprobe_register_refctr(struct inode *inode, loff_t offset, - loff_t ref_ctr_offset, struct uprobe_consumer *uc) -{ - return __uprobe_register(inode, offset, ref_ctr_offset, uc); -} -EXPORT_SYMBOL_GPL(uprobe_register_refctr); - /* * uprobe_apply - unregister an already registered probe. * @inode: the file in which the probe has to be removed. diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index cd098846e251..afa909e17824 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3480,10 +3480,10 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr &bpf_uprobe_multi_link_lops, prog); for (i = 0; i < cnt; i++) { - err = uprobe_register_refctr(d_real_inode(link->path.dentry), - uprobes[i].offset, - uprobes[i].ref_ctr_offset, - &uprobes[i].consumer); + err = uprobe_register(d_real_inode(link->path.dentry), + uprobes[i].offset, + uprobes[i].ref_ctr_offset, + &uprobes[i].consumer); if (err) { bpf_uprobe_unregister(&path, uprobes, i); goto error_free; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c98e3b3386ba..1f590f989c1e 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1089,12 +1089,7 @@ static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter) tu->consumer.filter = filter; tu->inode = d_real_inode(tu->path.dentry); - if (tu->ref_ctr_offset) - ret = uprobe_register_refctr(tu->inode, tu->offset, - tu->ref_ctr_offset, &tu->consumer); - else - ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); - + ret = uprobe_register(tu->inode, tu->offset, tu->ref_ctr_offset, &tu->consumer); if (ret) tu->inode = NULL; diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 72f565af4f82..55f6905de743 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -458,8 +458,8 @@ static int testmod_register_uprobe(loff_t offset) if (err) goto out; - err = uprobe_register_refctr(d_real_inode(uprobe.path.dentry), - offset, 0, &uprobe.consumer); + err = uprobe_register(d_real_inode(uprobe.path.dentry), + offset, 0, &uprobe.consumer); if (err) path_put(&uprobe.path); else -- cgit v1.2.3-58-ga151 From 3c83a9ad0295eb63bdeb81d821b8c3b9417fbcac Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 1 Aug 2024 15:27:34 +0200 Subject: uprobes: make uprobe_register() return struct uprobe * This way uprobe_unregister() and uprobe_apply() can use "struct uprobe *" rather than inode + offset. This simplifies the code and allows to avoid the unnecessary find_uprobe() + put_uprobe() in these functions. TODO: uprobe_unregister() still needs get_uprobe/put_uprobe to ensure that this uprobe can't be freed before up_write(&uprobe->register_rwsem). Co-developed-by: Andrii Nakryiko Signed-off-by: Andrii Nakryiko Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiri Olsa Link: https://lore.kernel.org/r/20240801132734.GA8803@redhat.com --- include/linux/uprobes.h | 15 +++--- kernel/events/uprobes.c | 56 +++++++++------------- kernel/trace/bpf_trace.c | 25 +++++----- kernel/trace/trace_uprobe.c | 26 +++++----- .../selftests/bpf/bpf_testmod/bpf_testmod.c | 25 +++++----- 5 files changed, 67 insertions(+), 80 deletions(-) (limited to 'kernel/events') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 788813c0b8fc..f50df1fa93e7 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -16,6 +16,7 @@ #include #include +struct uprobe; struct vm_area_struct; struct mm_struct; struct inode; @@ -112,9 +113,9 @@ extern bool is_trap_insn(uprobe_opcode_t *insn); extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); -extern int uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); -extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); -extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); +extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); +extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); +extern void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc); extern int uprobe_mmap(struct vm_area_struct *vma); extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void uprobe_start_dup_mmap(void); @@ -152,18 +153,18 @@ static inline void uprobes_init(void) #define uprobe_get_trap_addr(regs) instruction_pointer(regs) -static inline int +static inline struct uprobe * uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) { - return -ENOSYS; + return ERR_PTR(-ENOSYS); } static inline int -uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool add) +uprobe_apply(struct uprobe* uprobe, struct uprobe_consumer *uc, bool add) { return -ENOSYS; } static inline void -uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) +uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) { } static inline int uprobe_mmap(struct vm_area_struct *vma) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 3a80154bc4c7..b33f1397dae0 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1099,20 +1099,14 @@ __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) delete_uprobe(uprobe); } -/* +/** * uprobe_unregister - unregister an already registered probe. - * @inode: the file in which the probe has to be removed. - * @offset: offset from the start of the file. + * @uprobe: uprobe to remove * @uc: identify which probe if multiple probes are colocated. */ -void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) +void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) { - struct uprobe *uprobe; - - uprobe = find_uprobe(inode, offset); - if (WARN_ON(!uprobe)) - return; - + get_uprobe(uprobe); down_write(&uprobe->register_rwsem); __uprobe_unregister(uprobe, uc); up_write(&uprobe->register_rwsem); @@ -1120,7 +1114,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume } EXPORT_SYMBOL_GPL(uprobe_unregister); -/* +/** * uprobe_register - register a probe * @inode: the file in which the probe has to be placed. * @offset: offset from the start of the file. @@ -1136,40 +1130,40 @@ EXPORT_SYMBOL_GPL(uprobe_unregister); * unregisters. Caller of uprobe_register() is required to keep @inode * (and the containing mount) referenced. * - * Return errno if it cannot successully install probes - * else return 0 (success) + * Return: pointer to the new uprobe on success or an ERR_PTR on failure. */ -int uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, - struct uprobe_consumer *uc) +struct uprobe *uprobe_register(struct inode *inode, + loff_t offset, loff_t ref_ctr_offset, + struct uprobe_consumer *uc) { struct uprobe *uprobe; int ret; /* Uprobe must have at least one set consumer */ if (!uc->handler && !uc->ret_handler) - return -EINVAL; + return ERR_PTR(-EINVAL); /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */ if (!inode->i_mapping->a_ops->read_folio && !shmem_mapping(inode->i_mapping)) - return -EIO; + return ERR_PTR(-EIO); /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) - return -EINVAL; + return ERR_PTR(-EINVAL); /* * This ensures that copy_from_page(), copy_to_page() and * __update_ref_ctr() can't cross page boundary. */ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) - return -EINVAL; + return ERR_PTR(-EINVAL); if (!IS_ALIGNED(ref_ctr_offset, sizeof(short))) - return -EINVAL; + return ERR_PTR(-EINVAL); retry: uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); if (IS_ERR(uprobe)) - return PTR_ERR(uprobe); + return uprobe; /* * We can race with uprobe_unregister()->delete_uprobe(). @@ -1188,35 +1182,29 @@ int uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, if (unlikely(ret == -EAGAIN)) goto retry; - return ret; + + return ret ? ERR_PTR(ret) : uprobe; } EXPORT_SYMBOL_GPL(uprobe_register); -/* - * uprobe_apply - unregister an already registered probe. - * @inode: the file in which the probe has to be removed. - * @offset: offset from the start of the file. +/** + * uprobe_apply - add or remove the breakpoints according to @uc->filter + * @uprobe: uprobe which "owns" the breakpoint * @uc: consumer which wants to add more or remove some breakpoints * @add: add or remove the breakpoints + * Return: 0 on success or negative error code. */ -int uprobe_apply(struct inode *inode, loff_t offset, - struct uprobe_consumer *uc, bool add) +int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add) { - struct uprobe *uprobe; struct uprobe_consumer *con; int ret = -ENOENT; - uprobe = find_uprobe(inode, offset); - if (WARN_ON(!uprobe)) - return ret; - down_write(&uprobe->register_rwsem); for (con = uprobe->consumers; con && con != uc ; con = con->next) ; if (con) ret = register_for_each_vma(uprobe, add ? uc : NULL); up_write(&uprobe->register_rwsem); - put_uprobe(uprobe); return ret; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index afa909e17824..4e391daafa64 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3160,6 +3160,7 @@ struct bpf_uprobe { loff_t offset; unsigned long ref_ctr_offset; u64 cookie; + struct uprobe *uprobe; struct uprobe_consumer consumer; }; @@ -3178,15 +3179,12 @@ struct bpf_uprobe_multi_run_ctx { struct bpf_uprobe *uprobe; }; -static void bpf_uprobe_unregister(struct path *path, struct bpf_uprobe *uprobes, - u32 cnt) +static void bpf_uprobe_unregister(struct bpf_uprobe *uprobes, u32 cnt) { u32 i; - for (i = 0; i < cnt; i++) { - uprobe_unregister(d_real_inode(path->dentry), uprobes[i].offset, - &uprobes[i].consumer); - } + for (i = 0; i < cnt; i++) + uprobe_unregister(uprobes[i].uprobe, &uprobes[i].consumer); } static void bpf_uprobe_multi_link_release(struct bpf_link *link) @@ -3194,7 +3192,7 @@ static void bpf_uprobe_multi_link_release(struct bpf_link *link) struct bpf_uprobe_multi_link *umulti_link; umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); - bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt); + bpf_uprobe_unregister(umulti_link->uprobes, umulti_link->cnt); if (umulti_link->task) put_task_struct(umulti_link->task); path_put(&umulti_link->path); @@ -3480,12 +3478,13 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr &bpf_uprobe_multi_link_lops, prog); for (i = 0; i < cnt; i++) { - err = uprobe_register(d_real_inode(link->path.dentry), - uprobes[i].offset, - uprobes[i].ref_ctr_offset, - &uprobes[i].consumer); - if (err) { - bpf_uprobe_unregister(&path, uprobes, i); + uprobes[i].uprobe = uprobe_register(d_real_inode(link->path.dentry), + uprobes[i].offset, + uprobes[i].ref_ctr_offset, + &uprobes[i].consumer); + if (IS_ERR(uprobes[i].uprobe)) { + err = PTR_ERR(uprobes[i].uprobe); + bpf_uprobe_unregister(uprobes, i); goto error_free; } } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 1f590f989c1e..52e76a73fa7c 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -58,8 +58,8 @@ struct trace_uprobe { struct dyn_event devent; struct uprobe_consumer consumer; struct path path; - struct inode *inode; char *filename; + struct uprobe *uprobe; unsigned long offset; unsigned long ref_ctr_offset; unsigned long nhit; @@ -1084,16 +1084,16 @@ typedef bool (*filter_func_t)(struct uprobe_consumer *self, static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter) { - int ret; + struct inode *inode = d_real_inode(tu->path.dentry); + struct uprobe *uprobe; tu->consumer.filter = filter; - tu->inode = d_real_inode(tu->path.dentry); - - ret = uprobe_register(tu->inode, tu->offset, tu->ref_ctr_offset, &tu->consumer); - if (ret) - tu->inode = NULL; + uprobe = uprobe_register(inode, tu->offset, tu->ref_ctr_offset, &tu->consumer); + if (IS_ERR(uprobe)) + return PTR_ERR(uprobe); - return ret; + tu->uprobe = uprobe; + return 0; } static void __probe_event_disable(struct trace_probe *tp) @@ -1104,11 +1104,11 @@ static void __probe_event_disable(struct trace_probe *tp) WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter)); list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { - if (!tu->inode) + if (!tu->uprobe) continue; - uprobe_unregister(tu->inode, tu->offset, &tu->consumer); - tu->inode = NULL; + uprobe_unregister(tu->uprobe, &tu->consumer); + tu->uprobe = NULL; } } @@ -1305,7 +1305,7 @@ static int uprobe_perf_close(struct trace_event_call *call, return 0; list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { - ret = uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + ret = uprobe_apply(tu->uprobe, &tu->consumer, false); if (ret) break; } @@ -1329,7 +1329,7 @@ static int uprobe_perf_open(struct trace_event_call *call, return 0; list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { - err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + err = uprobe_apply(tu->uprobe, &tu->consumer, true); if (err) { uprobe_perf_close(call, event); break; diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 55f6905de743..3c0515a27842 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -432,7 +432,7 @@ uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func, struct testmod_uprobe { struct path path; - loff_t offset; + struct uprobe *uprobe; struct uprobe_consumer consumer; }; @@ -446,25 +446,25 @@ static int testmod_register_uprobe(loff_t offset) { int err = -EBUSY; - if (uprobe.offset) + if (uprobe.uprobe) return -EBUSY; mutex_lock(&testmod_uprobe_mutex); - if (uprobe.offset) + if (uprobe.uprobe) goto out; err = kern_path("/proc/self/exe", LOOKUP_FOLLOW, &uprobe.path); if (err) goto out; - err = uprobe_register(d_real_inode(uprobe.path.dentry), - offset, 0, &uprobe.consumer); - if (err) + uprobe.uprobe = uprobe_register(d_real_inode(uprobe.path.dentry), + offset, 0, &uprobe.consumer); + if (IS_ERR(uprobe.uprobe)) { + err = PTR_ERR(uprobe.uprobe); path_put(&uprobe.path); - else - uprobe.offset = offset; - + uprobe.uprobe = NULL; + } out: mutex_unlock(&testmod_uprobe_mutex); return err; @@ -474,11 +474,10 @@ static void testmod_unregister_uprobe(void) { mutex_lock(&testmod_uprobe_mutex); - if (uprobe.offset) { - uprobe_unregister(d_real_inode(uprobe.path.dentry), - uprobe.offset, &uprobe.consumer); + if (uprobe.uprobe) { + uprobe_unregister(uprobe.uprobe, &uprobe.consumer); path_put(&uprobe.path); - uprobe.offset = 0; + uprobe.uprobe = NULL; } mutex_unlock(&testmod_uprobe_mutex); -- cgit v1.2.3-58-ga151 From bb18c5de1c288050ef8bd4af4ca16896ad4cd3fc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 1 Aug 2024 15:27:39 +0200 Subject: uprobes: change uprobe_register() to use uprobe_unregister() instead of __uprobe_unregister() If register_for_each_vma() fails uprobe_register() can safely drop uprobe->register_rwsem and use uprobe_unregister(). There is no worry about the races with another register/unregister, consumer_add() was already called so this case doesn't differ from _unregister() right after the successful _register(). Yes this means the extra up_write() + down_write(), but this is the slow and unlikely case anyway. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiri Olsa Acked-by: Andrii Nakryiko Acked-by: "Masami Hiramatsu (Google)" Link: https://lore.kernel.org/r/20240801132739.GA8809@redhat.com --- kernel/events/uprobes.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index b33f1397dae0..eacf287ecd89 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1174,16 +1174,18 @@ struct uprobe *uprobe_register(struct inode *inode, if (likely(uprobe_is_active(uprobe))) { consumer_add(uprobe, uc); ret = register_for_each_vma(uprobe, uc); - if (ret) - __uprobe_unregister(uprobe, uc); } up_write(&uprobe->register_rwsem); put_uprobe(uprobe); - if (unlikely(ret == -EAGAIN)) - goto retry; + if (ret) { + if (unlikely(ret == -EAGAIN)) + goto retry; + uprobe_unregister(uprobe, uc); + return ERR_PTR(ret); + } - return ret ? ERR_PTR(ret) : uprobe; + return uprobe; } EXPORT_SYMBOL_GPL(uprobe_register); -- cgit v1.2.3-58-ga151 From 70408bebba94a63ea11471ed00168cd8606a9328 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 1 Aug 2024 15:27:44 +0200 Subject: uprobes: fold __uprobe_unregister() into uprobe_unregister() Fold __uprobe_unregister() into its single caller, uprobe_unregister(). A separate patch to simplify the next change. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiri Olsa Acked-by: Andrii Nakryiko Acked-by: "Masami Hiramatsu (Google)" Link: https://lore.kernel.org/r/20240801132744.GA8814@redhat.com --- kernel/events/uprobes.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index eacf287ecd89..175058b209a9 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1085,20 +1085,6 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) return err; } -static void -__uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) -{ - int err; - - if (WARN_ON(!consumer_del(uprobe, uc))) - return; - - err = register_for_each_vma(uprobe, NULL); - /* TODO : cant unregister? schedule a worker thread */ - if (!uprobe->consumers && !err) - delete_uprobe(uprobe); -} - /** * uprobe_unregister - unregister an already registered probe. * @uprobe: uprobe to remove @@ -1106,9 +1092,18 @@ __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) */ void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) { + int err; + get_uprobe(uprobe); down_write(&uprobe->register_rwsem); - __uprobe_unregister(uprobe, uc); + if (WARN_ON(!consumer_del(uprobe, uc))) + err = -ENOENT; + else + err = register_for_each_vma(uprobe, NULL); + + /* TODO : cant unregister? schedule a worker thread */ + if (!err && !uprobe->consumers) + delete_uprobe(uprobe); up_write(&uprobe->register_rwsem); put_uprobe(uprobe); } -- cgit v1.2.3-58-ga151 From 12026d2034dfeb575e0eb28f33431cbf03dc732c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 1 Aug 2024 15:27:49 +0200 Subject: uprobes: shift put_uprobe() from delete_uprobe() to uprobe_unregister() Kill the extra get_uprobe() + put_uprobe() in uprobe_unregister() and move the possibly final put_uprobe() from delete_uprobe() to its only caller, uprobe_unregister(). Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiri Olsa Acked-by: Andrii Nakryiko Acked-by: "Masami Hiramatsu (Google)" Link: https://lore.kernel.org/r/20240801132749.GA8817@redhat.com --- kernel/events/uprobes.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 175058b209a9..30348f13d4a7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -939,7 +939,6 @@ static void delete_uprobe(struct uprobe *uprobe) rb_erase(&uprobe->rb_node, &uprobes_tree); write_unlock(&uprobes_treelock); RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */ - put_uprobe(uprobe); } struct map_info { @@ -1094,7 +1093,6 @@ void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) { int err; - get_uprobe(uprobe); down_write(&uprobe->register_rwsem); if (WARN_ON(!consumer_del(uprobe, uc))) err = -ENOENT; @@ -1102,10 +1100,16 @@ void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) err = register_for_each_vma(uprobe, NULL); /* TODO : cant unregister? schedule a worker thread */ - if (!err && !uprobe->consumers) - delete_uprobe(uprobe); + if (!err) { + if (!uprobe->consumers) + delete_uprobe(uprobe); + else + err = -EBUSY; + } up_write(&uprobe->register_rwsem); - put_uprobe(uprobe); + + if (!err) + put_uprobe(uprobe); } EXPORT_SYMBOL_GPL(uprobe_unregister); -- cgit v1.2.3-58-ga151 From 2d17cf1abcbe8a45b7dc41a768ed22aac158ddd8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 7 Aug 2024 13:29:25 +0200 Subject: perf: Optimize context reschedule for single PMU cases Currently re-scheduling a context will reschedule all active PMUs for that context, even if it is known only a single event is added. Namhyung reported that changing this to only reschedule the affected PMU when possible provides significant performance gains under certain conditions. Therefore, allow partial context reschedules for a specific PMU, that of the event modified. While the patch looks somewhat noisy, it mostly just propagates a new @pmu argument through the callchain and modifies the epc loop to only pick the 'epc->pmu == @pmu' case. Reported-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Reviewed-by: Namhyung Kim Link: https://lore.kernel.org/r/20240807115549.920950699@infradead.org --- kernel/events/core.c | 164 +++++++++++++++++++++++++++------------------------ 1 file changed, 88 insertions(+), 76 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index c01a32687dad..dad2b9ac42c0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -685,30 +685,32 @@ do { \ ___p; \ }) +#define for_each_epc(_epc, _ctx, _pmu, _cgroup) \ + list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \ + if (_cgroup && !_epc->nr_cgroups) \ + continue; \ + else if (_pmu && _epc->pmu != _pmu) \ + continue; \ + else + static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup) { struct perf_event_pmu_context *pmu_ctx; - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - if (cgroup && !pmu_ctx->nr_cgroups) - continue; + for_each_epc(pmu_ctx, ctx, NULL, cgroup) perf_pmu_disable(pmu_ctx->pmu); - } } static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup) { struct perf_event_pmu_context *pmu_ctx; - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - if (cgroup && !pmu_ctx->nr_cgroups) - continue; + for_each_epc(pmu_ctx, ctx, NULL, cgroup) perf_pmu_enable(pmu_ctx->pmu); - } } -static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type); -static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type); +static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); +static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); #ifdef CONFIG_CGROUP_PERF @@ -865,7 +867,7 @@ static void perf_cgroup_switch(struct task_struct *task) perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_ctx_disable(&cpuctx->ctx, true); - ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP); + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); /* * must not be done before ctxswout due * to update_cgrp_time_from_cpuctx() in @@ -877,7 +879,7 @@ static void perf_cgroup_switch(struct task_struct *task) * perf_cgroup_set_timestamp() in ctx_sched_in() * to not have to pass task around */ - ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP); + ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); perf_ctx_enable(&cpuctx->ctx, true); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); @@ -2656,7 +2658,8 @@ static void add_event_to_ctx(struct perf_event *event, } static void task_ctx_sched_out(struct perf_event_context *ctx, - enum event_type_t event_type) + struct pmu *pmu, + enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); @@ -2666,18 +2669,19 @@ static void task_ctx_sched_out(struct perf_event_context *ctx, if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; - ctx_sched_out(ctx, event_type); + ctx_sched_out(ctx, pmu, event_type); } static void perf_event_sched_in(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) + struct perf_event_context *ctx, + struct pmu *pmu) { - ctx_sched_in(&cpuctx->ctx, EVENT_PINNED); + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED); if (ctx) - ctx_sched_in(ctx, EVENT_PINNED); - ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE); + ctx_sched_in(ctx, pmu, EVENT_PINNED); + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); if (ctx) - ctx_sched_in(ctx, EVENT_FLEXIBLE); + ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE); } /* @@ -2695,16 +2699,12 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx, * event_type is a bit mask of the types of events involved. For CPU events, * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE. */ -/* - * XXX: ctx_resched() reschedule entire perf_event_context while adding new - * event to the context or enabling existing event in the context. We can - * probably optimize it by rescheduling only affected pmu_ctx. - */ static void ctx_resched(struct perf_cpu_context *cpuctx, struct perf_event_context *task_ctx, - enum event_type_t event_type) + struct pmu *pmu, enum event_type_t event_type) { bool cpu_event = !!(event_type & EVENT_CPU); + struct perf_event_pmu_context *epc; /* * If pinned groups are involved, flexible groups also need to be @@ -2715,10 +2715,14 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, event_type &= EVENT_ALL; - perf_ctx_disable(&cpuctx->ctx, false); + for_each_epc(epc, &cpuctx->ctx, pmu, false) + perf_pmu_disable(epc->pmu); + if (task_ctx) { - perf_ctx_disable(task_ctx, false); - task_ctx_sched_out(task_ctx, event_type); + for_each_epc(epc, task_ctx, pmu, false) + perf_pmu_disable(epc->pmu); + + task_ctx_sched_out(task_ctx, pmu, event_type); } /* @@ -2729,15 +2733,19 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, * - otherwise, do nothing more. */ if (cpu_event) - ctx_sched_out(&cpuctx->ctx, event_type); + ctx_sched_out(&cpuctx->ctx, pmu, event_type); else if (event_type & EVENT_PINNED) - ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE); + ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); + + perf_event_sched_in(cpuctx, task_ctx, pmu); - perf_event_sched_in(cpuctx, task_ctx); + for_each_epc(epc, &cpuctx->ctx, pmu, false) + perf_pmu_enable(epc->pmu); - perf_ctx_enable(&cpuctx->ctx, false); - if (task_ctx) - perf_ctx_enable(task_ctx, false); + if (task_ctx) { + for_each_epc(epc, task_ctx, pmu, false) + perf_pmu_enable(epc->pmu); + } } void perf_pmu_resched(struct pmu *pmu) @@ -2746,7 +2754,7 @@ void perf_pmu_resched(struct pmu *pmu) struct perf_event_context *task_ctx = cpuctx->task_ctx; perf_ctx_lock(cpuctx, task_ctx); - ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU); + ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU); perf_ctx_unlock(cpuctx, task_ctx); } @@ -2802,9 +2810,10 @@ static int __perf_install_in_context(void *info) #endif if (reprogram) { - ctx_sched_out(ctx, EVENT_TIME); + ctx_sched_out(ctx, NULL, EVENT_TIME); add_event_to_ctx(event, ctx); - ctx_resched(cpuctx, task_ctx, get_event_type(event)); + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, + get_event_type(event)); } else { add_event_to_ctx(event, ctx); } @@ -2948,7 +2957,7 @@ static void __perf_event_enable(struct perf_event *event, return; if (ctx->is_active) - ctx_sched_out(ctx, EVENT_TIME); + ctx_sched_out(ctx, NULL, EVENT_TIME); perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); perf_cgroup_event_enable(event, ctx); @@ -2957,7 +2966,7 @@ static void __perf_event_enable(struct perf_event *event, return; if (!event_filter_match(event)) { - ctx_sched_in(ctx, EVENT_TIME); + ctx_sched_in(ctx, NULL, EVENT_TIME); return; } @@ -2966,7 +2975,7 @@ static void __perf_event_enable(struct perf_event *event, * then don't put it on unless the group is on. */ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { - ctx_sched_in(ctx, EVENT_TIME); + ctx_sched_in(ctx, NULL, EVENT_TIME); return; } @@ -2974,7 +2983,7 @@ static void __perf_event_enable(struct perf_event *event, if (ctx->task) WARN_ON_ONCE(task_ctx != ctx); - ctx_resched(cpuctx, task_ctx, get_event_type(event)); + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event)); } /* @@ -3276,8 +3285,17 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx, perf_pmu_enable(pmu); } +/* + * Be very careful with the @pmu argument since this will change ctx state. + * The @pmu argument works for ctx_resched(), because that is symmetric in + * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant. + * + * However, if you were to be asymmetrical, you could end up with messed up + * state, eg. ctx->is_active cleared even though most EPCs would still actually + * be active. + */ static void -ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) +ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_pmu_context *pmu_ctx; @@ -3331,11 +3349,8 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) is_active ^= ctx->is_active; /* changed bits */ - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - if (cgroup && !pmu_ctx->nr_cgroups) - continue; + for_each_epc(pmu_ctx, ctx, pmu, cgroup) __pmu_ctx_sched_out(pmu_ctx, is_active); - } } /* @@ -3579,7 +3594,7 @@ unlock: inside_switch: perf_ctx_sched_task_cb(ctx, false); - task_ctx_sched_out(ctx, EVENT_ALL); + task_ctx_sched_out(ctx, NULL, EVENT_ALL); perf_ctx_enable(ctx, false); raw_spin_unlock(&ctx->lock); @@ -3877,29 +3892,22 @@ static void pmu_groups_sched_in(struct perf_event_context *ctx, merge_sched_in, &can_add_hw); } -static void ctx_groups_sched_in(struct perf_event_context *ctx, - struct perf_event_groups *groups, - bool cgroup) +static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx, + enum event_type_t event_type) { - struct perf_event_pmu_context *pmu_ctx; - - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - if (cgroup && !pmu_ctx->nr_cgroups) - continue; - pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu); - } -} + struct perf_event_context *ctx = pmu_ctx->ctx; -static void __pmu_ctx_sched_in(struct perf_event_context *ctx, - struct pmu *pmu) -{ - pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu); + if (event_type & EVENT_PINNED) + pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu); + if (event_type & EVENT_FLEXIBLE) + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu); } static void -ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) +ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; bool cgroup = event_type & EVENT_CGROUP; @@ -3935,12 +3943,16 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ - if (is_active & EVENT_PINNED) - ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup); + if (is_active & EVENT_PINNED) { + for_each_epc(pmu_ctx, ctx, pmu, cgroup) + __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED); + } /* Then walk through the lower prio flexible groups */ - if (is_active & EVENT_FLEXIBLE) - ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup); + if (is_active & EVENT_FLEXIBLE) { + for_each_epc(pmu_ctx, ctx, pmu, cgroup) + __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE); + } } static void perf_event_context_sched_in(struct task_struct *task) @@ -3983,10 +3995,10 @@ static void perf_event_context_sched_in(struct task_struct *task) */ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { perf_ctx_disable(&cpuctx->ctx, false); - ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE); + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE); } - perf_event_sched_in(cpuctx, ctx); + perf_event_sched_in(cpuctx, ctx, NULL); perf_ctx_sched_task_cb(cpuctx->task_ctx, true); @@ -4327,14 +4339,14 @@ static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc) update_context_time(&cpuctx->ctx); __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE); rotate_ctx(&cpuctx->ctx, cpu_event); - __pmu_ctx_sched_in(&cpuctx->ctx, pmu); + __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE); } if (task_event) rotate_ctx(task_epc->ctx, task_event); if (task_event || (task_epc && cpu_event)) - __pmu_ctx_sched_in(task_epc->ctx, pmu); + __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE); perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); @@ -4400,7 +4412,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) cpuctx = this_cpu_ptr(&perf_cpu_context); perf_ctx_lock(cpuctx, ctx); - ctx_sched_out(ctx, EVENT_TIME); + ctx_sched_out(ctx, NULL, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) { enabled |= event_enable_on_exec(event, ctx); @@ -4412,9 +4424,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) */ if (enabled) { clone_ctx = unclone_ctx(ctx); - ctx_resched(cpuctx, ctx, event_type); + ctx_resched(cpuctx, ctx, NULL, event_type); } else { - ctx_sched_in(ctx, EVENT_TIME); + ctx_sched_in(ctx, NULL, EVENT_TIME); } perf_ctx_unlock(cpuctx, ctx); @@ -13202,7 +13214,7 @@ static void perf_event_exit_task_context(struct task_struct *child) * in. */ raw_spin_lock_irq(&child_ctx->lock); - task_ctx_sched_out(child_ctx, EVENT_ALL); + task_ctx_sched_out(child_ctx, NULL, EVENT_ALL); /* * Now that the context is inactive, destroy the task <-> ctx relation @@ -13751,7 +13763,7 @@ static void __perf_event_exit_context(void *__info) struct perf_event *event; raw_spin_lock(&ctx->lock); - ctx_sched_out(ctx, EVENT_TIME); + ctx_sched_out(ctx, NULL, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); raw_spin_unlock(&ctx->lock); -- cgit v1.2.3-58-ga151 From 9a32bd9901fe5b1dcf544389dbf04f3b0a2fbab4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 7 Aug 2024 13:29:26 +0200 Subject: perf: Extract a few helpers The context time update code is repeated verbatim a few times. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Reviewed-by: Namhyung Kim Link: https://lore.kernel.org/r/20240807115550.031212518@infradead.org --- kernel/events/core.c | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index dad2b9ac42c0..eb03c9ab1670 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2330,6 +2330,24 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) event_sched_out(event, ctx); } +static inline void +ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) +{ + if (ctx->is_active & EVENT_TIME) { + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx, false); + } +} + +static inline void +ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event) +{ + if (ctx->is_active & EVENT_TIME) { + update_context_time(ctx); + update_cgrp_time_from_event(event); + } +} + #define DETACH_GROUP 0x01UL #define DETACH_CHILD 0x02UL #define DETACH_DEAD 0x04UL @@ -2349,10 +2367,7 @@ __perf_remove_from_context(struct perf_event *event, struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx; unsigned long flags = (unsigned long)info; - if (ctx->is_active & EVENT_TIME) { - update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx, false); - } + ctx_time_update(cpuctx, ctx); /* * Ensure event_sched_out() switches to OFF, at the very least @@ -2437,12 +2452,8 @@ static void __perf_event_disable(struct perf_event *event, if (event->state < PERF_EVENT_STATE_INACTIVE) return; - if (ctx->is_active & EVENT_TIME) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - } - perf_pmu_disable(event->pmu_ctx->pmu); + ctx_time_update_event(ctx, event); if (event == event->group_leader) group_sched_out(event, ctx); @@ -4529,10 +4540,7 @@ static void __perf_event_read(void *info) return; raw_spin_lock(&ctx->lock); - if (ctx->is_active & EVENT_TIME) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - } + ctx_time_update_event(ctx, event); perf_event_update_time(event); if (data->group) @@ -4732,10 +4740,7 @@ again: * May read while context is not active (e.g., thread is * blocked), in that case we cannot update context time */ - if (ctx->is_active & EVENT_TIME) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - } + ctx_time_update_event(ctx, event); perf_event_update_time(event); if (group) -- cgit v1.2.3-58-ga151 From 558abc7e3f895049faa46b08656be4c60dc6e9fd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 7 Aug 2024 13:29:27 +0200 Subject: perf: Fix event_function_call() locking All the event_function/@func call context already uses perf_ctx_lock() except for the !ctx->is_active case. Make it all consistent. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Reviewed-by: Namhyung Kim Link: https://lore.kernel.org/r/20240807115550.138301094@infradead.org --- kernel/events/core.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index eb03c9ab1670..ab49deae6e6f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -263,6 +263,7 @@ unlock: static void event_function_call(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */ struct event_function_struct efs = { .event = event, @@ -291,22 +292,22 @@ again: if (!task_function_call(task, event_function, &efs)) return; - raw_spin_lock_irq(&ctx->lock); + perf_ctx_lock(cpuctx, ctx); /* * Reload the task pointer, it might have been changed by * a concurrent perf_event_context_sched_out(). */ task = ctx->task; if (task == TASK_TOMBSTONE) { - raw_spin_unlock_irq(&ctx->lock); + perf_ctx_unlock(cpuctx, ctx); return; } if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); + perf_ctx_unlock(cpuctx, ctx); goto again; } func(event, NULL, ctx, data); - raw_spin_unlock_irq(&ctx->lock); + perf_ctx_unlock(cpuctx, ctx); } /* -- cgit v1.2.3-58-ga151 From 5d95a2af973d47260b1e1828953fc860c0094052 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 7 Aug 2024 13:29:28 +0200 Subject: perf: Add context time freeze Many of the the context reschedule users are of the form: ctx_sched_out(.type = EVENT_TIME); ... modify context ctx_resched(); With the idea that the whole reschedule happens with a single time-stamp, rather than with each ctx_sched_out() advancing time and ctx_sched_in() re-starting time, creating a non-atomic experience. However, Kan noticed that since this completely stops time, it actually looses a bit of time between the stop and start. Worse, now that we can do partial (per PMU) reschedules, the PMUs that are not scheduled out still observe the time glitch. Replace this with: ctx_time_freeze(); ... modify context ctx_resched(); With the assumption that this happens in a perf_ctx_lock() / perf_ctx_unlock() pair. The new ctx_time_freeze() will update time and sets EVENT_FROZEN, and ensures EVENT_TIME and EVENT_FROZEN remain set, this avoids perf_event_time_now() from observing a time wobble from not seeing EVENT_TIME for a little while. Additionally, this avoids loosing time between ctx_sched_out(EVENT_TIME) and ctx_sched_in(), which would re-set the timestamp. Reported-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Reviewed-by: Namhyung Kim Link: https://lore.kernel.org/r/20240807115550.250637571@infradead.org --- kernel/events/core.c | 128 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 86 insertions(+), 42 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index ab49deae6e6f..197d3be443bb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -155,20 +155,55 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info) return data.ret; } +enum event_type_t { + EVENT_FLEXIBLE = 0x01, + EVENT_PINNED = 0x02, + EVENT_TIME = 0x04, + EVENT_FROZEN = 0x08, + /* see ctx_resched() for details */ + EVENT_CPU = 0x10, + EVENT_CGROUP = 0x20, + + /* compound helpers */ + EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, + EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN, +}; + +static inline void __perf_ctx_lock(struct perf_event_context *ctx) +{ + raw_spin_lock(&ctx->lock); + WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN); +} + static void perf_ctx_lock(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - raw_spin_lock(&cpuctx->ctx.lock); + __perf_ctx_lock(&cpuctx->ctx); if (ctx) - raw_spin_lock(&ctx->lock); + __perf_ctx_lock(ctx); +} + +static inline void __perf_ctx_unlock(struct perf_event_context *ctx) +{ + /* + * If ctx_sched_in() didn't again set any ALL flags, clean up + * after ctx_sched_out() by clearing is_active. + */ + if (ctx->is_active & EVENT_FROZEN) { + if (!(ctx->is_active & EVENT_ALL)) + ctx->is_active = 0; + else + ctx->is_active &= ~EVENT_FROZEN; + } + raw_spin_unlock(&ctx->lock); } static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { if (ctx) - raw_spin_unlock(&ctx->lock); - raw_spin_unlock(&cpuctx->ctx.lock); + __perf_ctx_unlock(ctx); + __perf_ctx_unlock(&cpuctx->ctx); } #define TASK_TOMBSTONE ((void *)-1L) @@ -370,16 +405,6 @@ unlock: (PERF_SAMPLE_BRANCH_KERNEL |\ PERF_SAMPLE_BRANCH_HV) -enum event_type_t { - EVENT_FLEXIBLE = 0x1, - EVENT_PINNED = 0x2, - EVENT_TIME = 0x4, - /* see ctx_resched() for details */ - EVENT_CPU = 0x8, - EVENT_CGROUP = 0x10, - EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, -}; - /* * perf_sched_events : >0 events exist */ @@ -2332,18 +2357,39 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) } static inline void -ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) +__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final) { if (ctx->is_active & EVENT_TIME) { + if (ctx->is_active & EVENT_FROZEN) + return; update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx, false); + update_cgrp_time_from_cpuctx(cpuctx, final); } } +static inline void +ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) +{ + __ctx_time_update(cpuctx, ctx, false); +} + +/* + * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock(). + */ +static inline void +ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) +{ + ctx_time_update(cpuctx, ctx); + if (ctx->is_active & EVENT_TIME) + ctx->is_active |= EVENT_FROZEN; +} + static inline void ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event) { if (ctx->is_active & EVENT_TIME) { + if (ctx->is_active & EVENT_FROZEN) + return; update_context_time(ctx); update_cgrp_time_from_event(event); } @@ -2822,7 +2868,7 @@ static int __perf_install_in_context(void *info) #endif if (reprogram) { - ctx_sched_out(ctx, NULL, EVENT_TIME); + ctx_time_freeze(cpuctx, ctx); add_event_to_ctx(event, ctx); ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event)); @@ -2968,8 +3014,7 @@ static void __perf_event_enable(struct perf_event *event, event->state <= PERF_EVENT_STATE_ERROR) return; - if (ctx->is_active) - ctx_sched_out(ctx, NULL, EVENT_TIME); + ctx_time_freeze(cpuctx, ctx); perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); perf_cgroup_event_enable(event, ctx); @@ -2977,19 +3022,15 @@ static void __perf_event_enable(struct perf_event *event, if (!ctx->is_active) return; - if (!event_filter_match(event)) { - ctx_sched_in(ctx, NULL, EVENT_TIME); + if (!event_filter_match(event)) return; - } /* * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. */ - if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { - ctx_sched_in(ctx, NULL, EVENT_TIME); + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) return; - } task_ctx = cpuctx->task_ctx; if (ctx->task) @@ -3263,7 +3304,7 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx, struct perf_event *event, *tmp; struct pmu *pmu = pmu_ctx->pmu; - if (ctx->task && !ctx->is_active) { + if (ctx->task && !(ctx->is_active & EVENT_ALL)) { struct perf_cpu_pmu_context *cpc; cpc = this_cpu_ptr(pmu->cpu_pmu_context); @@ -3338,24 +3379,29 @@ ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t * * would only update time for the pinned events. */ - if (is_active & EVENT_TIME) { - /* update (and stop) ctx time */ - update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx); + __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx); + + /* + * CPU-release for the below ->is_active store, + * see __load_acquire() in perf_event_time_now() + */ + barrier(); + ctx->is_active &= ~event_type; + + if (!(ctx->is_active & EVENT_ALL)) { /* - * CPU-release for the below ->is_active store, - * see __load_acquire() in perf_event_time_now() + * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now() + * does not observe a hole. perf_ctx_unlock() will clean up. */ - barrier(); + if (ctx->is_active & EVENT_FROZEN) + ctx->is_active &= EVENT_TIME_FROZEN; + else + ctx->is_active = 0; } - ctx->is_active &= ~event_type; - if (!(ctx->is_active & EVENT_ALL)) - ctx->is_active = 0; - if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); - if (!ctx->is_active) + if (!(ctx->is_active & EVENT_ALL)) cpuctx->task_ctx = NULL; } @@ -3943,7 +3989,7 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t ctx->is_active |= (event_type | EVENT_TIME); if (ctx->task) { - if (!is_active) + if (!(is_active & EVENT_ALL)) cpuctx->task_ctx = ctx; else WARN_ON_ONCE(cpuctx->task_ctx != ctx); @@ -4424,7 +4470,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) cpuctx = this_cpu_ptr(&perf_cpu_context); perf_ctx_lock(cpuctx, ctx); - ctx_sched_out(ctx, NULL, EVENT_TIME); + ctx_time_freeze(cpuctx, ctx); list_for_each_entry(event, &ctx->event_list, event_entry) { enabled |= event_enable_on_exec(event, ctx); @@ -4437,8 +4483,6 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) if (enabled) { clone_ctx = unclone_ctx(ctx); ctx_resched(cpuctx, ctx, NULL, event_type); - } else { - ctx_sched_in(ctx, NULL, EVENT_TIME); } perf_ctx_unlock(cpuctx, ctx); -- cgit v1.2.3-58-ga151 From 3e15a3fe3a2a170c5be52783667706875c088f96 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 7 Aug 2024 13:29:29 +0200 Subject: perf: Optimize __pmu_ctx_sched_out() There is is no point in doing the perf_pmu_disable() dance just to do nothing. This happens for ctx_sched_out(.type = EVENT_TIME) for instance. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Reviewed-by: Namhyung Kim Link: https://lore.kernel.org/r/20240807115550.392851915@infradead.org --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 197d3be443bb..9893ba5e98aa 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3312,7 +3312,7 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx, cpc->task_epc = NULL; } - if (!event_type) + if (!(event_type & EVENT_ALL)) return; perf_pmu_disable(pmu); -- cgit v1.2.3-58-ga151 From fe826cc2654e8561b64246325e6a51b62bf2488c Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 13 Aug 2024 22:55:11 +0200 Subject: perf: Really fix event_function_call() locking Commit 558abc7e3f89 ("perf: Fix event_function_call() locking") lost IRQ disabling by mistake. Fixes: 558abc7e3f89 ("perf: Fix event_function_call() locking") Reported-by: Pengfei Xu Reported-by: Naresh Kamboju Tested-by: Pengfei Xu Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) --- kernel/events/core.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9893ba5e98aa..c6a720f41225 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -298,8 +298,8 @@ unlock: static void event_function_call(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */ + struct perf_cpu_context *cpuctx; struct event_function_struct efs = { .event = event, .func = func, @@ -327,22 +327,25 @@ again: if (!task_function_call(task, event_function, &efs)) return; + local_irq_disable(); + cpuctx = this_cpu_ptr(&perf_cpu_context); perf_ctx_lock(cpuctx, ctx); /* * Reload the task pointer, it might have been changed by * a concurrent perf_event_context_sched_out(). */ task = ctx->task; - if (task == TASK_TOMBSTONE) { - perf_ctx_unlock(cpuctx, ctx); - return; - } + if (task == TASK_TOMBSTONE) + goto unlock; if (ctx->is_active) { perf_ctx_unlock(cpuctx, ctx); + local_irq_enable(); goto again; } func(event, NULL, ctx, data); +unlock: perf_ctx_unlock(cpuctx, ctx); + local_irq_enable(); } /* -- cgit v1.2.3-58-ga151 From 62c0b1061593d7012292f781f11145b2d46f43ab Mon Sep 17 00:00:00 2001 From: Luo Gengkun Date: Sat, 31 Aug 2024 07:43:15 +0000 Subject: perf/core: Fix small negative period being ignored In perf_adjust_period, we will first calculate period, and then use this period to calculate delta. However, when delta is less than 0, there will be a deviation compared to when delta is greater than or equal to 0. For example, when delta is in the range of [-14,-1], the range of delta = delta + 7 is between [-7,6], so the final value of delta/8 is 0. Therefore, the impact of -1 and -2 will be ignored. This is unacceptable when the target period is very short, because we will lose a lot of samples. Here are some tests and analyzes: before: # perf record -e cs -F 1000 ./a.out [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.022 MB perf.data (518 samples) ] # perf script ... a.out 396 257.956048: 23 cs: ffffffff81f4eeec schedul> a.out 396 257.957891: 23 cs: ffffffff81f4eeec schedul> a.out 396 257.959730: 23 cs: ffffffff81f4eeec schedul> a.out 396 257.961545: 23 cs: ffffffff81f4eeec schedul> a.out 396 257.963355: 23 cs: ffffffff81f4eeec schedul> a.out 396 257.965163: 23 cs: ffffffff81f4eeec schedul> a.out 396 257.966973: 23 cs: ffffffff81f4eeec schedul> a.out 396 257.968785: 23 cs: ffffffff81f4eeec schedul> a.out 396 257.970593: 23 cs: ffffffff81f4eeec schedul> ... after: # perf record -e cs -F 1000 ./a.out [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.058 MB perf.data (1466 samples) ] # perf script ... a.out 395 59.338813: 11 cs: ffffffff81f4eeec schedul> a.out 395 59.339707: 12 cs: ffffffff81f4eeec schedul> a.out 395 59.340682: 13 cs: ffffffff81f4eeec schedul> a.out 395 59.341751: 13 cs: ffffffff81f4eeec schedul> a.out 395 59.342799: 12 cs: ffffffff81f4eeec schedul> a.out 395 59.343765: 11 cs: ffffffff81f4eeec schedul> a.out 395 59.344651: 11 cs: ffffffff81f4eeec schedul> a.out 395 59.345539: 12 cs: ffffffff81f4eeec schedul> a.out 395 59.346502: 13 cs: ffffffff81f4eeec schedul> ... test.c int main() { for (int i = 0; i < 20000; i++) usleep(10); return 0; } # time ./a.out real 0m1.583s user 0m0.040s sys 0m0.298s The above results were tested on x86-64 qemu with KVM enabled using test.c as test program. Ideally, we should have around 1500 samples, but the previous algorithm had only about 500, whereas the modified algorithm now has about 1400. Further more, the new version shows 1 sample per 0.001s, while the previous one is 1 sample per 0.002s.This indicates that the new algorithm is more sensitive to small negative values compared to old algorithm. Fixes: bd2b5b12849a ("perf_counter: More aggressive frequency adjustment") Signed-off-by: Luo Gengkun Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Adrian Hunter Reviewed-by: Kan Liang Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20240831074316.2106159-2-luogengkun@huaweicloud.com --- kernel/events/core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4acec97e5448..67e115d4ef96 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4183,7 +4183,11 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo period = perf_calculate_period(event, nsec, count); delta = (s64)(period - hwc->sample_period); - delta = (delta + 7) / 8; /* low pass filter */ + if (delta >= 0) + delta += 7; + else + delta -= 7; + delta /= 8; /* low pass filter */ sample_period = hwc->sample_period + delta; -- cgit v1.2.3-58-ga151 From 3f7f1a64da731749f1bbd7424b44c1ec6191f21c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 3 Sep 2024 10:45:56 -0700 Subject: uprobes: revamp uprobe refcounting and lifetime management Revamp how struct uprobe is refcounted, and thus how its lifetime is managed. Right now, there are a few possible "owners" of uprobe refcount: - uprobes_tree RB tree assumes one refcount when uprobe is registered and added to the lookup tree; - while uprobe is triggered and kernel is handling it in the breakpoint handler code, temporary refcount bump is done to keep uprobe from being freed; - if we have uretprobe requested on a given struct uprobe instance, we take another refcount to keep uprobe alive until user space code returns from the function and triggers return handler. The uprobe_tree's extra refcount of 1 is confusing and problematic. No matter how many actual consumers are attached, they all share the same refcount, and we have an extra logic to drop the "last" (which might not really be last) refcount once uprobe's consumer list becomes empty. This is unconventional and has to be kept in mind as a special case all the time. Further, because of this design we have the situations where find_uprobe() will find uprobe, bump refcount, return it to the caller, but that uprobe will still need uprobe_is_active() check, after which the caller is required to drop refcount and try again. This is just too many details leaking to the higher level logic. This patch changes refcounting scheme in such a way as to not have uprobes_tree keeping extra refcount for struct uprobe. Instead, each uprobe_consumer is assuming its own refcount, which will be dropped when consumer is unregistered. Other than that, all the active users of uprobe (entry and return uprobe handling code) keeps exactly the same refcounting approach. With the above setup, once uprobe's refcount drops to zero, we need to make sure that uprobe's "destructor" removes uprobe from uprobes_tree, of course. This, though, races with uprobe entry handling code in handle_swbp(), which, through find_active_uprobe()->find_uprobe() lookup, can race with uprobe being destroyed after refcount drops to zero (e.g., due to uprobe_consumer unregistering). So we add try_get_uprobe(), which will attempt to bump refcount, unless it already is zero. Caller needs to guarantee that uprobe instance won't be freed in parallel, which is the case while we keep uprobes_treelock (for read or write, doesn't matter). Note also, we now don't leak the race between registration and unregistration, so we remove the retry logic completely. If find_uprobe() returns valid uprobe, it's guaranteed to remain in uprobes_tree with properly incremented refcount. The race is handled inside __insert_uprobe() and put_uprobe() working together: __insert_uprobe() will remove uprobe from RB-tree, if it can't bump refcount and will retry to insert the new uprobe instance. put_uprobe() won't attempt to remove uprobe from RB-tree, if it's already not there. All that is protected by uprobes_treelock, which keeps things simple. Signed-off-by: Andrii Nakryiko Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240903174603.3554182-2-andrii@kernel.org --- kernel/events/uprobes.c | 179 +++++++++++++++++++++++++++--------------------- 1 file changed, 101 insertions(+), 78 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index cac45ea4c284..cd92e8dc3ed0 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -109,6 +109,11 @@ struct xol_area { unsigned long vaddr; /* Page(s) of instruction slots */ }; +static void uprobe_warn(struct task_struct *t, const char *msg) +{ + pr_warn("uprobe: %s:%d failed to %s\n", current->comm, current->pid, msg); +} + /* * valid_vma: Verify if the specified vma is an executable vma * Relax restrictions while unregistering: vm_flags might have @@ -587,25 +592,53 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v *(uprobe_opcode_t *)&auprobe->insn); } +/* uprobe should have guaranteed positive refcount */ static struct uprobe *get_uprobe(struct uprobe *uprobe) { refcount_inc(&uprobe->ref); return uprobe; } +/* + * uprobe should have guaranteed lifetime, which can be either of: + * - caller already has refcount taken (and wants an extra one); + * - uprobe is RCU protected and won't be freed until after grace period; + * - we are holding uprobes_treelock (for read or write, doesn't matter). + */ +static struct uprobe *try_get_uprobe(struct uprobe *uprobe) +{ + if (refcount_inc_not_zero(&uprobe->ref)) + return uprobe; + return NULL; +} + +static inline bool uprobe_is_active(struct uprobe *uprobe) +{ + return !RB_EMPTY_NODE(&uprobe->rb_node); +} + static void put_uprobe(struct uprobe *uprobe) { - if (refcount_dec_and_test(&uprobe->ref)) { - /* - * If application munmap(exec_vma) before uprobe_unregister() - * gets called, we don't get a chance to remove uprobe from - * delayed_uprobe_list from remove_breakpoint(). Do it here. - */ - mutex_lock(&delayed_uprobe_lock); - delayed_uprobe_remove(uprobe, NULL); - mutex_unlock(&delayed_uprobe_lock); - kfree(uprobe); - } + if (!refcount_dec_and_test(&uprobe->ref)) + return; + + write_lock(&uprobes_treelock); + + if (uprobe_is_active(uprobe)) + rb_erase(&uprobe->rb_node, &uprobes_tree); + + write_unlock(&uprobes_treelock); + + /* + * If application munmap(exec_vma) before uprobe_unregister() + * gets called, we don't get a chance to remove uprobe from + * delayed_uprobe_list from remove_breakpoint(). Do it here. + */ + mutex_lock(&delayed_uprobe_lock); + delayed_uprobe_remove(uprobe, NULL); + mutex_unlock(&delayed_uprobe_lock); + + kfree(uprobe); } static __always_inline @@ -656,7 +689,7 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) struct rb_node *node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key); if (node) - return get_uprobe(__node_2_uprobe(node)); + return try_get_uprobe(__node_2_uprobe(node)); return NULL; } @@ -676,26 +709,44 @@ static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) return uprobe; } +/* + * Attempt to insert a new uprobe into uprobes_tree. + * + * If uprobe already exists (for given inode+offset), we just increment + * refcount of previously existing uprobe. + * + * If not, a provided new instance of uprobe is inserted into the tree (with + * assumed initial refcount == 1). + * + * In any case, we return a uprobe instance that ends up being in uprobes_tree. + * Caller has to clean up new uprobe instance, if it ended up not being + * inserted into the tree. + * + * We assume that uprobes_treelock is held for writing. + */ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) { struct rb_node *node; - +again: node = rb_find_add(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp); - if (node) - return get_uprobe(__node_2_uprobe(node)); + if (node) { + struct uprobe *u = __node_2_uprobe(node); - /* get access + creation ref */ - refcount_set(&uprobe->ref, 2); - return NULL; + if (!try_get_uprobe(u)) { + rb_erase(node, &uprobes_tree); + RB_CLEAR_NODE(&u->rb_node); + goto again; + } + + return u; + } + + return uprobe; } /* - * Acquire uprobes_treelock. - * Matching uprobe already exists in rbtree; - * increment (access refcount) and return the matching uprobe. - * - * No matching uprobe; insert the uprobe in rb_tree; - * get a double refcount (access + creation) and return NULL. + * Acquire uprobes_treelock and insert uprobe into uprobes_tree + * (or reuse existing one, see __insert_uprobe() comments above). */ static struct uprobe *insert_uprobe(struct uprobe *uprobe) { @@ -732,11 +783,13 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, uprobe->ref_ctr_offset = ref_ctr_offset; init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); + RB_CLEAR_NODE(&uprobe->rb_node); + refcount_set(&uprobe->ref, 1); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); /* a uprobe exists for this inode:offset combination */ - if (cur_uprobe) { + if (cur_uprobe != uprobe) { if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) { ref_ctr_mismatch_warn(cur_uprobe, uprobe); put_uprobe(cur_uprobe); @@ -921,26 +974,6 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad return set_orig_insn(&uprobe->arch, mm, vaddr); } -static inline bool uprobe_is_active(struct uprobe *uprobe) -{ - return !RB_EMPTY_NODE(&uprobe->rb_node); -} -/* - * There could be threads that have already hit the breakpoint. They - * will recheck the current insn and restart if find_uprobe() fails. - * See find_active_uprobe(). - */ -static void delete_uprobe(struct uprobe *uprobe) -{ - if (WARN_ON(!uprobe_is_active(uprobe))) - return; - - write_lock(&uprobes_treelock); - rb_erase(&uprobe->rb_node, &uprobes_tree); - write_unlock(&uprobes_treelock); - RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */ -} - struct map_info { struct map_info *next; struct mm_struct *mm; @@ -1094,17 +1127,13 @@ void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) int err; down_write(&uprobe->register_rwsem); - if (WARN_ON(!consumer_del(uprobe, uc))) + if (WARN_ON(!consumer_del(uprobe, uc))) { err = -ENOENT; - else + } else { err = register_for_each_vma(uprobe, NULL); - - /* TODO : cant unregister? schedule a worker thread */ - if (!err) { - if (!uprobe->consumers) - delete_uprobe(uprobe); - else - err = -EBUSY; + /* TODO : cant unregister? schedule a worker thread */ + if (unlikely(err)) + uprobe_warn(current, "unregister, leaking uprobe"); } up_write(&uprobe->register_rwsem); @@ -1159,27 +1188,16 @@ struct uprobe *uprobe_register(struct inode *inode, if (!IS_ALIGNED(ref_ctr_offset, sizeof(short))) return ERR_PTR(-EINVAL); - retry: uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); if (IS_ERR(uprobe)) return uprobe; - /* - * We can race with uprobe_unregister()->delete_uprobe(). - * Check uprobe_is_active() and retry if it is false. - */ down_write(&uprobe->register_rwsem); - ret = -EAGAIN; - if (likely(uprobe_is_active(uprobe))) { - consumer_add(uprobe, uc); - ret = register_for_each_vma(uprobe, uc); - } + consumer_add(uprobe, uc); + ret = register_for_each_vma(uprobe, uc); up_write(&uprobe->register_rwsem); - put_uprobe(uprobe); if (ret) { - if (unlikely(ret == -EAGAIN)) - goto retry; uprobe_unregister(uprobe, uc); return ERR_PTR(ret); } @@ -1286,15 +1304,17 @@ static void build_probe_list(struct inode *inode, u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset < min) break; - list_add(&u->pending_list, head); - get_uprobe(u); + /* if uprobe went away, it's safe to ignore it */ + if (try_get_uprobe(u)) + list_add(&u->pending_list, head); } for (t = n; (t = rb_next(t)); ) { u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset > max) break; - list_add(&u->pending_list, head); - get_uprobe(u); + /* if uprobe went away, it's safe to ignore it */ + if (try_get_uprobe(u)) + list_add(&u->pending_list, head); } } read_unlock(&uprobes_treelock); @@ -1751,6 +1771,12 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) return -ENOMEM; *n = *o; + /* + * uprobe's refcnt has to be positive at this point, kept by + * utask->return_instances items; return_instances can't be + * removed right now, as task is blocked due to duping; so + * get_uprobe() is safe to use here. + */ get_uprobe(n->uprobe); n->next = NULL; @@ -1762,12 +1788,6 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) return 0; } -static void uprobe_warn(struct task_struct *t, const char *msg) -{ - pr_warn("uprobe: %s:%d failed to %s\n", - current->comm, current->pid, msg); -} - static void dup_xol_work(struct callback_head *work) { if (current->flags & PF_EXITING) @@ -1893,7 +1913,10 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) } orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; } - + /* + * uprobe's refcnt is positive, held by caller, so it's safe to + * unconditionally bump it one more time here + */ ri->uprobe = get_uprobe(uprobe); ri->func = instruction_pointer(regs); ri->stack = user_stack_pointer(regs); -- cgit v1.2.3-58-ga151 From 8617408f7a01e94ce1f73e40a7704530e5dfb25c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 3 Sep 2024 10:45:57 -0700 Subject: uprobes: protected uprobe lifetime with SRCU To avoid unnecessarily taking a (brief) refcount on uprobe during breakpoint handling in handle_swbp for entry uprobes, make find_uprobe() not take refcount, but protect the lifetime of a uprobe instance with RCU. This improves scalability, as refcount gets quite expensive due to cache line bouncing between multiple CPUs. Specifically, we utilize our own uprobe-specific SRCU instance for this RCU protection. put_uprobe() will delay actual kfree() using call_srcu(). For now, uretprobe and single-stepping handling will still acquire refcount as necessary. We'll address these issues in follow up patches by making them use SRCU with timeout. Signed-off-by: Andrii Nakryiko Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240903174603.3554182-3-andrii@kernel.org --- kernel/events/uprobes.c | 94 ++++++++++++++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 40 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index cd92e8dc3ed0..d228d2ba30bb 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -41,6 +41,8 @@ static struct rb_root uprobes_tree = RB_ROOT; static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */ +DEFINE_STATIC_SRCU(uprobes_srcu); + #define UPROBES_HASH_SZ 13 /* serialize uprobe->pending_list */ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; @@ -59,6 +61,7 @@ struct uprobe { struct list_head pending_list; struct uprobe_consumer *consumers; struct inode *inode; /* Also hold a ref to inode */ + struct rcu_head rcu; loff_t offset; loff_t ref_ctr_offset; unsigned long flags; @@ -617,6 +620,13 @@ static inline bool uprobe_is_active(struct uprobe *uprobe) return !RB_EMPTY_NODE(&uprobe->rb_node); } +static void uprobe_free_rcu(struct rcu_head *rcu) +{ + struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); + + kfree(uprobe); +} + static void put_uprobe(struct uprobe *uprobe) { if (!refcount_dec_and_test(&uprobe->ref)) @@ -638,7 +648,7 @@ static void put_uprobe(struct uprobe *uprobe) delayed_uprobe_remove(uprobe, NULL); mutex_unlock(&delayed_uprobe_lock); - kfree(uprobe); + call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu); } static __always_inline @@ -680,33 +690,25 @@ static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b) return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b)); } -static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) +/* + * Assumes being inside RCU protected region. + * No refcount is taken on returned uprobe. + */ +static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset) { struct __uprobe_key key = { .inode = inode, .offset = offset, }; - struct rb_node *node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key); - - if (node) - return try_get_uprobe(__node_2_uprobe(node)); - - return NULL; -} + struct rb_node *node; -/* - * Find a uprobe corresponding to a given inode:offset - * Acquires uprobes_treelock - */ -static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) -{ - struct uprobe *uprobe; + lockdep_assert(srcu_read_lock_held(&uprobes_srcu)); read_lock(&uprobes_treelock); - uprobe = __find_uprobe(inode, offset); + node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key); read_unlock(&uprobes_treelock); - return uprobe; + return node ? __node_2_uprobe(node) : NULL; } /* @@ -1080,10 +1082,10 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) goto free; /* * We take mmap_lock for writing to avoid the race with - * find_active_uprobe() which takes mmap_lock for reading. + * find_active_uprobe_rcu() which takes mmap_lock for reading. * Thus this install_breakpoint() can not make - * is_trap_at_addr() true right after find_uprobe() - * returns NULL in find_active_uprobe(). + * is_trap_at_addr() true right after find_uprobe_rcu() + * returns NULL in find_active_uprobe_rcu(). */ mmap_write_lock(mm); vma = find_vma(mm, info->vaddr); @@ -1884,9 +1886,13 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) return; } + /* we need to bump refcount to store uprobe in utask */ + if (!try_get_uprobe(uprobe)) + return; + ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL); if (!ri) - return; + goto fail; trampoline_vaddr = uprobe_get_trampoline_vaddr(); orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); @@ -1913,11 +1919,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) } orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; } - /* - * uprobe's refcnt is positive, held by caller, so it's safe to - * unconditionally bump it one more time here - */ - ri->uprobe = get_uprobe(uprobe); + ri->uprobe = uprobe; ri->func = instruction_pointer(regs); ri->stack = user_stack_pointer(regs); ri->orig_ret_vaddr = orig_ret_vaddr; @@ -1928,8 +1930,9 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) utask->return_instances = ri; return; - fail: +fail: kfree(ri); + put_uprobe(uprobe); } /* Prepare to single-step probed instruction out of line. */ @@ -1944,9 +1947,14 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) if (!utask) return -ENOMEM; + if (!try_get_uprobe(uprobe)) + return -EINVAL; + xol_vaddr = xol_get_insn_slot(uprobe); - if (!xol_vaddr) - return -ENOMEM; + if (!xol_vaddr) { + err = -ENOMEM; + goto err_out; + } utask->xol_vaddr = xol_vaddr; utask->vaddr = bp_vaddr; @@ -1954,12 +1962,15 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) err = arch_uprobe_pre_xol(&uprobe->arch, regs); if (unlikely(err)) { xol_free_insn_slot(current); - return err; + goto err_out; } utask->active_uprobe = uprobe; utask->state = UTASK_SSTEP; return 0; +err_out: + put_uprobe(uprobe); + return err; } /* @@ -2043,7 +2054,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) return is_trap_insn(&opcode); } -static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) +/* assumes being inside RCU protected region */ +static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp) { struct mm_struct *mm = current->mm; struct uprobe *uprobe = NULL; @@ -2056,7 +2068,7 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) struct inode *inode = file_inode(vma->vm_file); loff_t offset = vaddr_to_offset(vma, bp_vaddr); - uprobe = find_uprobe(inode, offset); + uprobe = find_uprobe_rcu(inode, offset); } if (!uprobe) @@ -2202,13 +2214,15 @@ static void handle_swbp(struct pt_regs *regs) { struct uprobe *uprobe; unsigned long bp_vaddr; - int is_swbp; + int is_swbp, srcu_idx; bp_vaddr = uprobe_get_swbp_addr(regs); if (bp_vaddr == uprobe_get_trampoline_vaddr()) return uprobe_handle_trampoline(regs); - uprobe = find_active_uprobe(bp_vaddr, &is_swbp); + srcu_idx = srcu_read_lock(&uprobes_srcu); + + uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); if (!uprobe) { if (is_swbp > 0) { /* No matching uprobe; signal SIGTRAP. */ @@ -2224,7 +2238,7 @@ static void handle_swbp(struct pt_regs *regs) */ instruction_pointer_set(regs, bp_vaddr); } - return; + goto out; } /* change it in advance for ->handler() and restart */ @@ -2259,12 +2273,12 @@ static void handle_swbp(struct pt_regs *regs) if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; - if (!pre_ssout(uprobe, regs, bp_vaddr)) - return; + if (pre_ssout(uprobe, regs, bp_vaddr)) + goto out; - /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ out: - put_uprobe(uprobe); + /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ + srcu_read_unlock(&uprobes_srcu, srcu_idx); } /* -- cgit v1.2.3-58-ga151 From 59da880afed211c989ef65da577b24215ce57774 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 3 Sep 2024 10:45:58 -0700 Subject: uprobes: get rid of enum uprobe_filter_ctx in uprobe filter callbacks It serves no purpose beyond adding unnecessray argument passed to the filter callback. Just get rid of it, no one is actually using it. Signed-off-by: Andrii Nakryiko Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240903174603.3554182-4-andrii@kernel.org --- include/linux/uprobes.h | 10 +--------- kernel/events/uprobes.c | 18 +++++++----------- kernel/trace/bpf_trace.c | 3 +-- kernel/trace/trace_uprobe.c | 9 +++------ 4 files changed, 12 insertions(+), 28 deletions(-) (limited to 'kernel/events') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index f50df1fa93e7..63ae2ade3487 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -28,20 +28,12 @@ struct page; #define MAX_URETPROBE_DEPTH 64 -enum uprobe_filter_ctx { - UPROBE_FILTER_REGISTER, - UPROBE_FILTER_UNREGISTER, - UPROBE_FILTER_MMAP, -}; - struct uprobe_consumer { int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs); int (*ret_handler)(struct uprobe_consumer *self, unsigned long func, struct pt_regs *regs); - bool (*filter)(struct uprobe_consumer *self, - enum uprobe_filter_ctx ctx, - struct mm_struct *mm); + bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm); struct uprobe_consumer *next; }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d228d2ba30bb..87b499cd2334 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -918,21 +918,19 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, return ret; } -static inline bool consumer_filter(struct uprobe_consumer *uc, - enum uprobe_filter_ctx ctx, struct mm_struct *mm) +static inline bool consumer_filter(struct uprobe_consumer *uc, struct mm_struct *mm) { - return !uc->filter || uc->filter(uc, ctx, mm); + return !uc->filter || uc->filter(uc, mm); } -static bool filter_chain(struct uprobe *uprobe, - enum uprobe_filter_ctx ctx, struct mm_struct *mm) +static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm) { struct uprobe_consumer *uc; bool ret = false; down_read(&uprobe->consumer_rwsem); for (uc = uprobe->consumers; uc; uc = uc->next) { - ret = consumer_filter(uc, ctx, mm); + ret = consumer_filter(uc, mm); if (ret) break; } @@ -1099,12 +1097,10 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) if (is_register) { /* consult only the "caller", new consumer. */ - if (consumer_filter(new, - UPROBE_FILTER_REGISTER, mm)) + if (consumer_filter(new, mm)) err = install_breakpoint(uprobe, mm, vma, info->vaddr); } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { - if (!filter_chain(uprobe, - UPROBE_FILTER_UNREGISTER, mm)) + if (!filter_chain(uprobe, mm)) err |= remove_breakpoint(uprobe, mm, info->vaddr); } @@ -1387,7 +1383,7 @@ int uprobe_mmap(struct vm_area_struct *vma) */ list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { if (!fatal_signal_pending(current) && - filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) { + filter_chain(uprobe, vma->vm_mm)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 90cd30e9723e..c99bf0617602 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3320,8 +3320,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe, } static bool -uprobe_multi_link_filter(struct uprobe_consumer *con, enum uprobe_filter_ctx ctx, - struct mm_struct *mm) +uprobe_multi_link_filter(struct uprobe_consumer *con, struct mm_struct *mm) { struct bpf_uprobe *uprobe; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 52e76a73fa7c..7eb79e0a5352 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1078,9 +1078,7 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e return trace_handle_return(s); } -typedef bool (*filter_func_t)(struct uprobe_consumer *self, - enum uprobe_filter_ctx ctx, - struct mm_struct *mm); +typedef bool (*filter_func_t)(struct uprobe_consumer *self, struct mm_struct *mm); static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter) { @@ -1339,8 +1337,7 @@ static int uprobe_perf_open(struct trace_event_call *call, return err; } -static bool uprobe_perf_filter(struct uprobe_consumer *uc, - enum uprobe_filter_ctx ctx, struct mm_struct *mm) +static bool uprobe_perf_filter(struct uprobe_consumer *uc, struct mm_struct *mm) { struct trace_uprobe_filter *filter; struct trace_uprobe *tu; @@ -1426,7 +1423,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs, struct uprobe_cpu_buffer **ucbp) { - if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) + if (!uprobe_perf_filter(&tu->consumer, current->mm)) return UPROBE_HANDLER_REMOVE; if (!is_ret_probe(tu)) -- cgit v1.2.3-58-ga151 From cc01bd044e6a521d2cd128f685ee8d23ef0067f2 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 3 Sep 2024 10:45:59 -0700 Subject: uprobes: travers uprobe's consumer list locklessly under SRCU protection uprobe->register_rwsem is one of a few big bottlenecks to scalability of uprobes, so we need to get rid of it to improve uprobe performance and multi-CPU scalability. First, we turn uprobe's consumer list to a typical doubly-linked list and utilize existing RCU-aware helpers for traversing such lists, as well as adding and removing elements from it. For entry uprobes we already have SRCU protection active since before uprobe lookup. For uretprobe we keep refcount, guaranteeing that uprobe won't go away from under us, but we add SRCU protection around consumer list traversal. Lastly, to keep handler_chain()'s UPROBE_HANDLER_REMOVE handling simple, we remember whether any removal was requested during handler calls, but then we double-check the decision under a proper register_rwsem using consumers' filter callbacks. Handler removal is very rare, so this extra lock won't hurt performance, overall, but we also avoid the need for any extra protection (e.g., seqcount locks). Signed-off-by: Andrii Nakryiko Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240903174603.3554182-5-andrii@kernel.org --- include/linux/uprobes.h | 10 ++++- kernel/events/uprobes.c | 104 ++++++++++++++++++++++++++++-------------------- 2 files changed, 70 insertions(+), 44 deletions(-) (limited to 'kernel/events') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 63ae2ade3487..f112b56e9479 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -29,13 +29,21 @@ struct page; #define MAX_URETPROBE_DEPTH 64 struct uprobe_consumer { + /* + * handler() can return UPROBE_HANDLER_REMOVE to signal the need to + * unregister uprobe for current process. If UPROBE_HANDLER_REMOVE is + * returned, filter() callback has to be implemented as well and it + * should return false to "confirm" the decision to uninstall uprobe + * for the current process. If filter() is omitted or returns true, + * UPROBE_HANDLER_REMOVE is effectively ignored. + */ int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs); int (*ret_handler)(struct uprobe_consumer *self, unsigned long func, struct pt_regs *regs); bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm); - struct uprobe_consumer *next; + struct list_head cons_node; }; #ifdef CONFIG_UPROBES diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 87b499cd2334..e15c0306e535 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -59,7 +59,7 @@ struct uprobe { struct rw_semaphore register_rwsem; struct rw_semaphore consumer_rwsem; struct list_head pending_list; - struct uprobe_consumer *consumers; + struct list_head consumers; struct inode *inode; /* Also hold a ref to inode */ struct rcu_head rcu; loff_t offset; @@ -783,6 +783,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, uprobe->inode = inode; uprobe->offset = offset; uprobe->ref_ctr_offset = ref_ctr_offset; + INIT_LIST_HEAD(&uprobe->consumers); init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); RB_CLEAR_NODE(&uprobe->rb_node); @@ -808,32 +809,19 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) { down_write(&uprobe->consumer_rwsem); - uc->next = uprobe->consumers; - uprobe->consumers = uc; + list_add_rcu(&uc->cons_node, &uprobe->consumers); up_write(&uprobe->consumer_rwsem); } /* * For uprobe @uprobe, delete the consumer @uc. - * Return true if the @uc is deleted successfully - * or return false. + * Should never be called with consumer that's not part of @uprobe->consumers. */ -static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) +static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) { - struct uprobe_consumer **con; - bool ret = false; - down_write(&uprobe->consumer_rwsem); - for (con = &uprobe->consumers; *con; con = &(*con)->next) { - if (*con == uc) { - *con = uc->next; - ret = true; - break; - } - } + list_del_rcu(&uc->cons_node); up_write(&uprobe->consumer_rwsem); - - return ret; } static int __copy_insn(struct address_space *mapping, struct file *filp, @@ -929,7 +917,8 @@ static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm) bool ret = false; down_read(&uprobe->consumer_rwsem); - for (uc = uprobe->consumers; uc; uc = uc->next) { + list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node, + srcu_read_lock_held(&uprobes_srcu)) { ret = consumer_filter(uc, mm); if (ret) break; @@ -1125,18 +1114,29 @@ void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) int err; down_write(&uprobe->register_rwsem); - if (WARN_ON(!consumer_del(uprobe, uc))) { - err = -ENOENT; - } else { - err = register_for_each_vma(uprobe, NULL); - /* TODO : cant unregister? schedule a worker thread */ - if (unlikely(err)) - uprobe_warn(current, "unregister, leaking uprobe"); - } + consumer_del(uprobe, uc); + err = register_for_each_vma(uprobe, NULL); up_write(&uprobe->register_rwsem); - if (!err) - put_uprobe(uprobe); + /* TODO : cant unregister? schedule a worker thread */ + if (unlikely(err)) { + uprobe_warn(current, "unregister, leaking uprobe"); + goto out_sync; + } + + put_uprobe(uprobe); + +out_sync: + /* + * Now that handler_chain() and handle_uretprobe_chain() iterate over + * uprobe->consumers list under RCU protection without holding + * uprobe->register_rwsem, we need to wait for RCU grace period to + * make sure that we can't call into just unregistered + * uprobe_consumer's callbacks anymore. If we don't do that, fast and + * unlucky enough caller can free consumer's memory and cause + * handler_chain() or handle_uretprobe_chain() to do an use-after-free. + */ + synchronize_srcu(&uprobes_srcu); } EXPORT_SYMBOL_GPL(uprobe_unregister); @@ -1214,13 +1214,20 @@ EXPORT_SYMBOL_GPL(uprobe_register); int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add) { struct uprobe_consumer *con; - int ret = -ENOENT; + int ret = -ENOENT, srcu_idx; down_write(&uprobe->register_rwsem); - for (con = uprobe->consumers; con && con != uc ; con = con->next) - ; - if (con) - ret = register_for_each_vma(uprobe, add ? uc : NULL); + + srcu_idx = srcu_read_lock(&uprobes_srcu); + list_for_each_entry_srcu(con, &uprobe->consumers, cons_node, + srcu_read_lock_held(&uprobes_srcu)) { + if (con == uc) { + ret = register_for_each_vma(uprobe, add ? uc : NULL); + break; + } + } + srcu_read_unlock(&uprobes_srcu, srcu_idx); + up_write(&uprobe->register_rwsem); return ret; @@ -2085,10 +2092,12 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) struct uprobe_consumer *uc; int remove = UPROBE_HANDLER_REMOVE; bool need_prep = false; /* prepare return uprobe, when needed */ + bool has_consumers = false; - down_read(&uprobe->register_rwsem); current->utask->auprobe = &uprobe->arch; - for (uc = uprobe->consumers; uc; uc = uc->next) { + + list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node, + srcu_read_lock_held(&uprobes_srcu)) { int rc = 0; if (uc->handler) { @@ -2101,17 +2110,24 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) need_prep = true; remove &= rc; + has_consumers = true; } current->utask->auprobe = NULL; if (need_prep && !remove) prepare_uretprobe(uprobe, regs); /* put bp at return */ - if (remove && uprobe->consumers) { - WARN_ON(!uprobe_is_active(uprobe)); - unapply_uprobe(uprobe, current->mm); + if (remove && has_consumers) { + down_read(&uprobe->register_rwsem); + + /* re-check that removal is still required, this time under lock */ + if (!filter_chain(uprobe, current->mm)) { + WARN_ON(!uprobe_is_active(uprobe)); + unapply_uprobe(uprobe, current->mm); + } + + up_read(&uprobe->register_rwsem); } - up_read(&uprobe->register_rwsem); } static void @@ -2119,13 +2135,15 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) { struct uprobe *uprobe = ri->uprobe; struct uprobe_consumer *uc; + int srcu_idx; - down_read(&uprobe->register_rwsem); - for (uc = uprobe->consumers; uc; uc = uc->next) { + srcu_idx = srcu_read_lock(&uprobes_srcu); + list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node, + srcu_read_lock_held(&uprobes_srcu)) { if (uc->ret_handler) uc->ret_handler(uc, ri->func, regs); } - up_read(&uprobe->register_rwsem); + srcu_read_unlock(&uprobes_srcu, srcu_idx); } static struct return_instance *find_next_ret_chain(struct return_instance *ri) -- cgit v1.2.3-58-ga151 From 04b01625da130c7521b768996cd5e48052198b97 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 3 Sep 2024 10:46:00 -0700 Subject: perf/uprobe: split uprobe_unregister() With uprobe_unregister() having grown a synchronize_srcu(), it becomes fairly slow to call. Esp. since both users of this API call it in a loop. Peel off the sync_srcu() and do it once, after the loop. We also need to add uprobe_unregister_sync() into uprobe_register()'s error handling path, as we need to be careful about returning to the caller before we have a guarantee that partially attached consumer won't be called anymore. This is an unlikely slow path and this should be totally fine to be slow in the case of a failed attach. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: "Peter Zijlstra (Intel)" Co-developed-by: Andrii Nakryiko Signed-off-by: Andrii Nakryiko Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240903174603.3554182-6-andrii@kernel.org --- include/linux/uprobes.h | 8 ++++++-- kernel/events/uprobes.c | 21 +++++++++++++++------ kernel/trace/bpf_trace.c | 5 ++++- kernel/trace/trace_uprobe.c | 6 +++++- .../testing/selftests/bpf/bpf_testmod/bpf_testmod.c | 3 ++- 5 files changed, 32 insertions(+), 11 deletions(-) (limited to 'kernel/events') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index f112b56e9479..2b294bf1881f 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -115,7 +115,8 @@ extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); -extern void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc); +extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); +extern void uprobe_unregister_sync(void); extern int uprobe_mmap(struct vm_area_struct *vma); extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void uprobe_start_dup_mmap(void); @@ -164,7 +165,10 @@ uprobe_apply(struct uprobe* uprobe, struct uprobe_consumer *uc, bool add) return -ENOSYS; } static inline void -uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) +uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc) +{ +} +static inline void uprobe_unregister_sync(void) { } static inline int uprobe_mmap(struct vm_area_struct *vma) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e15c0306e535..694f6790e848 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1105,11 +1105,11 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) } /** - * uprobe_unregister - unregister an already registered probe. + * uprobe_unregister_nosync - unregister an already registered probe. * @uprobe: uprobe to remove * @uc: identify which probe if multiple probes are colocated. */ -void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) +void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc) { int err; @@ -1121,12 +1121,15 @@ void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) /* TODO : cant unregister? schedule a worker thread */ if (unlikely(err)) { uprobe_warn(current, "unregister, leaking uprobe"); - goto out_sync; + return; } put_uprobe(uprobe); +} +EXPORT_SYMBOL_GPL(uprobe_unregister_nosync); -out_sync: +void uprobe_unregister_sync(void) +{ /* * Now that handler_chain() and handle_uretprobe_chain() iterate over * uprobe->consumers list under RCU protection without holding @@ -1138,7 +1141,7 @@ out_sync: */ synchronize_srcu(&uprobes_srcu); } -EXPORT_SYMBOL_GPL(uprobe_unregister); +EXPORT_SYMBOL_GPL(uprobe_unregister_sync); /** * uprobe_register - register a probe @@ -1196,7 +1199,13 @@ struct uprobe *uprobe_register(struct inode *inode, up_write(&uprobe->register_rwsem); if (ret) { - uprobe_unregister(uprobe, uc); + uprobe_unregister_nosync(uprobe, uc); + /* + * Registration might have partially succeeded, so we can have + * this consumer being called right at this time. We need to + * sync here. It's ok, it's unlikely slow path. + */ + uprobe_unregister_sync(); return ERR_PTR(ret); } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index c99bf0617602..ac0a01cc8634 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3184,7 +3184,10 @@ static void bpf_uprobe_unregister(struct bpf_uprobe *uprobes, u32 cnt) u32 i; for (i = 0; i < cnt; i++) - uprobe_unregister(uprobes[i].uprobe, &uprobes[i].consumer); + uprobe_unregister_nosync(uprobes[i].uprobe, &uprobes[i].consumer); + + if (cnt) + uprobe_unregister_sync(); } static void bpf_uprobe_multi_link_release(struct bpf_link *link) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 7eb79e0a5352..f7443e996b1b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1097,6 +1097,7 @@ static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter) static void __probe_event_disable(struct trace_probe *tp) { struct trace_uprobe *tu; + bool sync = false; tu = container_of(tp, struct trace_uprobe, tp); WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter)); @@ -1105,9 +1106,12 @@ static void __probe_event_disable(struct trace_probe *tp) if (!tu->uprobe) continue; - uprobe_unregister(tu->uprobe, &tu->consumer); + uprobe_unregister_nosync(tu->uprobe, &tu->consumer); + sync = true; tu->uprobe = NULL; } + if (sync) + uprobe_unregister_sync(); } static int probe_event_enable(struct trace_event_call *call, diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 3c0515a27842..1fc16657cf42 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -475,7 +475,8 @@ static void testmod_unregister_uprobe(void) mutex_lock(&testmod_uprobe_mutex); if (uprobe.uprobe) { - uprobe_unregister(uprobe.uprobe, &uprobe.consumer); + uprobe_unregister_nosync(uprobe.uprobe, &uprobe.consumer); + uprobe_unregister_sync(); path_put(&uprobe.path); uprobe.uprobe = NULL; } -- cgit v1.2.3-58-ga151 From cd7bdd9d46a9540f3a20a0e14c99aa37b2d4a1dd Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 3 Sep 2024 10:46:02 -0700 Subject: uprobes: perform lockless SRCU-protected uprobes_tree lookup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Another big bottleneck to scalablity is uprobe_treelock that's taken in a very hot path in handle_swbp(). Now that uprobes are SRCU-protected, take advantage of that and make uprobes_tree RB-tree look up lockless. To make RB-tree RCU-protected lockless lookup correct, we need to take into account that such RB-tree lookup can return false negatives if there are parallel RB-tree modifications (rotations) going on. We use seqcount lock to detect whether RB-tree changed, and if we find nothing while RB-tree got modified inbetween, we just retry. If uprobe was found, then it's guaranteed to be a correct lookup. With all the lock-avoiding changes done, we get a pretty decent improvement in performance and scalability of uprobes with number of CPUs, even though we are still nowhere near linear scalability. This is due to SRCU not really scaling very well with number of CPUs on a particular hardware that was used for testing (80-core Intel Xeon Gold 6138 CPU @ 2.00GHz), but also due to the remaning mmap_lock, which is currently taken to resolve interrupt address to inode+offset and then uprobe instance. And, of course, uretprobes still need similar RCU to avoid refcount in the hot path, which will be addressed in the follow up patches. Nevertheless, the improvement is good. We used BPF selftest-based uprobe-nop and uretprobe-nop benchmarks to get the below numbers, varying number of CPUs on which uprobes and uretprobes are triggered. BASELINE ======== uprobe-nop ( 1 cpus): 3.032 ± 0.023M/s ( 3.032M/s/cpu) uprobe-nop ( 2 cpus): 3.452 ± 0.005M/s ( 1.726M/s/cpu) uprobe-nop ( 4 cpus): 3.663 ± 0.005M/s ( 0.916M/s/cpu) uprobe-nop ( 8 cpus): 3.718 ± 0.038M/s ( 0.465M/s/cpu) uprobe-nop (16 cpus): 3.344 ± 0.008M/s ( 0.209M/s/cpu) uprobe-nop (32 cpus): 2.288 ± 0.021M/s ( 0.071M/s/cpu) uprobe-nop (64 cpus): 3.205 ± 0.004M/s ( 0.050M/s/cpu) uretprobe-nop ( 1 cpus): 1.979 ± 0.005M/s ( 1.979M/s/cpu) uretprobe-nop ( 2 cpus): 2.361 ± 0.005M/s ( 1.180M/s/cpu) uretprobe-nop ( 4 cpus): 2.309 ± 0.002M/s ( 0.577M/s/cpu) uretprobe-nop ( 8 cpus): 2.253 ± 0.001M/s ( 0.282M/s/cpu) uretprobe-nop (16 cpus): 2.007 ± 0.000M/s ( 0.125M/s/cpu) uretprobe-nop (32 cpus): 1.624 ± 0.003M/s ( 0.051M/s/cpu) uretprobe-nop (64 cpus): 2.149 ± 0.001M/s ( 0.034M/s/cpu) SRCU CHANGES ============ uprobe-nop ( 1 cpus): 3.276 ± 0.005M/s ( 3.276M/s/cpu) uprobe-nop ( 2 cpus): 4.125 ± 0.002M/s ( 2.063M/s/cpu) uprobe-nop ( 4 cpus): 7.713 ± 0.002M/s ( 1.928M/s/cpu) uprobe-nop ( 8 cpus): 8.097 ± 0.006M/s ( 1.012M/s/cpu) uprobe-nop (16 cpus): 6.501 ± 0.056M/s ( 0.406M/s/cpu) uprobe-nop (32 cpus): 4.398 ± 0.084M/s ( 0.137M/s/cpu) uprobe-nop (64 cpus): 6.452 ± 0.000M/s ( 0.101M/s/cpu) uretprobe-nop ( 1 cpus): 2.055 ± 0.001M/s ( 2.055M/s/cpu) uretprobe-nop ( 2 cpus): 2.677 ± 0.000M/s ( 1.339M/s/cpu) uretprobe-nop ( 4 cpus): 4.561 ± 0.003M/s ( 1.140M/s/cpu) uretprobe-nop ( 8 cpus): 5.291 ± 0.002M/s ( 0.661M/s/cpu) uretprobe-nop (16 cpus): 5.065 ± 0.019M/s ( 0.317M/s/cpu) uretprobe-nop (32 cpus): 3.622 ± 0.003M/s ( 0.113M/s/cpu) uretprobe-nop (64 cpus): 3.723 ± 0.002M/s ( 0.058M/s/cpu) Peak througput increased from 3.7 mln/s (uprobe triggerings) up to about 8 mln/s. For uretprobes it's a bit more modest with bump from 2.4 mln/s to 5mln/s. Suggested-by: "Peter Zijlstra (Intel)" Signed-off-by: Andrii Nakryiko Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240903174603.3554182-8-andrii@kernel.org --- kernel/events/uprobes.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 694f6790e848..4b7e590dc428 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -40,6 +40,7 @@ static struct rb_root uprobes_tree = RB_ROOT; #define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree) static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */ +static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock); DEFINE_STATIC_SRCU(uprobes_srcu); @@ -634,8 +635,11 @@ static void put_uprobe(struct uprobe *uprobe) write_lock(&uprobes_treelock); - if (uprobe_is_active(uprobe)) + if (uprobe_is_active(uprobe)) { + write_seqcount_begin(&uprobes_seqcount); rb_erase(&uprobe->rb_node, &uprobes_tree); + write_seqcount_end(&uprobes_seqcount); + } write_unlock(&uprobes_treelock); @@ -701,14 +705,26 @@ static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset) .offset = offset, }; struct rb_node *node; + unsigned int seq; lockdep_assert(srcu_read_lock_held(&uprobes_srcu)); - read_lock(&uprobes_treelock); - node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key); - read_unlock(&uprobes_treelock); + do { + seq = read_seqcount_begin(&uprobes_seqcount); + node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key); + /* + * Lockless RB-tree lookups can result only in false negatives. + * If the element is found, it is correct and can be returned + * under RCU protection. If we find nothing, we need to + * validate that seqcount didn't change. If it did, we have to + * try again as we might have missed the element (false + * negative). If seqcount is unchanged, search truly failed. + */ + if (node) + return __node_2_uprobe(node); + } while (read_seqcount_retry(&uprobes_seqcount, seq)); - return node ? __node_2_uprobe(node) : NULL; + return NULL; } /* @@ -730,7 +746,7 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) { struct rb_node *node; again: - node = rb_find_add(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp); + node = rb_find_add_rcu(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp); if (node) { struct uprobe *u = __node_2_uprobe(node); @@ -755,7 +771,9 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) struct uprobe *u; write_lock(&uprobes_treelock); + write_seqcount_begin(&uprobes_seqcount); u = __insert_uprobe(uprobe); + write_seqcount_end(&uprobes_seqcount); write_unlock(&uprobes_treelock); return u; -- cgit v1.2.3-58-ga151 From 4ba4f1afb6a9fed8ef896c2363076e36572f71da Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 2 Aug 2024 08:16:37 -0700 Subject: perf: Generic hotplug support for a PMU with a scope The perf subsystem assumes that the counters of a PMU are per-CPU. So the user space tool reads a counter from each CPU in the system wide mode. However, many PMUs don't have a per-CPU counter. The counter is effective for a scope, e.g., a die or a socket. To address this, a cpumask is exposed by the kernel driver to restrict to one CPU to stand for a specific scope. In case the given CPU is removed, the hotplug support has to be implemented for each such driver. The codes to support the cpumask and hotplug are very similar. - Expose a cpumask into sysfs - Pickup another CPU in the same scope if the given CPU is removed. - Invoke the perf_pmu_migrate_context() to migrate to a new CPU. - In event init, always set the CPU in the cpumask to event->cpu Similar duplicated codes are implemented for each such PMU driver. It would be good to introduce a generic infrastructure to avoid such duplication. 5 popular scopes are implemented here, core, die, cluster, pkg, and the system-wide. The scope can be set when a PMU is registered. If so, a "cpumask" is automatically exposed for the PMU. The "cpumask" is from the perf_online__mask, which is to track the active CPU for each scope. They are set when the first CPU of the scope is online via the generic perf hotplug support. When a corresponding CPU is removed, the perf_online__mask is updated accordingly and the PMU will be moved to a new CPU from the same scope if possible. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240802151643.1691631-2-kan.liang@linux.intel.com --- include/linux/perf_event.h | 18 +++++ kernel/events/core.c | 164 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 180 insertions(+), 2 deletions(-) (limited to 'kernel/events') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 701549967c18..a3cbcd727366 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -295,6 +295,19 @@ struct perf_event_pmu_context; #define PERF_PMU_CAP_AUX_OUTPUT 0x0080 #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 +/** + * pmu::scope + */ +enum perf_pmu_scope { + PERF_PMU_SCOPE_NONE = 0, + PERF_PMU_SCOPE_CORE, + PERF_PMU_SCOPE_DIE, + PERF_PMU_SCOPE_CLUSTER, + PERF_PMU_SCOPE_PKG, + PERF_PMU_SCOPE_SYS_WIDE, + PERF_PMU_MAX_SCOPE, +}; + struct perf_output_handle; #define PMU_NULL_DEV ((void *)(~0UL)) @@ -318,6 +331,11 @@ struct pmu { */ int capabilities; + /* + * PMU scope + */ + unsigned int scope; + int __percpu *pmu_disable_count; struct perf_cpu_pmu_context __percpu *cpu_pmu_context; atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 67e115d4ef96..5ff973513fac 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -436,6 +436,11 @@ static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); static struct srcu_struct pmus_srcu; static cpumask_var_t perf_online_mask; +static cpumask_var_t perf_online_core_mask; +static cpumask_var_t perf_online_die_mask; +static cpumask_var_t perf_online_cluster_mask; +static cpumask_var_t perf_online_pkg_mask; +static cpumask_var_t perf_online_sys_mask; static struct kmem_cache *perf_event_cache; /* @@ -11578,10 +11583,60 @@ perf_event_mux_interval_ms_store(struct device *dev, } static DEVICE_ATTR_RW(perf_event_mux_interval_ms); +static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu) +{ + switch (scope) { + case PERF_PMU_SCOPE_CORE: + return topology_sibling_cpumask(cpu); + case PERF_PMU_SCOPE_DIE: + return topology_die_cpumask(cpu); + case PERF_PMU_SCOPE_CLUSTER: + return topology_cluster_cpumask(cpu); + case PERF_PMU_SCOPE_PKG: + return topology_core_cpumask(cpu); + case PERF_PMU_SCOPE_SYS_WIDE: + return cpu_online_mask; + } + + return NULL; +} + +static inline struct cpumask *perf_scope_cpumask(unsigned int scope) +{ + switch (scope) { + case PERF_PMU_SCOPE_CORE: + return perf_online_core_mask; + case PERF_PMU_SCOPE_DIE: + return perf_online_die_mask; + case PERF_PMU_SCOPE_CLUSTER: + return perf_online_cluster_mask; + case PERF_PMU_SCOPE_PKG: + return perf_online_pkg_mask; + case PERF_PMU_SCOPE_SYS_WIDE: + return perf_online_sys_mask; + } + + return NULL; +} + +static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct pmu *pmu = dev_get_drvdata(dev); + struct cpumask *mask = perf_scope_cpumask(pmu->scope); + + if (mask) + return cpumap_print_to_pagebuf(true, buf, mask); + return 0; +} + +static DEVICE_ATTR_RO(cpumask); + static struct attribute *pmu_dev_attrs[] = { &dev_attr_type.attr, &dev_attr_perf_event_mux_interval_ms.attr, &dev_attr_nr_addr_filters.attr, + &dev_attr_cpumask.attr, NULL, }; @@ -11593,6 +11648,10 @@ static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int if (n == 2 && !pmu->nr_addr_filters) return 0; + /* cpumask */ + if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE) + return 0; + return a->mode; } @@ -11677,6 +11736,11 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) goto free_pdc; } + if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) { + ret = -EINVAL; + goto free_pdc; + } + pmu->name = name; if (type >= 0) @@ -11831,6 +11895,22 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) event_has_any_exclude_flag(event)) ret = -EINVAL; + if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) { + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu); + struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope); + int cpu; + + if (pmu_cpumask && cpumask) { + cpu = cpumask_any_and(pmu_cpumask, cpumask); + if (cpu >= nr_cpu_ids) + ret = -ENODEV; + else + event->cpu = cpu; + } else { + ret = -ENODEV; + } + } + if (ret && event->destroy) event->destroy(event); } @@ -13784,6 +13864,12 @@ static void __init perf_event_init_all_cpus(void) int cpu; zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL); + for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); @@ -13833,6 +13919,40 @@ static void __perf_event_exit_context(void *__info) raw_spin_unlock(&ctx->lock); } +static void perf_event_clear_cpumask(unsigned int cpu) +{ + int target[PERF_PMU_MAX_SCOPE]; + unsigned int scope; + struct pmu *pmu; + + cpumask_clear_cpu(cpu, perf_online_mask); + + for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu); + struct cpumask *pmu_cpumask = perf_scope_cpumask(scope); + + target[scope] = -1; + if (WARN_ON_ONCE(!pmu_cpumask || !cpumask)) + continue; + + if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask)) + continue; + target[scope] = cpumask_any_but(cpumask, cpu); + if (target[scope] < nr_cpu_ids) + cpumask_set_cpu(target[scope], pmu_cpumask); + } + + /* migrate */ + list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { + if (pmu->scope == PERF_PMU_SCOPE_NONE || + WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE)) + continue; + + if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids) + perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]); + } +} + static void perf_event_exit_cpu_context(int cpu) { struct perf_cpu_context *cpuctx; @@ -13840,6 +13960,11 @@ static void perf_event_exit_cpu_context(int cpu) // XXX simplify cpuctx->online mutex_lock(&pmus_lock); + /* + * Clear the cpumasks, and migrate to other CPUs if possible. + * Must be invoked before the __perf_event_exit_context. + */ + perf_event_clear_cpumask(cpu); cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); ctx = &cpuctx->ctx; @@ -13847,7 +13972,6 @@ static void perf_event_exit_cpu_context(int cpu) smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); cpuctx->online = 0; mutex_unlock(&ctx->mutex); - cpumask_clear_cpu(cpu, perf_online_mask); mutex_unlock(&pmus_lock); } #else @@ -13856,6 +13980,42 @@ static void perf_event_exit_cpu_context(int cpu) { } #endif +static void perf_event_setup_cpumask(unsigned int cpu) +{ + struct cpumask *pmu_cpumask; + unsigned int scope; + + cpumask_set_cpu(cpu, perf_online_mask); + + /* + * Early boot stage, the cpumask hasn't been set yet. + * The perf_online__masks includes the first CPU of each domain. + * Always uncondifionally set the boot CPU for the perf_online__masks. + */ + if (!topology_sibling_cpumask(cpu)) { + for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { + pmu_cpumask = perf_scope_cpumask(scope); + if (WARN_ON_ONCE(!pmu_cpumask)) + continue; + cpumask_set_cpu(cpu, pmu_cpumask); + } + return; + } + + for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu); + + pmu_cpumask = perf_scope_cpumask(scope); + + if (WARN_ON_ONCE(!pmu_cpumask || !cpumask)) + continue; + + if (!cpumask_empty(cpumask) && + cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids) + cpumask_set_cpu(cpu, pmu_cpumask); + } +} + int perf_event_init_cpu(unsigned int cpu) { struct perf_cpu_context *cpuctx; @@ -13864,7 +14024,7 @@ int perf_event_init_cpu(unsigned int cpu) perf_swevent_init_cpu(cpu); mutex_lock(&pmus_lock); - cpumask_set_cpu(cpu, perf_online_mask); + perf_event_setup_cpumask(cpu); cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); ctx = &cpuctx->ctx; -- cgit v1.2.3-58-ga151 From a48a36b316ae5d3ab83f9b545dba15998e96d59c Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 2 Aug 2024 08:16:38 -0700 Subject: perf: Add PERF_EV_CAP_READ_SCOPE Usually, an event can be read from any CPU of the scope. It doesn't need to be read from the advertised CPU. Add a new event cap, PERF_EV_CAP_READ_SCOPE. An event of a PMU with scope can be read from any active CPU in the scope. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240802151643.1691631-3-kan.liang@linux.intel.com --- include/linux/perf_event.h | 3 +++ kernel/events/core.c | 14 +++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel/events') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index a3cbcd727366..794f66057878 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -636,10 +636,13 @@ typedef void (*perf_overflow_handler_t)(struct perf_event *, * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and * cannot be a group leader. If an event with this flag is detached from the * group it is scheduled out and moved into an unrecoverable ERROR state. + * PERF_EV_CAP_READ_SCOPE: A CPU event that can be read from any CPU of the + * PMU scope where it is active. */ #define PERF_EV_CAP_SOFTWARE BIT(0) #define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1) #define PERF_EV_CAP_SIBLING BIT(2) +#define PERF_EV_CAP_READ_SCOPE BIT(3) #define SWEVENT_HLIST_BITS 8 #define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS) diff --git a/kernel/events/core.c b/kernel/events/core.c index 5ff973513fac..2766090de84e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4556,16 +4556,24 @@ struct perf_read_data { int ret; }; +static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu); + static int __perf_event_read_cpu(struct perf_event *event, int event_cpu) { + int local_cpu = smp_processor_id(); u16 local_pkg, event_pkg; if ((unsigned)event_cpu >= nr_cpu_ids) return event_cpu; - if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { - int local_cpu = smp_processor_id(); + if (event->group_caps & PERF_EV_CAP_READ_SCOPE) { + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(event->pmu->scope, event_cpu); + + if (cpumask && cpumask_test_cpu(local_cpu, cpumask)) + return local_cpu; + } + if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { event_pkg = topology_physical_package_id(event_cpu); local_pkg = topology_physical_package_id(local_cpu); @@ -11905,7 +11913,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) if (cpu >= nr_cpu_ids) ret = -ENODEV; else - event->cpu = cpu; + event->event_caps |= PERF_EV_CAP_READ_SCOPE; } else { ret = -ENODEV; } -- cgit v1.2.3-58-ga151