diff options
Diffstat (limited to 'kernel/events/core.c')
-rw-r--r-- | kernel/events/core.c | 345 |
1 files changed, 254 insertions, 91 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c index 03db40f6cba9..928b166d888e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -405,6 +405,7 @@ static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); static struct srcu_struct pmus_srcu; static cpumask_var_t perf_online_mask; +static struct kmem_cache *perf_event_cache; /* * perf event paranoia level: @@ -2204,6 +2205,26 @@ out: perf_event__header_size(leader); } +static void sync_child_event(struct perf_event *child_event); + +static void perf_child_detach(struct perf_event *event) +{ + struct perf_event *parent_event = event->parent; + + if (!(event->attach_state & PERF_ATTACH_CHILD)) + return; + + event->attach_state &= ~PERF_ATTACH_CHILD; + + if (WARN_ON_ONCE(!parent_event)) + return; + + lockdep_assert_held(&parent_event->child_mutex); + + sync_child_event(event); + list_del_init(&event->child_list); +} + static bool is_orphaned_event(struct perf_event *event) { return event->state == PERF_EVENT_STATE_DEAD; @@ -2311,6 +2332,7 @@ group_sched_out(struct perf_event *group_event, } #define DETACH_GROUP 0x01UL +#define DETACH_CHILD 0x02UL /* * Cross CPU call to remove a performance event @@ -2334,6 +2356,8 @@ __perf_remove_from_context(struct perf_event *event, event_sched_out(event, cpuctx, ctx); if (flags & DETACH_GROUP) perf_group_detach(event); + if (flags & DETACH_CHILD) + perf_child_detach(event); list_del_event(event, ctx); if (!ctx->nr_events && ctx->is_active) { @@ -2362,25 +2386,21 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla lockdep_assert_held(&ctx->mutex); - event_function_call(event, __perf_remove_from_context, (void *)flags); - /* - * The above event_function_call() can NO-OP when it hits - * TASK_TOMBSTONE. In that case we must already have been detached - * from the context (by perf_event_exit_event()) but the grouping - * might still be in-tact. + * Because of perf_event_exit_task(), perf_remove_from_context() ought + * to work in the face of TASK_TOMBSTONE, unlike every other + * event_function_call() user. */ - WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); - if ((flags & DETACH_GROUP) && - (event->attach_state & PERF_ATTACH_GROUP)) { - /* - * Since in that case we cannot possibly be scheduled, simply - * detach now. - */ - raw_spin_lock_irq(&ctx->lock); - perf_group_detach(event); + raw_spin_lock_irq(&ctx->lock); + if (!ctx->is_active) { + __perf_remove_from_context(event, __get_cpu_context(ctx), + ctx, (void *)flags); raw_spin_unlock_irq(&ctx->lock); + return; } + raw_spin_unlock_irq(&ctx->lock); + + event_function_call(event, __perf_remove_from_context, (void *)flags); } /* @@ -3180,16 +3200,36 @@ static int perf_event_modify_breakpoint(struct perf_event *bp, static int perf_event_modify_attr(struct perf_event *event, struct perf_event_attr *attr) { + int (*func)(struct perf_event *, struct perf_event_attr *); + struct perf_event *child; + int err; + if (event->attr.type != attr->type) return -EINVAL; switch (event->attr.type) { case PERF_TYPE_BREAKPOINT: - return perf_event_modify_breakpoint(event, attr); + func = perf_event_modify_breakpoint; + break; default: /* Place holder for future additions. */ return -EOPNOTSUPP; } + + WARN_ON_ONCE(event->ctx->parent_ctx); + + mutex_lock(&event->child_mutex); + err = func(event, attr); + if (err) + goto out; + list_for_each_entry(child, &event->child_list, child_list) { + err = func(child, attr); + if (err) + goto out; + } +out: + mutex_unlock(&event->child_mutex); + return err; } static void ctx_sched_out(struct perf_event_context *ctx, @@ -4208,6 +4248,57 @@ out: put_ctx(clone_ctx); } +static void perf_remove_from_owner(struct perf_event *event); +static void perf_event_exit_event(struct perf_event *event, + struct perf_event_context *ctx); + +/* + * Removes all events from the current task that have been marked + * remove-on-exec, and feeds their values back to parent events. + */ +static void perf_event_remove_on_exec(int ctxn) +{ + struct perf_event_context *ctx, *clone_ctx = NULL; + struct perf_event *event, *next; + LIST_HEAD(free_list); + unsigned long flags; + bool modified = false; + + ctx = perf_pin_task_context(current, ctxn); + if (!ctx) + return; + + mutex_lock(&ctx->mutex); + + if (WARN_ON_ONCE(ctx->task != current)) + goto unlock; + + list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) { + if (!event->attr.remove_on_exec) + continue; + + if (!is_kernel_event(event)) + perf_remove_from_owner(event); + + modified = true; + + perf_event_exit_event(event, ctx); + } + + raw_spin_lock_irqsave(&ctx->lock, flags); + if (modified) + clone_ctx = unclone_ctx(ctx); + --ctx->pin_count; + raw_spin_unlock_irqrestore(&ctx->lock, flags); + +unlock: + mutex_unlock(&ctx->mutex); + + put_ctx(ctx); + if (clone_ctx) + put_ctx(clone_ctx); +} + struct perf_read_data { struct perf_event *event; bool group; @@ -4611,7 +4702,7 @@ static void free_event_rcu(struct rcu_head *head) if (event->ns) put_pid_ns(event->ns); perf_event_free_filter(event); - kfree(event); + kmem_cache_free(perf_event_cache, event); } static void ring_buffer_attach(struct perf_event *event, @@ -6301,6 +6392,33 @@ void perf_event_wakeup(struct perf_event *event) } } +static void perf_sigtrap(struct perf_event *event) +{ + struct kernel_siginfo info; + + /* + * We'd expect this to only occur if the irq_work is delayed and either + * ctx->task or current has changed in the meantime. This can be the + * case on architectures that do not implement arch_irq_work_raise(). + */ + if (WARN_ON_ONCE(event->ctx->task != current)) + return; + + /* + * perf_pending_event() can race with the task exiting. + */ + if (current->flags & PF_EXITING) + return; + + clear_siginfo(&info); + info.si_signo = SIGTRAP; + info.si_code = TRAP_PERF; + info.si_errno = event->attr.type; + info.si_perf = event->attr.sig_data; + info.si_addr = (void __user *)event->pending_addr; + force_sig_info(&info); +} + static void perf_pending_event_disable(struct perf_event *event) { int cpu = READ_ONCE(event->pending_disable); @@ -6310,6 +6428,13 @@ static void perf_pending_event_disable(struct perf_event *event) if (cpu == smp_processor_id()) { WRITE_ONCE(event->pending_disable, -1); + + if (event->attr.sigtrap) { + perf_sigtrap(event); + atomic_set_release(&event->event_limit, 1); /* rearm event */ + return; + } + perf_event_disable_local(event); return; } @@ -7520,18 +7645,18 @@ void perf_event_exec(void) struct perf_event_context *ctx; int ctxn; - rcu_read_lock(); for_each_task_context_nr(ctxn) { - ctx = current->perf_event_ctxp[ctxn]; - if (!ctx) - continue; - perf_event_enable_on_exec(ctxn); + perf_event_remove_on_exec(ctxn); - perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, - true); + rcu_read_lock(); + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) { + perf_iterate_ctx(ctx, perf_event_addr_filters_exec, + NULL, true); + } + rcu_read_unlock(); } - rcu_read_unlock(); } struct remote_output { @@ -9012,6 +9137,7 @@ static int __perf_event_overflow(struct perf_event *event, if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; + event->pending_addr = data->addr; perf_event_disable_inatomic(event); } @@ -11094,6 +11220,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) static struct pmu *perf_init_event(struct perf_event *event) { + bool extended_type = false; int idx, type, ret; struct pmu *pmu; @@ -11112,16 +11239,27 @@ static struct pmu *perf_init_event(struct perf_event *event) * are often aliases for PERF_TYPE_RAW. */ type = event->attr.type; - if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) - type = PERF_TYPE_RAW; + if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) { + type = event->attr.config >> PERF_PMU_TYPE_SHIFT; + if (!type) { + type = PERF_TYPE_RAW; + } else { + extended_type = true; + event->attr.config &= PERF_HW_EVENT_MASK; + } + } again: rcu_read_lock(); pmu = idr_find(&pmu_idr, type); rcu_read_unlock(); if (pmu) { + if (event->attr.type != type && type != PERF_TYPE_RAW && + !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE)) + goto fail; + ret = perf_try_init_event(pmu, event); - if (ret == -ENOENT && event->attr.type != type) { + if (ret == -ENOENT && event->attr.type != type && !extended_type) { type = event->attr.type; goto again; } @@ -11142,6 +11280,7 @@ again: goto unlock; } } +fail: pmu = ERR_PTR(-ENOENT); unlock: srcu_read_unlock(&pmus_srcu, idx); @@ -11287,13 +11426,20 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, struct perf_event *event; struct hw_perf_event *hwc; long err = -EINVAL; + int node; if ((unsigned)cpu >= nr_cpu_ids) { if (!task || cpu != -1) return ERR_PTR(-EINVAL); } + if (attr->sigtrap && !task) { + /* Requires a task: avoid signalling random tasks. */ + return ERR_PTR(-EINVAL); + } - event = kzalloc(sizeof(*event), GFP_KERNEL); + node = (cpu >= 0) ? cpu_to_node(cpu) : -1; + event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, + node); if (!event) return ERR_PTR(-ENOMEM); @@ -11338,6 +11484,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->state = PERF_EVENT_STATE_INACTIVE; + if (event->attr.sigtrap) + atomic_set(&event->event_limit, 1); + if (task) { event->attach_state = PERF_ATTACH_TASK; /* @@ -11497,7 +11646,7 @@ err_ns: put_pid_ns(event->ns); if (event->hw.target) put_task_struct(event->hw.target); - kfree(event); + kmem_cache_free(perf_event_cache, event); return ERR_PTR(err); } @@ -11610,6 +11759,15 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT)) return -EINVAL; + if (!attr->inherit && attr->inherit_thread) + return -EINVAL; + + if (attr->remove_on_exec && attr->enable_on_exec) + return -EINVAL; + + if (attr->sigtrap && !attr->remove_on_exec) + return -EINVAL; + out: return ret; @@ -11829,12 +11987,12 @@ SYSCALL_DEFINE5(perf_event_open, return err; } - err = security_locked_down(LOCKDOWN_PERF); - if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR)) - /* REGS_INTR can leak data, lockdown must prevent this */ - return err; - - err = 0; + /* REGS_INTR can leak data, lockdown must prevent this */ + if (attr.sample_type & PERF_SAMPLE_REGS_INTR) { + err = security_locked_down(LOCKDOWN_PERF); + if (err) + return err; + } /* * In cgroup mode, the pid argument is used to pass the fd @@ -12373,14 +12531,17 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) } EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); -static void sync_child_event(struct perf_event *child_event, - struct task_struct *child) +static void sync_child_event(struct perf_event *child_event) { struct perf_event *parent_event = child_event->parent; u64 child_val; - if (child_event->attr.inherit_stat) - perf_event_read_event(child_event, child); + if (child_event->attr.inherit_stat) { + struct task_struct *task = child_event->ctx->task; + + if (task && task != TASK_TOMBSTONE) + perf_event_read_event(child_event, task); + } child_val = perf_event_count(child_event); @@ -12395,60 +12556,53 @@ static void sync_child_event(struct perf_event *child_event, } static void -perf_event_exit_event(struct perf_event *child_event, - struct perf_event_context *child_ctx, - struct task_struct *child) +perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_event *parent_event = child_event->parent; + struct perf_event *parent_event = event->parent; + unsigned long detach_flags = 0; - /* - * Do not destroy the 'original' grouping; because of the context - * switch optimization the original events could've ended up in a - * random child task. - * - * If we were to destroy the original group, all group related - * operations would cease to function properly after this random - * child dies. - * - * Do destroy all inherited groups, we don't care about those - * and being thorough is better. - */ - raw_spin_lock_irq(&child_ctx->lock); - WARN_ON_ONCE(child_ctx->is_active); + if (parent_event) { + /* + * Do not destroy the 'original' grouping; because of the + * context switch optimization the original events could've + * ended up in a random child task. + * + * If we were to destroy the original group, all group related + * operations would cease to function properly after this + * random child dies. + * + * Do destroy all inherited groups, we don't care about those + * and being thorough is better. + */ + detach_flags = DETACH_GROUP | DETACH_CHILD; + mutex_lock(&parent_event->child_mutex); + } - if (parent_event) - perf_group_detach(child_event); - list_del_event(child_event, child_ctx); - perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */ - raw_spin_unlock_irq(&child_ctx->lock); + perf_remove_from_context(event, detach_flags); + + raw_spin_lock_irq(&ctx->lock); + if (event->state > PERF_EVENT_STATE_EXIT) + perf_event_set_state(event, PERF_EVENT_STATE_EXIT); + raw_spin_unlock_irq(&ctx->lock); /* - * Parent events are governed by their filedesc, retain them. + * Child events can be freed. */ - if (!parent_event) { - perf_event_wakeup(child_event); + if (parent_event) { + mutex_unlock(&parent_event->child_mutex); + /* + * Kick perf_poll() for is_event_hup(); + */ + perf_event_wakeup(parent_event); + free_event(event); + put_event(parent_event); return; } - /* - * Child events can be cleaned up. - */ - - sync_child_event(child_event, child); /* - * Remove this event from the parent's list - */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); - list_del_init(&child_event->child_list); - mutex_unlock(&parent_event->child_mutex); - - /* - * Kick perf_poll() for is_event_hup(). + * Parent events are governed by their filedesc, retain them. */ - perf_event_wakeup(parent_event); - free_event(child_event); - put_event(parent_event); + perf_event_wakeup(event); } static void perf_event_exit_task_context(struct task_struct *child, int ctxn) @@ -12505,7 +12659,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) perf_event_task(child, child_ctx, 0); list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) - perf_event_exit_event(child_event, child_ctx, child); + perf_event_exit_event(child_event, child_ctx); mutex_unlock(&child_ctx->mutex); @@ -12765,6 +12919,7 @@ inherit_event(struct perf_event *parent_event, */ raw_spin_lock_irqsave(&child_ctx->lock, flags); add_event_to_ctx(child_event, child_ctx); + child_event->attach_state |= PERF_ATTACH_CHILD; raw_spin_unlock_irqrestore(&child_ctx->lock, flags); /* @@ -12833,12 +12988,15 @@ static int inherit_task_group(struct perf_event *event, struct task_struct *parent, struct perf_event_context *parent_ctx, struct task_struct *child, int ctxn, - int *inherited_all) + u64 clone_flags, int *inherited_all) { int ret; struct perf_event_context *child_ctx; - if (!event->attr.inherit) { + if (!event->attr.inherit || + (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) || + /* Do not inherit if sigtrap and signal handlers were cleared. */ + (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))) { *inherited_all = 0; return 0; } @@ -12870,7 +13028,8 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, /* * Initialize the perf_event context in task_struct */ -static int perf_event_init_context(struct task_struct *child, int ctxn) +static int perf_event_init_context(struct task_struct *child, int ctxn, + u64 clone_flags) { struct perf_event_context *child_ctx, *parent_ctx; struct perf_event_context *cloned_ctx; @@ -12910,7 +13069,8 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) */ perf_event_groups_for_each(event, &parent_ctx->pinned_groups) { ret = inherit_task_group(event, parent, parent_ctx, - child, ctxn, &inherited_all); + child, ctxn, clone_flags, + &inherited_all); if (ret) goto out_unlock; } @@ -12926,7 +13086,8 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) perf_event_groups_for_each(event, &parent_ctx->flexible_groups) { ret = inherit_task_group(event, parent, parent_ctx, - child, ctxn, &inherited_all); + child, ctxn, clone_flags, + &inherited_all); if (ret) goto out_unlock; } @@ -12968,7 +13129,7 @@ out_unlock: /* * Initialize the perf_event context in task_struct */ -int perf_event_init_task(struct task_struct *child) +int perf_event_init_task(struct task_struct *child, u64 clone_flags) { int ctxn, ret; @@ -12977,7 +13138,7 @@ int perf_event_init_task(struct task_struct *child) INIT_LIST_HEAD(&child->perf_event_list); for_each_task_context_nr(ctxn) { - ret = perf_event_init_context(child, ctxn); + ret = perf_event_init_context(child, ctxn, clone_flags); if (ret) { perf_event_free_task(child); return ret; @@ -13130,6 +13291,8 @@ void __init perf_event_init(void) ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); + perf_event_cache = KMEM_CACHE(perf_event, SLAB_PANIC); + /* * Build time assertion that we keep the data_head at the intended * location. IOW, validation we got the __reserved[] size right. |