From cba77f03f2c7b6cc0b0a44a3c679e0abade7da62 Mon Sep 17 00:00:00 2001
From: Waiman Long
Date: Sat, 11 Jul 2015 21:19:19 -0400
Subject: locking/pvqspinlock: Fix kernel panic in locking-selftest

Enabling locking-selftest in a VM guest may cause the following
kernel panic:

  kernel BUG at .../kernel/locking/qspinlock_paravirt.h:137!

This is due to the fact that the pvqspinlock unlock function expects
either a _Q_LOCKED_VAL or _Q_SLOW_VAL in the lock byte. This patch
prevents the BUG() report by ignoring the unexpected value when
debug_locks_silent is set; otherwise, a warning is printed.

With this patch applied, the kernel locking-selftest completed
without any noise.

Tested-by: Masami Hiramatsu
Signed-off-by: Waiman Long
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/1436663959-53092-1-git-send-email-Waiman.Long@hp.com
Signed-off-by: Ingo Molnar
---
 kernel/locking/qspinlock_paravirt.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 04ab18151cc8..df19ae4debd0 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -4,6 +4,7 @@

 #include <linux/hash.h>
 #include <linux/bootmem.h>
+#include <linux/debug_locks.h>

 /*
  * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
@@ -286,15 +287,23 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 {
        struct __qspinlock *l = (void *)lock;
        struct pv_node *node;
+       u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);

        /*
         * We must not unlock if SLOW, because in that case we must first
         * unhash. Otherwise it would be possible to have multiple @lock
         * entries, which would be BAD.
         */
-       if (likely(cmpxchg(&l->locked, _Q_LOCKED_VAL, 0) == _Q_LOCKED_VAL))
+       if (likely(lockval == _Q_LOCKED_VAL))
                return;

+       if (unlikely(lockval != _Q_SLOW_VAL)) {
+               if (debug_locks_silent)
+                       return;
+               WARN(1, "pvqspinlock: lock %p has corrupted value 0x%x!\n", lock, atomic_read(&lock->val));
+               return;
+       }
+
        /*
         * Since the above failed to release, this must be the SLOW path.
         * Therefore start by looking up the blocked node and unhashing it.
--
cgit v1.2.3-58-ga151

From 00a2916f7f82c348a2a94dbb572874173bc308a3 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 27 Jul 2015 10:35:07 +0200
Subject: perf: Fix running time accounting

A recent fix to the shadow timestamp inadvertently broke the running
time accounting.

We must not update the running timestamp if we fail to schedule the
event; the event will not have run.

This can (and did) result in negative total runtime because the
stopped timestamp was before the running timestamp (we 'started' but
never stopped the event -- because it never really started we didn't
have to stop it either).
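As a toy model of that underflow (user-space C; the struct, field
names, and numbers here are invented for illustration -- this is not
the kernel's actual accounting code):

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/*
 * perf derives an event's total running time roughly as
 * tstamp_stopped - tstamp_running, where tstamp_running is pushed
 * forward by the length of each stopped gap when the event is
 * scheduled back in.  If that push happens before pmu->add() fails,
 * tstamp_stopped never moves and the u64 subtraction underflows.
 */
struct toy_event {
        uint64_t tstamp_running;  /* enable time, shifted by stopped gaps */
        uint64_t tstamp_stopped;  /* time of the last stop */
};

int main(void)
{
        /* Enabled at t=0, ran until t=10, then stopped. */
        struct toy_event ev = { .tstamp_running = 0, .tstamp_stopped = 10 };

        /* Buggy order at t=100: account the stopped gap before knowing
         * whether pmu->add() succeeded. */
        uint64_t now = 100;
        ev.tstamp_running += now - ev.tstamp_stopped;   /* += 90 */

        /* pmu->add() fails here: the event never runs and is never
         * stopped, so tstamp_stopped stays at 10 while tstamp_running
         * is now 90. */
        uint64_t total = ev.tstamp_stopped - ev.tstamp_running;
        printf("total runtime: %" PRIu64 "\n", total);  /* 10 - 90 underflows */
        return 0;
}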
Reported-and-Tested-by: Vince Weaver
Fixes: 72f669c0086f ("perf: Update shadow timestamp before add event")
Signed-off-by: Peter Zijlstra (Intel)
Cc: stable@vger.kernel.org # 4.1
Cc: Shaohua Li
Signed-off-by: Thomas Gleixner
---
 kernel/events/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index d3dae3419b99..10d076b2572c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1868,8 +1868,6 @@ event_sched_in(struct perf_event *event,

        perf_pmu_disable(event->pmu);

-       event->tstamp_running += tstamp - event->tstamp_stopped;
-
        perf_set_shadow_time(event, ctx, tstamp);

        perf_log_itrace_start(event);
@@ -1881,6 +1879,8 @@ event_sched_in(struct perf_event *event,
                goto out;
        }

+       event->tstamp_running += tstamp - event->tstamp_stopped;
+
        if (!is_software_event(event))
                cpuctx->active_oncpu++;
        if (!ctx->nr_active++)
--
cgit v1.2.3-58-ga151

From fed66e2cdd4f127a43fd11b8d92a99bdd429528c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 11 Jun 2015 10:32:01 +0200
Subject: perf: Fix fasync handling on inherited events

Vince reported that the fasync signal stuff doesn't work properly for
inherited events. So fix that.

Installing fasync allocates memory and sets filp->f_flags |= FASYNC,
which upon the demise of the file descriptor ensures the allocation is
freed and the state is updated.

Now for perf, we can have the events stick around for a while after the
original FD is dead because of references from child events. So we
cannot copy the fasync pointer around. We can however consistently use
the parent's fasync, as that will be updated.

Reported-and-Tested-by: Vince Weaver
Signed-off-by: Peter Zijlstra (Intel)
Cc:
Cc: Arnaldo Carvalho de Melo
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1434011521.1495.71.camel@twins
Signed-off-by: Ingo Molnar
---
 kernel/events/core.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 10d076b2572c..072b8a686517 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4740,12 +4740,20 @@ static const struct file_operations perf_fops = {
  * to user-space before waking everybody up.
  */

+static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
+{
+       /* only the parent has fasync state */
+       if (event->parent)
+               event = event->parent;
+       return &event->fasync;
+}
+
 void perf_event_wakeup(struct perf_event *event)
 {
        ring_buffer_wakeup(event);

        if (event->pending_kill) {
-               kill_fasync(&event->fasync, SIGIO, event->pending_kill);
+               kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
                event->pending_kill = 0;
        }
 }
@@ -6124,7 +6132,7 @@ static int __perf_event_overflow(struct perf_event *event,
        else
                perf_event_output(event, data, regs);

-       if (event->fasync && event->pending_kill) {
+       if (*perf_event_fasync(event) && event->pending_kill) {
                event->pending_wakeup = 1;
                irq_work_queue(&event->pending);
        }
--
cgit v1.2.3-58-ga151

From 24ee3cf89bef04e8bc23788aca4e029a3f0f06d9 Mon Sep 17 00:00:00 2001
From: Alban Crequy
Date: Thu, 6 Aug 2015 16:21:05 +0200
Subject: cpuset: use trialcs->mems_allowed as a temp variable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The comment says it's using trialcs->mems_allowed as a temp variable,
but that didn't match the code. Change the code to match the comment.
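To see why handing the live mask to a helper that uses its argument as
scratch space goes wrong, here is a toy model of the aliasing pitfall
(stand-alone user-space C; the struct and integer masks are invented
and much simpler than the kernel's nodemask logic):

#include <stdio.h>

struct node {
        int mask;
        struct node *child;
};

/* Uses *scratch as a per-descendant temporary, the way
 * update_nodemasks_hier() uses the nodemask it is passed. */
static void update_hier(struct node *n, int *scratch)
{
        for (struct node *c = n->child; c; c = c->child) {
                *scratch = n->mask & c->mask;   /* clobbers the caller's
                                                 * state if scratch
                                                 * aliases n->mask */
                c->mask = *scratch;
        }
}

int main(void)
{
        struct node child = { .mask = 0, .child = NULL };
        struct node root  = { .mask = 1, .child = &child };
        int trial = 1;                  /* the trial copy: safe scratch */

        /* Buggy call: scratch aliases root.mask, so visiting the child
         * zeroes root.mask itself. */
        update_hier(&root, &root.mask);
        printf("buggy: root.mask = %d\n", root.mask);   /* prints 0 */

        root.mask = 1;
        update_hier(&root, &trial);     /* fixed: scratch is the copy */
        printf("fixed: root.mask = %d\n", root.mask);   /* prints 1 */
        return 0;
}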
This fixes an issue when writing to cpuset.mems while a sub-directory
exists: we needed to write several times for the information to
persist:

| root@alban:/sys/fs/cgroup/cpuset# mkdir footest9
| root@alban:/sys/fs/cgroup/cpuset# cd footest9
| root@alban:/sys/fs/cgroup/cpuset/footest9# mkdir aa
| root@alban:/sys/fs/cgroup/cpuset/footest9# cat cpuset.mems
|
| root@alban:/sys/fs/cgroup/cpuset/footest9# echo 0 > cpuset.mems
| root@alban:/sys/fs/cgroup/cpuset/footest9# cat cpuset.mems
|
| root@alban:/sys/fs/cgroup/cpuset/footest9# echo 0 > cpuset.mems
| root@alban:/sys/fs/cgroup/cpuset/footest9# cat cpuset.mems
| 0
| root@alban:/sys/fs/cgroup/cpuset/footest9# cat aa/cpuset.mems
|
| root@alban:/sys/fs/cgroup/cpuset/footest9# echo 0 > aa/cpuset.mems
| root@alban:/sys/fs/cgroup/cpuset/footest9# cat aa/cpuset.mems
| 0
| root@alban:/sys/fs/cgroup/cpuset/footest9#

This should help to fix the following issue in Docker:
https://github.com/opencontainers/runc/issues/133
In some conditions, a Docker container needs to be started twice in
order to work.

Signed-off-by: Alban Crequy
Tested-by: Iago López Galeiras
Cc: # 3.17+
Acked-by: Li Zefan
Signed-off-by: Tejun Heo
---
 kernel/cpuset.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ee14e3a35a29..f0acff0f66c9 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1223,7 +1223,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
        spin_unlock_irq(&callback_lock);

        /* use trialcs->mems_allowed as a temp variable */
-       update_nodemasks_hier(cs, &cs->mems_allowed);
+       update_nodemasks_hier(cs, &trialcs->mems_allowed);
 done:
        return retval;
 }
--
cgit v1.2.3-58-ga151

From ee9397a6fb9bc4e52677f5e33eed4abee0f515e6 Mon Sep 17 00:00:00 2001
From: Ben Hutchings
Date: Mon, 27 Jul 2015 00:31:08 +0100
Subject: perf: Fix double-free of the AUX buffer

If rb->aux_refcount is decremented to zero before rb->refcount,
__rb_free_aux() may be called twice, resulting in a double free of
rb->aux_pages.

Fix this by adding a check to __rb_free_aux().

Signed-off-by: Ben Hutchings
Signed-off-by: Peter Zijlstra (Intel)
Cc: Alexander Shishkin
Cc: Arnaldo Carvalho de Melo
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: stable@vger.kernel.org
Fixes: 57ffc5ca679f ("perf: Fix AUX buffer refcounting")
Link: http://lkml.kernel.org/r/1437953468.12842.17.camel@decadent.org.uk
Signed-off-by: Ingo Molnar
---
 kernel/events/ring_buffer.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index b2be01b1aa9d..c8aa3f75bc4d 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -559,11 +559,13 @@ static void __rb_free_aux(struct ring_buffer *rb)
                rb->aux_priv = NULL;
        }

-       for (pg = 0; pg < rb->aux_nr_pages; pg++)
-               rb_free_aux_page(rb, pg);
+       if (rb->aux_nr_pages) {
+               for (pg = 0; pg < rb->aux_nr_pages; pg++)
+                       rb_free_aux_page(rb, pg);

-       kfree(rb->aux_pages);
-       rb->aux_nr_pages = 0;
+               kfree(rb->aux_pages);
+               rb->aux_nr_pages = 0;
+       }
 }

 void rb_free_aux(struct ring_buffer *rb)
--
cgit v1.2.3-58-ga151

From c7999c6f3fed9e383d3131474588f282ae6d56b9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 4 Aug 2015 19:22:49 +0200
Subject: perf: Fix PERF_EVENT_IOC_PERIOD migration race

I ran the perf fuzzer, which triggered some WARN()s caused by trying
to stop/restart an event on the wrong CPU.

Use the normal IPI pattern to ensure we run the code on the correct
CPU.
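In outline, that pattern looks as follows (a condensed paraphrase of
the patch below, with the period update itself elided and "pe" being
the new struct period_event argument; see the full diff for the real
code):

        /* Runs on the event's CPU, invoked via IPI. */
        static int __perf_event_period(void *info)
        {
                /* ... apply the new period under ctx->lock ... */
                return 0;
        }

        if (!ctx->task) {
                /* CPU event: IPI the CPU the event lives on. */
                cpu_function_call(event->cpu, __perf_event_period, &pe);
                return 0;
        }
retry:
        /* Task event: run the update on whichever CPU runs the task. */
        if (!task_function_call(task, __perf_event_period, &pe))
                return 0;

        /* The task was not running; update under ctx->lock unless the
         * context became active again meanwhile, then retry the IPI. */
        raw_spin_lock_irq(&ctx->lock);
        if (ctx->is_active) {
                raw_spin_unlock_irq(&ctx->lock);
                task = ctx->task;
                goto retry;
        }
        __perf_event_period(&pe);
        raw_spin_unlock_irq(&ctx->lock);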
Signed-off-by: Peter Zijlstra (Intel)
Cc: Vince Weaver
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Fixes: bad7192b842c ("perf: Fix PERF_EVENT_IOC_PERIOD to force-reset the period")
Signed-off-by: Ingo Molnar
---
 kernel/events/core.c | 75 ++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 55 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 072b8a686517..e6feb5114134 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3958,28 +3958,21 @@ static void perf_event_for_each(struct perf_event *event,
                perf_event_for_each_child(sibling, func);
 }

-static int perf_event_period(struct perf_event *event, u64 __user *arg)
-{
-       struct perf_event_context *ctx = event->ctx;
-       int ret = 0, active;
+struct period_event {
+       struct perf_event *event;
        u64 value;
+};

-       if (!is_sampling_event(event))
-               return -EINVAL;
-
-       if (copy_from_user(&value, arg, sizeof(value)))
-               return -EFAULT;
-
-       if (!value)
-               return -EINVAL;
+static int __perf_event_period(void *info)
+{
+       struct period_event *pe = info;
+       struct perf_event *event = pe->event;
+       struct perf_event_context *ctx = event->ctx;
+       u64 value = pe->value;
+       bool active;

-       raw_spin_lock_irq(&ctx->lock);
+       raw_spin_lock(&ctx->lock);
        if (event->attr.freq) {
-               if (value > sysctl_perf_event_sample_rate) {
-                       ret = -EINVAL;
-                       goto unlock;
-               }
-
                event->attr.sample_freq = value;
        } else {
                event->attr.sample_period = value;
@@ -3998,11 +3991,53 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
                event->pmu->start(event, PERF_EF_RELOAD);
                perf_pmu_enable(ctx->pmu);
        }
+       raw_spin_unlock(&ctx->lock);

-unlock:
+       return 0;
+}
+
+static int perf_event_period(struct perf_event *event, u64 __user *arg)
+{
+       struct period_event pe = { .event = event, };
+       struct perf_event_context *ctx = event->ctx;
+       struct task_struct *task;
+       u64 value;
+
+       if (!is_sampling_event(event))
+               return -EINVAL;
+
+       if (copy_from_user(&value, arg, sizeof(value)))
+               return -EFAULT;
+
+       if (!value)
+               return -EINVAL;
+
+       if (event->attr.freq && value > sysctl_perf_event_sample_rate)
+               return -EINVAL;
+
+       task = ctx->task;
+       pe.value = value;
+
+       if (!task) {
+               cpu_function_call(event->cpu, __perf_event_period, &pe);
+               return 0;
+       }
+
+retry:
+       if (!task_function_call(task, __perf_event_period, &pe))
+               return 0;
+
+       raw_spin_lock_irq(&ctx->lock);
+       if (ctx->is_active) {
+               raw_spin_unlock_irq(&ctx->lock);
+               task = ctx->task;
+               goto retry;
+       }
+
+       __perf_event_period(&pe);
        raw_spin_unlock_irq(&ctx->lock);

-       return ret;
+       return 0;
 }

 static const struct file_operations perf_fops;
--
cgit v1.2.3-58-ga151
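For reference, the ioctl this race lives under is driven from user
space roughly like this (a minimal, hedged sketch: a self-monitoring
cycles event with invented period values, not taken from the patch
itself):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
        struct perf_event_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_HARDWARE;
        attr.config = PERF_COUNT_HW_CPU_CYCLES;
        attr.sample_period = 100000;    /* sampling event: IOC_PERIOD
                                         * requires one */
        attr.disabled = 1;

        /* Monitor the current task on any CPU. */
        int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);

        /* Re-arm with a new period while the event is live -- the
         * kernel path this patch makes run on the correct CPU. */
        __u64 new_period = 200000;
        ioctl(fd, PERF_EVENT_IOC_PERIOD, &new_period);

        close(fd);
        return 0;
}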