From 6b44f519017b219a12b37173c7eef8dfce2c0100 Mon Sep 17 00:00:00 2001 From: Scot Doyle Date: Sun, 24 Aug 2014 17:12:27 +0000 Subject: sched/wait: Document timeout corner case The timeout may elapse without 0 being returned, such as when waiting on an unused queue. Document this possibility. Signed-off-by: Scot Doyle Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/alpine.LNX.2.11.1408241710070.6462@localhost.localdomain Signed-off-by: Ingo Molnar --- include/linux/wait.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index 6fb1ba5f9b2f..034f6fc7c65f 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -280,9 +280,11 @@ do { \ * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * - * The function returns 0 if the @timeout elapsed, or the remaining - * jiffies (at least 1) if the @condition evaluated to %true before - * the @timeout elapsed. + * Returns: + * 0 if the @condition evaluated to %false after the @timeout elapsed, + * 1 if the @condition evaluated to %true after the @timeout elapsed, + * or the remaining jiffies (at least 1) if the @condition evaluated + * to %true before the @timeout elapsed. */ #define wait_event_timeout(wq, condition, timeout) \ ({ \ @@ -363,9 +365,11 @@ do { \ * change the result of the wait condition. * * Returns: - * 0 if the @timeout elapsed, -%ERESTARTSYS if it was interrupted by - * a signal, or the remaining jiffies (at least 1) if the @condition - * evaluated to %true before the @timeout elapsed. + * 0 if the @condition evaluated to %false after the @timeout elapsed, + * 1 if the @condition evaluated to %true after the @timeout elapsed, + * the remaining jiffies (at least 1) if the @condition evaluated + * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was + * interrupted by a signal. */ #define wait_event_interruptible_timeout(wq, condition, timeout) \ ({ \ -- cgit v1.2.3-58-ga151 From e78c3496790ee8a36522a838b59b388e8a709e65 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Sat, 16 Aug 2014 13:40:10 -0400 Subject: time, signal: Protect resource use statistics with seqlock Both times() and clock_gettime(CLOCK_PROCESS_CPUTIME_ID) have scalability issues on large systems, due to both functions being serialized with a lock. The lock protects against reporting a wrong value, due to a thread in the task group exiting, its statistics reporting up to the signal struct, and that exited task's statistics being counted twice (or not at all). Protecting that with a lock results in times() and clock_gettime() being completely serialized on large systems. This can be fixed by using a seqlock around the events that gather and propagate statistics. As an additional benefit, the protection code can be moved into thread_group_cputime(), slightly simplifying the calling functions. In the case of posix_cpu_clock_get_task() things can be simplified a lot, because the calling function already ensures that the task sticks around, and the rest is now taken care of in thread_group_cputime(). This way the statistics reporting code can run lockless. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Alex Thorlton Cc: Andrew Morton Cc: Daeseok Youn Cc: David Rientjes Cc: Dongsheng Yang Cc: Geert Uytterhoeven Cc: Guillaume Morin Cc: Ionut Alexa Cc: Kees Cook Cc: Linus Torvalds Cc: Li Zefan Cc: Michal Hocko Cc: Michal Schmidt Cc: Oleg Nesterov Cc: Vladimir Davydov Cc: umgwanakikbuti@gmail.com Cc: fweisbec@gmail.com Cc: srao@redhat.com Cc: lwoodman@redhat.com Cc: atheurer@redhat.com Link: http://lkml.kernel.org/r/20140816134010.26a9b572@annuminas.surriel.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/exit.c | 4 ++++ kernel/fork.c | 1 + kernel/sched/cputime.c | 33 ++++++++++++++++++++------------- kernel/sys.c | 2 -- kernel/time/posix-cpu-timers.c | 14 -------------- 6 files changed, 26 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c2c885ee52b..dd9eb4807389 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -645,6 +645,7 @@ struct signal_struct { * Live threads maintain their own counters and add to these * in __exit_signal, except for the group leader. */ + seqlock_t stats_lock; cputime_t utime, stime, cutime, cstime; cputime_t gtime; cputime_t cgtime; diff --git a/kernel/exit.c b/kernel/exit.c index b93d46dab6fc..fa09b86609db 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -127,6 +127,7 @@ static void __exit_signal(struct task_struct *tsk) * the signal_struct. */ task_cputime(tsk, &utime, &stime); + write_seqlock(&sig->stats_lock); sig->utime += utime; sig->stime += stime; sig->gtime += task_gtime(tsk); @@ -140,6 +141,7 @@ static void __exit_signal(struct task_struct *tsk) sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); /* * Do this under ->siglock, we can race with another thread @@ -1042,6 +1044,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) spin_lock_irq(&p->real_parent->sighand->siglock); psig = p->real_parent->signal; sig = p->signal; + write_seqlock(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; @@ -1064,6 +1067,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); + write_sequnlock(&psig->stats_lock); spin_unlock_irq(&p->real_parent->sighand->siglock); } diff --git a/kernel/fork.c b/kernel/fork.c index 0cf9cdb6e491..9387ae8ab048 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1068,6 +1068,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); + seqlock_init(&sig->stats_lock); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 3e52836359ba..49b7cfe98f7a 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -288,18 +288,28 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) struct signal_struct *sig = tsk->signal; cputime_t utime, stime; struct task_struct *t; - - times->utime = sig->utime; - times->stime = sig->stime; - times->sum_exec_runtime = sig->sum_sched_runtime; + unsigned int seq, nextseq; rcu_read_lock(); - for_each_thread(tsk, t) { - task_cputime(t, &utime, &stime); - times->utime += utime; - times->stime += stime; - times->sum_exec_runtime += task_sched_runtime(t); - } + /* Attempt a lockless read on the first round. */ + nextseq = 0; + do { + seq = nextseq; + read_seqbegin_or_lock(&sig->stats_lock, &seq); + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; + + for_each_thread(tsk, t) { + task_cputime(t, &utime, &stime); + times->utime += utime; + times->stime += stime; + times->sum_exec_runtime += task_sched_runtime(t); + } + /* If lockless access failed, take the lock. */ + nextseq = 1; + } while (need_seqretry(&sig->stats_lock, seq)); + done_seqretry(&sig->stats_lock, seq); rcu_read_unlock(); } @@ -611,9 +621,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) cputime_adjust(&cputime, &p->prev_cputime, ut, st); } -/* - * Must be called with siglock held. - */ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { struct task_cputime cputime; diff --git a/kernel/sys.c b/kernel/sys.c index ce8129192a26..b6636643cbd1 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -862,11 +862,9 @@ void do_sys_times(struct tms *tms) { cputime_t tgutime, tgstime, cutime, cstime; - spin_lock_irq(¤t->sighand->siglock); thread_group_cputime_adjusted(current, &tgutime, &tgstime); cutime = current->signal->cutime; cstime = current->signal->cstime; - spin_unlock_irq(¤t->sighand->siglock); tms->tms_utime = cputime_to_clock_t(tgutime); tms->tms_stime = cputime_to_clock_t(tgstime); tms->tms_cutime = cputime_to_clock_t(cutime); diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 3b8946416a5f..492b986195d5 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -272,22 +272,8 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk, if (same_thread_group(tsk, current)) err = cpu_clock_sample(which_clock, tsk, &rtn); } else { - unsigned long flags; - struct sighand_struct *sighand; - - /* - * while_each_thread() is not yet entirely RCU safe, - * keep locking the group while sampling process - * clock for now. - */ - sighand = lock_task_sighand(tsk, &flags); - if (!sighand) - return err; - if (tsk == current || thread_group_leader(tsk)) err = cpu_clock_sample_group(which_clock, tsk, &rtn); - - unlock_task_sighand(tsk, &flags); } if (!err) -- cgit v1.2.3-58-ga151 From f6be8af1c95de4a46e325e728900a70ceadb52cf Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Thu, 4 Sep 2014 15:17:53 +0800 Subject: sched: Add new API wake_up_if_idle() to wake up the idle cpu Implementing one new API wake_up_if_idle(), which is used to wake up the idle CPU. Suggested-by: Andy Lutomirski Signed-off-by: Chuansheng Liu Signed-off-by: Peter Zijlstra (Intel) Cc: daniel.lezcano@linaro.org Cc: rjw@rjwysocki.net Cc: linux-pm@vger.kernel.org Cc: changcheng.liu@intel.com Cc: xiaoming.wang@intel.com Cc: souvik.k.chakravarty@intel.com Cc: chuansheng.liu@intel.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1409815075-4180-1-git-send-email-chuansheng.liu@intel.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/core.c | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index dd9eb4807389..82ff3d6efb19 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1024,6 +1024,7 @@ struct sched_domain_topology_level { extern struct sched_domain_topology_level *sched_domain_topology; extern void set_sched_topology(struct sched_domain_topology_level *tl); +extern void wake_up_if_idle(int cpu); #ifdef CONFIG_SCHED_DEBUG # define SD_INIT_NAME(type) .name = #type diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 78e5c839df13..f7c6ed2fd69d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1634,6 +1634,25 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) } } +void wake_up_if_idle(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + if (!is_idle_task(rq->curr)) + return; + + if (set_nr_if_polling(rq->idle)) { + trace_sched_wake_idle_without_ipi(cpu); + } else { + raw_spin_lock_irqsave(&rq->lock, flags); + if (is_idle_task(rq->curr)) + smp_send_reschedule(cpu); + /* Else cpu is not in idle, do nothing here */ + raw_spin_unlock_irqrestore(&rq->lock, flags); + } +} + bool cpus_share_cache(int this_cpu, int that_cpu) { return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); -- cgit v1.2.3-58-ga151 From c6f4459fc3ba532e896cb678e29b45cb985f82bf Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Thu, 4 Sep 2014 15:17:54 +0800 Subject: smp: Add new wake_up_all_idle_cpus() function Currently kick_all_cpus_sync() can break non-polling idle cpus thru IPI interrupts. But sometimes we need to break the polling idle cpus immediately to reselect the suitable c-state, also for non-idle cpus, we need to do nothing if we try to wake up them. Here adding one new function wake_up_all_idle_cpus() to let all cpus out of idle based on function wake_up_if_idle(). Signed-off-by: Chuansheng Liu Signed-off-by: Peter Zijlstra (Intel) Cc: daniel.lezcano@linaro.org Cc: rjw@rjwysocki.net Cc: linux-pm@vger.kernel.org Cc: changcheng.liu@intel.com Cc: xiaoming.wang@intel.com Cc: souvik.k.chakravarty@intel.com Cc: luto@amacapital.net Cc: Andrew Morton Cc: Christoph Hellwig Cc: Frederic Weisbecker Cc: Geert Uytterhoeven Cc: Jan Kara Cc: Jens Axboe Cc: Jens Axboe Cc: Linus Torvalds Cc: Michal Hocko Cc: Paul Gortmaker Cc: Roman Gushchin Cc: Srivatsa S. Bhat Link: http://lkml.kernel.org/r/1409815075-4180-2-git-send-email-chuansheng.liu@intel.com Signed-off-by: Ingo Molnar --- include/linux/smp.h | 2 ++ kernel/smp.c | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'include') diff --git a/include/linux/smp.h b/include/linux/smp.h index 34347f26be9b..93dff5fff524 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -100,6 +100,7 @@ int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait); void kick_all_cpus_sync(void); +void wake_up_all_idle_cpus(void); /* * Generic and arch helpers @@ -148,6 +149,7 @@ smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, } static inline void kick_all_cpus_sync(void) { } +static inline void wake_up_all_idle_cpus(void) { } #endif /* !SMP */ diff --git a/kernel/smp.c b/kernel/smp.c index aff8aa14f547..9e0d0b289118 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "smpboot.h" @@ -699,3 +700,24 @@ void kick_all_cpus_sync(void) smp_call_function(do_nothing, NULL, 1); } EXPORT_SYMBOL_GPL(kick_all_cpus_sync); + +/** + * wake_up_all_idle_cpus - break all cpus out of idle + * wake_up_all_idle_cpus try to break all cpus which is in idle state even + * including idle polling cpus, for non-idle cpus, we will do nothing + * for them. + */ +void wake_up_all_idle_cpus(void) +{ + int cpu; + + preempt_disable(); + for_each_online_cpu(cpu) { + if (cpu == smp_processor_id()) + continue; + + wake_up_if_idle(cpu); + } + preempt_enable(); +} +EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); -- cgit v1.2.3-58-ga151 From ef8ac06359ddf95431cf6bb04ad2b36fff562328 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 12 Sep 2014 09:12:14 -0400 Subject: seqlock: Add irqsave variant of read_seqbegin_or_lock() There are cases where read_seqbegin_or_lock() needs to block irqs, because the seqlock in question nests inside a lock that is also be taken from irq context. Add read_seqbegin_or_lock_irqsave() and done_seqretry_irqrestore(), which are almost identical to read_seqbegin_or_lock() and done_seqretry(). Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: prarit@redhat.com Cc: oleg@redhat.com Cc: sgruszka@redhat.com Cc: Al Viro Cc: John Stultz Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Stephen Boyd Cc: Trond Myklebust Link: http://lkml.kernel.org/r/1410527535-9814-2-git-send-email-riel@redhat.com [ Improved the readability of the code a bit. ] Signed-off-by: Ingo Molnar --- include/linux/seqlock.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index cc359636cfa3..f5df8f687b4d 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -456,4 +456,23 @@ read_sequnlock_excl_irqrestore(seqlock_t *sl, unsigned long flags) spin_unlock_irqrestore(&sl->lock, flags); } +static inline unsigned long +read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq) +{ + unsigned long flags = 0; + + if (!(*seq & 1)) /* Even */ + *seq = read_seqbegin(lock); + else /* Odd */ + read_seqlock_excl_irqsave(lock, flags); + + return flags; +} + +static inline void +done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags) +{ + if (seq & 1) + read_sequnlock_excl_irqrestore(lock, flags); +} #endif /* __LINUX_SEQLOCK_H */ -- cgit v1.2.3-58-ga151 From d4311ff1a8da48d609db9500f121c15580dfeeb7 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Fri, 12 Sep 2014 14:16:17 +0100 Subject: init/main.c: Give init_task a canary Tasks get their end of stack set to STACK_END_MAGIC with the aim to catch stack overruns. Currently this feature does not apply to init_task. This patch removes this restriction. Note that a similar patch was posted by Prarit Bhargava some time ago but was never merged: http://marc.info/?l=linux-kernel&m=127144305403241&w=2 Signed-off-by: Aaron Tomlin Signed-off-by: Peter Zijlstra (Intel) Acked-by: Oleg Nesterov Acked-by: Michael Ellerman Cc: aneesh.kumar@linux.vnet.ibm.com Cc: dzickus@redhat.com Cc: bmr@redhat.com Cc: jcastillo@redhat.com Cc: jgh@redhat.com Cc: minchan@kernel.org Cc: tglx@linutronix.de Cc: hannes@cmpxchg.org Cc: Alex Thorlton Cc: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Daeseok Youn Cc: David Rientjes Cc: Fabian Frederick Cc: Geert Uytterhoeven Cc: Jiri Olsa Cc: Kees Cook Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Michael Opdenacker Cc: Paul Mackerras Cc: Prarit Bhargava Cc: Rik van Riel Cc: Rusty Russell Cc: Seiji Aguchi Cc: Steven Rostedt Cc: Vladimir Davydov Cc: Yasuaki Ishimatsu Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1410527779-8133-2-git-send-email-atomlin@redhat.com Signed-off-by: Ingo Molnar --- arch/powerpc/mm/fault.c | 3 +-- arch/x86/mm/fault.c | 3 +-- include/linux/sched.h | 2 ++ init/main.c | 1 + kernel/fork.c | 12 +++++++++--- kernel/trace/trace_stack.c | 4 +--- 6 files changed, 15 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 51ab9e7e6c39..35d0760c3fa4 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include @@ -538,7 +537,7 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) regs->nip); stackend = end_of_stack(current); - if (current != &init_task && *stackend != STACK_END_MAGIC) + if (*stackend != STACK_END_MAGIC) printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); die("Kernel access of bad area", regs, sig); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a24194681513..bc23a7043c65 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -3,7 +3,6 @@ * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar */ -#include /* STACK_END_MAGIC */ #include /* test_thread_flag(), ... */ #include /* oops_begin/end, ... */ #include /* search_exception_table */ @@ -710,7 +709,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, show_fault_oops(regs, error_code, address); stackend = end_of_stack(tsk); - if (tsk != &init_task && *stackend != STACK_END_MAGIC) + if (*stackend != STACK_END_MAGIC) printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); tsk->thread.cr2 = address; diff --git a/include/linux/sched.h b/include/linux/sched.h index 82ff3d6efb19..118dca7d5a28 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -57,6 +57,7 @@ struct sched_param { #include #include #include +#include #include @@ -2638,6 +2639,7 @@ static inline unsigned long stack_not_used(struct task_struct *p) return (unsigned long)n - (unsigned long)end_of_stack(p); } #endif +extern void set_task_stack_end_magic(struct task_struct *tsk); /* set thread flags in other task's structures * - see asm/thread_info.h for TIF_xxxx flags available diff --git a/init/main.c b/init/main.c index bb1aed928f21..5fc3fc7bd475 100644 --- a/init/main.c +++ b/init/main.c @@ -508,6 +508,7 @@ asmlinkage __visible void __init start_kernel(void) * lockdep hash: */ lockdep_init(); + set_task_stack_end_magic(&init_task); smp_setup_processor_id(); debug_objects_early_init(); diff --git a/kernel/fork.c b/kernel/fork.c index 9387ae8ab048..ad64248c4b18 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -294,11 +294,18 @@ int __weak arch_dup_task_struct(struct task_struct *dst, return 0; } +void set_task_stack_end_magic(struct task_struct *tsk) +{ + unsigned long *stackend; + + stackend = end_of_stack(tsk); + *stackend = STACK_END_MAGIC; /* for overflow detection */ +} + static struct task_struct *dup_task_struct(struct task_struct *orig) { struct task_struct *tsk; struct thread_info *ti; - unsigned long *stackend; int node = tsk_fork_get_node(orig); int err; @@ -328,8 +335,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); - stackend = end_of_stack(tsk); - *stackend = STACK_END_MAGIC; /* for overflow detection */ + set_task_stack_end_magic(tsk); #ifdef CONFIG_CC_STACKPROTECTOR tsk->stack_canary = get_random_int(); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 8a4e5cb66a4c..1636e41828c2 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -13,7 +13,6 @@ #include #include #include -#include #include @@ -171,8 +170,7 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } - if ((current != &init_task && - *(end_of_stack(current)) != STACK_END_MAGIC)) { + if (*end_of_stack(current) != STACK_END_MAGIC) { print_max_stack(); BUG(); } -- cgit v1.2.3-58-ga151 From a70857e46dd13e87ae06bf0e64cb6a2d4f436265 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Fri, 12 Sep 2014 14:16:18 +0100 Subject: sched: Add helper for task stack page overrun checking This facility is used in a few places so let's introduce a helper function to improve code readability. Signed-off-by: Aaron Tomlin Signed-off-by: Peter Zijlstra (Intel) Cc: aneesh.kumar@linux.vnet.ibm.com Cc: dzickus@redhat.com Cc: bmr@redhat.com Cc: jcastillo@redhat.com Cc: oleg@redhat.com Cc: riel@redhat.com Cc: prarit@redhat.com Cc: jgh@redhat.com Cc: minchan@kernel.org Cc: mpe@ellerman.id.au Cc: tglx@linutronix.de Cc: hannes@cmpxchg.org Cc: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Michael Ellerman Cc: Paul Mackerras Cc: Seiji Aguchi Cc: Steven Rostedt Cc: Yasuaki Ishimatsu Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1410527779-8133-3-git-send-email-atomlin@redhat.com Signed-off-by: Ingo Molnar --- arch/powerpc/mm/fault.c | 4 +--- arch/x86/mm/fault.c | 4 +--- include/linux/sched.h | 2 ++ kernel/trace/trace_stack.c | 2 +- 4 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 35d0760c3fa4..99b2f2775658 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -507,7 +507,6 @@ bail: void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) { const struct exception_table_entry *entry; - unsigned long *stackend; /* Are we prepared to handle this fault? */ if ((entry = search_exception_tables(regs->nip)) != NULL) { @@ -536,8 +535,7 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n", regs->nip); - stackend = end_of_stack(current); - if (*stackend != STACK_END_MAGIC) + if (task_stack_end_corrupted(current)) printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); die("Kernel access of bad area", regs, sig); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index bc23a7043c65..6240bc7ae741 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -648,7 +648,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, unsigned long address, int signal, int si_code) { struct task_struct *tsk = current; - unsigned long *stackend; unsigned long flags; int sig; @@ -708,8 +707,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, show_fault_oops(regs, error_code, address); - stackend = end_of_stack(tsk); - if (*stackend != STACK_END_MAGIC) + if (task_stack_end_corrupted(tsk)) printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); tsk->thread.cr2 = address; diff --git a/include/linux/sched.h b/include/linux/sched.h index 118dca7d5a28..18f52624eaa6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2617,6 +2617,8 @@ static inline unsigned long *end_of_stack(struct task_struct *p) } #endif +#define task_stack_end_corrupted(task) \ + (*(end_of_stack(task)) != STACK_END_MAGIC) static inline int object_is_on_stack(void *obj) { diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 1636e41828c2..16eddb308c33 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -170,7 +170,7 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } - if (*end_of_stack(current) != STACK_END_MAGIC) { + if (task_stack_end_corrupted(current)) { print_max_stack(); BUG(); } -- cgit v1.2.3-58-ga151 From 347abad981c1ef815ea5ba861adba6a8c6aa1580 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 30 Sep 2014 15:59:47 -0400 Subject: sched, time: Fix build error with 64 bit cputime_t on 32 bit systems On 32 bit systems cmpxchg cannot handle 64 bit values, so some additional magic is required to allow a 32 bit system with CONFIG_VIRT_CPU_ACCOUNTING_GEN=y enabled to build. Make sure the correct cmpxchg function is used when doing an atomic swap of a cputime_t. Reported-by: Arnd Bergmann Signed-off-by: Rik van Riel Acked-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Cc: umgwanakikbuti@gmail.com Cc: fweisbec@gmail.com Cc: srao@redhat.com Cc: lwoodman@redhat.com Cc: atheurer@redhat.com Cc: oleg@redhat.com Cc: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: linux390@de.ibm.com Cc: linux-arch@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-s390@vger.kernel.org Link: http://lkml.kernel.org/r/20140930155947.070cdb1f@annuminas.surriel.com Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/cputime.h | 2 ++ arch/s390/include/asm/cputime.h | 2 ++ include/asm-generic/cputime_jiffies.h | 2 ++ include/asm-generic/cputime_nsecs.h | 2 ++ kernel/sched/cputime.c | 29 +++++++++++++++++++---------- 5 files changed, 27 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index 607559ab271f..6c840ceab820 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -32,6 +32,8 @@ static inline void setup_cputime_one_jiffy(void) { } typedef u64 __nocast cputime_t; typedef u64 __nocast cputime64_t; +#define cmpxchg_cputime(ptr, old, new) cmpxchg(ptr, old, new) + #ifdef __KERNEL__ /* diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h index f65bd3634519..3001887f94b7 100644 --- a/arch/s390/include/asm/cputime.h +++ b/arch/s390/include/asm/cputime.h @@ -18,6 +18,8 @@ typedef unsigned long long __nocast cputime_t; typedef unsigned long long __nocast cputime64_t; +#define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new) + static inline unsigned long __div(unsigned long long n, unsigned long base) { #ifndef CONFIG_64BIT diff --git a/include/asm-generic/cputime_jiffies.h b/include/asm-generic/cputime_jiffies.h index d5cb78f53986..fe386fc6e85e 100644 --- a/include/asm-generic/cputime_jiffies.h +++ b/include/asm-generic/cputime_jiffies.h @@ -3,6 +3,8 @@ typedef unsigned long __nocast cputime_t; +#define cmpxchg_cputime(ptr, old, new) cmpxchg(ptr, old, new) + #define cputime_one_jiffy jiffies_to_cputime(1) #define cputime_to_jiffies(__ct) (__force unsigned long)(__ct) #define cputime_to_scaled(__ct) (__ct) diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h index 4e817606c549..0419485891f2 100644 --- a/include/asm-generic/cputime_nsecs.h +++ b/include/asm-generic/cputime_nsecs.h @@ -21,6 +21,8 @@ typedef u64 __nocast cputime_t; typedef u64 __nocast cputime64_t; +#define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new) + #define cputime_one_jiffy jiffies_to_cputime(1) #define cputime_div(__ct, divisor) div_u64((__force u64)__ct, divisor) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 64492dff8a81..8394b1ee600c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -554,6 +554,23 @@ drop_precision: return (__force cputime_t) scaled; } +/* + * Atomically advance counter to the new value. Interrupts, vcpu + * scheduling, and scaling inaccuracies can cause cputime_advance + * to be occasionally called with a new value smaller than counter. + * Let's enforce atomicity. + * + * Normally a caller will only go through this loop once, or not + * at all in case a previous caller updated counter the same jiffy. + */ +static void cputime_advance(cputime_t *counter, cputime_t new) +{ + cputime_t old; + + while (new > (old = ACCESS_ONCE(*counter))) + cmpxchg_cputime(counter, old, new); +} + /* * Adjust tick based cputime random precision against scheduler * runtime accounting. @@ -599,16 +616,8 @@ static void cputime_adjust(struct task_cputime *curr, utime = rtime - stime; } - /* - * If the tick based count grows faster than the scheduler one, - * the result of the scaling may go backward. - * Let's enforce monotonicity. - * Atomic exchange protects against concurrent cputime_adjust(). - */ - while (stime > (rtime = ACCESS_ONCE(prev->stime))) - cmpxchg(&prev->stime, rtime, stime); - while (utime > (rtime = ACCESS_ONCE(prev->utime))) - cmpxchg(&prev->utime, rtime, utime); + cputime_advance(&prev->stime, stime); + cputime_advance(&prev->utime, utime); out: *ut = prev->utime; -- cgit v1.2.3-58-ga151