diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/compat.c | 8 | ||||
-rw-r--r-- | kernel/exit.c | 15 | ||||
-rw-r--r-- | kernel/fork.c | 2 | ||||
-rw-r--r-- | kernel/irq/affinity.c | 72 | ||||
-rw-r--r-- | kernel/irq/msi.c | 4 | ||||
-rw-r--r-- | kernel/printk/printk.c | 25 | ||||
-rw-r--r-- | kernel/signal.c | 6 | ||||
-rw-r--r-- | kernel/sys.c | 3 | ||||
-rw-r--r-- | kernel/sysctl.c | 7 | ||||
-rw-r--r-- | kernel/time/Makefile | 10 | ||||
-rw-r--r-- | kernel/time/alarmtimer.c | 59 | ||||
-rw-r--r-- | kernel/time/hrtimer.c | 20 | ||||
-rw-r--r-- | kernel/time/itimer.c | 15 | ||||
-rw-r--r-- | kernel/time/posix-cpu-timers.c | 4 | ||||
-rw-r--r-- | kernel/time/posix-stubs.c | 123 | ||||
-rw-r--r-- | kernel/time/tick-sched.c | 33 | ||||
-rw-r--r-- | kernel/time/timekeeping.c | 90 | ||||
-rw-r--r-- | kernel/time/timer.c | 48 | ||||
-rw-r--r-- | kernel/trace/ring_buffer.c | 137 | ||||
-rw-r--r-- | kernel/trace/trace.c | 16 |
20 files changed, 431 insertions, 266 deletions
diff --git a/kernel/compat.c b/kernel/compat.c index 333d364be29d..b3a047f208a7 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -307,12 +307,17 @@ static inline long put_compat_itimerval(struct compat_itimerval __user *o, __put_user(i->it_value.tv_usec, &o->it_value.tv_usec))); } +asmlinkage long sys_ni_posix_timers(void); + COMPAT_SYSCALL_DEFINE2(getitimer, int, which, struct compat_itimerval __user *, it) { struct itimerval kit; int error; + if (!IS_ENABLED(CONFIG_POSIX_TIMERS)) + return sys_ni_posix_timers(); + error = do_getitimer(which, &kit); if (!error && put_compat_itimerval(it, &kit)) error = -EFAULT; @@ -326,6 +331,9 @@ COMPAT_SYSCALL_DEFINE3(setitimer, int, which, struct itimerval kin, kout; int error; + if (!IS_ENABLED(CONFIG_POSIX_TIMERS)) + return sys_ni_posix_timers(); + if (in) { if (get_compat_itimerval(&kin, in)) return -EFAULT; diff --git a/kernel/exit.c b/kernel/exit.c index 3076f3089919..aacff8e2aec0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -54,6 +54,7 @@ #include <linux/writeback.h> #include <linux/shm.h> #include <linux/kcov.h> +#include <linux/random.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -91,11 +92,10 @@ static void __exit_signal(struct task_struct *tsk) lockdep_tasklist_lock_is_held()); spin_lock(&sighand->siglock); +#ifdef CONFIG_POSIX_TIMERS posix_cpu_timers_exit(tsk); if (group_dead) { posix_cpu_timers_exit_group(tsk); - tty = sig->tty; - sig->tty = NULL; } else { /* * This can only happen if the caller is de_thread(). @@ -104,7 +104,13 @@ static void __exit_signal(struct task_struct *tsk) */ if (unlikely(has_group_leader_pid(tsk))) posix_cpu_timers_exit_group(tsk); + } +#endif + if (group_dead) { + tty = sig->tty; + sig->tty = NULL; + } else { /* * If there is any task waiting for the group exit * then notify it: @@ -116,6 +122,9 @@ static void __exit_signal(struct task_struct *tsk) sig->curr_target = next_thread(tsk); } + add_device_randomness((const void*) &tsk->se.sum_exec_runtime, + sizeof(unsigned long long)); + /* * Accumulate here the counters for all threads as they die. We could * skip the group leader because it is the last user of signal_struct, @@ -799,8 +808,10 @@ void __noreturn do_exit(long code) acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { +#ifdef CONFIG_POSIX_TIMERS hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); +#endif if (tsk->mm) setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); } diff --git a/kernel/fork.c b/kernel/fork.c index 00492b22adfe..7377f414f3ce 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1347,8 +1347,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) seqlock_init(&sig->stats_lock); prev_cputime_init(&sig->prev_cputime); +#ifdef CONFIG_POSIX_TIMERS hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; +#endif task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 17f51d63da56..9be9bda7c1f9 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -51,16 +51,17 @@ static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk) /** * irq_create_affinity_masks - Create affinity masks for multiqueue spreading - * @affinity: The affinity mask to spread. If NULL cpu_online_mask - * is used - * @nvecs: The number of vectors + * @nvecs: The total number of vectors + * @affd: Description of the affinity requirements * * Returns the masks pointer or NULL if allocation failed. */ -struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity, - int nvec) +struct cpumask * +irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) { - int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec = 0; + int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec; + int affv = nvecs - affd->pre_vectors - affd->post_vectors; + int last_affv = affv + affd->pre_vectors; nodemask_t nodemsk = NODE_MASK_NONE; struct cpumask *masks; cpumask_var_t nmsk; @@ -68,46 +69,47 @@ struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity, if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return NULL; - masks = kzalloc(nvec * sizeof(*masks), GFP_KERNEL); + masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); if (!masks) goto out; + /* Fill out vectors at the beginning that don't need affinity */ + for (curvec = 0; curvec < affd->pre_vectors; curvec++) + cpumask_copy(masks + curvec, irq_default_affinity); + /* Stabilize the cpumasks */ get_online_cpus(); - /* If the supplied affinity mask is NULL, use cpu online mask */ - if (!affinity) - affinity = cpu_online_mask; - - nodes = get_nodes_in_cpumask(affinity, &nodemsk); + nodes = get_nodes_in_cpumask(cpu_online_mask, &nodemsk); /* * If the number of nodes in the mask is less than or equal the * number of vectors we just spread the vectors across the nodes. */ - if (nvec <= nodes) { + if (affv <= nodes) { for_each_node_mask(n, nodemsk) { cpumask_copy(masks + curvec, cpumask_of_node(n)); - if (++curvec == nvec) + if (++curvec == last_affv) break; } - goto outonl; + goto done; } /* Spread the vectors per node */ - vecs_per_node = nvec / nodes; + vecs_per_node = affv / nodes; /* Account for rounding errors */ - extra_vecs = nvec - (nodes * vecs_per_node); + extra_vecs = affv - (nodes * vecs_per_node); for_each_node_mask(n, nodemsk) { int ncpus, v, vecs_to_assign = vecs_per_node; /* Get the cpus on this node which are in the mask */ - cpumask_and(nmsk, affinity, cpumask_of_node(n)); + cpumask_and(nmsk, cpu_online_mask, cpumask_of_node(n)); /* Calculate the number of cpus per vector */ ncpus = cpumask_weight(nmsk); - for (v = 0; curvec < nvec && v < vecs_to_assign; curvec++, v++) { + for (v = 0; curvec < last_affv && v < vecs_to_assign; + curvec++, v++) { cpus_per_vec = ncpus / vecs_to_assign; /* Account for extra vectors to compensate rounding errors */ @@ -119,36 +121,36 @@ struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity, irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec); } - if (curvec >= nvec) + if (curvec >= last_affv) break; } -outonl: +done: put_online_cpus(); + + /* Fill out vectors at the end that don't need affinity */ + for (; curvec < nvecs; curvec++) + cpumask_copy(masks + curvec, irq_default_affinity); out: free_cpumask_var(nmsk); return masks; } /** - * irq_calc_affinity_vectors - Calculate to optimal number of vectors for a given affinity mask - * @affinity: The affinity mask to spread. If NULL cpu_online_mask - * is used - * @maxvec: The maximum number of vectors available + * irq_calc_affinity_vectors - Calculate the optimal number of vectors + * @maxvec: The maximum number of vectors available + * @affd: Description of the affinity requirements */ -int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec) +int irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd) { - int cpus, ret; + int resv = affd->pre_vectors + affd->post_vectors; + int vecs = maxvec - resv; + int cpus; /* Stabilize the cpumasks */ get_online_cpus(); - /* If the supplied affinity mask is NULL, use cpu online mask */ - if (!affinity) - affinity = cpu_online_mask; - - cpus = cpumask_weight(affinity); - ret = (cpus < maxvec) ? cpus : maxvec; - + cpus = cpumask_weight(cpu_online_mask); put_online_cpus(); - return ret; + + return min(cpus, vecs) + resv; } diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 8a3e872798f3..ee230063f033 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -14,9 +14,7 @@ #include <linux/irq.h> #include <linux/irqdomain.h> #include <linux/msi.h> - -/* Temparory solution for building, will be removed later */ -#include <linux/pci.h> +#include <linux/slab.h> /** * alloc_msi_entry - Allocate an initialize msi_entry diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index f7a55e9ff2f7..577f2288d19f 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2166,27 +2166,20 @@ void resume_console(void) /** * console_cpu_notify - print deferred console messages after CPU hotplug - * @self: notifier struct - * @action: CPU hotplug event - * @hcpu: unused + * @cpu: unused * * If printk() is called from a CPU that is not online yet, the messages * will be spooled but will not show up on the console. This function is * called when a new CPU comes online (or fails to come up), and ensures * that any such output gets printed. */ -static int console_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - switch (action) { - case CPU_ONLINE: - case CPU_DEAD: - case CPU_DOWN_FAILED: - case CPU_UP_CANCELED: +static int console_cpu_notify(unsigned int cpu) +{ + if (!cpuhp_tasks_frozen) { console_lock(); console_unlock(); } - return NOTIFY_OK; + return 0; } /** @@ -2824,6 +2817,7 @@ EXPORT_SYMBOL(unregister_console); static int __init printk_late_init(void) { struct console *con; + int ret; for_each_console(con) { if (!keep_bootcon && con->flags & CON_BOOT) { @@ -2838,7 +2832,12 @@ static int __init printk_late_init(void) unregister_console(con); } } - hotcpu_notifier(console_cpu_notify, 0); + ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL, + console_cpu_notify); + WARN_ON(ret < 0); + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online", + console_cpu_notify, NULL); + WARN_ON(ret < 0); return 0; } late_initcall(printk_late_init); diff --git a/kernel/signal.c b/kernel/signal.c index 75761acc77cf..29a410780aa9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -427,6 +427,7 @@ void flush_signals(struct task_struct *t) spin_unlock_irqrestore(&t->sighand->siglock, flags); } +#ifdef CONFIG_POSIX_TIMERS static void __flush_itimer_signals(struct sigpending *pending) { sigset_t signal, retain; @@ -460,6 +461,7 @@ void flush_itimer_signals(void) __flush_itimer_signals(&tsk->signal->shared_pending); spin_unlock_irqrestore(&tsk->sighand->siglock, flags); } +#endif void ignore_signals(struct task_struct *t) { @@ -567,6 +569,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) if (!signr) { signr = __dequeue_signal(&tsk->signal->shared_pending, mask, info); +#ifdef CONFIG_POSIX_TIMERS /* * itimer signal ? * @@ -590,6 +593,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) hrtimer_restart(tmr); } } +#endif } recalc_sigpending(); @@ -611,6 +615,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) */ current->jobctl |= JOBCTL_STOP_DEQUEUED; } +#ifdef CONFIG_POSIX_TIMERS if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { /* * Release the siglock to ensure proper locking order @@ -622,6 +627,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) do_schedule_next_timer(info); spin_lock(&tsk->sighand->siglock); } +#endif return signr; } diff --git a/kernel/sys.c b/kernel/sys.c index fd6f50809b6e..9758892a2d09 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1416,7 +1416,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource, * applications, so we live with it */ if (!retval && new_rlim && resource == RLIMIT_CPU && - new_rlim->rlim_cur != RLIM_INFINITY) + new_rlim->rlim_cur != RLIM_INFINITY && + IS_ENABLED(CONFIG_POSIX_TIMERS)) update_rlimit_cpu(tsk, new_rlim->rlim_cur); out: read_unlock(&tasklist_lock); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 739fb17371af..39b3368f6de6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -983,13 +983,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, { - .procname = "kstack_depth_to_print", - .data = &kstack_depth_to_print, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { .procname = "io_delay_type", .data = &io_delay_type, .maxlen = sizeof(int), diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 49eca0beed32..976840d29a71 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,6 +1,12 @@ -obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o +obj-y += time.o timer.o hrtimer.o obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o -obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o +obj-y += timeconv.o timecounter.o alarmtimer.o + +ifeq ($(CONFIG_POSIX_TIMERS),y) + obj-y += posix-timers.o posix-cpu-timers.o posix-clock.o itimer.o +else + obj-y += posix-stubs.o +endif obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o tick-common.o ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 12dd190634ab..9b08ca391aed 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -26,6 +26,9 @@ #include <linux/workqueue.h> #include <linux/freezer.h> +#define CREATE_TRACE_POINTS +#include <trace/events/alarmtimer.h> + /** * struct alarm_base - Alarm timer bases * @lock: Lock for syncrhonized access to the base @@ -40,7 +43,9 @@ static struct alarm_base { clockid_t base_clockid; } alarm_bases[ALARM_NUMTYPE]; -/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */ +/* freezer information to handle clock_nanosleep triggered wakeups */ +static enum alarmtimer_type freezer_alarmtype; +static ktime_t freezer_expires; static ktime_t freezer_delta; static DEFINE_SPINLOCK(freezer_delta_lock); @@ -194,6 +199,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) } spin_unlock_irqrestore(&base->lock, flags); + trace_alarmtimer_fired(alarm, base->gettime()); return ret; } @@ -218,15 +224,16 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining); */ static int alarmtimer_suspend(struct device *dev) { - struct rtc_time tm; - ktime_t min, now; - unsigned long flags; + ktime_t min, now, expires; + int i, ret, type; struct rtc_device *rtc; - int i; - int ret; + unsigned long flags; + struct rtc_time tm; spin_lock_irqsave(&freezer_delta_lock, flags); min = freezer_delta; + expires = freezer_expires; + type = freezer_alarmtype; freezer_delta = ktime_set(0, 0); spin_unlock_irqrestore(&freezer_delta_lock, flags); @@ -247,8 +254,11 @@ static int alarmtimer_suspend(struct device *dev) if (!next) continue; delta = ktime_sub(next->expires, base->gettime()); - if (!min.tv64 || (delta.tv64 < min.tv64)) + if (!min.tv64 || (delta.tv64 < min.tv64)) { + expires = next->expires; min = delta; + type = i; + } } if (min.tv64 == 0) return 0; @@ -258,6 +268,8 @@ static int alarmtimer_suspend(struct device *dev) return -EBUSY; } + trace_alarmtimer_suspend(expires, type); + /* Setup an rtc timer to fire that far in the future */ rtc_timer_cancel(rtc, &rtctimer); rtc_read_time(rtc, &tm); @@ -295,15 +307,32 @@ static int alarmtimer_resume(struct device *dev) static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) { - ktime_t delta; + struct alarm_base *base; unsigned long flags; - struct alarm_base *base = &alarm_bases[type]; + ktime_t delta; + + switch(type) { + case ALARM_REALTIME: + base = &alarm_bases[ALARM_REALTIME]; + type = ALARM_REALTIME_FREEZER; + break; + case ALARM_BOOTTIME: + base = &alarm_bases[ALARM_BOOTTIME]; + type = ALARM_BOOTTIME_FREEZER; + break; + default: + WARN_ONCE(1, "Invalid alarm type: %d\n", type); + return; + } delta = ktime_sub(absexp, base->gettime()); spin_lock_irqsave(&freezer_delta_lock, flags); - if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64)) + if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64)) { freezer_delta = delta; + freezer_expires = absexp; + freezer_alarmtype = type; + } spin_unlock_irqrestore(&freezer_delta_lock, flags); } @@ -342,6 +371,8 @@ void alarm_start(struct alarm *alarm, ktime_t start) alarmtimer_enqueue(base, alarm); hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); spin_unlock_irqrestore(&base->lock, flags); + + trace_alarmtimer_start(alarm, base->gettime()); } EXPORT_SYMBOL_GPL(alarm_start); @@ -390,6 +421,8 @@ int alarm_try_to_cancel(struct alarm *alarm) if (ret >= 0) alarmtimer_dequeue(base, alarm); spin_unlock_irqrestore(&base->lock, flags); + + trace_alarmtimer_cancel(alarm, base->gettime()); return ret; } EXPORT_SYMBOL_GPL(alarm_try_to_cancel); @@ -846,8 +879,10 @@ static int __init alarmtimer_init(void) alarmtimer_rtc_timer_init(); - posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); - posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); + if (IS_ENABLED(CONFIG_POSIX_TIMERS)) { + posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); + posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); + } /* Initialize alarm bases */ alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index bb5ec425dfe0..08be5c99d26b 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1742,15 +1742,19 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to - * pass before the routine returns. + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. + * delivered to the current task or the current task is explicitly woken + * up. * * The current task state is guaranteed to be TASK_RUNNING when this * routine returns. * - * Returns 0 when the timer has expired otherwise -EINTR + * Returns 0 when the timer has expired. If the task was woken before the + * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or + * by an explicit wakeup, it returns -EINTR. */ int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode) @@ -1772,15 +1776,19 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to - * pass before the routine returns. + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. + * delivered to the current task or the current task is explicitly woken + * up. * * The current task state is guaranteed to be TASK_RUNNING when this * routine returns. * - * Returns 0 when the timer has expired otherwise -EINTR + * Returns 0 when the timer has expired. If the task was woken before the + * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or + * by an explicit wakeup, it returns -EINTR. */ int __sched schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode) diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 1d5c7204ddc9..2b9f45bc955d 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -238,6 +238,8 @@ again: return 0; } +#ifdef __ARCH_WANT_SYS_ALARM + /** * alarm_setitimer - set alarm in seconds * @@ -250,7 +252,7 @@ again: * On 32 bit machines the seconds value is limited to (INT_MAX/2) to avoid * negative timeval settings which would cause immediate expiry. */ -unsigned int alarm_setitimer(unsigned int seconds) +static unsigned int alarm_setitimer(unsigned int seconds) { struct itimerval it_new, it_old; @@ -275,6 +277,17 @@ unsigned int alarm_setitimer(unsigned int seconds) return it_old.it_value.tv_sec; } +/* + * For backwards compatibility? This can be done in libc so Alpha + * and all newer ports shouldn't need it. + */ +SYSCALL_DEFINE1(alarm, unsigned int, seconds) +{ + return alarm_setitimer(seconds); +} + +#endif + SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, struct itimerval __user *, ovalue) { diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index e887ffc8eef3..f246763c9947 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -9,7 +9,6 @@ #include <asm/uaccess.h> #include <linux/kernel_stat.h> #include <trace/events/timer.h> -#include <linux/random.h> #include <linux/tick.h> #include <linux/workqueue.h> @@ -447,10 +446,7 @@ static void cleanup_timers(struct list_head *head) */ void posix_cpu_timers_exit(struct task_struct *tsk) { - add_device_randomness((const void*) &tsk->se.sum_exec_runtime, - sizeof(unsigned long long)); cleanup_timers(tsk->cpu_timers); - } void posix_cpu_timers_exit_group(struct task_struct *tsk) { diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c new file mode 100644 index 000000000000..cd6716e115e8 --- /dev/null +++ b/kernel/time/posix-stubs.c @@ -0,0 +1,123 @@ +/* + * Dummy stubs used when CONFIG_POSIX_TIMERS=n + * + * Created by: Nicolas Pitre, July 2016 + * Copyright: (C) 2016 Linaro Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/syscalls.h> +#include <linux/ktime.h> +#include <linux/timekeeping.h> +#include <linux/posix-timers.h> + +asmlinkage long sys_ni_posix_timers(void) +{ + pr_err_once("process %d (%s) attempted a POSIX timer syscall " + "while CONFIG_POSIX_TIMERS is not set\n", + current->pid, current->comm); + return -ENOSYS; +} + +#define SYS_NI(name) SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers) + +SYS_NI(timer_create); +SYS_NI(timer_gettime); +SYS_NI(timer_getoverrun); +SYS_NI(timer_settime); +SYS_NI(timer_delete); +SYS_NI(clock_adjtime); +SYS_NI(getitimer); +SYS_NI(setitimer); +#ifdef __ARCH_WANT_SYS_ALARM +SYS_NI(alarm); +#endif + +/* + * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC + * as it is easy to remain compatible with little code. CLOCK_BOOTTIME + * is also included for convenience as at least systemd uses it. + */ + +SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, + const struct timespec __user *, tp) +{ + struct timespec new_tp; + + if (which_clock != CLOCK_REALTIME) + return -EINVAL; + if (copy_from_user(&new_tp, tp, sizeof (*tp))) + return -EFAULT; + return do_sys_settimeofday(&new_tp, NULL); +} + +SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, + struct timespec __user *,tp) +{ + struct timespec kernel_tp; + + switch (which_clock) { + case CLOCK_REALTIME: ktime_get_real_ts(&kernel_tp); break; + case CLOCK_MONOTONIC: ktime_get_ts(&kernel_tp); break; + case CLOCK_BOOTTIME: get_monotonic_boottime(&kernel_tp); break; + default: return -EINVAL; + } + if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) + return -EFAULT; + return 0; +} + +SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) +{ + struct timespec rtn_tp = { + .tv_sec = 0, + .tv_nsec = hrtimer_resolution, + }; + + switch (which_clock) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + case CLOCK_BOOTTIME: + if (copy_to_user(tp, &rtn_tp, sizeof(rtn_tp))) + return -EFAULT; + return 0; + default: + return -EINVAL; + } +} + +SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, + const struct timespec __user *, rqtp, + struct timespec __user *, rmtp) +{ + struct timespec t; + + switch (which_clock) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + case CLOCK_BOOTTIME: + if (copy_from_user(&t, rqtp, sizeof (struct timespec))) + return -EFAULT; + if (!timespec_valid(&t)) + return -EINVAL; + return hrtimer_nanosleep(&t, rmtp, flags & TIMER_ABSTIME ? + HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); + default: + return -EINVAL; + } +} + +#ifdef CONFIG_COMPAT +long clock_nanosleep_restart(struct restart_block *restart_block) +{ + return hrtimer_nanosleep_restart(restart_block); +} +#endif diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3bcb61b52f6c..71496a20e670 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -390,24 +390,16 @@ static int __init tick_nohz_full_setup(char *str) } __setup("nohz_full=", tick_nohz_full_setup); -static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static int tick_nohz_cpu_down(unsigned int cpu) { - unsigned int cpu = (unsigned long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_PREPARE: - /* - * The boot CPU handles housekeeping duty (unbound timers, - * workqueues, timekeeping, ...) on behalf of full dynticks - * CPUs. It must remain online when nohz full is enabled. - */ - if (tick_nohz_full_running && tick_do_timer_cpu == cpu) - return NOTIFY_BAD; - break; - } - return NOTIFY_OK; + /* + * The boot CPU handles housekeeping duty (unbound timers, + * workqueues, timekeeping, ...) on behalf of full dynticks + * CPUs. It must remain online when nohz full is enabled. + */ + if (tick_nohz_full_running && tick_do_timer_cpu == cpu) + return -EBUSY; + return 0; } static int tick_nohz_init_all(void) @@ -428,7 +420,7 @@ static int tick_nohz_init_all(void) void __init tick_nohz_init(void) { - int cpu; + int cpu, ret; if (!tick_nohz_full_running) { if (tick_nohz_init_all() < 0) @@ -469,7 +461,10 @@ void __init tick_nohz_init(void) for_each_cpu(cpu, tick_nohz_full_mask) context_tracking_cpu_set(cpu); - cpu_notifier(tick_nohz_cpu_down_callback, 0); + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "kernel/nohz:predown", NULL, + tick_nohz_cpu_down); + WARN_ON(ret < 0); pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", cpumask_pr_args(tick_nohz_full_mask)); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 37dec7e3db43..da233cdf89b0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -258,10 +258,9 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) tk->cycle_interval = interval; /* Go back from cycles -> shifted ns */ - tk->xtime_interval = (u64) interval * clock->mult; + tk->xtime_interval = interval * clock->mult; tk->xtime_remainder = ntpinterval - tk->xtime_interval; - tk->raw_interval = - ((u64) interval * clock->mult) >> clock->shift; + tk->raw_interval = (interval * clock->mult) >> clock->shift; /* if changing clocks, convert xtime_nsec shift units */ if (old_clock) { @@ -299,10 +298,10 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; static inline u32 arch_gettimeoffset(void) { return 0; } #endif -static inline s64 timekeeping_delta_to_ns(struct tk_read_base *tkr, +static inline u64 timekeeping_delta_to_ns(struct tk_read_base *tkr, cycle_t delta) { - s64 nsec; + u64 nsec; nsec = delta * tkr->mult + tkr->xtime_nsec; nsec >>= tkr->shift; @@ -311,7 +310,7 @@ static inline s64 timekeeping_delta_to_ns(struct tk_read_base *tkr, return nsec + arch_gettimeoffset(); } -static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) +static inline u64 timekeeping_get_ns(struct tk_read_base *tkr) { cycle_t delta; @@ -319,8 +318,8 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) return timekeeping_delta_to_ns(tkr, delta); } -static inline s64 timekeeping_cycles_to_ns(struct tk_read_base *tkr, - cycle_t cycles) +static inline u64 timekeeping_cycles_to_ns(struct tk_read_base *tkr, + cycle_t cycles) { cycle_t delta; @@ -425,6 +424,35 @@ u64 ktime_get_raw_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); +/** + * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock. + * + * To keep it NMI safe since we're accessing from tracing, we're not using a + * separate timekeeper with updates to monotonic clock and boot offset + * protected with seqlocks. This has the following minor side effects: + * + * (1) Its possible that a timestamp be taken after the boot offset is updated + * but before the timekeeper is updated. If this happens, the new boot offset + * is added to the old timekeeping making the clock appear to update slightly + * earlier: + * CPU 0 CPU 1 + * timekeeping_inject_sleeptime64() + * __timekeeping_inject_sleeptime(tk, delta); + * timestamp(); + * timekeeping_update(tk, TK_CLEAR_NTP...); + * + * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be + * partially updated. Since the tk->offs_boot update is a rare event, this + * should be a rare occurrence which postprocessing should be able to handle. + */ +u64 notrace ktime_get_boot_fast_ns(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot)); +} +EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); + /* Suspend-time cycles value for halted fast timekeeper. */ static cycle_t cycles_at_suspend; @@ -623,7 +651,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) { struct clocksource *clock = tk->tkr_mono.clock; cycle_t cycle_now, delta; - s64 nsec; + u64 nsec; cycle_now = tk->tkr_mono.read(clock); delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); @@ -652,7 +680,7 @@ int __getnstimeofday64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; - s64 nsecs = 0; + u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); @@ -692,7 +720,7 @@ ktime_t ktime_get(void) struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; - s64 nsecs; + u64 nsecs; WARN_ON(timekeeping_suspended); @@ -735,7 +763,7 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs) struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base, *offset = offsets[offs]; - s64 nsecs; + u64 nsecs; WARN_ON(timekeeping_suspended); @@ -779,7 +807,7 @@ ktime_t ktime_get_raw(void) struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; - s64 nsecs; + u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); @@ -804,8 +832,8 @@ void ktime_get_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 tomono; - s64 nsec; unsigned int seq; + u64 nsec; WARN_ON(timekeeping_suspended); @@ -893,8 +921,8 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) unsigned long seq; ktime_t base_raw; ktime_t base_real; - s64 nsec_raw; - s64 nsec_real; + u64 nsec_raw; + u64 nsec_real; cycle_t now; WARN_ON_ONCE(timekeeping_suspended); @@ -1052,7 +1080,7 @@ int get_device_system_crosststamp(int (*get_time_fn) cycle_t cycles, now, interval_start; unsigned int clock_was_set_seq = 0; ktime_t base_real, base_raw; - s64 nsec_real, nsec_raw; + u64 nsec_real, nsec_raw; u8 cs_was_changed_seq; unsigned long seq; bool do_interp; @@ -1365,7 +1393,7 @@ void getrawmonotonic64(struct timespec64 *ts) struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 ts64; unsigned long seq; - s64 nsecs; + u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); @@ -1616,7 +1644,7 @@ void timekeeping_resume(void) struct clocksource *clock = tk->tkr_mono.clock; unsigned long flags; struct timespec64 ts_new, ts_delta; - cycle_t cycle_now, cycle_delta; + cycle_t cycle_now; sleeptime_injected = false; read_persistent_clock64(&ts_new); @@ -1642,27 +1670,11 @@ void timekeeping_resume(void) cycle_now = tk->tkr_mono.read(clock); if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && cycle_now > tk->tkr_mono.cycle_last) { - u64 num, max = ULLONG_MAX; - u32 mult = clock->mult; - u32 shift = clock->shift; - s64 nsec = 0; - - cycle_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, - tk->tkr_mono.mask); - - /* - * "cycle_delta * mutl" may cause 64 bits overflow, if the - * suspended time is too long. In that case we need do the - * 64 bits math carefully - */ - do_div(max, mult); - if (cycle_delta > max) { - num = div64_u64(cycle_delta, max); - nsec = (((u64) max * mult) >> shift) * num; - cycle_delta -= num * max; - } - nsec += ((u64) cycle_delta * mult) >> shift; + u64 nsec, cyc_delta; + cyc_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, + tk->tkr_mono.mask); + nsec = mul_u64_u32_shr(cyc_delta, clock->mult, clock->shift); ts_delta = ns_to_timespec64(nsec); sleeptime_injected = true; } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { diff --git a/kernel/time/timer.c b/kernel/time/timer.c index c611c47de884..ea4fbf8477a9 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1615,7 +1615,8 @@ void update_process_times(int user_tick) irq_work_tick(); #endif scheduler_tick(); - run_posix_cpu_timers(p); + if (IS_ENABLED(CONFIG_POSIX_TIMERS)) + run_posix_cpu_timers(p); } /** @@ -1676,19 +1677,6 @@ void run_local_timers(void) raise_softirq(TIMER_SOFTIRQ); } -#ifdef __ARCH_WANT_SYS_ALARM - -/* - * For backwards compatibility? This can be done in libc so Alpha - * and all newer ports shouldn't need it. - */ -SYSCALL_DEFINE1(alarm, unsigned int, seconds) -{ - return alarm_setitimer(seconds); -} - -#endif - static void process_timeout(unsigned long __data) { wake_up_process((struct task_struct *)__data); @@ -1705,11 +1693,12 @@ static void process_timeout(unsigned long __data) * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to - * pass before the routine returns. The routine will return 0 + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process())". * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. In this case the remaining time - * in jiffies will be returned, or 0 if the timer expired in time + * delivered to the current task or the current task is explicitly woken + * up. * * The current task state is guaranteed to be TASK_RUNNING when this * routine returns. @@ -1718,7 +1707,9 @@ static void process_timeout(unsigned long __data) * the CPU away without a bound on the timeout. In this case the return * value will be %MAX_SCHEDULE_TIMEOUT. * - * In all cases the return value is guaranteed to be non-negative. + * Returns 0 when the timer has expired otherwise the remaining time in + * jiffies will be returned. In all cases the return value is guaranteed + * to be non-negative. */ signed long __sched schedule_timeout(signed long timeout) { @@ -1910,16 +1901,6 @@ unsigned long msleep_interruptible(unsigned int msecs) EXPORT_SYMBOL(msleep_interruptible); -static void __sched do_usleep_range(unsigned long min, unsigned long max) -{ - ktime_t kmin; - u64 delta; - - kmin = ktime_set(0, min * NSEC_PER_USEC); - delta = (u64)(max - min) * NSEC_PER_USEC; - schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); -} - /** * usleep_range - Sleep for an approximate time * @min: Minimum time in usecs to sleep @@ -1933,7 +1914,14 @@ static void __sched do_usleep_range(unsigned long min, unsigned long max) */ void __sched usleep_range(unsigned long min, unsigned long max) { - __set_current_state(TASK_UNINTERRUPTIBLE); - do_usleep_range(min, max); + ktime_t exp = ktime_add_us(ktime_get(), min); + u64 delta = (u64)(max - min) * NSEC_PER_USEC; + + for (;;) { + __set_current_state(TASK_UNINTERRUPTIBLE); + /* Do not return before the requested sleep time has elapsed */ + if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) + break; + } } EXPORT_SYMBOL(usleep_range); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9c143739b8d7..89a2611a1635 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -479,9 +479,7 @@ struct ring_buffer { struct ring_buffer_per_cpu **buffers; -#ifdef CONFIG_HOTPLUG_CPU - struct notifier_block cpu_notify; -#endif + struct hlist_node node; u64 (*clock)(void); struct rb_irq_work irq_work; @@ -1274,11 +1272,6 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) kfree(cpu_buffer); } -#ifdef CONFIG_HOTPLUG_CPU -static int rb_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu); -#endif - /** * __ring_buffer_alloc - allocate a new ring_buffer * @size: the size in bytes per cpu that is needed. @@ -1296,6 +1289,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, long nr_pages; int bsize; int cpu; + int ret; /* keep it in its own cache line */ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), @@ -1303,7 +1297,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (!buffer) return NULL; - if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) goto fail_free_buffer; nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); @@ -1318,17 +1312,6 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (nr_pages < 2) nr_pages = 2; - /* - * In case of non-hotplug cpu, if the ring-buffer is allocated - * in early initcall, it will not be notified of secondary cpus. - * In that off case, we need to allocate for all possible cpus. - */ -#ifdef CONFIG_HOTPLUG_CPU - cpu_notifier_register_begin(); - cpumask_copy(buffer->cpumask, cpu_online_mask); -#else - cpumask_copy(buffer->cpumask, cpu_possible_mask); -#endif buffer->cpus = nr_cpu_ids; bsize = sizeof(void *) * nr_cpu_ids; @@ -1337,19 +1320,15 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (!buffer->buffers) goto fail_free_cpumask; - for_each_buffer_cpu(buffer, cpu) { - buffer->buffers[cpu] = - rb_allocate_cpu_buffer(buffer, nr_pages, cpu); - if (!buffer->buffers[cpu]) - goto fail_free_buffers; - } + cpu = raw_smp_processor_id(); + cpumask_set_cpu(cpu, buffer->cpumask); + buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); + if (!buffer->buffers[cpu]) + goto fail_free_buffers; -#ifdef CONFIG_HOTPLUG_CPU - buffer->cpu_notify.notifier_call = rb_cpu_notify; - buffer->cpu_notify.priority = 0; - __register_cpu_notifier(&buffer->cpu_notify); - cpu_notifier_register_done(); -#endif + ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); + if (ret < 0) + goto fail_free_buffers; mutex_init(&buffer->mutex); @@ -1364,9 +1343,6 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, fail_free_cpumask: free_cpumask_var(buffer->cpumask); -#ifdef CONFIG_HOTPLUG_CPU - cpu_notifier_register_done(); -#endif fail_free_buffer: kfree(buffer); @@ -1383,18 +1359,11 @@ ring_buffer_free(struct ring_buffer *buffer) { int cpu; -#ifdef CONFIG_HOTPLUG_CPU - cpu_notifier_register_begin(); - __unregister_cpu_notifier(&buffer->cpu_notify); -#endif + cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); for_each_buffer_cpu(buffer, cpu) rb_free_cpu_buffer(buffer->buffers[cpu]); -#ifdef CONFIG_HOTPLUG_CPU - cpu_notifier_register_done(); -#endif - kfree(buffer->buffers); free_cpumask_var(buffer->cpumask); @@ -4633,62 +4602,48 @@ int ring_buffer_read_page(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_read_page); -#ifdef CONFIG_HOTPLUG_CPU -static int rb_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +/* + * We only allocate new buffers, never free them if the CPU goes down. + * If we were to free the buffer, then the user would lose any trace that was in + * the buffer. + */ +int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node) { - struct ring_buffer *buffer = - container_of(self, struct ring_buffer, cpu_notify); - long cpu = (long)hcpu; + struct ring_buffer *buffer; long nr_pages_same; int cpu_i; unsigned long nr_pages; - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - if (cpumask_test_cpu(cpu, buffer->cpumask)) - return NOTIFY_OK; - - nr_pages = 0; - nr_pages_same = 1; - /* check if all cpu sizes are same */ - for_each_buffer_cpu(buffer, cpu_i) { - /* fill in the size from first enabled cpu */ - if (nr_pages == 0) - nr_pages = buffer->buffers[cpu_i]->nr_pages; - if (nr_pages != buffer->buffers[cpu_i]->nr_pages) { - nr_pages_same = 0; - break; - } - } - /* allocate minimum pages, user can later expand it */ - if (!nr_pages_same) - nr_pages = 2; - buffer->buffers[cpu] = - rb_allocate_cpu_buffer(buffer, nr_pages, cpu); - if (!buffer->buffers[cpu]) { - WARN(1, "failed to allocate ring buffer on CPU %ld\n", - cpu); - return NOTIFY_OK; + buffer = container_of(node, struct ring_buffer, node); + if (cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + nr_pages = 0; + nr_pages_same = 1; + /* check if all cpu sizes are same */ + for_each_buffer_cpu(buffer, cpu_i) { + /* fill in the size from first enabled cpu */ + if (nr_pages == 0) + nr_pages = buffer->buffers[cpu_i]->nr_pages; + if (nr_pages != buffer->buffers[cpu_i]->nr_pages) { + nr_pages_same = 0; + break; } - smp_wmb(); - cpumask_set_cpu(cpu, buffer->cpumask); - break; - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - /* - * Do nothing. - * If we were to free the buffer, then the user would - * lose any trace that was in the buffer. - */ - break; - default: - break; } - return NOTIFY_OK; + /* allocate minimum pages, user can later expand it */ + if (!nr_pages_same) + nr_pages = 2; + buffer->buffers[cpu] = + rb_allocate_cpu_buffer(buffer, nr_pages, cpu); + if (!buffer->buffers[cpu]) { + WARN(1, "failed to allocate ring buffer on CPU %u\n", + cpu); + return -ENOMEM; + } + smp_wmb(); + cpumask_set_cpu(cpu, buffer->cpumask); + return 0; } -#endif #ifdef CONFIG_RING_BUFFER_STARTUP_TEST /* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8696ce6bf2f6..54d5270a5042 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1125,6 +1125,7 @@ static struct { { trace_clock, "perf", 1 }, { ktime_get_mono_fast_ns, "mono", 1 }, { ktime_get_raw_fast_ns, "mono_raw", 1 }, + { ktime_get_boot_fast_ns, "boot", 1 }, ARCH_TRACE_CLOCKS }; @@ -7659,10 +7660,21 @@ __init static int tracer_alloc_buffers(void) raw_spin_lock_init(&global_trace.start_lock); + /* + * The prepare callbacks allocates some memory for the ring buffer. We + * don't free the buffer if the if the CPU goes down. If we were to free + * the buffer, then the user would lose any trace that was in the + * buffer. The memory will be removed once the "instance" is removed. + */ + ret = cpuhp_setup_state_multi(CPUHP_TRACE_RB_PREPARE, + "trace/RB:preapre", trace_rb_cpu_prepare, + NULL); + if (ret < 0) + goto out_free_cpumask; /* Used for event triggers */ temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE); if (!temp_buffer) - goto out_free_cpumask; + goto out_rm_hp_state; if (trace_create_savedcmd() < 0) goto out_free_temp_buffer; @@ -7723,6 +7735,8 @@ out_free_savedcmd: free_saved_cmdlines_buffer(savedcmd); out_free_temp_buffer: ring_buffer_free(temp_buffer); +out_rm_hp_state: + cpuhp_remove_multi_state(CPUHP_TRACE_RB_PREPARE); out_free_cpumask: free_cpumask_var(global_trace.tracing_cpumask); out_free_buffer_mask: |