diff options
Diffstat (limited to 'kernel')
151 files changed, 10232 insertions, 3831 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index ebdb0043203a..84d882f3e299 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -225,7 +225,7 @@ config ARCH_SUPPORTS_ATOMIC_RMW config MUTEX_SPIN_ON_OWNER def_bool y - depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW + depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW config RWSEM_SPIN_ON_OWNER def_bool y diff --git a/kernel/Makefile b/kernel/Makefile index e2ec54e2b952..eaee9de224bd 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,7 +9,7 @@ obj-y = fork.o exec_domain.o panic.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ - async.o range.o smpboot.o + async.o range.o smpboot.o ucount.o obj-$(CONFIG_MULTIUSER) += groups.o @@ -115,8 +115,6 @@ obj-$(CONFIG_HAS_IOMEM) += memremap.o $(obj)/configs.o: $(obj)/config_data.h -# config_data.h contains the same information as ikconfig.h but gzipped. -# Info from config_data can be extracted from /proc/config* targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) diff --git a/kernel/audit.c b/kernel/audit.c index 41017685f9f2..91bff3c0b368 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -125,7 +125,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0); /* The netlink socket. */ static struct sock *audit_sock; -static int audit_net_id; +static unsigned int audit_net_id; /* Hash for inode-based rules */ struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; @@ -1007,6 +1007,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err; } if (s.mask & AUDIT_STATUS_PID) { + /* NOTE: we are using task_tgid_vnr() below because + * the s.pid value is relative to the namespace + * of the caller; at present this doesn't matter + * much since you can really only run auditd + * from the initial pid namespace, but something + * to keep in mind if this changes */ int new_pid = s.pid; pid_t requesting_pid = task_tgid_vnr(current); @@ -1304,9 +1310,8 @@ static void __net_exit audit_net_exit(struct net *net) auditd_reset(); mutex_unlock(&audit_cmd_mutex); - RCU_INIT_POINTER(aunet->nlsk, NULL); - synchronize_net(); netlink_kernel_release(sock); + aunet->nlsk = NULL; } static struct pernet_operations audit_net_ops __net_initdata = { @@ -2051,7 +2056,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) " euid=%u suid=%u fsuid=%u" " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", task_ppid_nr(tsk), - task_pid_nr(tsk), + task_tgid_nr(tsk), from_kuid(&init_user_ns, audit_get_loginuid(tsk)), from_kuid(&init_user_ns, cred->uid), from_kgid(&init_user_ns, cred->gid), diff --git a/kernel/auditsc.c b/kernel/auditsc.c index f78cb1b3fa74..cf1fa43512c1 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -458,7 +458,7 @@ static int audit_filter_rules(struct task_struct *tsk, switch (f->type) { case AUDIT_PID: - pid = task_pid_nr(tsk); + pid = task_tgid_nr(tsk); result = audit_comparator(pid, f->op, f->val); break; case AUDIT_PPID: @@ -1998,7 +1998,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, loginuid = from_kuid(&init_user_ns, kloginuid), tty = audit_get_tty(current); - audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid); + audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); audit_log_task_context(ab); audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", oldloginuid, loginuid, tty ? tty_name(tty) : "(none)", @@ -2228,7 +2228,7 @@ void __audit_ptrace(struct task_struct *t) { struct audit_context *context = current->audit_context; - context->target_pid = task_pid_nr(t); + context->target_pid = task_tgid_nr(t); context->target_auid = audit_get_loginuid(t); context->target_uid = task_uid(t); context->target_sessionid = audit_get_sessionid(t); @@ -2253,7 +2253,7 @@ int __audit_signal_info(int sig, struct task_struct *t) if (audit_pid && t->tgid == audit_pid) { if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { - audit_sig_pid = task_pid_nr(tsk); + audit_sig_pid = task_tgid_nr(tsk); if (uid_valid(tsk->loginuid)) audit_sig_uid = tsk->loginuid; else @@ -2353,7 +2353,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, void __audit_log_capset(const struct cred *new, const struct cred *old) { struct audit_context *context = current->audit_context; - context->capset.pid = task_pid_nr(current); + context->capset.pid = task_tgid_nr(current); context->capset.cap.effective = new->cap_effective; context->capset.cap.inheritable = new->cap_effective; context->capset.cap.permitted = new->cap_permitted; @@ -2385,7 +2385,7 @@ static void audit_log_task(struct audit_buffer *ab) from_kgid(&init_user_ns, gid), sessionid); audit_log_task_context(ab); - audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); + audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current)); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_d_path_exe(ab, current->mm); } diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index eed911d091da..1276474ac3cd 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,7 +1,8 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif +obj-$(CONFIG_CGROUP_BPF) += cgroup.o diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 633a650d7aeb..a2ac051c342f 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -538,7 +538,7 @@ static int __init register_perf_event_array_map(void) } late_initcall(register_perf_event_array_map); -#ifdef CONFIG_SOCK_CGROUP_DATA +#ifdef CONFIG_CGROUPS static void *cgroup_fd_array_get_ptr(struct bpf_map *map, struct file *map_file /* not used */, int fd) diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c new file mode 100644 index 000000000000..89b7ef41c86b --- /dev/null +++ b/kernel/bpf/bpf_lru_list.c @@ -0,0 +1,695 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/cpumask.h> +#include <linux/spinlock.h> +#include <linux/percpu.h> + +#include "bpf_lru_list.h" + +#define LOCAL_FREE_TARGET (128) +#define LOCAL_NR_SCANS LOCAL_FREE_TARGET + +#define PERCPU_FREE_TARGET (16) +#define PERCPU_NR_SCANS PERCPU_FREE_TARGET + +/* Helpers to get the local list index */ +#define LOCAL_LIST_IDX(t) ((t) - BPF_LOCAL_LIST_T_OFFSET) +#define LOCAL_FREE_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE) +#define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING) +#define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET) + +static int get_next_cpu(int cpu) +{ + cpu = cpumask_next(cpu, cpu_possible_mask); + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(cpu_possible_mask); + return cpu; +} + +/* Local list helpers */ +static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l) +{ + return &loc_l->lists[LOCAL_FREE_LIST_IDX]; +} + +static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l) +{ + return &loc_l->lists[LOCAL_PENDING_LIST_IDX]; +} + +/* bpf_lru_node helpers */ +static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node) +{ + return node->ref; +} + +static void bpf_lru_list_count_inc(struct bpf_lru_list *l, + enum bpf_lru_list_type type) +{ + if (type < NR_BPF_LRU_LIST_COUNT) + l->counts[type]++; +} + +static void bpf_lru_list_count_dec(struct bpf_lru_list *l, + enum bpf_lru_list_type type) +{ + if (type < NR_BPF_LRU_LIST_COUNT) + l->counts[type]--; +} + +static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l, + struct bpf_lru_node *node, + struct list_head *free_list, + enum bpf_lru_list_type tgt_free_type) +{ + if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) + return; + + /* If the removing node is the next_inactive_rotation candidate, + * move the next_inactive_rotation pointer also. + */ + if (&node->list == l->next_inactive_rotation) + l->next_inactive_rotation = l->next_inactive_rotation->prev; + + bpf_lru_list_count_dec(l, node->type); + + node->type = tgt_free_type; + list_move(&node->list, free_list); +} + +/* Move nodes from local list to the LRU list */ +static void __bpf_lru_node_move_in(struct bpf_lru_list *l, + struct bpf_lru_node *node, + enum bpf_lru_list_type tgt_type) +{ + if (WARN_ON_ONCE(!IS_LOCAL_LIST_TYPE(node->type)) || + WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type))) + return; + + bpf_lru_list_count_inc(l, tgt_type); + node->type = tgt_type; + node->ref = 0; + list_move(&node->list, &l->lists[tgt_type]); +} + +/* Move nodes between or within active and inactive list (like + * active to inactive, inactive to active or tail of active back to + * the head of active). + */ +static void __bpf_lru_node_move(struct bpf_lru_list *l, + struct bpf_lru_node *node, + enum bpf_lru_list_type tgt_type) +{ + if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)) || + WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type))) + return; + + if (node->type != tgt_type) { + bpf_lru_list_count_dec(l, node->type); + bpf_lru_list_count_inc(l, tgt_type); + node->type = tgt_type; + } + node->ref = 0; + + /* If the moving node is the next_inactive_rotation candidate, + * move the next_inactive_rotation pointer also. + */ + if (&node->list == l->next_inactive_rotation) + l->next_inactive_rotation = l->next_inactive_rotation->prev; + + list_move(&node->list, &l->lists[tgt_type]); +} + +static bool bpf_lru_list_inactive_low(const struct bpf_lru_list *l) +{ + return l->counts[BPF_LRU_LIST_T_INACTIVE] < + l->counts[BPF_LRU_LIST_T_ACTIVE]; +} + +/* Rotate the active list: + * 1. Start from tail + * 2. If the node has the ref bit set, it will be rotated + * back to the head of active list with the ref bit cleared. + * Give this node one more chance to survive in the active list. + * 3. If the ref bit is not set, move it to the head of the + * inactive list. + * 4. It will at most scan nr_scans nodes + */ +static void __bpf_lru_list_rotate_active(struct bpf_lru *lru, + struct bpf_lru_list *l) +{ + struct list_head *active = &l->lists[BPF_LRU_LIST_T_ACTIVE]; + struct bpf_lru_node *node, *tmp_node, *first_node; + unsigned int i = 0; + + first_node = list_first_entry(active, struct bpf_lru_node, list); + list_for_each_entry_safe_reverse(node, tmp_node, active, list) { + if (bpf_lru_node_is_ref(node)) + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); + else + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); + + if (++i == lru->nr_scans || node == first_node) + break; + } +} + +/* Rotate the inactive list. It starts from the next_inactive_rotation + * 1. If the node has ref bit set, it will be moved to the head + * of active list with the ref bit cleared. + * 2. If the node does not have ref bit set, it will leave it + * at its current location (i.e. do nothing) so that it can + * be considered during the next inactive_shrink. + * 3. It will at most scan nr_scans nodes + */ +static void __bpf_lru_list_rotate_inactive(struct bpf_lru *lru, + struct bpf_lru_list *l) +{ + struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; + struct list_head *cur, *last, *next = inactive; + struct bpf_lru_node *node; + unsigned int i = 0; + + if (list_empty(inactive)) + return; + + last = l->next_inactive_rotation->next; + if (last == inactive) + last = last->next; + + cur = l->next_inactive_rotation; + while (i < lru->nr_scans) { + if (cur == inactive) { + cur = cur->prev; + continue; + } + + node = list_entry(cur, struct bpf_lru_node, list); + next = cur->prev; + if (bpf_lru_node_is_ref(node)) + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); + if (cur == last) + break; + cur = next; + i++; + } + + l->next_inactive_rotation = next; +} + +/* Shrink the inactive list. It starts from the tail of the + * inactive list and only move the nodes without the ref bit + * set to the designated free list. + */ +static unsigned int +__bpf_lru_list_shrink_inactive(struct bpf_lru *lru, + struct bpf_lru_list *l, + unsigned int tgt_nshrink, + struct list_head *free_list, + enum bpf_lru_list_type tgt_free_type) +{ + struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; + struct bpf_lru_node *node, *tmp_node, *first_node; + unsigned int nshrinked = 0; + unsigned int i = 0; + + first_node = list_first_entry(inactive, struct bpf_lru_node, list); + list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) { + if (bpf_lru_node_is_ref(node)) { + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); + } else if (lru->del_from_htab(lru->del_arg, node)) { + __bpf_lru_node_move_to_free(l, node, free_list, + tgt_free_type); + if (++nshrinked == tgt_nshrink) + break; + } + + if (++i == lru->nr_scans) + break; + } + + return nshrinked; +} + +/* 1. Rotate the active list (if needed) + * 2. Always rotate the inactive list + */ +static void __bpf_lru_list_rotate(struct bpf_lru *lru, struct bpf_lru_list *l) +{ + if (bpf_lru_list_inactive_low(l)) + __bpf_lru_list_rotate_active(lru, l); + + __bpf_lru_list_rotate_inactive(lru, l); +} + +/* Calls __bpf_lru_list_shrink_inactive() to shrink some + * ref-bit-cleared nodes and move them to the designated + * free list. + * + * If it cannot get a free node after calling + * __bpf_lru_list_shrink_inactive(). It will just remove + * one node from either inactive or active list without + * honoring the ref-bit. It prefers inactive list to active + * list in this situation. + */ +static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru, + struct bpf_lru_list *l, + unsigned int tgt_nshrink, + struct list_head *free_list, + enum bpf_lru_list_type tgt_free_type) + +{ + struct bpf_lru_node *node, *tmp_node; + struct list_head *force_shrink_list; + unsigned int nshrinked; + + nshrinked = __bpf_lru_list_shrink_inactive(lru, l, tgt_nshrink, + free_list, tgt_free_type); + if (nshrinked) + return nshrinked; + + /* Do a force shrink by ignoring the reference bit */ + if (!list_empty(&l->lists[BPF_LRU_LIST_T_INACTIVE])) + force_shrink_list = &l->lists[BPF_LRU_LIST_T_INACTIVE]; + else + force_shrink_list = &l->lists[BPF_LRU_LIST_T_ACTIVE]; + + list_for_each_entry_safe_reverse(node, tmp_node, force_shrink_list, + list) { + if (lru->del_from_htab(lru->del_arg, node)) { + __bpf_lru_node_move_to_free(l, node, free_list, + tgt_free_type); + return 1; + } + } + + return 0; +} + +/* Flush the nodes from the local pending list to the LRU list */ +static void __local_list_flush(struct bpf_lru_list *l, + struct bpf_lru_locallist *loc_l) +{ + struct bpf_lru_node *node, *tmp_node; + + list_for_each_entry_safe_reverse(node, tmp_node, + local_pending_list(loc_l), list) { + if (bpf_lru_node_is_ref(node)) + __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_ACTIVE); + else + __bpf_lru_node_move_in(l, node, + BPF_LRU_LIST_T_INACTIVE); + } +} + +static void bpf_lru_list_push_free(struct bpf_lru_list *l, + struct bpf_lru_node *node) +{ + unsigned long flags; + + if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) + return; + + raw_spin_lock_irqsave(&l->lock, flags); + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); + raw_spin_unlock_irqrestore(&l->lock, flags); +} + +static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, + struct bpf_lru_locallist *loc_l) +{ + struct bpf_lru_list *l = &lru->common_lru.lru_list; + struct bpf_lru_node *node, *tmp_node; + unsigned int nfree = 0; + + raw_spin_lock(&l->lock); + + __local_list_flush(l, loc_l); + + __bpf_lru_list_rotate(lru, l); + + list_for_each_entry_safe(node, tmp_node, &l->lists[BPF_LRU_LIST_T_FREE], + list) { + __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l), + BPF_LRU_LOCAL_LIST_T_FREE); + if (++nfree == LOCAL_FREE_TARGET) + break; + } + + if (nfree < LOCAL_FREE_TARGET) + __bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree, + local_free_list(loc_l), + BPF_LRU_LOCAL_LIST_T_FREE); + + raw_spin_unlock(&l->lock); +} + +static void __local_list_add_pending(struct bpf_lru *lru, + struct bpf_lru_locallist *loc_l, + int cpu, + struct bpf_lru_node *node, + u32 hash) +{ + *(u32 *)((void *)node + lru->hash_offset) = hash; + node->cpu = cpu; + node->type = BPF_LRU_LOCAL_LIST_T_PENDING; + node->ref = 0; + list_add(&node->list, local_pending_list(loc_l)); +} + +struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l) +{ + struct bpf_lru_node *node; + + node = list_first_entry_or_null(local_free_list(loc_l), + struct bpf_lru_node, + list); + if (node) + list_del(&node->list); + + return node; +} + +struct bpf_lru_node *__local_list_pop_pending(struct bpf_lru *lru, + struct bpf_lru_locallist *loc_l) +{ + struct bpf_lru_node *node; + bool force = false; + +ignore_ref: + /* Get from the tail (i.e. older element) of the pending list. */ + list_for_each_entry_reverse(node, local_pending_list(loc_l), + list) { + if ((!bpf_lru_node_is_ref(node) || force) && + lru->del_from_htab(lru->del_arg, node)) { + list_del(&node->list); + return node; + } + } + + if (!force) { + force = true; + goto ignore_ref; + } + + return NULL; +} + +static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru, + u32 hash) +{ + struct list_head *free_list; + struct bpf_lru_node *node = NULL; + struct bpf_lru_list *l; + unsigned long flags; + int cpu = raw_smp_processor_id(); + + l = per_cpu_ptr(lru->percpu_lru, cpu); + + raw_spin_lock_irqsave(&l->lock, flags); + + __bpf_lru_list_rotate(lru, l); + + free_list = &l->lists[BPF_LRU_LIST_T_FREE]; + if (list_empty(free_list)) + __bpf_lru_list_shrink(lru, l, PERCPU_FREE_TARGET, free_list, + BPF_LRU_LIST_T_FREE); + + if (!list_empty(free_list)) { + node = list_first_entry(free_list, struct bpf_lru_node, list); + *(u32 *)((void *)node + lru->hash_offset) = hash; + node->ref = 0; + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); + } + + raw_spin_unlock_irqrestore(&l->lock, flags); + + return node; +} + +static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, + u32 hash) +{ + struct bpf_lru_locallist *loc_l, *steal_loc_l; + struct bpf_common_lru *clru = &lru->common_lru; + struct bpf_lru_node *node; + int steal, first_steal; + unsigned long flags; + int cpu = raw_smp_processor_id(); + + loc_l = per_cpu_ptr(clru->local_list, cpu); + + raw_spin_lock_irqsave(&loc_l->lock, flags); + + node = __local_list_pop_free(loc_l); + if (!node) { + bpf_lru_list_pop_free_to_local(lru, loc_l); + node = __local_list_pop_free(loc_l); + } + + if (node) + __local_list_add_pending(lru, loc_l, cpu, node, hash); + + raw_spin_unlock_irqrestore(&loc_l->lock, flags); + + if (node) + return node; + + /* No free nodes found from the local free list and + * the global LRU list. + * + * Steal from the local free/pending list of the + * current CPU and remote CPU in RR. It starts + * with the loc_l->next_steal CPU. + */ + + first_steal = loc_l->next_steal; + steal = first_steal; + do { + steal_loc_l = per_cpu_ptr(clru->local_list, steal); + + raw_spin_lock_irqsave(&steal_loc_l->lock, flags); + + node = __local_list_pop_free(steal_loc_l); + if (!node) + node = __local_list_pop_pending(lru, steal_loc_l); + + raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags); + + steal = get_next_cpu(steal); + } while (!node && steal != first_steal); + + loc_l->next_steal = steal; + + if (node) { + raw_spin_lock_irqsave(&loc_l->lock, flags); + __local_list_add_pending(lru, loc_l, cpu, node, hash); + raw_spin_unlock_irqrestore(&loc_l->lock, flags); + } + + return node; +} + +struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash) +{ + if (lru->percpu) + return bpf_percpu_lru_pop_free(lru, hash); + else + return bpf_common_lru_pop_free(lru, hash); +} + +static void bpf_common_lru_push_free(struct bpf_lru *lru, + struct bpf_lru_node *node) +{ + unsigned long flags; + + if (WARN_ON_ONCE(node->type == BPF_LRU_LIST_T_FREE) || + WARN_ON_ONCE(node->type == BPF_LRU_LOCAL_LIST_T_FREE)) + return; + + if (node->type == BPF_LRU_LOCAL_LIST_T_PENDING) { + struct bpf_lru_locallist *loc_l; + + loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu); + + raw_spin_lock_irqsave(&loc_l->lock, flags); + + if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) { + raw_spin_unlock_irqrestore(&loc_l->lock, flags); + goto check_lru_list; + } + + node->type = BPF_LRU_LOCAL_LIST_T_FREE; + node->ref = 0; + list_move(&node->list, local_free_list(loc_l)); + + raw_spin_unlock_irqrestore(&loc_l->lock, flags); + return; + } + +check_lru_list: + bpf_lru_list_push_free(&lru->common_lru.lru_list, node); +} + +static void bpf_percpu_lru_push_free(struct bpf_lru *lru, + struct bpf_lru_node *node) +{ + struct bpf_lru_list *l; + unsigned long flags; + + l = per_cpu_ptr(lru->percpu_lru, node->cpu); + + raw_spin_lock_irqsave(&l->lock, flags); + + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); + + raw_spin_unlock_irqrestore(&l->lock, flags); +} + +void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) +{ + if (lru->percpu) + bpf_percpu_lru_push_free(lru, node); + else + bpf_common_lru_push_free(lru, node); +} + +void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, + u32 elem_size, u32 nr_elems) +{ + struct bpf_lru_list *l = &lru->common_lru.lru_list; + u32 i; + + for (i = 0; i < nr_elems; i++) { + struct bpf_lru_node *node; + + node = (struct bpf_lru_node *)(buf + node_offset); + node->type = BPF_LRU_LIST_T_FREE; + node->ref = 0; + list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); + buf += elem_size; + } +} + +void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, + u32 elem_size, u32 nr_elems) +{ + u32 i, pcpu_entries; + int cpu; + struct bpf_lru_list *l; + + pcpu_entries = nr_elems / num_possible_cpus(); + + i = 0; + + for_each_possible_cpu(cpu) { + struct bpf_lru_node *node; + + l = per_cpu_ptr(lru->percpu_lru, cpu); +again: + node = (struct bpf_lru_node *)(buf + node_offset); + node->cpu = cpu; + node->type = BPF_LRU_LIST_T_FREE; + node->ref = 0; + list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); + i++; + buf += elem_size; + if (i == nr_elems) + break; + if (i % pcpu_entries) + goto again; + } +} + +void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, + u32 elem_size, u32 nr_elems) +{ + if (lru->percpu) + bpf_percpu_lru_populate(lru, buf, node_offset, elem_size, + nr_elems); + else + bpf_common_lru_populate(lru, buf, node_offset, elem_size, + nr_elems); +} + +static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu) +{ + int i; + + for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++) + INIT_LIST_HEAD(&loc_l->lists[i]); + + loc_l->next_steal = cpu; + + raw_spin_lock_init(&loc_l->lock); +} + +static void bpf_lru_list_init(struct bpf_lru_list *l) +{ + int i; + + for (i = 0; i < NR_BPF_LRU_LIST_T; i++) + INIT_LIST_HEAD(&l->lists[i]); + + for (i = 0; i < NR_BPF_LRU_LIST_COUNT; i++) + l->counts[i] = 0; + + l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE]; + + raw_spin_lock_init(&l->lock); +} + +int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, + del_from_htab_func del_from_htab, void *del_arg) +{ + int cpu; + + if (percpu) { + lru->percpu_lru = alloc_percpu(struct bpf_lru_list); + if (!lru->percpu_lru) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct bpf_lru_list *l; + + l = per_cpu_ptr(lru->percpu_lru, cpu); + bpf_lru_list_init(l); + } + lru->nr_scans = PERCPU_NR_SCANS; + } else { + struct bpf_common_lru *clru = &lru->common_lru; + + clru->local_list = alloc_percpu(struct bpf_lru_locallist); + if (!clru->local_list) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct bpf_lru_locallist *loc_l; + + loc_l = per_cpu_ptr(clru->local_list, cpu); + bpf_lru_locallist_init(loc_l, cpu); + } + + bpf_lru_list_init(&clru->lru_list); + lru->nr_scans = LOCAL_NR_SCANS; + } + + lru->percpu = percpu; + lru->del_from_htab = del_from_htab; + lru->del_arg = del_arg; + lru->hash_offset = hash_offset; + + return 0; +} + +void bpf_lru_destroy(struct bpf_lru *lru) +{ + if (lru->percpu) + free_percpu(lru->percpu_lru); + else + free_percpu(lru->common_lru.local_list); +} diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h new file mode 100644 index 000000000000..5c35a98d02bf --- /dev/null +++ b/kernel/bpf/bpf_lru_list.h @@ -0,0 +1,84 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef __BPF_LRU_LIST_H_ +#define __BPF_LRU_LIST_H_ + +#include <linux/list.h> +#include <linux/spinlock_types.h> + +#define NR_BPF_LRU_LIST_T (3) +#define NR_BPF_LRU_LIST_COUNT (2) +#define NR_BPF_LRU_LOCAL_LIST_T (2) +#define BPF_LOCAL_LIST_T_OFFSET NR_BPF_LRU_LIST_T + +enum bpf_lru_list_type { + BPF_LRU_LIST_T_ACTIVE, + BPF_LRU_LIST_T_INACTIVE, + BPF_LRU_LIST_T_FREE, + BPF_LRU_LOCAL_LIST_T_FREE, + BPF_LRU_LOCAL_LIST_T_PENDING, +}; + +struct bpf_lru_node { + struct list_head list; + u16 cpu; + u8 type; + u8 ref; +}; + +struct bpf_lru_list { + struct list_head lists[NR_BPF_LRU_LIST_T]; + unsigned int counts[NR_BPF_LRU_LIST_COUNT]; + /* The next inacitve list rotation starts from here */ + struct list_head *next_inactive_rotation; + + raw_spinlock_t lock ____cacheline_aligned_in_smp; +}; + +struct bpf_lru_locallist { + struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T]; + u16 next_steal; + raw_spinlock_t lock; +}; + +struct bpf_common_lru { + struct bpf_lru_list lru_list; + struct bpf_lru_locallist __percpu *local_list; +}; + +typedef bool (*del_from_htab_func)(void *arg, struct bpf_lru_node *node); + +struct bpf_lru { + union { + struct bpf_common_lru common_lru; + struct bpf_lru_list __percpu *percpu_lru; + }; + del_from_htab_func del_from_htab; + void *del_arg; + unsigned int hash_offset; + unsigned int nr_scans; + bool percpu; +}; + +static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node) +{ + /* ref is an approximation on access frequency. It does not + * have to be very accurate. Hence, no protection is used. + */ + node->ref = 1; +} + +int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, + del_from_htab_func del_from_htab, void *delete_arg); +void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, + u32 elem_size, u32 nr_elems); +void bpf_lru_destroy(struct bpf_lru *lru); +struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash); +void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node); +void bpf_lru_promote(struct bpf_lru *lru, struct bpf_lru_node *node); + +#endif diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c new file mode 100644 index 000000000000..a515f7b007c6 --- /dev/null +++ b/kernel/bpf/cgroup.c @@ -0,0 +1,200 @@ +/* + * Functions to manage eBPF programs attached to cgroups + * + * Copyright (c) 2016 Daniel Mack + * + * This file is subject to the terms and conditions of version 2 of the GNU + * General Public License. See the file COPYING in the main directory of the + * Linux distribution for more details. + */ + +#include <linux/kernel.h> +#include <linux/atomic.h> +#include <linux/cgroup.h> +#include <linux/slab.h> +#include <linux/bpf.h> +#include <linux/bpf-cgroup.h> +#include <net/sock.h> + +DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); +EXPORT_SYMBOL(cgroup_bpf_enabled_key); + +/** + * cgroup_bpf_put() - put references of all bpf programs + * @cgrp: the cgroup to modify + */ +void cgroup_bpf_put(struct cgroup *cgrp) +{ + unsigned int type; + + for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) { + struct bpf_prog *prog = cgrp->bpf.prog[type]; + + if (prog) { + bpf_prog_put(prog); + static_branch_dec(&cgroup_bpf_enabled_key); + } + } +} + +/** + * cgroup_bpf_inherit() - inherit effective programs from parent + * @cgrp: the cgroup to modify + * @parent: the parent to inherit from + */ +void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent) +{ + unsigned int type; + + for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) { + struct bpf_prog *e; + + e = rcu_dereference_protected(parent->bpf.effective[type], + lockdep_is_held(&cgroup_mutex)); + rcu_assign_pointer(cgrp->bpf.effective[type], e); + } +} + +/** + * __cgroup_bpf_update() - Update the pinned program of a cgroup, and + * propagate the change to descendants + * @cgrp: The cgroup which descendants to traverse + * @parent: The parent of @cgrp, or %NULL if @cgrp is the root + * @prog: A new program to pin + * @type: Type of pinning operation (ingress/egress) + * + * Each cgroup has a set of two pointers for bpf programs; one for eBPF + * programs it owns, and which is effective for execution. + * + * If @prog is not %NULL, this function attaches a new program to the cgroup + * and releases the one that is currently attached, if any. @prog is then made + * the effective program of type @type in that cgroup. + * + * If @prog is %NULL, the currently attached program of type @type is released, + * and the effective program of the parent cgroup (if any) is inherited to + * @cgrp. + * + * Then, the descendants of @cgrp are walked and the effective program for + * each of them is set to the effective program of @cgrp unless the + * descendant has its own program attached, in which case the subbranch is + * skipped. This ensures that delegated subcgroups with own programs are left + * untouched. + * + * Must be called with cgroup_mutex held. + */ +void __cgroup_bpf_update(struct cgroup *cgrp, + struct cgroup *parent, + struct bpf_prog *prog, + enum bpf_attach_type type) +{ + struct bpf_prog *old_prog, *effective; + struct cgroup_subsys_state *pos; + + old_prog = xchg(cgrp->bpf.prog + type, prog); + + effective = (!prog && parent) ? + rcu_dereference_protected(parent->bpf.effective[type], + lockdep_is_held(&cgroup_mutex)) : + prog; + + css_for_each_descendant_pre(pos, &cgrp->self) { + struct cgroup *desc = container_of(pos, struct cgroup, self); + + /* skip the subtree if the descendant has its own program */ + if (desc->bpf.prog[type] && desc != cgrp) + pos = css_rightmost_descendant(pos); + else + rcu_assign_pointer(desc->bpf.effective[type], + effective); + } + + if (prog) + static_branch_inc(&cgroup_bpf_enabled_key); + + if (old_prog) { + bpf_prog_put(old_prog); + static_branch_dec(&cgroup_bpf_enabled_key); + } +} + +/** + * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering + * @sk: The socken sending or receiving traffic + * @skb: The skb that is being sent or received + * @type: The type of program to be exectuted + * + * If no socket is passed, or the socket is not of type INET or INET6, + * this function does nothing and returns 0. + * + * The program type passed in via @type must be suitable for network + * filtering. No further check is performed to assert that. + * + * This function will return %-EPERM if any if an attached program was found + * and if it returned != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter_skb(struct sock *sk, + struct sk_buff *skb, + enum bpf_attach_type type) +{ + struct bpf_prog *prog; + struct cgroup *cgrp; + int ret = 0; + + if (!sk || !sk_fullsock(sk)) + return 0; + + if (sk->sk_family != AF_INET && + sk->sk_family != AF_INET6) + return 0; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + + rcu_read_lock(); + + prog = rcu_dereference(cgrp->bpf.effective[type]); + if (prog) { + unsigned int offset = skb->data - skb_network_header(skb); + + __skb_push(skb, offset); + ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM; + __skb_pull(skb, offset); + } + + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); + +/** + * __cgroup_bpf_run_filter_sk() - Run a program on a sock + * @sk: sock structure to manipulate + * @type: The type of program to be exectuted + * + * socket is passed is expected to be of type INET or INET6. + * + * The program type passed in via @type must be suitable for sock + * filtering. No further check is performed to assert that. + * + * This function will return %-EPERM if any if an attached program was found + * and if it returned != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter_sk(struct sock *sk, + enum bpf_attach_type type) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_prog *prog; + int ret = 0; + + + rcu_read_lock(); + + prog = rcu_dereference(cgrp->bpf.effective[type]); + if (prog) + ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM; + + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 03fd23d4d587..83e0d153b0b4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -136,6 +136,71 @@ void __bpf_prog_free(struct bpf_prog *fp) vfree(fp); } +#define SHA_BPF_RAW_SIZE \ + round_up(MAX_BPF_SIZE + sizeof(__be64) + 1, SHA_MESSAGE_BYTES) + +/* Called under verifier mutex. */ +void bpf_prog_calc_digest(struct bpf_prog *fp) +{ + const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64); + static u32 ws[SHA_WORKSPACE_WORDS]; + static u8 raw[SHA_BPF_RAW_SIZE]; + struct bpf_insn *dst = (void *)raw; + u32 i, bsize, psize, blocks; + bool was_ld_map; + u8 *todo = raw; + __be32 *result; + __be64 *bits; + + sha_init(fp->digest); + memset(ws, 0, sizeof(ws)); + + /* We need to take out the map fd for the digest calculation + * since they are unstable from user space side. + */ + for (i = 0, was_ld_map = false; i < fp->len; i++) { + dst[i] = fp->insnsi[i]; + if (!was_ld_map && + dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) && + dst[i].src_reg == BPF_PSEUDO_MAP_FD) { + was_ld_map = true; + dst[i].imm = 0; + } else if (was_ld_map && + dst[i].code == 0 && + dst[i].dst_reg == 0 && + dst[i].src_reg == 0 && + dst[i].off == 0) { + was_ld_map = false; + dst[i].imm = 0; + } else { + was_ld_map = false; + } + } + + psize = fp->len * sizeof(struct bpf_insn); + memset(&raw[psize], 0, sizeof(raw) - psize); + raw[psize++] = 0x80; + + bsize = round_up(psize, SHA_MESSAGE_BYTES); + blocks = bsize / SHA_MESSAGE_BYTES; + if (bsize - psize >= sizeof(__be64)) { + bits = (__be64 *)(todo + bsize - sizeof(__be64)); + } else { + bits = (__be64 *)(todo + bsize + bits_offset); + blocks++; + } + *bits = cpu_to_be64((psize - 1) << 3); + + while (blocks--) { + sha_transform(fp->digest, todo, ws); + todo += SHA_MESSAGE_BYTES; + } + + result = (__force __be32 *)fp->digest; + for (i = 0; i < SHA_DIGEST_WORDS; i++) + result[i] = cpu_to_be32(fp->digest[i]); +} + static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn) { return BPF_CLASS(insn->code) == BPF_JMP && @@ -1018,7 +1083,7 @@ void bpf_user_rnd_init_once(void) prandom_init_once(&bpf_user_rnd_state); } -u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_user_rnd_u32) { /* Should someone ever have the rather unwise idea to use some * of the registers passed into this function, then note that @@ -1031,7 +1096,7 @@ u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) state = &get_cpu_var(bpf_user_rnd_state); res = prandom_u32_state(state); - put_cpu_var(state); + put_cpu_var(bpf_user_rnd_state); return res; } @@ -1043,6 +1108,7 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; +const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; @@ -1077,7 +1143,7 @@ struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog) return prog; } -bool __weak bpf_helper_changes_skb_data(void *func) +bool __weak bpf_helper_changes_pkt_data(void *func) { return false; } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 570eeca7bdfa..34debc1a9641 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -15,6 +15,7 @@ #include <linux/filter.h> #include <linux/vmalloc.h> #include "percpu_freelist.h" +#include "bpf_lru_list.h" struct bucket { struct hlist_head head; @@ -25,7 +26,10 @@ struct bpf_htab { struct bpf_map map; struct bucket *buckets; void *elems; - struct pcpu_freelist freelist; + union { + struct pcpu_freelist freelist; + struct bpf_lru lru; + }; void __percpu *extra_elems; atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ @@ -48,11 +52,26 @@ struct htab_elem { union { struct rcu_head rcu; enum extra_elem_state state; + struct bpf_lru_node lru_node; }; u32 hash; char key[0] __aligned(8); }; +static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); + +static bool htab_is_lru(const struct bpf_htab *htab) +{ + return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH || + htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; +} + +static bool htab_is_percpu(const struct bpf_htab *htab) +{ + return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH || + htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; +} + static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { @@ -73,7 +92,7 @@ static void htab_free_elems(struct bpf_htab *htab) { int i; - if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + if (!htab_is_percpu(htab)) goto free_elems; for (i = 0; i < htab->map.max_entries; i++) { @@ -87,7 +106,22 @@ free_elems: vfree(htab->elems); } -static int prealloc_elems_and_freelist(struct bpf_htab *htab) +static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, + u32 hash) +{ + struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash); + struct htab_elem *l; + + if (node) { + l = container_of(node, struct htab_elem, lru_node); + memcpy(l->key, key, htab->map.key_size); + return l; + } + + return NULL; +} + +static int prealloc_init(struct bpf_htab *htab) { int err = -ENOMEM, i; @@ -95,7 +129,7 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab) if (!htab->elems) return -ENOMEM; - if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + if (!htab_is_percpu(htab)) goto skip_percpu_elems; for (i = 0; i < htab->map.max_entries; i++) { @@ -110,12 +144,27 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab) } skip_percpu_elems: - err = pcpu_freelist_init(&htab->freelist); + if (htab_is_lru(htab)) + err = bpf_lru_init(&htab->lru, + htab->map.map_flags & BPF_F_NO_COMMON_LRU, + offsetof(struct htab_elem, hash) - + offsetof(struct htab_elem, lru_node), + htab_lru_map_delete_node, + htab); + else + err = pcpu_freelist_init(&htab->freelist); + if (err) goto free_elems; - pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size, - htab->map.max_entries); + if (htab_is_lru(htab)) + bpf_lru_populate(&htab->lru, htab->elems, + offsetof(struct htab_elem, lru_node), + htab->elem_size, htab->map.max_entries); + else + pcpu_freelist_populate(&htab->freelist, htab->elems, + htab->elem_size, htab->map.max_entries); + return 0; free_elems: @@ -123,6 +172,16 @@ free_elems: return err; } +static void prealloc_destroy(struct bpf_htab *htab) +{ + htab_free_elems(htab); + + if (htab_is_lru(htab)) + bpf_lru_destroy(&htab->lru); + else + pcpu_freelist_destroy(&htab->freelist); +} + static int alloc_extra_elems(struct bpf_htab *htab) { void __percpu *pptr; @@ -143,15 +202,37 @@ static int alloc_extra_elems(struct bpf_htab *htab) /* Called from syscall */ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { - bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH; + bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || + attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); + bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || + attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); + /* percpu_lru means each cpu has its own LRU list. + * it is different from BPF_MAP_TYPE_PERCPU_HASH where + * the map's value itself is percpu. percpu_lru has + * nothing to do with the map's value. + */ + bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); + bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); struct bpf_htab *htab; int err, i; u64 cost; - if (attr->map_flags & ~BPF_F_NO_PREALLOC) + if (lru && !capable(CAP_SYS_ADMIN)) + /* LRU implementation is much complicated than other + * maps. Hence, limit to CAP_SYS_ADMIN for now. + */ + return ERR_PTR(-EPERM); + + if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU)) /* reserved bits should not be used */ return ERR_PTR(-EINVAL); + if (!lru && percpu_lru) + return ERR_PTR(-EINVAL); + + if (lru && !prealloc) + return ERR_PTR(-ENOTSUPP); + htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); @@ -171,6 +252,18 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->map.value_size == 0) goto free_htab; + if (percpu_lru) { + /* ensure each CPU's lru list has >=1 elements. + * since we are at it, make each lru list has the same + * number of elements. + */ + htab->map.max_entries = roundup(attr->max_entries, + num_possible_cpus()); + if (htab->map.max_entries < attr->max_entries) + htab->map.max_entries = rounddown(attr->max_entries, + num_possible_cpus()); + } + /* hash table size must be power of 2 */ htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); @@ -241,14 +334,17 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) raw_spin_lock_init(&htab->buckets[i].lock); } - if (!percpu) { + if (!percpu && !lru) { + /* lru itself can remove the least used element, so + * there is no need for an extra elem during map_update. + */ err = alloc_extra_elems(htab); if (err) goto free_buckets; } - if (!(attr->map_flags & BPF_F_NO_PREALLOC)) { - err = prealloc_elems_and_freelist(htab); + if (prealloc) { + err = prealloc_init(htab); if (err) goto free_extra_elems; } @@ -323,6 +419,46 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + + if (l) { + bpf_lru_node_set_ref(&l->lru_node); + return l->key + round_up(map->key_size, 8); + } + + return NULL; +} + +/* It is called from the bpf_lru_list when the LRU needs to delete + * older elements from the htab. + */ +static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) +{ + struct bpf_htab *htab = (struct bpf_htab *)arg; + struct htab_elem *l, *tgt_l; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + + tgt_l = container_of(node, struct htab_elem, lru_node); + b = __select_bucket(htab, tgt_l->hash); + head = &b->head; + + raw_spin_lock_irqsave(&b->lock, flags); + + hlist_for_each_entry_rcu(l, head, hash_node) + if (l == tgt_l) { + hlist_del_rcu(&l->hash_node); + break; + } + + raw_spin_unlock_irqrestore(&b->lock, flags); + + return l == tgt_l; +} + /* Called from syscall */ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { @@ -420,6 +556,24 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) } } +static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, + void *value, bool onallcpus) +{ + if (!onallcpus) { + /* copy true value_size bytes */ + memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); + } else { + u32 size = round_up(htab->map.value_size, 8); + int off = 0, cpu; + + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), + value + off, size); + off += size; + } + } +} + static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, @@ -479,18 +633,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, } } - if (!onallcpus) { - /* copy true value_size bytes */ - memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); - } else { - int off = 0, cpu; + pcpu_copy_value(htab, pptr, value, onallcpus); - for_each_possible_cpu(cpu) { - bpf_long_memcpy(per_cpu_ptr(pptr, cpu), - value + off, size); - off += size; - } - } if (!prealloc) htab_elem_set_ptr(l_new, key_size, pptr); } else { @@ -571,6 +715,70 @@ err: return ret; } +static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l_new, *l_old = NULL; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + u32 key_size, hash; + int ret; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + b = __select_bucket(htab, hash); + head = &b->head; + + /* For LRU, we need to alloc before taking bucket's + * spinlock because getting free nodes from LRU may need + * to remove older elements from htab and this removal + * operation will need a bucket lock. + */ + l_new = prealloc_lru_pop(htab, key, hash); + if (!l_new) + return -ENOMEM; + memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size); + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_irqsave(&b->lock, flags); + + l_old = lookup_elem_raw(head, hash, key, key_size); + + ret = check_flags(htab, l_old, map_flags); + if (ret) + goto err; + + /* add new element to the head of the list, so that + * concurrent search will find it before old elem + */ + hlist_add_head_rcu(&l_new->hash_node, head); + if (l_old) { + bpf_lru_node_set_ref(&l_new->lru_node); + hlist_del_rcu(&l_old->hash_node); + } + ret = 0; + +err: + raw_spin_unlock_irqrestore(&b->lock, flags); + + if (ret) + bpf_lru_push_free(&htab->lru, &l_new->lru_node); + else if (l_old) + bpf_lru_push_free(&htab->lru, &l_old->lru_node); + + return ret; +} + static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags, bool onallcpus) @@ -606,22 +814,9 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, goto err; if (l_old) { - void __percpu *pptr = htab_elem_get_ptr(l_old, key_size); - u32 size = htab->map.value_size; - /* per-cpu hash map can update value in-place */ - if (!onallcpus) { - memcpy(this_cpu_ptr(pptr), value, size); - } else { - int off = 0, cpu; - - size = round_up(size, 8); - for_each_possible_cpu(cpu) { - bpf_long_memcpy(per_cpu_ptr(pptr, cpu), - value + off, size); - off += size; - } - } + pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), + value, onallcpus); } else { l_new = alloc_htab_elem(htab, key, value, key_size, hash, true, onallcpus, false); @@ -637,12 +832,84 @@ err: return ret; } +static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags, + bool onallcpus) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l_new = NULL, *l_old; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + u32 key_size, hash; + int ret; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + b = __select_bucket(htab, hash); + head = &b->head; + + /* For LRU, we need to alloc before taking bucket's + * spinlock because LRU's elem alloc may need + * to remove older elem from htab and this removal + * operation will need a bucket lock. + */ + if (map_flags != BPF_EXIST) { + l_new = prealloc_lru_pop(htab, key, hash); + if (!l_new) + return -ENOMEM; + } + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_irqsave(&b->lock, flags); + + l_old = lookup_elem_raw(head, hash, key, key_size); + + ret = check_flags(htab, l_old, map_flags); + if (ret) + goto err; + + if (l_old) { + bpf_lru_node_set_ref(&l_old->lru_node); + + /* per-cpu hash map can update value in-place */ + pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), + value, onallcpus); + } else { + pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size), + value, onallcpus); + hlist_add_head_rcu(&l_new->hash_node, head); + l_new = NULL; + } + ret = 0; +err: + raw_spin_unlock_irqrestore(&b->lock, flags); + if (l_new) + bpf_lru_push_free(&htab->lru, &l_new->lru_node); + return ret; +} + static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return __htab_percpu_map_update_elem(map, key, value, map_flags, false); } +static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + return __htab_lru_percpu_map_update_elem(map, key, value, map_flags, + false); +} + /* Called from syscall or from eBPF program */ static int htab_map_delete_elem(struct bpf_map *map, void *key) { @@ -676,6 +943,39 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) return ret; } +static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct bucket *b; + struct htab_elem *l; + unsigned long flags; + u32 hash, key_size; + int ret = -ENOENT; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; + + raw_spin_lock_irqsave(&b->lock, flags); + + l = lookup_elem_raw(head, hash, key, key_size); + + if (l) { + hlist_del_rcu(&l->hash_node); + ret = 0; + } + + raw_spin_unlock_irqrestore(&b->lock, flags); + if (l) + bpf_lru_push_free(&htab->lru, &l->lru_node); + return ret; +} + static void delete_all_elements(struct bpf_htab *htab) { int i; @@ -687,7 +987,8 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_for_each_entry_safe(l, n, head, hash_node) { hlist_del_rcu(&l->hash_node); - htab_elem_free(htab, l); + if (l->state != HTAB_EXTRA_ELEM_USED) + htab_elem_free(htab, l); } } } @@ -707,12 +1008,11 @@ static void htab_map_free(struct bpf_map *map) * not have executed. Wait for them. */ rcu_barrier(); - if (htab->map.map_flags & BPF_F_NO_PREALLOC) { + if (htab->map.map_flags & BPF_F_NO_PREALLOC) delete_all_elements(htab); - } else { - htab_free_elems(htab); - pcpu_freelist_destroy(&htab->freelist); - } + else + prealloc_destroy(htab); + free_percpu(htab->extra_elems); kvfree(htab->buckets); kfree(htab); @@ -732,6 +1032,20 @@ static struct bpf_map_type_list htab_type __read_mostly = { .type = BPF_MAP_TYPE_HASH, }; +static const struct bpf_map_ops htab_lru_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_lru_map_lookup_elem, + .map_update_elem = htab_lru_map_update_elem, + .map_delete_elem = htab_lru_map_delete_elem, +}; + +static struct bpf_map_type_list htab_lru_type __read_mostly = { + .ops = &htab_lru_ops, + .type = BPF_MAP_TYPE_LRU_HASH, +}; + /* Called from eBPF program */ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) { @@ -743,8 +1057,21 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + + if (l) { + bpf_lru_node_set_ref(&l->lru_node); + return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); + } + + return NULL; +} + int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) { + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l; void __percpu *pptr; int ret = -ENOENT; @@ -760,6 +1087,8 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) l = __htab_map_lookup_elem(map, key); if (!l) goto out; + if (htab_is_lru(htab)) + bpf_lru_node_set_ref(&l->lru_node); pptr = htab_elem_get_ptr(l, map->key_size); for_each_possible_cpu(cpu) { bpf_long_memcpy(value + off, @@ -775,10 +1104,16 @@ out: int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 map_flags) { + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); int ret; rcu_read_lock(); - ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true); + if (htab_is_lru(htab)) + ret = __htab_lru_percpu_map_update_elem(map, key, value, + map_flags, true); + else + ret = __htab_percpu_map_update_elem(map, key, value, map_flags, + true); rcu_read_unlock(); return ret; @@ -798,10 +1133,26 @@ static struct bpf_map_type_list htab_percpu_type __read_mostly = { .type = BPF_MAP_TYPE_PERCPU_HASH, }; +static const struct bpf_map_ops htab_lru_percpu_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_lru_percpu_map_lookup_elem, + .map_update_elem = htab_lru_percpu_map_update_elem, + .map_delete_elem = htab_lru_map_delete_elem, +}; + +static struct bpf_map_type_list htab_lru_percpu_type __read_mostly = { + .ops = &htab_lru_percpu_ops, + .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, +}; + static int __init register_htab_map(void) { bpf_register_map_type(&htab_type); bpf_register_map_type(&htab_percpu_type); + bpf_register_map_type(&htab_lru_type); + bpf_register_map_type(&htab_lru_percpu_type); return 0; } late_initcall(register_htab_map); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1ea3afba1a4f..045cbe673356 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -13,9 +13,11 @@ #include <linux/rcupdate.h> #include <linux/random.h> #include <linux/smp.h> +#include <linux/topology.h> #include <linux/ktime.h> #include <linux/sched.h> #include <linux/uidgid.h> +#include <linux/filter.h> /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -26,48 +28,32 @@ * if program is allowed to access maps, so check rcu_read_lock_held in * all three functions. */ -static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) { - /* verifier checked that R1 contains a valid pointer to bpf_map - * and R2 points to a program stack and map->key_size bytes were - * initialized - */ - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; - void *key = (void *) (unsigned long) r2; - void *value; - WARN_ON_ONCE(!rcu_read_lock_held()); - - value = map->ops->map_lookup_elem(map, key); - - /* lookup() returns either pointer to element value or NULL - * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type - */ - return (unsigned long) value; + return (unsigned long) map->ops->map_lookup_elem(map, key); } const struct bpf_func_proto bpf_map_lookup_elem_proto = { .func = bpf_map_lookup_elem, .gpl_only = false, + .pkt_access = true, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, }; -static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, + void *, value, u64, flags) { - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; - void *key = (void *) (unsigned long) r2; - void *value = (void *) (unsigned long) r3; - WARN_ON_ONCE(!rcu_read_lock_held()); - - return map->ops->map_update_elem(map, key, value, r4); + return map->ops->map_update_elem(map, key, value, flags); } const struct bpf_func_proto bpf_map_update_elem_proto = { .func = bpf_map_update_elem, .gpl_only = false, + .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, @@ -75,19 +61,16 @@ const struct bpf_func_proto bpf_map_update_elem_proto = { .arg4_type = ARG_ANYTHING, }; -static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) { - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; - void *key = (void *) (unsigned long) r2; - WARN_ON_ONCE(!rcu_read_lock_held()); - return map->ops->map_delete_elem(map, key); } const struct bpf_func_proto bpf_map_delete_elem_proto = { .func = bpf_map_delete_elem, .gpl_only = false, + .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, @@ -99,7 +82,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_smp_processor_id) { return smp_processor_id(); } @@ -110,7 +93,18 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_numa_node_id) +{ + return numa_node_id(); +} + +const struct bpf_func_proto bpf_get_numa_node_id_proto = { + .func = bpf_get_numa_node_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +BPF_CALL_0(bpf_ktime_get_ns) { /* NMI safe access to clock monotonic */ return ktime_get_mono_fast_ns(); @@ -122,11 +116,11 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_current_pid_tgid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_current_pid_tgid) { struct task_struct *task = current; - if (!task) + if (unlikely(!task)) return -EINVAL; return (u64) task->tgid << 32 | task->pid; @@ -138,18 +132,18 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_current_uid_gid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_current_uid_gid) { struct task_struct *task = current; kuid_t uid; kgid_t gid; - if (!task) + if (unlikely(!task)) return -EINVAL; current_uid_gid(&uid, &gid); return (u64) from_kgid(&init_user_ns, gid) << 32 | - from_kuid(&init_user_ns, uid); + from_kuid(&init_user_ns, uid); } const struct bpf_func_proto bpf_get_current_uid_gid_proto = { @@ -158,10 +152,9 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size) { struct task_struct *task = current; - char *buf = (char *) (long) r1; if (unlikely(!task)) goto err_clear; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5967b870a895..0b030c9126d3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -18,6 +18,7 @@ #include <linux/namei.h> #include <linux/fs.h> #include <linux/kdev_t.h> +#include <linux/parser.h> #include <linux/filter.h> #include <linux/bpf.h> @@ -87,6 +88,7 @@ static struct inode *bpf_get_inode(struct super_block *sb, switch (mode & S_IFMT) { case S_IFDIR: case S_IFREG: + case S_IFLNK: break; default: return ERR_PTR(-EINVAL); @@ -97,7 +99,7 @@ static struct inode *bpf_get_inode(struct super_block *sb, return ERR_PTR(-ENOSPC); inode->i_ino = get_next_ino(); - inode->i_atime = CURRENT_TIME; + inode->i_atime = current_time(inode); inode->i_mtime = inode->i_atime; inode->i_ctime = inode->i_atime; @@ -119,6 +121,16 @@ static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) return 0; } +static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, + struct inode *dir) +{ + d_instantiate(dentry, inode); + dget(dentry); + + dir->i_mtime = current_time(dir); + dir->i_ctime = dir->i_mtime; +} + static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -133,9 +145,7 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inc_nlink(inode); inc_nlink(dir); - d_instantiate(dentry, inode); - dget(dentry); - + bpf_dentry_finalize(dentry, inode, dir); return 0; } @@ -151,9 +161,7 @@ static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry, inode->i_op = iops; inode->i_private = dentry->d_fsdata; - d_instantiate(dentry, inode); - dget(dentry); - + bpf_dentry_finalize(dentry, inode, dir); return 0; } @@ -181,13 +189,37 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) { if (strchr(dentry->d_name.name, '.')) return ERR_PTR(-EPERM); + return simple_lookup(dir, dentry, flags); } +static int bpf_symlink(struct inode *dir, struct dentry *dentry, + const char *target) +{ + char *link = kstrdup(target, GFP_USER | __GFP_NOWARN); + struct inode *inode; + + if (!link) + return -ENOMEM; + + inode = bpf_get_inode(dir->i_sb, dir, S_IRWXUGO | S_IFLNK); + if (IS_ERR(inode)) { + kfree(link); + return PTR_ERR(inode); + } + + inode->i_op = &simple_symlink_inode_operations; + inode->i_link = link; + + bpf_dentry_finalize(dentry, inode, dir); + return 0; +} + static const struct inode_operations bpf_dir_iops = { .lookup = bpf_lookup, .mknod = bpf_mkobj, .mkdir = bpf_mkdir, + .symlink = bpf_symlink, .rmdir = simple_rmdir, .rename = simple_rename, .link = simple_link, @@ -324,6 +356,8 @@ static void bpf_evict_inode(struct inode *inode) truncate_inode_pages_final(&inode->i_data); clear_inode(inode); + if (S_ISLNK(inode->i_mode)) + kfree(inode->i_link); if (!bpf_inode_type(inode, &type)) bpf_any_put(inode->i_private, type); } @@ -331,15 +365,66 @@ static void bpf_evict_inode(struct inode *inode) static const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, + .show_options = generic_show_options, .evict_inode = bpf_evict_inode, }; +enum { + OPT_MODE, + OPT_ERR, +}; + +static const match_table_t bpf_mount_tokens = { + { OPT_MODE, "mode=%o" }, + { OPT_ERR, NULL }, +}; + +struct bpf_mount_opts { + umode_t mode; +}; + +static int bpf_parse_options(char *data, struct bpf_mount_opts *opts) +{ + substring_t args[MAX_OPT_ARGS]; + int option, token; + char *ptr; + + opts->mode = S_IRWXUGO; + + while ((ptr = strsep(&data, ",")) != NULL) { + if (!*ptr) + continue; + + token = match_token(ptr, bpf_mount_tokens, args); + switch (token) { + case OPT_MODE: + if (match_octal(&args[0], &option)) + return -EINVAL; + opts->mode = option & S_IALLUGO; + break; + /* We might like to report bad mount options here, but + * traditionally we've ignored all mount options, so we'd + * better continue to ignore non-existing options for bpf. + */ + } + } + + return 0; +} + static int bpf_fill_super(struct super_block *sb, void *data, int silent) { static struct tree_descr bpf_rfiles[] = { { "" } }; + struct bpf_mount_opts opts; struct inode *inode; int ret; + save_mount_options(sb, data); + + ret = bpf_parse_options(data, &opts); + if (ret) + return ret; + ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); if (ret) return ret; @@ -349,7 +434,7 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent) inode = sb->s_root->d_inode; inode->i_op = &bpf_dir_iops; inode->i_mode &= ~S_IALLUGO; - inode->i_mode |= S_ISVTX | S_IRWXUGO; + inode->i_mode |= S_ISVTX | opts.mode; return 0; } diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index bf4495fcd25d..732ae16d12b7 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -116,10 +116,9 @@ free_smap: return ERR_PTR(err); } -u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, + u64, flags) { - struct pt_regs *regs = (struct pt_regs *) (long) r1; - struct bpf_map *map = (struct bpf_map *) (long) r2; struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct perf_callchain_entry *trace; struct stack_map_bucket *bucket, *new_bucket, *old_bucket; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 228f962447a5..4819ec9d95f6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -17,6 +17,7 @@ #include <linux/license.h> #include <linux/filter.h> #include <linux/version.h> +#include <linux/kernel.h> DEFINE_PER_CPU(int, bpf_prog_active); @@ -137,18 +138,31 @@ static int bpf_map_release(struct inode *inode, struct file *filp) static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_map *map = filp->private_data; + const struct bpf_array *array; + u32 owner_prog_type = 0; + + if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { + array = container_of(map, struct bpf_array, map); + owner_prog_type = array->owner_prog_type; + } seq_printf(m, "map_type:\t%u\n" "key_size:\t%u\n" "value_size:\t%u\n" "max_entries:\t%u\n" - "map_flags:\t%#x\n", + "map_flags:\t%#x\n" + "memlock:\t%llu\n", map->map_type, map->key_size, map->value_size, map->max_entries, - map->map_flags); + map->map_flags, + map->pages * 1ULL << PAGE_SHIFT); + + if (owner_prog_type) + seq_printf(m, "owner_prog_type:\t%u\n", + owner_prog_type); } #endif @@ -194,7 +208,7 @@ static int map_create(union bpf_attr *attr) err = bpf_map_charge_memlock(map); if (err) - goto free_map; + goto free_map_nouncharge; err = bpf_map_new_fd(map); if (err < 0) @@ -204,6 +218,8 @@ static int map_create(union bpf_attr *attr) return err; free_map: + bpf_map_uncharge_memlock(map); +free_map_nouncharge: map->ops->map_free(map); return err; } @@ -252,12 +268,6 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) return map; } -/* helper to convert user pointers passed inside __aligned_u64 fields */ -static void __user *u64_to_ptr(__u64 val) -{ - return (void __user *) (unsigned long) val; -} - int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) { return -ENOTSUPP; @@ -268,8 +278,8 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) static int map_lookup_elem(union bpf_attr *attr) { - void __user *ukey = u64_to_ptr(attr->key); - void __user *uvalue = u64_to_ptr(attr->value); + void __user *ukey = u64_to_user_ptr(attr->key); + void __user *uvalue = u64_to_user_ptr(attr->value); int ufd = attr->map_fd; struct bpf_map *map; void *key, *value, *ptr; @@ -295,6 +305,7 @@ static int map_lookup_elem(union bpf_attr *attr) goto free_key; if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) value_size = round_up(map->value_size, 8) * num_possible_cpus(); else @@ -305,7 +316,8 @@ static int map_lookup_elem(union bpf_attr *attr) if (!value) goto free_key; - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); @@ -342,8 +354,8 @@ err_put: static int map_update_elem(union bpf_attr *attr) { - void __user *ukey = u64_to_ptr(attr->key); - void __user *uvalue = u64_to_ptr(attr->value); + void __user *ukey = u64_to_user_ptr(attr->key); + void __user *uvalue = u64_to_user_ptr(attr->value); int ufd = attr->map_fd; struct bpf_map *map; void *key, *value; @@ -369,6 +381,7 @@ static int map_update_elem(union bpf_attr *attr) goto free_key; if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) value_size = round_up(map->value_size, 8) * num_possible_cpus(); else @@ -388,7 +401,8 @@ static int map_update_elem(union bpf_attr *attr) */ preempt_disable(); __this_cpu_inc(bpf_prog_active); - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, attr->flags); @@ -420,7 +434,7 @@ err_put: static int map_delete_elem(union bpf_attr *attr) { - void __user *ukey = u64_to_ptr(attr->key); + void __user *ukey = u64_to_user_ptr(attr->key); int ufd = attr->map_fd; struct bpf_map *map; struct fd f; @@ -464,8 +478,8 @@ err_put: static int map_get_next_key(union bpf_attr *attr) { - void __user *ukey = u64_to_ptr(attr->key); - void __user *unext_key = u64_to_ptr(attr->next_key); + void __user *ukey = u64_to_user_ptr(attr->key); + void __user *unext_key = u64_to_user_ptr(attr->next_key); int ufd = attr->map_fd; struct bpf_map *map; void *key, *next_key; @@ -565,6 +579,8 @@ static void fixup_bpf_calls(struct bpf_prog *prog) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) bpf_user_rnd_init_once(); + if (insn->imm == BPF_FUNC_xdp_adjust_head) + prog->xdp_adjust_head = 1; if (insn->imm == BPF_FUNC_tail_call) { /* mark bpf_tail_call as different opcode * to avoid conditional branch in @@ -648,8 +664,30 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) return 0; } +#ifdef CONFIG_PROC_FS +static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) +{ + const struct bpf_prog *prog = filp->private_data; + char prog_digest[sizeof(prog->digest) * 2 + 1] = { }; + + bin2hex(prog_digest, prog->digest, sizeof(prog->digest)); + seq_printf(m, + "prog_type:\t%u\n" + "prog_jited:\t%u\n" + "prog_digest:\t%s\n" + "memlock:\t%llu\n", + prog->type, + prog->jited, + prog_digest, + prog->pages * 1ULL << PAGE_SHIFT); +} +#endif + static const struct file_operations bpf_prog_fops = { - .release = bpf_prog_release, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_prog_show_fdinfo, +#endif + .release = bpf_prog_release, }; int bpf_prog_new_fd(struct bpf_prog *prog) @@ -680,10 +718,22 @@ struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) } EXPORT_SYMBOL_GPL(bpf_prog_add); +void bpf_prog_sub(struct bpf_prog *prog, int i) +{ + /* Only to be used for undoing previous bpf_prog_add() in some + * error path. We still know that another entity in our call + * path holds a reference to the program, thus atomic_sub() can + * be safely used in such cases! + */ + WARN_ON(atomic_sub_return(i, &prog->aux->refcnt) == 0); +} +EXPORT_SYMBOL_GPL(bpf_prog_sub); + struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) { return bpf_prog_add(prog, 1); } +EXPORT_SYMBOL_GPL(bpf_prog_inc); static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) { @@ -730,7 +780,7 @@ static int bpf_prog_load(union bpf_attr *attr) return -EINVAL; /* copy eBPF program license from user space */ - if (strncpy_from_user(license, u64_to_ptr(attr->license), + if (strncpy_from_user(license, u64_to_user_ptr(attr->license), sizeof(license) - 1) < 0) return -EFAULT; license[sizeof(license) - 1] = 0; @@ -738,8 +788,8 @@ static int bpf_prog_load(union bpf_attr *attr) /* eBPF programs must be GPL compatible to use GPL-ed functions */ is_gpl = license_is_gpl_compatible(license); - if (attr->insn_cnt >= BPF_MAXINSNS) - return -EINVAL; + if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS) + return -E2BIG; if (type == BPF_PROG_TYPE_KPROBE && attr->kern_version != LINUX_VERSION_CODE) @@ -760,7 +810,7 @@ static int bpf_prog_load(union bpf_attr *attr) prog->len = attr->insn_cnt; err = -EFAULT; - if (copy_from_user(prog->insns, u64_to_ptr(attr->insns), + if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns), prog->len * sizeof(struct bpf_insn)) != 0) goto free_prog; @@ -811,7 +861,7 @@ static int bpf_obj_pin(const union bpf_attr *attr) if (CHECK_ATTR(BPF_OBJ)) return -EINVAL; - return bpf_obj_pin_user(attr->bpf_fd, u64_to_ptr(attr->pathname)); + return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname)); } static int bpf_obj_get(const union bpf_attr *attr) @@ -819,9 +869,85 @@ static int bpf_obj_get(const union bpf_attr *attr) if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0) return -EINVAL; - return bpf_obj_get_user(u64_to_ptr(attr->pathname)); + return bpf_obj_get_user(u64_to_user_ptr(attr->pathname)); +} + +#ifdef CONFIG_CGROUP_BPF + +#define BPF_PROG_ATTACH_LAST_FIELD attach_type + +static int bpf_prog_attach(const union bpf_attr *attr) +{ + struct bpf_prog *prog; + struct cgroup *cgrp; + enum bpf_prog_type ptype; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (CHECK_ATTR(BPF_PROG_ATTACH)) + return -EINVAL; + + switch (attr->attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + ptype = BPF_PROG_TYPE_CGROUP_SKB; + break; + case BPF_CGROUP_INET_SOCK_CREATE: + ptype = BPF_PROG_TYPE_CGROUP_SOCK; + break; + default: + return -EINVAL; + } + + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) { + bpf_prog_put(prog); + return PTR_ERR(cgrp); + } + + cgroup_bpf_update(cgrp, prog, attr->attach_type); + cgroup_put(cgrp); + + return 0; } +#define BPF_PROG_DETACH_LAST_FIELD attach_type + +static int bpf_prog_detach(const union bpf_attr *attr) +{ + struct cgroup *cgrp; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (CHECK_ATTR(BPF_PROG_DETACH)) + return -EINVAL; + + switch (attr->attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + case BPF_CGROUP_INET_SOCK_CREATE: + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + cgroup_bpf_update(cgrp, NULL, attr->attach_type); + cgroup_put(cgrp); + break; + + default: + return -EINVAL; + } + + return 0; +} +#endif /* CONFIG_CGROUP_BPF */ + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -888,6 +1014,16 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_OBJ_GET: err = bpf_obj_get(&attr); break; + +#ifdef CONFIG_CGROUP_BPF + case BPF_PROG_ATTACH: + err = bpf_prog_attach(&attr); + break; + case BPF_PROG_DETACH: + err = bpf_prog_detach(&attr); + break; +#endif + default: err = -EINVAL; break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index daea765d72e6..d28f9a3380a9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14,10 +14,12 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/bpf.h> +#include <linux/bpf_verifier.h> #include <linux/filter.h> #include <net/netlink.h> #include <linux/file.h> #include <linux/vmalloc.h> +#include <linux/stringify.h> /* bpf_check() is a static code analyzer that walks eBPF program * instruction by instruction and updates register/stack state. @@ -126,76 +128,16 @@ * are set to NOT_INIT to indicate that they are no longer readable. */ -struct reg_state { - enum bpf_reg_type type; - union { - /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ - s64 imm; - - /* valid when type == PTR_TO_PACKET* */ - struct { - u32 id; - u16 off; - u16 range; - }; - - /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | - * PTR_TO_MAP_VALUE_OR_NULL - */ - struct bpf_map *map_ptr; - }; -}; - -enum bpf_stack_slot_type { - STACK_INVALID, /* nothing was stored in this stack slot */ - STACK_SPILL, /* register spilled into stack */ - STACK_MISC /* BPF program wrote some data into this slot */ -}; - -#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ - -/* state of the program: - * type of all registers and stack info - */ -struct verifier_state { - struct reg_state regs[MAX_BPF_REG]; - u8 stack_slot_type[MAX_BPF_STACK]; - struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE]; -}; - -/* linked list of verifier states used to prune search */ -struct verifier_state_list { - struct verifier_state state; - struct verifier_state_list *next; -}; - /* verifier_state + insn_idx are pushed to stack when branch is encountered */ -struct verifier_stack_elem { +struct bpf_verifier_stack_elem { /* verifer state is 'st' * before processing instruction 'insn_idx' * and after processing instruction 'prev_insn_idx' */ - struct verifier_state st; + struct bpf_verifier_state st; int insn_idx; int prev_insn_idx; - struct verifier_stack_elem *next; -}; - -#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ - -/* single container for all structs - * one verifier_env per bpf_check() call - */ -struct verifier_env { - struct bpf_prog *prog; /* eBPF program being verified */ - struct verifier_stack_elem *head; /* stack of verifier states to be processed */ - int stack_size; /* number of states to be processed */ - struct verifier_state cur_state; /* current verifier state */ - struct verifier_state_list **explored_states; /* search pruning optimization */ - struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ - u32 used_map_cnt; /* number of used maps */ - u32 id_gen; /* used to generate unique reg IDs */ - bool allow_ptr_leaks; + struct bpf_verifier_stack_elem *next; }; #define BPF_COMPLEXITY_LIMIT_INSNS 65536 @@ -204,6 +146,7 @@ struct verifier_env { struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; + bool pkt_access; int regno; int access_size; }; @@ -240,6 +183,7 @@ static const char * const reg_type_str[] = { [CONST_PTR_TO_MAP] = "map_ptr", [PTR_TO_MAP_VALUE] = "map_value", [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", + [PTR_TO_MAP_VALUE_ADJ] = "map_value_adj", [FRAME_PTR] = "fp", [PTR_TO_STACK] = "fp", [CONST_IMM] = "imm", @@ -247,9 +191,25 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET_END] = "pkt_end", }; -static void print_verifier_state(struct verifier_state *state) +#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) +static const char * const func_id_str[] = { + __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) +}; +#undef __BPF_FUNC_STR_FN + +static const char *func_id_name(int id) +{ + BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); + + if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) + return func_id_str[id]; + else + return "unknown"; +} + +static void print_verifier_state(struct bpf_verifier_state *state) { - struct reg_state *reg; + struct bpf_reg_state *reg; enum bpf_reg_type t; int i; @@ -267,10 +227,18 @@ static void print_verifier_state(struct verifier_state *state) else if (t == UNKNOWN_VALUE && reg->imm) verbose("%lld", reg->imm); else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || - t == PTR_TO_MAP_VALUE_OR_NULL) - verbose("(ks=%d,vs=%d)", + t == PTR_TO_MAP_VALUE_OR_NULL || + t == PTR_TO_MAP_VALUE_ADJ) + verbose("(ks=%d,vs=%d,id=%u)", reg->map_ptr->key_size, - reg->map_ptr->value_size); + reg->map_ptr->value_size, + reg->id); + if (reg->min_value != BPF_REGISTER_MIN_RANGE) + verbose(",min_value=%lld", + (long long)reg->min_value); + if (reg->max_value != BPF_REGISTER_MAX_RANGE) + verbose(",max_value=%llu", + (unsigned long long)reg->max_value); } for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] == STACK_SPILL) @@ -403,7 +371,8 @@ static void print_bpf_insn(struct bpf_insn *insn) u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { - verbose("(%02x) call %d\n", insn->code, insn->imm); + verbose("(%02x) call %s#%d\n", insn->code, + func_id_name(insn->imm), insn->imm); } else if (insn->code == (BPF_JMP | BPF_JA)) { verbose("(%02x) goto pc%+d\n", insn->code, insn->off); @@ -425,9 +394,9 @@ static void print_bpf_insn(struct bpf_insn *insn) } } -static int pop_stack(struct verifier_env *env, int *prev_insn_idx) +static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx) { - struct verifier_stack_elem *elem; + struct bpf_verifier_stack_elem *elem; int insn_idx; if (env->head == NULL) @@ -444,12 +413,12 @@ static int pop_stack(struct verifier_env *env, int *prev_insn_idx) return insn_idx; } -static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx, - int prev_insn_idx) +static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx) { - struct verifier_stack_elem *elem; + struct bpf_verifier_stack_elem *elem; - elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL); + elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); if (!elem) goto err; @@ -475,13 +444,15 @@ static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; -static void init_reg_state(struct reg_state *regs) +static void init_reg_state(struct bpf_reg_state *regs) { int i; for (i = 0; i < MAX_BPF_REG; i++) { regs[i].type = NOT_INIT; regs[i].imm = 0; + regs[i].min_value = BPF_REGISTER_MIN_RANGE; + regs[i].max_value = BPF_REGISTER_MAX_RANGE; } /* frame pointer */ @@ -491,20 +462,27 @@ static void init_reg_state(struct reg_state *regs) regs[BPF_REG_1].type = PTR_TO_CTX; } -static void mark_reg_unknown_value(struct reg_state *regs, u32 regno) +static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) { BUG_ON(regno >= MAX_BPF_REG); regs[regno].type = UNKNOWN_VALUE; + regs[regno].id = 0; regs[regno].imm = 0; } +static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) +{ + regs[regno].min_value = BPF_REGISTER_MIN_RANGE; + regs[regno].max_value = BPF_REGISTER_MAX_RANGE; +} + enum reg_arg_type { SRC_OP, /* register is used as source operand */ DST_OP, /* register is used as destination operand */ DST_OP_NO_MARK /* same as above, check only, don't mark */ }; -static int check_reg_arg(struct reg_state *regs, u32 regno, +static int check_reg_arg(struct bpf_reg_state *regs, u32 regno, enum reg_arg_type t) { if (regno >= MAX_BPF_REG) { @@ -564,8 +542,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) /* check_stack_read/write functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ -static int check_stack_write(struct verifier_state *state, int off, int size, - int value_regno) +static int check_stack_write(struct bpf_verifier_state *state, int off, + int size, int value_regno) { int i; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, @@ -590,7 +568,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size, } else { /* regular write of data into stack */ state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = - (struct reg_state) {}; + (struct bpf_reg_state) {}; for (i = 0; i < size; i++) state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; @@ -598,7 +576,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size, return 0; } -static int check_stack_read(struct verifier_state *state, int off, int size, +static int check_stack_read(struct bpf_verifier_state *state, int off, int size, int value_regno) { u8 *slot_type; @@ -639,7 +617,7 @@ static int check_stack_read(struct verifier_state *state, int off, int size, } /* check read/write into map element returned by bpf_map_lookup_elem() */ -static int check_map_access(struct verifier_env *env, u32 regno, int off, +static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { struct bpf_map *map = env->cur_state.regs[regno].map_ptr; @@ -654,24 +632,38 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off, #define MAX_PACKET_OFF 0xffff -static bool may_write_pkt_data(enum bpf_prog_type type) +static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, + const struct bpf_call_arg_meta *meta, + enum bpf_access_type t) { - switch (type) { + switch (env->prog->type) { + case BPF_PROG_TYPE_LWT_IN: + case BPF_PROG_TYPE_LWT_OUT: + /* dst_input() and dst_output() can't write for now */ + if (t == BPF_WRITE) + return false; + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: + case BPF_PROG_TYPE_LWT_XMIT: + if (meta) + return meta->pkt_access; + + env->seen_direct_write = true; return true; default: return false; } } -static int check_packet_access(struct verifier_env *env, u32 regno, int off, +static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { - struct reg_state *regs = env->cur_state.regs; - struct reg_state *reg = ®s[regno]; + struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *reg = ®s[regno]; off += reg->off; - if (off < 0 || off + size > reg->range) { + if (off < 0 || size <= 0 || off + size > reg->range) { verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", off, size, regno, reg->id, reg->off, reg->range); return -EACCES; @@ -680,9 +672,13 @@ static int check_packet_access(struct verifier_env *env, u32 regno, int off, } /* check access to 'struct bpf_context' fields */ -static int check_ctx_access(struct verifier_env *env, int off, int size, +static int check_ctx_access(struct bpf_verifier_env *env, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) { + /* for analyzer ctx accesses are already validated and converted */ + if (env->analyzer_ops) + return 0; + if (env->prog->aux->ops->is_valid_access && env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) { /* remember the offset of last byte accessed in ctx */ @@ -695,7 +691,7 @@ static int check_ctx_access(struct verifier_env *env, int off, int size, return -EACCES; } -static bool is_pointer_value(struct verifier_env *env, int regno) +static bool is_pointer_value(struct bpf_verifier_env *env, int regno) { if (env->allow_ptr_leaks) return false; @@ -709,28 +705,19 @@ static bool is_pointer_value(struct verifier_env *env, int regno) } } -static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, - int off, int size) +static int check_ptr_alignment(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, int off, int size) { - if (reg->type != PTR_TO_PACKET) { + if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_MAP_VALUE_ADJ) { if (off % size != 0) { - verbose("misaligned access off %d size %d\n", off, size); + verbose("misaligned access off %d size %d\n", + off, size); return -EACCES; } else { return 0; } } - switch (env->prog->type) { - case BPF_PROG_TYPE_SCHED_CLS: - case BPF_PROG_TYPE_SCHED_ACT: - case BPF_PROG_TYPE_XDP: - break; - default: - verbose("verifier is misconfigured\n"); - return -EACCES; - } - if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) /* misaligned access to packet is ok on x86,arm,arm64 */ return 0; @@ -741,7 +728,8 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, } /* skb->data is NET_IP_ALIGN-ed */ - if ((NET_IP_ALIGN + reg->off + off) % size != 0) { + if (reg->type == PTR_TO_PACKET && + (NET_IP_ALIGN + reg->off + off) % size != 0) { verbose("misaligned packet access off %d+%d+%d size %d\n", NET_IP_ALIGN, reg->off, off, size); return -EACCES; @@ -755,12 +743,12 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, * if t==write && value_regno==-1, some unknown value is stored into memory * if t==read && value_regno==-1, don't care what we read from memory */ -static int check_mem_access(struct verifier_env *env, u32 regno, int off, +static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, int bpf_size, enum bpf_access_type t, int value_regno) { - struct verifier_state *state = &env->cur_state; - struct reg_state *reg = &state->regs[regno]; + struct bpf_verifier_state *state = &env->cur_state; + struct bpf_reg_state *reg = &state->regs[regno]; int size, err = 0; if (reg->type == PTR_TO_STACK) @@ -774,12 +762,52 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, if (err) return err; - if (reg->type == PTR_TO_MAP_VALUE) { + if (reg->type == PTR_TO_MAP_VALUE || + reg->type == PTR_TO_MAP_VALUE_ADJ) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose("R%d leaks addr into map\n", value_regno); return -EACCES; } + + /* If we adjusted the register to this map value at all then we + * need to change off and size to min_value and max_value + * respectively to make sure our theoretical access will be + * safe. + */ + if (reg->type == PTR_TO_MAP_VALUE_ADJ) { + if (log_level) + print_verifier_state(state); + env->varlen_map_value_access = true; + /* The minimum value is only important with signed + * comparisons where we can't assume the floor of a + * value is 0. If we are using signed variables for our + * index'es we need to make sure that whatever we use + * will have a set floor within our range. + */ + if (reg->min_value < 0) { + verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + regno); + return -EACCES; + } + err = check_map_access(env, regno, reg->min_value + off, + size); + if (err) { + verbose("R%d min value is outside of the array range\n", + regno); + return err; + } + + /* If we haven't set a max value then we need to bail + * since we can't be sure we won't do bad things. + */ + if (reg->max_value == BPF_REGISTER_MAX_RANGE) { + verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", + regno); + return -EACCES; + } + off += reg->max_value; + } err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown_value(state->regs, value_regno); @@ -795,9 +823,8 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, err = check_ctx_access(env, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { mark_reg_unknown_value(state->regs, value_regno); - if (env->allow_ptr_leaks) - /* note that reg.[id|off|range] == 0 */ - state->regs[value_regno].type = reg_type; + /* note that reg.[id|off|range] == 0 */ + state->regs[value_regno].type = reg_type; } } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { @@ -817,7 +844,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, err = check_stack_read(state, off, size, value_regno); } } else if (state->regs[regno].type == PTR_TO_PACKET) { - if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) { + if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose("cannot write into packet\n"); return -EACCES; } @@ -846,9 +873,9 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, return err; } -static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) +static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = env->cur_state.regs; int err; if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || @@ -882,12 +909,12 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) * bytes from that pointer, make sure that it's within stack boundary * and all elements of stack are initialized */ -static int check_stack_boundary(struct verifier_env *env, int regno, +static int check_stack_boundary(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct verifier_state *state = &env->cur_state; - struct reg_state *regs = state->regs; + struct bpf_verifier_state *state = &env->cur_state; + struct bpf_reg_state *regs = state->regs; int off, i; if (regs[regno].type != PTR_TO_STACK) { @@ -926,18 +953,18 @@ static int check_stack_boundary(struct verifier_env *env, int regno, return 0; } -static int check_func_arg(struct verifier_env *env, u32 regno, +static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) { - struct reg_state *reg = env->cur_state.regs + regno; - enum bpf_reg_type expected_type; + struct bpf_reg_state *regs = env->cur_state.regs, *reg = ®s[regno]; + enum bpf_reg_type expected_type, type = reg->type; int err = 0; if (arg_type == ARG_DONTCARE) return 0; - if (reg->type == NOT_INIT) { + if (type == NOT_INIT) { verbose("R%d !read_ok\n", regno); return -EACCES; } @@ -950,16 +977,30 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return 0; } + if (type == PTR_TO_PACKET && + !may_access_direct_pkt_data(env, meta, BPF_READ)) { + verbose("helper access to the packet is not allowed\n"); + return -EACCES; + } + if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { expected_type = PTR_TO_STACK; + if (type != PTR_TO_PACKET && type != expected_type) + goto err_type; } else if (arg_type == ARG_CONST_STACK_SIZE || arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { expected_type = CONST_IMM; + if (type != expected_type) + goto err_type; } else if (arg_type == ARG_CONST_MAP_PTR) { expected_type = CONST_PTR_TO_MAP; + if (type != expected_type) + goto err_type; } else if (arg_type == ARG_PTR_TO_CTX) { expected_type = PTR_TO_CTX; + if (type != expected_type) + goto err_type; } else if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_RAW_STACK) { expected_type = PTR_TO_STACK; @@ -967,20 +1008,16 @@ static int check_func_arg(struct verifier_env *env, u32 regno, * passed in as argument, it's a CONST_IMM type. Final test * happens during stack boundary checking. */ - if (reg->type == CONST_IMM && reg->imm == 0) - expected_type = CONST_IMM; + if (type == CONST_IMM && reg->imm == 0) + /* final test in check_stack_boundary() */; + else if (type != PTR_TO_PACKET && type != expected_type) + goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK; } else { verbose("unsupported arg_type %d\n", arg_type); return -EFAULT; } - if (reg->type != expected_type) { - verbose("R%d type=%s expected=%s\n", regno, - reg_type_str[reg->type], reg_type_str[expected_type]); - return -EACCES; - } - if (arg_type == ARG_CONST_MAP_PTR) { /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ meta->map_ptr = reg->map_ptr; @@ -998,8 +1035,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->key\n"); return -EACCES; } - err = check_stack_boundary(env, regno, meta->map_ptr->key_size, - false, NULL); + if (type == PTR_TO_PACKET) + err = check_packet_access(env, regno, 0, + meta->map_ptr->key_size); + else + err = check_stack_boundary(env, regno, + meta->map_ptr->key_size, + false, NULL); } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity @@ -1009,9 +1051,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->value\n"); return -EACCES; } - err = check_stack_boundary(env, regno, - meta->map_ptr->value_size, - false, NULL); + if (type == PTR_TO_PACKET) + err = check_packet_access(env, regno, 0, + meta->map_ptr->value_size); + else + err = check_stack_boundary(env, regno, + meta->map_ptr->value_size, + false, NULL); } else if (arg_type == ARG_CONST_STACK_SIZE || arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO); @@ -1025,11 +1071,18 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); return -EACCES; } - err = check_stack_boundary(env, regno - 1, reg->imm, - zero_size_allowed, meta); + if (regs[regno - 1].type == PTR_TO_PACKET) + err = check_packet_access(env, regno - 1, 0, reg->imm); + else + err = check_stack_boundary(env, regno - 1, reg->imm, + zero_size_allowed, meta); } return err; +err_type: + verbose("R%d type=%s expected=%s\n", regno, + reg_type_str[type], reg_type_str[expected_type]); + return -EACCES; } static int check_map_func_compatibility(struct bpf_map *map, int func_id) @@ -1053,7 +1106,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) goto error; break; case BPF_MAP_TYPE_CGROUP_ARRAY: - if (func_id != BPF_FUNC_skb_under_cgroup) + if (func_id != BPF_FUNC_skb_under_cgroup && + func_id != BPF_FUNC_current_task_under_cgroup) goto error; break; default: @@ -1075,6 +1129,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) goto error; break; + case BPF_FUNC_current_task_under_cgroup: case BPF_FUNC_skb_under_cgroup: if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) goto error; @@ -1085,8 +1140,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) return 0; error: - verbose("cannot pass map_type %d into func %d\n", - map->map_type, func_id); + verbose("cannot pass map_type %d into func %s#%d\n", + map->map_type, func_id_name(func_id), func_id); return -EINVAL; } @@ -1108,10 +1163,10 @@ static int check_raw_mode(const struct bpf_func_proto *fn) return count > 1 ? -EINVAL : 0; } -static void clear_all_pkt_pointers(struct verifier_env *env) +static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { - struct verifier_state *state = &env->cur_state; - struct reg_state *regs = state->regs, *reg; + struct bpf_verifier_state *state = &env->cur_state; + struct bpf_reg_state *regs = state->regs, *reg; int i; for (i = 0; i < MAX_BPF_REG; i++) @@ -1131,19 +1186,19 @@ static void clear_all_pkt_pointers(struct verifier_env *env) } } -static int check_call(struct verifier_env *env, int func_id) +static int check_call(struct bpf_verifier_env *env, int func_id) { - struct verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = &env->cur_state; const struct bpf_func_proto *fn = NULL; - struct reg_state *regs = state->regs; - struct reg_state *reg; + struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *reg; struct bpf_call_arg_meta meta; bool changes_data; int i, err; /* find function prototype */ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { - verbose("invalid func %d\n", func_id); + verbose("invalid func %s#%d\n", func_id_name(func_id), func_id); return -EINVAL; } @@ -1151,7 +1206,7 @@ static int check_call(struct verifier_env *env, int func_id) fn = env->prog->aux->ops->get_func_proto(func_id); if (!fn) { - verbose("unknown func %d\n", func_id); + verbose("unknown func %s#%d\n", func_id_name(func_id), func_id); return -EINVAL; } @@ -1161,16 +1216,18 @@ static int check_call(struct verifier_env *env, int func_id) return -EINVAL; } - changes_data = bpf_helper_changes_skb_data(fn->func); + changes_data = bpf_helper_changes_pkt_data(fn->func); memset(&meta, 0, sizeof(meta)); + meta.pkt_access = fn->pkt_access; /* We only support one arg being in raw mode at the moment, which * is sufficient for the helper functions we have right now. */ err = check_raw_mode(fn); if (err) { - verbose("kernel subsystem misconfigured func %d\n", func_id); + verbose("kernel subsystem misconfigured func %s#%d\n", + func_id_name(func_id), func_id); return err; } @@ -1214,6 +1271,7 @@ static int check_call(struct verifier_env *env, int func_id) regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; + regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0; /* remember map_ptr, so that check_map_access() * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() @@ -1223,9 +1281,10 @@ static int check_call(struct verifier_env *env, int func_id) return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; + regs[BPF_REG_0].id = ++env->id_gen; } else { - verbose("unknown return type %d of func %d\n", - fn->ret_type, func_id); + verbose("unknown return type %d of func %s#%d\n", + fn->ret_type, func_id_name(func_id), func_id); return -EINVAL; } @@ -1238,12 +1297,13 @@ static int check_call(struct verifier_env *env, int func_id) return 0; } -static int check_packet_ptr_add(struct verifier_env *env, struct bpf_insn *insn) +static int check_packet_ptr_add(struct bpf_verifier_env *env, + struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; - struct reg_state *dst_reg = ®s[insn->dst_reg]; - struct reg_state *src_reg = ®s[insn->src_reg]; - struct reg_state tmp_reg; + struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; + struct bpf_reg_state *src_reg = ®s[insn->src_reg]; + struct bpf_reg_state tmp_reg; s32 imm; if (BPF_SRC(insn->code) == BPF_K) { @@ -1311,10 +1371,10 @@ add_imm: return 0; } -static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) +static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; - struct reg_state *dst_reg = ®s[insn->dst_reg]; + struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; u8 opcode = BPF_OP(insn->code); s64 imm_log2; @@ -1324,7 +1384,7 @@ static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) */ if (BPF_SRC(insn->code) == BPF_X) { - struct reg_state *src_reg = ®s[insn->src_reg]; + struct bpf_reg_state *src_reg = ®s[insn->src_reg]; if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 && dst_reg->imm && opcode == BPF_ADD) { @@ -1413,30 +1473,158 @@ static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) return 0; } -static int evaluate_reg_imm_alu(struct verifier_env *env, struct bpf_insn *insn) +static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, + struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; - struct reg_state *dst_reg = ®s[insn->dst_reg]; - struct reg_state *src_reg = ®s[insn->src_reg]; + struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; + struct bpf_reg_state *src_reg = ®s[insn->src_reg]; u8 opcode = BPF_OP(insn->code); - /* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn. - * Don't care about overflow or negative values, just add them + /* dst_reg->type == CONST_IMM here, simulate execution of 'add'/'or' + * insn. Don't care about overflow or negative values, just add them */ if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) dst_reg->imm += insn->imm; else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X && src_reg->type == CONST_IMM) dst_reg->imm += src_reg->imm; + else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) + dst_reg->imm |= insn->imm; + else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) + dst_reg->imm |= src_reg->imm; else mark_reg_unknown_value(regs, insn->dst_reg); return 0; } +static void check_reg_overflow(struct bpf_reg_state *reg) +{ + if (reg->max_value > BPF_REGISTER_MAX_RANGE) + reg->max_value = BPF_REGISTER_MAX_RANGE; + if (reg->min_value < BPF_REGISTER_MIN_RANGE || + reg->min_value > BPF_REGISTER_MAX_RANGE) + reg->min_value = BPF_REGISTER_MIN_RANGE; +} + +static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; + s64 min_val = BPF_REGISTER_MIN_RANGE; + u64 max_val = BPF_REGISTER_MAX_RANGE; + u8 opcode = BPF_OP(insn->code); + + dst_reg = ®s[insn->dst_reg]; + if (BPF_SRC(insn->code) == BPF_X) { + check_reg_overflow(®s[insn->src_reg]); + min_val = regs[insn->src_reg].min_value; + max_val = regs[insn->src_reg].max_value; + + /* If the source register is a random pointer then the + * min_value/max_value values represent the range of the known + * accesses into that value, not the actual min/max value of the + * register itself. In this case we have to reset the reg range + * values so we know it is not safe to look at. + */ + if (regs[insn->src_reg].type != CONST_IMM && + regs[insn->src_reg].type != UNKNOWN_VALUE) { + min_val = BPF_REGISTER_MIN_RANGE; + max_val = BPF_REGISTER_MAX_RANGE; + } + } else if (insn->imm < BPF_REGISTER_MAX_RANGE && + (s64)insn->imm > BPF_REGISTER_MIN_RANGE) { + min_val = max_val = insn->imm; + } + + /* We don't know anything about what was done to this register, mark it + * as unknown. + */ + if (min_val == BPF_REGISTER_MIN_RANGE && + max_val == BPF_REGISTER_MAX_RANGE) { + reset_reg_range_values(regs, insn->dst_reg); + return; + } + + /* If one of our values was at the end of our ranges then we can't just + * do our normal operations to the register, we need to set the values + * to the min/max since they are undefined. + */ + if (min_val == BPF_REGISTER_MIN_RANGE) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + if (max_val == BPF_REGISTER_MAX_RANGE) + dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + + switch (opcode) { + case BPF_ADD: + if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) + dst_reg->min_value += min_val; + if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value += max_val; + break; + case BPF_SUB: + if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) + dst_reg->min_value -= min_val; + if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value -= max_val; + break; + case BPF_MUL: + if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) + dst_reg->min_value *= min_val; + if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value *= max_val; + break; + case BPF_AND: + /* Disallow AND'ing of negative numbers, ain't nobody got time + * for that. Otherwise the minimum is 0 and the max is the max + * value we could AND against. + */ + if (min_val < 0) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + else + dst_reg->min_value = 0; + dst_reg->max_value = max_val; + break; + case BPF_LSH: + /* Gotta have special overflow logic here, if we're shifting + * more than MAX_RANGE then just assume we have an invalid + * range. + */ + if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + else if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) + dst_reg->min_value <<= min_val; + + if (max_val > ilog2(BPF_REGISTER_MAX_RANGE)) + dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value <<= max_val; + break; + case BPF_RSH: + /* RSH by a negative number is undefined, and the BPF_RSH is an + * unsigned shift, so make the appropriate casts. + */ + if (min_val < 0 || dst_reg->min_value < 0) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + else + dst_reg->min_value = + (u64)(dst_reg->min_value) >> min_val; + if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value >>= max_val; + break; + default: + reset_reg_range_values(regs, insn->dst_reg); + break; + } + + check_reg_overflow(dst_reg); +} + /* check validity of 32-bit and 64-bit arithmetic operations */ -static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) +static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs, *dst_reg; + struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; u8 opcode = BPF_OP(insn->code); int err; @@ -1496,6 +1684,11 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) if (err) return err; + /* we are setting our register to something new, we need to + * reset its range values. + */ + reset_reg_range_values(regs, insn->dst_reg); + if (BPF_SRC(insn->code) == BPF_X) { if (BPF_CLASS(insn->code) == BPF_ALU64) { /* case: R1 = R2 @@ -1508,8 +1701,7 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) insn->src_reg); return -EACCES; } - regs[insn->dst_reg].type = UNKNOWN_VALUE; - regs[insn->dst_reg].map_ptr = NULL; + mark_reg_unknown_value(regs, insn->dst_reg); } } else { /* case: R = imm @@ -1517,6 +1709,8 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) */ regs[insn->dst_reg].type = CONST_IMM; regs[insn->dst_reg].imm = insn->imm; + regs[insn->dst_reg].max_value = insn->imm; + regs[insn->dst_reg].min_value = insn->imm; } } else if (opcode > BPF_END) { @@ -1569,6 +1763,9 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) dst_reg = ®s[insn->dst_reg]; + /* first we want to adjust our ranges. */ + adjust_reg_min_max_vals(env, insn); + /* pattern match 'bpf_add Rx, imm' instruction */ if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) { @@ -1603,28 +1800,58 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) return -EACCES; } - /* mark dest operand */ - mark_reg_unknown_value(regs, insn->dst_reg); + /* If we did pointer math on a map value then just set it to our + * PTR_TO_MAP_VALUE_ADJ type so we can deal with any stores or + * loads to this register appropriately, otherwise just mark the + * register as unknown. + */ + if (env->allow_ptr_leaks && + (dst_reg->type == PTR_TO_MAP_VALUE || + dst_reg->type == PTR_TO_MAP_VALUE_ADJ)) + dst_reg->type = PTR_TO_MAP_VALUE_ADJ; + else + mark_reg_unknown_value(regs, insn->dst_reg); } return 0; } -static void find_good_pkt_pointers(struct verifier_env *env, - struct reg_state *dst_reg) +static void find_good_pkt_pointers(struct bpf_verifier_state *state, + struct bpf_reg_state *dst_reg) { - struct verifier_state *state = &env->cur_state; - struct reg_state *regs = state->regs, *reg; + struct bpf_reg_state *regs = state->regs, *reg; int i; - /* r2 = r3; - * r2 += 8 - * if (r2 > pkt_end) goto somewhere - * r2 == dst_reg, pkt_end == src_reg, - * r2=pkt(id=n,off=8,r=0) - * r3=pkt(id=n,off=0,r=0) - * find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) - * so that range of bytes [r3, r3 + 8) is safe to access + + /* LLVM can generate two kind of checks: + * + * Type 1: + * + * r2 = r3; + * r2 += 8; + * if (r2 > pkt_end) goto <handle exception> + * <access okay> + * + * Where: + * r2 == dst_reg, pkt_end == src_reg + * r2=pkt(id=n,off=8,r=0) + * r3=pkt(id=n,off=0,r=0) + * + * Type 2: + * + * r2 = r3; + * r2 += 8; + * if (pkt_end >= r2) goto <access okay> + * <handle exception> + * + * Where: + * pkt_end == dst_reg, r2 == src_reg + * r2=pkt(id=n,off=8,r=0) + * r3=pkt(id=n,off=0,r=0) + * + * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) + * so that range of bytes [r3, r3 + 8) is safe to access. */ + for (i = 0; i < MAX_BPF_REG; i++) if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) regs[i].range = dst_reg->off; @@ -1638,11 +1865,141 @@ static void find_good_pkt_pointers(struct verifier_env *env, } } -static int check_cond_jmp_op(struct verifier_env *env, +/* Adjusts the register min/max values in the case that the dst_reg is the + * variable register that we are working on, and src_reg is a constant or we're + * simply doing a BPF_K check. + */ +static void reg_set_min_max(struct bpf_reg_state *true_reg, + struct bpf_reg_state *false_reg, u64 val, + u8 opcode) +{ + switch (opcode) { + case BPF_JEQ: + /* If this is false then we know nothing Jon Snow, but if it is + * true then we know for sure. + */ + true_reg->max_value = true_reg->min_value = val; + break; + case BPF_JNE: + /* If this is true we know nothing Jon Snow, but if it is false + * we know the value for sure; + */ + false_reg->max_value = false_reg->min_value = val; + break; + case BPF_JGT: + /* Unsigned comparison, the minimum value is 0. */ + false_reg->min_value = 0; + case BPF_JSGT: + /* If this is false then we know the maximum val is val, + * otherwise we know the min val is val+1. + */ + false_reg->max_value = val; + true_reg->min_value = val + 1; + break; + case BPF_JGE: + /* Unsigned comparison, the minimum value is 0. */ + false_reg->min_value = 0; + case BPF_JSGE: + /* If this is false then we know the maximum value is val - 1, + * otherwise we know the mimimum value is val. + */ + false_reg->max_value = val - 1; + true_reg->min_value = val; + break; + default: + break; + } + + check_reg_overflow(false_reg); + check_reg_overflow(true_reg); +} + +/* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg + * is the variable reg. + */ +static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, + struct bpf_reg_state *false_reg, u64 val, + u8 opcode) +{ + switch (opcode) { + case BPF_JEQ: + /* If this is false then we know nothing Jon Snow, but if it is + * true then we know for sure. + */ + true_reg->max_value = true_reg->min_value = val; + break; + case BPF_JNE: + /* If this is true we know nothing Jon Snow, but if it is false + * we know the value for sure; + */ + false_reg->max_value = false_reg->min_value = val; + break; + case BPF_JGT: + /* Unsigned comparison, the minimum value is 0. */ + true_reg->min_value = 0; + case BPF_JSGT: + /* + * If this is false, then the val is <= the register, if it is + * true the register <= to the val. + */ + false_reg->min_value = val; + true_reg->max_value = val - 1; + break; + case BPF_JGE: + /* Unsigned comparison, the minimum value is 0. */ + true_reg->min_value = 0; + case BPF_JSGE: + /* If this is false then constant < register, if it is true then + * the register < constant. + */ + false_reg->min_value = val + 1; + true_reg->max_value = val; + break; + default: + break; + } + + check_reg_overflow(false_reg); + check_reg_overflow(true_reg); +} + +static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, + enum bpf_reg_type type) +{ + struct bpf_reg_state *reg = ®s[regno]; + + if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { + reg->type = type; + if (type == UNKNOWN_VALUE) + mark_reg_unknown_value(regs, regno); + } +} + +/* The logic is similar to find_good_pkt_pointers(), both could eventually + * be folded together at some point. + */ +static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, + enum bpf_reg_type type) +{ + struct bpf_reg_state *regs = state->regs; + int i; + + for (i = 0; i < MAX_BPF_REG; i++) + mark_map_reg(regs, i, regs[regno].id, type); + + for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { + if (state->stack_slot_type[i] != STACK_SPILL) + continue; + mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, + regs[regno].id, type); + } +} + +static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { - struct reg_state *regs = env->cur_state.regs, *dst_reg; - struct verifier_state *other_branch; + struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state; + struct bpf_reg_state *regs = this_branch->regs, *dst_reg; u8 opcode = BPF_OP(insn->code); int err; @@ -1704,32 +2061,48 @@ static int check_cond_jmp_op(struct verifier_env *env, if (!other_branch) return -EFAULT; - /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */ + /* detect if we are comparing against a constant value so we can adjust + * our min/max values for our dst register. + */ + if (BPF_SRC(insn->code) == BPF_X) { + if (regs[insn->src_reg].type == CONST_IMM) + reg_set_min_max(&other_branch->regs[insn->dst_reg], + dst_reg, regs[insn->src_reg].imm, + opcode); + else if (dst_reg->type == CONST_IMM) + reg_set_min_max_inv(&other_branch->regs[insn->src_reg], + ®s[insn->src_reg], dst_reg->imm, + opcode); + } else { + reg_set_min_max(&other_branch->regs[insn->dst_reg], + dst_reg, insn->imm, opcode); + } + + /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */ if (BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - if (opcode == BPF_JEQ) { - /* next fallthrough insn can access memory via - * this register - */ - regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; - /* branch targer cannot access it, since reg == 0 */ - mark_reg_unknown_value(other_branch->regs, - insn->dst_reg); - } else { - other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; - mark_reg_unknown_value(regs, insn->dst_reg); - } + /* Mark all identical map registers in each branch as either + * safe or unknown depending R == 0 or R != 0 conditional. + */ + mark_map_regs(this_branch, insn->dst_reg, + opcode == BPF_JEQ ? PTR_TO_MAP_VALUE : UNKNOWN_VALUE); + mark_map_regs(other_branch, insn->dst_reg, + opcode == BPF_JEQ ? UNKNOWN_VALUE : PTR_TO_MAP_VALUE); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { - find_good_pkt_pointers(env, dst_reg); + find_good_pkt_pointers(this_branch, dst_reg); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && + dst_reg->type == PTR_TO_PACKET_END && + regs[insn->src_reg].type == PTR_TO_PACKET) { + find_good_pkt_pointers(other_branch, ®s[insn->src_reg]); } else if (is_pointer_value(env, insn->dst_reg)) { verbose("R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; } if (log_level) - print_verifier_state(&env->cur_state); + print_verifier_state(this_branch); return 0; } @@ -1742,9 +2115,9 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) } /* verify BPF_LD_IMM64 instruction */ -static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) +static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = env->cur_state.regs; int err; if (BPF_SIZE(insn->code) != BPF_DW) { @@ -1760,9 +2133,19 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) if (err) return err; - if (insn->src_reg == 0) - /* generic move 64-bit immediate into a register */ + if (insn->src_reg == 0) { + /* generic move 64-bit immediate into a register, + * only analyzer needs to collect the ld_imm value. + */ + u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; + + if (!env->analyzer_ops) + return 0; + + regs[insn->dst_reg].type = CONST_IMM; + regs[insn->dst_reg].imm = imm; return 0; + } /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); @@ -1799,11 +2182,11 @@ static bool may_access_skb(enum bpf_prog_type type) * Output: * R0 - 8/16/32-bit skb data converted to cpu endianness */ -static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) +static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = env->cur_state.regs; u8 mode = BPF_MODE(insn->code); - struct reg_state *reg; + struct bpf_reg_state *reg; int i, err; if (!may_access_skb(env->prog->type)) { @@ -1889,7 +2272,7 @@ enum { BRANCH = 2, }; -#define STATE_LIST_MARK ((struct verifier_state_list *) -1L) +#define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) static int *insn_stack; /* stack of insns to process */ static int cur_stack; /* current stack index */ @@ -1900,7 +2283,7 @@ static int *insn_state; * w - next instruction * e - edge */ -static int push_insn(int t, int w, int e, struct verifier_env *env) +static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) { if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) return 0; @@ -1941,7 +2324,7 @@ static int push_insn(int t, int w, int e, struct verifier_env *env) /* non-recursive depth-first-search to detect loops in BPF program * loop == back-edge in directed graph */ -static int check_cfg(struct verifier_env *env) +static int check_cfg(struct bpf_verifier_env *env) { struct bpf_insn *insns = env->prog->insnsi; int insn_cnt = env->prog->len; @@ -2050,7 +2433,8 @@ err_free: /* the following conditions reduce the number of explored insns * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet */ -static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur) +static bool compare_ptrs_to_packet(struct bpf_reg_state *old, + struct bpf_reg_state *cur) { if (old->id != cur->id) return false; @@ -2125,9 +2509,12 @@ static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur) * whereas register type in current state is meaningful, it means that * the current state will reach 'bpf_exit' instruction safely */ -static bool states_equal(struct verifier_state *old, struct verifier_state *cur) +static bool states_equal(struct bpf_verifier_env *env, + struct bpf_verifier_state *old, + struct bpf_verifier_state *cur) { - struct reg_state *rold, *rcur; + bool varlen_map_access = env->varlen_map_value_access; + struct bpf_reg_state *rold, *rcur; int i; for (i = 0; i < MAX_BPF_REG; i++) { @@ -2137,8 +2524,20 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) if (memcmp(rold, rcur, sizeof(*rold)) == 0) continue; + /* If the ranges were not the same, but everything else was and + * we didn't do a variable access into a map then we are a-ok. + */ + if (!varlen_map_access && + memcmp(rold, rcur, offsetofend(struct bpf_reg_state, id)) == 0) + continue; + + /* If we didn't map access then again we don't care about the + * mismatched range values and it's ok if our old type was + * UNKNOWN and we didn't go to a NOT_INIT'ed reg. + */ if (rold->type == NOT_INIT || - (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT)) + (!varlen_map_access && rold->type == UNKNOWN_VALUE && + rcur->type != NOT_INIT)) continue; if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && @@ -2167,9 +2566,9 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) * the same, check that stored pointers types * are the same as well. * Ex: explored safe path could have stored - * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8} + * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -8} * but current path has stored: - * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16} + * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -16} * such verifier states are not equivalent. * return false to continue verification of this path */ @@ -2180,10 +2579,10 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) return true; } -static int is_state_visited(struct verifier_env *env, int insn_idx) +static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { - struct verifier_state_list *new_sl; - struct verifier_state_list *sl; + struct bpf_verifier_state_list *new_sl; + struct bpf_verifier_state_list *sl; sl = env->explored_states[insn_idx]; if (!sl) @@ -2193,7 +2592,7 @@ static int is_state_visited(struct verifier_env *env, int insn_idx) return 0; while (sl != STATE_LIST_MARK) { - if (states_equal(&sl->state, &env->cur_state)) + if (states_equal(env, &sl->state, &env->cur_state)) /* reached equivalent register/stack state, * prune the search */ @@ -2207,7 +2606,7 @@ static int is_state_visited(struct verifier_env *env, int insn_idx) * it will be rejected. Since there are no loops, we won't be * seeing this 'insn_idx' instruction again on the way to bpf_exit */ - new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER); + new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER); if (!new_sl) return -ENOMEM; @@ -2218,11 +2617,20 @@ static int is_state_visited(struct verifier_env *env, int insn_idx) return 0; } -static int do_check(struct verifier_env *env) +static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx) { - struct verifier_state *state = &env->cur_state; + if (!env->analyzer_ops || !env->analyzer_ops->insn_hook) + return 0; + + return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx); +} + +static int do_check(struct bpf_verifier_env *env) +{ + struct bpf_verifier_state *state = &env->cur_state; struct bpf_insn *insns = env->prog->insnsi; - struct reg_state *regs = state->regs; + struct bpf_reg_state *regs = state->regs; int insn_cnt = env->prog->len; int insn_idx, prev_insn_idx = 0; int insn_processed = 0; @@ -2230,6 +2638,7 @@ static int do_check(struct verifier_env *env) init_reg_state(regs); insn_idx = 0; + env->varlen_map_value_access = false; for (;;) { struct bpf_insn *insn; u8 class; @@ -2276,13 +2685,17 @@ static int do_check(struct verifier_env *env) print_bpf_insn(insn); } + err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); + if (err) + return err; + if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); if (err) return err; } else if (class == BPF_LDX) { - enum bpf_reg_type src_reg_type; + enum bpf_reg_type *prev_src_type, src_reg_type; /* check for reserved fields is already done */ @@ -2306,21 +2719,25 @@ static int do_check(struct verifier_env *env) if (err) return err; - if (BPF_SIZE(insn->code) != BPF_W) { + reset_reg_range_values(regs, insn->dst_reg); + if (BPF_SIZE(insn->code) != BPF_W && + BPF_SIZE(insn->code) != BPF_DW) { insn_idx++; continue; } - if (insn->imm == 0) { + prev_src_type = &env->insn_aux_data[insn_idx].ptr_type; + + if (*prev_src_type == NOT_INIT) { /* saw a valid insn * dst_reg = *(u32 *)(src_reg + off) - * use reserved 'imm' field to mark this insn + * save type to validate intersecting paths */ - insn->imm = src_reg_type; + *prev_src_type = src_reg_type; - } else if (src_reg_type != insn->imm && + } else if (src_reg_type != *prev_src_type && (src_reg_type == PTR_TO_CTX || - insn->imm == PTR_TO_CTX)) { + *prev_src_type == PTR_TO_CTX)) { /* ABuser program is trying to use the same insn * dst_reg = *(u32*) (src_reg + off) * with different pointer types: @@ -2333,7 +2750,7 @@ static int do_check(struct verifier_env *env) } } else if (class == BPF_STX) { - enum bpf_reg_type dst_reg_type; + enum bpf_reg_type *prev_dst_type, dst_reg_type; if (BPF_MODE(insn->code) == BPF_XADD) { err = check_xadd(env, insn); @@ -2361,11 +2778,13 @@ static int do_check(struct verifier_env *env) if (err) return err; - if (insn->imm == 0) { - insn->imm = dst_reg_type; - } else if (dst_reg_type != insn->imm && + prev_dst_type = &env->insn_aux_data[insn_idx].ptr_type; + + if (*prev_dst_type == NOT_INIT) { + *prev_dst_type = dst_reg_type; + } else if (dst_reg_type != *prev_dst_type && (dst_reg_type == PTR_TO_CTX || - insn->imm == PTR_TO_CTX)) { + *prev_dst_type == PTR_TO_CTX)) { verbose("same insn cannot be used with different pointers\n"); return -EINVAL; } @@ -2471,6 +2890,7 @@ process_bpf_exit: verbose("invalid BPF_LD mode\n"); return -EINVAL; } + reset_reg_range_values(regs, insn->dst_reg); } else { verbose("unknown insn class %d\n", class); return -EINVAL; @@ -2483,14 +2903,28 @@ process_bpf_exit: return 0; } +static int check_map_prog_compatibility(struct bpf_map *map, + struct bpf_prog *prog) + +{ + if (prog->type == BPF_PROG_TYPE_PERF_EVENT && + (map->map_type == BPF_MAP_TYPE_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_HASH) && + (map->map_flags & BPF_F_NO_PREALLOC)) { + verbose("perf_event programs can only use preallocated hash map\n"); + return -EINVAL; + } + return 0; +} + /* look for pseudo eBPF instructions that access map FDs and * replace them with actual map pointers */ -static int replace_map_fd_with_map_ptr(struct verifier_env *env) +static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) { struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; - int i, j; + int i, j, err; for (i = 0; i < insn_cnt; i++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && @@ -2534,6 +2968,12 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) return PTR_ERR(map); } + err = check_map_prog_compatibility(map, env->prog); + if (err) { + fdput(f); + return err; + } + /* store map pointer inside BPF_LD_IMM64 instruction */ insn[0].imm = (u32) (unsigned long) map; insn[1].imm = ((u64) (unsigned long) map) >> 32; @@ -2577,7 +3017,7 @@ next_insn: } /* drop refcnt of maps used by the rejected program */ -static void release_maps(struct verifier_env *env) +static void release_maps(struct bpf_verifier_env *env) { int i; @@ -2586,7 +3026,7 @@ static void release_maps(struct verifier_env *env) } /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ -static void convert_pseudo_ld_imm64(struct verifier_env *env) +static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) { struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; @@ -2600,62 +3040,74 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) /* convert load instructions that access fields of 'struct __sk_buff' * into sequence of instructions that access fields of 'struct sk_buff' */ -static int convert_ctx_accesses(struct verifier_env *env) +static int convert_ctx_accesses(struct bpf_verifier_env *env) { - struct bpf_insn *insn = env->prog->insnsi; - int insn_cnt = env->prog->len; - struct bpf_insn insn_buf[16]; + const struct bpf_verifier_ops *ops = env->prog->aux->ops; + const int insn_cnt = env->prog->len; + struct bpf_insn insn_buf[16], *insn; struct bpf_prog *new_prog; enum bpf_access_type type; - int i; + int i, cnt, delta = 0; + + if (ops->gen_prologue) { + cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, + env->prog); + if (cnt >= ARRAY_SIZE(insn_buf)) { + verbose("bpf verifier is misconfigured\n"); + return -EINVAL; + } else if (cnt) { + new_prog = bpf_patch_insn_single(env->prog, 0, + insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + env->prog = new_prog; + delta += cnt - 1; + } + } - if (!env->prog->aux->ops->convert_ctx_access) + if (!ops->convert_ctx_access) return 0; - for (i = 0; i < insn_cnt; i++, insn++) { - u32 insn_delta, cnt; + insn = env->prog->insnsi + delta; - if (insn->code == (BPF_LDX | BPF_MEM | BPF_W)) + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) || + insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) type = BPF_READ; - else if (insn->code == (BPF_STX | BPF_MEM | BPF_W)) + else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) || + insn->code == (BPF_STX | BPF_MEM | BPF_DW)) type = BPF_WRITE; else continue; - if (insn->imm != PTR_TO_CTX) { - /* clear internal mark */ - insn->imm = 0; + if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX) continue; - } - cnt = env->prog->aux->ops-> - convert_ctx_access(type, insn->dst_reg, insn->src_reg, - insn->off, insn_buf, env->prog); + cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg, + insn->off, insn_buf, env->prog); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose("bpf verifier is misconfigured\n"); return -EINVAL; } - new_prog = bpf_patch_insn_single(env->prog, i, insn_buf, cnt); + new_prog = bpf_patch_insn_single(env->prog, i + delta, insn_buf, + cnt); if (!new_prog) return -ENOMEM; - insn_delta = cnt - 1; + delta += cnt - 1; /* keep walking new program and skip insns we just inserted */ env->prog = new_prog; - insn = new_prog->insnsi + i + insn_delta; - - insn_cnt += insn_delta; - i += insn_delta; + insn = new_prog->insnsi + i + delta; } return 0; } -static void free_states(struct verifier_env *env) +static void free_states(struct bpf_verifier_env *env) { - struct verifier_state_list *sl, *sln; + struct bpf_verifier_state_list *sl, *sln; int i; if (!env->explored_states) @@ -2678,19 +3130,21 @@ static void free_states(struct verifier_env *env) int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { char __user *log_ubuf = NULL; - struct verifier_env *env; + struct bpf_verifier_env *env; int ret = -EINVAL; - if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) - return -E2BIG; - - /* 'struct verifier_env' can be global, but since it's not small, + /* 'struct bpf_verifier_env' can be global, but since it's not small, * allocate/free it every time bpf_check() is called */ - env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL); + env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; + env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * + (*prog)->len); + ret = -ENOMEM; + if (!env->insn_aux_data) + goto err_free_env; env->prog = *prog; /* grab the mutex to protect few globals used by verifier */ @@ -2709,22 +3163,24 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) /* log_* values have to be sane */ if (log_size < 128 || log_size > UINT_MAX >> 8 || log_level == 0 || log_ubuf == NULL) - goto free_env; + goto err_unlock; ret = -ENOMEM; log_buf = vmalloc(log_size); if (!log_buf) - goto free_env; + goto err_unlock; } else { log_level = 0; } + bpf_prog_calc_digest(env->prog); + ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; env->explored_states = kcalloc(env->prog->len, - sizeof(struct verifier_state_list *), + sizeof(struct bpf_verifier_state_list *), GFP_USER); ret = -ENOMEM; if (!env->explored_states) @@ -2783,14 +3239,67 @@ skip_full_check: free_log_buf: if (log_level) vfree(log_buf); -free_env: if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_bpf_prog_info() will release them. */ release_maps(env); *prog = env->prog; +err_unlock: + mutex_unlock(&bpf_verifier_lock); + vfree(env->insn_aux_data); +err_free_env: kfree(env); + return ret; +} + +int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, + void *priv) +{ + struct bpf_verifier_env *env; + int ret; + + env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); + if (!env) + return -ENOMEM; + + env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * + prog->len); + ret = -ENOMEM; + if (!env->insn_aux_data) + goto err_free_env; + env->prog = prog; + env->analyzer_ops = ops; + env->analyzer_priv = priv; + + /* grab the mutex to protect few globals used by verifier */ + mutex_lock(&bpf_verifier_lock); + + log_level = 0; + + env->explored_states = kcalloc(env->prog->len, + sizeof(struct bpf_verifier_state_list *), + GFP_KERNEL); + ret = -ENOMEM; + if (!env->explored_states) + goto skip_full_check; + + ret = check_cfg(env); + if (ret < 0) + goto skip_full_check; + + env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); + + ret = do_check(env); + +skip_full_check: + while (pop_stack(env, NULL) >= 0); + free_states(env); + mutex_unlock(&bpf_verifier_lock); + vfree(env->insn_aux_data); +err_free_env: + kfree(env); return ret; } +EXPORT_SYMBOL_GPL(bpf_analyzer); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d6b729beba49..2ee9ec3051b2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -64,6 +64,9 @@ #include <linux/file.h> #include <net/sock.h> +#define CREATE_TRACE_POINTS +#include <trace/events/cgroup.h> + /* * pidlists linger the following amount before being destroyed. The goal * is avoiding frequent destruction in the middle of consecutive read calls @@ -1176,6 +1179,8 @@ static void cgroup_destroy_root(struct cgroup_root *root) struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; + trace_cgroup_destroy_root(root); + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); BUG_ON(atomic_read(&root->nr_cgrps)); @@ -1874,6 +1879,9 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) strcpy(root->release_agent_path, opts.release_agent); spin_unlock(&release_agent_path_lock); } + + trace_cgroup_remount(root); + out_unlock: kfree(opts.release_agent); kfree(opts.name); @@ -2031,6 +2039,8 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) if (ret) goto destroy_root; + trace_cgroup_setup_root(root); + /* * There must be no failure case after here, since rebinding takes * care of subsystems' refcounts, which are explicitly dropped in @@ -2315,22 +2325,18 @@ static struct file_system_type cgroup2_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; -static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns) +static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) { struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); - int ret; - ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); - if (ret < 0 || ret >= buflen) - return NULL; - return buf; + return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); } -char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns) +int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) { - char *ret; + int ret; mutex_lock(&cgroup_mutex); spin_lock_irq(&css_set_lock); @@ -2357,12 +2363,12 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); * * Return value is the same as kernfs_path(). */ -char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) +int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) { struct cgroup_root *root; struct cgroup *cgrp; int hierarchy_id = 1; - char *path = NULL; + int ret; mutex_lock(&cgroup_mutex); spin_lock_irq(&css_set_lock); @@ -2371,16 +2377,15 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) if (root) { cgrp = task_cgroup_from_root(task, root); - path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); + ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); } else { /* if no hierarchy exists, everyone is in "/" */ - if (strlcpy(buf, "/", buflen) < buflen) - path = buf; + ret = strlcpy(buf, "/", buflen); } spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); - return path; + return ret; } EXPORT_SYMBOL_GPL(task_cgroup_path); @@ -2830,6 +2835,10 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); cgroup_migrate_finish(&preloaded_csets); + + if (!ret) + trace_cgroup_attach_task(dst_cgrp, leader, threadgroup); + return ret; } @@ -3611,6 +3620,8 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, mutex_lock(&cgroup_mutex); ret = kernfs_rename(kn, new_parent, new_name_str); + if (!ret) + trace_cgroup_rename(cgrp); mutex_unlock(&cgroup_mutex); @@ -4381,6 +4392,8 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) if (task) { ret = cgroup_migrate(task, false, to->root); + if (!ret) + trace_cgroup_transfer_tasks(to, task, false); put_task_struct(task); } } while (task && !ret); @@ -5046,6 +5059,8 @@ static void css_release_work_fn(struct work_struct *work) ss->css_released(css); } else { /* cgroup release path */ + trace_cgroup_release(cgrp); + cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -5059,6 +5074,8 @@ static void css_release_work_fn(struct work_struct *work) if (cgrp->kn) RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); + + cgroup_bpf_put(cgrp); } mutex_unlock(&cgroup_mutex); @@ -5266,6 +5283,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (!cgroup_on_dfl(cgrp)) cgrp->subtree_control = cgroup_control(cgrp); + if (parent) + cgroup_bpf_inherit(cgrp, parent); + cgroup_propagate_control(cgrp); /* @cgrp doesn't have dir yet so the following will only create csses */ @@ -5332,6 +5352,8 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; + trace_cgroup_mkdir(cgrp); + /* let's create and online css's */ kernfs_activate(kn); @@ -5507,6 +5529,9 @@ static int cgroup_rmdir(struct kernfs_node *kn) ret = cgroup_destroy_locked(cgrp); + if (!ret) + trace_cgroup_rmdir(cgrp); + cgroup_kn_unlock(kn); return ret; } @@ -5627,6 +5652,12 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); + /* + * The latency of the synchronize_sched() is too high for cgroups, + * avoid it at the cost of forcing all readers into the slow path. + */ + rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); + get_user_ns(init_cgroup_ns.user_ns); mutex_lock(&cgroup_mutex); @@ -5737,7 +5768,7 @@ core_initcall(cgroup_wq_init); int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk) { - char *buf, *path; + char *buf; int retval; struct cgroup_root *root; @@ -5780,18 +5811,18 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, * " (deleted)" is appended to the cgroup path. */ if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { - path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, + retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, current->nsproxy->cgroup_ns); - if (!path) { + if (retval >= PATH_MAX) retval = -ENAMETOOLONG; + if (retval < 0) goto out_unlock; - } + + seq_puts(m, buf); } else { - path = "/"; + seq_puts(m, "/"); } - seq_puts(m, path); - if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) seq_puts(m, " (deleted)\n"); else @@ -6056,8 +6087,9 @@ static void cgroup_release_agent(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, release_agent_work); - char *pathbuf = NULL, *agentbuf = NULL, *path; + char *pathbuf = NULL, *agentbuf = NULL; char *argv[3], *envp[3]; + int ret; mutex_lock(&cgroup_mutex); @@ -6067,13 +6099,13 @@ static void cgroup_release_agent(struct work_struct *work) goto out; spin_lock_irq(&css_set_lock); - path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); + ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); spin_unlock_irq(&css_set_lock); - if (!path) + if (ret < 0 || ret >= PATH_MAX) goto out; argv[0] = agentbuf; - argv[1] = path; + argv[1] = pathbuf; argv[2] = NULL; /* minimal command environment */ @@ -6322,6 +6354,16 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) /* cgroup namespaces */ +static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); +} + +static void dec_cgroup_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); +} + static struct cgroup_namespace *alloc_cgroup_ns(void) { struct cgroup_namespace *new_ns; @@ -6343,6 +6385,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) void free_cgroup_ns(struct cgroup_namespace *ns) { put_css_set(ns->root_cset); + dec_cgroup_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); kfree(ns); @@ -6354,6 +6397,7 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, struct cgroup_namespace *old_ns) { struct cgroup_namespace *new_ns; + struct ucounts *ucounts; struct css_set *cset; BUG_ON(!old_ns); @@ -6367,6 +6411,10 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); + ucounts = inc_cgroup_namespaces(user_ns); + if (!ucounts) + return ERR_PTR(-ENOSPC); + /* It is not safe to take cgroup_mutex here */ spin_lock_irq(&css_set_lock); cset = task_css_set(current); @@ -6376,10 +6424,12 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, new_ns = alloc_cgroup_ns(); if (IS_ERR(new_ns)) { put_css_set(cset); + dec_cgroup_namespaces(ucounts); return new_ns; } new_ns->user_ns = get_user_ns(user_ns); + new_ns->ucounts = ucounts; new_ns->root_cset = cset; return new_ns; @@ -6430,12 +6480,18 @@ static void cgroupns_put(struct ns_common *ns) put_cgroup_ns(to_cg_ns(ns)); } +static struct user_namespace *cgroupns_owner(struct ns_common *ns) +{ + return to_cg_ns(ns)->user_ns; +} + const struct proc_ns_operations cgroupns_operations = { .name = "cgroup", .type = CLONE_NEWCGROUP, .get = cgroupns_get, .put = cgroupns_put, .install = cgroupns_install, + .owner = cgroupns_owner, }; static __init int cgroup_namespaces_init(void) @@ -6444,6 +6500,19 @@ static __init int cgroup_namespaces_init(void) } subsys_initcall(cgroup_namespaces_init); +#ifdef CONFIG_CGROUP_BPF +void cgroup_bpf_update(struct cgroup *cgrp, + struct bpf_prog *prog, + enum bpf_attach_type type) +{ + struct cgroup *parent = cgroup_parent(cgrp); + + mutex_lock(&cgroup_mutex); + __cgroup_bpf_update(cgrp, parent, prog, type); + mutex_unlock(&cgroup_mutex); +} +#endif /* CONFIG_CGROUP_BPF */ + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state * debug_css_alloc(struct cgroup_subsys_state *parent_css) diff --git a/kernel/compat.c b/kernel/compat.c index 333d364be29d..b3a047f208a7 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -307,12 +307,17 @@ static inline long put_compat_itimerval(struct compat_itimerval __user *o, __put_user(i->it_value.tv_usec, &o->it_value.tv_usec))); } +asmlinkage long sys_ni_posix_timers(void); + COMPAT_SYSCALL_DEFINE2(getitimer, int, which, struct compat_itimerval __user *, it) { struct itimerval kit; int error; + if (!IS_ENABLED(CONFIG_POSIX_TIMERS)) + return sys_ni_posix_timers(); + error = do_getitimer(which, &kit); if (!error && put_compat_itimerval(it, &kit)) error = -EFAULT; @@ -326,6 +331,9 @@ COMPAT_SYSCALL_DEFINE3(setitimer, int, which, struct itimerval kin, kout; int error; + if (!IS_ENABLED(CONFIG_POSIX_TIMERS)) + return sys_ni_posix_timers(); + if (in) { if (get_compat_itimerval(&kin, in)) return -EFAULT; diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index 9f748ed7bea8..1a8f34f63601 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -11,7 +11,6 @@ CONFIG_ANDROID_LOW_MEMORY_KILLER=y CONFIG_ARMV8_DEPRECATED=y CONFIG_ASHMEM=y CONFIG_AUDIT=y -CONFIG_BLK_DEV_DM=y CONFIG_BLK_DEV_INITRD=y CONFIG_CGROUPS=y CONFIG_CGROUP_CPUACCT=y @@ -19,9 +18,7 @@ CONFIG_CGROUP_DEBUG=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_SCHED=y CONFIG_CP15_BARRIER_EMULATION=y -CONFIG_DM_CRYPT=y -CONFIG_DM_VERITY=y -CONFIG_DM_VERITY_FEC=y +CONFIG_DEFAULT_SECURITY_SELINUX=y CONFIG_EMBEDDED=y CONFIG_FB=y CONFIG_HIGH_RES_TIMERS=y @@ -41,7 +38,6 @@ CONFIG_IPV6=y CONFIG_IPV6_MIP6=y CONFIG_IPV6_MULTIPLE_TABLES=y CONFIG_IPV6_OPTIMISTIC_DAD=y -CONFIG_IPV6_PRIVACY=y CONFIG_IPV6_ROUTER_PREF=y CONFIG_IPV6_ROUTE_INFO=y CONFIG_IP_ADVANCED_ROUTER=y @@ -135,6 +131,7 @@ CONFIG_PREEMPT=y CONFIG_QUOTA=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y +CONFIG_SECCOMP=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index e3b953e966d2..297756be369c 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -6,12 +6,16 @@ # CONFIG_PM_WAKELOCKS_GC is not set # CONFIG_VT is not set CONFIG_BACKLIGHT_LCD_SUPPORT=y +CONFIG_BLK_DEV_DM=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 CONFIG_COMPACTION=y CONFIG_DEBUG_RODATA=y +CONFIG_DM_CRYPT=y CONFIG_DM_UEVENT=y +CONFIG_DM_VERITY=y +CONFIG_DM_VERITY_FEC=y CONFIG_DRAGONRISE_FF=y CONFIG_ENABLE_DEFAULT_TRACERS=y CONFIG_EXT4_FS=y diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config new file mode 100644 index 000000000000..8d9643767142 --- /dev/null +++ b/kernel/configs/kvm_guest.config @@ -0,0 +1,32 @@ +CONFIG_NET=y +CONFIG_NET_CORE=y +CONFIG_NETDEVICES=y +CONFIG_BLOCK=y +CONFIG_BLK_DEV=y +CONFIG_NETWORK_FILESYSTEMS=y +CONFIG_INET=y +CONFIG_TTY=y +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_BINFMT_ELF=y +CONFIG_PCI=y +CONFIG_PCI_MSI=y +CONFIG_DEBUG_KERNEL=y +CONFIG_VIRTUALIZATION=y +CONFIG_HYPERVISOR_GUEST=y +CONFIG_PARAVIRT=y +CONFIG_KVM_GUEST=y +CONFIG_VIRTIO=y +CONFIG_VIRTIO_PCI=y +CONFIG_VIRTIO_BLK=y +CONFIG_VIRTIO_CONSOLE=y +CONFIG_VIRTIO_NET=y +CONFIG_9P_FS=y +CONFIG_NET_9P=y +CONFIG_NET_9P_VIRTIO=y +CONFIG_SCSI_LOWLEVEL=y +CONFIG_SCSI_VIRTIO=y +CONFIG_VIRTIO_INPUT=y +CONFIG_DRM_VIRTIO_GPU=y diff --git a/kernel/cpu.c b/kernel/cpu.c index 341bf80f80bd..217fd2e7f435 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -23,6 +23,8 @@ #include <linux/tick.h> #include <linux/irq.h> #include <linux/smpboot.h> +#include <linux/relay.h> +#include <linux/slab.h> #include <trace/events/power.h> #define CREATE_TRACE_POINTS @@ -37,8 +39,9 @@ * @thread: Pointer to the hotplug thread * @should_run: Thread should execute * @rollback: Perform a rollback - * @cb_stat: The state for a single callback (install/uninstall) - * @cb: Single callback function (install/uninstall) + * @single: Single callback invocation + * @bringup: Single callback bringup or teardown selector + * @cb_state: The state for a single callback (install/uninstall) * @result: Result of the operation * @done: Signal completion to the issuer of the task */ @@ -49,8 +52,10 @@ struct cpuhp_cpu_state { struct task_struct *thread; bool should_run; bool rollback; + bool single; + bool bringup; + struct hlist_node *node; enum cpuhp_state cb_state; - int (*cb)(unsigned int cpu); int result; struct completion done; #endif @@ -68,35 +73,103 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); * @cant_stop: Bringup/teardown can't be stopped at this step */ struct cpuhp_step { - const char *name; - int (*startup)(unsigned int cpu); - int (*teardown)(unsigned int cpu); - bool skip_onerr; - bool cant_stop; + const char *name; + union { + int (*single)(unsigned int cpu); + int (*multi)(unsigned int cpu, + struct hlist_node *node); + } startup; + union { + int (*single)(unsigned int cpu); + int (*multi)(unsigned int cpu, + struct hlist_node *node); + } teardown; + struct hlist_head list; + bool skip_onerr; + bool cant_stop; + bool multi_instance; }; static DEFINE_MUTEX(cpuhp_state_mutex); static struct cpuhp_step cpuhp_bp_states[]; static struct cpuhp_step cpuhp_ap_states[]; +static bool cpuhp_is_ap_state(enum cpuhp_state state) +{ + /* + * The extra check for CPUHP_TEARDOWN_CPU is only for documentation + * purposes as that state is handled explicitly in cpu_down. + */ + return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; +} + +static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) +{ + struct cpuhp_step *sp; + + sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states; + return sp + state; +} + /** * cpuhp_invoke_callback _ Invoke the callbacks for a given state * @cpu: The cpu for which the callback should be invoked * @step: The step in the state machine - * @cb: The callback function to invoke + * @bringup: True if the bringup callback should be invoked * - * Called from cpu hotplug and from the state register machinery + * Called from cpu hotplug and from the state register machinery. */ -static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state step, - int (*cb)(unsigned int)) +static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, + bool bringup, struct hlist_node *node) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); - int ret = 0; - - if (cb) { - trace_cpuhp_enter(cpu, st->target, step, cb); + struct cpuhp_step *step = cpuhp_get_step(state); + int (*cbm)(unsigned int cpu, struct hlist_node *node); + int (*cb)(unsigned int cpu); + int ret, cnt; + + if (!step->multi_instance) { + cb = bringup ? step->startup.single : step->teardown.single; + if (!cb) + return 0; + trace_cpuhp_enter(cpu, st->target, state, cb); ret = cb(cpu); - trace_cpuhp_exit(cpu, st->state, step, ret); + trace_cpuhp_exit(cpu, st->state, state, ret); + return ret; + } + cbm = bringup ? step->startup.multi : step->teardown.multi; + if (!cbm) + return 0; + + /* Single invocation for instance add/remove */ + if (node) { + trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); + ret = cbm(cpu, node); + trace_cpuhp_exit(cpu, st->state, state, ret); + return ret; + } + + /* State transition. Invoke on all instances */ + cnt = 0; + hlist_for_each(node, &step->list) { + trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); + ret = cbm(cpu, node); + trace_cpuhp_exit(cpu, st->state, state, ret); + if (ret) + goto err; + cnt++; + } + return 0; +err: + /* Rollback the instances if one failed */ + cbm = !bringup ? step->startup.multi : step->teardown.multi; + if (!cbm) + return ret; + + hlist_for_each(node, &step->list) { + if (!cnt--) + break; + cbm(cpu, node); } return ret; } @@ -155,7 +228,7 @@ static struct { .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq), .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), #ifdef CONFIG_DEBUG_LOCK_ALLOC - .dep_map = {.name = "cpu_hotplug.lock" }, + .dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map), #endif }; @@ -260,10 +333,17 @@ void cpu_hotplug_disable(void) } EXPORT_SYMBOL_GPL(cpu_hotplug_disable); +static void __cpu_hotplug_enable(void) +{ + if (WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n")) + return; + cpu_hotplug_disabled--; +} + void cpu_hotplug_enable(void) { cpu_maps_update_begin(); - WARN_ON(--cpu_hotplug_disabled < 0); + __cpu_hotplug_enable(); cpu_maps_update_done(); } EXPORT_SYMBOL_GPL(cpu_hotplug_enable); @@ -330,12 +410,6 @@ static int notify_online(unsigned int cpu) return 0; } -static int notify_starting(unsigned int cpu) -{ - cpu_notify(CPU_STARTING, cpu); - return 0; -} - static int bringup_wait_for_ap(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); @@ -349,8 +423,16 @@ static int bringup_cpu(unsigned int cpu) struct task_struct *idle = idle_thread_get(cpu); int ret; + /* + * Some architectures have to walk the irq descriptors to + * setup the vector space for the cpu which comes online. + * Prevent irq alloc/free across the bringup. + */ + irq_lock_sparse(); + /* Arch-specific enabling code. */ ret = __cpu_up(cpu, idle); + irq_unlock_sparse(); if (ret) { cpu_notify(CPU_UP_CANCELED, cpu); return ret; @@ -363,62 +445,55 @@ static int bringup_cpu(unsigned int cpu) /* * Hotplug state machine related functions */ -static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st, - struct cpuhp_step *steps) +static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) { for (st->state++; st->state < st->target; st->state++) { - struct cpuhp_step *step = steps + st->state; + struct cpuhp_step *step = cpuhp_get_step(st->state); if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, step->startup); + cpuhp_invoke_callback(cpu, st->state, true, NULL); } } static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, - struct cpuhp_step *steps, enum cpuhp_state target) + enum cpuhp_state target) { enum cpuhp_state prev_state = st->state; int ret = 0; for (; st->state > target; st->state--) { - struct cpuhp_step *step = steps + st->state; - - ret = cpuhp_invoke_callback(cpu, st->state, step->teardown); + ret = cpuhp_invoke_callback(cpu, st->state, false, NULL); if (ret) { st->target = prev_state; - undo_cpu_down(cpu, st, steps); + undo_cpu_down(cpu, st); break; } } return ret; } -static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st, - struct cpuhp_step *steps) +static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) { for (st->state--; st->state > st->target; st->state--) { - struct cpuhp_step *step = steps + st->state; + struct cpuhp_step *step = cpuhp_get_step(st->state); if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, step->teardown); + cpuhp_invoke_callback(cpu, st->state, false, NULL); } } static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, - struct cpuhp_step *steps, enum cpuhp_state target) + enum cpuhp_state target) { enum cpuhp_state prev_state = st->state; int ret = 0; while (st->state < target) { - struct cpuhp_step *step; - st->state++; - step = steps + st->state; - ret = cpuhp_invoke_callback(cpu, st->state, step->startup); + ret = cpuhp_invoke_callback(cpu, st->state, true, NULL); if (ret) { st->target = prev_state; - undo_cpu_up(cpu, st, steps); + undo_cpu_up(cpu, st); break; } } @@ -447,13 +522,13 @@ static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st) { enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU); - return cpuhp_down_callbacks(cpu, st, cpuhp_ap_states, target); + return cpuhp_down_callbacks(cpu, st, target); } /* Execute the online startup callbacks. Used to be CPU_ONLINE */ static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st) { - return cpuhp_up_callbacks(cpu, st, cpuhp_ap_states, st->target); + return cpuhp_up_callbacks(cpu, st, st->target); } /* @@ -476,18 +551,20 @@ static void cpuhp_thread_fun(unsigned int cpu) st->should_run = false; /* Single callback invocation for [un]install ? */ - if (st->cb) { + if (st->single) { if (st->cb_state < CPUHP_AP_ONLINE) { local_irq_disable(); - ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); + ret = cpuhp_invoke_callback(cpu, st->cb_state, + st->bringup, st->node); local_irq_enable(); } else { - ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); + ret = cpuhp_invoke_callback(cpu, st->cb_state, + st->bringup, st->node); } } else if (st->rollback) { BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); - undo_cpu_down(cpu, st, cpuhp_ap_states); + undo_cpu_down(cpu, st); /* * This is a momentary workaround to keep the notifier users * happy. Will go away once we got rid of the notifiers. @@ -509,8 +586,9 @@ static void cpuhp_thread_fun(unsigned int cpu) } /* Invoke a single callback on a remote cpu */ -static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, - int (*cb)(unsigned int)) +static int +cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, + struct hlist_node *node) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); @@ -522,10 +600,13 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, * we invoke the thread function directly. */ if (!st->thread) - return cpuhp_invoke_callback(cpu, state, cb); + return cpuhp_invoke_callback(cpu, state, bringup, node); st->cb_state = state; - st->cb = cb; + st->single = true; + st->bringup = bringup; + st->node = node; + /* * Make sure the above stores are visible before should_run becomes * true. Paired with the mb() above in cpuhp_thread_fun() @@ -541,7 +622,7 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st) { st->result = 0; - st->cb = NULL; + st->single = false; /* * Make sure the above stores are visible before should_run becomes * true. Paired with the mb() above in cpuhp_thread_fun() @@ -578,7 +659,6 @@ void __init cpuhp_threads_init(void) kthread_unpark(this_cpu_read(cpuhp_state.thread)); } -#ifdef CONFIG_HOTPLUG_CPU EXPORT_SYMBOL(register_cpu_notifier); EXPORT_SYMBOL(__register_cpu_notifier); void unregister_cpu_notifier(struct notifier_block *nb) @@ -595,6 +675,7 @@ void __unregister_cpu_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(__unregister_cpu_notifier); +#ifdef CONFIG_HOTPLUG_CPU /** * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU * @cpu: a CPU id @@ -674,12 +755,6 @@ static int notify_down_prepare(unsigned int cpu) return err; } -static int notify_dying(unsigned int cpu) -{ - cpu_notify(CPU_DYING, cpu); - return 0; -} - /* Take this CPU down. */ static int take_cpu_down(void *_param) { @@ -692,12 +767,16 @@ static int take_cpu_down(void *_param) if (err < 0) return err; + /* + * We get here while we are in CPUHP_TEARDOWN_CPU state and we must not + * do this step again. + */ + WARN_ON(st->state != CPUHP_TEARDOWN_CPU); + st->state--; /* Invoke the former CPU_DYING callbacks */ - for (; st->state > target; st->state--) { - struct cpuhp_step *step = cpuhp_ap_states + st->state; + for (; st->state > target; st->state--) + cpuhp_invoke_callback(cpu, st->state, false, NULL); - cpuhp_invoke_callback(cpu, st->state, step->teardown); - } /* Give up timekeeping duties */ tick_handover_do_timer(); /* Park the stopper thread */ @@ -734,7 +813,7 @@ static int takedown_cpu(unsigned int cpu) BUG_ON(cpu_online(cpu)); /* - * The migration_call() CPU_DYING callback will have removed all + * The CPUHP_AP_SCHED_MIGRATE_DYING callback will have removed all * runnable tasks from the cpu, there's only the idle task left now * that the migration thread is done doing the stop_machine thing. * @@ -787,7 +866,6 @@ void cpuhp_report_idle_dead(void) #define notify_down_prepare NULL #define takedown_cpu NULL #define notify_dead NULL -#define notify_dying NULL #endif #ifdef CONFIG_HOTPLUG_CPU @@ -836,7 +914,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need * to do the further cleanups. */ - ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target); + ret = cpuhp_down_callbacks(cpu, st, target); if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { st->target = prev_state; st->rollback = true; @@ -877,10 +955,9 @@ EXPORT_SYMBOL(cpu_down); #endif /*CONFIG_HOTPLUG_CPU*/ /** - * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers + * notify_cpu_starting(cpu) - Invoke the callbacks on the starting CPU * @cpu: cpu that just started * - * This function calls the cpu_chain notifiers with CPU_STARTING. * It must be called by the arch code on the new cpu, before the new cpu * enables interrupts and before the "boot" cpu returns from __cpu_up(). */ @@ -889,12 +966,10 @@ void notify_cpu_starting(unsigned int cpu) struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); + rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ while (st->state < target) { - struct cpuhp_step *step; - st->state++; - step = cpuhp_ap_states + st->state; - cpuhp_invoke_callback(cpu, st->state, step->startup); + cpuhp_invoke_callback(cpu, st->state, true, NULL); } } @@ -979,7 +1054,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) * responsible for bringing it up to the target state. */ target = min((int)target, CPUHP_BRINGUP_CPU); - ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target); + ret = cpuhp_up_callbacks(cpu, st, target); out: cpu_hotplug_done(); return ret; @@ -1024,12 +1099,13 @@ EXPORT_SYMBOL_GPL(cpu_up); #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus; -int disable_nonboot_cpus(void) +int freeze_secondary_cpus(int primary) { - int cpu, first_cpu, error = 0; + int cpu, error = 0; cpu_maps_update_begin(); - first_cpu = cpumask_first(cpu_online_mask); + if (!cpu_online(primary)) + primary = cpumask_first(cpu_online_mask); /* * We take down all of the non-boot CPUs in one shot to avoid races * with the userspace trying to use the CPU hotplug at the same time @@ -1038,7 +1114,7 @@ int disable_nonboot_cpus(void) pr_info("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { - if (cpu == first_cpu) + if (cpu == primary) continue; trace_suspend_resume(TPS("CPU_OFF"), cpu, true); error = _cpu_down(cpu, 1, CPUHP_OFFLINE); @@ -1081,7 +1157,7 @@ void enable_nonboot_cpus(void) /* Allow everyone to use the CPU hotplug again */ cpu_maps_update_begin(); - WARN_ON(--cpu_hotplug_disabled < 0); + __cpu_hotplug_enable(); if (cpumask_empty(frozen_cpus)) goto out; @@ -1170,40 +1246,50 @@ core_initcall(cpu_hotplug_pm_sync_init); static struct cpuhp_step cpuhp_bp_states[] = { [CPUHP_OFFLINE] = { .name = "offline", - .startup = NULL, - .teardown = NULL, + .startup.single = NULL, + .teardown.single = NULL, }, #ifdef CONFIG_SMP [CPUHP_CREATE_THREADS]= { - .name = "threads:create", - .startup = smpboot_create_threads, - .teardown = NULL, + .name = "threads:prepare", + .startup.single = smpboot_create_threads, + .teardown.single = NULL, .cant_stop = true, }, [CPUHP_PERF_PREPARE] = { - .name = "perf prepare", - .startup = perf_event_init_cpu, - .teardown = perf_event_exit_cpu, + .name = "perf:prepare", + .startup.single = perf_event_init_cpu, + .teardown.single = perf_event_exit_cpu, }, [CPUHP_WORKQUEUE_PREP] = { - .name = "workqueue prepare", - .startup = workqueue_prepare_cpu, - .teardown = NULL, + .name = "workqueue:prepare", + .startup.single = workqueue_prepare_cpu, + .teardown.single = NULL, }, [CPUHP_HRTIMERS_PREPARE] = { - .name = "hrtimers prepare", - .startup = hrtimers_prepare_cpu, - .teardown = hrtimers_dead_cpu, + .name = "hrtimers:prepare", + .startup.single = hrtimers_prepare_cpu, + .teardown.single = hrtimers_dead_cpu, }, [CPUHP_SMPCFD_PREPARE] = { - .name = "SMPCFD prepare", - .startup = smpcfd_prepare_cpu, - .teardown = smpcfd_dead_cpu, + .name = "smpcfd:prepare", + .startup.single = smpcfd_prepare_cpu, + .teardown.single = smpcfd_dead_cpu, + }, + [CPUHP_RELAY_PREPARE] = { + .name = "relay:prepare", + .startup.single = relay_prepare_cpu, + .teardown.single = NULL, + }, + [CPUHP_SLAB_PREPARE] = { + .name = "slab:prepare", + .startup.single = slab_prepare_cpu, + .teardown.single = slab_dead_cpu, }, [CPUHP_RCUTREE_PREP] = { - .name = "RCU-tree prepare", - .startup = rcutree_prepare_cpu, - .teardown = rcutree_dead_cpu, + .name = "RCU/tree:prepare", + .startup.single = rcutree_prepare_cpu, + .teardown.single = rcutree_dead_cpu, }, /* * Preparatory and dead notifiers. Will be replaced once the notifiers @@ -1211,8 +1297,8 @@ static struct cpuhp_step cpuhp_bp_states[] = { */ [CPUHP_NOTIFY_PREPARE] = { .name = "notify:prepare", - .startup = notify_prepare, - .teardown = notify_dead, + .startup.single = notify_prepare, + .teardown.single = notify_dead, .skip_onerr = true, .cant_stop = true, }, @@ -1222,20 +1308,21 @@ static struct cpuhp_step cpuhp_bp_states[] = { * otherwise a RCU stall occurs. */ [CPUHP_TIMERS_DEAD] = { - .name = "timers dead", - .startup = NULL, - .teardown = timers_dead_cpu, + .name = "timers:dead", + .startup.single = NULL, + .teardown.single = timers_dead_cpu, }, /* Kicks the plugged cpu into life */ [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", - .startup = bringup_cpu, - .teardown = NULL, + .startup.single = bringup_cpu, + .teardown.single = NULL, .cant_stop = true, }, [CPUHP_AP_SMPCFD_DYING] = { - .startup = NULL, - .teardown = smpcfd_dying_cpu, + .name = "smpcfd:dying", + .startup.single = NULL, + .teardown.single = smpcfd_dying_cpu, }, /* * Handled on controll processor until the plugged processor manages @@ -1243,8 +1330,8 @@ static struct cpuhp_step cpuhp_bp_states[] = { */ [CPUHP_TEARDOWN_CPU] = { .name = "cpu:teardown", - .startup = NULL, - .teardown = takedown_cpu, + .startup.single = NULL, + .teardown.single = takedown_cpu, .cant_stop = true, }, #else @@ -1270,24 +1357,13 @@ static struct cpuhp_step cpuhp_ap_states[] = { /* First state is scheduler control. Interrupts are disabled */ [CPUHP_AP_SCHED_STARTING] = { .name = "sched:starting", - .startup = sched_cpu_starting, - .teardown = sched_cpu_dying, + .startup.single = sched_cpu_starting, + .teardown.single = sched_cpu_dying, }, [CPUHP_AP_RCUTREE_DYING] = { - .startup = NULL, - .teardown = rcutree_dying_cpu, - }, - /* - * Low level startup/teardown notifiers. Run with interrupts - * disabled. Will be removed once the notifiers are converted to - * states. - */ - [CPUHP_AP_NOTIFY_STARTING] = { - .name = "notify:starting", - .startup = notify_starting, - .teardown = notify_dying, - .skip_onerr = true, - .cant_stop = true, + .name = "RCU/tree:dying", + .startup.single = NULL, + .teardown.single = rcutree_dying_cpu, }, /* Entry state on starting. Interrupts enabled from here on. Transient * state for synchronsization */ @@ -1296,24 +1372,24 @@ static struct cpuhp_step cpuhp_ap_states[] = { }, /* Handle smpboot threads park/unpark */ [CPUHP_AP_SMPBOOT_THREADS] = { - .name = "smpboot:threads", - .startup = smpboot_unpark_threads, - .teardown = NULL, + .name = "smpboot/threads:online", + .startup.single = smpboot_unpark_threads, + .teardown.single = NULL, }, [CPUHP_AP_PERF_ONLINE] = { - .name = "perf online", - .startup = perf_event_init_cpu, - .teardown = perf_event_exit_cpu, + .name = "perf:online", + .startup.single = perf_event_init_cpu, + .teardown.single = perf_event_exit_cpu, }, [CPUHP_AP_WORKQUEUE_ONLINE] = { - .name = "workqueue online", - .startup = workqueue_online_cpu, - .teardown = workqueue_offline_cpu, + .name = "workqueue:online", + .startup.single = workqueue_online_cpu, + .teardown.single = workqueue_offline_cpu, }, [CPUHP_AP_RCUTREE_ONLINE] = { - .name = "RCU-tree online", - .startup = rcutree_online_cpu, - .teardown = rcutree_offline_cpu, + .name = "RCU/tree:online", + .startup.single = rcutree_online_cpu, + .teardown.single = rcutree_offline_cpu, }, /* @@ -1322,8 +1398,8 @@ static struct cpuhp_step cpuhp_ap_states[] = { */ [CPUHP_AP_NOTIFY_ONLINE] = { .name = "notify:online", - .startup = notify_online, - .teardown = notify_down_prepare, + .startup.single = notify_online, + .teardown.single = notify_down_prepare, .skip_onerr = true, }, #endif @@ -1335,16 +1411,16 @@ static struct cpuhp_step cpuhp_ap_states[] = { /* Last state is scheduler control setting the cpu active */ [CPUHP_AP_ACTIVE] = { .name = "sched:active", - .startup = sched_cpu_activate, - .teardown = sched_cpu_deactivate, + .startup.single = sched_cpu_activate, + .teardown.single = sched_cpu_deactivate, }, #endif /* CPU is fully up and running. */ [CPUHP_ONLINE] = { .name = "online", - .startup = NULL, - .teardown = NULL, + .startup.single = NULL, + .teardown.single = NULL, }, }; @@ -1356,54 +1432,42 @@ static int cpuhp_cb_check(enum cpuhp_state state) return 0; } -static bool cpuhp_is_ap_state(enum cpuhp_state state) -{ - /* - * The extra check for CPUHP_TEARDOWN_CPU is only for documentation - * purposes as that state is handled explicitely in cpu_down. - */ - return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; -} - -static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) -{ - struct cpuhp_step *sp; - - sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states; - return sp + state; -} - static void cpuhp_store_callbacks(enum cpuhp_state state, const char *name, int (*startup)(unsigned int cpu), - int (*teardown)(unsigned int cpu)) + int (*teardown)(unsigned int cpu), + bool multi_instance) { /* (Un)Install the callbacks for further cpu hotplug operations */ struct cpuhp_step *sp; mutex_lock(&cpuhp_state_mutex); sp = cpuhp_get_step(state); - sp->startup = startup; - sp->teardown = teardown; + sp->startup.single = startup; + sp->teardown.single = teardown; sp->name = name; + sp->multi_instance = multi_instance; + INIT_HLIST_HEAD(&sp->list); mutex_unlock(&cpuhp_state_mutex); } static void *cpuhp_get_teardown_cb(enum cpuhp_state state) { - return cpuhp_get_step(state)->teardown; + return cpuhp_get_step(state)->teardown.single; } /* * Call the startup/teardown function for a step either on the AP or * on the current CPU. */ -static int cpuhp_issue_call(int cpu, enum cpuhp_state state, - int (*cb)(unsigned int), bool bringup) +static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup, + struct hlist_node *node) { + struct cpuhp_step *sp = cpuhp_get_step(state); int ret; - if (!cb) + if ((bringup && !sp->startup.single) || + (!bringup && !sp->teardown.single)) return 0; /* * The non AP bound callbacks can fail on bringup. On teardown @@ -1411,11 +1475,11 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, */ #ifdef CONFIG_SMP if (cpuhp_is_ap_state(state)) - ret = cpuhp_invoke_ap_callback(cpu, state, cb); + ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node); else - ret = cpuhp_invoke_callback(cpu, state, cb); + ret = cpuhp_invoke_callback(cpu, state, bringup, node); #else - ret = cpuhp_invoke_callback(cpu, state, cb); + ret = cpuhp_invoke_callback(cpu, state, bringup, node); #endif BUG_ON(ret && !bringup); return ret; @@ -1427,13 +1491,10 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, * Note: The teardown callbacks for rollback are not allowed to fail! */ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, - int (*teardown)(unsigned int cpu)) + struct hlist_node *node) { int cpu; - if (!teardown) - return; - /* Roll back the already executed steps on the other cpus */ for_each_present_cpu(cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); @@ -1444,7 +1505,7 @@ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, /* Did we invoke the startup call on that cpu ? */ if (cpustate >= state) - cpuhp_issue_call(cpu, state, teardown, false); + cpuhp_issue_call(cpu, state, false, node); } } @@ -1471,6 +1532,52 @@ static int cpuhp_reserve_state(enum cpuhp_state state) return -ENOSPC; } +int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, + bool invoke) +{ + struct cpuhp_step *sp; + int cpu; + int ret; + + sp = cpuhp_get_step(state); + if (sp->multi_instance == false) + return -EINVAL; + + get_online_cpus(); + + if (!invoke || !sp->startup.multi) + goto add_node; + + /* + * Try to call the startup callback for each present cpu + * depending on the hotplug state of the cpu. + */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpustate < state) + continue; + + ret = cpuhp_issue_call(cpu, state, true, node); + if (ret) { + if (sp->teardown.multi) + cpuhp_rollback_install(cpu, state, node); + goto err; + } + } +add_node: + ret = 0; + mutex_lock(&cpuhp_state_mutex); + hlist_add_head(node, &sp->list); + mutex_unlock(&cpuhp_state_mutex); + +err: + put_online_cpus(); + return ret; +} +EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); + /** * __cpuhp_setup_state - Setup the callbacks for an hotplug machine state * @state: The state to setup @@ -1484,7 +1591,8 @@ static int cpuhp_reserve_state(enum cpuhp_state state) int __cpuhp_setup_state(enum cpuhp_state state, const char *name, bool invoke, int (*startup)(unsigned int cpu), - int (*teardown)(unsigned int cpu)) + int (*teardown)(unsigned int cpu), + bool multi_instance) { int cpu, ret = 0; int dyn_state = 0; @@ -1503,7 +1611,7 @@ int __cpuhp_setup_state(enum cpuhp_state state, state = ret; } - cpuhp_store_callbacks(state, name, startup, teardown); + cpuhp_store_callbacks(state, name, startup, teardown, multi_instance); if (!invoke || !startup) goto out; @@ -1519,10 +1627,11 @@ int __cpuhp_setup_state(enum cpuhp_state state, if (cpustate < state) continue; - ret = cpuhp_issue_call(cpu, state, startup, true); + ret = cpuhp_issue_call(cpu, state, true, NULL); if (ret) { - cpuhp_rollback_install(cpu, state, teardown); - cpuhp_store_callbacks(state, NULL, NULL, NULL); + if (teardown) + cpuhp_rollback_install(cpu, state, NULL); + cpuhp_store_callbacks(state, NULL, NULL, NULL, false); goto out; } } @@ -1534,6 +1643,42 @@ out: } EXPORT_SYMBOL(__cpuhp_setup_state); +int __cpuhp_state_remove_instance(enum cpuhp_state state, + struct hlist_node *node, bool invoke) +{ + struct cpuhp_step *sp = cpuhp_get_step(state); + int cpu; + + BUG_ON(cpuhp_cb_check(state)); + + if (!sp->multi_instance) + return -EINVAL; + + get_online_cpus(); + if (!invoke || !cpuhp_get_teardown_cb(state)) + goto remove; + /* + * Call the teardown callback for each present cpu depending + * on the hotplug state of the cpu. This function is not + * allowed to fail currently! + */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpustate >= state) + cpuhp_issue_call(cpu, state, false, node); + } + +remove: + mutex_lock(&cpuhp_state_mutex); + hlist_del(node); + mutex_unlock(&cpuhp_state_mutex); + put_online_cpus(); + + return 0; +} +EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance); /** * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state * @state: The state to remove @@ -1545,14 +1690,21 @@ EXPORT_SYMBOL(__cpuhp_setup_state); */ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) { - int (*teardown)(unsigned int cpu) = cpuhp_get_teardown_cb(state); + struct cpuhp_step *sp = cpuhp_get_step(state); int cpu; BUG_ON(cpuhp_cb_check(state)); get_online_cpus(); - if (!invoke || !teardown) + if (sp->multi_instance) { + WARN(!hlist_empty(&sp->list), + "Error: Removing state %d which has instances left.\n", + state); + goto remove; + } + + if (!invoke || !cpuhp_get_teardown_cb(state)) goto remove; /* @@ -1565,10 +1717,10 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) int cpustate = st->state; if (cpustate >= state) - cpuhp_issue_call(cpu, state, teardown, false); + cpuhp_issue_call(cpu, state, false, NULL); } remove: - cpuhp_store_callbacks(state, NULL, NULL, NULL); + cpuhp_store_callbacks(state, NULL, NULL, NULL, false); put_online_cpus(); } EXPORT_SYMBOL(__cpuhp_remove_state); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 2b4c20ab5bbe..29f815d2ef7e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2715,7 +2715,7 @@ void __cpuset_memory_pressure_bump(void) int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk) { - char *buf, *p; + char *buf; struct cgroup_subsys_state *css; int retval; @@ -2724,14 +2724,15 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, if (!buf) goto out; - retval = -ENAMETOOLONG; css = task_get_css(tsk, cpuset_cgrp_id); - p = cgroup_path_ns(css->cgroup, buf, PATH_MAX, - current->nsproxy->cgroup_ns); + retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); css_put(css); - if (!p) + if (retval >= PATH_MAX) + retval = -ENAMETOOLONG; + if (retval < 0) goto out_free; - seq_puts(m, p); + seq_puts(m, buf); seq_putc(m, '\n'); retval = 0; out_free: diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index fc1ef736253c..98c9011eac78 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -697,7 +697,7 @@ kdb_printit: * Write to all consoles. */ retlen = strlen(kdb_buffer); - cp = (char *) printk_skip_level(kdb_buffer); + cp = (char *) printk_skip_headers(kdb_buffer); if (!dbg_kdb_mode && kgdb_connected) { gdbstub_msg_write(cp, retlen - (cp - kdb_buffer)); } else { diff --git a/kernel/events/core.c b/kernel/events/core.c index fc9bb2225291..faf073d0287f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -902,7 +902,15 @@ list_update_cgroup_event(struct perf_event *event, * this will always be called from the right CPU. */ cpuctx = __get_cpu_context(ctx); - cpuctx->cgrp = add ? event->cgrp : NULL; + + /* + * cpuctx->cgrp is NULL until a cgroup event is sched in or + * ctx->nr_cgroup == 0 . + */ + if (add && perf_cgroup_from_task(current, ctx) == event->cgrp) + cpuctx->cgrp = event->cgrp; + else if (!add) + cpuctx->cgrp = NULL; } #else /* !CONFIG_CGROUP_PERF */ @@ -1475,8 +1483,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) if (event->group_leader == event) { struct list_head *list; - if (is_software_event(event)) - event->group_flags |= PERF_GROUP_SOFTWARE; + event->group_caps = event->event_caps; list = ctx_group_list(event, ctx); list_add_tail(&event->group_entry, list); @@ -1630,9 +1637,7 @@ static void perf_group_attach(struct perf_event *event) WARN_ON_ONCE(group_leader->ctx != event->ctx); - if (group_leader->group_flags & PERF_GROUP_SOFTWARE && - !is_software_event(event)) - group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; + group_leader->group_caps &= event->event_caps; list_add_tail(&event->group_entry, &group_leader->sibling_list); group_leader->nr_siblings++; @@ -1723,7 +1728,7 @@ static void perf_group_detach(struct perf_event *event) sibling->group_leader = sibling; /* Inherit group flags from the previous leader */ - sibling->group_flags = event->group_flags; + sibling->group_caps = event->group_caps; WARN_ON_ONCE(sibling->ctx != event->ctx); } @@ -1832,6 +1837,8 @@ group_sched_out(struct perf_event *group_event, struct perf_event *event; int state = group_event->state; + perf_pmu_disable(ctx->pmu); + event_sched_out(group_event, cpuctx, ctx); /* @@ -1840,6 +1847,8 @@ group_sched_out(struct perf_event *group_event, list_for_each_entry(event, &group_event->sibling_list, group_entry) event_sched_out(event, cpuctx, ctx); + perf_pmu_enable(ctx->pmu); + if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) cpuctx->exclusive = 0; } @@ -1959,6 +1968,12 @@ void perf_event_disable(struct perf_event *event) } EXPORT_SYMBOL_GPL(perf_event_disable); +void perf_event_disable_inatomic(struct perf_event *event) +{ + event->pending_disable = 1; + irq_work_queue(&event->pending); +} + static void perf_set_shadow_time(struct perf_event *event, struct perf_event_context *ctx, u64 tstamp) @@ -2145,7 +2160,7 @@ static int group_can_go_on(struct perf_event *event, /* * Groups consisting entirely of software events can always go on. */ - if (event->group_flags & PERF_GROUP_SOFTWARE) + if (event->group_caps & PERF_EV_CAP_SOFTWARE) return 1; /* * If an exclusive group is already on, no other hardware @@ -2491,7 +2506,7 @@ static int __perf_event_stop(void *info) * while restarting. */ if (sd->restart) - event->pmu->start(event, PERF_EF_START); + event->pmu->start(event, 0); return 0; } @@ -2837,19 +2852,36 @@ unlock: } } +static DEFINE_PER_CPU(struct list_head, sched_cb_list); + void perf_sched_cb_dec(struct pmu *pmu) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + this_cpu_dec(perf_sched_cb_usages); + + if (!--cpuctx->sched_cb_usage) + list_del(&cpuctx->sched_cb_entry); } + void perf_sched_cb_inc(struct pmu *pmu) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + if (!cpuctx->sched_cb_usage++) + list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); + this_cpu_inc(perf_sched_cb_usages); } /* * This function provides the context switch callback to the lower code * layer. It is invoked ONLY when the context switch callback is enabled. + * + * This callback is relevant even to per-cpu events; for example multi event + * PEBS requires this to provide PID/TID information. This requires we flush + * all queued PEBS records before we context switch to a new task. */ static void perf_pmu_sched_task(struct task_struct *prev, struct task_struct *next, @@ -2857,34 +2889,24 @@ static void perf_pmu_sched_task(struct task_struct *prev, { struct perf_cpu_context *cpuctx; struct pmu *pmu; - unsigned long flags; if (prev == next) return; - local_irq_save(flags); - - rcu_read_lock(); - - list_for_each_entry_rcu(pmu, &pmus, entry) { - if (pmu->sched_task) { - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - - perf_ctx_lock(cpuctx, cpuctx->task_ctx); + list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { + pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */ - perf_pmu_disable(pmu); + if (WARN_ON_ONCE(!pmu->sched_task)) + continue; - pmu->sched_task(cpuctx->task_ctx, sched_in); + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + perf_pmu_disable(pmu); - perf_pmu_enable(pmu); + pmu->sched_task(cpuctx->task_ctx, sched_in); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); - } + perf_pmu_enable(pmu); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } - - rcu_read_unlock(); - - local_irq_restore(flags); } static void perf_event_switch(struct task_struct *task, @@ -3416,6 +3438,22 @@ struct perf_read_data { int ret; }; +static int find_cpu_to_read(struct perf_event *event, int local_cpu) +{ + int event_cpu = event->oncpu; + u16 local_pkg, event_pkg; + + if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { + event_pkg = topology_physical_package_id(event_cpu); + local_pkg = topology_physical_package_id(local_cpu); + + if (event_pkg == local_pkg) + return local_cpu; + } + + return event_cpu; +} + /* * Cross CPU call to read the hardware event */ @@ -3537,7 +3575,7 @@ u64 perf_event_read_local(struct perf_event *event) static int perf_event_read(struct perf_event *event, bool group) { - int ret = 0; + int ret = 0, cpu_to_read, local_cpu; /* * If event is enabled and currently active on a CPU, update the @@ -3549,6 +3587,11 @@ static int perf_event_read(struct perf_event *event, bool group) .group = group, .ret = 0, }; + + local_cpu = get_cpu(); + cpu_to_read = find_cpu_to_read(event, local_cpu); + put_cpu(); + /* * Purposely ignore the smp_call_function_single() return * value. @@ -3559,7 +3602,7 @@ static int perf_event_read(struct perf_event *event, bool group) * Therefore, either way, we'll have an up-to-date event count * after this. */ - (void)smp_call_function_single(event->oncpu, __perf_event_read, &data, 1); + (void)smp_call_function_single(cpu_to_read, __perf_event_read, &data, 1); ret = data.ret; } else if (event->state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; @@ -5350,9 +5393,10 @@ perf_output_sample_regs(struct perf_output_handle *handle, struct pt_regs *regs, u64 mask) { int bit; + DECLARE_BITMAP(_mask, 64); - for_each_set_bit(bit, (const unsigned long *) &mask, - sizeof(mask) * BITS_PER_BYTE) { + bitmap_from_u64(_mask, mask); + for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) { u64 val; val = perf_reg_value(regs, bit); @@ -7045,11 +7089,11 @@ static int __perf_event_overflow(struct perf_event *event, if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; - event->pending_disable = 1; - irq_work_queue(&event->pending); + + perf_event_disable_inatomic(event); } - event->overflow_handler(event, data, regs); + READ_ONCE(event->overflow_handler)(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; @@ -7664,11 +7708,83 @@ static void perf_event_free_filter(struct perf_event *event) ftrace_profile_free_filter(event); } +#ifdef CONFIG_BPF_SYSCALL +static void bpf_overflow_handler(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct bpf_perf_event_data_kern ctx = { + .data = data, + .regs = regs, + }; + int ret = 0; + + preempt_disable(); + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) + goto out; + rcu_read_lock(); + ret = BPF_PROG_RUN(event->prog, &ctx); + rcu_read_unlock(); +out: + __this_cpu_dec(bpf_prog_active); + preempt_enable(); + if (!ret) + return; + + event->orig_overflow_handler(event, data, regs); +} + +static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) +{ + struct bpf_prog *prog; + + if (event->overflow_handler_context) + /* hw breakpoint or kernel counter */ + return -EINVAL; + + if (event->prog) + return -EEXIST; + + prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + event->prog = prog; + event->orig_overflow_handler = READ_ONCE(event->overflow_handler); + WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); + return 0; +} + +static void perf_event_free_bpf_handler(struct perf_event *event) +{ + struct bpf_prog *prog = event->prog; + + if (!prog) + return; + + WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler); + event->prog = NULL; + bpf_prog_put(prog); +} +#else +static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) +{ + return -EOPNOTSUPP; +} +static void perf_event_free_bpf_handler(struct perf_event *event) +{ +} +#endif + static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { bool is_kprobe, is_tracepoint; struct bpf_prog *prog; + if (event->attr.type == PERF_TYPE_HARDWARE || + event->attr.type == PERF_TYPE_SOFTWARE) + return perf_event_set_bpf_handler(event, prog_fd); + if (event->attr.type != PERF_TYPE_TRACEPOINT) return -EINVAL; @@ -7709,6 +7825,8 @@ static void perf_event_free_bpf_prog(struct perf_event *event) { struct bpf_prog *prog; + perf_event_free_bpf_handler(event); + if (!event->tp_event) return; @@ -7908,6 +8026,7 @@ restart: * if <size> is not specified, the range is treated as a single address. */ enum { + IF_ACT_NONE = -1, IF_ACT_FILTER, IF_ACT_START, IF_ACT_STOP, @@ -7931,6 +8050,7 @@ static const match_table_t if_tokens = { { IF_SRC_KERNEL, "%u/%u" }, { IF_SRC_FILEADDR, "%u@%s" }, { IF_SRC_KERNELADDR, "%u" }, + { IF_ACT_NONE, NULL }, }; /* @@ -8751,7 +8871,10 @@ EXPORT_SYMBOL_GPL(perf_pmu_register); void perf_pmu_unregister(struct pmu *pmu) { + int remove_device; + mutex_lock(&pmus_lock); + remove_device = pmu_bus_running; list_del_rcu(&pmu->entry); mutex_unlock(&pmus_lock); @@ -8765,10 +8888,12 @@ void perf_pmu_unregister(struct pmu *pmu) free_percpu(pmu->pmu_disable_count); if (pmu->type >= PERF_TYPE_MAX) idr_remove(&pmu_idr, pmu->type); - if (pmu->nr_addr_filters) - device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); - device_del(pmu->dev); - put_device(pmu->dev); + if (remove_device) { + if (pmu->nr_addr_filters) + device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); + device_del(pmu->dev); + put_device(pmu->dev); + } free_pmu_context(pmu); } EXPORT_SYMBOL_GPL(perf_pmu_unregister); @@ -9025,6 +9150,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (!overflow_handler && parent_event) { overflow_handler = parent_event->overflow_handler; context = parent_event->overflow_handler_context; +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) + if (overflow_handler == bpf_overflow_handler) { + struct bpf_prog *prog = bpf_prog_inc(parent_event->prog); + + if (IS_ERR(prog)) { + err = PTR_ERR(prog); + goto err_ns; + } + event->prog = prog; + event->orig_overflow_handler = + parent_event->orig_overflow_handler; + } +#endif } if (overflow_handler) { @@ -9505,6 +9643,9 @@ SYSCALL_DEFINE5(perf_event_open, goto err_alloc; } + if (pmu->task_ctx_nr == perf_sw_context) + event->event_caps |= PERF_EV_CAP_SOFTWARE; + if (group_leader && (is_software_event(event) != is_software_event(group_leader))) { if (is_software_event(event)) { @@ -9518,7 +9659,7 @@ SYSCALL_DEFINE5(perf_event_open, */ pmu = group_leader->pmu; } else if (is_software_event(group_leader) && - (group_leader->group_flags & PERF_GROUP_SOFTWARE)) { + (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { /* * In case the group is a pure software group, and we * try to add a hardware event, move the whole group to @@ -10453,6 +10594,8 @@ static void __init perf_event_init_all_cpus(void) INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); + + INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); } } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8c50276b60d1..f9ec9add2164 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -150,7 +150,7 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) * Returns 0 on success, -EFAULT on failure. */ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, - struct page *page, struct page *kpage) + struct page *old_page, struct page *new_page) { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; @@ -161,49 +161,49 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, const unsigned long mmun_end = addr + PAGE_SIZE; struct mem_cgroup *memcg; - err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg, + err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg, false); if (err) return err; /* For try_to_free_swap() and munlock_vma_page() below */ - lock_page(page); + lock_page(old_page); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); err = -EAGAIN; - ptep = page_check_address(page, mm, addr, &ptl, 0); + ptep = page_check_address(old_page, mm, addr, &ptl, 0); if (!ptep) { - mem_cgroup_cancel_charge(kpage, memcg, false); + mem_cgroup_cancel_charge(new_page, memcg, false); goto unlock; } - get_page(kpage); - page_add_new_anon_rmap(kpage, vma, addr, false); - mem_cgroup_commit_charge(kpage, memcg, false, false); - lru_cache_add_active_or_unevictable(kpage, vma); + get_page(new_page); + page_add_new_anon_rmap(new_page, vma, addr, false); + mem_cgroup_commit_charge(new_page, memcg, false, false); + lru_cache_add_active_or_unevictable(new_page, vma); - if (!PageAnon(page)) { - dec_mm_counter(mm, mm_counter_file(page)); + if (!PageAnon(old_page)) { + dec_mm_counter(mm, mm_counter_file(old_page)); inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, addr, pte_pfn(*ptep)); ptep_clear_flush_notify(vma, addr, ptep); - set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); + set_pte_at_notify(mm, addr, ptep, mk_pte(new_page, vma->vm_page_prot)); - page_remove_rmap(page, false); - if (!page_mapped(page)) - try_to_free_swap(page); + page_remove_rmap(old_page, false); + if (!page_mapped(old_page)) + try_to_free_swap(old_page); pte_unmap_unlock(ptep, ptl); if (vma->vm_flags & VM_LOCKED) - munlock_vma_page(page); - put_page(page); + munlock_vma_page(old_page); + put_page(old_page); err = 0; unlock: mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - unlock_page(page); + unlock_page(old_page); return err; } @@ -300,7 +300,8 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, retry: /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); + ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, + &vma); if (ret <= 0) return ret; @@ -1710,7 +1711,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) * but we treat this as a 'remote' access since it is * essentially a kernel access to the memory. */ - result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL); + result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, + NULL); if (result < 0) return result; diff --git a/kernel/exit.c b/kernel/exit.c index 091a78be3b09..aacff8e2aec0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -54,6 +54,7 @@ #include <linux/writeback.h> #include <linux/shm.h> #include <linux/kcov.h> +#include <linux/random.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -91,11 +92,10 @@ static void __exit_signal(struct task_struct *tsk) lockdep_tasklist_lock_is_held()); spin_lock(&sighand->siglock); +#ifdef CONFIG_POSIX_TIMERS posix_cpu_timers_exit(tsk); if (group_dead) { posix_cpu_timers_exit_group(tsk); - tty = sig->tty; - sig->tty = NULL; } else { /* * This can only happen if the caller is de_thread(). @@ -104,7 +104,13 @@ static void __exit_signal(struct task_struct *tsk) */ if (unlikely(has_group_leader_pid(tsk))) posix_cpu_timers_exit_group(tsk); + } +#endif + if (group_dead) { + tty = sig->tty; + sig->tty = NULL; + } else { /* * If there is any task waiting for the group exit * then notify it: @@ -116,6 +122,9 @@ static void __exit_signal(struct task_struct *tsk) sig->curr_target = next_thread(tsk); } + add_device_randomness((const void*) &tsk->se.sum_exec_runtime, + sizeof(unsigned long long)); + /* * Accumulate here the counters for all threads as they die. We could * skip the group leader because it is the last user of signal_struct, @@ -511,7 +520,7 @@ static void exit_mm(struct task_struct *tsk) mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) - exit_oom_victim(tsk); + exit_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) @@ -725,7 +734,7 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif -void do_exit(long code) +void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; @@ -799,8 +808,10 @@ void do_exit(long code) acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { +#ifdef CONFIG_POSIX_TIMERS hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); +#endif if (tsk->mm) setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); } @@ -836,6 +847,7 @@ void do_exit(long code) */ perf_event_exit_task(tsk); + sched_autogroup_exit_task(tsk); cgroup_exit(tsk); /* @@ -882,29 +894,7 @@ void do_exit(long code) exit_rcu(); TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); - /* - * The setting of TASK_RUNNING by try_to_wake_up() may be delayed - * when the following two conditions become true. - * - There is race condition of mmap_sem (It is acquired by - * exit_mm()), and - * - SMI occurs before setting TASK_RUNINNG. - * (or hypervisor of virtual machine switches to other guest) - * As a result, we may become TASK_RUNNING after becoming TASK_DEAD - * - * To avoid it, we have to wait for releasing tsk->pi_lock which - * is held by try_to_wake_up() - */ - smp_mb(); - raw_spin_unlock_wait(&tsk->pi_lock); - - /* causes final put_task_struct in finish_task_switch(). */ - tsk->state = TASK_DEAD; - tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ - schedule(); - BUG(); - /* Avoid "noreturn function does return". */ - for (;;) - cpu_relax(); /* For when BUG is null */ + do_task_dead(); } EXPORT_SYMBOL_GPL(do_exit); diff --git a/kernel/fork.c b/kernel/fork.c index beb31725f7e2..a439ac429669 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -158,19 +158,83 @@ void __weak arch_release_thread_stack(unsigned long *stack) * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a * kmemcache based allocator. */ -# if THREAD_SIZE >= PAGE_SIZE -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, - int node) +# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) + +#ifdef CONFIG_VMAP_STACK +/* + * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB + * flush. Try to minimize the number of calls by caching stacks. + */ +#define NR_CACHED_STACKS 2 +static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); +#endif + +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { +#ifdef CONFIG_VMAP_STACK + void *stack; + int i; + + local_irq_disable(); + for (i = 0; i < NR_CACHED_STACKS; i++) { + struct vm_struct *s = this_cpu_read(cached_stacks[i]); + + if (!s) + continue; + this_cpu_write(cached_stacks[i], NULL); + + tsk->stack_vm_area = s; + local_irq_enable(); + return s->addr; + } + local_irq_enable(); + + stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, + VMALLOC_START, VMALLOC_END, + THREADINFO_GFP | __GFP_HIGHMEM, + PAGE_KERNEL, + 0, node, __builtin_return_address(0)); + + /* + * We can't call find_vm_area() in interrupt context, and + * free_thread_stack() can be called in interrupt context, + * so cache the vm_struct. + */ + if (stack) + tsk->stack_vm_area = find_vm_area(stack); + return stack; +#else struct page *page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); return page ? page_address(page) : NULL; +#endif } -static inline void free_thread_stack(unsigned long *stack) +static inline void free_thread_stack(struct task_struct *tsk) { - __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER); +#ifdef CONFIG_VMAP_STACK + if (task_stack_vm_area(tsk)) { + unsigned long flags; + int i; + + local_irq_save(flags); + for (i = 0; i < NR_CACHED_STACKS; i++) { + if (this_cpu_read(cached_stacks[i])) + continue; + + this_cpu_write(cached_stacks[i], tsk->stack_vm_area); + local_irq_restore(flags); + return; + } + local_irq_restore(flags); + + vfree_atomic(tsk->stack); + return; + } +#endif + + __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_stack_cache; @@ -181,9 +245,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); } -static void free_thread_stack(unsigned long *stack) +static void free_thread_stack(struct task_struct *tsk) { - kmem_cache_free(thread_stack_cache, stack); + kmem_cache_free(thread_stack_cache, tsk->stack); } void thread_stack_cache_init(void) @@ -213,28 +277,85 @@ struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -static void account_kernel_stack(unsigned long *stack, int account) +static void account_kernel_stack(struct task_struct *tsk, int account) +{ + void *stack = task_stack_page(tsk); + struct vm_struct *vm = task_stack_vm_area(tsk); + + BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); + + if (vm) { + int i; + + BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); + + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { + mod_zone_page_state(page_zone(vm->pages[i]), + NR_KERNEL_STACK_KB, + PAGE_SIZE / 1024 * account); + } + + /* All stack pages belong to the same memcg. */ + memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); + } else { + /* + * All stack pages are in the same zone and belong to the + * same memcg. + */ + struct page *first_page = virt_to_page(stack); + + mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, + THREAD_SIZE / 1024 * account); + + memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); + } +} + +static void release_task_stack(struct task_struct *tsk) { - /* All stack pages are in the same zone and belong to the same memcg. */ - struct page *first_page = virt_to_page(stack); + if (WARN_ON(tsk->state != TASK_DEAD)) + return; /* Better to leak the stack than to free prematurely */ - mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, - THREAD_SIZE / 1024 * account); + account_kernel_stack(tsk, -1); + arch_release_thread_stack(tsk->stack); + free_thread_stack(tsk); + tsk->stack = NULL; +#ifdef CONFIG_VMAP_STACK + tsk->stack_vm_area = NULL; +#endif +} - memcg_kmem_update_page_stat( - first_page, MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); +#ifdef CONFIG_THREAD_INFO_IN_TASK +void put_task_stack(struct task_struct *tsk) +{ + if (atomic_dec_and_test(&tsk->stack_refcount)) + release_task_stack(tsk); } +#endif void free_task(struct task_struct *tsk) { - account_kernel_stack(tsk->stack, -1); - arch_release_thread_stack(tsk->stack); - free_thread_stack(tsk->stack); +#ifndef CONFIG_THREAD_INFO_IN_TASK + /* + * The task is finally done with both the stack and thread_info, + * so free both. + */ + release_task_stack(tsk); +#else + /* + * If the task had a separate stack allocation, it should be gone + * by now. + */ + WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0); +#endif rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); put_seccomp_filter(tsk); arch_release_task_struct(tsk); + if (tsk->flags & PF_KTHREAD) + free_kthread_struct(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -243,6 +364,12 @@ static inline void free_signal_struct(struct signal_struct *sig) { taskstats_tgid_free(sig); sched_autogroup_exit(sig); + /* + * __mmdrop is not safe to call from softirq context on x86 due to + * pgd_dtor so postpone it to the async context + */ + if (sig->oom_mm) + mmdrop_async(sig->oom_mm); kmem_cache_free(signal_cachep, sig); } @@ -302,6 +429,7 @@ int arch_task_struct_size __read_mostly; void __init fork_init(void) { + int i; #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES @@ -321,6 +449,10 @@ void __init fork_init(void) init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; + + for (i = 0; i < UCOUNT_COUNTS; i++) { + init_user_ns.ucount_max[i] = max_threads/2; + } } int __weak arch_dup_task_struct(struct task_struct *dst, @@ -342,6 +474,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; unsigned long *stack; + struct vm_struct *stack_vm_area; int err; if (node == NUMA_NO_NODE) @@ -354,11 +487,26 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (!stack) goto free_tsk; + stack_vm_area = task_stack_vm_area(tsk); + err = arch_dup_task_struct(tsk, orig); + + /* + * arch_dup_task_struct() clobbers the stack-related fields. Make + * sure they're properly initialized before using any stack-related + * functions again. + */ + tsk->stack = stack; +#ifdef CONFIG_VMAP_STACK + tsk->stack_vm_area = stack_vm_area; +#endif +#ifdef CONFIG_THREAD_INFO_IN_TASK + atomic_set(&tsk->stack_refcount, 1); +#endif + if (err) goto free_stack; - tsk->stack = stack; #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under @@ -390,21 +538,22 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->task_frag.page = NULL; tsk->wake_q.next = NULL; - account_kernel_stack(stack, 1); + account_kernel_stack(tsk, 1); kcov_task_init(tsk); return tsk; free_stack: - free_thread_stack(stack); + free_thread_stack(tsk); free_tsk: free_task_struct(tsk); return NULL; } #ifdef CONFIG_MMU -static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +static __latent_entropy int dup_mmap(struct mm_struct *mm, + struct mm_struct *oldmm) { struct vm_area_struct *mpnt, *tmp, *prev, **pprev; struct rb_node **rb_link, *rb_parent; @@ -711,6 +860,7 @@ static inline void __mmput(struct mm_struct *mm) ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); + mm_put_huge_zero_page(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); @@ -719,6 +869,7 @@ static inline void __mmput(struct mm_struct *mm) } if (mm->binfmt) module_put(mm->binfmt->module); + set_bit(MMF_OOM_SKIP, &mm->flags); mmdrop(mm); } @@ -1196,8 +1347,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) seqlock_init(&sig->stats_lock); prev_cputime_init(&sig->prev_cputime); +#ifdef CONFIG_POSIX_TIMERS hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; +#endif task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); @@ -1296,7 +1449,8 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller. */ -static struct task_struct *copy_process(unsigned long clone_flags, +static __latent_entropy struct task_struct *copy_process( + unsigned long clone_flags, unsigned long stack_start, unsigned long stack_size, int __user *child_tidptr, @@ -1390,7 +1544,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ - p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); + p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE); p->flags |= PF_FORKNOEXEC; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); @@ -1401,7 +1555,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, init_sigpending(&p->pending); p->utime = p->stime = p->gtime = 0; +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME p->utimescaled = p->stimescaled = 0; +#endif prev_cputime_init(&p->prev_cputime); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN @@ -1715,6 +1871,8 @@ bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); exit_creds(p); bad_fork_free: + p->state = TASK_DEAD; + put_task_stack(p); free_task(p); fork_out: return ERR_PTR(retval); @@ -1780,6 +1938,7 @@ long _do_fork(unsigned long clone_flags, p = copy_process(clone_flags, stack_start, stack_size, child_tidptr, NULL, trace, tls, NUMA_NO_NODE); + add_latent_entropy(); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. diff --git a/kernel/futex.c b/kernel/futex.c index 46cb3a301bc1..9246d9f593d1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -381,8 +381,12 @@ static inline int hb_waiters_pending(struct futex_hash_bucket *hb) #endif } -/* - * We hash on the keys returned from get_futex_key (see below). +/** + * hash_futex - Return the hash bucket in the global hash + * @key: Pointer to the futex key for which the hash is calculated + * + * We hash on the keys returned from get_futex_key (see below) and return the + * corresponding hash bucket in the global hash. */ static struct futex_hash_bucket *hash_futex(union futex_key *key) { @@ -392,7 +396,12 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key) return &futex_queues[hash & (futex_hashsize - 1)]; } -/* + +/** + * match_futex - Check whether two futex keys are equal + * @key1: Pointer to key1 + * @key2: Pointer to key2 + * * Return 1 if two futex_keys are equal, 0 otherwise. */ static inline int match_futex(union futex_key *key1, union futex_key *key2) @@ -1289,7 +1298,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, struct task_struct *new_owner; struct futex_pi_state *pi_state = this->pi_state; u32 uninitialized_var(curval), newval; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); bool deboost; int ret = 0; @@ -1406,7 +1415,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) struct futex_q *this, *next; union futex_key key = FUTEX_KEY_INIT; int ret; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); if (!bitset) return -EINVAL; @@ -1460,7 +1469,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; int ret, op_ret; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); retry: ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); @@ -1699,7 +1708,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, struct futex_pi_state *pi_state = NULL; struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); if (requeue_pi) { /* diff --git a/kernel/groups.c b/kernel/groups.c index 74d431d25251..2fcadd66a8fd 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -7,55 +7,31 @@ #include <linux/security.h> #include <linux/syscalls.h> #include <linux/user_namespace.h> +#include <linux/vmalloc.h> #include <asm/uaccess.h> struct group_info *groups_alloc(int gidsetsize) { - struct group_info *group_info; - int nblocks; - int i; - - nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK; - /* Make sure we always allocate at least one indirect block pointer */ - nblocks = nblocks ? : 1; - group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER); - if (!group_info) + struct group_info *gi; + unsigned int len; + + len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; + gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); + if (!gi) + gi = __vmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_HIGHMEM, PAGE_KERNEL); + if (!gi) return NULL; - group_info->ngroups = gidsetsize; - group_info->nblocks = nblocks; - atomic_set(&group_info->usage, 1); - - if (gidsetsize <= NGROUPS_SMALL) - group_info->blocks[0] = group_info->small_block; - else { - for (i = 0; i < nblocks; i++) { - kgid_t *b; - b = (void *)__get_free_page(GFP_USER); - if (!b) - goto out_undo_partial_alloc; - group_info->blocks[i] = b; - } - } - return group_info; -out_undo_partial_alloc: - while (--i >= 0) { - free_page((unsigned long)group_info->blocks[i]); - } - kfree(group_info); - return NULL; + atomic_set(&gi->usage, 1); + gi->ngroups = gidsetsize; + return gi; } EXPORT_SYMBOL(groups_alloc); void groups_free(struct group_info *group_info) { - if (group_info->blocks[0] != group_info->small_block) { - int i; - for (i = 0; i < group_info->nblocks; i++) - free_page((unsigned long)group_info->blocks[i]); - } - kfree(group_info); + kvfree(group_info); } EXPORT_SYMBOL(groups_free); @@ -70,7 +46,7 @@ static int groups_to_user(gid_t __user *grouplist, for (i = 0; i < count; i++) { gid_t gid; - gid = from_kgid_munged(user_ns, GROUP_AT(group_info, i)); + gid = from_kgid_munged(user_ns, group_info->gid[i]); if (put_user(gid, grouplist+i)) return -EFAULT; } @@ -95,7 +71,7 @@ static int groups_from_user(struct group_info *group_info, if (!gid_valid(kgid)) return -EINVAL; - GROUP_AT(group_info, i) = kgid; + group_info->gid[i] = kgid; } return 0; } @@ -115,15 +91,14 @@ static void groups_sort(struct group_info *group_info) for (base = 0; base < max; base++) { int left = base; int right = left + stride; - kgid_t tmp = GROUP_AT(group_info, right); + kgid_t tmp = group_info->gid[right]; - while (left >= 0 && gid_gt(GROUP_AT(group_info, left), tmp)) { - GROUP_AT(group_info, right) = - GROUP_AT(group_info, left); + while (left >= 0 && gid_gt(group_info->gid[left], tmp)) { + group_info->gid[right] = group_info->gid[left]; right = left; left -= stride; } - GROUP_AT(group_info, right) = tmp; + group_info->gid[right] = tmp; } stride /= 3; } @@ -141,9 +116,9 @@ int groups_search(const struct group_info *group_info, kgid_t grp) right = group_info->ngroups; while (left < right) { unsigned int mid = (left+right)/2; - if (gid_gt(grp, GROUP_AT(group_info, mid))) + if (gid_gt(grp, group_info->gid[mid])) left = mid + 1; - else if (gid_lt(grp, GROUP_AT(group_info, mid))) + else if (gid_lt(grp, group_info->gid[mid])) right = mid; else return 1; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index d234022805dc..40c07e4fa116 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -98,26 +98,27 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) trace_sched_process_hang(t); - if (!sysctl_hung_task_warnings) + if (!sysctl_hung_task_warnings && !sysctl_hung_task_panic) return; - if (sysctl_hung_task_warnings > 0) - sysctl_hung_task_warnings--; - /* * Ok, the task did not get scheduled for more than 2 minutes, * complain: */ - pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", - t->comm, t->pid, timeout); - pr_err(" %s %s %.*s\n", - print_tainted(), init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" - " disables this message.\n"); - sched_show_task(t); - debug_show_held_locks(t); + if (sysctl_hung_task_warnings) { + if (sysctl_hung_task_warnings > 0) + sysctl_hung_task_warnings--; + pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", + t->comm, t->pid, timeout); + pr_err(" %s %s %.*s\n", + print_tainted(), init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" + " disables this message.\n"); + sched_show_task(t); + debug_show_all_locks(); + } touch_nmi_watchdog(); diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 32f6cfcff212..9be9bda7c1f9 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -4,60 +4,153 @@ #include <linux/slab.h> #include <linux/cpu.h> -static int get_first_sibling(unsigned int cpu) +static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, + int cpus_per_vec) { - unsigned int ret; + const struct cpumask *siblmsk; + int cpu, sibl; - ret = cpumask_first(topology_sibling_cpumask(cpu)); - if (ret < nr_cpu_ids) - return ret; - return cpu; + for ( ; cpus_per_vec > 0; ) { + cpu = cpumask_first(nmsk); + + /* Should not happen, but I'm too lazy to think about it */ + if (cpu >= nr_cpu_ids) + return; + + cpumask_clear_cpu(cpu, nmsk); + cpumask_set_cpu(cpu, irqmsk); + cpus_per_vec--; + + /* If the cpu has siblings, use them first */ + siblmsk = topology_sibling_cpumask(cpu); + for (sibl = -1; cpus_per_vec > 0; ) { + sibl = cpumask_next(sibl, siblmsk); + if (sibl >= nr_cpu_ids) + break; + if (!cpumask_test_and_clear_cpu(sibl, nmsk)) + continue; + cpumask_set_cpu(sibl, irqmsk); + cpus_per_vec--; + } + } +} + +static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk) +{ + int n, nodes; + + /* Calculate the number of nodes in the supplied affinity mask */ + for (n = 0, nodes = 0; n < num_online_nodes(); n++) { + if (cpumask_intersects(mask, cpumask_of_node(n))) { + node_set(n, *nodemsk); + nodes++; + } + } + return nodes; } -/* - * Take a map of online CPUs and the number of available interrupt vectors - * and generate an output cpumask suitable for spreading MSI/MSI-X vectors - * so that they are distributed as good as possible around the CPUs. If - * more vectors than CPUs are available we'll map one to each CPU, - * otherwise we map one to the first sibling of each socket. +/** + * irq_create_affinity_masks - Create affinity masks for multiqueue spreading + * @nvecs: The total number of vectors + * @affd: Description of the affinity requirements * - * If there are more vectors than CPUs we will still only have one bit - * set per CPU, but interrupt code will keep on assigning the vectors from - * the start of the bitmap until we run out of vectors. + * Returns the masks pointer or NULL if allocation failed. */ -struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) +struct cpumask * +irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) { - struct cpumask *affinity_mask; - unsigned int max_vecs = *nr_vecs; + int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec; + int affv = nvecs - affd->pre_vectors - affd->post_vectors; + int last_affv = affv + affd->pre_vectors; + nodemask_t nodemsk = NODE_MASK_NONE; + struct cpumask *masks; + cpumask_var_t nmsk; - if (max_vecs == 1) + if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return NULL; - affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL); - if (!affinity_mask) { - *nr_vecs = 1; - return NULL; - } + masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); + if (!masks) + goto out; + /* Fill out vectors at the beginning that don't need affinity */ + for (curvec = 0; curvec < affd->pre_vectors; curvec++) + cpumask_copy(masks + curvec, irq_default_affinity); + + /* Stabilize the cpumasks */ get_online_cpus(); - if (max_vecs >= num_online_cpus()) { - cpumask_copy(affinity_mask, cpu_online_mask); - *nr_vecs = num_online_cpus(); - } else { - unsigned int vecs = 0, cpu; - - for_each_online_cpu(cpu) { - if (cpu == get_first_sibling(cpu)) { - cpumask_set_cpu(cpu, affinity_mask); - vecs++; - } + nodes = get_nodes_in_cpumask(cpu_online_mask, &nodemsk); - if (--max_vecs == 0) + /* + * If the number of nodes in the mask is less than or equal the + * number of vectors we just spread the vectors across the nodes. + */ + if (affv <= nodes) { + for_each_node_mask(n, nodemsk) { + cpumask_copy(masks + curvec, cpumask_of_node(n)); + if (++curvec == last_affv) break; } - *nr_vecs = vecs; + goto done; } + + /* Spread the vectors per node */ + vecs_per_node = affv / nodes; + /* Account for rounding errors */ + extra_vecs = affv - (nodes * vecs_per_node); + + for_each_node_mask(n, nodemsk) { + int ncpus, v, vecs_to_assign = vecs_per_node; + + /* Get the cpus on this node which are in the mask */ + cpumask_and(nmsk, cpu_online_mask, cpumask_of_node(n)); + + /* Calculate the number of cpus per vector */ + ncpus = cpumask_weight(nmsk); + + for (v = 0; curvec < last_affv && v < vecs_to_assign; + curvec++, v++) { + cpus_per_vec = ncpus / vecs_to_assign; + + /* Account for extra vectors to compensate rounding errors */ + if (extra_vecs) { + cpus_per_vec++; + if (!--extra_vecs) + vecs_per_node++; + } + irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec); + } + + if (curvec >= last_affv) + break; + } + +done: + put_online_cpus(); + + /* Fill out vectors at the end that don't need affinity */ + for (; curvec < nvecs; curvec++) + cpumask_copy(masks + curvec, irq_default_affinity); +out: + free_cpumask_var(nmsk); + return masks; +} + +/** + * irq_calc_affinity_vectors - Calculate the optimal number of vectors + * @maxvec: The maximum number of vectors available + * @affd: Description of the affinity requirements + */ +int irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd) +{ + int resv = affd->pre_vectors + affd->post_vectors; + int vecs = maxvec - resv; + int cpus; + + /* Stabilize the cpumasks */ + get_online_cpus(); + cpus = cpumask_weight(cpu_online_mask); put_online_cpus(); - return affinity_mask; + return min(cpus, vecs) + resv; } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 26ba5654d9d5..be3c34e4f2ac 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -76,7 +76,6 @@ int irq_set_irq_type(unsigned int irq, unsigned int type) if (!desc) return -EINVAL; - type &= IRQ_TYPE_SENSE_MASK; ret = __irq_set_trigger(desc, type); irq_put_desc_busunlock(desc, flags); return ret; @@ -756,7 +755,6 @@ void handle_percpu_devid_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); struct irqaction *action = desc->action; - void *dev_id = raw_cpu_ptr(action->percpu_dev_id); unsigned int irq = irq_desc_get_irq(desc); irqreturn_t res; @@ -765,15 +763,26 @@ void handle_percpu_devid_irq(struct irq_desc *desc) if (chip->irq_ack) chip->irq_ack(&desc->irq_data); - trace_irq_handler_entry(irq, action); - res = action->handler(irq, dev_id); - trace_irq_handler_exit(irq, action, res); + if (likely(action)) { + trace_irq_handler_entry(irq, action); + res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); + trace_irq_handler_exit(irq, action, res); + } else { + unsigned int cpu = smp_processor_id(); + bool enabled = cpumask_test_cpu(cpu, desc->percpu_enabled); + + if (enabled) + irq_percpu_disable(desc, cpu); + + pr_err_once("Spurious%s percpu IRQ%u on CPU%u\n", + enabled ? " and unmasked" : "", irq, cpu); + } if (chip->irq_eoi) chip->irq_eoi(&desc->irq_data); } -void +static void __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, int is_chained, const char *name) { diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index abd286afbd27..ee32870079c9 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -260,9 +260,9 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) } /** - * irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain + * __irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain * @d: irq domain for which to allocate chips - * @irqs_per_chip: Number of interrupts each chip handles + * @irqs_per_chip: Number of interrupts each chip handles (max 32) * @num_ct: Number of irq_chip_type instances associated with this * @name: Name of the irq chip * @handler: Default flow handler associated with these chips @@ -270,11 +270,11 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) * @set: IRQ_* bits to set in the mapping function * @gcflags: Generic chip specific setup flags */ -int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, - int num_ct, const char *name, - irq_flow_handler_t handler, - unsigned int clr, unsigned int set, - enum irq_gc_flags gcflags) +int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, + int num_ct, const char *name, + irq_flow_handler_t handler, + unsigned int clr, unsigned int set, + enum irq_gc_flags gcflags) { struct irq_domain_chip_generic *dgc; struct irq_chip_generic *gc; @@ -326,7 +326,21 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, d->name = name; return 0; } -EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); +EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips); + +static struct irq_chip_generic * +__irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) +{ + struct irq_domain_chip_generic *dgc = d->gc; + int idx; + + if (!dgc) + return ERR_PTR(-ENODEV); + idx = hw_irq / dgc->irqs_per_chip; + if (idx >= dgc->num_chips) + return ERR_PTR(-EINVAL); + return dgc->gc[idx]; +} /** * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq @@ -336,15 +350,9 @@ EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); struct irq_chip_generic * irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) { - struct irq_domain_chip_generic *dgc = d->gc; - int idx; + struct irq_chip_generic *gc = __irq_get_domain_generic_chip(d, hw_irq); - if (!dgc) - return NULL; - idx = hw_irq / dgc->irqs_per_chip; - if (idx >= dgc->num_chips) - return NULL; - return dgc->gc[idx]; + return !IS_ERR(gc) ? gc : NULL; } EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); @@ -368,13 +376,9 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, unsigned long flags; int idx; - if (!d->gc) - return -ENODEV; - - idx = hw_irq / dgc->irqs_per_chip; - if (idx >= dgc->num_chips) - return -EINVAL; - gc = dgc->gc[idx]; + gc = __irq_get_domain_generic_chip(d, hw_irq); + if (IS_ERR(gc)) + return PTR_ERR(gc); idx = hw_irq % dgc->irqs_per_chip; @@ -409,10 +413,30 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); return 0; } -EXPORT_SYMBOL_GPL(irq_map_generic_chip); + +static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq) +{ + struct irq_data *data = irq_domain_get_irq_data(d, virq); + struct irq_domain_chip_generic *dgc = d->gc; + unsigned int hw_irq = data->hwirq; + struct irq_chip_generic *gc; + int irq_idx; + + gc = irq_get_domain_generic_chip(d, hw_irq); + if (!gc) + return; + + irq_idx = hw_irq % dgc->irqs_per_chip; + + clear_bit(irq_idx, &gc->installed); + irq_domain_set_info(d, virq, hw_irq, &no_irq_chip, NULL, NULL, NULL, + NULL); + +} struct irq_domain_ops irq_generic_chip_ops = { .map = irq_map_generic_chip, + .unmap = irq_unmap_generic_chip, .xlate = irq_domain_xlate_onetwocell, }; EXPORT_SYMBOL_GPL(irq_generic_chip_ops); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index a623b44f2d4b..00bb0aeea1d0 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -15,6 +15,7 @@ #include <linux/radix-tree.h> #include <linux/bitmap.h> #include <linux/irqdomain.h> +#include <linux/sysfs.h> #include "internals.h" @@ -123,6 +124,181 @@ static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS); #ifdef CONFIG_SPARSE_IRQ +static void irq_kobj_release(struct kobject *kobj); + +#ifdef CONFIG_SYSFS +static struct kobject *irq_kobj_base; + +#define IRQ_ATTR_RO(_name) \ +static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +static ssize_t per_cpu_count_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + int cpu, irq = desc->irq_data.irq; + ssize_t ret = 0; + char *p = ""; + + for_each_possible_cpu(cpu) { + unsigned int c = kstat_irqs_cpu(irq, cpu); + + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%u", p, c); + p = ","; + } + + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + return ret; +} +IRQ_ATTR_RO(per_cpu_count); + +static ssize_t chip_name_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + ssize_t ret = 0; + + raw_spin_lock_irq(&desc->lock); + if (desc->irq_data.chip && desc->irq_data.chip->name) { + ret = scnprintf(buf, PAGE_SIZE, "%s\n", + desc->irq_data.chip->name); + } + raw_spin_unlock_irq(&desc->lock); + + return ret; +} +IRQ_ATTR_RO(chip_name); + +static ssize_t hwirq_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + ssize_t ret = 0; + + raw_spin_lock_irq(&desc->lock); + if (desc->irq_data.domain) + ret = sprintf(buf, "%d\n", (int)desc->irq_data.hwirq); + raw_spin_unlock_irq(&desc->lock); + + return ret; +} +IRQ_ATTR_RO(hwirq); + +static ssize_t type_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + ssize_t ret = 0; + + raw_spin_lock_irq(&desc->lock); + ret = sprintf(buf, "%s\n", + irqd_is_level_type(&desc->irq_data) ? "level" : "edge"); + raw_spin_unlock_irq(&desc->lock); + + return ret; + +} +IRQ_ATTR_RO(type); + +static ssize_t name_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + ssize_t ret = 0; + + raw_spin_lock_irq(&desc->lock); + if (desc->name) + ret = scnprintf(buf, PAGE_SIZE, "%s\n", desc->name); + raw_spin_unlock_irq(&desc->lock); + + return ret; +} +IRQ_ATTR_RO(name); + +static ssize_t actions_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + struct irqaction *action; + ssize_t ret = 0; + char *p = ""; + + raw_spin_lock_irq(&desc->lock); + for (action = desc->action; action != NULL; action = action->next) { + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", + p, action->name); + p = ","; + } + raw_spin_unlock_irq(&desc->lock); + + if (ret) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + + return ret; +} +IRQ_ATTR_RO(actions); + +static struct attribute *irq_attrs[] = { + &per_cpu_count_attr.attr, + &chip_name_attr.attr, + &hwirq_attr.attr, + &type_attr.attr, + &name_attr.attr, + &actions_attr.attr, + NULL +}; + +static struct kobj_type irq_kobj_type = { + .release = irq_kobj_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_attrs = irq_attrs, +}; + +static void irq_sysfs_add(int irq, struct irq_desc *desc) +{ + if (irq_kobj_base) { + /* + * Continue even in case of failure as this is nothing + * crucial. + */ + if (kobject_add(&desc->kobj, irq_kobj_base, "%d", irq)) + pr_warn("Failed to add kobject for irq %d\n", irq); + } +} + +static int __init irq_sysfs_init(void) +{ + struct irq_desc *desc; + int irq; + + /* Prevent concurrent irq alloc/free */ + irq_lock_sparse(); + + irq_kobj_base = kobject_create_and_add("irq", kernel_kobj); + if (!irq_kobj_base) { + irq_unlock_sparse(); + return -ENOMEM; + } + + /* Add the already allocated interrupts */ + for_each_irq_desc(irq, desc) + irq_sysfs_add(irq, desc); + irq_unlock_sparse(); + + return 0; +} +postcore_initcall(irq_sysfs_init); + +#else /* !CONFIG_SYSFS */ + +static struct kobj_type irq_kobj_type = { + .release = irq_kobj_release, +}; + +static void irq_sysfs_add(int irq, struct irq_desc *desc) {} + +#endif /* CONFIG_SYSFS */ + static RADIX_TREE(irq_desc_tree, GFP_KERNEL); static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) @@ -187,6 +363,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, desc_set_defaults(irq, desc, node, affinity, owner); irqd_set(&desc->irq_data, flags); + kobject_init(&desc->kobj, &irq_kobj_type); return desc; @@ -197,15 +374,22 @@ err_desc: return NULL; } -static void delayed_free_desc(struct rcu_head *rhp) +static void irq_kobj_release(struct kobject *kobj) { - struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu); + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); free_masks(desc); free_percpu(desc->kstat_irqs); kfree(desc); } +static void delayed_free_desc(struct rcu_head *rhp) +{ + struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu); + + kobject_put(&desc->kobj); +} + static void free_desc(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -217,8 +401,12 @@ static void free_desc(unsigned int irq) * kstat_irq_usr(). Once we deleted the descriptor from the * sparse tree we can free it. Access in proc will fail to * lookup the descriptor. + * + * The sysfs entry must be serialized against a concurrent + * irq_sysfs_init() as well. */ mutex_lock(&sparse_irq_lock); + kobject_del(&desc->kobj); delete_irq_desc(irq); mutex_unlock(&sparse_irq_lock); @@ -236,31 +424,31 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, const struct cpumask *mask = NULL; struct irq_desc *desc; unsigned int flags; - int i, cpu = -1; + int i; - if (affinity && cpumask_empty(affinity)) - return -EINVAL; + /* Validate affinity mask(s) */ + if (affinity) { + for (i = 0, mask = affinity; i < cnt; i++, mask++) { + if (cpumask_empty(mask)) + return -EINVAL; + } + } flags = affinity ? IRQD_AFFINITY_MANAGED : 0; + mask = NULL; for (i = 0; i < cnt; i++) { if (affinity) { - cpu = cpumask_next(cpu, affinity); - if (cpu >= nr_cpu_ids) - cpu = cpumask_first(affinity); - node = cpu_to_node(cpu); - - /* - * For single allocations we use the caller provided - * mask otherwise we use the mask of the target cpu - */ - mask = cnt == 1 ? affinity : cpumask_of(cpu); + node = cpu_to_node(cpumask_first(affinity)); + mask = affinity; + affinity++; } desc = alloc_desc(start + i, node, flags, mask, owner); if (!desc) goto err; mutex_lock(&sparse_irq_lock); irq_insert_desc(start + i, desc); + irq_sysfs_add(start + i, desc); mutex_unlock(&sparse_irq_lock); } return start; @@ -481,9 +669,9 @@ EXPORT_SYMBOL_GPL(irq_free_descs); * @cnt: Number of consecutive irqs to allocate. * @node: Preferred node on which the irq descriptor should be allocated * @owner: Owning module (can be NULL) - * @affinity: Optional pointer to an affinity mask which hints where the - * irq descriptors should be allocated and which default - * affinities to use + * @affinity: Optional pointer to an affinity mask array of size @cnt which + * hints where the irq descriptors should be allocated and which + * default affinities to use * * Returns the first irq number or error code */ diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 4752b43662e0..8c0a0ae43521 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -80,7 +80,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); /** * __irq_domain_add() - Allocate a new irq_domain data structure - * @of_node: optional device-tree node of the interrupt controller + * @fwnode: firmware node for the interrupt controller * @size: Size of linear map; 0 for radix mapping only * @hwirq_max: Maximum number of interrupts supported by controller * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no @@ -96,10 +96,8 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, const struct irq_domain_ops *ops, void *host_data) { + struct device_node *of_node = to_of_node(fwnode); struct irq_domain *domain; - struct device_node *of_node; - - of_node = to_of_node(fwnode); domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), GFP_KERNEL, of_node_to_nid(of_node)); @@ -868,7 +866,10 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d, if (WARN_ON(intsize < 1)) return -EINVAL; *out_hwirq = intspec[0]; - *out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE; + if (intsize > 1) + *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; + else + *out_type = IRQ_TYPE_NONE; return 0; } EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9530fcd27704..6b669593e7eb 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -669,8 +669,6 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) return 0; } - flags &= IRQ_TYPE_SENSE_MASK; - if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { if (!irqd_irq_masked(&desc->irq_data)) mask_irq(desc); @@ -678,7 +676,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) unmask = 1; } - /* caller masked out all except trigger mode flags */ + /* Mask all flags except trigger mode */ + flags &= IRQ_TYPE_SENSE_MASK; ret = chip->irq_set_type(&desc->irq_data, flags); switch (ret) { @@ -722,6 +721,7 @@ int irq_set_parent(int irq, int parent_irq) irq_put_desc_unlock(desc, flags); return 0; } +EXPORT_SYMBOL_GPL(irq_set_parent); #endif /* @@ -1341,12 +1341,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } else if (new->flags & IRQF_TRIGGER_MASK) { unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; - unsigned int omsk = irq_settings_get_trigger_mask(desc); + unsigned int omsk = irqd_get_trigger_type(&desc->irq_data); if (nmsk != omsk) /* hope the handler works with current trigger mode */ pr_warn("irq %d uses trigger mode %u; requested %u\n", - irq, nmsk, omsk); + irq, omsk, nmsk); } *old_ptr = new; diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 19e9dfbe97fa..ee230063f033 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -14,24 +14,44 @@ #include <linux/irq.h> #include <linux/irqdomain.h> #include <linux/msi.h> +#include <linux/slab.h> -/* Temparory solution for building, will be removed later */ -#include <linux/pci.h> - -struct msi_desc *alloc_msi_entry(struct device *dev) +/** + * alloc_msi_entry - Allocate an initialize msi_entry + * @dev: Pointer to the device for which this is allocated + * @nvec: The number of vectors used in this entry + * @affinity: Optional pointer to an affinity mask array size of @nvec + * + * If @affinity is not NULL then a an affinity array[@nvec] is allocated + * and the affinity masks from @affinity are copied. + */ +struct msi_desc * +alloc_msi_entry(struct device *dev, int nvec, const struct cpumask *affinity) { - struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL); + struct msi_desc *desc; + + desc = kzalloc(sizeof(*desc), GFP_KERNEL); if (!desc) return NULL; INIT_LIST_HEAD(&desc->list); desc->dev = dev; + desc->nvec_used = nvec; + if (affinity) { + desc->affinity = kmemdup(affinity, + nvec * sizeof(*desc->affinity), GFP_KERNEL); + if (!desc->affinity) { + kfree(desc); + return NULL; + } + } return desc; } void free_msi_entry(struct msi_desc *entry) { + kfree(entry->affinity); kfree(entry); } diff --git a/kernel/kcov.c b/kernel/kcov.c index 8d44b3fea9d0..3cbb0c879705 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -7,6 +7,7 @@ #include <linux/fs.h> #include <linux/mm.h> #include <linux/printk.h> +#include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/vmalloc.h> @@ -53,8 +54,15 @@ void notrace __sanitizer_cov_trace_pc(void) /* * We are interested in code coverage as a function of a syscall inputs, * so we ignore code executed in interrupts. + * The checks for whether we are in an interrupt are open-coded, because + * 1. We can't use in_interrupt() here, since it also returns true + * when we are inside local_bh_disable() section. + * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()), + * since that leads to slower generated code (three separate tests, + * one for each of the flags). */ - if (!t || in_interrupt()) + if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET + | NMI_MASK))) return; mode = READ_ONCE(t->kcov_mode); if (mode == KCOV_MODE_TRACE) { diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d10ab6b9b5e0..d63095472ea9 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -49,7 +49,7 @@ #include <linux/cpu.h> #include <linux/jump_label.h> -#include <asm-generic/sections.h> +#include <asm/sections.h> #include <asm/cacheflush.h> #include <asm/errno.h> #include <asm/uaccess.h> diff --git a/kernel/kthread.c b/kernel/kthread.c index 9ff173dca1ae..2318fba86277 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -53,20 +53,29 @@ enum KTHREAD_BITS { KTHREAD_IS_PARKED, }; -#define __to_kthread(vfork) \ - container_of(vfork, struct kthread, exited) +static inline void set_kthread_struct(void *kthread) +{ + /* + * We abuse ->set_child_tid to avoid the new member and because it + * can't be wrongly copied by copy_process(). We also rely on fact + * that the caller can't exec, so PF_KTHREAD can't be cleared. + */ + current->set_child_tid = (__force void __user *)kthread; +} static inline struct kthread *to_kthread(struct task_struct *k) { - return __to_kthread(k->vfork_done); + WARN_ON(!(k->flags & PF_KTHREAD)); + return (__force void *)k->set_child_tid; } -static struct kthread *to_live_kthread(struct task_struct *k) +void free_kthread_struct(struct task_struct *k) { - struct completion *vfork = ACCESS_ONCE(k->vfork_done); - if (likely(vfork)) - return __to_kthread(vfork); - return NULL; + /* + * Can be NULL if this kthread was created by kernel_thread() + * or if kmalloc() in kthread() failed. + */ + kfree(to_kthread(k)); } /** @@ -138,7 +147,7 @@ void *kthread_data(struct task_struct *task) } /** - * probe_kthread_data - speculative version of kthread_data() + * kthread_probe_data - speculative version of kthread_data() * @task: possible kthread task in question * * @task could be a kthread task. Return the data value specified when it @@ -146,7 +155,7 @@ void *kthread_data(struct task_struct *task) * inaccessible for any reason, %NULL is returned. This function requires * that @task itself is safe to dereference. */ -void *probe_kthread_data(struct task_struct *task) +void *kthread_probe_data(struct task_struct *task) { struct kthread *kthread = to_kthread(task); void *data = NULL; @@ -181,14 +190,11 @@ static int kthread(void *_create) int (*threadfn)(void *data) = create->threadfn; void *data = create->data; struct completion *done; - struct kthread self; + struct kthread *self; int ret; - self.flags = 0; - self.data = data; - init_completion(&self.exited); - init_completion(&self.parked); - current->vfork_done = &self.exited; + self = kmalloc(sizeof(*self), GFP_KERNEL); + set_kthread_struct(self); /* If user was SIGKILLed, I release the structure. */ done = xchg(&create->done, NULL); @@ -196,6 +202,19 @@ static int kthread(void *_create) kfree(create); do_exit(-EINTR); } + + if (!self) { + create->result = ERR_PTR(-ENOMEM); + complete(done); + do_exit(-ENOMEM); + } + + self->flags = 0; + self->data = data; + init_completion(&self->exited); + init_completion(&self->parked); + current->vfork_done = &self->exited; + /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; @@ -203,12 +222,10 @@ static int kthread(void *_create) schedule(); ret = -EINTR; - - if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) { - __kthread_parkme(&self); + if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { + __kthread_parkme(self); ret = threadfn(data); } - /* we can't just return, we must preserve "self" on stack */ do_exit(ret); } @@ -244,33 +261,11 @@ static void create_kthread(struct kthread_create_info *create) } } -/** - * kthread_create_on_node - create a kthread. - * @threadfn: the function to run until signal_pending(current). - * @data: data ptr for @threadfn. - * @node: task and thread structures for the thread are allocated on this node - * @namefmt: printf-style name for the thread. - * - * Description: This helper function creates and names a kernel - * thread. The thread will be stopped: use wake_up_process() to start - * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and - * is affine to all CPUs. - * - * If thread is going to be bound on a particular cpu, give its node - * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. - * When woken, the thread will run @threadfn() with @data as its - * argument. @threadfn() can either call do_exit() directly if it is a - * standalone thread for which no one will call kthread_stop(), or - * return when 'kthread_should_stop()' is true (which means - * kthread_stop() has been called). The return value should be zero - * or a negative error number; it will be passed to kthread_stop(). - * - * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). - */ -struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), - void *data, int node, - const char namefmt[], - ...) +static __printf(4, 0) +struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), + void *data, int node, + const char namefmt[], + va_list args) { DECLARE_COMPLETION_ONSTACK(done); struct task_struct *task; @@ -311,11 +306,8 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), task = create->result; if (!IS_ERR(task)) { static const struct sched_param param = { .sched_priority = 0 }; - va_list args; - va_start(args, namefmt); vsnprintf(task->comm, sizeof(task->comm), namefmt, args); - va_end(args); /* * root may have changed our (kthreadd's) priority or CPU mask. * The kernel thread should not inherit these properties. @@ -326,6 +318,44 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), kfree(create); return task; } + +/** + * kthread_create_on_node - create a kthread. + * @threadfn: the function to run until signal_pending(current). + * @data: data ptr for @threadfn. + * @node: task and thread structures for the thread are allocated on this node + * @namefmt: printf-style name for the thread. + * + * Description: This helper function creates and names a kernel + * thread. The thread will be stopped: use wake_up_process() to start + * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and + * is affine to all CPUs. + * + * If thread is going to be bound on a particular cpu, give its node + * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. + * When woken, the thread will run @threadfn() with @data as its + * argument. @threadfn() can either call do_exit() directly if it is a + * standalone thread for which no one will call kthread_stop(), or + * return when 'kthread_should_stop()' is true (which means + * kthread_stop() has been called). The return value should be zero + * or a negative error number; it will be passed to kthread_stop(). + * + * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). + */ +struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), + void *data, int node, + const char namefmt[], + ...) +{ + struct task_struct *task; + va_list args; + + va_start(args, namefmt); + task = __kthread_create_on_node(threadfn, data, node, namefmt, args); + va_end(args); + + return task; +} EXPORT_SYMBOL(kthread_create_on_node); static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state) @@ -390,15 +420,25 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), cpu); if (IS_ERR(p)) return p; + kthread_bind(p, cpu); + /* CPU hotplug need to bind once again when unparking the thread. */ set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); to_kthread(p)->cpu = cpu; - /* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */ - kthread_park(p); return p; } -static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) +/** + * kthread_unpark - unpark a thread created by kthread_create(). + * @k: thread created by kthread_create(). + * + * Sets kthread_should_park() for @k to return false, wakes it, and + * waits for it to return. If the thread is marked percpu then its + * bound to the cpu again. + */ +void kthread_unpark(struct task_struct *k) { + struct kthread *kthread = to_kthread(k); + clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); /* * We clear the IS_PARKED bit here as we don't wait @@ -407,27 +447,15 @@ static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) * which might be about to be cleared. */ if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { + /* + * Newly created kthread was parked when the CPU was offline. + * The binding was lost and we need to set it again. + */ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) __kthread_bind(k, kthread->cpu, TASK_PARKED); wake_up_state(k, TASK_PARKED); } } - -/** - * kthread_unpark - unpark a thread created by kthread_create(). - * @k: thread created by kthread_create(). - * - * Sets kthread_should_park() for @k to return false, wakes it, and - * waits for it to return. If the thread is marked percpu then its - * bound to the cpu again. - */ -void kthread_unpark(struct task_struct *k) -{ - struct kthread *kthread = to_live_kthread(k); - - if (kthread) - __kthread_unpark(k, kthread); -} EXPORT_SYMBOL_GPL(kthread_unpark); /** @@ -444,20 +472,20 @@ EXPORT_SYMBOL_GPL(kthread_unpark); */ int kthread_park(struct task_struct *k) { - struct kthread *kthread = to_live_kthread(k); - int ret = -ENOSYS; + struct kthread *kthread = to_kthread(k); + + if (WARN_ON(k->flags & PF_EXITING)) + return -ENOSYS; - if (kthread) { - if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) { - set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); - if (k != current) { - wake_up_process(k); - wait_for_completion(&kthread->parked); - } + if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) { + set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + if (k != current) { + wake_up_process(k); + wait_for_completion(&kthread->parked); } - ret = 0; } - return ret; + + return 0; } EXPORT_SYMBOL_GPL(kthread_park); @@ -484,13 +512,11 @@ int kthread_stop(struct task_struct *k) trace_sched_kthread_stop(k); get_task_struct(k); - kthread = to_live_kthread(k); - if (kthread) { - set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); - __kthread_unpark(k, kthread); - wake_up_process(k); - wait_for_completion(&kthread->exited); - } + kthread = to_kthread(k); + set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); + kthread_unpark(k); + wake_up_process(k); + wait_for_completion(&kthread->exited); ret = k->exit_code; put_task_struct(k); @@ -536,39 +562,48 @@ int kthreadd(void *unused) return 0; } -void __init_kthread_worker(struct kthread_worker *worker, +void __kthread_init_worker(struct kthread_worker *worker, const char *name, struct lock_class_key *key) { + memset(worker, 0, sizeof(struct kthread_worker)); spin_lock_init(&worker->lock); lockdep_set_class_and_name(&worker->lock, key, name); INIT_LIST_HEAD(&worker->work_list); - worker->task = NULL; + INIT_LIST_HEAD(&worker->delayed_work_list); } -EXPORT_SYMBOL_GPL(__init_kthread_worker); +EXPORT_SYMBOL_GPL(__kthread_init_worker); /** * kthread_worker_fn - kthread function to process kthread_worker * @worker_ptr: pointer to initialized kthread_worker * - * This function can be used as @threadfn to kthread_create() or - * kthread_run() with @worker_ptr argument pointing to an initialized - * kthread_worker. The started kthread will process work_list until - * the it is stopped with kthread_stop(). A kthread can also call - * this function directly after extra initialization. + * This function implements the main cycle of kthread worker. It processes + * work_list until it is stopped with kthread_stop(). It sleeps when the queue + * is empty. * - * Different kthreads can be used for the same kthread_worker as long - * as there's only one kthread attached to it at any given time. A - * kthread_worker without an attached kthread simply collects queued - * kthread_works. + * The works are not allowed to keep any locks, disable preemption or interrupts + * when they finish. There is defined a safe point for freezing when one work + * finishes and before a new one is started. + * + * Also the works must not be handled by more than one worker at the same time, + * see also kthread_queue_work(). */ int kthread_worker_fn(void *worker_ptr) { struct kthread_worker *worker = worker_ptr; struct kthread_work *work; - WARN_ON(worker->task); + /* + * FIXME: Update the check and remove the assignment when all kthread + * worker users are created using kthread_create_worker*() functions. + */ + WARN_ON(worker->task && worker->task != current); worker->task = current; + + if (worker->flags & KTW_FREEZABLE) + set_freezable(); + repeat: set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ @@ -601,12 +636,124 @@ repeat: } EXPORT_SYMBOL_GPL(kthread_worker_fn); -/* insert @work before @pos in @worker */ -static void insert_kthread_work(struct kthread_worker *worker, - struct kthread_work *work, - struct list_head *pos) +static __printf(3, 0) struct kthread_worker * +__kthread_create_worker(int cpu, unsigned int flags, + const char namefmt[], va_list args) +{ + struct kthread_worker *worker; + struct task_struct *task; + int node = -1; + + worker = kzalloc(sizeof(*worker), GFP_KERNEL); + if (!worker) + return ERR_PTR(-ENOMEM); + + kthread_init_worker(worker); + + if (cpu >= 0) + node = cpu_to_node(cpu); + + task = __kthread_create_on_node(kthread_worker_fn, worker, + node, namefmt, args); + if (IS_ERR(task)) + goto fail_task; + + if (cpu >= 0) + kthread_bind(task, cpu); + + worker->flags = flags; + worker->task = task; + wake_up_process(task); + return worker; + +fail_task: + kfree(worker); + return ERR_CAST(task); +} + +/** + * kthread_create_worker - create a kthread worker + * @flags: flags modifying the default behavior of the worker + * @namefmt: printf-style name for the kthread worker (task). + * + * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) + * when the needed structures could not get allocated, and ERR_PTR(-EINTR) + * when the worker was SIGKILLed. + */ +struct kthread_worker * +kthread_create_worker(unsigned int flags, const char namefmt[], ...) +{ + struct kthread_worker *worker; + va_list args; + + va_start(args, namefmt); + worker = __kthread_create_worker(-1, flags, namefmt, args); + va_end(args); + + return worker; +} +EXPORT_SYMBOL(kthread_create_worker); + +/** + * kthread_create_worker_on_cpu - create a kthread worker and bind it + * it to a given CPU and the associated NUMA node. + * @cpu: CPU number + * @flags: flags modifying the default behavior of the worker + * @namefmt: printf-style name for the kthread worker (task). + * + * Use a valid CPU number if you want to bind the kthread worker + * to the given CPU and the associated NUMA node. + * + * A good practice is to add the cpu number also into the worker name. + * For example, use kthread_create_worker_on_cpu(cpu, "helper/%d", cpu). + * + * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) + * when the needed structures could not get allocated, and ERR_PTR(-EINTR) + * when the worker was SIGKILLed. + */ +struct kthread_worker * +kthread_create_worker_on_cpu(int cpu, unsigned int flags, + const char namefmt[], ...) +{ + struct kthread_worker *worker; + va_list args; + + va_start(args, namefmt); + worker = __kthread_create_worker(cpu, flags, namefmt, args); + va_end(args); + + return worker; +} +EXPORT_SYMBOL(kthread_create_worker_on_cpu); + +/* + * Returns true when the work could not be queued at the moment. + * It happens when it is already pending in a worker list + * or when it is being cancelled. + */ +static inline bool queuing_blocked(struct kthread_worker *worker, + struct kthread_work *work) +{ + lockdep_assert_held(&worker->lock); + + return !list_empty(&work->node) || work->canceling; +} + +static void kthread_insert_work_sanity_check(struct kthread_worker *worker, + struct kthread_work *work) { lockdep_assert_held(&worker->lock); + WARN_ON_ONCE(!list_empty(&work->node)); + /* Do not use a work with >1 worker, see kthread_queue_work() */ + WARN_ON_ONCE(work->worker && work->worker != worker); +} + +/* insert @work before @pos in @worker */ +static void kthread_insert_work(struct kthread_worker *worker, + struct kthread_work *work, + struct list_head *pos) +{ + kthread_insert_work_sanity_check(worker, work); list_add_tail(&work->node, pos); work->worker = worker; @@ -615,29 +762,133 @@ static void insert_kthread_work(struct kthread_worker *worker, } /** - * queue_kthread_work - queue a kthread_work + * kthread_queue_work - queue a kthread_work * @worker: target kthread_worker * @work: kthread_work to queue * * Queue @work to work processor @task for async execution. @task * must have been created with kthread_worker_create(). Returns %true * if @work was successfully queued, %false if it was already pending. + * + * Reinitialize the work if it needs to be used by another worker. + * For example, when the worker was stopped and started again. */ -bool queue_kthread_work(struct kthread_worker *worker, +bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work) { bool ret = false; unsigned long flags; spin_lock_irqsave(&worker->lock, flags); - if (list_empty(&work->node)) { - insert_kthread_work(worker, work, &worker->work_list); + if (!queuing_blocked(worker, work)) { + kthread_insert_work(worker, work, &worker->work_list); ret = true; } spin_unlock_irqrestore(&worker->lock, flags); return ret; } -EXPORT_SYMBOL_GPL(queue_kthread_work); +EXPORT_SYMBOL_GPL(kthread_queue_work); + +/** + * kthread_delayed_work_timer_fn - callback that queues the associated kthread + * delayed work when the timer expires. + * @__data: pointer to the data associated with the timer + * + * The format of the function is defined by struct timer_list. + * It should have been called from irqsafe timer with irq already off. + */ +void kthread_delayed_work_timer_fn(unsigned long __data) +{ + struct kthread_delayed_work *dwork = + (struct kthread_delayed_work *)__data; + struct kthread_work *work = &dwork->work; + struct kthread_worker *worker = work->worker; + + /* + * This might happen when a pending work is reinitialized. + * It means that it is used a wrong way. + */ + if (WARN_ON_ONCE(!worker)) + return; + + spin_lock(&worker->lock); + /* Work must not be used with >1 worker, see kthread_queue_work(). */ + WARN_ON_ONCE(work->worker != worker); + + /* Move the work from worker->delayed_work_list. */ + WARN_ON_ONCE(list_empty(&work->node)); + list_del_init(&work->node); + kthread_insert_work(worker, work, &worker->work_list); + + spin_unlock(&worker->lock); +} +EXPORT_SYMBOL(kthread_delayed_work_timer_fn); + +void __kthread_queue_delayed_work(struct kthread_worker *worker, + struct kthread_delayed_work *dwork, + unsigned long delay) +{ + struct timer_list *timer = &dwork->timer; + struct kthread_work *work = &dwork->work; + + WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn || + timer->data != (unsigned long)dwork); + + /* + * If @delay is 0, queue @dwork->work immediately. This is for + * both optimization and correctness. The earliest @timer can + * expire is on the closest next tick and delayed_work users depend + * on that there's no such delay when @delay is 0. + */ + if (!delay) { + kthread_insert_work(worker, work, &worker->work_list); + return; + } + + /* Be paranoid and try to detect possible races already now. */ + kthread_insert_work_sanity_check(worker, work); + + list_add(&work->node, &worker->delayed_work_list); + work->worker = worker; + timer_stats_timer_set_start_info(&dwork->timer); + timer->expires = jiffies + delay; + add_timer(timer); +} + +/** + * kthread_queue_delayed_work - queue the associated kthread work + * after a delay. + * @worker: target kthread_worker + * @dwork: kthread_delayed_work to queue + * @delay: number of jiffies to wait before queuing + * + * If the work has not been pending it starts a timer that will queue + * the work after the given @delay. If @delay is zero, it queues the + * work immediately. + * + * Return: %false if the @work has already been pending. It means that + * either the timer was running or the work was queued. It returns %true + * otherwise. + */ +bool kthread_queue_delayed_work(struct kthread_worker *worker, + struct kthread_delayed_work *dwork, + unsigned long delay) +{ + struct kthread_work *work = &dwork->work; + unsigned long flags; + bool ret = false; + + spin_lock_irqsave(&worker->lock, flags); + + if (!queuing_blocked(worker, work)) { + __kthread_queue_delayed_work(worker, dwork, delay); + ret = true; + } + + spin_unlock_irqrestore(&worker->lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(kthread_queue_delayed_work); struct kthread_flush_work { struct kthread_work work; @@ -652,12 +903,12 @@ static void kthread_flush_work_fn(struct kthread_work *work) } /** - * flush_kthread_work - flush a kthread_work + * kthread_flush_work - flush a kthread_work * @work: work to flush * * If @work is queued or executing, wait for it to finish execution. */ -void flush_kthread_work(struct kthread_work *work) +void kthread_flush_work(struct kthread_work *work) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), @@ -666,21 +917,19 @@ void flush_kthread_work(struct kthread_work *work) struct kthread_worker *worker; bool noop = false; -retry: worker = work->worker; if (!worker) return; spin_lock_irq(&worker->lock); - if (work->worker != worker) { - spin_unlock_irq(&worker->lock); - goto retry; - } + /* Work must not be used with >1 worker, see kthread_queue_work(). */ + WARN_ON_ONCE(work->worker != worker); if (!list_empty(&work->node)) - insert_kthread_work(worker, &fwork.work, work->node.next); + kthread_insert_work(worker, &fwork.work, work->node.next); else if (worker->current_work == work) - insert_kthread_work(worker, &fwork.work, worker->work_list.next); + kthread_insert_work(worker, &fwork.work, + worker->work_list.next); else noop = true; @@ -689,23 +938,214 @@ retry: if (!noop) wait_for_completion(&fwork.done); } -EXPORT_SYMBOL_GPL(flush_kthread_work); +EXPORT_SYMBOL_GPL(kthread_flush_work); + +/* + * This function removes the work from the worker queue. Also it makes sure + * that it won't get queued later via the delayed work's timer. + * + * The work might still be in use when this function finishes. See the + * current_work proceed by the worker. + * + * Return: %true if @work was pending and successfully canceled, + * %false if @work was not pending + */ +static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork, + unsigned long *flags) +{ + /* Try to cancel the timer if exists. */ + if (is_dwork) { + struct kthread_delayed_work *dwork = + container_of(work, struct kthread_delayed_work, work); + struct kthread_worker *worker = work->worker; + + /* + * del_timer_sync() must be called to make sure that the timer + * callback is not running. The lock must be temporary released + * to avoid a deadlock with the callback. In the meantime, + * any queuing is blocked by setting the canceling counter. + */ + work->canceling++; + spin_unlock_irqrestore(&worker->lock, *flags); + del_timer_sync(&dwork->timer); + spin_lock_irqsave(&worker->lock, *flags); + work->canceling--; + } + + /* + * Try to remove the work from a worker list. It might either + * be from worker->work_list or from worker->delayed_work_list. + */ + if (!list_empty(&work->node)) { + list_del_init(&work->node); + return true; + } + + return false; +} /** - * flush_kthread_worker - flush all current works on a kthread_worker + * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work + * @worker: kthread worker to use + * @dwork: kthread delayed work to queue + * @delay: number of jiffies to wait before queuing + * + * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise, + * modify @dwork's timer so that it expires after @delay. If @delay is zero, + * @work is guaranteed to be queued immediately. + * + * Return: %true if @dwork was pending and its timer was modified, + * %false otherwise. + * + * A special case is when the work is being canceled in parallel. + * It might be caused either by the real kthread_cancel_delayed_work_sync() + * or yet another kthread_mod_delayed_work() call. We let the other command + * win and return %false here. The caller is supposed to synchronize these + * operations a reasonable way. + * + * This function is safe to call from any context including IRQ handler. + * See __kthread_cancel_work() and kthread_delayed_work_timer_fn() + * for details. + */ +bool kthread_mod_delayed_work(struct kthread_worker *worker, + struct kthread_delayed_work *dwork, + unsigned long delay) +{ + struct kthread_work *work = &dwork->work; + unsigned long flags; + int ret = false; + + spin_lock_irqsave(&worker->lock, flags); + + /* Do not bother with canceling when never queued. */ + if (!work->worker) + goto fast_queue; + + /* Work must not be used with >1 worker, see kthread_queue_work() */ + WARN_ON_ONCE(work->worker != worker); + + /* Do not fight with another command that is canceling this work. */ + if (work->canceling) + goto out; + + ret = __kthread_cancel_work(work, true, &flags); +fast_queue: + __kthread_queue_delayed_work(worker, dwork, delay); +out: + spin_unlock_irqrestore(&worker->lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); + +static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) +{ + struct kthread_worker *worker = work->worker; + unsigned long flags; + int ret = false; + + if (!worker) + goto out; + + spin_lock_irqsave(&worker->lock, flags); + /* Work must not be used with >1 worker, see kthread_queue_work(). */ + WARN_ON_ONCE(work->worker != worker); + + ret = __kthread_cancel_work(work, is_dwork, &flags); + + if (worker->current_work != work) + goto out_fast; + + /* + * The work is in progress and we need to wait with the lock released. + * In the meantime, block any queuing by setting the canceling counter. + */ + work->canceling++; + spin_unlock_irqrestore(&worker->lock, flags); + kthread_flush_work(work); + spin_lock_irqsave(&worker->lock, flags); + work->canceling--; + +out_fast: + spin_unlock_irqrestore(&worker->lock, flags); +out: + return ret; +} + +/** + * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish + * @work: the kthread work to cancel + * + * Cancel @work and wait for its execution to finish. This function + * can be used even if the work re-queues itself. On return from this + * function, @work is guaranteed to be not pending or executing on any CPU. + * + * kthread_cancel_work_sync(&delayed_work->work) must not be used for + * delayed_work's. Use kthread_cancel_delayed_work_sync() instead. + * + * The caller must ensure that the worker on which @work was last + * queued can't be destroyed before this function returns. + * + * Return: %true if @work was pending, %false otherwise. + */ +bool kthread_cancel_work_sync(struct kthread_work *work) +{ + return __kthread_cancel_work_sync(work, false); +} +EXPORT_SYMBOL_GPL(kthread_cancel_work_sync); + +/** + * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and + * wait for it to finish. + * @dwork: the kthread delayed work to cancel + * + * This is kthread_cancel_work_sync() for delayed works. + * + * Return: %true if @dwork was pending, %false otherwise. + */ +bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork) +{ + return __kthread_cancel_work_sync(&dwork->work, true); +} +EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync); + +/** + * kthread_flush_worker - flush all current works on a kthread_worker * @worker: worker to flush * * Wait until all currently executing or pending works on @worker are * finished. */ -void flush_kthread_worker(struct kthread_worker *worker) +void kthread_flush_worker(struct kthread_worker *worker) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; - queue_kthread_work(worker, &fwork.work); + kthread_queue_work(worker, &fwork.work); wait_for_completion(&fwork.done); } -EXPORT_SYMBOL_GPL(flush_kthread_worker); +EXPORT_SYMBOL_GPL(kthread_flush_worker); + +/** + * kthread_destroy_worker - destroy a kthread worker + * @worker: worker to be destroyed + * + * Flush and destroy @worker. The simple flush is enough because the kthread + * worker API is used only in trivial scenarios. There are no multi-step state + * machines needed. + */ +void kthread_destroy_worker(struct kthread_worker *worker) +{ + struct task_struct *task; + + task = worker->task; + if (WARN_ON(!task)) + return; + + kthread_flush_worker(worker); + kthread_stop(task); + WARN_ON(!list_empty(&worker->work_list)); + kfree(worker); +} +EXPORT_SYMBOL(kthread_destroy_worker); diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 8bbe50704621..af4643873e71 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -274,7 +274,6 @@ static int klp_write_object_relocations(struct module *pmod, objname = klp_is_module(obj) ? obj->name : "vmlinux"; - module_disable_ro(pmod); /* For each klp relocation section */ for (i = 1; i < pmod->klp_info->hdr.e_shnum; i++) { sec = pmod->klp_info->sechdrs + i; @@ -309,7 +308,6 @@ static int klp_write_object_relocations(struct module *pmod, break; } - module_enable_ro(pmod, true); return ret; } @@ -547,9 +545,6 @@ static int __klp_enable_patch(struct klp_patch *patch) list_prev_entry(patch, list)->state == KLP_DISABLED) return -EBUSY; - pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n"); - add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK); - pr_notice("enabling patch '%s'\n", patch->mod->name); klp_for_each_object(patch, obj) { @@ -763,6 +758,12 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) func->old_sympos ? func->old_sympos : 1); } +/* Arches may override this to finish any remaining arch-specific tasks */ +void __weak arch_klp_init_object_loaded(struct klp_patch *patch, + struct klp_object *obj) +{ +} + /* parts of the initialization that is done only when the object is loaded */ static int klp_init_object_loaded(struct klp_patch *patch, struct klp_object *obj) @@ -770,9 +771,15 @@ static int klp_init_object_loaded(struct klp_patch *patch, struct klp_func *func; int ret; + module_disable_ro(patch->mod); ret = klp_write_object_relocations(patch->mod, obj); - if (ret) + if (ret) { + module_enable_ro(patch->mod, true); return ret; + } + + arch_klp_init_object_loaded(patch, obj); + module_enable_ro(patch->mod, true); klp_for_each_func(obj, func) { ret = klp_find_object_symbol(obj->name, func->old_name, diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 31322a4275cd..6f88e352cd4f 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -18,7 +18,6 @@ obj-$(CONFIG_LOCKDEP) += lockdep_proc.o endif obj-$(CONFIG_SMP) += spinlock.o obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o -obj-$(CONFIG_SMP) += lglock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c deleted file mode 100644 index 951cfcd10b4a..000000000000 --- a/kernel/locking/lglock.c +++ /dev/null @@ -1,111 +0,0 @@ -/* See include/linux/lglock.h for description */ -#include <linux/module.h> -#include <linux/lglock.h> -#include <linux/cpu.h> -#include <linux/string.h> - -/* - * Note there is no uninit, so lglocks cannot be defined in - * modules (but it's fine to use them from there) - * Could be added though, just undo lg_lock_init - */ - -void lg_lock_init(struct lglock *lg, char *name) -{ - LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0); -} -EXPORT_SYMBOL(lg_lock_init); - -void lg_local_lock(struct lglock *lg) -{ - arch_spinlock_t *lock; - - preempt_disable(); - lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); - lock = this_cpu_ptr(lg->lock); - arch_spin_lock(lock); -} -EXPORT_SYMBOL(lg_local_lock); - -void lg_local_unlock(struct lglock *lg) -{ - arch_spinlock_t *lock; - - lock_release(&lg->lock_dep_map, 1, _RET_IP_); - lock = this_cpu_ptr(lg->lock); - arch_spin_unlock(lock); - preempt_enable(); -} -EXPORT_SYMBOL(lg_local_unlock); - -void lg_local_lock_cpu(struct lglock *lg, int cpu) -{ - arch_spinlock_t *lock; - - preempt_disable(); - lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); - lock = per_cpu_ptr(lg->lock, cpu); - arch_spin_lock(lock); -} -EXPORT_SYMBOL(lg_local_lock_cpu); - -void lg_local_unlock_cpu(struct lglock *lg, int cpu) -{ - arch_spinlock_t *lock; - - lock_release(&lg->lock_dep_map, 1, _RET_IP_); - lock = per_cpu_ptr(lg->lock, cpu); - arch_spin_unlock(lock); - preempt_enable(); -} -EXPORT_SYMBOL(lg_local_unlock_cpu); - -void lg_double_lock(struct lglock *lg, int cpu1, int cpu2) -{ - BUG_ON(cpu1 == cpu2); - - /* lock in cpu order, just like lg_global_lock */ - if (cpu2 < cpu1) - swap(cpu1, cpu2); - - preempt_disable(); - lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); - arch_spin_lock(per_cpu_ptr(lg->lock, cpu1)); - arch_spin_lock(per_cpu_ptr(lg->lock, cpu2)); -} - -void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2) -{ - lock_release(&lg->lock_dep_map, 1, _RET_IP_); - arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1)); - arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2)); - preempt_enable(); -} - -void lg_global_lock(struct lglock *lg) -{ - int i; - - preempt_disable(); - lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); - for_each_possible_cpu(i) { - arch_spinlock_t *lock; - lock = per_cpu_ptr(lg->lock, i); - arch_spin_lock(lock); - } -} -EXPORT_SYMBOL(lg_global_lock); - -void lg_global_unlock(struct lglock *lg) -{ - int i; - - lock_release(&lg->lock_dep_map, 1, _RET_IP_); - for_each_possible_cpu(i) { - arch_spinlock_t *lock; - lock = per_cpu_ptr(lg->lock, i); - arch_spin_unlock(lock); - } - preempt_enable(); -} -EXPORT_SYMBOL(lg_global_unlock); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 589d763a49b3..7bd265f6b098 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -506,13 +506,13 @@ static void __print_lock_name(struct lock_class *class) name = class->name; if (!name) { name = __get_key_name(class->key, str); - printk("%s", name); + printk(KERN_CONT "%s", name); } else { - printk("%s", name); + printk(KERN_CONT "%s", name); if (class->name_version > 1) - printk("#%d", class->name_version); + printk(KERN_CONT "#%d", class->name_version); if (class->subclass) - printk("/%d", class->subclass); + printk(KERN_CONT "/%d", class->subclass); } } @@ -522,9 +522,9 @@ static void print_lock_name(struct lock_class *class) get_usage_chars(class, usage); - printk(" ("); + printk(KERN_CONT " ("); __print_lock_name(class); - printk("){%s}", usage); + printk(KERN_CONT "){%s}", usage); } static void print_lockdep_cache(struct lockdep_map *lock) @@ -536,7 +536,7 @@ static void print_lockdep_cache(struct lockdep_map *lock) if (!name) name = __get_key_name(lock->key->subkeys, str); - printk("%s", name); + printk(KERN_CONT "%s", name); } static void print_lock(struct held_lock *hlock) @@ -551,13 +551,13 @@ static void print_lock(struct held_lock *hlock) barrier(); if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) { - printk("<RELEASED>\n"); + printk(KERN_CONT "<RELEASED>\n"); return; } print_lock_name(lock_classes + class_idx - 1); - printk(", at: "); - print_ip_sym(hlock->acquire_ip); + printk(KERN_CONT ", at: [<%p>] %pS\n", + (void *)hlock->acquire_ip, (void *)hlock->acquire_ip); } static void lockdep_print_held_locks(struct task_struct *curr) @@ -792,8 +792,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) printk("\nnew class %p: %s", class->key, class->name); if (class->name_version > 1) - printk("#%d", class->name_version); - printk("\n"); + printk(KERN_CONT "#%d", class->name_version); + printk(KERN_CONT "\n"); dump_stack(); if (!graph_lock()) { @@ -840,9 +840,9 @@ static struct lock_list *alloc_list_entry(void) /* * Add a new dependency to the head of the list: */ -static int add_lock_to_list(struct lock_class *class, struct lock_class *this, - struct list_head *head, unsigned long ip, - int distance, struct stack_trace *trace) +static int add_lock_to_list(struct lock_class *this, struct list_head *head, + unsigned long ip, int distance, + struct stack_trace *trace) { struct lock_list *entry; /* @@ -1071,7 +1071,7 @@ print_circular_bug_entry(struct lock_list *target, int depth) return 0; printk("\n-> #%u", depth); print_lock_name(target->class); - printk(":\n"); + printk(KERN_CONT ":\n"); print_stack_trace(&target->trace, 6); return 0; @@ -1102,11 +1102,11 @@ print_circular_lock_scenario(struct held_lock *src, if (parent != source) { printk("Chain exists of:\n "); __print_lock_name(source); - printk(" --> "); + printk(KERN_CONT " --> "); __print_lock_name(parent); - printk(" --> "); + printk(KERN_CONT " --> "); __print_lock_name(target); - printk("\n\n"); + printk(KERN_CONT "\n\n"); } printk(" Possible unsafe locking scenario:\n\n"); @@ -1114,16 +1114,16 @@ print_circular_lock_scenario(struct held_lock *src, printk(" ---- ----\n"); printk(" lock("); __print_lock_name(target); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" lock("); __print_lock_name(parent); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" lock("); __print_lock_name(target); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" lock("); __print_lock_name(source); - printk(");\n"); + printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); } @@ -1359,22 +1359,22 @@ static void print_lock_class_header(struct lock_class *class, int depth) printk("%*s->", depth, ""); print_lock_name(class); - printk(" ops: %lu", class->ops); - printk(" {\n"); + printk(KERN_CONT " ops: %lu", class->ops); + printk(KERN_CONT " {\n"); for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { if (class->usage_mask & (1 << bit)) { int len = depth; len += printk("%*s %s", depth, "", usage_str[bit]); - len += printk(" at:\n"); + len += printk(KERN_CONT " at:\n"); print_stack_trace(class->usage_traces + bit, len); } } printk("%*s }\n", depth, ""); - printk("%*s ... key at: ",depth,""); - print_ip_sym((unsigned long)class->key); + printk("%*s ... key at: [<%p>] %pS\n", + depth, "", class->key, class->key); } /* @@ -1437,11 +1437,11 @@ print_irq_lock_scenario(struct lock_list *safe_entry, if (middle_class != unsafe_class) { printk("Chain exists of:\n "); __print_lock_name(safe_class); - printk(" --> "); + printk(KERN_CONT " --> "); __print_lock_name(middle_class); - printk(" --> "); + printk(KERN_CONT " --> "); __print_lock_name(unsafe_class); - printk("\n\n"); + printk(KERN_CONT "\n\n"); } printk(" Possible interrupt unsafe locking scenario:\n\n"); @@ -1449,18 +1449,18 @@ print_irq_lock_scenario(struct lock_list *safe_entry, printk(" ---- ----\n"); printk(" lock("); __print_lock_name(unsafe_class); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" local_irq_disable();\n"); printk(" lock("); __print_lock_name(safe_class); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" lock("); __print_lock_name(middle_class); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" <Interrupt>\n"); printk(" lock("); __print_lock_name(safe_class); - printk(");\n"); + printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); } @@ -1497,9 +1497,9 @@ print_bad_irq_dependency(struct task_struct *curr, print_lock(prev); printk("which would create a new lock dependency:\n"); print_lock_name(hlock_class(prev)); - printk(" ->"); + printk(KERN_CONT " ->"); print_lock_name(hlock_class(next)); - printk("\n"); + printk(KERN_CONT "\n"); printk("\nbut this new dependency connects a %s-irq-safe lock:\n", irqclass); @@ -1521,8 +1521,7 @@ print_bad_irq_dependency(struct task_struct *curr, lockdep_print_held_locks(curr); - printk("\nthe dependencies between %s-irq-safe lock", irqclass); - printk(" and the holding lock:\n"); + printk("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); if (!save_trace(&prev_root->trace)) return 0; print_shortest_lock_dependencies(backwards_entry, prev_root); @@ -1694,10 +1693,10 @@ print_deadlock_scenario(struct held_lock *nxt, printk(" ----\n"); printk(" lock("); __print_lock_name(prev); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" lock("); __print_lock_name(next); - printk(");\n"); + printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); printk(" May be due to missing lock nesting notation\n\n"); } @@ -1869,14 +1868,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * Ok, all validations passed, add the new lock * to the previous lock's dependency list: */ - ret = add_lock_to_list(hlock_class(prev), hlock_class(next), + ret = add_lock_to_list(hlock_class(next), &hlock_class(prev)->locks_after, next->acquire_ip, distance, &trace); if (!ret) return 0; - ret = add_lock_to_list(hlock_class(next), hlock_class(prev), + ret = add_lock_to_list(hlock_class(prev), &hlock_class(next)->locks_before, next->acquire_ip, distance, &trace); if (!ret) @@ -1891,9 +1890,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, graph_unlock(); printk("\n new dependency: "); print_lock_name(hlock_class(prev)); - printk(" => "); + printk(KERN_CONT " => "); print_lock_name(hlock_class(next)); - printk("\n"); + printk(KERN_CONT "\n"); dump_stack(); return graph_lock(); } @@ -2343,11 +2342,11 @@ print_usage_bug_scenario(struct held_lock *lock) printk(" ----\n"); printk(" lock("); __print_lock_name(class); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" <Interrupt>\n"); printk(" lock("); __print_lock_name(class); - printk(");\n"); + printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); } @@ -2522,14 +2521,18 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, void print_irqtrace_events(struct task_struct *curr) { printk("irq event stamp: %u\n", curr->irq_events); - printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event); - print_ip_sym(curr->hardirq_enable_ip); - printk("hardirqs last disabled at (%u): ", curr->hardirq_disable_event); - print_ip_sym(curr->hardirq_disable_ip); - printk("softirqs last enabled at (%u): ", curr->softirq_enable_event); - print_ip_sym(curr->softirq_enable_ip); - printk("softirqs last disabled at (%u): ", curr->softirq_disable_event); - print_ip_sym(curr->softirq_disable_ip); + printk("hardirqs last enabled at (%u): [<%p>] %pS\n", + curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip, + (void *)curr->hardirq_enable_ip); + printk("hardirqs last disabled at (%u): [<%p>] %pS\n", + curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip, + (void *)curr->hardirq_disable_ip); + printk("softirqs last enabled at (%u): [<%p>] %pS\n", + curr->softirq_enable_event, (void *)curr->softirq_enable_ip, + (void *)curr->softirq_enable_ip); + printk("softirqs last disabled at (%u): [<%p>] %pS\n", + curr->softirq_disable_event, (void *)curr->softirq_disable_ip, + (void *)curr->softirq_disable_ip); } static int HARDIRQ_verbose(struct lock_class *class) @@ -3235,8 +3238,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (very_verbose(class)) { printk("\nacquire class [%p] %s", class->key, class->name); if (class->name_version > 1) - printk("#%d", class->name_version); - printk("\n"); + printk(KERN_CONT "#%d", class->name_version); + printk(KERN_CONT "\n"); dump_stack(); } @@ -3378,7 +3381,7 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, printk("%s/%d is trying to release lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); - printk(") at:\n"); + printk(KERN_CONT ") at:\n"); print_ip_sym(ip); printk("but there are no more locks to release!\n"); printk("\nother info that might help us debug this:\n"); @@ -3871,7 +3874,7 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, printk("%s/%d is trying to contend lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); - printk(") at:\n"); + printk(KERN_CONT ") at:\n"); print_ip_sym(ip); printk("but there are no locks held!\n"); printk("\nother info that might help us debug this:\n"); diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 51c4b24b6328..c2b88490d857 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -46,6 +46,14 @@ enum { (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) /* + * CONFIG_PROVE_LOCKING_SMALL is defined for sparc. Sparc requires .text, + * .data and .bss to fit in required 32MB limit for the kernel. With + * PROVE_LOCKING we could go over this limit and cause system boot-up problems. + * So, reduce the static allocations for lockdeps related structures so that + * everything fits in current required size limit. + */ +#ifdef CONFIG_PROVE_LOCKING_SMALL +/* * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies * we track. * @@ -54,18 +62,24 @@ enum { * table (if it's not there yet), and we check it for lock order * conflicts and deadlocks. */ +#define MAX_LOCKDEP_ENTRIES 16384UL +#define MAX_LOCKDEP_CHAINS_BITS 15 +#define MAX_STACK_TRACE_ENTRIES 262144UL +#else #define MAX_LOCKDEP_ENTRIES 32768UL #define MAX_LOCKDEP_CHAINS_BITS 16 -#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) - -#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) /* * Stack-trace: tightly packed array of stack backtrace * addresses. Protected by the hash_lock. */ #define MAX_STACK_TRACE_ENTRIES 524288UL +#endif + +#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) + +#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) extern struct list_head all_lock_classes; extern struct lock_chain lock_chains[]; diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index c835270f0c2f..6a385aabcce7 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -28,7 +28,7 @@ struct mcs_spinlock { #define arch_mcs_spin_lock_contended(l) \ do { \ while (!(smp_load_acquire(l))) \ - cpu_relax_lowlatency(); \ + cpu_relax(); \ } while (0) #endif @@ -108,7 +108,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) return; /* Wait until the next pointer is set */ while (!(next = READ_ONCE(node->next))) - cpu_relax_lowlatency(); + cpu_relax(); } /* Pass lock to next waiter. */ diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 9c951fade415..9aa713629387 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -73,21 +73,8 @@ void debug_mutex_unlock(struct mutex *lock) { if (likely(debug_locks)) { DEBUG_LOCKS_WARN_ON(lock->magic != lock); - - if (!lock->owner) - DEBUG_LOCKS_WARN_ON(!lock->owner); - else - DEBUG_LOCKS_WARN_ON(lock->owner != current); - DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); } - - /* - * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug - * mutexes so that we can do it here after we've verified state. - */ - mutex_clear_owner(lock); - atomic_set(&lock->count, 1); } void debug_mutex_init(struct mutex *lock, const char *name, diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index 57a871ae3c81..a459faa48987 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -27,16 +27,6 @@ extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); -static inline void mutex_set_owner(struct mutex *lock) -{ - WRITE_ONCE(lock->owner, current); -} - -static inline void mutex_clear_owner(struct mutex *lock) -{ - WRITE_ONCE(lock->owner, NULL); -} - #define spin_lock_mutex(lock, flags) \ do { \ struct mutex *l = container_of(lock, struct mutex, wait_lock); \ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index a70b90db3909..9b349619f431 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -27,41 +27,176 @@ #include <linux/debug_locks.h> #include <linux/osq_lock.h> -/* - * In the DEBUG case we are using the "NULL fastpath" for mutexes, - * which forces all calls into the slowpath: - */ #ifdef CONFIG_DEBUG_MUTEXES # include "mutex-debug.h" -# include <asm-generic/mutex-null.h> -/* - * Must be 0 for the debug case so we do not do the unlock outside of the - * wait_lock region. debug_mutex_unlock() will do the actual unlock in this - * case. - */ -# undef __mutex_slowpath_needs_to_unlock -# define __mutex_slowpath_needs_to_unlock() 0 #else # include "mutex.h" -# include <asm/mutex.h> #endif void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { - atomic_set(&lock->count, 1); + atomic_long_set(&lock->owner, 0); spin_lock_init(&lock->wait_lock); INIT_LIST_HEAD(&lock->wait_list); - mutex_clear_owner(lock); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER osq_lock_init(&lock->osq); #endif debug_mutex_init(lock, name, key); } - EXPORT_SYMBOL(__mutex_init); +/* + * @owner: contains: 'struct task_struct *' to the current lock owner, + * NULL means not owned. Since task_struct pointers are aligned at + * ARCH_MIN_TASKALIGN (which is at least sizeof(void *)), we have low + * bits to store extra state. + * + * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup. + * Bit1 indicates unlock needs to hand the lock to the top-waiter + */ +#define MUTEX_FLAG_WAITERS 0x01 +#define MUTEX_FLAG_HANDOFF 0x02 + +#define MUTEX_FLAGS 0x03 + +static inline struct task_struct *__owner_task(unsigned long owner) +{ + return (struct task_struct *)(owner & ~MUTEX_FLAGS); +} + +static inline unsigned long __owner_flags(unsigned long owner) +{ + return owner & MUTEX_FLAGS; +} + +/* + * Actual trylock that will work on any unlocked state. + * + * When setting the owner field, we must preserve the low flag bits. + * + * Be careful with @handoff, only set that in a wait-loop (where you set + * HANDOFF) to avoid recursive lock attempts. + */ +static inline bool __mutex_trylock(struct mutex *lock, const bool handoff) +{ + unsigned long owner, curr = (unsigned long)current; + + owner = atomic_long_read(&lock->owner); + for (;;) { /* must loop, can race against a flag */ + unsigned long old, flags = __owner_flags(owner); + + if (__owner_task(owner)) { + if (handoff && unlikely(__owner_task(owner) == current)) { + /* + * Provide ACQUIRE semantics for the lock-handoff. + * + * We cannot easily use load-acquire here, since + * the actual load is a failed cmpxchg, which + * doesn't imply any barriers. + * + * Also, this is a fairly unlikely scenario, and + * this contains the cost. + */ + smp_mb(); /* ACQUIRE */ + return true; + } + + return false; + } + + /* + * We set the HANDOFF bit, we must make sure it doesn't live + * past the point where we acquire it. This would be possible + * if we (accidentally) set the bit on an unlocked mutex. + */ + if (handoff) + flags &= ~MUTEX_FLAG_HANDOFF; + + old = atomic_long_cmpxchg_acquire(&lock->owner, owner, curr | flags); + if (old == owner) + return true; + + owner = old; + } +} + +#ifndef CONFIG_DEBUG_LOCK_ALLOC +/* + * Lockdep annotations are contained to the slow paths for simplicity. + * There is nothing that would stop spreading the lockdep annotations outwards + * except more code. + */ + +/* + * Optimistic trylock that only works in the uncontended case. Make sure to + * follow with a __mutex_trylock() before failing. + */ +static __always_inline bool __mutex_trylock_fast(struct mutex *lock) +{ + unsigned long curr = (unsigned long)current; + + if (!atomic_long_cmpxchg_acquire(&lock->owner, 0UL, curr)) + return true; + + return false; +} + +static __always_inline bool __mutex_unlock_fast(struct mutex *lock) +{ + unsigned long curr = (unsigned long)current; + + if (atomic_long_cmpxchg_release(&lock->owner, curr, 0UL) == curr) + return true; + + return false; +} +#endif + +static inline void __mutex_set_flag(struct mutex *lock, unsigned long flag) +{ + atomic_long_or(flag, &lock->owner); +} + +static inline void __mutex_clear_flag(struct mutex *lock, unsigned long flag) +{ + atomic_long_andnot(flag, &lock->owner); +} + +static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_waiter *waiter) +{ + return list_first_entry(&lock->wait_list, struct mutex_waiter, list) == waiter; +} + +/* + * Give up ownership to a specific task, when @task = NULL, this is equivalent + * to a regular unlock. Clears HANDOFF, preserves WAITERS. Provides RELEASE + * semantics like a regular unlock, the __mutex_trylock() provides matching + * ACQUIRE semantics for the handoff. + */ +static void __mutex_handoff(struct mutex *lock, struct task_struct *task) +{ + unsigned long owner = atomic_long_read(&lock->owner); + + for (;;) { + unsigned long old, new; + +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); +#endif + + new = (owner & MUTEX_FLAG_WAITERS); + new |= (unsigned long)task; + + old = atomic_long_cmpxchg_release(&lock->owner, owner, new); + if (old == owner) + break; + + owner = old; + } +} + #ifndef CONFIG_DEBUG_LOCK_ALLOC /* * We split the mutex lock/unlock logic into separate fastpath and @@ -69,7 +204,7 @@ EXPORT_SYMBOL(__mutex_init); * We also put the fastpath first in the kernel image, to make sure the * branch is predicted by the CPU as default-untaken. */ -__visible void __sched __mutex_lock_slowpath(atomic_t *lock_count); +static void __sched __mutex_lock_slowpath(struct mutex *lock); /** * mutex_lock - acquire the mutex @@ -95,14 +230,10 @@ __visible void __sched __mutex_lock_slowpath(atomic_t *lock_count); void __sched mutex_lock(struct mutex *lock) { might_sleep(); - /* - * The locking fastpath is the 1->0 transition from - * 'unlocked' into 'locked' state. - */ - __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); - mutex_set_owner(lock); -} + if (!__mutex_trylock_fast(lock)) + __mutex_lock_slowpath(lock); +} EXPORT_SYMBOL(mutex_lock); #endif @@ -149,9 +280,6 @@ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, /* * After acquiring lock with fastpath or when we lost out in contested * slowpath, set ctx and wake up any waiters so they can recheck. - * - * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, - * as the fastpath and opportunistic spinning are disabled in that case. */ static __always_inline void ww_mutex_set_context_fastpath(struct ww_mutex *lock, @@ -176,7 +304,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, /* * Check if lock is contended, if not there is nobody to wake up */ - if (likely(atomic_read(&lock->base.count) == 0)) + if (likely(!(atomic_long_read(&lock->base.owner) & MUTEX_FLAG_WAITERS))) return; /* @@ -227,7 +355,7 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) bool ret = true; rcu_read_lock(); - while (lock->owner == owner) { + while (__mutex_owner(lock) == owner) { /* * Ensure we emit the owner->on_cpu, dereference _after_ * checking lock->owner still matches owner. If that fails, @@ -236,12 +364,16 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) */ barrier(); - if (!owner->on_cpu || need_resched()) { + /* + * Use vcpu_is_preempted to detect lock holder preemption issue. + */ + if (!owner->on_cpu || need_resched() || + vcpu_is_preempted(task_cpu(owner))) { ret = false; break; } - cpu_relax_lowlatency(); + cpu_relax(); } rcu_read_unlock(); @@ -260,27 +392,25 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) return 0; rcu_read_lock(); - owner = READ_ONCE(lock->owner); + owner = __mutex_owner(lock); + + /* + * As lock holder preemption issue, we both skip spinning if task is not + * on cpu or its cpu is preempted + */ if (owner) - retval = owner->on_cpu; + retval = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); rcu_read_unlock(); + /* - * if lock->owner is not set, the mutex owner may have just acquired - * it and not set the owner yet or the mutex has been released. + * If lock->owner is not set, the mutex has been released. Return true + * such that we'll trylock in the spin path, which is a faster option + * than the blocking slow path. */ return retval; } /* - * Atomically try to take the lock when it is available - */ -static inline bool mutex_try_to_acquire(struct mutex *lock) -{ - return !mutex_is_locked(lock) && - (atomic_cmpxchg_acquire(&lock->count, 1, 0) == 1); -} - -/* * Optimistic spinning. * * We try to spin for acquisition when we find that the lock owner @@ -288,13 +418,6 @@ static inline bool mutex_try_to_acquire(struct mutex *lock) * need to reschedule. The rationale is that if the lock owner is * running, it is likely to release the lock soon. * - * Since this needs the lock owner, and this mutex implementation - * doesn't track the owner atomically in the lock field, we need to - * track it non-atomically. - * - * We can't do this for DEBUG_MUTEXES because that relies on wait_lock - * to serialize everything. - * * The mutex spinners are queued up using MCS lock so that only one * spinner can compete for the mutex. However, if mutex spinning isn't * going to happen, there is no point in going through the lock/unlock @@ -302,24 +425,39 @@ static inline bool mutex_try_to_acquire(struct mutex *lock) * * Returns true when the lock was taken, otherwise false, indicating * that we need to jump to the slowpath and sleep. + * + * The waiter flag is set to true if the spinner is a waiter in the wait + * queue. The waiter-spinner will spin on the lock directly and concurrently + * with the spinner at the head of the OSQ, if present, until the owner is + * changed to itself. */ static bool mutex_optimistic_spin(struct mutex *lock, - struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) + struct ww_acquire_ctx *ww_ctx, + const bool use_ww_ctx, const bool waiter) { struct task_struct *task = current; - if (!mutex_can_spin_on_owner(lock)) - goto done; + if (!waiter) { + /* + * The purpose of the mutex_can_spin_on_owner() function is + * to eliminate the overhead of osq_lock() and osq_unlock() + * in case spinning isn't possible. As a waiter-spinner + * is not going to take OSQ lock anyway, there is no need + * to call mutex_can_spin_on_owner(). + */ + if (!mutex_can_spin_on_owner(lock)) + goto fail; - /* - * In order to avoid a stampede of mutex spinners trying to - * acquire the mutex all at once, the spinners need to take a - * MCS (queued) lock first before spinning on the owner field. - */ - if (!osq_lock(&lock->osq)) - goto done; + /* + * In order to avoid a stampede of mutex spinners trying to + * acquire the mutex all at once, the spinners need to take a + * MCS (queued) lock first before spinning on the owner field. + */ + if (!osq_lock(&lock->osq)) + goto fail; + } - while (true) { + for (;;) { struct task_struct *owner; if (use_ww_ctx && ww_ctx->acquired > 0) { @@ -335,40 +473,26 @@ static bool mutex_optimistic_spin(struct mutex *lock, * performed the optimistic spinning cannot be done. */ if (READ_ONCE(ww->ctx)) - break; + goto fail_unlock; } /* * If there's an owner, wait for it to either * release the lock or go to sleep. */ - owner = READ_ONCE(lock->owner); - if (owner && !mutex_spin_on_owner(lock, owner)) - break; - - /* Try to acquire the mutex if it is unlocked. */ - if (mutex_try_to_acquire(lock)) { - lock_acquired(&lock->dep_map, ip); - - if (use_ww_ctx) { - struct ww_mutex *ww; - ww = container_of(lock, struct ww_mutex, base); - - ww_mutex_set_context_fastpath(ww, ww_ctx); + owner = __mutex_owner(lock); + if (owner) { + if (waiter && owner == task) { + smp_mb(); /* ACQUIRE */ + break; } - mutex_set_owner(lock); - osq_unlock(&lock->osq); - return true; + if (!mutex_spin_on_owner(lock, owner)) + goto fail_unlock; } - /* - * When there's no owner, we might have preempted between the - * owner acquiring the lock and setting the owner field. If - * we're an RT task that will live-lock because we won't let - * the owner complete. - */ - if (!owner && (need_resched() || rt_task(task))) + /* Try to acquire the mutex if it is unlocked. */ + if (__mutex_trylock(lock, waiter)) break; /* @@ -377,11 +501,20 @@ static bool mutex_optimistic_spin(struct mutex *lock, * memory barriers as we'll eventually observe the right * values at the cost of a few extra spins. */ - cpu_relax_lowlatency(); + cpu_relax(); } - osq_unlock(&lock->osq); -done: + if (!waiter) + osq_unlock(&lock->osq); + + return true; + + +fail_unlock: + if (!waiter) + osq_unlock(&lock->osq); + +fail: /* * If we fell out of the spin path because of need_resched(), * reschedule now, before we try-lock the mutex. This avoids getting @@ -400,14 +533,14 @@ done: } #else static bool mutex_optimistic_spin(struct mutex *lock, - struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) + struct ww_acquire_ctx *ww_ctx, + const bool use_ww_ctx, const bool waiter) { return false; } #endif -__visible __used noinline -void __sched __mutex_unlock_slowpath(atomic_t *lock_count); +static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip); /** * mutex_unlock - release the mutex @@ -422,21 +555,12 @@ void __sched __mutex_unlock_slowpath(atomic_t *lock_count); */ void __sched mutex_unlock(struct mutex *lock) { - /* - * The unlocking fastpath is the 0->1 transition from 'locked' - * into 'unlocked' state: - */ -#ifndef CONFIG_DEBUG_MUTEXES - /* - * When debugging is enabled we must not clear the owner before time, - * the slow path will always be taken, and that clears the owner field - * after verifying that it was indeed current. - */ - mutex_clear_owner(lock); +#ifndef CONFIG_DEBUG_LOCK_ALLOC + if (__mutex_unlock_fast(lock)) + return; #endif - __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); + __mutex_unlock_slowpath(lock, _RET_IP_); } - EXPORT_SYMBOL(mutex_unlock); /** @@ -465,15 +589,7 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock) lock->ctx = NULL; } -#ifndef CONFIG_DEBUG_MUTEXES - /* - * When debugging is enabled we must not clear the owner before time, - * the slow path will always be taken, and that clears the owner field - * after verifying that it was indeed current. - */ - mutex_clear_owner(&lock->base); -#endif - __mutex_fastpath_unlock(&lock->base.count, __mutex_unlock_slowpath); + mutex_unlock(&lock->base); } EXPORT_SYMBOL(ww_mutex_unlock); @@ -509,10 +625,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct task_struct *task = current; struct mutex_waiter waiter; unsigned long flags; + bool first = false; + struct ww_mutex *ww; int ret; if (use_ww_ctx) { - struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + ww = container_of(lock, struct ww_mutex, base); if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) return -EALREADY; } @@ -520,20 +638,21 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); - if (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { + if (__mutex_trylock(lock, false) || + mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, false)) { /* got the lock, yay! */ + lock_acquired(&lock->dep_map, ip); + if (use_ww_ctx) + ww_mutex_set_context_fastpath(ww, ww_ctx); preempt_enable(); return 0; } spin_lock_mutex(&lock->wait_lock, flags); - /* - * Once more, try to acquire the lock. Only try-lock the mutex if - * it is unlocked to reduce unnecessary xchg() operations. + * After waiting to acquire the wait_lock, try again. */ - if (!mutex_is_locked(lock) && - (atomic_xchg_acquire(&lock->count, 0) == 1)) + if (__mutex_trylock(lock, false)) goto skip_wait; debug_mutex_lock_common(lock, &waiter); @@ -543,26 +662,26 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, list_add_tail(&waiter.list, &lock->wait_list); waiter.task = task; + if (__mutex_waiter_is_first(lock, &waiter)) + __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); + lock_contended(&lock->dep_map, ip); + set_task_state(task, state); for (;;) { /* - * Lets try to take the lock again - this is needed even if - * we get here for the first time (shortly after failing to - * acquire the lock), to make sure that we get a wakeup once - * it's unlocked. Later on, if we sleep, this is the - * operation that gives us the lock. We xchg it to -1, so - * that when we release the lock, we properly wake up the - * other waiters. We only attempt the xchg if the count is - * non-negative in order to avoid unnecessary xchg operations: + * Once we hold wait_lock, we're serialized against + * mutex_unlock() handing the lock off to us, do a trylock + * before testing the error conditions to make sure we pick up + * the handoff. */ - if (atomic_read(&lock->count) >= 0 && - (atomic_xchg_acquire(&lock->count, -1) == 1)) - break; + if (__mutex_trylock(lock, first)) + goto acquired; /* - * got a signal? (This code gets eliminated in the - * TASK_UNINTERRUPTIBLE case.) + * Check for signals and wound conditions while holding + * wait_lock. This ensures the lock cancellation is ordered + * against mutex_unlock() and wake-ups do not go missing. */ if (unlikely(signal_pending_state(state, task))) { ret = -EINTR; @@ -575,36 +694,49 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto err; } - __set_task_state(task, state); - - /* didn't get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); schedule_preempt_disabled(); + + if (!first && __mutex_waiter_is_first(lock, &waiter)) { + first = true; + __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); + } + + set_task_state(task, state); + /* + * Here we order against unlock; we must either see it change + * state back to RUNNING and fall through the next schedule(), + * or we must see its unlock and acquire. + */ + if ((first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, true)) || + __mutex_trylock(lock, first)) + break; + spin_lock_mutex(&lock->wait_lock, flags); } + spin_lock_mutex(&lock->wait_lock, flags); +acquired: __set_task_state(task, TASK_RUNNING); mutex_remove_waiter(lock, &waiter, task); - /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) - atomic_set(&lock->count, 0); + __mutex_clear_flag(lock, MUTEX_FLAGS); + debug_mutex_free_waiter(&waiter); skip_wait: /* got the lock - cleanup and rejoice! */ lock_acquired(&lock->dep_map, ip); - mutex_set_owner(lock); - if (use_ww_ctx) { - struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + if (use_ww_ctx) ww_mutex_set_context_slowpath(ww, ww_ctx); - } spin_unlock_mutex(&lock->wait_lock, flags); preempt_enable(); return 0; err: + __set_task_state(task, TASK_RUNNING); mutex_remove_waiter(lock, &waiter, task); spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); @@ -631,7 +763,6 @@ _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_, NULL, 0); } - EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); int __sched @@ -650,7 +781,6 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, NULL, _RET_IP_, NULL, 0); } - EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); static inline int @@ -715,54 +845,64 @@ EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); /* * Release the lock, slowpath: */ -static inline void -__mutex_unlock_common_slowpath(struct mutex *lock, int nested) +static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip) { - unsigned long flags; - WAKE_Q(wake_q); + struct task_struct *next = NULL; + unsigned long owner, flags; + DEFINE_WAKE_Q(wake_q); + + mutex_release(&lock->dep_map, 1, ip); /* - * As a performance measurement, release the lock before doing other - * wakeup related duties to follow. This allows other tasks to acquire - * the lock sooner, while still handling cleanups in past unlock calls. - * This can be done as we do not enforce strict equivalence between the - * mutex counter and wait_list. - * + * Release the lock before (potentially) taking the spinlock such that + * other contenders can get on with things ASAP. * - * Some architectures leave the lock unlocked in the fastpath failure - * case, others need to leave it locked. In the later case we have to - * unlock it here - as the lock counter is currently 0 or negative. + * Except when HANDOFF, in that case we must not clear the owner field, + * but instead set it to the top waiter. */ - if (__mutex_slowpath_needs_to_unlock()) - atomic_set(&lock->count, 1); + owner = atomic_long_read(&lock->owner); + for (;;) { + unsigned long old; + +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); +#endif + + if (owner & MUTEX_FLAG_HANDOFF) + break; + + old = atomic_long_cmpxchg_release(&lock->owner, owner, + __owner_flags(owner)); + if (old == owner) { + if (owner & MUTEX_FLAG_WAITERS) + break; + + return; + } + + owner = old; + } spin_lock_mutex(&lock->wait_lock, flags); - mutex_release(&lock->dep_map, nested, _RET_IP_); debug_mutex_unlock(lock); - if (!list_empty(&lock->wait_list)) { /* get the first entry from the wait-list: */ struct mutex_waiter *waiter = - list_entry(lock->wait_list.next, - struct mutex_waiter, list); + list_first_entry(&lock->wait_list, + struct mutex_waiter, list); + + next = waiter->task; debug_mutex_wake_waiter(lock, waiter); - wake_q_add(&wake_q, waiter->task); + wake_q_add(&wake_q, next); } - spin_unlock_mutex(&lock->wait_lock, flags); - wake_up_q(&wake_q); -} + if (owner & MUTEX_FLAG_HANDOFF) + __mutex_handoff(lock, next); -/* - * Release the lock, slowpath: - */ -__visible void -__mutex_unlock_slowpath(atomic_t *lock_count) -{ - struct mutex *lock = container_of(lock_count, struct mutex, count); + spin_unlock_mutex(&lock->wait_lock, flags); - __mutex_unlock_common_slowpath(lock, 1); + wake_up_q(&wake_q); } #ifndef CONFIG_DEBUG_LOCK_ALLOC @@ -789,38 +929,30 @@ __mutex_lock_interruptible_slowpath(struct mutex *lock); */ int __sched mutex_lock_interruptible(struct mutex *lock) { - int ret; - might_sleep(); - ret = __mutex_fastpath_lock_retval(&lock->count); - if (likely(!ret)) { - mutex_set_owner(lock); + + if (__mutex_trylock_fast(lock)) return 0; - } else - return __mutex_lock_interruptible_slowpath(lock); + + return __mutex_lock_interruptible_slowpath(lock); } EXPORT_SYMBOL(mutex_lock_interruptible); int __sched mutex_lock_killable(struct mutex *lock) { - int ret; - might_sleep(); - ret = __mutex_fastpath_lock_retval(&lock->count); - if (likely(!ret)) { - mutex_set_owner(lock); + + if (__mutex_trylock_fast(lock)) return 0; - } else - return __mutex_lock_killable_slowpath(lock); + + return __mutex_lock_killable_slowpath(lock); } EXPORT_SYMBOL(mutex_lock_killable); -__visible void __sched -__mutex_lock_slowpath(atomic_t *lock_count) +static noinline void __sched +__mutex_lock_slowpath(struct mutex *lock) { - struct mutex *lock = container_of(lock_count, struct mutex, count); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_, NULL, 0); } @@ -856,37 +988,6 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, #endif -/* - * Spinlock based trylock, we take the spinlock and check whether we - * can get the lock: - */ -static inline int __mutex_trylock_slowpath(atomic_t *lock_count) -{ - struct mutex *lock = container_of(lock_count, struct mutex, count); - unsigned long flags; - int prev; - - /* No need to trylock if the mutex is locked. */ - if (mutex_is_locked(lock)) - return 0; - - spin_lock_mutex(&lock->wait_lock, flags); - - prev = atomic_xchg_acquire(&lock->count, -1); - if (likely(prev == 1)) { - mutex_set_owner(lock); - mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); - } - - /* Set it back to 0 if there are no waiters: */ - if (likely(list_empty(&lock->wait_list))) - atomic_set(&lock->count, 0); - - spin_unlock_mutex(&lock->wait_lock, flags); - - return prev == 1; -} - /** * mutex_trylock - try to acquire the mutex, without waiting * @lock: the mutex to be acquired @@ -903,13 +1004,12 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) */ int __sched mutex_trylock(struct mutex *lock) { - int ret; + bool locked = __mutex_trylock(lock, false); - ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath); - if (ret) - mutex_set_owner(lock); + if (locked) + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); - return ret; + return locked; } EXPORT_SYMBOL(mutex_trylock); @@ -917,36 +1017,28 @@ EXPORT_SYMBOL(mutex_trylock); int __sched __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - int ret; - might_sleep(); - ret = __mutex_fastpath_lock_retval(&lock->base.count); - - if (likely(!ret)) { + if (__mutex_trylock_fast(&lock->base)) { ww_mutex_set_context_fastpath(lock, ctx); - mutex_set_owner(&lock->base); - } else - ret = __ww_mutex_lock_slowpath(lock, ctx); - return ret; + return 0; + } + + return __ww_mutex_lock_slowpath(lock, ctx); } EXPORT_SYMBOL(__ww_mutex_lock); int __sched __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - int ret; - might_sleep(); - ret = __mutex_fastpath_lock_retval(&lock->base.count); - - if (likely(!ret)) { + if (__mutex_trylock_fast(&lock->base)) { ww_mutex_set_context_fastpath(lock, ctx); - mutex_set_owner(&lock->base); - } else - ret = __ww_mutex_lock_interruptible_slowpath(lock, ctx); - return ret; + return 0; + } + + return __ww_mutex_lock_interruptible_slowpath(lock, ctx); } EXPORT_SYMBOL(__ww_mutex_lock_interruptible); diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 6cd6b8e9efd7..4410a4af42a3 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -16,32 +16,6 @@ #define mutex_remove_waiter(lock, waiter, task) \ __list_del((waiter)->list.prev, (waiter)->list.next) -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER -/* - * The mutex owner can get read and written to locklessly. - * We should use WRITE_ONCE when writing the owner value to - * avoid store tearing, otherwise, a thread could potentially - * read a partially written and incomplete owner value. - */ -static inline void mutex_set_owner(struct mutex *lock) -{ - WRITE_ONCE(lock->owner, current); -} - -static inline void mutex_clear_owner(struct mutex *lock) -{ - WRITE_ONCE(lock->owner, NULL); -} -#else -static inline void mutex_set_owner(struct mutex *lock) -{ -} - -static inline void mutex_clear_owner(struct mutex *lock) -{ -} -#endif - #define debug_mutex_wake_waiter(lock, waiter) do { } while (0) #define debug_mutex_free_waiter(waiter) do { } while (0) #define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index 05a37857ab55..a3167941093b 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -21,6 +21,11 @@ static inline int encode_cpu(int cpu_nr) return cpu_nr + 1; } +static inline int node_cpu(struct optimistic_spin_node *node) +{ + return node->cpu - 1; +} + static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val) { int cpu_nr = encoded_cpu_val - 1; @@ -75,7 +80,7 @@ osq_wait_next(struct optimistic_spin_queue *lock, break; } - cpu_relax_lowlatency(); + cpu_relax(); } return next; @@ -118,11 +123,13 @@ bool osq_lock(struct optimistic_spin_queue *lock) while (!READ_ONCE(node->locked)) { /* * If we need to reschedule bail... so we can block. + * Use vcpu_is_preempted() to avoid waiting for a preempted + * lock holder: */ - if (need_resched()) + if (need_resched() || vcpu_is_preempted(node_cpu(node->prev))) goto unqueue; - cpu_relax_lowlatency(); + cpu_relax(); } return true; @@ -148,7 +155,7 @@ unqueue: if (smp_load_acquire(&node->locked)) return true; - cpu_relax_lowlatency(); + cpu_relax(); /* * Or we race against a concurrent unqueue()'s step-B, in which diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index bec0b647f9cc..ce182599cf2e 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -8,152 +8,186 @@ #include <linux/sched.h> #include <linux/errno.h> -int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, +int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, const char *name, struct lock_class_key *rwsem_key) { - brw->fast_read_ctr = alloc_percpu(int); - if (unlikely(!brw->fast_read_ctr)) + sem->read_count = alloc_percpu(int); + if (unlikely(!sem->read_count)) return -ENOMEM; /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ - __init_rwsem(&brw->rw_sem, name, rwsem_key); - rcu_sync_init(&brw->rss, RCU_SCHED_SYNC); - atomic_set(&brw->slow_read_ctr, 0); - init_waitqueue_head(&brw->write_waitq); + rcu_sync_init(&sem->rss, RCU_SCHED_SYNC); + __init_rwsem(&sem->rw_sem, name, rwsem_key); + init_waitqueue_head(&sem->writer); + sem->readers_block = 0; return 0; } EXPORT_SYMBOL_GPL(__percpu_init_rwsem); -void percpu_free_rwsem(struct percpu_rw_semaphore *brw) +void percpu_free_rwsem(struct percpu_rw_semaphore *sem) { /* * XXX: temporary kludge. The error path in alloc_super() * assumes that percpu_free_rwsem() is safe after kzalloc(). */ - if (!brw->fast_read_ctr) + if (!sem->read_count) return; - rcu_sync_dtor(&brw->rss); - free_percpu(brw->fast_read_ctr); - brw->fast_read_ctr = NULL; /* catch use after free bugs */ + rcu_sync_dtor(&sem->rss); + free_percpu(sem->read_count); + sem->read_count = NULL; /* catch use after free bugs */ } EXPORT_SYMBOL_GPL(percpu_free_rwsem); -/* - * This is the fast-path for down_read/up_read. If it succeeds we rely - * on the barriers provided by rcu_sync_enter/exit; see the comments in - * percpu_down_write() and percpu_up_write(). - * - * If this helper fails the callers rely on the normal rw_semaphore and - * atomic_dec_and_test(), so in this case we have the necessary barriers. - */ -static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) +int __percpu_down_read(struct percpu_rw_semaphore *sem, int try) { - bool success; + /* + * Due to having preemption disabled the decrement happens on + * the same CPU as the increment, avoiding the + * increment-on-one-CPU-and-decrement-on-another problem. + * + * If the reader misses the writer's assignment of readers_block, then + * the writer is guaranteed to see the reader's increment. + * + * Conversely, any readers that increment their sem->read_count after + * the writer looks are guaranteed to see the readers_block value, + * which in turn means that they are guaranteed to immediately + * decrement their sem->read_count, so that it doesn't matter that the + * writer missed them. + */ - preempt_disable(); - success = rcu_sync_is_idle(&brw->rss); - if (likely(success)) - __this_cpu_add(*brw->fast_read_ctr, val); - preempt_enable(); + smp_mb(); /* A matches D */ - return success; -} + /* + * If !readers_block the critical section starts here, matched by the + * release in percpu_up_write(). + */ + if (likely(!smp_load_acquire(&sem->readers_block))) + return 1; -/* - * Like the normal down_read() this is not recursive, the writer can - * come after the first percpu_down_read() and create the deadlock. - * - * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep, - * percpu_up_read() does rwsem_release(). This pairs with the usage - * of ->rw_sem in percpu_down/up_write(). - */ -void percpu_down_read(struct percpu_rw_semaphore *brw) -{ - might_sleep(); - rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); + /* + * Per the above comment; we still have preemption disabled and + * will thus decrement on the same CPU as we incremented. + */ + __percpu_up_read(sem); - if (likely(update_fast_ctr(brw, +1))) - return; + if (try) + return 0; - /* Avoid rwsem_acquire_read() and rwsem_release() */ - __down_read(&brw->rw_sem); - atomic_inc(&brw->slow_read_ctr); - __up_read(&brw->rw_sem); -} -EXPORT_SYMBOL_GPL(percpu_down_read); + /* + * We either call schedule() in the wait, or we'll fall through + * and reschedule on the preempt_enable() in percpu_down_read(). + */ + preempt_enable_no_resched(); -int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) -{ - if (unlikely(!update_fast_ctr(brw, +1))) { - if (!__down_read_trylock(&brw->rw_sem)) - return 0; - atomic_inc(&brw->slow_read_ctr); - __up_read(&brw->rw_sem); - } - - rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_); + /* + * Avoid lockdep for the down/up_read() we already have them. + */ + __down_read(&sem->rw_sem); + this_cpu_inc(*sem->read_count); + __up_read(&sem->rw_sem); + + preempt_disable(); return 1; } +EXPORT_SYMBOL_GPL(__percpu_down_read); -void percpu_up_read(struct percpu_rw_semaphore *brw) +void __percpu_up_read(struct percpu_rw_semaphore *sem) { - rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); - - if (likely(update_fast_ctr(brw, -1))) - return; + smp_mb(); /* B matches C */ + /* + * In other words, if they see our decrement (presumably to aggregate + * zero, as that is the only time it matters) they will also see our + * critical section. + */ + __this_cpu_dec(*sem->read_count); - /* false-positive is possible but harmless */ - if (atomic_dec_and_test(&brw->slow_read_ctr)) - wake_up_all(&brw->write_waitq); + /* Prod writer to recheck readers_active */ + wake_up(&sem->writer); } -EXPORT_SYMBOL_GPL(percpu_up_read); +EXPORT_SYMBOL_GPL(__percpu_up_read); + +#define per_cpu_sum(var) \ +({ \ + typeof(var) __sum = 0; \ + int cpu; \ + compiletime_assert_atomic_type(__sum); \ + for_each_possible_cpu(cpu) \ + __sum += per_cpu(var, cpu); \ + __sum; \ +}) -static int clear_fast_ctr(struct percpu_rw_semaphore *brw) +/* + * Return true if the modular sum of the sem->read_count per-CPU variable is + * zero. If this sum is zero, then it is stable due to the fact that if any + * newly arriving readers increment a given counter, they will immediately + * decrement that same counter. + */ +static bool readers_active_check(struct percpu_rw_semaphore *sem) { - unsigned int sum = 0; - int cpu; + if (per_cpu_sum(*sem->read_count) != 0) + return false; + + /* + * If we observed the decrement; ensure we see the entire critical + * section. + */ - for_each_possible_cpu(cpu) { - sum += per_cpu(*brw->fast_read_ctr, cpu); - per_cpu(*brw->fast_read_ctr, cpu) = 0; - } + smp_mb(); /* C matches B */ - return sum; + return true; } -void percpu_down_write(struct percpu_rw_semaphore *brw) +void percpu_down_write(struct percpu_rw_semaphore *sem) { + /* Notify readers to take the slow path. */ + rcu_sync_enter(&sem->rss); + + down_write(&sem->rw_sem); + /* - * Make rcu_sync_is_idle() == F and thus disable the fast-path in - * percpu_down_read() and percpu_up_read(), and wait for gp pass. - * - * The latter synchronises us with the preceding readers which used - * the fast-past, so we can not miss the result of __this_cpu_add() - * or anything else inside their criticial sections. + * Notify new readers to block; up until now, and thus throughout the + * longish rcu_sync_enter() above, new readers could still come in. */ - rcu_sync_enter(&brw->rss); + WRITE_ONCE(sem->readers_block, 1); - /* exclude other writers, and block the new readers completely */ - down_write(&brw->rw_sem); + smp_mb(); /* D matches A */ - /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */ - atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr); + /* + * If they don't see our writer of readers_block, then we are + * guaranteed to see their sem->read_count increment, and therefore + * will wait for them. + */ - /* wait for all readers to complete their percpu_up_read() */ - wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); + /* Wait for all now active readers to complete. */ + wait_event(sem->writer, readers_active_check(sem)); } EXPORT_SYMBOL_GPL(percpu_down_write); -void percpu_up_write(struct percpu_rw_semaphore *brw) +void percpu_up_write(struct percpu_rw_semaphore *sem) { - /* release the lock, but the readers can't use the fast-path */ - up_write(&brw->rw_sem); /* - * Enable the fast-path in percpu_down_read() and percpu_up_read() - * but only after another gp pass; this adds the necessary barrier - * to ensure the reader can't miss the changes done by us. + * Signal the writer is done, no fast path yet. + * + * One reason that we cannot just immediately flip to readers_fast is + * that new readers might fail to see the results of this writer's + * critical section. + * + * Therefore we force it through the slow path which guarantees an + * acquire and thereby guarantees the critical section's consistency. + */ + smp_store_release(&sem->readers_block, 0); + + /* + * Release the write lock, this will allow readers back in the game. + */ + up_write(&sem->rw_sem); + + /* + * Once this completes (at least one RCU-sched grace period hence) the + * reader fast path will be available again. Safe to use outside the + * exclusive write lock because its counting. */ - rcu_sync_exit(&brw->rss); + rcu_sync_exit(&sem->rss); } EXPORT_SYMBOL_GPL(percpu_up_write); diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 19248ddf37ce..cc3ed0ccdfa2 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -54,7 +54,7 @@ static __always_inline void rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) { while ((cnts & _QW_WMASK) == _QW_LOCKED) { - cpu_relax_lowlatency(); + cpu_relax(); cnts = atomic_read_acquire(&lock->cnts); } } @@ -130,7 +130,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock) (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0)) break; - cpu_relax_lowlatency(); + cpu_relax(); } /* When no more readers, set the locked flag */ @@ -141,7 +141,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock) _QW_LOCKED) == _QW_WAITING)) break; - cpu_relax_lowlatency(); + cpu_relax(); } unlock: arch_spin_unlock(&lock->wait_lock); diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 8a99abf58080..e3b5520005db 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -70,11 +70,14 @@ struct pv_node { static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) { struct __qspinlock *l = (void *)lock; - int ret = !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && - (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0); - qstat_inc(qstat_pv_lock_stealing, ret); - return ret; + if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && + (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) { + qstat_inc(qstat_pv_lock_stealing, true); + return true; + } + + return false; } /* @@ -257,7 +260,6 @@ static struct pv_node *pv_unhash(struct qspinlock *lock) static inline bool pv_wait_early(struct pv_node *prev, int loop) { - if ((loop & PV_PREV_CHECK_MASK) != 0) return false; @@ -286,12 +288,10 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) { struct pv_node *pn = (struct pv_node *)node; struct pv_node *pp = (struct pv_node *)prev; - int waitcnt = 0; int loop; bool wait_early; - /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */ - for (;; waitcnt++) { + for (;;) { for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) { if (READ_ONCE(node->locked)) return; @@ -315,7 +315,6 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) if (!READ_ONCE(node->locked)) { qstat_inc(qstat_pv_wait_node, true); - qstat_inc(qstat_pv_wait_again, waitcnt); qstat_inc(qstat_pv_wait_early, wait_early); pv_wait(&pn->state, vcpu_halted); } @@ -456,12 +455,9 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) pv_wait(&l->locked, _Q_SLOW_VAL); /* - * The unlocker should have freed the lock before kicking the - * CPU. So if the lock is still not free, it is a spurious - * wakeup or another vCPU has stolen the lock. The current - * vCPU should spin again. + * Because of lock stealing, the queue head vCPU may not be + * able to acquire the lock before it has to wait again. */ - qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked)); } /* @@ -544,7 +540,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) * unhash. Otherwise it would be possible to have multiple @lock * entries, which would be BAD. */ - locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); + locked = cmpxchg_release(&l->locked, _Q_LOCKED_VAL, 0); if (likely(locked == _Q_LOCKED_VAL)) return; diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index b9d031516254..eb0a599fcf58 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -24,8 +24,8 @@ * pv_latency_wake - average latency (ns) from vCPU kick to wakeup * pv_lock_slowpath - # of locking operations via the slowpath * pv_lock_stealing - # of lock stealing operations - * pv_spurious_wakeup - # of spurious wakeups - * pv_wait_again - # of vCPU wait's that happened after a vCPU kick + * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs + * pv_wait_again - # of wait's after a queue head vCPU kick * pv_wait_early - # of early vCPU wait's * pv_wait_head - # of vCPU wait's at the queue head * pv_wait_node - # of vCPU wait's at a non-head queue node diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 1ec0f48962b3..2f443ed2320a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -65,8 +65,72 @@ static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) static void fixup_rt_mutex_waiters(struct rt_mutex *lock) { - if (!rt_mutex_has_waiters(lock)) - clear_rt_mutex_waiters(lock); + unsigned long owner, *p = (unsigned long *) &lock->owner; + + if (rt_mutex_has_waiters(lock)) + return; + + /* + * The rbtree has no waiters enqueued, now make sure that the + * lock->owner still has the waiters bit set, otherwise the + * following can happen: + * + * CPU 0 CPU 1 CPU2 + * l->owner=T1 + * rt_mutex_lock(l) + * lock(l->lock) + * l->owner = T1 | HAS_WAITERS; + * enqueue(T2) + * boost() + * unlock(l->lock) + * block() + * + * rt_mutex_lock(l) + * lock(l->lock) + * l->owner = T1 | HAS_WAITERS; + * enqueue(T3) + * boost() + * unlock(l->lock) + * block() + * signal(->T2) signal(->T3) + * lock(l->lock) + * dequeue(T2) + * deboost() + * unlock(l->lock) + * lock(l->lock) + * dequeue(T3) + * ==> wait list is empty + * deboost() + * unlock(l->lock) + * lock(l->lock) + * fixup_rt_mutex_waiters() + * if (wait_list_empty(l) { + * l->owner = owner + * owner = l->owner & ~HAS_WAITERS; + * ==> l->owner = T1 + * } + * lock(l->lock) + * rt_mutex_unlock(l) fixup_rt_mutex_waiters() + * if (wait_list_empty(l) { + * owner = l->owner & ~HAS_WAITERS; + * cmpxchg(l->owner, T1, NULL) + * ===> Success (l->owner = NULL) + * + * l->owner = owner + * ==> l->owner = T1 + * } + * + * With the check for the waiter bit in place T3 on CPU2 will not + * overwrite. All tasks fiddling with the waiters bit are + * serialized by l->lock, so nothing else can modify the waiters + * bit. If the bit is set then nothing can change l->owner either + * so the simple RMW is safe. The cmpxchg() will simply fail if it + * happens in the middle of the RMW because the waiters bit is + * still set. + */ + owner = READ_ONCE(*p); + if (owner & RT_MUTEX_HAS_WAITERS) + WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS); } /* @@ -1382,7 +1446,7 @@ rt_mutex_fastunlock(struct rt_mutex *lock, bool (*slowfn)(struct rt_mutex *lock, struct wake_q_head *wqh)) { - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { rt_mutex_deadlock_account_unlock(current); @@ -1555,11 +1619,15 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init); * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a * proxy owner * - * @lock: the rt_mutex to be locked + * @lock: the rt_mutex to be locked * @proxy_owner:the task to set as owner * * No locking. Caller has to do serializing itself - * Special API call for PI-futex support + * + * Special API call for PI-futex support. This initializes the rtmutex and + * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not + * possible at this point because the pi_state which contains the rtmutex + * is not yet visible to other tasks. */ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner) @@ -1573,10 +1641,14 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, /** * rt_mutex_proxy_unlock - release a lock on behalf of owner * - * @lock: the rt_mutex to be locked + * @lock: the rt_mutex to be locked * * No locking. Caller has to do serializing itself - * Special API call for PI-futex support + * + * Special API call for PI-futex support. This merrily cleans up the rtmutex + * (debugging) state. Concurrent operations on this rt_mutex are not + * possible because it belongs to the pi_state which is about to be freed + * and it is not longer visible to other tasks. */ void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner) diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 4f5f83c7d2d3..990134617b4c 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -71,12 +71,12 @@ task_top_pi_waiter(struct task_struct *p) * lock->owner state tracking: */ #define RT_MUTEX_HAS_WAITERS 1UL -#define RT_MUTEX_OWNER_MASKALL 1UL static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) { - return (struct task_struct *) - ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); + unsigned long owner = (unsigned long) READ_ONCE(lock->owner); + + return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); } /* diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 447e08de1fab..631506004f9e 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -121,16 +121,19 @@ enum rwsem_wake_type { * - woken process blocks are discarded from the list after having task zeroed * - writers are only marked woken if downgrading is false */ -static struct rw_semaphore * -__rwsem_mark_wake(struct rw_semaphore *sem, - enum rwsem_wake_type wake_type, struct wake_q_head *wake_q) +static void __rwsem_mark_wake(struct rw_semaphore *sem, + enum rwsem_wake_type wake_type, + struct wake_q_head *wake_q) { - struct rwsem_waiter *waiter; - struct task_struct *tsk; - struct list_head *next; - long oldcount, woken, loop, adjustment; + struct rwsem_waiter *waiter, *tmp; + long oldcount, woken = 0, adjustment = 0; + + /* + * Take a peek at the queue head waiter such that we can determine + * the wakeup(s) to perform. + */ + waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list); - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); if (waiter->type == RWSEM_WAITING_FOR_WRITE) { if (wake_type == RWSEM_WAKE_ANY) { /* @@ -142,19 +145,19 @@ __rwsem_mark_wake(struct rw_semaphore *sem, */ wake_q_add(wake_q, waiter->task); } - goto out; + + return; } - /* Writers might steal the lock before we grant it to the next reader. + /* + * Writers might steal the lock before we grant it to the next reader. * We prefer to do the first reader grant before counting readers * so we can bail out early if a writer stole the lock. */ - adjustment = 0; if (wake_type != RWSEM_WAKE_READ_OWNED) { adjustment = RWSEM_ACTIVE_READ_BIAS; try_reader_grant: oldcount = atomic_long_fetch_add(adjustment, &sem->count); - if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { /* * If the count is still less than RWSEM_WAITING_BIAS @@ -164,7 +167,8 @@ __rwsem_mark_wake(struct rw_semaphore *sem, */ if (atomic_long_add_return(-adjustment, &sem->count) < RWSEM_WAITING_BIAS) - goto out; + return; + /* Last active locker left. Retry waking readers. */ goto try_reader_grant; } @@ -176,38 +180,23 @@ __rwsem_mark_wake(struct rw_semaphore *sem, rwsem_set_reader_owned(sem); } - /* Grant an infinite number of read locks to the readers at the front - * of the queue. Note we increment the 'active part' of the count by - * the number of readers before waking any processes up. + /* + * Grant an infinite number of read locks to the readers at the front + * of the queue. We know that woken will be at least 1 as we accounted + * for above. Note we increment the 'active part' of the count by the + * number of readers before waking any processes up. */ - woken = 0; - do { - woken++; + list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { + struct task_struct *tsk; - if (waiter->list.next == &sem->wait_list) + if (waiter->type == RWSEM_WAITING_FOR_WRITE) break; - waiter = list_entry(waiter->list.next, - struct rwsem_waiter, list); - - } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - - adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; - if (waiter->type != RWSEM_WAITING_FOR_WRITE) - /* hit end of list above */ - adjustment -= RWSEM_WAITING_BIAS; - - if (adjustment) - atomic_long_add(adjustment, &sem->count); - - next = sem->wait_list.next; - loop = woken; - do { - waiter = list_entry(next, struct rwsem_waiter, list); - next = waiter->list.next; + woken++; tsk = waiter->task; wake_q_add(wake_q, tsk); + list_del(&waiter->list); /* * Ensure that the last operation is setting the reader * waiter to nil such that rwsem_down_read_failed() cannot @@ -215,13 +204,16 @@ __rwsem_mark_wake(struct rw_semaphore *sem, * to the task to wakeup. */ smp_store_release(&waiter->task, NULL); - } while (--loop); + } - sem->wait_list.next = next; - next->prev = &sem->wait_list; + adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; + if (list_empty(&sem->wait_list)) { + /* hit end of list above */ + adjustment -= RWSEM_WAITING_BIAS; + } - out: - return sem; + if (adjustment) + atomic_long_add(adjustment, &sem->count); } /* @@ -233,9 +225,8 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); - /* set up my own style of waitqueue */ waiter.task = tsk; waiter.type = RWSEM_WAITING_FOR_READ; @@ -247,7 +238,8 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* we're now waiting on the lock, but no longer actively locking */ count = atomic_long_add_return(adjustment, &sem->count); - /* If there are no active locks, wake the front queued process(es). + /* + * If there are no active locks, wake the front queued process(es). * * If there are no writers and we are first in the queue, * wake our own waiter to join the existing active readers ! @@ -255,7 +247,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) if (count == RWSEM_WAITING_BIAS || (count > RWSEM_WAITING_BIAS && adjustment != -RWSEM_ACTIVE_READ_BIAS)) - sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); wake_up_q(&wake_q); @@ -344,7 +336,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) goto done; } - ret = owner->on_cpu; + /* + * As lock holder preemption issue, we both skip spinning if task is not + * on cpu or its cpu is preempted + */ + ret = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); done: rcu_read_unlock(); return ret; @@ -370,13 +366,17 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) */ barrier(); - /* abort spinning when need_resched or owner is not running */ - if (!owner->on_cpu || need_resched()) { + /* + * abort spinning when need_resched or owner is not running or + * owner's cpu is preempted. + */ + if (!owner->on_cpu || need_resched() || + vcpu_is_preempted(task_cpu(owner))) { rcu_read_unlock(); return false; } - cpu_relax_lowlatency(); + cpu_relax(); } rcu_read_unlock(); out: @@ -431,7 +431,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) * memory barriers as we'll eventually observe the right * values at the cost of a few extra spins. */ - cpu_relax_lowlatency(); + cpu_relax(); } osq_unlock(&sem->osq); done: @@ -469,7 +469,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) bool waiting = true; /* any queued threads before us */ struct rwsem_waiter waiter; struct rw_semaphore *ret = sem; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); /* undo write bias from down_write operation, stop active locking */ count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count); @@ -503,9 +503,9 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) * wake any read locks that were queued ahead of us. */ if (count > RWSEM_WAITING_BIAS) { - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); - sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); + __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); /* * The wakeup is normally called _after_ the wait_lock * is released, but given that we are proactively waking @@ -579,7 +579,7 @@ __visible struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); /* * If a spinner is present, it is not necessary to do the wakeup. @@ -614,9 +614,8 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); locked: - /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); wake_up_q(&wake_q); @@ -634,13 +633,12 @@ __visible struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) { unsigned long flags; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); raw_spin_lock_irqsave(&sem->wait_lock, flags); - /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); + __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); wake_up_q(&wake_q); diff --git a/kernel/module.c b/kernel/module.c index 529efae9f481..0e54d5bf0097 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1149,6 +1149,8 @@ static size_t module_flags_taint(struct module *mod, char *buf) buf[l++] = 'C'; if (mod->taints & (1 << TAINT_UNSIGNED_MODULE)) buf[l++] = 'E'; + if (mod->taints & (1 << TAINT_LIVEPATCH)) + buf[l++] = 'K'; /* * TAINT_FORCED_RMMOD: could be added. * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't @@ -1299,8 +1301,9 @@ static int check_version(Elf_Shdr *sechdrs, goto bad_version; } - pr_warn("%s: no symbol version for %s\n", mod->name, symname); - return 0; + /* Broken toolchain. Warn once, then let it go.. */ + pr_warn_once("%s: no symbol version for %s\n", mod->name, symname); + return 1; bad_version: pr_warn("%s: disagrees about version of symbol %s\n", @@ -2792,14 +2795,17 @@ static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned l } #ifdef CONFIG_LIVEPATCH -static int find_livepatch_modinfo(struct module *mod, struct load_info *info) +static int check_modinfo_livepatch(struct module *mod, struct load_info *info) { - mod->klp = get_modinfo(info, "livepatch") ? true : false; + if (get_modinfo(info, "livepatch")) { + mod->klp = true; + add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK); + } return 0; } #else /* !CONFIG_LIVEPATCH */ -static int find_livepatch_modinfo(struct module *mod, struct load_info *info) +static int check_modinfo_livepatch(struct module *mod, struct load_info *info) { if (get_modinfo(info, "livepatch")) { pr_err("%s: module is marked as livepatch module, but livepatch support is disabled", @@ -2969,7 +2975,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) "is unknown, you have been warned.\n", mod->name); } - err = find_livepatch_modinfo(mod, info); + err = check_modinfo_livepatch(mod, info); if (err) return err; diff --git a/kernel/padata.c b/kernel/padata.c index 993278895ccc..05316c9f32da 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -30,6 +30,7 @@ #include <linux/slab.h> #include <linux/sysfs.h> #include <linux/rcupdate.h> +#include <linux/module.h> #define MAX_OBJ_NUM 1000 @@ -63,15 +64,11 @@ static int padata_cpu_hash(struct parallel_data *pd) static void padata_parallel_worker(struct work_struct *parallel_work) { struct padata_parallel_queue *pqueue; - struct parallel_data *pd; - struct padata_instance *pinst; LIST_HEAD(local_list); local_bh_disable(); pqueue = container_of(parallel_work, struct padata_parallel_queue, work); - pd = pqueue->pd; - pinst = pd->pinst; spin_lock(&pqueue->parallel.lock); list_replace_init(&pqueue->parallel.list, &local_list); @@ -769,52 +766,43 @@ static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu) cpumask_test_cpu(cpu, pinst->cpumask.cbcpu); } - -static int padata_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int padata_cpu_online(unsigned int cpu, struct hlist_node *node) { - int err; struct padata_instance *pinst; - int cpu = (unsigned long)hcpu; + int ret; - pinst = container_of(nfb, struct padata_instance, cpu_notifier); + pinst = hlist_entry_safe(node, struct padata_instance, node); + if (!pinst_has_cpu(pinst, cpu)) + return 0; - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - if (!pinst_has_cpu(pinst, cpu)) - break; - mutex_lock(&pinst->lock); - err = __padata_add_cpu(pinst, cpu); - mutex_unlock(&pinst->lock); - if (err) - return notifier_from_errno(err); - break; + mutex_lock(&pinst->lock); + ret = __padata_add_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + return ret; +} - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - if (!pinst_has_cpu(pinst, cpu)) - break; - mutex_lock(&pinst->lock); - err = __padata_remove_cpu(pinst, cpu); - mutex_unlock(&pinst->lock); - if (err) - return notifier_from_errno(err); - break; - } +static int padata_cpu_prep_down(unsigned int cpu, struct hlist_node *node) +{ + struct padata_instance *pinst; + int ret; - return NOTIFY_OK; + pinst = hlist_entry_safe(node, struct padata_instance, node); + if (!pinst_has_cpu(pinst, cpu)) + return 0; + + mutex_lock(&pinst->lock); + ret = __padata_remove_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + return ret; } + +static enum cpuhp_state hp_online; #endif static void __padata_free(struct padata_instance *pinst) { #ifdef CONFIG_HOTPLUG_CPU - unregister_hotcpu_notifier(&pinst->cpu_notifier); + cpuhp_state_remove_instance_nocalls(hp_online, &pinst->node); #endif padata_stop(pinst); @@ -1012,11 +1000,8 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq, mutex_init(&pinst->lock); #ifdef CONFIG_HOTPLUG_CPU - pinst->cpu_notifier.notifier_call = padata_cpu_callback; - pinst->cpu_notifier.priority = 0; - register_hotcpu_notifier(&pinst->cpu_notifier); + cpuhp_state_add_instance_nocalls(hp_online, &pinst->node); #endif - return pinst; err_free_masks: @@ -1039,3 +1024,26 @@ void padata_free(struct padata_instance *pinst) kobject_put(&pinst->kobj); } EXPORT_SYMBOL(padata_free); + +#ifdef CONFIG_HOTPLUG_CPU + +static __init int padata_driver_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online", + padata_cpu_online, + padata_cpu_prep_down); + if (ret < 0) + return ret; + hp_online = ret; + return 0; +} +module_init(padata_driver_init); + +static __exit void padata_driver_exit(void) +{ + cpuhp_remove_multi_state(hp_online); +} +module_exit(padata_driver_exit); +#endif diff --git a/kernel/panic.c b/kernel/panic.c index ca8cea1ef673..e6480e20379e 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -71,6 +71,32 @@ void __weak nmi_panic_self_stop(struct pt_regs *regs) panic_smp_self_stop(); } +/* + * Stop other CPUs in panic. Architecture dependent code may override this + * with more suitable version. For example, if the architecture supports + * crash dump, it should save registers of each stopped CPU and disable + * per-CPU features such as virtualization extensions. + */ +void __weak crash_smp_send_stop(void) +{ + static int cpus_stopped; + + /* + * This function can be called twice in panic path, but obviously + * we execute this only once. + */ + if (cpus_stopped) + return; + + /* + * Note smp_send_stop is the usual smp shutdown function, which + * unfortunately means it may not be hardened to work in a panic + * situation. + */ + smp_send_stop(); + cpus_stopped = 1; +} + atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); /* @@ -164,14 +190,21 @@ void panic(const char *fmt, ...) if (!_crash_kexec_post_notifiers) { printk_nmi_flush_on_panic(); __crash_kexec(NULL); - } - /* - * Note smp_send_stop is the usual smp shutdown function, which - * unfortunately means it may not be hardened to work in a panic - * situation. - */ - smp_send_stop(); + /* + * Note smp_send_stop is the usual smp shutdown function, which + * unfortunately means it may not be hardened to work in a + * panic situation. + */ + smp_send_stop(); + } else { + /* + * If we want to do crash dump after notifier calls and + * kmsg_dump, we will need architecture dependent extra + * works in addition to stopping other CPUs. + */ + crash_smp_send_stop(); + } /* * Run any panic handlers, including those that might need to diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a65ba137fd15..df9e8e9e0be7 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -79,23 +79,36 @@ static void proc_cleanup_work(struct work_struct *work) /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ #define MAX_PID_NS_LEVEL 32 +static struct ucounts *inc_pid_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES); +} + +static void dec_pid_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_PID_NAMESPACES); +} + static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, struct pid_namespace *parent_pid_ns) { struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; + struct ucounts *ucounts; int i; int err; - if (level > MAX_PID_NS_LEVEL) { - err = -EINVAL; + err = -ENOSPC; + if (level > MAX_PID_NS_LEVEL) + goto out; + ucounts = inc_pid_namespaces(user_ns); + if (!ucounts) goto out; - } err = -ENOMEM; ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); if (ns == NULL) - goto out; + goto out_dec; ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!ns->pidmap[0].page) @@ -114,6 +127,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); + ns->ucounts = ucounts; ns->nr_hashed = PIDNS_HASH_ADDING; INIT_WORK(&ns->proc_work, proc_cleanup_work); @@ -129,6 +143,8 @@ out_free_map: kfree(ns->pidmap[0].page); out_free: kmem_cache_free(pid_ns_cachep, ns); +out_dec: + dec_pid_namespaces(ucounts); out: return ERR_PTR(err); } @@ -146,6 +162,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns) ns_free_inum(&ns->ns); for (i = 0; i < PIDMAP_ENTRIES; i++) kfree(ns->pidmap[i].page); + dec_pid_namespaces(ns->ucounts); put_user_ns(ns->user_ns); call_rcu(&ns->rcu, delayed_free_pidns); } @@ -388,12 +405,37 @@ static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) return 0; } +static struct ns_common *pidns_get_parent(struct ns_common *ns) +{ + struct pid_namespace *active = task_active_pid_ns(current); + struct pid_namespace *pid_ns, *p; + + /* See if the parent is in the current namespace */ + pid_ns = p = to_pid_ns(ns)->parent; + for (;;) { + if (!p) + return ERR_PTR(-EPERM); + if (p == active) + break; + p = p->parent; + } + + return &get_pid_ns(pid_ns)->ns; +} + +static struct user_namespace *pidns_owner(struct ns_common *ns) +{ + return to_pid_ns(ns)->user_ns; +} + const struct proc_ns_operations pidns_operations = { .name = "pid", .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, + .owner = pidns_owner, + .get_parent = pidns_get_parent, }; static __init int pid_namespaces_init(void) diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 68d3ebc12601..e8517b63eb37 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -186,7 +186,7 @@ config PM_SLEEP_DEBUG config DPM_WATCHDOG bool "Device suspend/resume watchdog" - depends on PM_DEBUG && PSTORE + depends on PM_DEBUG && PSTORE && EXPERT ---help--- Sets up a watchdog timer to capture drivers that are locked up attempting to suspend/resume a device. @@ -197,7 +197,7 @@ config DPM_WATCHDOG config DPM_WATCHDOG_TIMEOUT int "Watchdog timeout in seconds" range 1 120 - default 60 + default 120 depends on DPM_WATCHDOG config PM_TRACE diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 33c79b6105c5..b26dbc48c75b 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -306,8 +306,10 @@ static int create_image(int platform_mode) if (error) printk(KERN_ERR "PM: Error %d creating hibernation image\n", error); - if (!in_suspend) + if (!in_suspend) { events_check_enabled = false; + clear_free_pages(); + } platform_leave(platform_mode); @@ -1189,22 +1191,6 @@ static int __init nohibernate_setup(char *str) return 1; } -static int __init page_poison_nohibernate_setup(char *str) -{ -#ifdef CONFIG_PAGE_POISONING_ZERO - /* - * The zeroing option for page poison skips the checks on alloc. - * since hibernation doesn't save free pages there's no way to - * guarantee the pages will still be zeroed. - */ - if (!strcmp(str, "on")) { - pr_info("Disabling hibernation due to page poisoning\n"); - return nohibernate_setup(str); - } -#endif - return 1; -} - __setup("noresume", noresume_setup); __setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); @@ -1212,4 +1198,3 @@ __setup("hibernate=", hibernate_setup); __setup("resumewait", resumewait_setup); __setup("resumedelay=", resumedelay_setup); __setup("nohibernate", nohibernate_setup); -__setup("page_poison=", page_poison_nohibernate_setup); diff --git a/kernel/power/main.c b/kernel/power/main.c index 5ea50b1b7595..d401c21136d1 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -78,6 +78,78 @@ static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, power_attr(pm_async); +#ifdef CONFIG_SUSPEND +static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + char *s = buf; + suspend_state_t i; + + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) + if (mem_sleep_states[i]) { + const char *label = mem_sleep_states[i]; + + if (mem_sleep_current == i) + s += sprintf(s, "[%s] ", label); + else + s += sprintf(s, "%s ", label); + } + + /* Convert the last space to a newline if needed. */ + if (s != buf) + *(s-1) = '\n'; + + return (s - buf); +} + +static suspend_state_t decode_suspend_state(const char *buf, size_t n) +{ + suspend_state_t state; + char *p; + int len; + + p = memchr(buf, '\n', n); + len = p ? p - buf : n; + + for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) { + const char *label = mem_sleep_states[state]; + + if (label && len == strlen(label) && !strncmp(buf, label, len)) + return state; + } + + return PM_SUSPEND_ON; +} + +static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + suspend_state_t state; + int error; + + error = pm_autosleep_lock(); + if (error) + return error; + + if (pm_autosleep_state() > PM_SUSPEND_ON) { + error = -EBUSY; + goto out; + } + + state = decode_suspend_state(buf, n); + if (state < PM_SUSPEND_MAX && state > PM_SUSPEND_ON) + mem_sleep_current = state; + else + error = -EINVAL; + + out: + pm_autosleep_unlock(); + return error ? error : n; +} + +power_attr(mem_sleep); +#endif /* CONFIG_SUSPEND */ + #ifdef CONFIG_PM_DEBUG int pm_test_level = TEST_NONE; @@ -368,12 +440,16 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, } state = decode_state(buf, n); - if (state < PM_SUSPEND_MAX) + if (state < PM_SUSPEND_MAX) { + if (state == PM_SUSPEND_MEM) + state = mem_sleep_current; + error = pm_suspend(state); - else if (state == PM_SUSPEND_MAX) + } else if (state == PM_SUSPEND_MAX) { error = hibernate(); - else + } else { error = -EINVAL; + } out: pm_autosleep_unlock(); @@ -485,6 +561,9 @@ static ssize_t autosleep_store(struct kobject *kobj, && strcmp(buf, "off") && strcmp(buf, "off\n")) return -EINVAL; + if (state == PM_SUSPEND_MEM) + state = mem_sleep_current; + error = pm_autosleep_set_state(state); return error ? error : n; } @@ -602,6 +681,9 @@ static struct attribute * g[] = { #ifdef CONFIG_PM_SLEEP &pm_async_attr.attr, &wakeup_count_attr.attr, +#ifdef CONFIG_SUSPEND + &mem_sleep_attr.attr, +#endif #ifdef CONFIG_PM_AUTOSLEEP &autosleep_attr.attr, #endif @@ -644,6 +726,7 @@ static int __init pm_init(void) return error; hibernate_image_size_init(); hibernate_reserved_size_init(); + pm_states_init(); power_kobj = kobject_create_and_add("power", NULL); if (!power_kobj) return -ENOMEM; diff --git a/kernel/power/power.h b/kernel/power/power.h index 242d8b827dd5..1dfa0da827d3 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -110,6 +110,8 @@ extern int create_basic_memory_bitmaps(void); extern void free_basic_memory_bitmaps(void); extern int hibernate_preallocate_memory(void); +extern void clear_free_pages(void); + /** * Auxiliary structure used for reading the snapshot image data and * metadata from and writing them to the list of page backup entries @@ -187,11 +189,15 @@ extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *); #ifdef CONFIG_SUSPEND /* kernel/power/suspend.c */ -extern const char *pm_labels[]; +extern const char * const pm_labels[]; extern const char *pm_states[]; +extern const char *mem_sleep_states[]; +extern suspend_state_t mem_sleep_current; extern int suspend_devices_and_enter(suspend_state_t state); #else /* !CONFIG_SUSPEND */ +#define mem_sleep_current PM_SUSPEND_ON + static inline int suspend_devices_and_enter(suspend_state_t state) { return -ENOSYS; diff --git a/kernel/power/process.c b/kernel/power/process.c index 8f27d5a8adf6..2fba066e125f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -144,23 +144,12 @@ int freeze_processes(void) /* * Now that the whole userspace is frozen we need to disbale * the OOM killer to disallow any further interference with - * killable tasks. + * killable tasks. There is no guarantee oom victims will + * ever reach a point they go away we have to wait with a timeout. */ - if (!error && !oom_killer_disable()) + if (!error && !oom_killer_disable(msecs_to_jiffies(freeze_timeout_msecs))) error = -EBUSY; - /* - * There is a hard to fix race between oom_reaper kernel thread - * and oom_killer_disable. oom_reaper calls exit_oom_victim - * before the victim reaches exit_mm so try to freeze all the tasks - * again and catch such a left over task. - */ - if (!error) { - pr_info("Double checking all user space processes after OOM killer disable... "); - error = try_to_freeze_tasks(true); - pr_cont("\n"); - } - if (error) thaw_processes(); return error; diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 168ff442ebde..97b0df71303e 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -482,16 +482,7 @@ void pm_qos_update_request(struct pm_qos_request *req, return; } - /* - * This function may be called very early during boot, for example, - * from of_clk_init(), where irq needs to stay disabled. - * cancel_delayed_work_sync() assumes that irq is enabled on - * invocation and re-enables it on return. Avoid calling it until - * workqueue is initialized. - */ - if (keventd_up()) - cancel_delayed_work_sync(&req->work); - + cancel_delayed_work_sync(&req->work); __pm_qos_update_request(req, new_value); } EXPORT_SYMBOL_GPL(pm_qos_update_request); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index b02228411d57..4f0f0604f1c4 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1132,6 +1132,28 @@ void free_basic_memory_bitmaps(void) pr_debug("PM: Basic memory bitmaps freed\n"); } +void clear_free_pages(void) +{ +#ifdef CONFIG_PAGE_POISONING_ZERO + struct memory_bitmap *bm = free_pages_map; + unsigned long pfn; + + if (WARN_ON(!(free_pages_map))) + return; + + memory_bm_position_reset(bm); + pfn = memory_bm_next_pfn(bm); + while (pfn != BM_END_OF_MAP) { + if (pfn_valid(pfn)) + clear_highpage(pfn_to_page(pfn)); + + pfn = memory_bm_next_pfn(bm); + } + memory_bm_position_reset(bm); + pr_info("PM: free pages cleared after restore\n"); +#endif /* PAGE_POISONING_ZERO */ +} + /** * snapshot_additional_pages - Estimate the number of extra pages needed. * @zone: Memory zone to carry out the computation for. diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 0acab9d7f96f..f67ceb7768b8 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -32,8 +32,21 @@ #include "power.h" -const char *pm_labels[] = { "mem", "standby", "freeze", NULL }; +const char * const pm_labels[] = { + [PM_SUSPEND_FREEZE] = "freeze", + [PM_SUSPEND_STANDBY] = "standby", + [PM_SUSPEND_MEM] = "mem", +}; const char *pm_states[PM_SUSPEND_MAX]; +static const char * const mem_sleep_labels[] = { + [PM_SUSPEND_FREEZE] = "s2idle", + [PM_SUSPEND_STANDBY] = "shallow", + [PM_SUSPEND_MEM] = "deep", +}; +const char *mem_sleep_states[PM_SUSPEND_MAX]; + +suspend_state_t mem_sleep_current = PM_SUSPEND_FREEZE; +suspend_state_t mem_sleep_default = PM_SUSPEND_MAX; unsigned int pm_suspend_global_flags; EXPORT_SYMBOL_GPL(pm_suspend_global_flags); @@ -110,22 +123,32 @@ static bool valid_state(suspend_state_t state) return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); } -/* - * If this is set, the "mem" label always corresponds to the deepest sleep state - * available, the "standby" label corresponds to the second deepest sleep state - * available (if any), and the "freeze" label corresponds to the remaining - * available sleep state (if there is one). - */ -static bool relative_states; +void __init pm_states_init(void) +{ + /* "mem" and "freeze" are always present in /sys/power/state. */ + pm_states[PM_SUSPEND_MEM] = pm_labels[PM_SUSPEND_MEM]; + pm_states[PM_SUSPEND_FREEZE] = pm_labels[PM_SUSPEND_FREEZE]; + /* + * Suspend-to-idle should be supported even without any suspend_ops, + * initialize mem_sleep_states[] accordingly here. + */ + mem_sleep_states[PM_SUSPEND_FREEZE] = mem_sleep_labels[PM_SUSPEND_FREEZE]; +} -static int __init sleep_states_setup(char *str) +static int __init mem_sleep_default_setup(char *str) { - relative_states = !strncmp(str, "1", 1); - pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2]; + suspend_state_t state; + + for (state = PM_SUSPEND_FREEZE; state <= PM_SUSPEND_MEM; state++) + if (mem_sleep_labels[state] && + !strcmp(str, mem_sleep_labels[state])) { + mem_sleep_default = state; + break; + } + return 1; } - -__setup("relative_sleep_states=", sleep_states_setup); +__setup("mem_sleep_default=", mem_sleep_default_setup); /** * suspend_set_ops - Set the global suspend method table. @@ -133,21 +156,21 @@ __setup("relative_sleep_states=", sleep_states_setup); */ void suspend_set_ops(const struct platform_suspend_ops *ops) { - suspend_state_t i; - int j = 0; - lock_system_sleep(); suspend_ops = ops; - for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) - if (valid_state(i)) { - pm_states[i] = pm_labels[j++]; - } else if (!relative_states) { - pm_states[i] = NULL; - j++; - } - pm_states[PM_SUSPEND_FREEZE] = pm_labels[j]; + if (valid_state(PM_SUSPEND_STANDBY)) { + mem_sleep_states[PM_SUSPEND_STANDBY] = mem_sleep_labels[PM_SUSPEND_STANDBY]; + pm_states[PM_SUSPEND_STANDBY] = pm_labels[PM_SUSPEND_STANDBY]; + if (mem_sleep_default == PM_SUSPEND_STANDBY) + mem_sleep_current = PM_SUSPEND_STANDBY; + } + if (valid_state(PM_SUSPEND_MEM)) { + mem_sleep_states[PM_SUSPEND_MEM] = mem_sleep_labels[PM_SUSPEND_MEM]; + if (mem_sleep_default >= PM_SUSPEND_MEM) + mem_sleep_current = PM_SUSPEND_MEM; + } unlock_system_sleep(); } @@ -211,7 +234,7 @@ static int platform_suspend_begin(suspend_state_t state) { if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) return freeze_ops->begin(); - else if (suspend_ops->begin) + else if (suspend_ops && suspend_ops->begin) return suspend_ops->begin(state); else return 0; @@ -221,7 +244,7 @@ static void platform_resume_end(suspend_state_t state) { if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) freeze_ops->end(); - else if (suspend_ops->end) + else if (suspend_ops && suspend_ops->end) suspend_ops->end(); } @@ -490,9 +513,9 @@ static int enter_state(suspend_state_t state) #ifndef CONFIG_SUSPEND_SKIP_SYNC trace_suspend_resume(TPS("sync_filesystems"), 0, true); - printk(KERN_INFO "PM: Syncing filesystems ... "); + pr_info("PM: Syncing filesystems ... "); sys_sync(); - printk("done.\n"); + pr_cont("done.\n"); trace_suspend_resume(TPS("sync_filesystems"), 0, false); #endif diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 084452e34a12..bdff5ed57f10 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -203,8 +203,10 @@ static int __init test_suspend(void) /* RTCs have initialized by now too ... can we use one? */ dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm); - if (dev) + if (dev) { rtc = rtc_class_open(dev_name(dev)); + put_device(dev); + } if (!rtc) { printk(warn_no_rtc); return 0; diff --git a/kernel/power/swap.c b/kernel/power/swap.c index a3b1e617bcdc..32e0c232efba 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -307,7 +307,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; - hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block, + hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block, swsusp_header, NULL); if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { @@ -317,7 +317,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) swsusp_header->flags = flags; if (flags & SF_CRC32_MODE) swsusp_header->crc32 = handle->crc32; - error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, + error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { printk(KERN_ERR "PM: Swap header not found!\n"); @@ -397,7 +397,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) } else { src = buf; } - return hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, offset, src, hb); + return hib_submit_io(REQ_OP_WRITE, REQ_SYNC, offset, src, hb); } static void release_swap_writer(struct swap_map_handle *handle) @@ -1000,8 +1000,7 @@ static int get_swap_reader(struct swap_map_handle *handle, return -ENOMEM; } - error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, - tmp->map, NULL); + error = hib_submit_io(REQ_OP_READ, 0, offset, tmp->map, NULL); if (error) { release_swap_reader(handle); return error; @@ -1025,7 +1024,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, offset = handle->cur->entries[handle->k]; if (!offset) return -EFAULT; - error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, buf, hb); + error = hib_submit_io(REQ_OP_READ, 0, offset, buf, hb); if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { @@ -1534,7 +1533,7 @@ int swsusp_check(void) if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); - error = hib_submit_io(REQ_OP_READ, READ_SYNC, + error = hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block, swsusp_header, NULL); if (error) @@ -1543,7 +1542,7 @@ int swsusp_check(void) if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); /* Reset swap signature now */ - error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, + error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { @@ -1588,11 +1587,11 @@ int swsusp_unmark(void) { int error; - hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block, + hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block, swsusp_header, NULL); if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); - error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, + error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index 16bab471c7e2..f011aaef583c 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -67,7 +67,8 @@ static int vprintk_nmi(const char *fmt, va_list args) again: len = atomic_read(&s->len); - if (len >= sizeof(s->buffer)) { + /* The trailing '\0' is not counted into len. */ + if (len >= sizeof(s->buffer) - 1) { atomic_inc(&nmi_message_lost); return 0; } @@ -79,7 +80,7 @@ again: if (!len) smp_rmb(); - add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); + add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); /* * Do it once again if the buffer has been flushed in the meantime. @@ -113,16 +114,51 @@ static void printk_nmi_flush_line(const char *text, int len) } -/* - * printk one line from the temporary buffer from @start index until - * and including the @end index. - */ -static void printk_nmi_flush_seq_line(struct nmi_seq_buf *s, - int start, int end) +/* printk part of the temporary buffer line by line */ +static int printk_nmi_flush_buffer(const char *start, size_t len) { - const char *buf = s->buffer + start; + const char *c, *end; + bool header; + + c = start; + end = start + len; + header = true; + + /* Print line by line. */ + while (c < end) { + if (*c == '\n') { + printk_nmi_flush_line(start, c - start + 1); + start = ++c; + header = true; + continue; + } + + /* Handle continuous lines or missing new line. */ + if ((c + 1 < end) && printk_get_level(c)) { + if (header) { + c = printk_skip_level(c); + continue; + } + + printk_nmi_flush_line(start, c - start); + start = c++; + header = true; + continue; + } + + header = false; + c++; + } - printk_nmi_flush_line(buf, (end - start) + 1); + /* Check if there was a partial line. Ignore pure header. */ + if (start < end && !header) { + static const char newline[] = KERN_CONT "\n"; + + printk_nmi_flush_line(start, end - start); + printk_nmi_flush_line(newline, strlen(newline)); + } + + return len; } /* @@ -135,8 +171,8 @@ static void __printk_nmi_flush(struct irq_work *work) __RAW_SPIN_LOCK_INITIALIZER(read_lock); struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work); unsigned long flags; - size_t len, size; - int i, last_i; + size_t len; + int i; /* * The lock has two functions. First, one reader has to flush all @@ -154,12 +190,14 @@ more: /* * This is just a paranoid check that nobody has manipulated * the buffer an unexpected way. If we printed something then - * @len must only increase. + * @len must only increase. Also it should never overflow the + * buffer size. */ - if (i && i >= len) { + if ((i && i >= len) || len > sizeof(s->buffer)) { const char *msg = "printk_nmi_flush: internal error\n"; printk_nmi_flush_line(msg, strlen(msg)); + len = 0; } if (!len) @@ -167,22 +205,7 @@ more: /* Make sure that data has been written up to the @len */ smp_rmb(); - - size = min(len, sizeof(s->buffer)); - last_i = i; - - /* Print line by line. */ - for (; i < size; i++) { - if (s->buffer[i] == '\n') { - printk_nmi_flush_seq_line(s, last_i, i); - last_i = i + 1; - } - } - /* Check if there was a partial line. */ - if (last_i < size) { - printk_nmi_flush_seq_line(s, last_i, size - 1); - printk_nmi_flush_line("\n", strlen("\n")); - } + i += printk_nmi_flush_buffer(s->buffer + i, len - i); /* * Check that nothing has got added in the meantime and truncate diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index eea6dbc2d8cf..577f2288d19f 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -655,11 +655,8 @@ static ssize_t msg_print_ext_header(char *buf, size_t size, * better readable output. 'c' in the record flags mark the first * fragment of a line, '+' the following. */ - if (msg->flags & LOG_CONT && !(prev_flags & LOG_CONT)) - cont = 'c'; - else if ((msg->flags & LOG_CONT) || - ((prev_flags & LOG_CONT) && !(msg->flags & LOG_PREFIX))) - cont = '+'; + if (msg->flags & LOG_CONT) + cont = (prev_flags & LOG_CONT) ? '+' : 'c'; return scnprintf(buf, size, "%u,%llu,%llu,%c;", (msg->facility << 3) | msg->level, seq, ts_usec, cont); @@ -1643,35 +1640,33 @@ static struct cont { bool flushed:1; /* buffer sealed and committed */ } cont; -static void cont_flush(enum log_flags flags) +static void cont_flush(void) { if (cont.flushed) return; if (cont.len == 0) return; - if (cont.cons) { /* * If a fragment of this line was directly flushed to the * console; wait for the console to pick up the rest of the * line. LOG_NOCONS suppresses a duplicated output. */ - log_store(cont.facility, cont.level, flags | LOG_NOCONS, + log_store(cont.facility, cont.level, cont.flags | LOG_NOCONS, cont.ts_nsec, NULL, 0, cont.buf, cont.len); - cont.flags = flags; cont.flushed = true; } else { /* * If no fragment of this line ever reached the console, * just submit it to the store and free the buffer. */ - log_store(cont.facility, cont.level, flags, 0, + log_store(cont.facility, cont.level, cont.flags, 0, NULL, 0, cont.buf, cont.len); cont.len = 0; } } -static bool cont_add(int facility, int level, const char *text, size_t len) +static bool cont_add(int facility, int level, enum log_flags flags, const char *text, size_t len) { if (cont.len && cont.flushed) return false; @@ -1682,7 +1677,7 @@ static bool cont_add(int facility, int level, const char *text, size_t len) * the line gets too long, split it up in separate records. */ if (nr_ext_console_drivers || cont.len + len > sizeof(cont.buf)) { - cont_flush(LOG_CONT); + cont_flush(); return false; } @@ -1691,7 +1686,7 @@ static bool cont_add(int facility, int level, const char *text, size_t len) cont.level = level; cont.owner = current; cont.ts_nsec = local_clock(); - cont.flags = 0; + cont.flags = flags; cont.cons = 0; cont.flushed = false; } @@ -1699,8 +1694,15 @@ static bool cont_add(int facility, int level, const char *text, size_t len) memcpy(cont.buf + cont.len, text, len); cont.len += len; + // The original flags come from the first line, + // but later continuations can add a newline. + if (flags & LOG_NEWLINE) { + cont.flags |= LOG_NEWLINE; + cont_flush(); + } + if (cont.len > (sizeof(cont.buf) * 80) / 100) - cont_flush(LOG_CONT); + cont_flush(); return true; } @@ -1733,6 +1735,35 @@ static size_t cont_print_text(char *text, size_t size) return textlen; } +static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len) +{ + /* + * If an earlier line was buffered, and we're a continuation + * write from the same process, try to add it to the buffer. + */ + if (cont.len) { + if (cont.owner == current && (lflags & LOG_CONT)) { + if (cont_add(facility, level, lflags, text, text_len)) + return text_len; + } + /* Otherwise, make sure it's flushed */ + cont_flush(); + } + + /* Skip empty continuation lines that couldn't be added - they just flush */ + if (!text_len && (lflags & LOG_CONT)) + return 0; + + /* If it doesn't end in a newline, try to buffer the current line */ + if (!(lflags & LOG_NEWLINE)) { + if (cont_add(facility, level, lflags, text, text_len)) + return text_len; + } + + /* Store it in the record log */ + return log_store(facility, level, lflags, 0, dict, dictlen, text, text_len); +} + asmlinkage int vprintk_emit(int facility, int level, const char *dict, size_t dictlen, const char *fmt, va_list args) @@ -1819,10 +1850,9 @@ asmlinkage int vprintk_emit(int facility, int level, /* strip kernel syslog prefix and extract log level or control flags */ if (facility == 0) { - int kern_level = printk_get_level(text); + int kern_level; - if (kern_level) { - const char *end_of_header = printk_skip_level(text); + while ((kern_level = printk_get_level(text)) != 0) { switch (kern_level) { case '0' ... '7': if (level == LOGLEVEL_DEFAULT) @@ -1830,14 +1860,13 @@ asmlinkage int vprintk_emit(int facility, int level, /* fallthrough */ case 'd': /* KERN_DEFAULT */ lflags |= LOG_PREFIX; + break; + case 'c': /* KERN_CONT */ + lflags |= LOG_CONT; } - /* - * No need to check length here because vscnprintf - * put '\0' at the end of the string. Only valid and - * newly printed level is detected. - */ - text_len -= end_of_header - text; - text = (char *)end_of_header; + + text_len -= 2; + text += 2; } } @@ -1847,45 +1876,7 @@ asmlinkage int vprintk_emit(int facility, int level, if (dict) lflags |= LOG_PREFIX|LOG_NEWLINE; - if (!(lflags & LOG_NEWLINE)) { - /* - * Flush the conflicting buffer. An earlier newline was missing, - * or another task also prints continuation lines. - */ - if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) - cont_flush(LOG_NEWLINE); - - /* buffer line if possible, otherwise store it right away */ - if (cont_add(facility, level, text, text_len)) - printed_len += text_len; - else - printed_len += log_store(facility, level, - lflags | LOG_CONT, 0, - dict, dictlen, text, text_len); - } else { - bool stored = false; - - /* - * If an earlier newline was missing and it was the same task, - * either merge it with the current buffer and flush, or if - * there was a race with interrupts (prefix == true) then just - * flush it out and store this line separately. - * If the preceding printk was from a different task and missed - * a newline, flush and append the newline. - */ - if (cont.len) { - if (cont.owner == current && !(lflags & LOG_PREFIX)) - stored = cont_add(facility, level, text, - text_len); - cont_flush(LOG_NEWLINE); - } - - if (stored) - printed_len += text_len; - else - printed_len += log_store(facility, level, lflags, 0, - dict, dictlen, text, text_len); - } + printed_len += log_output(facility, level, lflags, dict, dictlen, text, text_len); logbuf_cpu = UINT_MAX; raw_spin_unlock(&logbuf_lock); @@ -2175,27 +2166,20 @@ void resume_console(void) /** * console_cpu_notify - print deferred console messages after CPU hotplug - * @self: notifier struct - * @action: CPU hotplug event - * @hcpu: unused + * @cpu: unused * * If printk() is called from a CPU that is not online yet, the messages * will be spooled but will not show up on the console. This function is * called when a new CPU comes online (or fails to come up), and ensures * that any such output gets printed. */ -static int console_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - switch (action) { - case CPU_ONLINE: - case CPU_DEAD: - case CPU_DOWN_FAILED: - case CPU_UP_CANCELED: +static int console_cpu_notify(unsigned int cpu) +{ + if (!cpuhp_tasks_frozen) { console_lock(); console_unlock(); } - return NOTIFY_OK; + return 0; } /** @@ -2833,6 +2817,7 @@ EXPORT_SYMBOL(unregister_console); static int __init printk_late_init(void) { struct console *con; + int ret; for_each_console(con) { if (!keep_bootcon && con->flags & CON_BOOT) { @@ -2847,7 +2832,12 @@ static int __init printk_late_init(void) unregister_console(con); } } - hotcpu_notifier(console_cpu_notify, 0); + ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL, + console_cpu_notify); + WARN_ON(ret < 0); + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online", + console_cpu_notify, NULL); + WARN_ON(ret < 0); return 0; } late_initcall(printk_late_init); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1d3b7665d0be..e6474f7272ec 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -73,6 +73,8 @@ void __ptrace_unlink(struct task_struct *child) { BUG_ON(!child->ptrace); + clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + child->parent = child->real_parent; list_del_init(&child->ptrace_entry); @@ -489,7 +491,6 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) /* Architecture-specific hardware disable .. */ ptrace_disable(child); - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); write_lock_irq(&tasklist_lock); /* @@ -536,7 +537,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst int this_len, retval; this_len = (len > sizeof(buf)) ? sizeof(buf) : len; - retval = access_process_vm(tsk, src, buf, this_len, 0); + retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE); if (!retval) { if (copied) break; @@ -563,7 +564,8 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds this_len = (len > sizeof(buf)) ? sizeof(buf) : len; if (copy_from_user(buf, src, this_len)) return -EFAULT; - retval = access_process_vm(tsk, dst, buf, this_len, 1); + retval = access_process_vm(tsk, dst, buf, this_len, + FOLL_FORCE | FOLL_WRITE); if (!retval) { if (copied) break; @@ -1126,7 +1128,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, unsigned long tmp; int copied; - copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0); + copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) return -EIO; return put_user(tmp, (unsigned long __user *)data); @@ -1137,7 +1139,8 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, { int copied; - copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); + copied = access_process_vm(tsk, addr, &data, sizeof(data), + FOLL_FORCE | FOLL_WRITE); return (copied == sizeof(data)) ? 0 : -EIO; } @@ -1154,7 +1157,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, switch (request) { case PTRACE_PEEKTEXT: case PTRACE_PEEKDATA: - ret = access_process_vm(child, addr, &word, sizeof(word), 0); + ret = access_process_vm(child, addr, &word, sizeof(word), + FOLL_FORCE); if (ret != sizeof(word)) ret = -EIO; else @@ -1163,7 +1167,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, case PTRACE_POKETEXT: case PTRACE_POKEDATA: - ret = access_process_vm(child, addr, &data, sizeof(data), 1); + ret = access_process_vm(child, addr, &data, sizeof(data), + FOLL_FORCE | FOLL_WRITE); ret = (ret != sizeof(data) ? -EIO : 0); break; diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index d38ab08a3fe7..123ccbd22449 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -52,7 +52,7 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); #define PERF_FLAG "-perf:" #define PERFOUT_STRING(s) \ - pr_alert("%s" PERF_FLAG s "\n", perf_type) + pr_alert("%s" PERF_FLAG " %s\n", perf_type, s) #define VERBOSE_PERFOUT_STRING(s) \ do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0) #define VERBOSE_PERFOUT_ERRSTRING(s) \ @@ -400,9 +400,8 @@ rcu_perf_writer(void *arg) sp.sched_priority = 0; sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); - pr_alert("%s" PERF_FLAG - "rcu_perf_writer %ld has %d measurements\n", - perf_type, me, MIN_MEAS); + pr_alert("%s%s rcu_perf_writer %ld has %d measurements\n", + perf_type, PERF_FLAG, me, MIN_MEAS); if (atomic_inc_return(&n_rcu_perf_writer_finished) >= nrealwriters) { schedule_timeout_interruptible(10); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 971e2b138063..87c51225ceec 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -289,15 +289,24 @@ static int rcu_torture_read_lock(void) __acquires(RCU) static void rcu_read_delay(struct torture_random_state *rrsp) { + unsigned long started; + unsigned long completed; const unsigned long shortdelay_us = 200; const unsigned long longdelay_ms = 50; + unsigned long long ts; /* We want a short delay sometimes to make a reader delay the grace * period, and we want a long delay occasionally to trigger * force_quiescent_state. */ - if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) + if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { + started = cur_ops->completed(); + ts = rcu_trace_clock_local(); mdelay(longdelay_ms); + completed = cur_ops->completed(); + do_trace_rcu_torture_read(cur_ops->name, NULL, ts, + started, completed); + } if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) udelay(shortdelay_us); #ifdef CONFIG_PREEMPT @@ -1238,6 +1247,7 @@ rcu_torture_stats_print(void) long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; static unsigned long rtcv_snap = ULONG_MAX; + struct task_struct *wtp; for_each_possible_cpu(cpu) { for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { @@ -1258,8 +1268,9 @@ rcu_torture_stats_print(void) atomic_read(&n_rcu_torture_alloc), atomic_read(&n_rcu_torture_alloc_fail), atomic_read(&n_rcu_torture_free)); - pr_cont("rtmbe: %d rtbke: %ld rtbre: %ld ", + pr_cont("rtmbe: %d rtbe: %ld rtbke: %ld rtbre: %ld ", atomic_read(&n_rcu_torture_mberror), + n_rcu_torture_barrier_error, n_rcu_torture_boost_ktrerror, n_rcu_torture_boost_rterror); pr_cont("rtbf: %ld rtb: %ld nt: %ld ", @@ -1312,10 +1323,12 @@ rcu_torture_stats_print(void) rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); - pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x\n", + wtp = READ_ONCE(writer_task); + pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n", rcu_torture_writer_state_getname(), rcu_torture_writer_state, - gpnum, completed, flags); + gpnum, completed, flags, + wtp == NULL ? ~0UL : wtp->state); show_rcu_gp_kthreads(); rcu_ftrace_dump(DUMP_ALL); } @@ -1362,12 +1375,12 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) onoff_interval, onoff_holdoff); } -static void rcutorture_booster_cleanup(int cpu) +static int rcutorture_booster_cleanup(unsigned int cpu) { struct task_struct *t; if (boost_tasks[cpu] == NULL) - return; + return 0; mutex_lock(&boost_mutex); t = boost_tasks[cpu]; boost_tasks[cpu] = NULL; @@ -1375,9 +1388,10 @@ static void rcutorture_booster_cleanup(int cpu) /* This must be outside of the mutex, otherwise deadlock! */ torture_stop_kthread(rcu_torture_boost, t); + return 0; } -static int rcutorture_booster_init(int cpu) +static int rcutorture_booster_init(unsigned int cpu) { int retval; @@ -1577,28 +1591,7 @@ static void rcu_torture_barrier_cleanup(void) } } -static int rcutorture_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - (void)rcutorture_booster_init(cpu); - break; - case CPU_DOWN_PREPARE: - rcutorture_booster_cleanup(cpu); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block rcutorture_cpu_nb = { - .notifier_call = rcutorture_cpu_notify, -}; +static enum cpuhp_state rcutor_hp; static void rcu_torture_cleanup(void) @@ -1638,11 +1631,8 @@ rcu_torture_cleanup(void) for (i = 0; i < ncbflooders; i++) torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]); if ((test_boost == 1 && cur_ops->can_boost) || - test_boost == 2) { - unregister_cpu_notifier(&rcutorture_cpu_nb); - for_each_possible_cpu(i) - rcutorture_booster_cleanup(i); - } + test_boost == 2) + cpuhp_remove_state(rcutor_hp); /* * Wait for all RCU callbacks to fire, then do flavor-specific @@ -1869,14 +1859,13 @@ rcu_torture_init(void) test_boost == 2) { boost_starttime = jiffies + test_boost_interval * HZ; - register_cpu_notifier(&rcutorture_cpu_nb); - for_each_possible_cpu(i) { - if (cpu_is_offline(i)) - continue; /* Heuristic: CPU can go offline. */ - firsterr = rcutorture_booster_init(i); - if (firsterr) - goto unwind; - } + + firsterr = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "RCU_TORTURE", + rcutorture_booster_init, + rcutorture_booster_cleanup); + if (firsterr < 0) + goto unwind; + rcutor_hp = firsterr; } firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); if (firsterr) diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index be922c9f3d37..50d1861f7759 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -68,6 +68,8 @@ void rcu_sync_lockdep_assert(struct rcu_sync *rsp) RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), "suspicious rcu_sync_is_idle() usage"); } + +EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert); #endif /** @@ -83,6 +85,18 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) } /** + * Must be called after rcu_sync_init() and before first use. + * + * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() + * pairs turn into NO-OPs. + */ +void rcu_sync_enter_start(struct rcu_sync *rsp) +{ + rsp->gp_count++; + rsp->gp_state = GP_PASSED; +} + +/** * rcu_sync_enter() - Force readers onto slowpath * @rsp: Pointer to rcu_sync structure to use for synchronization * diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 944b1b491ed8..1898559e6b60 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -170,7 +170,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) false)); } -static void rcu_process_callbacks(struct softirq_action *unused) +static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) { __rcu_process_callbacks(&rcu_sched_ctrlblk); __rcu_process_callbacks(&rcu_bh_ctrlblk); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5d80925e7fc8..96c52e43f7ca 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -41,7 +41,6 @@ #include <linux/export.h> #include <linux/completion.h> #include <linux/moduleparam.h> -#include <linux/module.h> #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/cpu.h> @@ -60,7 +59,6 @@ #include "tree.h" #include "rcu.h" -MODULE_ALIAS("rcutree"); #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX #endif @@ -1306,7 +1304,8 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp) if (!rcu_kick_kthreads) return; j = READ_ONCE(rsp->jiffies_kick_kthreads); - if (time_after(jiffies, j) && rsp->gp_kthread) { + if (time_after(jiffies, j) && rsp->gp_kthread && + (rcu_gp_in_progress(rsp) || READ_ONCE(rsp->gp_flags))) { WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name); rcu_ftrace_dump(DUMP_ALL); wake_up_process(rsp->gp_kthread); @@ -1848,6 +1847,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { bool ret; + bool need_gp; /* Handle the ends of any preceding grace periods first. */ if (rdp->completed == rnp->completed && @@ -1874,9 +1874,10 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, */ rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); - rdp->cpu_no_qs.b.norm = true; + need_gp = !!(rnp->qsmask & rdp->grpmask); + rdp->cpu_no_qs.b.norm = need_gp; rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); - rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask); + rdp->core_needs_qs = need_gp; zero_cpu_stall_ticks(rdp); WRITE_ONCE(rdp->gpwrap, false); } @@ -2344,7 +2345,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); - swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ + rcu_gp_kthread_wake(rsp); } /* @@ -2828,8 +2829,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) * Also schedule RCU core processing. * * This function must be called from hardirq context. It is normally - * invoked from the scheduling-clock interrupt. If rcu_pending returns - * false, there is no point in invoking rcu_check_callbacks(). + * invoked from the scheduling-clock interrupt. */ void rcu_check_callbacks(int user) { @@ -2970,7 +2970,7 @@ static void force_quiescent_state(struct rcu_state *rsp) } WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); - swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ + rcu_gp_kthread_wake(rsp); } /* @@ -3013,7 +3013,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) /* * Do RCU core processing for the current CPU. */ -static void rcu_process_callbacks(struct softirq_action *unused) +static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) { struct rcu_state *rsp; @@ -3121,7 +3121,9 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, unsigned long flags; struct rcu_data *rdp; - WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */ + /* Misaligned rcu_head! */ + WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); + if (debug_rcu_head_queue(head)) { /* Probable double call_rcu(), so leak the callback. */ WRITE_ONCE(head->func, rcu_leak_callback); @@ -3130,13 +3132,6 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, } head->func = func; head->next = NULL; - - /* - * Opportunistically note grace-period endings and beginnings. - * Note that we might see a beginning right after we see an - * end, but never vice versa, since this CPU has to pass through - * a quiescent state betweentimes. - */ local_irq_save(flags); rdp = this_cpu_ptr(rsp->rda); @@ -3792,8 +3787,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rnp = rdp->mynode; mask = rdp->grpmask; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - rnp->qsmaskinitnext |= mask; - rnp->expmaskinitnext |= mask; if (!rdp->beenonline) WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); rdp->beenonline = true; /* We have now been online. */ @@ -3860,6 +3853,32 @@ int rcutree_dead_cpu(unsigned int cpu) return 0; } +/* + * Mark the specified CPU as being online so that subsequent grace periods + * (both expedited and normal) will wait on it. Note that this means that + * incoming CPUs are not allowed to use RCU read-side critical sections + * until this function is called. Failing to observe this restriction + * will result in lockdep splats. + */ +void rcu_cpu_starting(unsigned int cpu) +{ + unsigned long flags; + unsigned long mask; + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + rdp = this_cpu_ptr(rsp->rda); + rnp = rdp->mynode; + mask = rdp->grpmask; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rnp->qsmaskinitnext |= mask; + rnp->expmaskinitnext |= mask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } +} + #ifdef CONFIG_HOTPLUG_CPU /* * The CPU is exiting the idle loop into the arch_cpu_idle_dead() @@ -4209,8 +4228,10 @@ void __init rcu_init(void) * or the scheduler are operational. */ pm_notifier(rcu_pm_notify, 0); - for_each_online_cpu(cpu) + for_each_online_cpu(cpu) { rcutree_prepare_cpu(cpu); + rcu_cpu_starting(cpu); + } } #include "tree_exp.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f714f873bf9d..fe98dd24adf8 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -400,9 +400,11 @@ struct rcu_data { #ifdef CONFIG_RCU_FAST_NO_HZ struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ + atomic_long_t exp_workdone0; /* # done by workqueue. */ atomic_long_t exp_workdone1; /* # done by others #1. */ atomic_long_t exp_workdone2; /* # done by others #2. */ atomic_long_t exp_workdone3; /* # done by others #3. */ + int exp_dynticks_snap; /* Double-check need for IPI. */ /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6d86ab6ec2c9..d3053e99fdb6 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -358,8 +358,11 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + rdp->exp_dynticks_snap = + atomic_add_return(0, &rdtp->dynticks); if (raw_smp_processor_id() == cpu || - !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) + !(rdp->exp_dynticks_snap & 0x1) || + !(rnp->qsmaskinitnext & rdp->grpmask)) mask_ofl_test |= rdp->grpmask; } mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; @@ -376,25 +379,32 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, /* IPI the remaining CPUs for expedited quiescent state. */ for_each_leaf_node_possible_cpu(rnp, cpu) { unsigned long mask = leaf_node_cpu_bit(rnp, cpu); + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + if (!(mask_ofl_ipi & mask)) continue; retry_ipi: + if (atomic_add_return(0, &rdtp->dynticks) != + rdp->exp_dynticks_snap) { + mask_ofl_test |= mask; + continue; + } ret = smp_call_function_single(cpu, func, rsp, 0); if (!ret) { mask_ofl_ipi &= ~mask; continue; } - /* Failed, raced with offline. */ + /* Failed, raced with CPU hotplug operation. */ raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (cpu_online(cpu) && + if ((rnp->qsmaskinitnext & mask) && (rnp->expmask & mask)) { + /* Online, so delay for a bit and try again. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); schedule_timeout_uninterruptible(1); - if (cpu_online(cpu) && - (rnp->expmask & mask)) - goto retry_ipi; - raw_spin_lock_irqsave_rcu_node(rnp, flags); + goto retry_ipi; } + /* CPU really is offline, so we can ignore it. */ if (!(rnp->expmask & mask)) mask_ofl_ipi &= ~mask; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -427,12 +437,10 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) jiffies_stall); if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) return; - if (ret < 0) { - /* Hit a signal, disable CPU stall warnings. */ - swait_event(rsp->expedited_wq, - sync_rcu_preempt_exp_done(rnp_root)); - return; - } + WARN_ON(ret < 0); /* workqueues should not be signaled. */ + if (rcu_cpu_stall_suppress) + continue; + panic_on_rcu_stall(); pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rsp->name); ndetected = 0; @@ -500,7 +508,6 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) * next GP, to proceed. */ mutex_lock(&rsp->exp_wake_mutex); - mutex_unlock(&rsp->exp_mutex); rcu_for_each_node_breadth_first(rsp, rnp) { if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { @@ -516,6 +523,70 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) mutex_unlock(&rsp->exp_wake_mutex); } +/* Let the workqueue handler know what it is supposed to do. */ +struct rcu_exp_work { + smp_call_func_t rew_func; + struct rcu_state *rew_rsp; + unsigned long rew_s; + struct work_struct rew_work; +}; + +/* + * Work-queue handler to drive an expedited grace period forward. + */ +static void wait_rcu_exp_gp(struct work_struct *wp) +{ + struct rcu_exp_work *rewp; + + /* Initialize the rcu_node tree in preparation for the wait. */ + rewp = container_of(wp, struct rcu_exp_work, rew_work); + sync_rcu_exp_select_cpus(rewp->rew_rsp, rewp->rew_func); + + /* Wait and clean up, including waking everyone. */ + rcu_exp_wait_wake(rewp->rew_rsp, rewp->rew_s); +} + +/* + * Given an rcu_state pointer and a smp_call_function() handler, kick + * off the specified flavor of expedited grace period. + */ +static void _synchronize_rcu_expedited(struct rcu_state *rsp, + smp_call_func_t func) +{ + struct rcu_data *rdp; + struct rcu_exp_work rew; + struct rcu_node *rnp; + unsigned long s; + + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(rsp->call); + return; + } + + /* Take a snapshot of the sequence number. */ + s = rcu_exp_gp_seq_snap(rsp); + if (exp_funnel_lock(rsp, s)) + return; /* Someone else did our work for us. */ + + /* Marshall arguments and schedule the expedited grace period. */ + rew.rew_func = func; + rew.rew_rsp = rsp; + rew.rew_s = s; + INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); + schedule_work(&rew.rew_work); + + /* Wait for expedited grace period to complete. */ + rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); + rnp = rcu_get_root(rsp); + wait_event(rnp->exp_wq[(s >> 1) & 0x3], + sync_exp_work_done(rsp, + &rdp->exp_workdone0, s)); + + /* Let the next expedited grace period start. */ + mutex_unlock(&rsp->exp_mutex); +} + /** * synchronize_sched_expedited - Brute-force RCU-sched grace period * @@ -534,29 +605,13 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) */ void synchronize_sched_expedited(void) { - unsigned long s; struct rcu_state *rsp = &rcu_sched_state; /* If only one CPU, this is automatically a grace period. */ if (rcu_blocking_is_gp()) return; - /* If expedited grace periods are prohibited, fall back to normal. */ - if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu_sched); - return; - } - - /* Take a snapshot of the sequence number. */ - s = rcu_exp_gp_seq_snap(rsp); - if (exp_funnel_lock(rsp, s)) - return; /* Someone else did our work for us. */ - - /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); - - /* Wait and clean up, including waking everyone. */ - rcu_exp_wait_wake(rsp, s); + _synchronize_rcu_expedited(rsp, sync_sched_exp_handler); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); @@ -620,23 +675,8 @@ static void sync_rcu_exp_handler(void *info) void synchronize_rcu_expedited(void) { struct rcu_state *rsp = rcu_state_p; - unsigned long s; - - /* If expedited grace periods are prohibited, fall back to normal. */ - if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu); - return; - } - - s = rcu_exp_gp_seq_snap(rsp); - if (exp_funnel_lock(rsp, s)) - return; /* Someone else did our work for us. */ - - /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); - /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ - rcu_exp_wait_wake(rsp, s); + _synchronize_rcu_expedited(rsp, sync_rcu_exp_handler); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0082fce402a0..85c5a883c6e3 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2173,6 +2173,7 @@ static int rcu_nocb_kthread(void *arg) cl++; c++; local_bh_enable(); + cond_resched_rcu_qs(); list = next; } trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 86782f9a4604..b1f28972872c 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -185,16 +185,17 @@ static int show_rcuexp(struct seq_file *m, void *v) int cpu; struct rcu_state *rsp = (struct rcu_state *)m->private; struct rcu_data *rdp; - unsigned long s1 = 0, s2 = 0, s3 = 0; + unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(rsp->rda, cpu); + s0 += atomic_long_read(&rdp->exp_workdone0); s1 += atomic_long_read(&rdp->exp_workdone1); s2 += atomic_long_read(&rdp->exp_workdone2); s3 += atomic_long_read(&rdp->exp_workdone3); } - seq_printf(m, "s=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", - rsp->expedited_sequence, s1, s2, s3, + seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", + rsp->expedited_sequence, s0, s1, s2, s3, atomic_long_read(&rsp->expedited_normal), atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index f0d8322bc3ec..f19271dce0a9 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -46,7 +46,7 @@ #include <linux/export.h> #include <linux/hardirq.h> #include <linux/delay.h> -#include <linux/module.h> +#include <linux/moduleparam.h> #include <linux/kthread.h> #include <linux/tick.h> @@ -54,7 +54,6 @@ #include "rcu.h" -MODULE_ALIAS("rcupdate"); #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX #endif diff --git a/kernel/relay.c b/kernel/relay.c index d797502140b9..da79a109dbeb 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -214,7 +214,7 @@ static void relay_destroy_buf(struct rchan_buf *buf) __free_page(buf->page_array[i]); relay_free_page_array(buf->page_array); } - chan->buf[buf->cpu] = NULL; + *per_cpu_ptr(chan->buf, buf->cpu) = NULL; kfree(buf->padding); kfree(buf); kref_put(&chan->kref, relay_destroy_channel); @@ -328,13 +328,15 @@ static struct rchan_callbacks default_channel_callbacks = { /** * wakeup_readers - wake up readers waiting on a channel - * @data: contains the channel buffer + * @work: contains the channel buffer * - * This is the timer function used to defer reader waking. + * This is the function used to defer reader waking */ -static void wakeup_readers(unsigned long data) +static void wakeup_readers(struct irq_work *work) { - struct rchan_buf *buf = (struct rchan_buf *)data; + struct rchan_buf *buf; + + buf = container_of(work, struct rchan_buf, wakeup_work); wake_up_interruptible(&buf->read_wait); } @@ -352,9 +354,10 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) if (init) { init_waitqueue_head(&buf->read_wait); kref_init(&buf->kref); - setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); - } else - del_timer_sync(&buf->timer); + init_irq_work(&buf->wakeup_work, wakeup_readers); + } else { + irq_work_sync(&buf->wakeup_work); + } buf->subbufs_produced = 0; buf->subbufs_consumed = 0; @@ -382,20 +385,21 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) */ void relay_reset(struct rchan *chan) { + struct rchan_buf *buf; unsigned int i; if (!chan) return; - if (chan->is_global && chan->buf[0]) { - __relay_reset(chan->buf[0], 0); + if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) { + __relay_reset(buf, 0); return; } mutex_lock(&relay_channels_mutex); for_each_possible_cpu(i) - if (chan->buf[i]) - __relay_reset(chan->buf[i], 0); + if ((buf = *per_cpu_ptr(chan->buf, i))) + __relay_reset(buf, 0); mutex_unlock(&relay_channels_mutex); } EXPORT_SYMBOL_GPL(relay_reset); @@ -440,7 +444,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) struct dentry *dentry; if (chan->is_global) - return chan->buf[0]; + return *per_cpu_ptr(chan->buf, 0); buf = relay_create_buf(chan); if (!buf) @@ -464,7 +468,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) __relay_reset(buf, 1); if(chan->is_global) { - chan->buf[0] = buf; + *per_cpu_ptr(chan->buf, 0) = buf; buf->cpu = 0; } @@ -486,7 +490,7 @@ free_buf: static void relay_close_buf(struct rchan_buf *buf) { buf->finalized = 1; - del_timer_sync(&buf->timer); + irq_work_sync(&buf->wakeup_work); buf->chan->cb->remove_buf_file(buf->dentry); kref_put(&buf->kref, relay_remove_buf); } @@ -512,46 +516,25 @@ static void setup_callbacks(struct rchan *chan, chan->cb = cb; } -/** - * relay_hotcpu_callback - CPU hotplug callback - * @nb: notifier block - * @action: hotplug action to take - * @hcpu: CPU number - * - * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD) - */ -static int relay_hotcpu_callback(struct notifier_block *nb, - unsigned long action, - void *hcpu) +int relay_prepare_cpu(unsigned int cpu) { - unsigned int hotcpu = (unsigned long)hcpu; struct rchan *chan; + struct rchan_buf *buf; - switch(action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - mutex_lock(&relay_channels_mutex); - list_for_each_entry(chan, &relay_channels, list) { - if (chan->buf[hotcpu]) - continue; - chan->buf[hotcpu] = relay_open_buf(chan, hotcpu); - if(!chan->buf[hotcpu]) { - printk(KERN_ERR - "relay_hotcpu_callback: cpu %d buffer " - "creation failed\n", hotcpu); - mutex_unlock(&relay_channels_mutex); - return notifier_from_errno(-ENOMEM); - } + mutex_lock(&relay_channels_mutex); + list_for_each_entry(chan, &relay_channels, list) { + if ((buf = *per_cpu_ptr(chan->buf, cpu))) + continue; + buf = relay_open_buf(chan, cpu); + if (!buf) { + pr_err("relay: cpu %d buffer creation failed\n", cpu); + mutex_unlock(&relay_channels_mutex); + return -ENOMEM; } - mutex_unlock(&relay_channels_mutex); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - /* No need to flush the cpu : will be flushed upon - * final relay_flush() call. */ - break; + *per_cpu_ptr(chan->buf, cpu) = buf; } - return NOTIFY_OK; + mutex_unlock(&relay_channels_mutex); + return 0; } /** @@ -583,6 +566,7 @@ struct rchan *relay_open(const char *base_filename, { unsigned int i; struct rchan *chan; + struct rchan_buf *buf; if (!(subbuf_size && n_subbufs)) return NULL; @@ -593,6 +577,7 @@ struct rchan *relay_open(const char *base_filename, if (!chan) return NULL; + chan->buf = alloc_percpu(struct rchan_buf *); chan->version = RELAYFS_CHANNEL_VERSION; chan->n_subbufs = n_subbufs; chan->subbuf_size = subbuf_size; @@ -608,9 +593,10 @@ struct rchan *relay_open(const char *base_filename, mutex_lock(&relay_channels_mutex); for_each_online_cpu(i) { - chan->buf[i] = relay_open_buf(chan, i); - if (!chan->buf[i]) + buf = relay_open_buf(chan, i); + if (!buf) goto free_bufs; + *per_cpu_ptr(chan->buf, i) = buf; } list_add(&chan->list, &relay_channels); mutex_unlock(&relay_channels_mutex); @@ -619,8 +605,8 @@ struct rchan *relay_open(const char *base_filename, free_bufs: for_each_possible_cpu(i) { - if (chan->buf[i]) - relay_close_buf(chan->buf[i]); + if ((buf = *per_cpu_ptr(chan->buf, i))) + relay_close_buf(buf); } kref_put(&chan->kref, relay_destroy_channel); @@ -666,6 +652,7 @@ int relay_late_setup_files(struct rchan *chan, unsigned int i, curr_cpu; unsigned long flags; struct dentry *dentry; + struct rchan_buf *buf; struct rchan_percpu_buf_dispatcher disp; if (!chan || !base_filename) @@ -684,10 +671,11 @@ int relay_late_setup_files(struct rchan *chan, if (chan->is_global) { err = -EINVAL; - if (!WARN_ON_ONCE(!chan->buf[0])) { - dentry = relay_create_buf_file(chan, chan->buf[0], 0); + buf = *per_cpu_ptr(chan->buf, 0); + if (!WARN_ON_ONCE(!buf)) { + dentry = relay_create_buf_file(chan, buf, 0); if (dentry && !WARN_ON_ONCE(!chan->is_global)) { - relay_set_buf_dentry(chan->buf[0], dentry); + relay_set_buf_dentry(buf, dentry); err = 0; } } @@ -702,13 +690,14 @@ int relay_late_setup_files(struct rchan *chan, * on all currently online CPUs. */ for_each_online_cpu(i) { - if (unlikely(!chan->buf[i])) { + buf = *per_cpu_ptr(chan->buf, i); + if (unlikely(!buf)) { WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n"); err = -EINVAL; break; } - dentry = relay_create_buf_file(chan, chan->buf[i], i); + dentry = relay_create_buf_file(chan, buf, i); if (unlikely(!dentry)) { err = -EINVAL; break; @@ -716,10 +705,10 @@ int relay_late_setup_files(struct rchan *chan, if (curr_cpu == i) { local_irq_save(flags); - relay_set_buf_dentry(chan->buf[i], dentry); + relay_set_buf_dentry(buf, dentry); local_irq_restore(flags); } else { - disp.buf = chan->buf[i]; + disp.buf = buf; disp.dentry = dentry; smp_mb(); /* relay_channels_mutex must be held, so wait. */ @@ -768,14 +757,15 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) buf->early_bytes += buf->chan->subbuf_size - buf->padding[old_subbuf]; smp_mb(); - if (waitqueue_active(&buf->read_wait)) + if (waitqueue_active(&buf->read_wait)) { /* * Calling wake_up_interruptible() from here * will deadlock if we happen to be logging * from the scheduler (trying to re-grab * rq->lock), so defer it. */ - mod_timer(&buf->timer, jiffies + 1); + irq_work_queue(&buf->wakeup_work); + } } old = buf->data; @@ -822,11 +812,10 @@ void relay_subbufs_consumed(struct rchan *chan, if (!chan) return; - if (cpu >= NR_CPUS || !chan->buf[cpu] || - subbufs_consumed > chan->n_subbufs) + buf = *per_cpu_ptr(chan->buf, cpu); + if (cpu >= NR_CPUS || !buf || subbufs_consumed > chan->n_subbufs) return; - buf = chan->buf[cpu]; if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed) buf->subbufs_consumed = buf->subbufs_produced; else @@ -842,18 +831,19 @@ EXPORT_SYMBOL_GPL(relay_subbufs_consumed); */ void relay_close(struct rchan *chan) { + struct rchan_buf *buf; unsigned int i; if (!chan) return; mutex_lock(&relay_channels_mutex); - if (chan->is_global && chan->buf[0]) - relay_close_buf(chan->buf[0]); + if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) + relay_close_buf(buf); else for_each_possible_cpu(i) - if (chan->buf[i]) - relay_close_buf(chan->buf[i]); + if ((buf = *per_cpu_ptr(chan->buf, i))) + relay_close_buf(buf); if (chan->last_toobig) printk(KERN_WARNING "relay: one or more items not logged " @@ -874,20 +864,21 @@ EXPORT_SYMBOL_GPL(relay_close); */ void relay_flush(struct rchan *chan) { + struct rchan_buf *buf; unsigned int i; if (!chan) return; - if (chan->is_global && chan->buf[0]) { - relay_switch_subbuf(chan->buf[0], 0); + if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) { + relay_switch_subbuf(buf, 0); return; } mutex_lock(&relay_channels_mutex); for_each_possible_cpu(i) - if (chan->buf[i]) - relay_switch_subbuf(chan->buf[i], 0); + if ((buf = *per_cpu_ptr(chan->buf, i))) + relay_switch_subbuf(buf, 0); mutex_unlock(&relay_channels_mutex); } EXPORT_SYMBOL_GPL(relay_flush); @@ -1121,51 +1112,23 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf, return end_pos; } -/* - * subbuf_read_actor - read up to one subbuf's worth of data - */ -static int subbuf_read_actor(size_t read_start, - struct rchan_buf *buf, - size_t avail, - read_descriptor_t *desc) -{ - void *from; - int ret = 0; - - from = buf->start + read_start; - ret = avail; - if (copy_to_user(desc->arg.buf, from, avail)) { - desc->error = -EFAULT; - ret = 0; - } - desc->arg.data += ret; - desc->written += ret; - desc->count -= ret; - - return ret; -} - -typedef int (*subbuf_actor_t) (size_t read_start, - struct rchan_buf *buf, - size_t avail, - read_descriptor_t *desc); - -/* - * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries - */ -static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, - subbuf_actor_t subbuf_actor, - read_descriptor_t *desc) +static ssize_t relay_file_read(struct file *filp, + char __user *buffer, + size_t count, + loff_t *ppos) { struct rchan_buf *buf = filp->private_data; size_t read_start, avail; + size_t written = 0; int ret; - if (!desc->count) + if (!count) return 0; inode_lock(file_inode(filp)); do { + void *from; + if (!relay_file_read_avail(buf, *ppos)) break; @@ -1174,32 +1137,22 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, if (!avail) break; - avail = min(desc->count, avail); - ret = subbuf_actor(read_start, buf, avail, desc); - if (desc->error < 0) + avail = min(count, avail); + from = buf->start + read_start; + ret = avail; + if (copy_to_user(buffer, from, avail)) break; - if (ret) { - relay_file_read_consume(buf, read_start, ret); - *ppos = relay_file_read_end_pos(buf, read_start, ret); - } - } while (desc->count && ret); - inode_unlock(file_inode(filp)); + buffer += ret; + written += ret; + count -= ret; - return desc->written; -} + relay_file_read_consume(buf, read_start, ret); + *ppos = relay_file_read_end_pos(buf, read_start, ret); + } while (count); + inode_unlock(file_inode(filp)); -static ssize_t relay_file_read(struct file *filp, - char __user *buffer, - size_t count, - loff_t *ppos) -{ - read_descriptor_t desc; - desc.written = 0; - desc.count = count; - desc.arg.buf = buffer; - desc.error = 0; - return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, &desc); + return written; } static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) @@ -1377,12 +1330,3 @@ const struct file_operations relay_file_operations = { .splice_read = relay_file_splice_read, }; EXPORT_SYMBOL_GPL(relay_file_operations); - -static __init int relay_init(void) -{ - - hotcpu_notifier(relay_hotcpu_callback, 0); - return 0; -} - -early_initcall(relay_init); diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index a5d966cb8891..da39489d2d80 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -111,10 +111,13 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) { if (tg != &root_task_group) return false; - /* - * We can only assume the task group can't go away on us if - * autogroup_move_group() can see us on ->thread_group list. + * If we race with autogroup_move_group() the caller can use the old + * value of signal->autogroup but in this case sched_move_task() will + * be called again before autogroup_kref_put(). + * + * However, there is no way sched_autogroup_exit_task() could tell us + * to avoid autogroup->tg, so we abuse PF_EXITING flag for this case. */ if (p->flags & PF_EXITING) return false; @@ -122,6 +125,16 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) return true; } +void sched_autogroup_exit_task(struct task_struct *p) +{ + /* + * We are going to call exit_notify() and autogroup_move_group() can't + * see this thread after that: we can no longer use signal->autogroup. + * See the PF_EXITING check in task_wants_autogroup(). + */ + sched_move_task(p); +} + static void autogroup_move_group(struct task_struct *p, struct autogroup *ag) { @@ -138,13 +151,20 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) } p->signal->autogroup = autogroup_kref_get(ag); - - if (!READ_ONCE(sysctl_sched_autogroup_enabled)) - goto out; - + /* + * We can't avoid sched_move_task() after we changed signal->autogroup, + * this process can already run with task_group() == prev->tg or we can + * race with cgroup code which can read autogroup = prev under rq->lock. + * In the latter case for_each_thread() can not miss a migrating thread, + * cpu_cgroup_attach() must not be possible after cgroup_exit() and it + * can't be removed from thread list, we hold ->siglock. + * + * If an exiting thread was already removed from thread list we rely on + * sched_autogroup_exit_task(). + */ for_each_thread(p, t) sched_move_task(t); -out: + unlock_task_sighand(p, &flags); autogroup_kref_put(prev); } @@ -192,6 +212,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) { static unsigned long next = INITIAL_JIFFIES; struct autogroup *ag; + unsigned long shares; int err; if (nice < MIN_NICE || nice > MAX_NICE) @@ -210,9 +231,10 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) next = HZ / 10 + jiffies; ag = autogroup_task_get(p); + shares = scale_load(sched_prio_to_weight[nice + 20]); down_write(&ag->lock); - err = sched_group_set_shares(ag->tg, sched_prio_to_weight[nice + 20]); + err = sched_group_set_shares(ag->tg, shares); if (!err) ag->nice = nice; up_write(&ag->lock); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 44817c640e99..966556ebdbb3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -75,11 +75,11 @@ #include <linux/compiler.h> #include <linux/frame.h> #include <linux/prefetch.h> +#include <linux/mutex.h> #include <asm/switch_to.h> #include <asm/tlb.h> #include <asm/irq_regs.h> -#include <asm/mutex.h> #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #endif @@ -581,6 +581,8 @@ static bool wake_up_full_nohz_cpu(int cpu) * If needed we can still optimize that later with an * empty IRQ. */ + if (cpu_is_offline(cpu)) + return true; /* Don't try to wake offline CPUs. */ if (tick_nohz_full_cpu(cpu)) { if (cpu != smp_processor_id() || tick_nohz_tick_stopped()) @@ -591,6 +593,11 @@ static bool wake_up_full_nohz_cpu(int cpu) return false; } +/* + * Wake up the specified CPU. If the CPU is going offline, it is the + * caller's responsibility to deal with the lost wakeup, for example, + * by hooking into the CPU_DEAD notifier like timers and hrtimers do. + */ void wake_up_nohz_cpu(int cpu) { if (!wake_up_full_nohz_cpu(cpu)) @@ -1063,8 +1070,12 @@ static int migration_cpu_stop(void *data) * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because * we're holding p->pi_lock. */ - if (task_rq(p) == rq && task_on_rq_queued(p)) - rq = __migrate_task(rq, p, arg->dest_cpu); + if (task_rq(p) == rq) { + if (task_on_rq_queued(p)) + rq = __migrate_task(rq, p, arg->dest_cpu); + else + p->wake_cpu = arg->dest_cpu; + } raw_spin_unlock(&rq->lock); raw_spin_unlock(&p->pi_lock); @@ -1105,10 +1116,10 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) p->sched_class->set_cpus_allowed(p, new_mask); - if (running) - p->sched_class->set_curr_task(rq); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE); + if (running) + set_curr_task(rq, p); } /* @@ -1265,7 +1276,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) /* * Task isn't running anymore; make it appear like we migrated * it before it went to sleep. This means on wakeup we make the - * previous cpu our targer instead of where it really is. + * previous cpu our target instead of where it really is. */ p->wake_cpu = cpu; } @@ -1629,23 +1640,25 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p, static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) { -#ifdef CONFIG_SCHEDSTATS - struct rq *rq = this_rq(); + struct rq *rq; -#ifdef CONFIG_SMP - int this_cpu = smp_processor_id(); + if (!schedstat_enabled()) + return; + + rq = this_rq(); - if (cpu == this_cpu) { - schedstat_inc(rq, ttwu_local); - schedstat_inc(p, se.statistics.nr_wakeups_local); +#ifdef CONFIG_SMP + if (cpu == rq->cpu) { + schedstat_inc(rq->ttwu_local); + schedstat_inc(p->se.statistics.nr_wakeups_local); } else { struct sched_domain *sd; - schedstat_inc(p, se.statistics.nr_wakeups_remote); + schedstat_inc(p->se.statistics.nr_wakeups_remote); rcu_read_lock(); - for_each_domain(this_cpu, sd) { + for_each_domain(rq->cpu, sd) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - schedstat_inc(sd, ttwu_wake_remote); + schedstat_inc(sd->ttwu_wake_remote); break; } } @@ -1653,17 +1666,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } if (wake_flags & WF_MIGRATED) - schedstat_inc(p, se.statistics.nr_wakeups_migrate); - + schedstat_inc(p->se.statistics.nr_wakeups_migrate); #endif /* CONFIG_SMP */ - schedstat_inc(rq, ttwu_count); - schedstat_inc(p, se.statistics.nr_wakeups); + schedstat_inc(rq->ttwu_count); + schedstat_inc(p->se.statistics.nr_wakeups); if (wake_flags & WF_SYNC) - schedstat_inc(p, se.statistics.nr_wakeups_sync); - -#endif /* CONFIG_SCHEDSTATS */ + schedstat_inc(p->se.statistics.nr_wakeups_sync); } static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) @@ -1985,14 +1995,15 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * @state: the mask of task states that can be woken * @wake_flags: wake modifier flags (WF_*) * - * Put it on the run-queue if it's not already there. The "current" - * thread is always on the run-queue (except when the actual - * re-schedule is in progress), and as such you're allowed to do - * the simpler "current->state = TASK_RUNNING" to mark yourself - * runnable without the overhead of this. + * If (@state & @p->state) @p->state = TASK_RUNNING. + * + * If the task was not queued/runnable, also place it back on a runqueue. * - * Return: %true if @p was woken up, %false if it was already running. - * or @state didn't match @p's state. + * Atomic against schedule() which would dequeue a task, also see + * set_current_state(). + * + * Return: %true if @p->state changes (an actual wakeup was done), + * %false otherwise. */ static int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) @@ -2084,8 +2095,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ttwu_queue(p, cpu, wake_flags); stat: - if (schedstat_enabled()) - ttwu_stat(p, cpu, wake_flags); + ttwu_stat(p, cpu, wake_flags); out: raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -2095,6 +2105,7 @@ out: /** * try_to_wake_up_local - try to wake up a local task with rq lock held * @p: the thread to be awakened + * @cookie: context's cookie for pinning * * Put @p on the run-queue if it's not already there. The caller must * ensure that this_rq() is locked, @p is bound to this_rq() and not @@ -2133,8 +2144,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_do_wakeup(rq, p, 0, cookie); - if (schedstat_enabled()) - ttwu_stat(p, smp_processor_id(), 0); + ttwu_stat(p, smp_processor_id(), 0); out: raw_spin_unlock(&p->pi_lock); } @@ -2772,6 +2782,10 @@ static struct rq *finish_task_switch(struct task_struct *prev) * task and put them back on the free list. */ kprobe_flush_task(prev); + + /* Task is done with its stack. */ + put_task_stack(prev); + put_task_struct(prev); } @@ -3192,6 +3206,9 @@ static inline void preempt_latency_stop(int val) { } */ static noinline void __schedule_bug(struct task_struct *prev) { + /* Save this before calling printk(), since that will clobber it */ + unsigned long preempt_disable_ip = get_preempt_disable_ip(current); + if (oops_in_progress) return; @@ -3202,13 +3219,12 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); -#ifdef CONFIG_DEBUG_PREEMPT - if (in_atomic_preempt_off()) { + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) + && in_atomic_preempt_off()) { pr_err("Preemption disabled at:"); - print_ip_sym(current->preempt_disable_ip); + print_ip_sym(preempt_disable_ip); pr_cont("\n"); } -#endif if (panic_on_warn) panic("scheduling while atomic\n"); @@ -3234,7 +3250,7 @@ static inline void schedule_debug(struct task_struct *prev) profile_hit(SCHED_PROFILING, __builtin_return_address(0)); - schedstat_inc(this_rq(), sched_count); + schedstat_inc(this_rq()->sched_count); } /* @@ -3327,17 +3343,6 @@ static void __sched notrace __schedule(bool preempt) rq = cpu_rq(cpu); prev = rq->curr; - /* - * do_exit() calls schedule() with preemption disabled as an exception; - * however we must fix that up, otherwise the next task will see an - * inconsistent (higher) preempt count. - * - * It also avoids the below schedule_debug() test from complaining - * about this. - */ - if (unlikely(prev->state == TASK_DEAD)) - preempt_enable_no_resched_notrace(); - schedule_debug(prev); if (sched_feat(HRTICK)) @@ -3403,7 +3408,33 @@ static void __sched notrace __schedule(bool preempt) balance_callback(rq); } -STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */ + +void __noreturn do_task_dead(void) +{ + /* + * The setting of TASK_RUNNING by try_to_wake_up() may be delayed + * when the following two conditions become true. + * - There is race condition of mmap_sem (It is acquired by + * exit_mm()), and + * - SMI occurs before setting TASK_RUNINNG. + * (or hypervisor of virtual machine switches to other guest) + * As a result, we may become TASK_RUNNING after becoming TASK_DEAD + * + * To avoid it, we have to wait for releasing tsk->pi_lock which + * is held by try_to_wake_up() + */ + smp_mb(); + raw_spin_unlock_wait(¤t->pi_lock); + + /* causes final put_task_struct in finish_task_switch(). */ + __set_current_state(TASK_DEAD); + current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ + __schedule(false); + BUG(); + /* Avoid "noreturn function does return". */ + for (;;) + cpu_relax(); /* For when BUG is null */ +} static inline void sched_submit_work(struct task_struct *tsk) { @@ -3687,10 +3718,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) p->prio = prio; - if (running) - p->sched_class->set_curr_task(rq); if (queued) enqueue_task(rq, p, queue_flag); + if (running) + set_curr_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); out_unlock: @@ -3704,7 +3735,8 @@ out_unlock: void set_user_nice(struct task_struct *p, long nice) { - int old_prio, delta, queued; + bool queued, running; + int old_prio, delta; struct rq_flags rf; struct rq *rq; @@ -3726,8 +3758,11 @@ void set_user_nice(struct task_struct *p, long nice) goto out_unlock; } queued = task_on_rq_queued(p); + running = task_current(rq, p); if (queued) dequeue_task(rq, p, DEQUEUE_SAVE); + if (running) + put_prev_task(rq, p); p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); @@ -3744,6 +3779,8 @@ void set_user_nice(struct task_struct *p, long nice) if (delta < 0 || (delta > 0 && task_running(rq, p))) resched_curr(rq); } + if (running) + set_curr_task(rq, p); out_unlock: task_rq_unlock(rq, p, &rf); } @@ -4243,8 +4280,6 @@ change: prev_class = p->sched_class; __setscheduler(rq, p, attr, pi); - if (running) - p->sched_class->set_curr_task(rq); if (queued) { /* * We enqueue to tail when the priority of a task is @@ -4255,6 +4290,8 @@ change: enqueue_task(rq, p, queue_flags); } + if (running) + set_curr_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); preempt_disable(); /* avoid rq from going away on us */ @@ -4846,7 +4883,7 @@ SYSCALL_DEFINE0(sched_yield) { struct rq *rq = this_rq_lock(); - schedstat_inc(rq, yld_count); + schedstat_inc(rq->yld_count); current->sched_class->yield_task(rq); /* @@ -4863,6 +4900,7 @@ SYSCALL_DEFINE0(sched_yield) return 0; } +#ifndef CONFIG_PREEMPT int __sched _cond_resched(void) { if (should_resched(0)) { @@ -4872,6 +4910,7 @@ int __sched _cond_resched(void) return 0; } EXPORT_SYMBOL(_cond_resched); +#endif /* * __cond_resched_lock() - if a reschedule is pending, drop the given lock, @@ -4997,7 +5036,7 @@ again: yielded = curr->sched_class->yield_to_task(rq, p, preempt); if (yielded) { - schedstat_inc(rq, yld_count); + schedstat_inc(rq->yld_count); /* * Make p's CPU reschedule; pick_next_entity takes care of * fairness. @@ -5154,21 +5193,14 @@ void sched_show_task(struct task_struct *p) int ppid; unsigned long state = p->state; + if (!try_get_task_stack(p)) + return; if (state) state = __ffs(state) + 1; printk(KERN_INFO "%-15.15s %c", p->comm, state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); -#if BITS_PER_LONG == 32 - if (state == TASK_RUNNING) - printk(KERN_CONT " running "); - else - printk(KERN_CONT " %08lx ", thread_saved_pc(p)); -#else if (state == TASK_RUNNING) printk(KERN_CONT " running task "); - else - printk(KERN_CONT " %016lx ", thread_saved_pc(p)); -#endif #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif @@ -5183,6 +5215,7 @@ void sched_show_task(struct task_struct *p) print_worker_info(KERN_INFO, p); show_stack(p, NULL); + put_task_stack(p); } void show_state_filter(unsigned long state_filter) @@ -5247,6 +5280,7 @@ void init_idle(struct task_struct *idle, int cpu) __sched_fork(0, idle); idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); + idle->flags |= PF_IDLE; kasan_unpoison_task_stack(idle); @@ -5417,10 +5451,10 @@ void sched_setnuma(struct task_struct *p, int nid) p->numa_preferred_nid = nid; - if (running) - p->sched_class->set_curr_task(rq); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE); + if (running) + set_curr_task(rq, p); task_rq_unlock(rq, p, &rf); } #endif /* CONFIG_NUMA_BALANCING */ @@ -5675,7 +5709,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_CONT " %*pbl", cpumask_pr_args(sched_group_cpus(group))); if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { - printk(KERN_CONT " (cpu_capacity = %d)", + printk(KERN_CONT " (cpu_capacity = %lu)", group->sgc->capacity); } @@ -5717,6 +5751,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) } } #else /* !CONFIG_SCHED_DEBUG */ + +# define sched_debug_enabled 0 # define sched_domain_debug(sd, cpu) do { } while (0) static inline bool sched_debug(void) { @@ -5735,6 +5771,7 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_FORK | SD_BALANCE_EXEC | SD_SHARE_CPUCAPACITY | + SD_ASYM_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN)) { if (sd->groups != sd->groups->next) @@ -5765,6 +5802,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | + SD_ASYM_CPUCAPACITY | SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_PREFER_SIBLING | @@ -5909,10 +5947,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc) } while (sg != first); } -static void free_sched_domain(struct rcu_head *rcu) +static void destroy_sched_domain(struct sched_domain *sd) { - struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); - /* * If its an overlapping domain it has private groups, iterate and * nuke them all. @@ -5923,18 +5959,26 @@ static void free_sched_domain(struct rcu_head *rcu) kfree(sd->groups->sgc); kfree(sd->groups); } + if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) + kfree(sd->shared); kfree(sd); } -static void destroy_sched_domain(struct sched_domain *sd, int cpu) +static void destroy_sched_domains_rcu(struct rcu_head *rcu) { - call_rcu(&sd->rcu, free_sched_domain); + struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); + + while (sd) { + struct sched_domain *parent = sd->parent; + destroy_sched_domain(sd); + sd = parent; + } } -static void destroy_sched_domains(struct sched_domain *sd, int cpu) +static void destroy_sched_domains(struct sched_domain *sd) { - for (; sd; sd = sd->parent) - destroy_sched_domain(sd, cpu); + if (sd) + call_rcu(&sd->rcu, destroy_sched_domains_rcu); } /* @@ -5949,14 +5993,14 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) DEFINE_PER_CPU(struct sched_domain *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain *, sd_numa); -DEFINE_PER_CPU(struct sched_domain *, sd_busy); DEFINE_PER_CPU(struct sched_domain *, sd_asym); static void update_top_cache_domain(int cpu) { + struct sched_domain_shared *sds = NULL; struct sched_domain *sd; - struct sched_domain *busy_sd = NULL; int id = cpu; int size = 1; @@ -5964,13 +6008,13 @@ static void update_top_cache_domain(int cpu) if (sd) { id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); - busy_sd = sd->parent; /* sd_busy */ + sds = sd->shared; } - rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd); rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; per_cpu(sd_llc_id, cpu) = id; + rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); sd = lowest_flag_domain(cpu, SD_NUMA); rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); @@ -6006,7 +6050,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) */ if (parent->flags & SD_PREFER_SIBLING) tmp->flags |= SD_PREFER_SIBLING; - destroy_sched_domain(parent, cpu); + destroy_sched_domain(parent); } else tmp = tmp->parent; } @@ -6014,7 +6058,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) if (sd && sd_degenerate(sd)) { tmp = sd; sd = sd->parent; - destroy_sched_domain(tmp, cpu); + destroy_sched_domain(tmp); if (sd) sd->child = NULL; } @@ -6024,7 +6068,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) rq_attach_root(rq, rd); tmp = rq->sd; rcu_assign_pointer(rq->sd, sd); - destroy_sched_domains(tmp, cpu); + destroy_sched_domains(tmp); update_top_cache_domain(cpu); } @@ -6142,6 +6186,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) * die on a /0 trap. */ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); + sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; /* * Make sure the first group of this domain contains the @@ -6259,7 +6304,22 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) WARN_ON(!sg); do { + int cpu, max_cpu = -1; + sg->group_weight = cpumask_weight(sched_group_cpus(sg)); + + if (!(sd->flags & SD_ASYM_PACKING)) + goto next; + + for_each_cpu(cpu, sched_group_cpus(sg)) { + if (max_cpu < 0) + max_cpu = cpu; + else if (sched_asym_prefer(cpu, max_cpu)) + max_cpu = cpu; + } + sg->asym_prefer_cpu = max_cpu; + +next: sg = sg->next; } while (sg != sd->groups); @@ -6267,7 +6327,6 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) return; update_group_capacity(sd, cpu); - atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); } /* @@ -6355,6 +6414,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd) WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); *per_cpu_ptr(sdd->sd, cpu) = NULL; + if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) + *per_cpu_ptr(sdd->sds, cpu) = NULL; + if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) *per_cpu_ptr(sdd->sg, cpu) = NULL; @@ -6374,26 +6436,37 @@ static int sched_domains_curr_level; /* * SD_flags allowed in topology descriptions. * - * SD_SHARE_CPUCAPACITY - describes SMT topologies - * SD_SHARE_PKG_RESOURCES - describes shared caches - * SD_NUMA - describes NUMA topologies - * SD_SHARE_POWERDOMAIN - describes shared power domain + * These flags are purely descriptive of the topology and do not prescribe + * behaviour. Behaviour is artificial and mapped in the below sd_init() + * function: * - * Odd one out: - * SD_ASYM_PACKING - describes SMT quirks + * SD_SHARE_CPUCAPACITY - describes SMT topologies + * SD_SHARE_PKG_RESOURCES - describes shared caches + * SD_NUMA - describes NUMA topologies + * SD_SHARE_POWERDOMAIN - describes shared power domain + * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies + * + * Odd one out, which beside describing the topology has a quirk also + * prescribes the desired behaviour that goes along with it: + * + * SD_ASYM_PACKING - describes SMT quirks */ #define TOPOLOGY_SD_FLAGS \ (SD_SHARE_CPUCAPACITY | \ SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING | \ + SD_ASYM_CPUCAPACITY | \ SD_SHARE_POWERDOMAIN) static struct sched_domain * -sd_init(struct sched_domain_topology_level *tl, int cpu) +sd_init(struct sched_domain_topology_level *tl, + const struct cpumask *cpu_map, + struct sched_domain *child, int cpu) { - struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); - int sd_weight, sd_flags = 0; + struct sd_data *sdd = &tl->data; + struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); + int sd_id, sd_weight, sd_flags = 0; #ifdef CONFIG_NUMA /* @@ -6442,15 +6515,26 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) .smt_gain = 0, .max_newidle_lb_cost = 0, .next_decay_max_lb_cost = jiffies, + .child = child, #ifdef CONFIG_SCHED_DEBUG .name = tl->name, #endif }; + cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); + sd_id = cpumask_first(sched_domain_span(sd)); + /* * Convert topological properties into behaviour. */ + if (sd->flags & SD_ASYM_CPUCAPACITY) { + struct sched_domain *t = sd; + + for_each_lower_domain(t) + t->flags |= SD_BALANCE_WAKE; + } + if (sd->flags & SD_SHARE_CPUCAPACITY) { sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 110; @@ -6482,7 +6566,17 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) sd->idle_idx = 1; } - sd->private = &tl->data; + /* + * For all levels sharing cache; connect a sched_domain_shared + * instance. + */ + if (sd->flags & SD_SHARE_PKG_RESOURCES) { + sd->shared = *per_cpu_ptr(sdd->sds, sd_id); + atomic_inc(&sd->shared->ref); + atomic_set(&sd->shared->nr_busy_cpus, sd_weight); + } + + sd->private = sdd; return sd; } @@ -6509,6 +6603,9 @@ static struct sched_domain_topology_level *sched_domain_topology = void set_sched_topology(struct sched_domain_topology_level *tl) { + if (WARN_ON_ONCE(sched_smp_initialized)) + return; + sched_domain_topology = tl; } @@ -6789,6 +6886,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sdd->sd) return -ENOMEM; + sdd->sds = alloc_percpu(struct sched_domain_shared *); + if (!sdd->sds) + return -ENOMEM; + sdd->sg = alloc_percpu(struct sched_group *); if (!sdd->sg) return -ENOMEM; @@ -6799,6 +6900,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) for_each_cpu(j, cpu_map) { struct sched_domain *sd; + struct sched_domain_shared *sds; struct sched_group *sg; struct sched_group_capacity *sgc; @@ -6809,6 +6911,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map) *per_cpu_ptr(sdd->sd, j) = sd; + sds = kzalloc_node(sizeof(struct sched_domain_shared), + GFP_KERNEL, cpu_to_node(j)); + if (!sds) + return -ENOMEM; + + *per_cpu_ptr(sdd->sds, j) = sds; + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); if (!sg) @@ -6848,6 +6957,8 @@ static void __sdt_free(const struct cpumask *cpu_map) kfree(*per_cpu_ptr(sdd->sd, j)); } + if (sdd->sds) + kfree(*per_cpu_ptr(sdd->sds, j)); if (sdd->sg) kfree(*per_cpu_ptr(sdd->sg, j)); if (sdd->sgc) @@ -6855,6 +6966,8 @@ static void __sdt_free(const struct cpumask *cpu_map) } free_percpu(sdd->sd); sdd->sd = NULL; + free_percpu(sdd->sds); + sdd->sds = NULL; free_percpu(sdd->sg); sdd->sg = NULL; free_percpu(sdd->sgc); @@ -6866,16 +6979,12 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu) { - struct sched_domain *sd = sd_init(tl, cpu); - if (!sd) - return child; + struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); - cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); if (child) { sd->level = child->level + 1; sched_domain_level_max = max(sched_domain_level_max, sd->level); child->parent = sd; - sd->child = child; if (!cpumask_subset(sched_domain_span(child), sched_domain_span(sd))) { @@ -6906,6 +7015,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, enum s_alloc alloc_state; struct sched_domain *sd; struct s_data d; + struct rq *rq = NULL; int i, ret = -ENOMEM; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); @@ -6956,11 +7066,22 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { + rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); + + /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ + if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) + WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); + cpu_attach_domain(sd, d.rd, i); } rcu_read_unlock(); + if (rq && sched_debug_enabled) { + pr_info("span: %*pbl (max cpu_capacity = %lu)\n", + cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); + } + ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); @@ -7319,6 +7440,22 @@ int sched_cpu_dying(unsigned int cpu) } #endif +#ifdef CONFIG_SCHED_SMT +DEFINE_STATIC_KEY_FALSE(sched_smt_present); + +static void sched_init_smt(void) +{ + /* + * We've enumerated all CPUs and will assume that if any CPU + * has SMT siblings, CPU0 will too. + */ + if (cpumask_weight(cpu_smt_mask(0)) > 1) + static_branch_enable(&sched_smt_present); +} +#else +static inline void sched_init_smt(void) { } +#endif + void __init sched_init_smp(void) { cpumask_var_t non_isolated_cpus; @@ -7348,6 +7485,9 @@ void __init sched_init_smp(void) init_sched_rt_class(); init_sched_dl_class(); + + sched_init_smt(); + sched_smp_initialized = true; } @@ -7385,12 +7525,29 @@ static struct kmem_cache *task_group_cache __read_mostly; #endif DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); +DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); + +#define WAIT_TABLE_BITS 8 +#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) +static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; + +wait_queue_head_t *bit_waitqueue(void *word, int bit) +{ + const int shift = BITS_PER_LONG == 32 ? 5 : 6; + unsigned long val = (unsigned long)word << shift | bit; + + return bit_wait_table + hash_long(val, WAIT_TABLE_BITS); +} +EXPORT_SYMBOL(bit_waitqueue); void __init sched_init(void) { int i, j; unsigned long alloc_size = 0, ptr; + for (i = 0; i < WAIT_TABLE_SIZE; i++) + init_waitqueue_head(bit_wait_table + i); + #ifdef CONFIG_FAIR_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); #endif @@ -7421,6 +7578,8 @@ void __init sched_init(void) for_each_possible_cpu(i) { per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( cpumask_size(), GFP_KERNEL, cpu_to_node(i)); + per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node( + cpumask_size(), GFP_KERNEL, cpu_to_node(i)); } #endif /* CONFIG_CPUMASK_OFFSTACK */ @@ -7461,6 +7620,7 @@ void __init sched_init(void) #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; /* * How much cpu bandwidth does root_task_group get? * @@ -7523,10 +7683,6 @@ void __init sched_init(void) set_load_weight(&init_task); -#ifdef CONFIG_PREEMPT_NOTIFIERS - INIT_HLIST_HEAD(&init_task.preempt_notifiers); -#endif - /* * The boot idle thread does lazy MMU switching as well: */ @@ -7534,11 +7690,6 @@ void __init sched_init(void) enter_lazy_tlb(&init_mm, current); /* - * During early bootup we pretend to be a normal task: - */ - current->sched_class = &fair_sched_class; - - /* * Make us the idle thread. Technically, schedule() should not be * called from this thread, however somewhere below it might be, * but because we are the idle thread, we just pick up running again @@ -7592,6 +7743,7 @@ EXPORT_SYMBOL(__might_sleep); void ___might_sleep(const char *file, int line, int preempt_offset) { static unsigned long prev_jiffy; /* ratelimiting */ + unsigned long preempt_disable_ip; rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && @@ -7602,6 +7754,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset) return; prev_jiffy = jiffies; + /* Save this before calling printk(), since that will clobber it */ + preempt_disable_ip = get_preempt_disable_ip(current); + printk(KERN_ERR "BUG: sleeping function called from invalid context at %s:%d\n", file, line); @@ -7616,14 +7771,14 @@ void ___might_sleep(const char *file, int line, int preempt_offset) debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); -#ifdef CONFIG_DEBUG_PREEMPT - if (!preempt_count_equals(preempt_offset)) { + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) + && !preempt_count_equals(preempt_offset)) { pr_err("Preemption disabled at:"); - print_ip_sym(current->preempt_disable_ip); + print_ip_sym(preempt_disable_ip); pr_cont("\n"); } -#endif dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } EXPORT_SYMBOL(___might_sleep); #endif @@ -7644,12 +7799,10 @@ void normalize_rt_tasks(void) if (p->flags & PF_KTHREAD) continue; - p->se.exec_start = 0; -#ifdef CONFIG_SCHEDSTATS - p->se.statistics.wait_start = 0; - p->se.statistics.sleep_start = 0; - p->se.statistics.block_start = 0; -#endif + p->se.exec_start = 0; + schedstat_set(p->se.statistics.wait_start, 0); + schedstat_set(p->se.statistics.sleep_start, 0); + schedstat_set(p->se.statistics.block_start, 0); if (!dl_task(p) && !rt_task(p)) { /* @@ -7710,7 +7863,7 @@ struct task_struct *curr_task(int cpu) * * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! */ -void set_curr_task(int cpu, struct task_struct *p) +void ia64_set_curr_task(int cpu, struct task_struct *p) { cpu_curr(cpu) = p; } @@ -7841,10 +7994,10 @@ void sched_move_task(struct task_struct *tsk) sched_change_group(tsk, TASK_MOVE_GROUP); - if (unlikely(running)) - tsk->sched_class->set_curr_task(rq); if (queued) enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); + if (unlikely(running)) + set_curr_task(rq, tsk); task_rq_unlock(rq, tsk, &rf); } diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index bc0b309c3f19..9add206b5608 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -297,7 +297,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v) for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[stat], - cputime64_to_clock_t(val[stat])); + (long long)cputime64_to_clock_t(val[stat])); } return 0; diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index d4184498c9f5..e73119013c53 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -31,56 +31,81 @@ static inline int right_child(int i) return (i << 1) + 2; } -static void cpudl_exchange(struct cpudl *cp, int a, int b) +static void cpudl_heapify_down(struct cpudl *cp, int idx) { - int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; + int l, r, largest; - swap(cp->elements[a].cpu, cp->elements[b].cpu); - swap(cp->elements[a].dl , cp->elements[b].dl ); + int orig_cpu = cp->elements[idx].cpu; + u64 orig_dl = cp->elements[idx].dl; - swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx); -} - -static void cpudl_heapify(struct cpudl *cp, int idx) -{ - int l, r, largest; + if (left_child(idx) >= cp->size) + return; /* adapted from lib/prio_heap.c */ while(1) { + u64 largest_dl; l = left_child(idx); r = right_child(idx); largest = idx; + largest_dl = orig_dl; - if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, - cp->elements[l].dl)) + if ((l < cp->size) && dl_time_before(orig_dl, + cp->elements[l].dl)) { largest = l; - if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, - cp->elements[r].dl)) + largest_dl = cp->elements[l].dl; + } + if ((r < cp->size) && dl_time_before(largest_dl, + cp->elements[r].dl)) largest = r; + if (largest == idx) break; - /* Push idx down the heap one level and bump one up */ - cpudl_exchange(cp, largest, idx); + /* pull largest child onto idx */ + cp->elements[idx].cpu = cp->elements[largest].cpu; + cp->elements[idx].dl = cp->elements[largest].dl; + cp->elements[cp->elements[idx].cpu].idx = idx; idx = largest; } + /* actual push down of saved original values orig_* */ + cp->elements[idx].cpu = orig_cpu; + cp->elements[idx].dl = orig_dl; + cp->elements[cp->elements[idx].cpu].idx = idx; } -static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) +static void cpudl_heapify_up(struct cpudl *cp, int idx) { - WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); + int p; - if (dl_time_before(new_dl, cp->elements[idx].dl)) { - cp->elements[idx].dl = new_dl; - cpudl_heapify(cp, idx); - } else { - cp->elements[idx].dl = new_dl; - while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, - cp->elements[idx].dl)) { - cpudl_exchange(cp, idx, parent(idx)); - idx = parent(idx); - } - } + int orig_cpu = cp->elements[idx].cpu; + u64 orig_dl = cp->elements[idx].dl; + + if (idx == 0) + return; + + do { + p = parent(idx); + if (dl_time_before(orig_dl, cp->elements[p].dl)) + break; + /* pull parent onto idx */ + cp->elements[idx].cpu = cp->elements[p].cpu; + cp->elements[idx].dl = cp->elements[p].dl; + cp->elements[cp->elements[idx].cpu].idx = idx; + idx = p; + } while (idx != 0); + /* actual push up of saved original values orig_* */ + cp->elements[idx].cpu = orig_cpu; + cp->elements[idx].dl = orig_dl; + cp->elements[cp->elements[idx].cpu].idx = idx; +} + +static void cpudl_heapify(struct cpudl *cp, int idx) +{ + if (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, + cp->elements[idx].dl)) + cpudl_heapify_up(cp, idx); + else + cpudl_heapify_down(cp, idx); } static inline int cpudl_maximum(struct cpudl *cp) @@ -120,16 +145,15 @@ out: } /* - * cpudl_set - update the cpudl max-heap + * cpudl_clear - remove a cpu from the cpudl max-heap * @cp: the cpudl max-heap context * @cpu: the target cpu - * @dl: the new earliest deadline for this cpu * * Notes: assumes cpu_rq(cpu)->lock is locked * * Returns: (void) */ -void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) +void cpudl_clear(struct cpudl *cp, int cpu) { int old_idx, new_cpu; unsigned long flags; @@ -137,47 +161,60 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) WARN_ON(!cpu_present(cpu)); raw_spin_lock_irqsave(&cp->lock, flags); + old_idx = cp->elements[cpu].idx; - if (!is_valid) { - /* remove item */ - if (old_idx == IDX_INVALID) { - /* - * Nothing to remove if old_idx was invalid. - * This could happen if a rq_offline_dl is - * called for a CPU without -dl tasks running. - */ - goto out; - } + if (old_idx == IDX_INVALID) { + /* + * Nothing to remove if old_idx was invalid. + * This could happen if a rq_offline_dl is + * called for a CPU without -dl tasks running. + */ + } else { new_cpu = cp->elements[cp->size - 1].cpu; cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; cp->elements[old_idx].cpu = new_cpu; cp->size--; cp->elements[new_cpu].idx = old_idx; cp->elements[cpu].idx = IDX_INVALID; - while (old_idx > 0 && dl_time_before( - cp->elements[parent(old_idx)].dl, - cp->elements[old_idx].dl)) { - cpudl_exchange(cp, old_idx, parent(old_idx)); - old_idx = parent(old_idx); - } - cpumask_set_cpu(cpu, cp->free_cpus); - cpudl_heapify(cp, old_idx); + cpudl_heapify(cp, old_idx); - goto out; + cpumask_set_cpu(cpu, cp->free_cpus); } + raw_spin_unlock_irqrestore(&cp->lock, flags); +} + +/* + * cpudl_set - update the cpudl max-heap + * @cp: the cpudl max-heap context + * @cpu: the target cpu + * @dl: the new earliest deadline for this cpu + * + * Notes: assumes cpu_rq(cpu)->lock is locked + * + * Returns: (void) + */ +void cpudl_set(struct cpudl *cp, int cpu, u64 dl) +{ + int old_idx; + unsigned long flags; + WARN_ON(!cpu_present(cpu)); + + raw_spin_lock_irqsave(&cp->lock, flags); + + old_idx = cp->elements[cpu].idx; if (old_idx == IDX_INVALID) { - cp->size++; - cp->elements[cp->size - 1].dl = dl; - cp->elements[cp->size - 1].cpu = cpu; - cp->elements[cpu].idx = cp->size - 1; - cpudl_change_key(cp, cp->size - 1, dl); + int new_idx = cp->size++; + cp->elements[new_idx].dl = dl; + cp->elements[new_idx].cpu = cpu; + cp->elements[cpu].idx = new_idx; + cpudl_heapify_up(cp, new_idx); cpumask_clear_cpu(cpu, cp->free_cpus); } else { - cpudl_change_key(cp, old_idx, dl); + cp->elements[old_idx].dl = dl; + cpudl_heapify(cp, old_idx); } -out: raw_spin_unlock_irqrestore(&cp->lock, flags); } diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index fcbdf83fed7e..f7da8c55bba0 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -23,7 +23,8 @@ struct cpudl { #ifdef CONFIG_SMP int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); -void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); +void cpudl_set(struct cpudl *cp, int cpu, u64 dl); +void cpudl_clear(struct cpudl *cp, int cpu); int cpudl_init(struct cpudl *cp); void cpudl_set_freecpu(struct cpudl *cp, int cpu); void cpudl_clear_freecpu(struct cpudl *cp, int cpu); diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 1141954e73b4..dbc51442ecbc 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); */ void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, void (*func)(struct update_util_data *data, u64 time, - unsigned long util, unsigned long max)) + unsigned int flags)) { if (WARN_ON(!data || !func)) return; diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index a84641b222c1..fd4659313640 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -12,12 +12,14 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/cpufreq.h> -#include <linux/module.h> +#include <linux/kthread.h> #include <linux/slab.h> #include <trace/events/power.h> #include "sched.h" +#define SUGOV_KTHREAD_PRIORITY 50 + struct sugov_tunables { struct gov_attr_set attr_set; unsigned int rate_limit_us; @@ -36,8 +38,10 @@ struct sugov_policy { /* The next fields are only needed if fast switch cannot be used. */ struct irq_work irq_work; - struct work_struct work; + struct kthread_work work; struct mutex work_lock; + struct kthread_worker worker; + struct task_struct *thread; bool work_in_progress; bool need_freq_update; @@ -48,11 +52,14 @@ struct sugov_cpu { struct sugov_policy *sg_policy; unsigned int cached_raw_freq; + unsigned long iowait_boost; + unsigned long iowait_boost_max; + u64 last_update; /* The fields below are only needed when sharing a policy. */ unsigned long util; unsigned long max; - u64 last_update; + unsigned int flags; }; static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); @@ -144,24 +151,75 @@ static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, return cpufreq_driver_resolve_freq(policy, freq); } +static void sugov_get_util(unsigned long *util, unsigned long *max) +{ + struct rq *rq = this_rq(); + unsigned long cfs_max; + + cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id()); + + *util = min(rq->cfs.avg.util_avg, cfs_max); + *max = cfs_max; +} + +static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, + unsigned int flags) +{ + if (flags & SCHED_CPUFREQ_IOWAIT) { + sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + } else if (sg_cpu->iowait_boost) { + s64 delta_ns = time - sg_cpu->last_update; + + /* Clear iowait_boost if the CPU apprears to have been idle. */ + if (delta_ns > TICK_NSEC) + sg_cpu->iowait_boost = 0; + } +} + +static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, + unsigned long *max) +{ + unsigned long boost_util = sg_cpu->iowait_boost; + unsigned long boost_max = sg_cpu->iowait_boost_max; + + if (!boost_util) + return; + + if (*util * boost_max < *max * boost_util) { + *util = boost_util; + *max = boost_max; + } + sg_cpu->iowait_boost >>= 1; +} + static void sugov_update_single(struct update_util_data *hook, u64 time, - unsigned long util, unsigned long max) + unsigned int flags) { struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; + unsigned long util, max; unsigned int next_f; + sugov_set_iowait_boost(sg_cpu, time, flags); + sg_cpu->last_update = time; + if (!sugov_should_update_freq(sg_policy, time)) return; - next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq : - get_next_freq(sg_cpu, util, max); + if (flags & SCHED_CPUFREQ_RT_DL) { + next_f = policy->cpuinfo.max_freq; + } else { + sugov_get_util(&util, &max); + sugov_iowait_boost(sg_cpu, &util, &max); + next_f = get_next_freq(sg_cpu, util, max); + } sugov_update_commit(sg_policy, time, next_f); } static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, - unsigned long util, unsigned long max) + unsigned long util, unsigned long max, + unsigned int flags) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; @@ -169,9 +227,11 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 last_freq_update_time = sg_policy->last_freq_update_time; unsigned int j; - if (util == ULONG_MAX) + if (flags & SCHED_CPUFREQ_RT_DL) return max_f; + sugov_iowait_boost(sg_cpu, &util, &max); + for_each_cpu(j, policy->cpus) { struct sugov_cpu *j_sg_cpu; unsigned long j_util, j_max; @@ -186,48 +246,57 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, * frequency update and the time elapsed between the last update * of the CPU utilization and the last frequency update is long * enough, don't take the CPU into account as it probably is - * idle now. + * idle now (and clear iowait_boost for it). */ delta_ns = last_freq_update_time - j_sg_cpu->last_update; - if (delta_ns > TICK_NSEC) + if (delta_ns > TICK_NSEC) { + j_sg_cpu->iowait_boost = 0; continue; - - j_util = j_sg_cpu->util; - if (j_util == ULONG_MAX) + } + if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) return max_f; + j_util = j_sg_cpu->util; j_max = j_sg_cpu->max; if (j_util * max > j_max * util) { util = j_util; max = j_max; } + + sugov_iowait_boost(j_sg_cpu, &util, &max); } return get_next_freq(sg_cpu, util, max); } static void sugov_update_shared(struct update_util_data *hook, u64 time, - unsigned long util, unsigned long max) + unsigned int flags) { struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_policy *sg_policy = sg_cpu->sg_policy; + unsigned long util, max; unsigned int next_f; + sugov_get_util(&util, &max); + raw_spin_lock(&sg_policy->update_lock); sg_cpu->util = util; sg_cpu->max = max; + sg_cpu->flags = flags; + + sugov_set_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; if (sugov_should_update_freq(sg_policy, time)) { - next_f = sugov_next_freq_shared(sg_cpu, util, max); + next_f = sugov_next_freq_shared(sg_cpu, util, max, flags); sugov_update_commit(sg_policy, time, next_f); } raw_spin_unlock(&sg_policy->update_lock); } -static void sugov_work(struct work_struct *work) +static void sugov_work(struct kthread_work *work) { struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work); @@ -244,7 +313,21 @@ static void sugov_irq_work(struct irq_work *irq_work) struct sugov_policy *sg_policy; sg_policy = container_of(irq_work, struct sugov_policy, irq_work); - schedule_work_on(smp_processor_id(), &sg_policy->work); + + /* + * For RT and deadline tasks, the schedutil governor shoots the + * frequency to maximum. Special care must be taken to ensure that this + * kthread doesn't result in the same behavior. + * + * This is (mostly) guaranteed by the work_in_progress flag. The flag is + * updated only at the end of the sugov_work() function and before that + * the schedutil governor rejects all other frequency scaling requests. + * + * There is a very rare case though, where the RT thread yields right + * after the work_in_progress flag is cleared. The effects of that are + * neglected for now. + */ + kthread_queue_work(&sg_policy->worker, &sg_policy->work); } /************************** sysfs interface ************************/ @@ -307,19 +390,64 @@ static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) return NULL; sg_policy->policy = policy; - init_irq_work(&sg_policy->irq_work, sugov_irq_work); - INIT_WORK(&sg_policy->work, sugov_work); - mutex_init(&sg_policy->work_lock); raw_spin_lock_init(&sg_policy->update_lock); return sg_policy; } static void sugov_policy_free(struct sugov_policy *sg_policy) { - mutex_destroy(&sg_policy->work_lock); kfree(sg_policy); } +static int sugov_kthread_create(struct sugov_policy *sg_policy) +{ + struct task_struct *thread; + struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 }; + struct cpufreq_policy *policy = sg_policy->policy; + int ret; + + /* kthread only required for slow path */ + if (policy->fast_switch_enabled) + return 0; + + kthread_init_work(&sg_policy->work, sugov_work); + kthread_init_worker(&sg_policy->worker); + thread = kthread_create(kthread_worker_fn, &sg_policy->worker, + "sugov:%d", + cpumask_first(policy->related_cpus)); + if (IS_ERR(thread)) { + pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread)); + return PTR_ERR(thread); + } + + ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, ¶m); + if (ret) { + kthread_stop(thread); + pr_warn("%s: failed to set SCHED_FIFO\n", __func__); + return ret; + } + + sg_policy->thread = thread; + kthread_bind_mask(thread, policy->related_cpus); + init_irq_work(&sg_policy->irq_work, sugov_irq_work); + mutex_init(&sg_policy->work_lock); + + wake_up_process(thread); + + return 0; +} + +static void sugov_kthread_stop(struct sugov_policy *sg_policy) +{ + /* kthread only required for slow path */ + if (sg_policy->policy->fast_switch_enabled) + return; + + kthread_flush_worker(&sg_policy->worker); + kthread_stop(sg_policy->thread); + mutex_destroy(&sg_policy->work_lock); +} + static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy) { struct sugov_tunables *tunables; @@ -352,16 +480,24 @@ static int sugov_init(struct cpufreq_policy *policy) if (policy->governor_data) return -EBUSY; + cpufreq_enable_fast_switch(policy); + sg_policy = sugov_policy_alloc(policy); - if (!sg_policy) - return -ENOMEM; + if (!sg_policy) { + ret = -ENOMEM; + goto disable_fast_switch; + } + + ret = sugov_kthread_create(sg_policy); + if (ret) + goto free_sg_policy; mutex_lock(&global_tunables_lock); if (global_tunables) { if (WARN_ON(have_governor_per_policy())) { ret = -EINVAL; - goto free_sg_policy; + goto stop_kthread; } policy->governor_data = sg_policy; sg_policy->tunables = global_tunables; @@ -373,7 +509,7 @@ static int sugov_init(struct cpufreq_policy *policy) tunables = sugov_tunables_alloc(sg_policy); if (!tunables) { ret = -ENOMEM; - goto free_sg_policy; + goto stop_kthread; } tunables->rate_limit_us = LATENCY_MULTIPLIER; @@ -390,20 +526,25 @@ static int sugov_init(struct cpufreq_policy *policy) if (ret) goto fail; - out: +out: mutex_unlock(&global_tunables_lock); - - cpufreq_enable_fast_switch(policy); return 0; - fail: +fail: policy->governor_data = NULL; sugov_tunables_free(tunables); - free_sg_policy: +stop_kthread: + sugov_kthread_stop(sg_policy); + +free_sg_policy: mutex_unlock(&global_tunables_lock); sugov_policy_free(sg_policy); + +disable_fast_switch: + cpufreq_disable_fast_switch(policy); + pr_err("initialization failed (error %d)\n", ret); return ret; } @@ -414,8 +555,6 @@ static void sugov_exit(struct cpufreq_policy *policy) struct sugov_tunables *tunables = sg_policy->tunables; unsigned int count; - cpufreq_disable_fast_switch(policy); - mutex_lock(&global_tunables_lock); count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); @@ -425,7 +564,9 @@ static void sugov_exit(struct cpufreq_policy *policy) mutex_unlock(&global_tunables_lock); + sugov_kthread_stop(sg_policy); sugov_policy_free(sg_policy); + cpufreq_disable_fast_switch(policy); } static int sugov_start(struct cpufreq_policy *policy) @@ -444,10 +585,13 @@ static int sugov_start(struct cpufreq_policy *policy) sg_cpu->sg_policy = sg_policy; if (policy_is_shared(policy)) { - sg_cpu->util = ULONG_MAX; + sg_cpu->util = 0; sg_cpu->max = 0; + sg_cpu->flags = SCHED_CPUFREQ_RT; sg_cpu->last_update = 0; sg_cpu->cached_raw_freq = 0; + sg_cpu->iowait_boost = 0; + sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, sugov_update_shared); } else { @@ -468,8 +612,10 @@ static void sugov_stop(struct cpufreq_policy *policy) synchronize_sched(); - irq_work_sync(&sg_policy->irq_work); - cancel_work_sync(&sg_policy->work); + if (!policy->fast_switch_enabled) { + irq_work_sync(&sg_policy->irq_work); + kthread_cancel_work_sync(&sg_policy->work); + } } static void sugov_limits(struct cpufreq_policy *policy) @@ -495,28 +641,15 @@ static struct cpufreq_governor schedutil_gov = { .limits = sugov_limits, }; -static int __init sugov_module_init(void) -{ - return cpufreq_register_governor(&schedutil_gov); -} - -static void __exit sugov_module_exit(void) -{ - cpufreq_unregister_governor(&schedutil_gov); -} - -MODULE_AUTHOR("Rafael J. Wysocki <rafael.j.wysocki@intel.com>"); -MODULE_DESCRIPTION("Utilization-based CPU frequency selection"); -MODULE_LICENSE("GPL"); - #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL struct cpufreq_governor *cpufreq_default_governor(void) { return &schedutil_gov; } - -fs_initcall(sugov_module_init); -#else -module_init(sugov_module_init); #endif -module_exit(sugov_module_exit); + +static int __init sugov_register(void) +{ + return cpufreq_register_governor(&schedutil_gov); +} +fs_initcall(sugov_register); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a846cf89eb96..7700a9cba335 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -23,10 +23,8 @@ * task when irq is in progress while we read rq->clock. That is a worthy * compromise in place of having locks on each irq in account_system_time. */ -DEFINE_PER_CPU(u64, cpu_hardirq_time); -DEFINE_PER_CPU(u64, cpu_softirq_time); +DEFINE_PER_CPU(struct irqtime, cpu_irqtime); -static DEFINE_PER_CPU(u64, irq_start_time); static int sched_clock_irqtime; void enable_sched_clock_irqtime(void) @@ -39,16 +37,13 @@ void disable_sched_clock_irqtime(void) sched_clock_irqtime = 0; } -#ifndef CONFIG_64BIT -DEFINE_PER_CPU(seqcount_t, irq_time_seq); -#endif /* CONFIG_64BIT */ - /* * Called before incrementing preempt_count on {soft,}irq_enter * and before decrementing preempt_count on {soft,}irq_exit. */ void irqtime_account_irq(struct task_struct *curr) { + struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); s64 delta; int cpu; @@ -56,10 +51,10 @@ void irqtime_account_irq(struct task_struct *curr) return; cpu = smp_processor_id(); - delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); - __this_cpu_add(irq_start_time, delta); + delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; + irqtime->irq_start_time += delta; - irq_time_write_begin(); + u64_stats_update_begin(&irqtime->sync); /* * We do not account for softirq time from ksoftirqd here. * We want to continue accounting softirq time to ksoftirqd thread @@ -67,42 +62,36 @@ void irqtime_account_irq(struct task_struct *curr) * that do not consume any time, but still wants to run. */ if (hardirq_count()) - __this_cpu_add(cpu_hardirq_time, delta); + irqtime->hardirq_time += delta; else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) - __this_cpu_add(cpu_softirq_time, delta); + irqtime->softirq_time += delta; - irq_time_write_end(); + u64_stats_update_end(&irqtime->sync); } EXPORT_SYMBOL_GPL(irqtime_account_irq); -static cputime_t irqtime_account_hi_update(cputime_t maxtime) +static cputime_t irqtime_account_update(u64 irqtime, int idx, cputime_t maxtime) { u64 *cpustat = kcpustat_this_cpu->cpustat; - unsigned long flags; cputime_t irq_cputime; - local_irq_save(flags); - irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) - - cpustat[CPUTIME_IRQ]; + irq_cputime = nsecs_to_cputime64(irqtime) - cpustat[idx]; irq_cputime = min(irq_cputime, maxtime); - cpustat[CPUTIME_IRQ] += irq_cputime; - local_irq_restore(flags); + cpustat[idx] += irq_cputime; + return irq_cputime; } -static cputime_t irqtime_account_si_update(cputime_t maxtime) +static cputime_t irqtime_account_hi_update(cputime_t maxtime) { - u64 *cpustat = kcpustat_this_cpu->cpustat; - unsigned long flags; - cputime_t softirq_cputime; + return irqtime_account_update(__this_cpu_read(cpu_irqtime.hardirq_time), + CPUTIME_IRQ, maxtime); +} - local_irq_save(flags); - softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) - - cpustat[CPUTIME_SOFTIRQ]; - softirq_cputime = min(softirq_cputime, maxtime); - cpustat[CPUTIME_SOFTIRQ] += softirq_cputime; - local_irq_restore(flags); - return softirq_cputime; +static cputime_t irqtime_account_si_update(cputime_t maxtime) +{ + return irqtime_account_update(__this_cpu_read(cpu_irqtime.softirq_time), + CPUTIME_SOFTIRQ, maxtime); } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ @@ -139,16 +128,13 @@ static inline void task_group_account_field(struct task_struct *p, int index, * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in user space since the last update - * @cputime_scaled: cputime scaled by cpu frequency */ -void account_user_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) +void account_user_time(struct task_struct *p, cputime_t cputime) { int index; /* Add user time to process. */ p->utime += cputime; - p->utimescaled += cputime_scaled; account_group_user_time(p, cputime); index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; @@ -164,16 +150,13 @@ void account_user_time(struct task_struct *p, cputime_t cputime, * Account guest cpu time to a process. * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in virtual machine since the last update - * @cputime_scaled: cputime scaled by cpu frequency */ -static void account_guest_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) +static void account_guest_time(struct task_struct *p, cputime_t cputime) { u64 *cpustat = kcpustat_this_cpu->cpustat; /* Add guest time to process. */ p->utime += cputime; - p->utimescaled += cputime_scaled; account_group_user_time(p, cputime); p->gtime += cputime; @@ -191,16 +174,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, * Account system cpu time to a process and desired cpustat field * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - * @target_cputime64: pointer to cpustat field that has to be updated + * @index: pointer to cpustat field that has to be updated */ static inline -void __account_system_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled, int index) +void __account_system_time(struct task_struct *p, cputime_t cputime, int index) { /* Add system time to process. */ p->stime += cputime; - p->stimescaled += cputime_scaled; account_group_system_time(p, cputime); /* Add system time to cpustat. */ @@ -215,15 +195,14 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, * @p: the process that the cpu time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency */ void account_system_time(struct task_struct *p, int hardirq_offset, - cputime_t cputime, cputime_t cputime_scaled) + cputime_t cputime) { int index; if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { - account_guest_time(p, cputime, cputime_scaled); + account_guest_time(p, cputime); return; } @@ -234,7 +213,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, else index = CPUTIME_SYSTEM; - __account_system_time(p, cputime, cputime_scaled, index); + __account_system_time(p, cputime, index); } /* @@ -295,6 +274,9 @@ static inline cputime_t account_other_time(cputime_t max) { cputime_t accounted; + /* Shall be converted to a lockdep-enabled lightweight check */ + WARN_ON_ONCE(!irqs_disabled()); + accounted = steal_account_process_time(max); if (accounted < max) @@ -306,6 +288,26 @@ static inline cputime_t account_other_time(cputime_t max) return accounted; } +#ifdef CONFIG_64BIT +static inline u64 read_sum_exec_runtime(struct task_struct *t) +{ + return t->se.sum_exec_runtime; +} +#else +static u64 read_sum_exec_runtime(struct task_struct *t) +{ + u64 ns; + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(t, &rf); + ns = t->se.sum_exec_runtime; + task_rq_unlock(rq, t, &rf); + + return ns; +} +#endif + /* * Accumulate raw cputime values of dead tasks (sig->[us]time) and live * tasks (sum on group iteration) belonging to @tsk's group. @@ -318,6 +320,17 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) unsigned int seq, nextseq; unsigned long flags; + /* + * Update current task runtime to account pending time since last + * scheduler action or thread_group_cputime() call. This thread group + * might have other running tasks on different CPUs, but updating + * their runtime can affect syscall performance, so we skip account + * those pending times and rely only on values updated on tick or + * other scheduler action. + */ + if (same_thread_group(current, tsk)) + (void) task_sched_runtime(current); + rcu_read_lock(); /* Attempt a lockless read on the first round. */ nextseq = 0; @@ -332,7 +345,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) task_cputime(t, &utime, &stime); times->utime += utime; times->stime += stime; - times->sum_exec_runtime += task_sched_runtime(t); + times->sum_exec_runtime += read_sum_exec_runtime(t); } /* If lockless access failed, take the lock. */ nextseq = 1; @@ -367,7 +380,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, struct rq *rq, int ticks) { u64 cputime = (__force u64) cputime_one_jiffy * ticks; - cputime_t scaled, other; + cputime_t other; /* * When returning from idle, many ticks can get accounted at @@ -380,7 +393,6 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, if (other >= cputime) return; cputime -= other; - scaled = cputime_to_scaled(cputime); if (this_cpu_ksoftirqd() == p) { /* @@ -388,15 +400,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, * So, we have to handle it separately here. * Also, p->stime needs to be updated for ksoftirqd. */ - __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ); + __account_system_time(p, cputime, CPUTIME_SOFTIRQ); } else if (user_tick) { - account_user_time(p, cputime, scaled); + account_user_time(p, cputime); } else if (p == rq->idle) { account_idle_time(cputime); } else if (p->flags & PF_VCPU) { /* System time or guest time */ - account_guest_time(p, cputime, scaled); + account_guest_time(p, cputime); } else { - __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM); + __account_system_time(p, cputime, CPUTIME_SYSTEM); } } @@ -479,7 +491,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime */ void account_process_tick(struct task_struct *p, int user_tick) { - cputime_t cputime, scaled, steal; + cputime_t cputime, steal; struct rq *rq = this_rq(); if (vtime_accounting_cpu_enabled()) @@ -497,12 +509,11 @@ void account_process_tick(struct task_struct *p, int user_tick) return; cputime -= steal; - scaled = cputime_to_scaled(cputime); if (user_tick) - account_user_time(p, cputime, scaled); + account_user_time(p, cputime); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, cputime, scaled); + account_system_time(p, HARDIRQ_OFFSET, cputime); else account_idle_time(cputime); } @@ -723,7 +734,7 @@ static void __vtime_account_system(struct task_struct *tsk) { cputime_t delta_cpu = get_vtime_delta(tsk); - account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu)); + account_system_time(tsk, irq_count(), delta_cpu); } void vtime_account_system(struct task_struct *tsk) @@ -744,7 +755,7 @@ void vtime_account_user(struct task_struct *tsk) tsk->vtime_snap_whence = VTIME_SYS; if (vtime_delta(tsk)) { delta_cpu = get_vtime_delta(tsk); - account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); + account_user_time(tsk, delta_cpu); } write_seqcount_end(&tsk->vtime_seqcount); } @@ -840,29 +851,25 @@ cputime_t task_gtime(struct task_struct *t) * add up the pending nohz execution time since the last * cputime snapshot. */ -static void -fetch_task_cputime(struct task_struct *t, - cputime_t *u_dst, cputime_t *s_dst, - cputime_t *u_src, cputime_t *s_src, - cputime_t *udelta, cputime_t *sdelta) +void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) { + cputime_t delta; unsigned int seq; - unsigned long long delta; - do { - *udelta = 0; - *sdelta = 0; + if (!vtime_accounting_enabled()) { + *utime = t->utime; + *stime = t->stime; + return; + } + do { seq = read_seqcount_begin(&t->vtime_seqcount); - if (u_dst) - *u_dst = *u_src; - if (s_dst) - *s_dst = *s_src; + *utime = t->utime; + *stime = t->stime; /* Task is sleeping, nothing to add */ - if (t->vtime_snap_whence == VTIME_INACTIVE || - is_idle_task(t)) + if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t)) continue; delta = vtime_delta(t); @@ -871,54 +878,10 @@ fetch_task_cputime(struct task_struct *t, * Task runs either in user or kernel space, add pending nohz time to * the right place. */ - if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) { - *udelta = delta; - } else { - if (t->vtime_snap_whence == VTIME_SYS) - *sdelta = delta; - } + if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) + *utime += delta; + else if (t->vtime_snap_whence == VTIME_SYS) + *stime += delta; } while (read_seqcount_retry(&t->vtime_seqcount, seq)); } - - -void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) -{ - cputime_t udelta, sdelta; - - if (!vtime_accounting_enabled()) { - if (utime) - *utime = t->utime; - if (stime) - *stime = t->stime; - return; - } - - fetch_task_cputime(t, utime, stime, &t->utime, - &t->stime, &udelta, &sdelta); - if (utime) - *utime += udelta; - if (stime) - *stime += sdelta; -} - -void task_cputime_scaled(struct task_struct *t, - cputime_t *utimescaled, cputime_t *stimescaled) -{ - cputime_t udelta, sdelta; - - if (!vtime_accounting_enabled()) { - if (utimescaled) - *utimescaled = t->utimescaled; - if (stimescaled) - *stimescaled = t->stimescaled; - return; - } - - fetch_task_cputime(t, utimescaled, stimescaled, - &t->utimescaled, &t->stimescaled, &udelta, &sdelta); - if (utimescaled) - *utimescaled += cputime_to_scaled(udelta); - if (stimescaled) - *stimescaled += cputime_to_scaled(sdelta); -} #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 1ce8867283dc..70ef2b1901e4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -243,10 +243,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p) { struct rq *later_rq = NULL; - bool fallback = false; later_rq = find_lock_later_rq(p, rq); - if (!later_rq) { int cpu; @@ -254,7 +252,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p * If we cannot preempt any rq, fall back to pick any * online cpu. */ - fallback = true; cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); if (cpu >= nr_cpu_ids) { /* @@ -274,16 +271,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p double_lock_balance(rq, later_rq); } - /* - * By now the task is replenished and enqueued; migrate it. - */ - deactivate_task(rq, p, 0); set_task_cpu(p, later_rq->cpu); - activate_task(later_rq, p, 0); - - if (!fallback) - resched_curr(later_rq); - double_unlock_balance(later_rq, rq); return later_rq; @@ -346,12 +334,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, * one, and to (try to!) reconcile itself with its own scheduling * parameters. */ -static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se) +static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); + WARN_ON(dl_se->dl_boosted); WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); /* @@ -367,8 +355,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, * future; in fact, we must consider execution overheads (time * spent on hardirq context, etc.). */ - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline; + dl_se->runtime = dl_se->dl_runtime; } /* @@ -598,7 +586,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) /* * The task might have changed its scheduling policy to something - * different than SCHED_DEADLINE (through switched_fromd_dl()). + * different than SCHED_DEADLINE (through switched_from_dl()). */ if (!dl_task(p)) { __dl_clear_params(p); @@ -641,29 +629,31 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) goto unlock; } - enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); - if (dl_task(rq->curr)) - check_preempt_curr_dl(rq, p, 0); - else - resched_curr(rq); - #ifdef CONFIG_SMP - /* - * Perform balancing operations here; after the replenishments. We - * cannot drop rq->lock before this, otherwise the assertion in - * start_dl_timer() about not missing updates is not true. - * - * If we find that the rq the task was on is no longer available, we - * need to select a new rq. - * - * XXX figure out if select_task_rq_dl() deals with offline cpus. - */ if (unlikely(!rq->online)) { + /* + * If the runqueue is no longer available, migrate the + * task elsewhere. This necessarily changes rq. + */ lockdep_unpin_lock(&rq->lock, rf.cookie); rq = dl_task_offline_migration(rq, p); rf.cookie = lockdep_pin_lock(&rq->lock); + + /* + * Now that the task has been migrated to the new RQ and we + * have that locked, proceed as normal and enqueue the task + * there. + */ } +#endif + + enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); + if (dl_task(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + else + resched_curr(rq); +#ifdef CONFIG_SMP /* * Queueing this task back might have overloaded rq, check if we need * to kick someone away. @@ -735,9 +725,8 @@ static void update_curr_dl(struct rq *rq) return; } - /* kick cpufreq (see the comment in linux/cpufreq.h). */ - if (cpu_of(rq) == smp_processor_id()) - cpufreq_trigger_update(rq_clock(rq)); + /* kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL); schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -798,7 +787,7 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) if (dl_rq->earliest_dl.curr == 0 || dl_time_before(deadline, dl_rq->earliest_dl.curr)) { dl_rq->earliest_dl.curr = deadline; - cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); + cpudl_set(&rq->rd->cpudl, rq->cpu, deadline); } } @@ -813,14 +802,14 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) if (!dl_rq->dl_nr_running) { dl_rq->earliest_dl.curr = 0; dl_rq->earliest_dl.next = 0; - cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); + cpudl_clear(&rq->rd->cpudl, rq->cpu); } else { struct rb_node *leftmost = dl_rq->rb_leftmost; struct sched_dl_entity *entry; entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); dl_rq->earliest_dl.curr = entry->deadline; - cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); + cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline); } } @@ -1148,7 +1137,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie coo pull_dl_task(rq); lockdep_repin_lock(&rq->lock, cookie); /* - * pull_rt_task() can drop (and re-acquire) rq->lock; this + * pull_dl_task() can drop (and re-acquire) rq->lock; this * means a stop task can slip in, in which case we need to * re-start task selection. */ @@ -1671,7 +1660,7 @@ static void rq_online_dl(struct rq *rq) cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); if (rq->dl.dl_nr_running > 0) - cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); + cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr); } /* Assumes rq->lock is held */ @@ -1680,7 +1669,7 @@ static void rq_offline_dl(struct rq *rq) if (rq->dl.overloaded) dl_clear_overload(rq); - cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); + cpudl_clear(&rq->rd->cpudl, rq->cpu); cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); } @@ -1723,10 +1712,20 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { + + /* If p is not queued we will update its parameters at next wakeup. */ + if (!task_on_rq_queued(p)) + return; + + /* + * If p is boosted we already updated its params in + * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH), + * p's deadline being now already after rq_clock(rq). + */ if (dl_time_before(p->dl.deadline, rq_clock(rq))) - setup_new_dl_entity(&p->dl, &p->dl); + setup_new_dl_entity(&p->dl); - if (task_on_rq_queued(p) && rq->curr != p) { + if (rq->curr != p) { #ifdef CONFIG_SMP if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) queue_push_tasks(rq); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2a0a9995256d..fa178b62ea79 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -369,8 +369,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group #define P(F) \ SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) +#define P_SCHEDSTAT(F) \ + SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) #define PN(F) \ SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) +#define PN_SCHEDSTAT(F) \ + SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) if (!se) return; @@ -378,26 +382,27 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group PN(se->exec_start); PN(se->vruntime); PN(se->sum_exec_runtime); -#ifdef CONFIG_SCHEDSTATS if (schedstat_enabled()) { - PN(se->statistics.wait_start); - PN(se->statistics.sleep_start); - PN(se->statistics.block_start); - PN(se->statistics.sleep_max); - PN(se->statistics.block_max); - PN(se->statistics.exec_max); - PN(se->statistics.slice_max); - PN(se->statistics.wait_max); - PN(se->statistics.wait_sum); - P(se->statistics.wait_count); + PN_SCHEDSTAT(se->statistics.wait_start); + PN_SCHEDSTAT(se->statistics.sleep_start); + PN_SCHEDSTAT(se->statistics.block_start); + PN_SCHEDSTAT(se->statistics.sleep_max); + PN_SCHEDSTAT(se->statistics.block_max); + PN_SCHEDSTAT(se->statistics.exec_max); + PN_SCHEDSTAT(se->statistics.slice_max); + PN_SCHEDSTAT(se->statistics.wait_max); + PN_SCHEDSTAT(se->statistics.wait_sum); + P_SCHEDSTAT(se->statistics.wait_count); } -#endif P(se->load.weight); #ifdef CONFIG_SMP P(se->avg.load_avg); P(se->avg.util_avg); #endif + +#undef PN_SCHEDSTAT #undef PN +#undef P_SCHEDSTAT #undef P } #endif @@ -410,7 +415,8 @@ static char *task_group_path(struct task_group *tg) if (autogroup_path(tg, group_path, PATH_MAX)) return group_path; - return cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + return group_path; } #endif @@ -429,9 +435,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) p->prio); SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)), + SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)), SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime))); + SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime))); #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); @@ -626,9 +632,7 @@ do { \ #undef P64 #endif -#ifdef CONFIG_SCHEDSTATS -#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); - +#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n)); if (schedstat_enabled()) { P(yld_count); P(sched_count); @@ -636,9 +640,8 @@ do { \ P(ttwu_count); P(ttwu_local); } - #undef P -#endif + spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); print_rt_stats(m, cpu); @@ -868,10 +871,14 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) #define P(F) \ SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) +#define P_SCHEDSTAT(F) \ + SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F)) #define __PN(F) \ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) #define PN(F) \ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) +#define PN_SCHEDSTAT(F) \ + SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F))) PN(se.exec_start); PN(se.vruntime); @@ -881,37 +888,36 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.nr_migrations); -#ifdef CONFIG_SCHEDSTATS if (schedstat_enabled()) { u64 avg_atom, avg_per_cpu; - PN(se.statistics.sum_sleep_runtime); - PN(se.statistics.wait_start); - PN(se.statistics.sleep_start); - PN(se.statistics.block_start); - PN(se.statistics.sleep_max); - PN(se.statistics.block_max); - PN(se.statistics.exec_max); - PN(se.statistics.slice_max); - PN(se.statistics.wait_max); - PN(se.statistics.wait_sum); - P(se.statistics.wait_count); - PN(se.statistics.iowait_sum); - P(se.statistics.iowait_count); - P(se.statistics.nr_migrations_cold); - P(se.statistics.nr_failed_migrations_affine); - P(se.statistics.nr_failed_migrations_running); - P(se.statistics.nr_failed_migrations_hot); - P(se.statistics.nr_forced_migrations); - P(se.statistics.nr_wakeups); - P(se.statistics.nr_wakeups_sync); - P(se.statistics.nr_wakeups_migrate); - P(se.statistics.nr_wakeups_local); - P(se.statistics.nr_wakeups_remote); - P(se.statistics.nr_wakeups_affine); - P(se.statistics.nr_wakeups_affine_attempts); - P(se.statistics.nr_wakeups_passive); - P(se.statistics.nr_wakeups_idle); + PN_SCHEDSTAT(se.statistics.sum_sleep_runtime); + PN_SCHEDSTAT(se.statistics.wait_start); + PN_SCHEDSTAT(se.statistics.sleep_start); + PN_SCHEDSTAT(se.statistics.block_start); + PN_SCHEDSTAT(se.statistics.sleep_max); + PN_SCHEDSTAT(se.statistics.block_max); + PN_SCHEDSTAT(se.statistics.exec_max); + PN_SCHEDSTAT(se.statistics.slice_max); + PN_SCHEDSTAT(se.statistics.wait_max); + PN_SCHEDSTAT(se.statistics.wait_sum); + P_SCHEDSTAT(se.statistics.wait_count); + PN_SCHEDSTAT(se.statistics.iowait_sum); + P_SCHEDSTAT(se.statistics.iowait_count); + P_SCHEDSTAT(se.statistics.nr_migrations_cold); + P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine); + P_SCHEDSTAT(se.statistics.nr_failed_migrations_running); + P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot); + P_SCHEDSTAT(se.statistics.nr_forced_migrations); + P_SCHEDSTAT(se.statistics.nr_wakeups); + P_SCHEDSTAT(se.statistics.nr_wakeups_sync); + P_SCHEDSTAT(se.statistics.nr_wakeups_migrate); + P_SCHEDSTAT(se.statistics.nr_wakeups_local); + P_SCHEDSTAT(se.statistics.nr_wakeups_remote); + P_SCHEDSTAT(se.statistics.nr_wakeups_affine); + P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts); + P_SCHEDSTAT(se.statistics.nr_wakeups_passive); + P_SCHEDSTAT(se.statistics.nr_wakeups_idle); avg_atom = p->se.sum_exec_runtime; if (nr_switches) @@ -930,7 +936,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) __PN(avg_atom); __PN(avg_per_cpu); } -#endif + __P(nr_switches); SEQ_printf(m, "%-45s:%21Ld\n", "nr_voluntary_switches", (long long)p->nvcsw); @@ -947,8 +953,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) #endif P(policy); P(prio); +#undef PN_SCHEDSTAT #undef PN #undef __PN +#undef P_SCHEDSTAT #undef P #undef __P diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 039de34f1521..6559d197e08a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -37,7 +37,6 @@ /* * Targeted preemption latency for CPU-bound tasks: - * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) * * NOTE: this latency value is not the same as the concept of * 'timeslice length' - timeslices in CFS are of variable length @@ -46,31 +45,35 @@ * * (to see the precise effective timeslice length of your workload, * run vmstat and monitor the context-switches (cs) field) + * + * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_latency = 6000000ULL; -unsigned int normalized_sysctl_sched_latency = 6000000ULL; +unsigned int sysctl_sched_latency = 6000000ULL; +unsigned int normalized_sysctl_sched_latency = 6000000ULL; /* * The initial- and re-scaling of tunables is configurable - * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) * * Options are: - * SCHED_TUNABLESCALING_NONE - unscaled, always *1 - * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) - * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus + * + * SCHED_TUNABLESCALING_NONE - unscaled, always *1 + * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) + * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus + * + * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) */ -enum sched_tunable_scaling sysctl_sched_tunable_scaling - = SCHED_TUNABLESCALING_LOG; +enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; /* * Minimal preemption granularity for CPU-bound tasks: + * * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity = 750000ULL; -unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; +unsigned int sysctl_sched_min_granularity = 750000ULL; +unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; /* - * is kept at sysctl_sched_latency / sysctl_sched_min_granularity + * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity */ static unsigned int sched_nr_latency = 8; @@ -82,23 +85,27 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; /* * SCHED_OTHER wake-up granularity. - * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. + * + * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_wakeup_granularity = 1000000UL; -unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; +unsigned int sysctl_sched_wakeup_granularity = 1000000UL; +unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; -const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +#ifdef CONFIG_SMP /* - * The exponential sliding window over which load is averaged for shares - * distribution. - * (default: 10msec) + * For asym packing, by default the lower numbered cpu has higher priority. */ -unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; +int __weak arch_asym_cpu_priority(int cpu) +{ + return -cpu; +} +#endif #ifdef CONFIG_CFS_BANDWIDTH /* @@ -109,11 +116,19 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; * to consumption or the quota being specified to be smaller than the slice) * we will always only issue the remaining available time. * - * default: 5 msec, units: microseconds - */ -unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; + * (default: 5 msec, units: microseconds) + */ +unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +/* + * The margin used when comparing utilization with CPU capacity: + * util * margin < capacity * 1024 + * + * (default: ~20%) + */ +unsigned int capacity_margin = 1280; + static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; @@ -256,9 +271,7 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) static inline struct task_struct *task_of(struct sched_entity *se) { -#ifdef CONFIG_SCHED_DEBUG - WARN_ON_ONCE(!entity_is_task(se)); -#endif + SCHED_WARN_ON(!entity_is_task(se)); return container_of(se, struct task_struct, se); } @@ -286,19 +299,59 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); /* * Ensure we either appear before our parent (if already * enqueued) or force our parent to appear after us when it is - * enqueued. The fact that we always enqueue bottom-up - * reduces this to two cases. + * enqueued. The fact that we always enqueue bottom-up + * reduces this to two cases and a special case for the root + * cfs_rq. Furthermore, it also means that we will always reset + * tmp_alone_branch either when the branch is connected + * to a tree or when we reach the beg of the tree */ if (cfs_rq->tg->parent && - cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); - } else { + cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { + /* + * If parent is already on the list, we add the child + * just before. Thanks to circular linked property of + * the list, this means to put the child at the tail + * of the list that starts by parent. + */ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); + &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); + /* + * The branch is now connected to its tree so we can + * reset tmp_alone_branch to the beginning of the + * list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + } else if (!cfs_rq->tg->parent) { + /* + * cfs rq without parent should be put + * at the tail of the list. + */ + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, + &rq->leaf_cfs_rq_list); + /* + * We have reach the beg of a tree so we can reset + * tmp_alone_branch to the beginning of the list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + } else { + /* + * The parent has not already been added so we want to + * make sure that it will be put after us. + * tmp_alone_branch points to the beg of the branch + * where we will add parent. + */ + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, + rq->tmp_alone_branch); + /* + * update tmp_alone_branch to points to the new beg + * of the branch + */ + rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; } cfs_rq->on_list = 1; @@ -456,17 +509,23 @@ static inline int entity_before(struct sched_entity *a, static void update_min_vruntime(struct cfs_rq *cfs_rq) { + struct sched_entity *curr = cfs_rq->curr; + u64 vruntime = cfs_rq->min_vruntime; - if (cfs_rq->curr) - vruntime = cfs_rq->curr->vruntime; + if (curr) { + if (curr->on_rq) + vruntime = curr->vruntime; + else + curr = NULL; + } if (cfs_rq->rb_leftmost) { struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, struct sched_entity, run_node); - if (!cfs_rq->curr) + if (!curr) vruntime = se->vruntime; else vruntime = min_vruntime(vruntime, se->vruntime); @@ -656,7 +715,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP -static int select_idle_sibling(struct task_struct *p, int cpu); +static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); /* @@ -680,7 +739,14 @@ void init_entity_runnable_average(struct sched_entity *se) * will definitely be update (after enqueue). */ sa->period_contrib = 1023; - sa->load_avg = scale_load_down(se->load.weight); + /* + * Tasks are intialized with full load to be seen as heavy tasks until + * they get a chance to stabilize to their real load level. + * Group entities are intialized with zero load to reflect the fact that + * nothing has been attached to the task group yet. + */ + if (entity_is_task(se)) + sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; /* * At this point, util_avg won't be used in select_task_rq_fair anyway @@ -691,9 +757,7 @@ void init_entity_runnable_average(struct sched_entity *se) } static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); -static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); -static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force); -static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); +static void attach_entity_cfs_rq(struct sched_entity *se); /* * With new tasks being created, their initial util_avgs are extrapolated @@ -725,8 +789,6 @@ void post_init_entity_util_avg(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; - u64 now = cfs_rq_clock_task(cfs_rq); - int tg_update; if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -754,15 +816,12 @@ void post_init_entity_util_avg(struct sched_entity *se) * such that the next switched_to_fair() has the * expected state. */ - se->avg.last_update_time = now; + se->avg.last_update_time = cfs_rq_clock_task(cfs_rq); return; } } - tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); - attach_entity_load_avg(cfs_rq, se); - if (tg_update) - update_tg_load_avg(cfs_rq, false); + attach_entity_cfs_rq(se); } #else /* !CONFIG_SMP */ @@ -799,7 +858,7 @@ static void update_curr(struct cfs_rq *cfs_rq) max(delta_exec, curr->statistics.exec_max)); curr->sum_exec_runtime += delta_exec; - schedstat_add(cfs_rq, exec_clock, delta_exec); + schedstat_add(cfs_rq->exec_clock, delta_exec); curr->vruntime += calc_delta_fair(delta_exec, curr); update_min_vruntime(cfs_rq); @@ -820,26 +879,34 @@ static void update_curr_fair(struct rq *rq) update_curr(cfs_rq_of(&rq->curr->se)); } -#ifdef CONFIG_SCHEDSTATS static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 wait_start = rq_clock(rq_of(cfs_rq)); + u64 wait_start, prev_wait_start; + + if (!schedstat_enabled()) + return; + + wait_start = rq_clock(rq_of(cfs_rq)); + prev_wait_start = schedstat_val(se->statistics.wait_start); if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && - likely(wait_start > se->statistics.wait_start)) - wait_start -= se->statistics.wait_start; + likely(wait_start > prev_wait_start)) + wait_start -= prev_wait_start; - se->statistics.wait_start = wait_start; + schedstat_set(se->statistics.wait_start, wait_start); } -static void +static inline void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct task_struct *p; u64 delta; - delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; + if (!schedstat_enabled()) + return; + + delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); if (entity_is_task(se)) { p = task_of(se); @@ -849,35 +916,114 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) * time stamp can be adjusted to accumulate wait time * prior to migration. */ - se->statistics.wait_start = delta; + schedstat_set(se->statistics.wait_start, delta); return; } trace_sched_stat_wait(p, delta); } - se->statistics.wait_max = max(se->statistics.wait_max, delta); - se->statistics.wait_count++; - se->statistics.wait_sum += delta; - se->statistics.wait_start = 0; + schedstat_set(se->statistics.wait_max, + max(schedstat_val(se->statistics.wait_max), delta)); + schedstat_inc(se->statistics.wait_count); + schedstat_add(se->statistics.wait_sum, delta); + schedstat_set(se->statistics.wait_start, 0); +} + +static inline void +update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct task_struct *tsk = NULL; + u64 sleep_start, block_start; + + if (!schedstat_enabled()) + return; + + sleep_start = schedstat_val(se->statistics.sleep_start); + block_start = schedstat_val(se->statistics.block_start); + + if (entity_is_task(se)) + tsk = task_of(se); + + if (sleep_start) { + u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) + schedstat_set(se->statistics.sleep_max, delta); + + schedstat_set(se->statistics.sleep_start, 0); + schedstat_add(se->statistics.sum_sleep_runtime, delta); + + if (tsk) { + account_scheduler_latency(tsk, delta >> 10, 1); + trace_sched_stat_sleep(tsk, delta); + } + } + if (block_start) { + u64 delta = rq_clock(rq_of(cfs_rq)) - block_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > schedstat_val(se->statistics.block_max))) + schedstat_set(se->statistics.block_max, delta); + + schedstat_set(se->statistics.block_start, 0); + schedstat_add(se->statistics.sum_sleep_runtime, delta); + + if (tsk) { + if (tsk->in_iowait) { + schedstat_add(se->statistics.iowait_sum, delta); + schedstat_inc(se->statistics.iowait_count); + trace_sched_stat_iowait(tsk, delta); + } + + trace_sched_stat_blocked(tsk, delta); + + /* + * Blocking time is in units of nanosecs, so shift by + * 20 to get a milliseconds-range estimation of the + * amount of time that the task spent sleeping: + */ + if (unlikely(prof_on == SLEEP_PROFILING)) { + profile_hits(SLEEP_PROFILING, + (void *)get_wchan(tsk), + delta >> 20); + } + account_scheduler_latency(tsk, delta >> 10, 0); + } + } } /* * Task is being enqueued - update stats: */ static inline void -update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { + if (!schedstat_enabled()) + return; + /* * Are we enqueueing a waiting task? (for current tasks * a dequeue/enqueue event is a NOP) */ if (se != cfs_rq->curr) update_stats_wait_start(cfs_rq, se); + + if (flags & ENQUEUE_WAKEUP) + update_stats_enqueue_sleeper(cfs_rq, se); } static inline void update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { + + if (!schedstat_enabled()) + return; + /* * Mark the end of the wait period if dequeueing a * waiting task: @@ -885,40 +1031,18 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) update_stats_wait_end(cfs_rq, se); - if (flags & DEQUEUE_SLEEP) { - if (entity_is_task(se)) { - struct task_struct *tsk = task_of(se); + if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { + struct task_struct *tsk = task_of(se); - if (tsk->state & TASK_INTERRUPTIBLE) - se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); - if (tsk->state & TASK_UNINTERRUPTIBLE) - se->statistics.block_start = rq_clock(rq_of(cfs_rq)); - } + if (tsk->state & TASK_INTERRUPTIBLE) + schedstat_set(se->statistics.sleep_start, + rq_clock(rq_of(cfs_rq))); + if (tsk->state & TASK_UNINTERRUPTIBLE) + schedstat_set(se->statistics.block_start, + rq_clock(rq_of(cfs_rq))); } - -} -#else -static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ } -static inline void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -} - -static inline void -update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -} - -static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) -{ -} -#endif - /* * We are picking a new current task - update its stats: */ @@ -1513,8 +1637,16 @@ balance: * One idle CPU per node is evaluated for a task numa move. * Call select_idle_sibling to maybe find a better one. */ - if (!cur) - env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); + if (!cur) { + /* + * select_idle_siblings() uses an per-cpu cpumask that + * can be used from IRQ context. + */ + local_irq_disable(); + env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, + env->dst_cpu); + local_irq_enable(); + } assign: task_numa_assign(env, cur, imp); @@ -2292,7 +2424,7 @@ void task_numa_work(struct callback_head *work) unsigned long nr_pte_updates = 0; long pages, virtpages; - WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); + SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); work->next = work; /* protect against double add */ /* @@ -2802,10 +2934,42 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, return decayed; } -#ifdef CONFIG_FAIR_GROUP_SCHED /* - * Updating tg's load_avg is necessary before update_cfs_share (which is done) - * and effective_load (which is not done because it is too costly). + * Signed add and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. + */ +#define add_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(_val) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + \ + res = var + val; \ + \ + if (val < 0 && res > var) \ + res = 0; \ + \ + WRITE_ONCE(*ptr, res); \ +} while (0) + +#ifdef CONFIG_FAIR_GROUP_SCHED +/** + * update_tg_load_avg - update the tg's load avg + * @cfs_rq: the cfs_rq whose avg changed + * @force: update regardless of how small the difference + * + * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. + * However, because tg->load_avg is a global value there are performance + * considerations. + * + * In order to avoid having to look at the other cfs_rq's, we use a + * differential update where we store the last value we propagated. This in + * turn allows skipping updates if the differential is 'small'. + * + * Updating tg's load_avg is necessary before update_cfs_share() (which is + * done) and effective_load() (which is not done because it is too costly). */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { @@ -2869,18 +3033,143 @@ void set_task_rq_fair(struct sched_entity *se, se->avg.last_update_time = n_last_update_time; } } + +/* Take into account change of utilization of a child task group */ +static inline void +update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; + + /* Nothing to update */ + if (!delta) + return; + + /* Set new sched_entity's utilization */ + se->avg.util_avg = gcfs_rq->avg.util_avg; + se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; + + /* Update parent cfs_rq utilization */ + add_positive(&cfs_rq->avg.util_avg, delta); + cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; +} + +/* Take into account change of load of a child task group */ +static inline void +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long delta, load = gcfs_rq->avg.load_avg; + + /* + * If the load of group cfs_rq is null, the load of the + * sched_entity will also be null so we can skip the formula + */ + if (load) { + long tg_load; + + /* Get tg's load and ensure tg_load > 0 */ + tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1; + + /* Ensure tg_load >= load and updated with current load*/ + tg_load -= gcfs_rq->tg_load_avg_contrib; + tg_load += load; + + /* + * We need to compute a correction term in the case that the + * task group is consuming more CPU than a task of equal + * weight. A task with a weight equals to tg->shares will have + * a load less or equal to scale_load_down(tg->shares). + * Similarly, the sched_entities that represent the task group + * at parent level, can't have a load higher than + * scale_load_down(tg->shares). And the Sum of sched_entities' + * load must be <= scale_load_down(tg->shares). + */ + if (tg_load > scale_load_down(gcfs_rq->tg->shares)) { + /* scale gcfs_rq's load into tg's shares*/ + load *= scale_load_down(gcfs_rq->tg->shares); + load /= tg_load; + } + } + + delta = load - se->avg.load_avg; + + /* Nothing to update */ + if (!delta) + return; + + /* Set new sched_entity's load */ + se->avg.load_avg = load; + se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX; + + /* Update parent cfs_rq load */ + add_positive(&cfs_rq->avg.load_avg, delta); + cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX; + + /* + * If the sched_entity is already enqueued, we also have to update the + * runnable load avg. + */ + if (se->on_rq) { + /* Update parent cfs_rq runnable_load_avg */ + add_positive(&cfs_rq->runnable_load_avg, delta); + cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX; + } +} + +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) +{ + cfs_rq->propagate_avg = 1; +} + +static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = group_cfs_rq(se); + + if (!cfs_rq->propagate_avg) + return 0; + + cfs_rq->propagate_avg = 0; + return 1; +} + +/* Update task and its cfs_rq load average */ +static inline int propagate_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq; + + if (entity_is_task(se)) + return 0; + + if (!test_and_clear_tg_cfs_propagate(se)) + return 0; + + cfs_rq = cfs_rq_of(se); + + set_tg_cfs_propagate(cfs_rq); + + update_tg_cfs_util(cfs_rq, se); + update_tg_cfs_load(cfs_rq, se); + + return 1; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ + static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} -#endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) +static inline int propagate_entity_load_avg(struct sched_entity *se) { - struct rq *rq = rq_of(cfs_rq); - int cpu = cpu_of(rq); + return 0; +} - if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { - unsigned long max = rq->cpu_capacity_orig; +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) +{ + if (&this_rq()->cfs == cfs_rq) { /* * There are a few boundary cases this might miss but it should * get called often enough that that should (hopefully) not be @@ -2897,8 +3186,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) * * See cpu_util(). */ - cpufreq_update_util(rq_clock(rq), - min(cfs_rq->avg.util_avg, max), max); + cpufreq_update_util(rq_of(cfs_rq), 0); } } @@ -2931,10 +3219,10 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) * * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. * - * Returns true if the load decayed or we removed utilization. It is expected - * that one calls update_tg_load_avg() on this condition, but after you've - * modified the cfs_rq avg (attach/detach), such that we propagate the new - * avg up. + * Returns true if the load decayed or we removed load. + * + * Since both these conditions indicate a changed cfs_rq->avg.load we should + * call update_tg_load_avg() when this function returns true. */ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) @@ -2947,6 +3235,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) sub_positive(&sa->load_avg, r); sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); removed_load = 1; + set_tg_cfs_propagate(cfs_rq); } if (atomic_long_read(&cfs_rq->removed_util_avg)) { @@ -2954,6 +3243,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); removed_util = 1; + set_tg_cfs_propagate(cfs_rq); } decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, @@ -2970,23 +3260,35 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) return decayed || removed_load; } +/* + * Optional action to be done while updating the load average + */ +#define UPDATE_TG 0x1 +#define SKIP_AGE_LOAD 0x2 + /* Update task and its cfs_rq load average */ -static inline void update_load_avg(struct sched_entity *se, int update_tg) +static inline void update_load_avg(struct sched_entity *se, int flags) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); struct rq *rq = rq_of(cfs_rq); int cpu = cpu_of(rq); + int decayed; /* * Track task load average for carrying it to new CPU after migrated, and * track group sched_entity load average for task_h_load calc in migration */ - __update_load_avg(now, cpu, &se->avg, + if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) { + __update_load_avg(now, cpu, &se->avg, se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); + } + + decayed = update_cfs_rq_load_avg(now, cfs_rq, true); + decayed |= propagate_entity_load_avg(se); - if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg) + if (decayed && (flags & UPDATE_TG)) update_tg_load_avg(cfs_rq, 0); } @@ -3000,31 +3302,12 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (!sched_feat(ATTACH_AGE_LOAD)) - goto skip_aging; - - /* - * If we got migrated (either between CPUs or between cgroups) we'll - * have aged the average right before clearing @last_update_time. - * - * Or we're fresh through post_init_entity_util_avg(). - */ - if (se->avg.last_update_time) { - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), - &se->avg, 0, 0, NULL); - - /* - * XXX: we could have just aged the entire load away if we've been - * absent from the fair class for too long. - */ - } - -skip_aging: se->avg.last_update_time = cfs_rq->avg.last_update_time; cfs_rq->avg.load_avg += se->avg.load_avg; cfs_rq->avg.load_sum += se->avg.load_sum; cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; + set_tg_cfs_propagate(cfs_rq); cfs_rq_util_change(cfs_rq); } @@ -3039,14 +3322,12 @@ skip_aging: */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), - &se->avg, se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + set_tg_cfs_propagate(cfs_rq); cfs_rq_util_change(cfs_rq); } @@ -3056,34 +3337,20 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_avg *sa = &se->avg; - u64 now = cfs_rq_clock_task(cfs_rq); - int migrated, decayed; - - migrated = !sa->last_update_time; - if (!migrated) { - __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, - se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); - } - - decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated); cfs_rq->runnable_load_avg += sa->load_avg; cfs_rq->runnable_load_sum += sa->load_sum; - if (migrated) + if (!sa->last_update_time) { attach_entity_load_avg(cfs_rq, se); - - if (decayed || migrated) update_tg_load_avg(cfs_rq, 0); + } } /* Remove the runnable load generated by se from cfs_rq's runnable load average */ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - update_load_avg(se, 1); - cfs_rq->runnable_load_avg = max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); cfs_rq->runnable_load_sum = @@ -3112,13 +3379,25 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) #endif /* + * Synchronize entity load avg of dequeued entity without locking + * the previous rq. + */ +void sync_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; + + last_update_time = cfs_rq_last_update_time(cfs_rq); + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); +} + +/* * Task first catches up with cfs_rq, and then subtract * itself from the cfs_rq (task must be off the queue now). */ void remove_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 last_update_time; /* * tasks cannot exit without having gone through wake_up_new_task() -> @@ -3130,9 +3409,7 @@ void remove_entity_load_avg(struct sched_entity *se) * calls this. */ - last_update_time = cfs_rq_last_update_time(cfs_rq); - - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + sync_entity_load_avg(se); atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } @@ -3157,12 +3434,12 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) return 0; } -static inline void update_load_avg(struct sched_entity *se, int not_used) -{ - struct cfs_rq *cfs_rq = cfs_rq_of(se); - struct rq *rq = rq_of(cfs_rq); +#define UPDATE_TG 0x0 +#define SKIP_AGE_LOAD 0x0 - cpufreq_trigger_update(rq_clock(rq)); +static inline void update_load_avg(struct sched_entity *se, int not_used1) +{ + cpufreq_update_util(rq_of(cfs_rq_of(se)), 0); } static inline void @@ -3183,68 +3460,6 @@ static inline int idle_balance(struct rq *rq) #endif /* CONFIG_SMP */ -static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -#ifdef CONFIG_SCHEDSTATS - struct task_struct *tsk = NULL; - - if (entity_is_task(se)) - tsk = task_of(se); - - if (se->statistics.sleep_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > se->statistics.sleep_max)) - se->statistics.sleep_max = delta; - - se->statistics.sleep_start = 0; - se->statistics.sum_sleep_runtime += delta; - - if (tsk) { - account_scheduler_latency(tsk, delta >> 10, 1); - trace_sched_stat_sleep(tsk, delta); - } - } - if (se->statistics.block_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > se->statistics.block_max)) - se->statistics.block_max = delta; - - se->statistics.block_start = 0; - se->statistics.sum_sleep_runtime += delta; - - if (tsk) { - if (tsk->in_iowait) { - se->statistics.iowait_sum += delta; - se->statistics.iowait_count++; - trace_sched_stat_iowait(tsk, delta); - } - - trace_sched_stat_blocked(tsk, delta); - - /* - * Blocking time is in units of nanosecs, so shift by - * 20 to get a milliseconds-range estimation of the - * amount of time that the task spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - profile_hits(SLEEP_PROFILING, - (void *)get_wchan(tsk), - delta >> 20); - } - account_scheduler_latency(tsk, delta >> 10, 0); - } - } -#endif -} - static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SCHED_DEBUG @@ -3254,7 +3469,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) d = -d; if (d > 3*sysctl_sched_latency) - schedstat_inc(cfs_rq, nr_spread_over); + schedstat_inc(cfs_rq->nr_spread_over); #endif } @@ -3367,21 +3582,17 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (renorm && !curr) se->vruntime += cfs_rq->min_vruntime; + update_load_avg(se, UPDATE_TG); enqueue_entity_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); - if (flags & ENQUEUE_WAKEUP) { + if (flags & ENQUEUE_WAKEUP) place_entity(cfs_rq, se, 0); - if (schedstat_enabled()) - enqueue_sleeper(cfs_rq, se); - } check_schedstat_required(); - if (schedstat_enabled()) { - update_stats_enqueue(cfs_rq, se); - check_spread(cfs_rq, se); - } + update_stats_enqueue(cfs_rq, se, flags); + check_spread(cfs_rq, se); if (!curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; @@ -3446,10 +3657,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + update_load_avg(se, UPDATE_TG); dequeue_entity_load_avg(cfs_rq, se); - if (schedstat_enabled()) - update_stats_dequeue(cfs_rq, se, flags); + update_stats_dequeue(cfs_rq, se, flags); clear_buddies(cfs_rq, se); @@ -3459,9 +3670,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) account_entity_dequeue(cfs_rq, se); /* - * Normalize the entity after updating the min_vruntime because the - * update can refer to the ->curr item and we need to reflect this - * movement in our normalized position. + * Normalize after update_curr(); which will also have moved + * min_vruntime if @se is the one holding it back. But before doing + * update_min_vruntime() again, which will discount @se's position and + * can move min_vruntime forward still more. */ if (!(flags & DEQUEUE_SLEEP)) se->vruntime -= cfs_rq->min_vruntime; @@ -3469,8 +3681,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); - update_min_vruntime(cfs_rq); update_cfs_shares(cfs_rq); + + /* + * Now advance min_vruntime if @se was the entity holding it back, + * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be + * put back on, and if we advance min_vruntime, we'll be placed back + * further than we started -- ie. we'll be penalized. + */ + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) + update_min_vruntime(cfs_rq); } /* @@ -3523,25 +3743,25 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * a CPU. So account for the time it spent waiting on the * runqueue. */ - if (schedstat_enabled()) - update_stats_wait_end(cfs_rq, se); + update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); } update_stats_curr_start(cfs_rq, se); cfs_rq->curr = se; -#ifdef CONFIG_SCHEDSTATS + /* * Track our maximum slice length, if the CPU's load is at * least twice that of our own weight (i.e. dont track it * when there are only lesser-weight tasks around): */ if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { - se->statistics.slice_max = max(se->statistics.slice_max, - se->sum_exec_runtime - se->prev_sum_exec_runtime); + schedstat_set(se->statistics.slice_max, + max((u64)schedstat_val(se->statistics.slice_max), + se->sum_exec_runtime - se->prev_sum_exec_runtime)); } -#endif + se->prev_sum_exec_runtime = se->sum_exec_runtime; } @@ -3620,13 +3840,10 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* throttle cfs_rqs exceeding runtime */ check_cfs_rq_runtime(cfs_rq); - if (schedstat_enabled()) { - check_spread(cfs_rq, prev); - if (prev->on_rq) - update_stats_wait_start(cfs_rq, prev); - } + check_spread(cfs_rq, prev); if (prev->on_rq) { + update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ @@ -3646,7 +3863,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) /* * Ensure that runnable average is periodically updated. */ - update_load_avg(curr, 1); + update_load_avg(curr, UPDATE_TG); update_cfs_shares(cfs_rq); #ifdef CONFIG_SCHED_HRTICK @@ -4456,9 +4673,9 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); - WARN_ON(task_rq(p) != rq); + SCHED_WARN_ON(task_rq(p) != rq); - if (cfs_rq->nr_running > 1) { + if (rq->cfs.h_nr_running > 1) { u64 slice = sched_slice(cfs_rq, se); u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; s64 delta = slice - ran; @@ -4509,6 +4726,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + /* + * If in_iowait is set, the code below may not trigger any cpufreq + * utilization updates, so do it here explicitly with the IOWAIT flag + * passed. + */ + if (p->in_iowait) + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT); + for_each_sched_entity(se) { if (se->on_rq) break; @@ -4535,7 +4760,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); update_cfs_shares(cfs_rq); } @@ -4594,7 +4819,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); update_cfs_shares(cfs_rq); } @@ -4605,6 +4830,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP + +/* Working cpumask for: load_balance, load_balance_newidle. */ +DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); +DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); + #ifdef CONFIG_NO_HZ_COMMON /* * per rq 'load' arrray crap; XXX kill this. @@ -5006,9 +5236,9 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) * wl = S * s'_i; see (2) */ if (W > 0 && w < W) - wl = (w * (long)tg->shares) / W; + wl = (w * (long)scale_load_down(tg->shares)) / W; else - wl = tg->shares; + wl = scale_load_down(tg->shares); /* * Per the above, wl is the new se->load.weight value; since @@ -5091,18 +5321,18 @@ static int wake_wide(struct task_struct *p) return 1; } -static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) +static int wake_affine(struct sched_domain *sd, struct task_struct *p, + int prev_cpu, int sync) { s64 this_load, load; s64 this_eff_load, prev_eff_load; - int idx, this_cpu, prev_cpu; + int idx, this_cpu; struct task_group *tg; unsigned long weight; int balanced; idx = sd->wake_idx; this_cpu = smp_processor_id(); - prev_cpu = task_cpu(p); load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); @@ -5146,17 +5376,25 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) balanced = this_eff_load <= prev_eff_load; - schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); + schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); if (!balanced) return 0; - schedstat_inc(sd, ttwu_move_affine); - schedstat_inc(p, se.statistics.nr_wakeups_affine); + schedstat_inc(sd->ttwu_move_affine); + schedstat_inc(p->se.statistics.nr_wakeups_affine); return 1; } +static inline int task_util(struct task_struct *p); +static int cpu_util_wake(int cpu, struct task_struct *p); + +static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) +{ + return capacity_orig_of(cpu) - cpu_util_wake(cpu, p); +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. @@ -5166,15 +5404,21 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; - unsigned long min_load = ULONG_MAX, this_load = 0; + struct sched_group *most_spare_sg = NULL; + unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0; + unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0; + unsigned long most_spare = 0, this_spare = 0; int load_idx = sd->forkexec_idx; - int imbalance = 100 + (sd->imbalance_pct-100)/2; + int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; + unsigned long imbalance = scale_load_down(NICE_0_LOAD) * + (sd->imbalance_pct-100) / 100; if (sd_flag & SD_BALANCE_WAKE) load_idx = sd->wake_idx; do { - unsigned long load, avg_load; + unsigned long load, avg_load, runnable_load; + unsigned long spare_cap, max_spare_cap; int local_group; int i; @@ -5186,8 +5430,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); - /* Tally up the load of all CPUs in the group */ + /* + * Tally up the load of all CPUs in the group and find + * the group containing the CPU with most spare capacity. + */ avg_load = 0; + runnable_load = 0; + max_spare_cap = 0; for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ @@ -5196,22 +5445,84 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, else load = target_load(i, load_idx); - avg_load += load; + runnable_load += load; + + avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); + + spare_cap = capacity_spare_wake(i, p); + + if (spare_cap > max_spare_cap) + max_spare_cap = spare_cap; } /* Adjust by relative CPU capacity of the group */ - avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity; + avg_load = (avg_load * SCHED_CAPACITY_SCALE) / + group->sgc->capacity; + runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) / + group->sgc->capacity; if (local_group) { - this_load = avg_load; - } else if (avg_load < min_load) { - min_load = avg_load; - idlest = group; + this_runnable_load = runnable_load; + this_avg_load = avg_load; + this_spare = max_spare_cap; + } else { + if (min_runnable_load > (runnable_load + imbalance)) { + /* + * The runnable load is significantly smaller + * so we can pick this new cpu + */ + min_runnable_load = runnable_load; + min_avg_load = avg_load; + idlest = group; + } else if ((runnable_load < (min_runnable_load + imbalance)) && + (100*min_avg_load > imbalance_scale*avg_load)) { + /* + * The runnable loads are close so take the + * blocked load into account through avg_load. + */ + min_avg_load = avg_load; + idlest = group; + } + + if (most_spare < max_spare_cap) { + most_spare = max_spare_cap; + most_spare_sg = group; + } } } while (group = group->next, group != sd->groups); - if (!idlest || 100*this_load < imbalance*min_load) + /* + * The cross-over point between using spare capacity or least load + * is too conservative for high utilization tasks on partially + * utilized systems if we require spare_capacity > task_util(p), + * so we allow for some task stuffing by using + * spare_capacity > task_util(p)/2. + * + * Spare capacity can't be used for fork because the utilization has + * not been set yet, we must first select a rq to compute the initial + * utilization. + */ + if (sd_flag & SD_BALANCE_FORK) + goto skip_spare; + + if (this_spare > task_util(p) / 2 && + imbalance_scale*this_spare > 100*most_spare) + return NULL; + + if (most_spare > task_util(p) / 2) + return most_spare_sg; + +skip_spare: + if (!idlest) + return NULL; + + if (min_runnable_load > (this_runnable_load + imbalance)) + return NULL; + + if ((this_runnable_load < (min_runnable_load + imbalance)) && + (100*this_avg_load < imbalance_scale*min_avg_load)) return NULL; + return idlest; } @@ -5228,6 +5539,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) int shallowest_idle_cpu = -1; int i; + /* Check if we have any choice: */ + if (group->group_weight == 1) + return cpumask_first(sched_group_cpus(group)); + /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { if (idle_cpu(i)) { @@ -5265,64 +5580,242 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) } /* - * Try and locate an idle CPU in the sched_domain. + * Implement a for_each_cpu() variant that starts the scan at a given cpu + * (@start), and wraps around. + * + * This is used to scan for idle CPUs; such that not all CPUs looking for an + * idle CPU find the same CPU. The down-side is that tasks tend to cycle + * through the LLC domain. + * + * Especially tbench is found sensitive to this. */ -static int select_idle_sibling(struct task_struct *p, int target) + +static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped) +{ + int next; + +again: + next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1); + + if (*wrapped) { + if (next >= start) + return nr_cpumask_bits; + } else { + if (next >= nr_cpumask_bits) { + *wrapped = 1; + n = -1; + goto again; + } + } + + return next; +} + +#define for_each_cpu_wrap(cpu, mask, start, wrap) \ + for ((wrap) = 0, (cpu) = (start)-1; \ + (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \ + (cpu) < nr_cpumask_bits; ) + +#ifdef CONFIG_SCHED_SMT + +static inline void set_idle_cores(int cpu, int val) +{ + struct sched_domain_shared *sds; + + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + if (sds) + WRITE_ONCE(sds->has_idle_cores, val); +} + +static inline bool test_idle_cores(int cpu, bool def) +{ + struct sched_domain_shared *sds; + + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + if (sds) + return READ_ONCE(sds->has_idle_cores); + + return def; +} + +/* + * Scans the local SMT mask to see if the entire core is idle, and records this + * information in sd_llc_shared->has_idle_cores. + * + * Since SMT siblings share all cache levels, inspecting this limited remote + * state should be fairly cheap. + */ +void __update_idle_core(struct rq *rq) +{ + int core = cpu_of(rq); + int cpu; + + rcu_read_lock(); + if (test_idle_cores(core, true)) + goto unlock; + + for_each_cpu(cpu, cpu_smt_mask(core)) { + if (cpu == core) + continue; + + if (!idle_cpu(cpu)) + goto unlock; + } + + set_idle_cores(core, 1); +unlock: + rcu_read_unlock(); +} + +/* + * Scan the entire LLC domain for idle cores; this dynamically switches off if + * there are no idle cores left in the system; tracked through + * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above. + */ +static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +{ + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); + int core, cpu, wrap; + + if (!static_branch_likely(&sched_smt_present)) + return -1; + + if (!test_idle_cores(target, false)) + return -1; + + cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p)); + + for_each_cpu_wrap(core, cpus, target, wrap) { + bool idle = true; + + for_each_cpu(cpu, cpu_smt_mask(core)) { + cpumask_clear_cpu(cpu, cpus); + if (!idle_cpu(cpu)) + idle = false; + } + + if (idle) + return core; + } + + /* + * Failed to find an idle core; stop looking for one. + */ + set_idle_cores(target, 0); + + return -1; +} + +/* + * Scan the local SMT mask for idle CPUs. + */ +static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +{ + int cpu; + + if (!static_branch_likely(&sched_smt_present)) + return -1; + + for_each_cpu(cpu, cpu_smt_mask(target)) { + if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) + continue; + if (idle_cpu(cpu)) + return cpu; + } + + return -1; +} + +#else /* CONFIG_SCHED_SMT */ + +static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +{ + return -1; +} + +static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +{ + return -1; +} + +#endif /* CONFIG_SCHED_SMT */ + +/* + * Scan the LLC domain for idle CPUs; this is dynamically regulated by + * comparing the average scan cost (tracked in sd->avg_scan_cost) against the + * average idle time for this rq (as found in rq->avg_idle). + */ +static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) +{ + struct sched_domain *this_sd; + u64 avg_cost, avg_idle = this_rq()->avg_idle; + u64 time, cost; + s64 delta; + int cpu, wrap; + + this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); + if (!this_sd) + return -1; + + avg_cost = this_sd->avg_scan_cost; + + /* + * Due to large variance we need a large fuzz factor; hackbench in + * particularly is sensitive here. + */ + if ((avg_idle / 512) < avg_cost) + return -1; + + time = local_clock(); + + for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) { + if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) + continue; + if (idle_cpu(cpu)) + break; + } + + time = local_clock() - time; + cost = this_sd->avg_scan_cost; + delta = (s64)(time - cost) / 8; + this_sd->avg_scan_cost += delta; + + return cpu; +} + +/* + * Try and locate an idle core/thread in the LLC cache domain. + */ +static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; - struct sched_group *sg; - int i = task_cpu(p); + int i; if (idle_cpu(target)) return target; /* - * If the prevous cpu is cache affine and idle, don't be stupid. + * If the previous cpu is cache affine and idle, don't be stupid. */ - if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) - return i; + if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) + return prev; - /* - * Otherwise, iterate the domains and find an eligible idle cpu. - * - * A completely idle sched group at higher domains is more - * desirable than an idle group at a lower level, because lower - * domains have smaller groups and usually share hardware - * resources which causes tasks to contend on them, e.g. x86 - * hyperthread siblings in the lowest domain (SMT) can contend - * on the shared cpu pipeline. - * - * However, while we prefer idle groups at higher domains - * finding an idle cpu at the lowest domain is still better than - * returning 'target', which we've already established, isn't - * idle. - */ sd = rcu_dereference(per_cpu(sd_llc, target)); - for_each_lower_domain(sd) { - sg = sd->groups; - do { - if (!cpumask_intersects(sched_group_cpus(sg), - tsk_cpus_allowed(p))) - goto next; - - /* Ensure the entire group is idle */ - for_each_cpu(i, sched_group_cpus(sg)) { - if (i == target || !idle_cpu(i)) - goto next; - } + if (!sd) + return target; + + i = select_idle_core(p, sd, target); + if ((unsigned)i < nr_cpumask_bits) + return i; + + i = select_idle_cpu(p, sd, target); + if ((unsigned)i < nr_cpumask_bits) + return i; + + i = select_idle_smt(p, sd, target); + if ((unsigned)i < nr_cpumask_bits) + return i; - /* - * It doesn't matter which cpu we pick, the - * whole group is idle. - */ - target = cpumask_first_and(sched_group_cpus(sg), - tsk_cpus_allowed(p)); - goto done; -next: - sg = sg->next; - } while (sg != sd->groups); - } -done: return target; } @@ -5360,6 +5853,53 @@ static int cpu_util(int cpu) return (util >= capacity) ? capacity : util; } +static inline int task_util(struct task_struct *p) +{ + return p->se.avg.util_avg; +} + +/* + * cpu_util_wake: Compute cpu utilization with any contributions from + * the waking task p removed. + */ +static int cpu_util_wake(int cpu, struct task_struct *p) +{ + unsigned long util, capacity; + + /* Task has no contribution or is new */ + if (cpu != task_cpu(p) || !p->se.avg.last_update_time) + return cpu_util(cpu); + + capacity = capacity_orig_of(cpu); + util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0); + + return (util >= capacity) ? capacity : util; +} + +/* + * Disable WAKE_AFFINE in the case where task @p doesn't fit in the + * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. + * + * In that case WAKE_AFFINE doesn't make sense and we'll let + * BALANCE_WAKE sort things out. + */ +static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) +{ + long min_cap, max_cap; + + min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); + max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; + + /* Minimum capacity is close to max, no need to abort wake_affine */ + if (max_cap - min_cap < max_cap >> 3) + return 0; + + /* Bring task utilization in sync with prev_cpu */ + sync_entity_load_avg(&p->se); + + return min_cap * 1024 < task_util(p) * capacity_margin; +} + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, @@ -5383,7 +5923,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); - want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) + && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); } rcu_read_lock(); @@ -5409,13 +5950,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (affine_sd) { sd = NULL; /* Prefer wake_affine over balance flags */ - if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync)) new_cpu = cpu; } if (!sd) { if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ - new_cpu = select_idle_sibling(p, new_cpu); + new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); } else while (sd) { struct sched_group *group; @@ -5939,7 +6480,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * * The adjacency matrix of the resulting graph is given by: * - * log_2 n + * log_2 n * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) * k = 0 * @@ -5985,7 +6526,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * * [XXX write more on how we solve this.. _after_ merging pjt's patches that * rewrite all of this once again.] - */ + */ static unsigned long __read_mostly max_load_balance_interval = HZ/10; @@ -6133,7 +6674,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { int cpu; - schedstat_inc(p, se.statistics.nr_failed_migrations_affine); + schedstat_inc(p->se.statistics.nr_failed_migrations_affine); env->flags |= LBF_SOME_PINNED; @@ -6164,7 +6705,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) env->flags &= ~LBF_ALL_PINNED; if (task_running(env->src_rq, p)) { - schedstat_inc(p, se.statistics.nr_failed_migrations_running); + schedstat_inc(p->se.statistics.nr_failed_migrations_running); return 0; } @@ -6181,13 +6722,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (tsk_cache_hot <= 0 || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { if (tsk_cache_hot == 1) { - schedstat_inc(env->sd, lb_hot_gained[env->idle]); - schedstat_inc(p, se.statistics.nr_forced_migrations); + schedstat_inc(env->sd->lb_hot_gained[env->idle]); + schedstat_inc(p->se.statistics.nr_forced_migrations); } return 1; } - schedstat_inc(p, se.statistics.nr_failed_migrations_hot); + schedstat_inc(p->se.statistics.nr_failed_migrations_hot); return 0; } @@ -6227,7 +6768,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) * so we can safely collect stats here rather than * inside detach_tasks(). */ - schedstat_inc(env->sd, lb_gained[env->idle]); + schedstat_inc(env->sd->lb_gained[env->idle]); return p; } return NULL; @@ -6319,7 +6860,7 @@ next: * so we can safely collect detach_one_task() stats here rather * than inside detach_one_task(). */ - schedstat_add(env->sd, lb_gained[env->idle], detached); + schedstat_add(env->sd->lb_gained[env->idle], detached); return detached; } @@ -6390,6 +6931,10 @@ static void update_blocked_averages(int cpu) if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) update_tg_load_avg(cfs_rq, 0); + + /* Propagate pending load changes to the parent */ + if (cfs_rq->tg->se[cpu]) + update_load_avg(cfs_rq->tg->se[cpu], 0); } raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -6594,13 +7139,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; + sdg->sgc->min_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity; + unsigned long capacity, min_capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -6613,6 +7159,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } capacity = 0; + min_capacity = ULONG_MAX; if (child->flags & SD_OVERLAP) { /* @@ -6637,26 +7184,31 @@ void update_group_capacity(struct sched_domain *sd, int cpu) */ if (unlikely(!rq->sd)) { capacity += capacity_of(cpu); - continue; + } else { + sgc = rq->sd->groups->sgc; + capacity += sgc->capacity; } - sgc = rq->sd->groups->sgc; - capacity += sgc->capacity; + min_capacity = min(capacity, min_capacity); } } else { /* * !SD_OVERLAP domains can assume that child groups * span the current group. - */ + */ group = child->groups; do { - capacity += group->sgc->capacity; + struct sched_group_capacity *sgc = group->sgc; + + capacity += sgc->capacity; + min_capacity = min(sgc->min_capacity, min_capacity); group = group->next; } while (group != child->groups); } sdg->sgc->capacity = capacity; + sdg->sgc->min_capacity = min_capacity; } /* @@ -6679,8 +7231,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) * cpumask covering 1 cpu of the first group and 3 cpus of the second group. * Something like: * - * { 0 1 2 3 } { 4 5 6 7 } - * * * * * + * { 0 1 2 3 } { 4 5 6 7 } + * * * * * * * If we were to balance group-wise we'd place two tasks in the first group and * two tasks in the second group. Clearly this is undesired as it will overload @@ -6751,6 +7303,17 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) return false; } +/* + * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller + * per-CPU capacity than sched_group ref. + */ +static inline bool +group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +{ + return sg->sgc->min_capacity * capacity_margin < + ref->sgc->min_capacity * 1024; +} + static inline enum group_type group_classify(struct sched_group *group, struct sg_lb_stats *sgs) @@ -6854,6 +7417,20 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->avg_load <= busiest->avg_load) return false; + if (!(env->sd->flags & SD_ASYM_CPUCAPACITY)) + goto asym_packing; + + /* + * Candidate sg has no more than one task per CPU and + * has higher per-CPU capacity. Migrating tasks to less + * capable CPUs may harm throughput. Maximize throughput, + * power/energy consequences are not considered. + */ + if (sgs->sum_nr_running <= sgs->group_weight && + group_smaller_cpu_capacity(sds->local, sg)) + return false; + +asym_packing: /* This is the busiest node in its class. */ if (!(env->sd->flags & SD_ASYM_PACKING)) return true; @@ -6862,16 +7439,18 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (env->idle == CPU_NOT_IDLE) return true; /* - * ASYM_PACKING needs to move all the work to the lowest - * numbered CPUs in the group, therefore mark all groups - * higher than ourself as busy. + * ASYM_PACKING needs to move all the work to the highest + * prority CPUs in the group, therefore mark all groups + * of lower priority than ourself as busy. */ - if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) { + if (sgs->sum_nr_running && + sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) { if (!sds->busiest) return true; - /* Prefer to move from highest possible cpu's work */ - if (group_first_cpu(sds->busiest) < group_first_cpu(sg)) + /* Prefer to move from lowest priority cpu's work */ + if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, + sg->asym_prefer_cpu)) return true; } @@ -7023,8 +7602,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) if (!sds->busiest) return 0; - busiest_cpu = group_first_cpu(sds->busiest); - if (env->dst_cpu > busiest_cpu) + busiest_cpu = sds->busiest->asym_prefer_cpu; + if (sched_asym_prefer(busiest_cpu, env->dst_cpu)) return 0; env->imbalance = DIV_ROUND_CLOSEST( @@ -7147,7 +7726,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; if (load_above_capacity > busiest->group_capacity) { load_above_capacity -= busiest->group_capacity; - load_above_capacity *= NICE_0_LOAD; + load_above_capacity *= scale_load_down(NICE_0_LOAD); load_above_capacity /= busiest->group_capacity; } else load_above_capacity = ~0UL; @@ -7354,9 +7933,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, */ #define MAX_PINNED_INTERVAL 512 -/* Working cpumask for load_balance and load_balance_newidle. */ -DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); - static int need_active_balance(struct lb_env *env) { struct sched_domain *sd = env->sd; @@ -7365,10 +7941,11 @@ static int need_active_balance(struct lb_env *env) /* * ASYM_PACKING needs to force migrate tasks from busy but - * higher numbered CPUs in order to pack all tasks in the - * lowest numbered CPUs. + * lower priority CPUs in order to pack all tasks in the + * highest priority CPUs. */ - if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu) + if ((sd->flags & SD_ASYM_PACKING) && + sched_asym_prefer(env->dst_cpu, env->src_cpu)) return 1; } @@ -7460,7 +8037,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, cpumask_copy(cpus, cpu_active_mask); - schedstat_inc(sd, lb_count[idle]); + schedstat_inc(sd->lb_count[idle]); redo: if (!should_we_balance(&env)) { @@ -7470,19 +8047,19 @@ redo: group = find_busiest_group(&env); if (!group) { - schedstat_inc(sd, lb_nobusyg[idle]); + schedstat_inc(sd->lb_nobusyg[idle]); goto out_balanced; } busiest = find_busiest_queue(&env, group); if (!busiest) { - schedstat_inc(sd, lb_nobusyq[idle]); + schedstat_inc(sd->lb_nobusyq[idle]); goto out_balanced; } BUG_ON(busiest == env.dst_rq); - schedstat_add(sd, lb_imbalance[idle], env.imbalance); + schedstat_add(sd->lb_imbalance[idle], env.imbalance); env.src_cpu = busiest->cpu; env.src_rq = busiest; @@ -7589,7 +8166,7 @@ more_balance: } if (!ld_moved) { - schedstat_inc(sd, lb_failed[idle]); + schedstat_inc(sd->lb_failed[idle]); /* * Increment the failure counter only on periodic balance. * We do not want newidle balance, which can be very @@ -7672,7 +8249,7 @@ out_all_pinned: * we can't migrate them. Let the imbalance flag set so parent level * can try to migrate them. */ - schedstat_inc(sd, lb_balanced[idle]); + schedstat_inc(sd->lb_balanced[idle]); sd->nr_balance_failed = 0; @@ -7704,11 +8281,12 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) } static inline void -update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) +update_next_balance(struct sched_domain *sd, unsigned long *next_balance) { unsigned long interval, next; - interval = get_sd_balance_interval(sd, cpu_busy); + /* used by idle balance, so cpu_busy = 0 */ + interval = get_sd_balance_interval(sd, 0); next = sd->last_balance + interval; if (time_after(*next_balance, next)) @@ -7738,7 +8316,7 @@ static int idle_balance(struct rq *this_rq) rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) - update_next_balance(sd, 0, &next_balance); + update_next_balance(sd, &next_balance); rcu_read_unlock(); goto out; @@ -7756,7 +8334,7 @@ static int idle_balance(struct rq *this_rq) continue; if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { - update_next_balance(sd, 0, &next_balance); + update_next_balance(sd, &next_balance); break; } @@ -7774,7 +8352,7 @@ static int idle_balance(struct rq *this_rq) curr_cost += domain_cost; } - update_next_balance(sd, 0, &next_balance); + update_next_balance(sd, &next_balance); /* * Stop searching for tasks to pull if there are @@ -7864,15 +8442,15 @@ static int active_load_balance_cpu_stop(void *data) .idle = CPU_IDLE, }; - schedstat_inc(sd, alb_count); + schedstat_inc(sd->alb_count); p = detach_one_task(&env); if (p) { - schedstat_inc(sd, alb_pushed); + schedstat_inc(sd->alb_pushed); /* Active balancing done, reset the failure counter. */ sd->nr_balance_failed = 0; } else { - schedstat_inc(sd, alb_failed); + schedstat_inc(sd->alb_failed); } } rcu_read_unlock(); @@ -7964,13 +8542,13 @@ static inline void set_cpu_sd_state_busy(void) int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_busy, cpu)); + sd = rcu_dereference(per_cpu(sd_llc, cpu)); if (!sd || !sd->nohz_idle) goto unlock; sd->nohz_idle = 0; - atomic_inc(&sd->groups->sgc->nr_busy_cpus); + atomic_inc(&sd->shared->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -7981,13 +8559,13 @@ void set_cpu_sd_state_idle(void) int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_busy, cpu)); + sd = rcu_dereference(per_cpu(sd_llc, cpu)); if (!sd || sd->nohz_idle) goto unlock; sd->nohz_idle = 1; - atomic_dec(&sd->groups->sgc->nr_busy_cpus); + atomic_dec(&sd->shared->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -8214,9 +8792,9 @@ end: static inline bool nohz_kick_needed(struct rq *rq) { unsigned long now = jiffies; + struct sched_domain_shared *sds; struct sched_domain *sd; - struct sched_group_capacity *sgc; - int nr_busy, cpu = rq->cpu; + int nr_busy, i, cpu = rq->cpu; bool kick = false; if (unlikely(rq->idle_balance)) @@ -8243,11 +8821,13 @@ static inline bool nohz_kick_needed(struct rq *rq) return true; rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_busy, cpu)); - if (sd) { - sgc = sd->groups->sgc; - nr_busy = atomic_read(&sgc->nr_busy_cpus); - + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + if (sds) { + /* + * XXX: write a coherent comment on why we do this. + * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com + */ + nr_busy = atomic_read(&sds->nr_busy_cpus); if (nr_busy > 1) { kick = true; goto unlock; @@ -8265,12 +8845,18 @@ static inline bool nohz_kick_needed(struct rq *rq) } sd = rcu_dereference(per_cpu(sd_asym, cpu)); - if (sd && (cpumask_first_and(nohz.idle_cpus_mask, - sched_domain_span(sd)) < cpu)) { - kick = true; - goto unlock; - } + if (sd) { + for_each_cpu(i, sched_domain_span(sd)) { + if (i == cpu || + !cpumask_test_cpu(i, nohz.idle_cpus_mask)) + continue; + if (sched_asym_prefer(i, cpu)) { + kick = true; + goto unlock; + } + } + } unlock: rcu_read_unlock(); return kick; @@ -8283,7 +8869,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } * run_rebalance_domains is triggered when needed from the scheduler tick. * Also triggered for nohz idle balancing (with nohz_balancing_kick set). */ -static void run_rebalance_domains(struct softirq_action *h) +static __latent_entropy void run_rebalance_domains(struct softirq_action *h) { struct rq *this_rq = this_rq(); enum cpu_idle_type idle = this_rq->idle_balance ? @@ -8436,12 +9022,65 @@ static inline bool vruntime_normalized(struct task_struct *p) return false; } +#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * Propagate the changes of the sched_entity across the tg tree to make it + * visible to the root + */ +static void propagate_entity_cfs_rq(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq; + + /* Start to propagate at parent */ + se = se->parent; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + if (cfs_rq_throttled(cfs_rq)) + break; + + update_load_avg(se, UPDATE_TG); + } +} +#else +static void propagate_entity_cfs_rq(struct sched_entity *se) { } +#endif + +static void detach_entity_cfs_rq(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* Catch up with the cfs_rq and remove our load when we leave */ + update_load_avg(se, 0); + detach_entity_load_avg(cfs_rq, se); + update_tg_load_avg(cfs_rq, false); + propagate_entity_cfs_rq(se); +} + +static void attach_entity_cfs_rq(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * Since the real-depth could have been changed (only FAIR + * class maintain depth value), reset depth properly. + */ + se->depth = se->parent ? se->parent->depth + 1 : 0; +#endif + + /* Synchronize entity with its cfs_rq */ + update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); + attach_entity_load_avg(cfs_rq, se); + update_tg_load_avg(cfs_rq, false); + propagate_entity_cfs_rq(se); +} + static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 now = cfs_rq_clock_task(cfs_rq); - int tg_update; if (!vruntime_normalized(p)) { /* @@ -8452,33 +9091,15 @@ static void detach_task_cfs_rq(struct task_struct *p) se->vruntime -= cfs_rq->min_vruntime; } - /* Catch up with the cfs_rq and remove our load when we leave */ - tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); - detach_entity_load_avg(cfs_rq, se); - if (tg_update) - update_tg_load_avg(cfs_rq, false); + detach_entity_cfs_rq(se); } static void attach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 now = cfs_rq_clock_task(cfs_rq); - int tg_update; - -#ifdef CONFIG_FAIR_GROUP_SCHED - /* - * Since the real-depth could have been changed (only FAIR - * class maintain depth value), reset depth properly. - */ - se->depth = se->parent ? se->parent->depth + 1 : 0; -#endif - /* Synchronize task with its cfs_rq */ - tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); - attach_entity_load_avg(cfs_rq, se); - if (tg_update) - update_tg_load_avg(cfs_rq, false); + attach_entity_cfs_rq(se); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; @@ -8532,6 +9153,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif #ifdef CONFIG_SMP +#ifdef CONFIG_FAIR_GROUP_SCHED + cfs_rq->propagate_avg = 0; +#endif atomic_long_set(&cfs_rq->removed_load_avg, 0); atomic_long_set(&cfs_rq->removed_util_avg, 0); #endif @@ -8592,7 +9216,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct sched_entity *se; struct cfs_rq *cfs_rq; - struct rq *rq; int i; tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8607,8 +9230,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_bandwidth(tg_cfs_bandwidth(tg)); for_each_possible_cpu(i) { - rq = cpu_rq(i); - cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) @@ -8643,7 +9264,7 @@ void online_fair_sched_group(struct task_group *tg) se = tg->se[i]; raw_spin_lock_irq(&rq->lock); - post_init_entity_util_avg(se); + attach_entity_cfs_rq(se); sync_throttle(tg, i); raw_spin_unlock_irq(&rq->lock); } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 9fb873cfc75c..6a4bae0a649d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -16,6 +16,9 @@ #include "sched.h" +/* Linker adds these: start and end of __cpuidle functions */ +extern char __cpuidle_text_start[], __cpuidle_text_end[]; + /** * sched_idle_set_state - Record idle state for the current CPU. * @idle_state: State to record. @@ -53,7 +56,7 @@ static int __init cpu_idle_nopoll_setup(char *__unused) __setup("hlt", cpu_idle_nopoll_setup); #endif -static inline int cpu_idle_poll(void) +static noinline int __cpuidle cpu_idle_poll(void) { rcu_idle_enter(); trace_cpu_idle_rcuidle(0, smp_processor_id()); @@ -84,7 +87,7 @@ void __weak arch_cpu_idle(void) * * To use when the cpuidle framework cannot be used. */ -void default_idle_call(void) +void __cpuidle default_idle_call(void) { if (current_clr_polling_and_test()) { local_irq_enable(); @@ -161,11 +164,14 @@ static void cpuidle_idle_call(void) * timekeeping to prevent timer interrupts from kicking us out of idle * until a proper wakeup interrupt happens. */ - if (idle_should_freeze()) { - entered_state = cpuidle_enter_freeze(drv, dev); - if (entered_state > 0) { - local_irq_enable(); - goto exit_idle; + + if (idle_should_freeze() || dev->use_deepest_state) { + if (idle_should_freeze()) { + entered_state = cpuidle_enter_freeze(drv, dev); + if (entered_state > 0) { + local_irq_enable(); + goto exit_idle; + } } next_state = cpuidle_find_deepest_state(drv, dev); @@ -199,77 +205,122 @@ exit_idle: * * Called with polling cleared. */ -static void cpu_idle_loop(void) +static void do_idle(void) { - int cpu = smp_processor_id(); + /* + * If the arch has a polling bit, we maintain an invariant: + * + * Our polling bit is clear if we're not scheduled (i.e. if rq->curr != + * rq->idle). This means that, if rq->idle has the polling bit set, + * then setting need_resched is guaranteed to cause the CPU to + * reschedule. + */ + + __current_set_polling(); + tick_nohz_idle_enter(); + + while (!need_resched()) { + check_pgt_cache(); + rmb(); + + if (cpu_is_offline(smp_processor_id())) { + cpuhp_report_idle_dead(); + arch_cpu_idle_dead(); + } + + local_irq_disable(); + arch_cpu_idle_enter(); - while (1) { /* - * If the arch has a polling bit, we maintain an invariant: - * - * Our polling bit is clear if we're not scheduled (i.e. if - * rq->curr != rq->idle). This means that, if rq->idle has - * the polling bit set, then setting need_resched is - * guaranteed to cause the cpu to reschedule. + * In poll mode we reenable interrupts and spin. Also if we + * detected in the wakeup from idle path that the tick + * broadcast device expired for us, we don't want to go deep + * idle as we know that the IPI is going to arrive right away. */ + if (cpu_idle_force_poll || tick_check_broadcast_expired()) + cpu_idle_poll(); + else + cpuidle_idle_call(); + arch_cpu_idle_exit(); + } - __current_set_polling(); - quiet_vmstat(); - tick_nohz_idle_enter(); + /* + * Since we fell out of the loop above, we know TIF_NEED_RESCHED must + * be set, propagate it into PREEMPT_NEED_RESCHED. + * + * This is required because for polling idle loops we will not have had + * an IPI to fold the state for us. + */ + preempt_set_need_resched(); + tick_nohz_idle_exit(); + __current_clr_polling(); - while (!need_resched()) { - check_pgt_cache(); - rmb(); + /* + * We promise to call sched_ttwu_pending() and reschedule if + * need_resched() is set while polling is set. That means that clearing + * polling needs to be visible before doing these things. + */ + smp_mb__after_atomic(); - if (cpu_is_offline(cpu)) { - cpuhp_report_idle_dead(); - arch_cpu_idle_dead(); - } + sched_ttwu_pending(); + schedule_preempt_disabled(); +} - local_irq_disable(); - arch_cpu_idle_enter(); - - /* - * In poll mode we reenable interrupts and spin. - * - * Also if we detected in the wakeup from idle - * path that the tick broadcast device expired - * for us, we don't want to go deep idle as we - * know that the IPI is going to arrive right - * away - */ - if (cpu_idle_force_poll || tick_check_broadcast_expired()) - cpu_idle_poll(); - else - cpuidle_idle_call(); - - arch_cpu_idle_exit(); - } +bool cpu_in_idle(unsigned long pc) +{ + return pc >= (unsigned long)__cpuidle_text_start && + pc < (unsigned long)__cpuidle_text_end; +} - /* - * Since we fell out of the loop above, we know - * TIF_NEED_RESCHED must be set, propagate it into - * PREEMPT_NEED_RESCHED. - * - * This is required because for polling idle loops we will - * not have had an IPI to fold the state for us. - */ - preempt_set_need_resched(); - tick_nohz_idle_exit(); - __current_clr_polling(); +struct idle_timer { + struct hrtimer timer; + int done; +}; - /* - * We promise to call sched_ttwu_pending and reschedule - * if need_resched is set while polling is set. That - * means that clearing polling needs to be visible - * before doing these things. - */ - smp_mb__after_atomic(); +static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer) +{ + struct idle_timer *it = container_of(timer, struct idle_timer, timer); - sched_ttwu_pending(); - schedule_preempt_disabled(); - } + WRITE_ONCE(it->done, 1); + set_tsk_need_resched(current); + + return HRTIMER_NORESTART; +} + +void play_idle(unsigned long duration_ms) +{ + struct idle_timer it; + + /* + * Only FIFO tasks can disable the tick since they don't need the forced + * preemption. + */ + WARN_ON_ONCE(current->policy != SCHED_FIFO); + WARN_ON_ONCE(current->nr_cpus_allowed != 1); + WARN_ON_ONCE(!(current->flags & PF_KTHREAD)); + WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY)); + WARN_ON_ONCE(!duration_ms); + + rcu_sleep_check(); + preempt_disable(); + current->flags |= PF_IDLE; + cpuidle_use_deepest_state(true); + + it.done = 0; + hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + it.timer.function = idle_inject_timer_fn; + hrtimer_start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED); + + while (!READ_ONCE(it.done)) + do_idle(); + + cpuidle_use_deepest_state(false); + current->flags &= ~PF_IDLE; + + preempt_fold_need_resched(); + preempt_enable(); } +EXPORT_SYMBOL_GPL(play_idle); void cpu_startup_entry(enum cpuhp_state state) { @@ -290,5 +341,6 @@ void cpu_startup_entry(enum cpuhp_state state) #endif arch_cpu_idle_prepare(); cpuhp_online_idle(state); - cpu_idle_loop(); + while (1) + do_idle(); } diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 2ce5458bbe1d..5405d3feb112 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -27,8 +27,8 @@ static struct task_struct * pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { put_prev_task(rq, prev); - - schedstat_inc(rq, sched_goidle); + update_idle_core(rq); + schedstat_inc(rq->sched_goidle); return rq->idle; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index d5690b722691..2516b8df6dbb 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -957,9 +957,8 @@ static void update_curr_rt(struct rq *rq) if (unlikely((s64)delta_exec <= 0)) return; - /* Kick cpufreq (see the comment in linux/cpufreq.h). */ - if (cpu_of(rq) == smp_processor_id()) - cpufreq_trigger_update(rq_clock(rq)); + /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT); schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c64fc5114004..7b34c7826ca5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2,6 +2,7 @@ #include <linux/sched.h> #include <linux/sched/sysctl.h> #include <linux/sched/rt.h> +#include <linux/u64_stats_sync.h> #include <linux/sched/deadline.h> #include <linux/binfmts.h> #include <linux/mutex.h> @@ -15,6 +16,12 @@ #include "cpudeadline.h" #include "cpuacct.h" +#ifdef CONFIG_SCHED_DEBUG +#define SCHED_WARN_ON(x) WARN_ONCE(x, #x) +#else +#define SCHED_WARN_ON(x) ((void)(x)) +#endif + struct rq; struct cpuidle_state; @@ -397,6 +404,7 @@ struct cfs_rq { unsigned long runnable_load_avg; #ifdef CONFIG_FAIR_GROUP_SCHED unsigned long tg_load_avg_contrib; + unsigned long propagate_avg; #endif atomic_long_t removed_load_avg, removed_util_avg; #ifndef CONFIG_64BIT @@ -532,6 +540,11 @@ struct dl_rq { #ifdef CONFIG_SMP +static inline bool sched_asym_prefer(int a, int b) +{ + return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); +} + /* * We add the notion of a root-domain which will be used to define per-domain * variables. Each exclusive cpuset essentially defines an island domain by @@ -565,6 +578,8 @@ struct root_domain { */ cpumask_var_t rto_mask; struct cpupri cpupri; + + unsigned long max_cpu_capacity; }; extern struct root_domain def_root_domain; @@ -597,7 +612,6 @@ struct rq { #ifdef CONFIG_SMP unsigned long last_load_update_tick; #endif /* CONFIG_SMP */ - u64 nohz_stamp; unsigned long nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL @@ -615,6 +629,7 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; + struct list_head *tmp_alone_branch; #endif /* CONFIG_FAIR_GROUP_SCHED */ /* @@ -723,6 +738,23 @@ static inline int cpu_of(struct rq *rq) #endif } + +#ifdef CONFIG_SCHED_SMT + +extern struct static_key_false sched_smt_present; + +extern void __update_idle_core(struct rq *rq); + +static inline void update_idle_core(struct rq *rq) +{ + if (static_branch_unlikely(&sched_smt_present)) + __update_idle_core(rq); +} + +#else +static inline void update_idle_core(struct rq *rq) { } +#endif + DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) @@ -857,8 +889,8 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) DECLARE_PER_CPU(struct sched_domain *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); +DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain *, sd_numa); -DECLARE_PER_CPU(struct sched_domain *, sd_busy); DECLARE_PER_CPU(struct sched_domain *, sd_asym); struct sched_group_capacity { @@ -867,13 +899,10 @@ struct sched_group_capacity { * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity * for a single CPU. */ - unsigned int capacity; + unsigned long capacity; + unsigned long min_capacity; /* Min per-CPU capacity in group */ unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ - /* - * Number of busy cpus in this group. - */ - atomic_t nr_busy_cpus; unsigned long cpumask[0]; /* iteration mask */ }; @@ -884,6 +913,7 @@ struct sched_group { unsigned int group_weight; struct sched_group_capacity *sgc; + int asym_prefer_cpu; /* cpu of highest priority in group */ /* * The CPUs this group covers. @@ -1000,7 +1030,11 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) * per-task data have been completed by this moment. */ smp_wmb(); +#ifdef CONFIG_THREAD_INFO_IN_TASK + p->cpu = cpu; +#else task_thread_info(p)->cpu = cpu; +#endif p->wake_cpu = cpu; #endif } @@ -1260,6 +1294,11 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev) prev->sched_class->put_prev_task(rq, prev); } +static inline void set_curr_task(struct rq *rq, struct task_struct *curr) +{ + curr->sched_class->set_curr_task(rq); +} + #define sched_class_highest (&stop_sched_class) #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) @@ -1290,7 +1329,7 @@ static inline void idle_set_state(struct rq *rq, static inline struct cpuidle_state *idle_get_state(struct rq *rq) { - WARN_ON(!rcu_read_lock_held()); + SCHED_WARN_ON(!rcu_read_lock_held()); return rq->idle_state; } #else @@ -1710,52 +1749,28 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { } #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING +struct irqtime { + u64 hardirq_time; + u64 softirq_time; + u64 irq_start_time; + struct u64_stats_sync sync; +}; -DECLARE_PER_CPU(u64, cpu_hardirq_time); -DECLARE_PER_CPU(u64, cpu_softirq_time); - -#ifndef CONFIG_64BIT -DECLARE_PER_CPU(seqcount_t, irq_time_seq); - -static inline void irq_time_write_begin(void) -{ - __this_cpu_inc(irq_time_seq.sequence); - smp_wmb(); -} - -static inline void irq_time_write_end(void) -{ - smp_wmb(); - __this_cpu_inc(irq_time_seq.sequence); -} +DECLARE_PER_CPU(struct irqtime, cpu_irqtime); static inline u64 irq_time_read(int cpu) { - u64 irq_time; - unsigned seq; + struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); + unsigned int seq; + u64 total; do { - seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); - irq_time = per_cpu(cpu_softirq_time, cpu) + - per_cpu(cpu_hardirq_time, cpu); - } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); + seq = __u64_stats_fetch_begin(&irqtime->sync); + total = irqtime->softirq_time + irqtime->hardirq_time; + } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); - return irq_time; -} -#else /* CONFIG_64BIT */ -static inline void irq_time_write_begin(void) -{ + return total; } - -static inline void irq_time_write_end(void) -{ -} - -static inline u64 irq_time_read(int cpu) -{ - return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); -} -#endif /* CONFIG_64BIT */ #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ #ifdef CONFIG_CPU_FREQ @@ -1763,27 +1778,13 @@ DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); /** * cpufreq_update_util - Take a note about CPU utilization changes. - * @time: Current time. - * @util: Current utilization. - * @max: Utilization ceiling. + * @rq: Runqueue to carry out the update for. + * @flags: Update reason flags. * - * This function is called by the scheduler on every invocation of - * update_load_avg() on the CPU whose utilization is being updated. + * This function is called by the scheduler on the CPU whose utilization is + * being updated. * * It can only be called from RCU-sched read-side critical sections. - */ -static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) -{ - struct update_util_data *data; - - data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); - if (data) - data->func(data, time, util, max); -} - -/** - * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed. - * @time: Current time. * * The way cpufreq is currently arranged requires it to evaluate the CPU * performance state (frequency/voltage) on a regular basis to prevent it from @@ -1797,13 +1798,23 @@ static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned lo * but that really is a band-aid. Going forward it should be replaced with * solutions targeted more specifically at RT and DL tasks. */ -static inline void cpufreq_trigger_update(u64 time) +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) +{ + struct update_util_data *data; + + data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); + if (data) + data->func(data, rq_clock(rq), flags); +} + +static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) { - cpufreq_update_util(time, ULONG_MAX, 0); + if (cpu_of(rq) == smp_processor_id()) + cpufreq_update_util(rq, flags); } #else -static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {} -static inline void cpufreq_trigger_update(u64 time) {} +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} +static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ #ifdef arch_scale_freq_capacity diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 78955cbea31c..34659a853505 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -29,11 +29,12 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) if (rq) rq->rq_sched_info.run_delay += delta; } -# define schedstat_enabled() static_branch_unlikely(&sched_schedstats) -# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) -# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) -# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) -# define schedstat_val(rq, field) ((schedstat_enabled()) ? (rq)->field : 0) +#define schedstat_enabled() static_branch_unlikely(&sched_schedstats) +#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) +#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) +#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) +#define schedstat_val(var) (var) +#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) #else /* !CONFIG_SCHEDSTATS */ static inline void @@ -45,12 +46,13 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) static inline void rq_sched_info_depart(struct rq *rq, unsigned long long delta) {} -# define schedstat_enabled() 0 -# define schedstat_inc(rq, field) do { } while (0) -# define schedstat_add(rq, field, amt) do { } while (0) -# define schedstat_set(var, val) do { } while (0) -# define schedstat_val(rq, field) 0 -#endif +#define schedstat_enabled() 0 +#define schedstat_inc(var) do { } while (0) +#define schedstat_add(var, amt) do { } while (0) +#define schedstat_set(var, val) do { } while (0) +#define schedstat_val(var) 0 +#define schedstat_val_or_zero(var) 0 +#endif /* CONFIG_SCHEDSTATS */ #ifdef CONFIG_SCHED_INFO static inline void sched_info_reset_dequeued(struct task_struct *t) diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index f15d6b6a538a..9453efe9b25a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -196,27 +196,48 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) } EXPORT_SYMBOL(prepare_to_wait_exclusive); -long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) +void init_wait_entry(wait_queue_t *wait, int flags) { - unsigned long flags; - - if (signal_pending_state(state, current)) - return -ERESTARTSYS; - + wait->flags = flags; wait->private = current; wait->func = autoremove_wake_function; + INIT_LIST_HEAD(&wait->task_list); +} +EXPORT_SYMBOL(init_wait_entry); + +long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + long ret = 0; spin_lock_irqsave(&q->lock, flags); - if (list_empty(&wait->task_list)) { - if (wait->flags & WQ_FLAG_EXCLUSIVE) - __add_wait_queue_tail(q, wait); - else - __add_wait_queue(q, wait); + if (unlikely(signal_pending_state(state, current))) { + /* + * Exclusive waiter must not fail if it was selected by wakeup, + * it should "consume" the condition we were waiting for. + * + * The caller will recheck the condition and return success if + * we were already woken up, we can not miss the event because + * wakeup locks/unlocks the same q->lock. + * + * But we need to ensure that set-condition + wakeup after that + * can't see us, it should wake up another exclusive waiter if + * we fail. + */ + list_del_init(&wait->task_list); + ret = -ERESTARTSYS; + } else { + if (list_empty(&wait->task_list)) { + if (wait->flags & WQ_FLAG_EXCLUSIVE) + __add_wait_queue_tail(q, wait); + else + __add_wait_queue(q, wait); + } + set_current_state(state); } - set_current_state(state); spin_unlock_irqrestore(&q->lock, flags); - return 0; + return ret; } EXPORT_SYMBOL(prepare_to_wait_event); @@ -255,39 +276,6 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) } EXPORT_SYMBOL(finish_wait); -/** - * abort_exclusive_wait - abort exclusive waiting in a queue - * @q: waitqueue waited on - * @wait: wait descriptor - * @mode: runstate of the waiter to be woken - * @key: key to identify a wait bit queue or %NULL - * - * Sets current thread back to running state and removes - * the wait descriptor from the given waitqueue if still - * queued. - * - * Wakes up the next waiter if the caller is concurrently - * woken up through the queue. - * - * This prevents waiter starvation where an exclusive waiter - * aborts and is woken up concurrently and no one wakes up - * the next waiter. - */ -void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, - unsigned int mode, void *key) -{ - unsigned long flags; - - __set_current_state(TASK_RUNNING); - spin_lock_irqsave(&q->lock, flags); - if (!list_empty(&wait->task_list)) - list_del_init(&wait->task_list); - else if (waitqueue_active(q)) - __wake_up_locked_key(q, mode, key); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(abort_exclusive_wait); - int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); @@ -425,20 +413,29 @@ int __sched __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, wait_bit_action_f *action, unsigned mode) { - do { - int ret; + int ret = 0; + for (;;) { prepare_to_wait_exclusive(wq, &q->wait, mode); - if (!test_bit(q->key.bit_nr, q->key.flags)) - continue; - ret = action(&q->key, mode); - if (!ret) - continue; - abort_exclusive_wait(wq, &q->wait, mode, &q->key); - return ret; - } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); - finish_wait(wq, &q->wait); - return 0; + if (test_bit(q->key.bit_nr, q->key.flags)) { + ret = action(&q->key, mode); + /* + * See the comment in prepare_to_wait_event(). + * finish_wait() does not necessarily takes wq->lock, + * but test_and_set_bit() implies mb() which pairs with + * smp_mb__after_atomic() before wake_up_page(). + */ + if (ret) + finish_wait(wq, &q->wait); + } + if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) { + if (!ret) + finish_wait(wq, &q->wait); + return 0; + } else if (ret) { + return ret; + } + } } EXPORT_SYMBOL(__wait_on_bit_lock); @@ -483,16 +480,6 @@ void wake_up_bit(void *word, int bit) } EXPORT_SYMBOL(wake_up_bit); -wait_queue_head_t *bit_waitqueue(void *word, int bit) -{ - const int shift = BITS_PER_LONG == 32 ? 5 : 6; - const struct zone *zone = page_zone(virt_to_page(word)); - unsigned long val = (unsigned long)word << shift | bit; - - return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; -} -EXPORT_SYMBOL(bit_waitqueue); - /* * Manipulate the atomic_t address to produce a better bit waitqueue table hash * index (we're keying off bit -1, but that would produce a horrible hash diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 0db7c8a2afe2..f7ce79a46050 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -41,8 +41,7 @@ * outside of a lifetime-guarded section. In general, this * is only needed for handling filters shared across tasks. * @prev: points to a previously installed, or inherited, filter - * @len: the number of instructions in the program - * @insnsi: the BPF program instructions to evaluate + * @prog: the BPF program to evaluate * * seccomp_filter objects are organized in a tree linked via the @prev * pointer. For any task, it appears to be a singly-linked list starting @@ -168,8 +167,8 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) } /** - * seccomp_run_filters - evaluates all seccomp filters against @syscall - * @syscall: number of the current system call + * seccomp_run_filters - evaluates all seccomp filters against @sd + * @sd: optional seccomp data to be passed to filters * * Returns valid seccomp BPF response codes. */ @@ -195,7 +194,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd) * value always takes priority (ignoring the DATA). */ for (; f; f = f->prev) { - u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd); + u32 cur_ret = BPF_PROG_RUN(f->prog, sd); if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) ret = cur_ret; diff --git a/kernel/signal.c b/kernel/signal.c index af21afc00d08..29a410780aa9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -427,6 +427,7 @@ void flush_signals(struct task_struct *t) spin_unlock_irqrestore(&t->sighand->siglock, flags); } +#ifdef CONFIG_POSIX_TIMERS static void __flush_itimer_signals(struct sigpending *pending) { sigset_t signal, retain; @@ -460,6 +461,7 @@ void flush_itimer_signals(void) __flush_itimer_signals(&tsk->signal->shared_pending); spin_unlock_irqrestore(&tsk->sighand->siglock, flags); } +#endif void ignore_signals(struct task_struct *t) { @@ -567,6 +569,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) if (!signr) { signr = __dequeue_signal(&tsk->signal->shared_pending, mask, info); +#ifdef CONFIG_POSIX_TIMERS /* * itimer signal ? * @@ -590,6 +593,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) hrtimer_restart(tmr); } } +#endif } recalc_sigpending(); @@ -611,6 +615,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) */ current->jobctl |= JOBCTL_STOP_DEQUEUED; } +#ifdef CONFIG_POSIX_TIMERS if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { /* * Release the siglock to ensure proper locking order @@ -622,6 +627,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) do_schedule_next_timer(info); spin_lock(&tsk->sighand->siglock); } +#endif return signr; } @@ -3044,6 +3050,11 @@ void kernel_sigaction(int sig, __sighandler_t action) } EXPORT_SYMBOL(kernel_sigaction); +void __weak sigaction_compat_abi(struct k_sigaction *act, + struct k_sigaction *oact) +{ +} + int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { struct task_struct *p = current, *t; @@ -3059,6 +3070,8 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) if (oact) *oact = *k; + sigaction_compat_abi(act, oact); + if (act) { sigdelsetmask(&act->sa.sa_mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); diff --git a/kernel/smp.c b/kernel/smp.c index 3aa642d39c03..77fcdb9f2775 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -3,6 +3,9 @@ * * (C) Jens Axboe <jens.axboe@oracle.com> 2008 */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/irq_work.h> #include <linux/rcupdate.h> #include <linux/rculist.h> @@ -14,6 +17,7 @@ #include <linux/smp.h> #include <linux/cpu.h> #include <linux/sched.h> +#include <linux/hypervisor.h> #include "smpboot.h" @@ -542,19 +546,17 @@ void __init setup_nr_cpu_ids(void) nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; } -void __weak smp_announce(void) -{ - printk(KERN_INFO "Brought up %d CPUs\n", num_online_cpus()); -} - /* Called by boot processor to activate the rest. */ void __init smp_init(void) { + int num_nodes, num_cpus; unsigned int cpu; idle_threads_init(); cpuhp_threads_init(); + pr_info("Bringing up secondary CPUs ...\n"); + /* FIXME: This should be done in userspace --RR */ for_each_present_cpu(cpu) { if (num_online_cpus() >= setup_max_cpus) @@ -563,8 +565,13 @@ void __init smp_init(void) cpu_up(cpu); } + num_nodes = num_online_nodes(); + num_cpus = num_online_cpus(); + pr_info("Brought up %d node%s, %d CPU%s\n", + num_nodes, (num_nodes > 1 ? "s" : ""), + num_cpus, (num_cpus > 1 ? "s" : "")); + /* Any cleanup work */ - smp_announce(); smp_cpus_done(setup_max_cpus); } @@ -724,3 +731,54 @@ void wake_up_all_idle_cpus(void) preempt_enable(); } EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); + +/** + * smp_call_on_cpu - Call a function on a specific cpu + * + * Used to call a function on a specific cpu and wait for it to return. + * Optionally make sure the call is done on a specified physical cpu via vcpu + * pinning in order to support virtualized environments. + */ +struct smp_call_on_cpu_struct { + struct work_struct work; + struct completion done; + int (*func)(void *); + void *data; + int ret; + int cpu; +}; + +static void smp_call_on_cpu_callback(struct work_struct *work) +{ + struct smp_call_on_cpu_struct *sscs; + + sscs = container_of(work, struct smp_call_on_cpu_struct, work); + if (sscs->cpu >= 0) + hypervisor_pin_vcpu(sscs->cpu); + sscs->ret = sscs->func(sscs->data); + if (sscs->cpu >= 0) + hypervisor_pin_vcpu(-1); + + complete(&sscs->done); +} + +int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) +{ + struct smp_call_on_cpu_struct sscs = { + .done = COMPLETION_INITIALIZER_ONSTACK(sscs.done), + .func = func, + .data = par, + .cpu = phys ? cpu : -1, + }; + + INIT_WORK_ONSTACK(&sscs.work, smp_call_on_cpu_callback); + + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) + return -ENXIO; + + queue_work_on(cpu, system_wq, &sscs.work); + wait_for_completion(&sscs.done); + + return sscs.ret; +} +EXPORT_SYMBOL_GPL(smp_call_on_cpu); diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 13bc43d1fb22..4a5c6e73ecd4 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -186,6 +186,11 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) kfree(td); return PTR_ERR(tsk); } + /* + * Park the thread so that it could start right on the CPU + * when it is available. + */ + kthread_park(tsk); get_task_struct(tsk); *per_cpu_ptr(ht->store, cpu) = tsk; if (ht->create) { diff --git a/kernel/softirq.c b/kernel/softirq.c index 17caf4b63342..744fa611cae0 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -58,7 +58,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp DEFINE_PER_CPU(struct task_struct *, ksoftirqd); const char * const softirq_to_name[NR_SOFTIRQS] = { - "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", + "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL", "TASKLET", "SCHED", "HRTIMER", "RCU" }; @@ -78,6 +78,17 @@ static void wakeup_softirqd(void) } /* + * If ksoftirqd is scheduled, we do not want to process pending softirqs + * right now. Let ksoftirqd handle this at its own rate, to get fairness. + */ +static bool ksoftirqd_running(void) +{ + struct task_struct *tsk = __this_cpu_read(ksoftirqd); + + return tsk && (tsk->state == TASK_RUNNING); +} + +/* * preempt_count and SOFTIRQ_OFFSET usage: * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving * softirq processing. @@ -313,7 +324,7 @@ asmlinkage __visible void do_softirq(void) pending = local_softirq_pending(); - if (pending) + if (pending && !ksoftirqd_running()) do_softirq_own_stack(); local_irq_restore(flags); @@ -340,6 +351,9 @@ void irq_enter(void) static inline void invoke_softirq(void) { + if (ksoftirqd_running()) + return; + if (!force_irqthreads) { #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK /* @@ -482,7 +496,7 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t) } EXPORT_SYMBOL(__tasklet_hi_schedule_first); -static void tasklet_action(struct softirq_action *a) +static __latent_entropy void tasklet_action(struct softirq_action *a) { struct tasklet_struct *list; @@ -518,7 +532,7 @@ static void tasklet_action(struct softirq_action *a) } } -static void tasklet_hi_action(struct softirq_action *a) +static __latent_entropy void tasklet_hi_action(struct softirq_action *a) { struct tasklet_struct *list; @@ -700,7 +714,7 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) BUG(); } -static void takeover_tasklets(unsigned int cpu) +static int takeover_tasklets(unsigned int cpu) { /* CPU is dead, so no lock needed. */ local_irq_disable(); @@ -723,27 +737,12 @@ static void takeover_tasklets(unsigned int cpu) raise_softirq_irqoff(HI_SOFTIRQ); local_irq_enable(); + return 0; } +#else +#define takeover_tasklets NULL #endif /* CONFIG_HOTPLUG_CPU */ -static int cpu_callback(struct notifier_block *nfb, unsigned long action, - void *hcpu) -{ - switch (action) { -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DEAD: - case CPU_DEAD_FROZEN: - takeover_tasklets((unsigned long)hcpu); - break; -#endif /* CONFIG_HOTPLUG_CPU */ - } - return NOTIFY_OK; -} - -static struct notifier_block cpu_nfb = { - .notifier_call = cpu_callback -}; - static struct smp_hotplug_thread softirq_threads = { .store = &ksoftirqd, .thread_should_run = ksoftirqd_should_run, @@ -753,8 +752,8 @@ static struct smp_hotplug_thread softirq_threads = { static __init int spawn_ksoftirqd(void) { - register_cpu_notifier(&cpu_nfb); - + cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL, + takeover_tasklets); BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); return 0; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 4a1ca5f6da7e..1eb82661ecdb 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -20,7 +20,6 @@ #include <linux/kallsyms.h> #include <linux/smpboot.h> #include <linux/atomic.h> -#include <linux/lglock.h> #include <linux/nmi.h> /* @@ -47,13 +46,9 @@ struct cpu_stopper { static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); static bool stop_machine_initialized = false; -/* - * Avoids a race between stop_two_cpus and global stop_cpus, where - * the stoppers could get queued up in reverse order, leading to - * system deadlock. Using an lglock means stop_two_cpus remains - * relatively cheap. - */ -DEFINE_STATIC_LGLOCK(stop_cpus_lock); +/* static data for stop_cpus */ +static DEFINE_MUTEX(stop_cpus_mutex); +static bool stop_cpus_in_progress; static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) { @@ -126,6 +121,11 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) cpu_stop_init_done(&done, 1); if (!cpu_stop_queue_work(cpu, &work)) return -ENOENT; + /* + * In case @cpu == smp_proccessor_id() we can avoid a sleep+wakeup + * cycle by doing a preemption: + */ + cond_resched(); wait_for_completion(&done.completion); return done.ret; } @@ -194,7 +194,7 @@ static int multi_cpu_stop(void *data) /* Simple state machine */ do { /* Chill out and ensure we re-read multi_stop_state. */ - cpu_relax(); + cpu_relax_yield(); if (msdata->state != curstate) { curstate = msdata->state; switch (curstate) { @@ -230,14 +230,26 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1); struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); int err; - - lg_double_lock(&stop_cpus_lock, cpu1, cpu2); +retry: spin_lock_irq(&stopper1->lock); spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); err = -ENOENT; if (!stopper1->enabled || !stopper2->enabled) goto unlock; + /* + * Ensure that if we race with __stop_cpus() the stoppers won't get + * queued up in reverse order leading to system deadlock. + * + * We can't miss stop_cpus_in_progress if queue_stop_cpus_work() has + * queued a work on cpu1 but not on cpu2, we hold both locks. + * + * It can be falsely true but it is safe to spin until it is cleared, + * queue_stop_cpus_work() does everything under preempt_disable(). + */ + err = -EDEADLK; + if (unlikely(stop_cpus_in_progress)) + goto unlock; err = 0; __cpu_stop_queue_work(stopper1, work1); @@ -245,8 +257,12 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, unlock: spin_unlock(&stopper2->lock); spin_unlock_irq(&stopper1->lock); - lg_double_unlock(&stop_cpus_lock, cpu1, cpu2); + if (unlikely(err == -EDEADLK)) { + while (stop_cpus_in_progress) + cpu_relax(); + goto retry; + } return err; } /** @@ -316,9 +332,6 @@ bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, return cpu_stop_queue_work(cpu, work_buf); } -/* static data for stop_cpus */ -static DEFINE_MUTEX(stop_cpus_mutex); - static bool queue_stop_cpus_work(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg, struct cpu_stop_done *done) @@ -332,7 +345,8 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask, * preempted by a stopper which might wait for other stoppers * to enter @fn which can lead to deadlock. */ - lg_global_lock(&stop_cpus_lock); + preempt_disable(); + stop_cpus_in_progress = true; for_each_cpu(cpu, cpumask) { work = &per_cpu(cpu_stopper.stop_work, cpu); work->fn = fn; @@ -341,7 +355,8 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask, if (cpu_stop_queue_work(cpu, work)) queued = true; } - lg_global_unlock(&stop_cpus_lock); + stop_cpus_in_progress = false; + preempt_enable(); return queued; } diff --git a/kernel/sys.c b/kernel/sys.c index 89d5be418157..9758892a2d09 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1416,7 +1416,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource, * applications, so we live with it */ if (!retval && new_rlim && resource == RLIMIT_CPU && - new_rlim->rlim_cur != RLIM_INFINITY) + new_rlim->rlim_cur != RLIM_INFINITY && + IS_ENABLED(CONFIG_POSIX_TIMERS)) update_rlimit_cpu(tsk, new_rlim->rlim_cur); out: read_unlock(&tasklist_lock); @@ -1696,16 +1697,6 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) fput(exe_file); } - /* - * The symlink can be changed only once, just to disallow arbitrary - * transitions malicious software might bring in. This means one - * could make a snapshot over all processes running and monitor - * /proc/pid/exe changes to notice unusual activity if needed. - */ - err = -EPERM; - if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) - goto exit; - err = 0; /* set the new file, lockless */ get_file(exe.file); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 2c5e3a8e00d7..635482e60ca3 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -250,3 +250,8 @@ cond_syscall(sys_execveat); /* membarrier */ cond_syscall(sys_membarrier); + +/* memory protection keys */ +cond_syscall(sys_pkey_mprotect); +cond_syscall(sys_pkey_alloc); +cond_syscall(sys_pkey_free); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a13bbdaab47d..39b3368f6de6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -65,6 +65,7 @@ #include <linux/sched/sysctl.h> #include <linux/kexec.h> #include <linux/bpf.h> +#include <linux/mount.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -106,9 +107,8 @@ extern unsigned int core_pipe_limit; extern int pid_max; extern int pid_max_min, pid_max_max; extern int percpu_pagelist_fraction; -extern int compat_log; extern int latencytop_enabled; -extern int sysctl_nr_open_min, sysctl_nr_open_max; +extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max; #ifndef CONFIG_MMU extern int sysctl_nr_trim_pages; #endif @@ -347,13 +347,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "sched_shares_window_ns", - .data = &sysctl_sched_shares_window, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #ifdef CONFIG_SCHEDSTATS { .procname = "sched_schedstats", @@ -990,13 +983,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, { - .procname = "kstack_depth_to_print", - .data = &kstack_depth_to_print, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { .procname = "io_delay_type", .data = &io_delay_type, .maxlen = sizeof(int), @@ -1084,15 +1070,6 @@ static struct ctl_table kern_table[] = { .extra1 = &neg_one, }, #endif -#ifdef CONFIG_COMPAT - { - .procname = "compat-log", - .data = &compat_log, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_RT_MUTEXES { .procname = "max_lock_depth", @@ -1692,7 +1669,7 @@ static struct ctl_table fs_table[] = { { .procname = "nr_open", .data = &sysctl_nr_open, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &sysctl_nr_open_min, @@ -1838,6 +1815,14 @@ static struct ctl_table fs_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, + { + .procname = "mount-max", + .data = &sysctl_mount_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + }, { } }; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index b3f05ee20d18..8a5e44236f78 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -41,12 +41,7 @@ static DEFINE_PER_CPU(__u32, taskstats_seqnum); static int family_registered; struct kmem_cache *taskstats_cache; -static struct genl_family family = { - .id = GENL_ID_GENERATE, - .name = TASKSTATS_GENL_NAME, - .version = TASKSTATS_GENL_VERSION, - .maxattr = TASKSTATS_CMD_ATTR_MAX, -}; +static struct genl_family family; static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, @@ -54,7 +49,11 @@ static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; -static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = { +/* + * We have to use TASKSTATS_CMD_ATTR_MAX here, it is the maxattr in the family. + * Make sure they are always aligned. + */ +static const struct nla_policy cgroupstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, }; @@ -651,6 +650,15 @@ static const struct genl_ops taskstats_ops[] = { }, }; +static struct genl_family family __ro_after_init = { + .name = TASKSTATS_GENL_NAME, + .version = TASKSTATS_GENL_VERSION, + .maxattr = TASKSTATS_CMD_ATTR_MAX, + .module = THIS_MODULE, + .ops = taskstats_ops, + .n_ops = ARRAY_SIZE(taskstats_ops), +}; + /* Needed early in initialization */ void __init taskstats_init_early(void) { @@ -667,7 +675,7 @@ static int __init taskstats_init(void) { int rc; - rc = genl_register_family_with_ops(&family, taskstats_ops); + rc = genl_register_family(&family); if (rc) return rc; diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 49eca0beed32..976840d29a71 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,6 +1,12 @@ -obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o +obj-y += time.o timer.o hrtimer.o obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o -obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o +obj-y += timeconv.o timecounter.o alarmtimer.o + +ifeq ($(CONFIG_POSIX_TIMERS),y) + obj-y += posix-timers.o posix-cpu-timers.o posix-clock.o itimer.o +else + obj-y += posix-stubs.o +endif obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o tick-common.o ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index c3aad685bbc0..9b08ca391aed 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -26,6 +26,9 @@ #include <linux/workqueue.h> #include <linux/freezer.h> +#define CREATE_TRACE_POINTS +#include <trace/events/alarmtimer.h> + /** * struct alarm_base - Alarm timer bases * @lock: Lock for syncrhonized access to the base @@ -40,7 +43,9 @@ static struct alarm_base { clockid_t base_clockid; } alarm_bases[ALARM_NUMTYPE]; -/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */ +/* freezer information to handle clock_nanosleep triggered wakeups */ +static enum alarmtimer_type freezer_alarmtype; +static ktime_t freezer_expires; static ktime_t freezer_delta; static DEFINE_SPINLOCK(freezer_delta_lock); @@ -194,6 +199,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) } spin_unlock_irqrestore(&base->lock, flags); + trace_alarmtimer_fired(alarm, base->gettime()); return ret; } @@ -218,15 +224,16 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining); */ static int alarmtimer_suspend(struct device *dev) { - struct rtc_time tm; - ktime_t min, now; - unsigned long flags; + ktime_t min, now, expires; + int i, ret, type; struct rtc_device *rtc; - int i; - int ret; + unsigned long flags; + struct rtc_time tm; spin_lock_irqsave(&freezer_delta_lock, flags); min = freezer_delta; + expires = freezer_expires; + type = freezer_alarmtype; freezer_delta = ktime_set(0, 0); spin_unlock_irqrestore(&freezer_delta_lock, flags); @@ -247,8 +254,11 @@ static int alarmtimer_suspend(struct device *dev) if (!next) continue; delta = ktime_sub(next->expires, base->gettime()); - if (!min.tv64 || (delta.tv64 < min.tv64)) + if (!min.tv64 || (delta.tv64 < min.tv64)) { + expires = next->expires; min = delta; + type = i; + } } if (min.tv64 == 0) return 0; @@ -258,6 +268,8 @@ static int alarmtimer_suspend(struct device *dev) return -EBUSY; } + trace_alarmtimer_suspend(expires, type); + /* Setup an rtc timer to fire that far in the future */ rtc_timer_cancel(rtc, &rtctimer); rtc_read_time(rtc, &tm); @@ -295,15 +307,32 @@ static int alarmtimer_resume(struct device *dev) static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) { - ktime_t delta; + struct alarm_base *base; unsigned long flags; - struct alarm_base *base = &alarm_bases[type]; + ktime_t delta; + + switch(type) { + case ALARM_REALTIME: + base = &alarm_bases[ALARM_REALTIME]; + type = ALARM_REALTIME_FREEZER; + break; + case ALARM_BOOTTIME: + base = &alarm_bases[ALARM_BOOTTIME]; + type = ALARM_BOOTTIME_FREEZER; + break; + default: + WARN_ONCE(1, "Invalid alarm type: %d\n", type); + return; + } delta = ktime_sub(absexp, base->gettime()); spin_lock_irqsave(&freezer_delta_lock, flags); - if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64)) + if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64)) { freezer_delta = delta; + freezer_expires = absexp; + freezer_alarmtype = type; + } spin_unlock_irqrestore(&freezer_delta_lock, flags); } @@ -342,6 +371,8 @@ void alarm_start(struct alarm *alarm, ktime_t start) alarmtimer_enqueue(base, alarm); hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); spin_unlock_irqrestore(&base->lock, flags); + + trace_alarmtimer_start(alarm, base->gettime()); } EXPORT_SYMBOL_GPL(alarm_start); @@ -390,6 +421,8 @@ int alarm_try_to_cancel(struct alarm *alarm) if (ret >= 0) alarmtimer_dequeue(base, alarm); spin_unlock_irqrestore(&base->lock, flags); + + trace_alarmtimer_cancel(alarm, base->gettime()); return ret; } EXPORT_SYMBOL_GPL(alarm_try_to_cancel); @@ -542,7 +575,6 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) static int alarm_timer_create(struct k_itimer *new_timer) { enum alarmtimer_type type; - struct alarm_base *base; if (!alarmtimer_get_rtcdev()) return -ENOTSUPP; @@ -551,7 +583,6 @@ static int alarm_timer_create(struct k_itimer *new_timer) return -EPERM; type = clock2alarm(new_timer->it_clock); - base = &alarm_bases[type]; alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer); return 0; } @@ -848,8 +879,10 @@ static int __init alarmtimer_init(void) alarmtimer_rtc_timer_init(); - posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); - posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); + if (IS_ENABLED(CONFIG_POSIX_TIMERS)) { + posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); + posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); + } /* Initialize alarm bases */ alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 6a5a310a1a53..150242ccfcd2 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -89,6 +89,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) *mult = tmp; *shift = sft; } +EXPORT_SYMBOL_GPL(clocks_calc_mult_shift); /*[Clocksource internal variables]--------- * curr_clocksource: @@ -600,9 +601,18 @@ static void __clocksource_select(bool skipcur) */ if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) { /* Override clocksource cannot be used. */ - pr_warn("Override clocksource %s is not HRT compatible - cannot switch while in HRT/NOHZ mode\n", - cs->name); - override_name[0] = 0; + if (cs->flags & CLOCK_SOURCE_UNSTABLE) { + pr_warn("Override clocksource %s is unstable and not HRT compatible - cannot switch while in HRT/NOHZ mode\n", + cs->name); + override_name[0] = 0; + } else { + /* + * The override cannot be currently verified. + * Deferring to let the watchdog check. + */ + pr_info("Override clocksource %s is not currently HRT compatible - deferring\n", + cs->name); + } } else /* Override clocksource can be used. */ best = cs; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 9ba7c820fc23..08be5c99d26b 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -307,7 +307,7 @@ EXPORT_SYMBOL_GPL(__ktime_divns); */ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) { - ktime_t res = ktime_add(lhs, rhs); + ktime_t res = ktime_add_unsafe(lhs, rhs); /* * We use KTIME_SEC_MAX here, the maximum timeout which we can @@ -703,7 +703,7 @@ static void clock_was_set_work(struct work_struct *work) static DECLARE_WORK(hrtimer_work, clock_was_set_work); /* - * Called from timekeeping and resume code to reprogramm the hrtimer + * Called from timekeeping and resume code to reprogram the hrtimer * interrupt device on all cpus. */ void clock_was_set_delayed(void) @@ -1241,7 +1241,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, /* * Note: We clear the running state after enqueue_hrtimer and - * we do not reprogramm the event hardware. Happens either in + * we do not reprogram the event hardware. Happens either in * hrtimer_start_range_ns() or in hrtimer_interrupt() * * Note: Because we dropped the cpu_base->lock above, @@ -1742,15 +1742,19 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to - * pass before the routine returns. + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. + * delivered to the current task or the current task is explicitly woken + * up. * * The current task state is guaranteed to be TASK_RUNNING when this * routine returns. * - * Returns 0 when the timer has expired otherwise -EINTR + * Returns 0 when the timer has expired. If the task was woken before the + * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or + * by an explicit wakeup, it returns -EINTR. */ int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode) @@ -1772,15 +1776,19 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to - * pass before the routine returns. + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. + * delivered to the current task or the current task is explicitly woken + * up. * * The current task state is guaranteed to be TASK_RUNNING when this * routine returns. * - * Returns 0 when the timer has expired otherwise -EINTR + * Returns 0 when the timer has expired. If the task was woken before the + * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or + * by an explicit wakeup, it returns -EINTR. */ int __sched schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode) diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 1d5c7204ddc9..2b9f45bc955d 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -238,6 +238,8 @@ again: return 0; } +#ifdef __ARCH_WANT_SYS_ALARM + /** * alarm_setitimer - set alarm in seconds * @@ -250,7 +252,7 @@ again: * On 32 bit machines the seconds value is limited to (INT_MAX/2) to avoid * negative timeval settings which would cause immediate expiry. */ -unsigned int alarm_setitimer(unsigned int seconds) +static unsigned int alarm_setitimer(unsigned int seconds) { struct itimerval it_new, it_old; @@ -275,6 +277,17 @@ unsigned int alarm_setitimer(unsigned int seconds) return it_old.it_value.tv_sec; } +/* + * For backwards compatibility? This can be done in libc so Alpha + * and all newer ports shouldn't need it. + */ +SYSCALL_DEFINE1(alarm, unsigned int, seconds) +{ + return alarm_setitimer(seconds); +} + +#endif + SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, struct itimerval __user *, ovalue) { diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 39008d78927a..f246763c9947 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -9,7 +9,6 @@ #include <asm/uaccess.h> #include <linux/kernel_stat.h> #include <trace/events/timer.h> -#include <linux/random.h> #include <linux/tick.h> #include <linux/workqueue.h> @@ -133,9 +132,9 @@ static inline unsigned long long prof_ticks(struct task_struct *p) } static inline unsigned long long virt_ticks(struct task_struct *p) { - cputime_t utime; + cputime_t utime, stime; - task_cputime(p, &utime, NULL); + task_cputime(p, &utime, &stime); return cputime_to_expires(utime); } @@ -447,10 +446,7 @@ static void cleanup_timers(struct list_head *head) */ void posix_cpu_timers_exit(struct task_struct *tsk) { - add_device_randomness((const void*) &tsk->se.sum_exec_runtime, - sizeof(unsigned long long)); cleanup_timers(tsk->cpu_timers); - } void posix_cpu_timers_exit_group(struct task_struct *tsk) { diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c new file mode 100644 index 000000000000..cd6716e115e8 --- /dev/null +++ b/kernel/time/posix-stubs.c @@ -0,0 +1,123 @@ +/* + * Dummy stubs used when CONFIG_POSIX_TIMERS=n + * + * Created by: Nicolas Pitre, July 2016 + * Copyright: (C) 2016 Linaro Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/syscalls.h> +#include <linux/ktime.h> +#include <linux/timekeeping.h> +#include <linux/posix-timers.h> + +asmlinkage long sys_ni_posix_timers(void) +{ + pr_err_once("process %d (%s) attempted a POSIX timer syscall " + "while CONFIG_POSIX_TIMERS is not set\n", + current->pid, current->comm); + return -ENOSYS; +} + +#define SYS_NI(name) SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers) + +SYS_NI(timer_create); +SYS_NI(timer_gettime); +SYS_NI(timer_getoverrun); +SYS_NI(timer_settime); +SYS_NI(timer_delete); +SYS_NI(clock_adjtime); +SYS_NI(getitimer); +SYS_NI(setitimer); +#ifdef __ARCH_WANT_SYS_ALARM +SYS_NI(alarm); +#endif + +/* + * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC + * as it is easy to remain compatible with little code. CLOCK_BOOTTIME + * is also included for convenience as at least systemd uses it. + */ + +SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, + const struct timespec __user *, tp) +{ + struct timespec new_tp; + + if (which_clock != CLOCK_REALTIME) + return -EINVAL; + if (copy_from_user(&new_tp, tp, sizeof (*tp))) + return -EFAULT; + return do_sys_settimeofday(&new_tp, NULL); +} + +SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, + struct timespec __user *,tp) +{ + struct timespec kernel_tp; + + switch (which_clock) { + case CLOCK_REALTIME: ktime_get_real_ts(&kernel_tp); break; + case CLOCK_MONOTONIC: ktime_get_ts(&kernel_tp); break; + case CLOCK_BOOTTIME: get_monotonic_boottime(&kernel_tp); break; + default: return -EINVAL; + } + if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) + return -EFAULT; + return 0; +} + +SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) +{ + struct timespec rtn_tp = { + .tv_sec = 0, + .tv_nsec = hrtimer_resolution, + }; + + switch (which_clock) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + case CLOCK_BOOTTIME: + if (copy_to_user(tp, &rtn_tp, sizeof(rtn_tp))) + return -EFAULT; + return 0; + default: + return -EINVAL; + } +} + +SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, + const struct timespec __user *, rqtp, + struct timespec __user *, rmtp) +{ + struct timespec t; + + switch (which_clock) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + case CLOCK_BOOTTIME: + if (copy_from_user(&t, rqtp, sizeof (struct timespec))) + return -EFAULT; + if (!timespec_valid(&t)) + return -EINVAL; + return hrtimer_nanosleep(&t, rmtp, flags & TIMER_ABSTIME ? + HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); + default: + return -EINVAL; + } +} + +#ifdef CONFIG_COMPAT +long clock_nanosleep_restart(struct restart_block *restart_block) +{ + return hrtimer_nanosleep_restart(restart_block); +} +#endif diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 2ec7c00228f3..71496a20e670 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -186,10 +186,13 @@ static bool check_tick_dependency(atomic_t *dep) return false; } -static bool can_stop_full_tick(struct tick_sched *ts) +static bool can_stop_full_tick(int cpu, struct tick_sched *ts) { WARN_ON_ONCE(!irqs_disabled()); + if (unlikely(!cpu_online(cpu))) + return false; + if (check_tick_dependency(&tick_dep_mask)) return false; @@ -387,24 +390,16 @@ static int __init tick_nohz_full_setup(char *str) } __setup("nohz_full=", tick_nohz_full_setup); -static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static int tick_nohz_cpu_down(unsigned int cpu) { - unsigned int cpu = (unsigned long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_PREPARE: - /* - * The boot CPU handles housekeeping duty (unbound timers, - * workqueues, timekeeping, ...) on behalf of full dynticks - * CPUs. It must remain online when nohz full is enabled. - */ - if (tick_nohz_full_running && tick_do_timer_cpu == cpu) - return NOTIFY_BAD; - break; - } - return NOTIFY_OK; + /* + * The boot CPU handles housekeeping duty (unbound timers, + * workqueues, timekeeping, ...) on behalf of full dynticks + * CPUs. It must remain online when nohz full is enabled. + */ + if (tick_nohz_full_running && tick_do_timer_cpu == cpu) + return -EBUSY; + return 0; } static int tick_nohz_init_all(void) @@ -425,7 +420,7 @@ static int tick_nohz_init_all(void) void __init tick_nohz_init(void) { - int cpu; + int cpu, ret; if (!tick_nohz_full_running) { if (tick_nohz_init_all() < 0) @@ -466,7 +461,10 @@ void __init tick_nohz_init(void) for_each_cpu(cpu, tick_nohz_full_mask) context_tracking_cpu_set(cpu); - cpu_notifier(tick_nohz_cpu_down_callback, 0); + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "kernel/nohz:predown", NULL, + tick_nohz_cpu_down); + WARN_ON(ret < 0); pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", cpumask_pr_args(tick_nohz_full_mask)); @@ -843,7 +841,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) return; - if (can_stop_full_tick(ts)) + if (can_stop_full_tick(cpu, ts)) tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); else if (ts->tick_stopped) tick_nohz_restart_sched_tick(ts, ktime_get()); diff --git a/kernel/time/time.c b/kernel/time/time.c index 667b9335f5d6..bd62fb8e8e77 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -780,7 +780,7 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, { struct timespec64 res; - set_normalized_timespec64(&res, lhs.tv_sec + rhs.tv_sec, + set_normalized_timespec64(&res, (timeu64_t) lhs.tv_sec + rhs.tv_sec, lhs.tv_nsec + rhs.tv_nsec); if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) { diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e07fb093f819..da233cdf89b0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -258,10 +258,9 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) tk->cycle_interval = interval; /* Go back from cycles -> shifted ns */ - tk->xtime_interval = (u64) interval * clock->mult; + tk->xtime_interval = interval * clock->mult; tk->xtime_remainder = ntpinterval - tk->xtime_interval; - tk->raw_interval = - ((u64) interval * clock->mult) >> clock->shift; + tk->raw_interval = (interval * clock->mult) >> clock->shift; /* if changing clocks, convert xtime_nsec shift units */ if (old_clock) { @@ -299,10 +298,10 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; static inline u32 arch_gettimeoffset(void) { return 0; } #endif -static inline s64 timekeeping_delta_to_ns(struct tk_read_base *tkr, +static inline u64 timekeeping_delta_to_ns(struct tk_read_base *tkr, cycle_t delta) { - s64 nsec; + u64 nsec; nsec = delta * tkr->mult + tkr->xtime_nsec; nsec >>= tkr->shift; @@ -311,7 +310,7 @@ static inline s64 timekeeping_delta_to_ns(struct tk_read_base *tkr, return nsec + arch_gettimeoffset(); } -static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) +static inline u64 timekeeping_get_ns(struct tk_read_base *tkr) { cycle_t delta; @@ -319,8 +318,8 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) return timekeeping_delta_to_ns(tkr, delta); } -static inline s64 timekeeping_cycles_to_ns(struct tk_read_base *tkr, - cycle_t cycles) +static inline u64 timekeeping_cycles_to_ns(struct tk_read_base *tkr, + cycle_t cycles) { cycle_t delta; @@ -403,8 +402,11 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base); - now += clocksource_delta(tkr->read(tkr->clock), - tkr->cycle_last, tkr->mask); + now += timekeeping_delta_to_ns(tkr, + clocksource_delta( + tkr->read(tkr->clock), + tkr->cycle_last, + tkr->mask)); } while (read_seqcount_retry(&tkf->seq, seq)); return now; @@ -422,6 +424,35 @@ u64 ktime_get_raw_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); +/** + * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock. + * + * To keep it NMI safe since we're accessing from tracing, we're not using a + * separate timekeeper with updates to monotonic clock and boot offset + * protected with seqlocks. This has the following minor side effects: + * + * (1) Its possible that a timestamp be taken after the boot offset is updated + * but before the timekeeper is updated. If this happens, the new boot offset + * is added to the old timekeeping making the clock appear to update slightly + * earlier: + * CPU 0 CPU 1 + * timekeeping_inject_sleeptime64() + * __timekeeping_inject_sleeptime(tk, delta); + * timestamp(); + * timekeeping_update(tk, TK_CLEAR_NTP...); + * + * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be + * partially updated. Since the tk->offs_boot update is a rare event, this + * should be a rare occurrence which postprocessing should be able to handle. + */ +u64 notrace ktime_get_boot_fast_ns(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot)); +} +EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); + /* Suspend-time cycles value for halted fast timekeeper. */ static cycle_t cycles_at_suspend; @@ -620,7 +651,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) { struct clocksource *clock = tk->tkr_mono.clock; cycle_t cycle_now, delta; - s64 nsec; + u64 nsec; cycle_now = tk->tkr_mono.read(clock); delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); @@ -649,7 +680,7 @@ int __getnstimeofday64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; - s64 nsecs = 0; + u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); @@ -689,7 +720,7 @@ ktime_t ktime_get(void) struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; - s64 nsecs; + u64 nsecs; WARN_ON(timekeeping_suspended); @@ -732,7 +763,7 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs) struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base, *offset = offsets[offs]; - s64 nsecs; + u64 nsecs; WARN_ON(timekeeping_suspended); @@ -776,7 +807,7 @@ ktime_t ktime_get_raw(void) struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; - s64 nsecs; + u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); @@ -801,8 +832,8 @@ void ktime_get_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 tomono; - s64 nsec; unsigned int seq; + u64 nsec; WARN_ON(timekeeping_suspended); @@ -890,8 +921,8 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) unsigned long seq; ktime_t base_raw; ktime_t base_real; - s64 nsec_raw; - s64 nsec_real; + u64 nsec_raw; + u64 nsec_real; cycle_t now; WARN_ON_ONCE(timekeeping_suspended); @@ -1049,7 +1080,7 @@ int get_device_system_crosststamp(int (*get_time_fn) cycle_t cycles, now, interval_start; unsigned int clock_was_set_seq = 0; ktime_t base_real, base_raw; - s64 nsec_real, nsec_raw; + u64 nsec_real, nsec_raw; u8 cs_was_changed_seq; unsigned long seq; bool do_interp; @@ -1362,7 +1393,7 @@ void getrawmonotonic64(struct timespec64 *ts) struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 ts64; unsigned long seq; - s64 nsecs; + u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); @@ -1613,7 +1644,7 @@ void timekeeping_resume(void) struct clocksource *clock = tk->tkr_mono.clock; unsigned long flags; struct timespec64 ts_new, ts_delta; - cycle_t cycle_now, cycle_delta; + cycle_t cycle_now; sleeptime_injected = false; read_persistent_clock64(&ts_new); @@ -1639,27 +1670,11 @@ void timekeeping_resume(void) cycle_now = tk->tkr_mono.read(clock); if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && cycle_now > tk->tkr_mono.cycle_last) { - u64 num, max = ULLONG_MAX; - u32 mult = clock->mult; - u32 shift = clock->shift; - s64 nsec = 0; - - cycle_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, - tk->tkr_mono.mask); - - /* - * "cycle_delta * mutl" may cause 64 bits overflow, if the - * suspended time is too long. In that case we need do the - * 64 bits math carefully - */ - do_div(max, mult); - if (cycle_delta > max) { - num = div64_u64(cycle_delta, max); - nsec = (((u64) max * mult) >> shift) * num; - cycle_delta -= num * max; - } - nsec += ((u64) cycle_delta * mult) >> shift; + u64 nsec, cyc_delta; + cyc_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, + tk->tkr_mono.mask); + nsec = mul_u64_u32_shr(cyc_delta, clock->mult, clock->shift); ts_delta = ns_to_timespec64(nsec); sleeptime_injected = true; } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index 107310a6f36f..ca9fb800336b 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -75,5 +75,7 @@ void tk_debug_account_sleep_time(struct timespec64 *t) int bin = min(fls(t->tv_sec), NUM_BINS-1); sleep_time_bin[bin]++; + pr_info("Suspended for %lld.%03lu seconds\n", (s64)t->tv_sec, + t->tv_nsec / NSEC_PER_MSEC); } diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 32bf6f75a8fe..ea4fbf8477a9 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -878,7 +878,7 @@ static inline struct timer_base *get_timer_base(u32 tflags) #ifdef CONFIG_NO_HZ_COMMON static inline struct timer_base * -__get_target_base(struct timer_base *base, unsigned tflags) +get_target_base(struct timer_base *base, unsigned tflags) { #ifdef CONFIG_SMP if ((tflags & TIMER_PINNED) || !base->migration_enabled) @@ -891,25 +891,27 @@ __get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { + unsigned long jnow = READ_ONCE(jiffies); + /* * We only forward the base when it's idle and we have a delta between * base clock and jiffies. */ - if (!base->is_idle || (long) (jiffies - base->clk) < 2) + if (!base->is_idle || (long) (jnow - base->clk) < 2) return; /* * If the next expiry value is > jiffies, then we fast forward to * jiffies otherwise we forward to the next expiry value. */ - if (time_after(base->next_expiry, jiffies)) - base->clk = jiffies; + if (time_after(base->next_expiry, jnow)) + base->clk = jnow; else base->clk = base->next_expiry; } #else static inline struct timer_base * -__get_target_base(struct timer_base *base, unsigned tflags) +get_target_base(struct timer_base *base, unsigned tflags) { return get_timer_this_cpu_base(tflags); } @@ -917,14 +919,6 @@ __get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { } #endif -static inline struct timer_base * -get_target_base(struct timer_base *base, unsigned tflags) -{ - struct timer_base *target = __get_target_base(base, tflags); - - forward_timer_base(target); - return target; -} /* * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means @@ -943,7 +937,14 @@ static struct timer_base *lock_timer_base(struct timer_list *timer, { for (;;) { struct timer_base *base; - u32 tf = timer->flags; + u32 tf; + + /* + * We need to use READ_ONCE() here, otherwise the compiler + * might re-read @tf between the check for TIMER_MIGRATING + * and spin_lock(). + */ + tf = READ_ONCE(timer->flags); if (!(tf & TIMER_MIGRATING)) { base = get_timer_base(tf); @@ -964,6 +965,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) unsigned long clk = 0, flags; int ret = 0; + BUG_ON(!timer->function); + /* * This is a common optimization triggered by the networking code - if * the timer is re-modified to have the same timeout or ends up in the @@ -972,13 +975,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) if (timer_pending(timer)) { if (timer->expires == expires) return 1; + /* - * Take the current timer_jiffies of base, but without holding - * the lock! + * We lock timer base and calculate the bucket index right + * here. If the timer ends up in the same bucket, then we + * just update the expiry time and avoid the whole + * dequeue/enqueue dance. */ - base = get_timer_base(timer->flags); - clk = base->clk; + base = lock_timer_base(timer, &flags); + clk = base->clk; idx = calc_wheel_index(expires, clk); /* @@ -988,14 +994,14 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) */ if (idx == timer_get_idx(timer)) { timer->expires = expires; - return 1; + ret = 1; + goto out_unlock; } + } else { + base = lock_timer_base(timer, &flags); } timer_stats_timer_set_start_info(timer); - BUG_ON(!timer->function); - - base = lock_timer_base(timer, &flags); ret = detach_if_pending(timer, base, false); if (!ret && pending_only) @@ -1025,12 +1031,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) } } + /* Try to forward a stale timer base clock */ + forward_timer_base(base); + timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance - * between calculating 'idx' and taking the lock, only enqueue_timer() - * and trigger_dyntick_cpu() is required. Otherwise we need to - * (re)calculate the wheel index via internal_add_timer(). + * between calculating 'idx' and possibly switching the base, only + * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise + * we need to (re)calculate the wheel index via + * internal_add_timer(). */ if (idx != UINT_MAX && clk == base->clk) { enqueue_timer(base, timer, idx); @@ -1510,12 +1520,16 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); base->next_expiry = nextevt; /* - * We have a fresh next event. Check whether we can forward the base: + * We have a fresh next event. Check whether we can forward the + * base. We can only do that when @basej is past base->clk + * otherwise we might rewind base->clk. */ - if (time_after(nextevt, jiffies)) - base->clk = jiffies; - else if (time_after(nextevt, base->clk)) - base->clk = nextevt; + if (time_after(basej, base->clk)) { + if (time_after(nextevt, basej)) + base->clk = basej; + else if (time_after(nextevt, base->clk)) + base->clk = nextevt; + } if (time_before_eq(nextevt, basej)) { expires = basem; @@ -1601,7 +1615,8 @@ void update_process_times(int user_tick) irq_work_tick(); #endif scheduler_tick(); - run_posix_cpu_timers(p); + if (IS_ENABLED(CONFIG_POSIX_TIMERS)) + run_posix_cpu_timers(p); } /** @@ -1633,7 +1648,7 @@ static inline void __run_timers(struct timer_base *base) /* * This function runs timers and the timer-tq in bottom half context. */ -static void run_timer_softirq(struct softirq_action *h) +static __latent_entropy void run_timer_softirq(struct softirq_action *h) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); @@ -1662,19 +1677,6 @@ void run_local_timers(void) raise_softirq(TIMER_SOFTIRQ); } -#ifdef __ARCH_WANT_SYS_ALARM - -/* - * For backwards compatibility? This can be done in libc so Alpha - * and all newer ports shouldn't need it. - */ -SYSCALL_DEFINE1(alarm, unsigned int, seconds) -{ - return alarm_setitimer(seconds); -} - -#endif - static void process_timeout(unsigned long __data) { wake_up_process((struct task_struct *)__data); @@ -1691,11 +1693,12 @@ static void process_timeout(unsigned long __data) * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to - * pass before the routine returns. The routine will return 0 + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process())". * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. In this case the remaining time - * in jiffies will be returned, or 0 if the timer expired in time + * delivered to the current task or the current task is explicitly woken + * up. * * The current task state is guaranteed to be TASK_RUNNING when this * routine returns. @@ -1704,7 +1707,9 @@ static void process_timeout(unsigned long __data) * the CPU away without a bound on the timeout. In this case the return * value will be %MAX_SCHEDULE_TIMEOUT. * - * In all cases the return value is guaranteed to be non-negative. + * Returns 0 when the timer has expired otherwise the remaining time in + * jiffies will be returned. In all cases the return value is guaranteed + * to be non-negative. */ signed long __sched schedule_timeout(signed long timeout) { @@ -1896,16 +1901,6 @@ unsigned long msleep_interruptible(unsigned int msecs) EXPORT_SYMBOL(msleep_interruptible); -static void __sched do_usleep_range(unsigned long min, unsigned long max) -{ - ktime_t kmin; - u64 delta; - - kmin = ktime_set(0, min * NSEC_PER_USEC); - delta = (u64)(max - min) * NSEC_PER_USEC; - schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); -} - /** * usleep_range - Sleep for an approximate time * @min: Minimum time in usecs to sleep @@ -1919,7 +1914,14 @@ static void __sched do_usleep_range(unsigned long min, unsigned long max) */ void __sched usleep_range(unsigned long min, unsigned long max) { - __set_current_state(TASK_UNINTERRUPTIBLE); - do_usleep_range(min, max); + ktime_t exp = ktime_add_us(ktime_get(), min); + u64 delta = (u64)(max - min) * NSEC_PER_USEC; + + for (;;) { + __set_current_state(TASK_UNINTERRUPTIBLE); + /* Do not return before the requested sleep time has elapsed */ + if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) + break; + } } EXPORT_SYMBOL(usleep_range); diff --git a/kernel/torture.c b/kernel/torture.c index 75961b3decfe..0d887eb62856 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -43,6 +43,7 @@ #include <linux/stat.h> #include <linux/slab.h> #include <linux/trace_clock.h> +#include <linux/ktime.h> #include <asm/byteorder.h> #include <linux/torture.h> @@ -446,9 +447,8 @@ EXPORT_SYMBOL_GPL(torture_shuffle_cleanup); * Variables for auto-shutdown. This allows "lights out" torture runs * to be fully scripted. */ -static int shutdown_secs; /* desired test duration in seconds. */ static struct task_struct *shutdown_task; -static unsigned long shutdown_time; /* jiffies to system shutdown. */ +static ktime_t shutdown_time; /* time to system shutdown. */ static void (*torture_shutdown_hook)(void); /* @@ -471,20 +471,20 @@ EXPORT_SYMBOL_GPL(torture_shutdown_absorb); */ static int torture_shutdown(void *arg) { - long delta; - unsigned long jiffies_snap; + ktime_t ktime_snap; VERBOSE_TOROUT_STRING("torture_shutdown task started"); - jiffies_snap = jiffies; - while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && + ktime_snap = ktime_get(); + while (ktime_before(ktime_snap, shutdown_time) && !torture_must_stop()) { - delta = shutdown_time - jiffies_snap; if (verbose) pr_alert("%s" TORTURE_FLAG - "torture_shutdown task: %lu jiffies remaining\n", - torture_type, delta); - schedule_timeout_interruptible(delta); - jiffies_snap = jiffies; + "torture_shutdown task: %llu ms remaining\n", + torture_type, + ktime_ms_delta(shutdown_time, ktime_snap)); + set_current_state(TASK_INTERRUPTIBLE); + schedule_hrtimeout(&shutdown_time, HRTIMER_MODE_ABS); + ktime_snap = ktime_get(); } if (torture_must_stop()) { torture_kthread_stopping("torture_shutdown"); @@ -511,10 +511,9 @@ int torture_shutdown_init(int ssecs, void (*cleanup)(void)) { int ret = 0; - shutdown_secs = ssecs; torture_shutdown_hook = cleanup; - if (shutdown_secs > 0) { - shutdown_time = jiffies + shutdown_secs * HZ; + if (ssecs > 0) { + shutdown_time = ktime_add(ktime_get(), ktime_set(ssecs, 0)); ret = torture_create_kthread(torture_shutdown, NULL, shutdown_task); } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f4b86e8ca1e7..2a96b063d659 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -24,11 +24,6 @@ config HAVE_FUNCTION_GRAPH_TRACER help See Documentation/trace/ftrace-design.txt -config HAVE_FUNCTION_GRAPH_FP_TEST - bool - help - See Documentation/trace/ftrace-design.txt - config HAVE_DYNAMIC_FTRACE bool help @@ -221,6 +216,41 @@ config SCHED_TRACER This tracer tracks the latency of the highest priority task to be scheduled in, starting from the point it has woken up. +config HWLAT_TRACER + bool "Tracer to detect hardware latencies (like SMIs)" + select GENERIC_TRACER + help + This tracer, when enabled will create one or more kernel threads, + depening on what the cpumask file is set to, which each thread + spinning in a loop looking for interruptions caused by + something other than the kernel. For example, if a + System Management Interrupt (SMI) takes a noticeable amount of + time, this tracer will detect it. This is useful for testing + if a system is reliable for Real Time tasks. + + Some files are created in the tracing directory when this + is enabled: + + hwlat_detector/width - time in usecs for how long to spin for + hwlat_detector/window - time in usecs between the start of each + iteration + + A kernel thread is created that will spin with interrupts disabled + for "width" microseconds in every "widow" cycle. It will not spin + for "window - width" microseconds, where the system can + continue to operate. + + The output will appear in the trace and trace_pipe files. + + When the tracer is not running, it has no affect on the system, + but when it is running, it can cause the system to be + periodically non responsive. Do not run this tracer on a + production system. + + To enable this tracer, echo in "hwlat" into the current_tracer + file. Every time a latency is greater than tracing_thresh, it will + be recorded into the ring buffer. + config ENABLE_DEFAULT_TRACERS bool "Trace process context switches and events" depends on !GENERIC_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index d0a1617b52b4..e57980845549 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -1,8 +1,4 @@ -# We are fully aware of the dangers of __builtin_return_address() -FRAME_CFLAGS := $(call cc-disable-warning,frame-address) -KBUILD_CFLAGS += $(FRAME_CFLAGS) - # Do not instrument the tracer itself: ifdef CONFIG_FUNCTION_TRACER @@ -41,6 +37,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index dbafc5df03f3..95cecbf67f5c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1777,14 +1777,14 @@ void blk_dump_cmd(char *buf, struct request *rq) } } -void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes) +void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes) { int i = 0; - if (rw & REQ_PREFLUSH) + if (op & REQ_PREFLUSH) rwbs[i++] = 'F'; - switch (op) { + switch (op & REQ_OP_MASK) { case REQ_OP_WRITE: case REQ_OP_WRITE_SAME: rwbs[i++] = 'W'; @@ -1806,13 +1806,13 @@ void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes) rwbs[i++] = 'N'; } - if (rw & REQ_FUA) + if (op & REQ_FUA) rwbs[i++] = 'F'; - if (rw & REQ_RAHEAD) + if (op & REQ_RAHEAD) rwbs[i++] = 'A'; - if (rw & REQ_SYNC) + if (op & REQ_SYNC) rwbs[i++] = 'S'; - if (rw & REQ_META) + if (op & REQ_META) rwbs[i++] = 'M'; rwbs[i] = '\0'; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b20438fdb029..fa77311dadb2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1,4 +1,5 @@ /* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -8,6 +9,7 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/bpf.h> +#include <linux/bpf_perf_event.h> #include <linux/filter.h> #include <linux/uaccess.h> #include <linux/ctype.h> @@ -59,11 +61,9 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) } EXPORT_SYMBOL_GPL(trace_call_bpf); -static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { - void *dst = (void *) (long) r1; - int ret, size = (int) r2; - void *unsafe_ptr = (void *) (long) r3; + int ret; ret = probe_kernel_read(dst, unsafe_ptr, size); if (unlikely(ret < 0)) @@ -81,12 +81,9 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .arg3_type = ARG_ANYTHING, }; -static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, + u32, size) { - void *unsafe_ptr = (void *) (long) r1; - void *src = (void *) (long) r2; - int size = (int) r3; - /* * Ensure we're in user context which is safe for the helper to * run. This helper has no business in a kthread. @@ -128,9 +125,9 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) * limited trace_printk() * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed */ -static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) +BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, + u64, arg2, u64, arg3) { - char *fmt = (char *) (long) r1; bool str_seen = false; int mod[3] = {}; int fmt_cnt = 0; @@ -176,16 +173,16 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) switch (fmt_cnt) { case 1: - unsafe_addr = r3; - r3 = (long) buf; + unsafe_addr = arg1; + arg1 = (long) buf; break; case 2: - unsafe_addr = r4; - r4 = (long) buf; + unsafe_addr = arg2; + arg2 = (long) buf; break; case 3: - unsafe_addr = r5; - r5 = (long) buf; + unsafe_addr = arg3; + arg3 = (long) buf; break; } buf[0] = 0; @@ -207,9 +204,9 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) } return __trace_printk(1/* fake ip will not be printed */, fmt, - mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3, - mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4, - mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5); + mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1, + mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2, + mod[2] == 2 ? arg3 : mod[2] == 1 ? (long) arg3 : (u32) arg3); } static const struct bpf_func_proto bpf_trace_printk_proto = { @@ -231,9 +228,8 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } -static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) { - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; @@ -310,11 +306,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, return 0; } -static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, + u64, flags, void *, data, u64, size) { - struct pt_regs *regs = (struct pt_regs *)(long) r1; - struct bpf_map *map = (struct bpf_map *)(long) r2; - void *data = (void *)(long) r4; struct perf_raw_record raw = { .frag = { .size = size, @@ -365,7 +359,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, return __bpf_perf_event_output(regs, map, flags, &raw); } -static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_current_task) { return (long) current; } @@ -376,6 +370,31 @@ static const struct bpf_func_proto bpf_get_current_task_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct cgroup *cgrp; + + if (unlikely(in_interrupt())) + return -EINVAL; + if (unlikely(idx >= array->map.max_entries)) + return -E2BIG; + + cgrp = READ_ONCE(array->ptrs[idx]); + if (unlikely(!cgrp)) + return -EAGAIN; + + return task_under_cgroup_hierarchy(current, cgrp); +} + +static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { + .func = bpf_current_task_under_cgroup, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -403,10 +422,16 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) return bpf_get_trace_printk_proto(); case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_get_numa_node_id: + return &bpf_get_numa_node_id_proto; case BPF_FUNC_perf_event_read: return &bpf_perf_event_read_proto; case BPF_FUNC_probe_write_user: return bpf_get_probe_write_proto(); + case BPF_FUNC_current_task_under_cgroup: + return &bpf_current_task_under_cgroup_proto; + case BPF_FUNC_get_prandom_u32: + return &bpf_get_prandom_u32_proto; default: return NULL; } @@ -447,16 +472,17 @@ static struct bpf_prog_type_list kprobe_tl = { .type = BPF_PROG_TYPE_KPROBE, }; -static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size) +BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, + u64, flags, void *, data, u64, size) { + struct pt_regs *regs = *(struct pt_regs **)tp_buff; + /* * r1 points to perf tracepoint buffer where first 8 bytes are hidden * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it - * from there and call the same bpf_perf_event_output() helper + * from there and call the same bpf_perf_event_output() helper inline. */ - u64 ctx = *(long *)(uintptr_t)r1; - - return bpf_perf_event_output(ctx, r2, index, r4, size); + return ____bpf_perf_event_output(regs, map, flags, data, size); } static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { @@ -470,11 +496,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { .arg5_type = ARG_CONST_STACK_SIZE, }; -static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map, + u64, flags) { - u64 ctx = *(long *)(uintptr_t)r1; + struct pt_regs *regs = *(struct pt_regs **)tp_buff; - return bpf_get_stackid(ctx, r2, r3, r4, r5); + /* + * Same comment as in bpf_perf_event_output_tp(), only that this time + * the other helper's function body cannot be inlined due to being + * external, thus we need to call raw helper function. + */ + return bpf_get_stackid((unsigned long) regs, (unsigned long) map, + flags, 0, 0); } static const struct bpf_func_proto bpf_get_stackid_proto_tp = { @@ -520,10 +553,69 @@ static struct bpf_prog_type_list tracepoint_tl = { .type = BPF_PROG_TYPE_TRACEPOINT, }; +static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + if (off == offsetof(struct bpf_perf_event_data, sample_period)) { + if (size != sizeof(u64)) + return false; + } else { + if (size != sizeof(long)) + return false; + } + return true; +} + +static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg, + int src_reg, int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct bpf_perf_event_data, sample_period): + BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, + data), dst_reg, src_reg, + offsetof(struct bpf_perf_event_data_kern, data)); + *insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg, + offsetof(struct perf_sample_data, period)); + break; + default: + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, + regs), dst_reg, src_reg, + offsetof(struct bpf_perf_event_data_kern, regs)); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off); + break; + } + + return insn - insn_buf; +} + +static const struct bpf_verifier_ops perf_event_prog_ops = { + .get_func_proto = tp_prog_func_proto, + .is_valid_access = pe_prog_is_valid_access, + .convert_ctx_access = pe_prog_convert_ctx_access, +}; + +static struct bpf_prog_type_list perf_event_tl = { + .ops = &perf_event_prog_ops, + .type = BPF_PROG_TYPE_PERF_EVENT, +}; + static int __init register_kprobe_prog_ops(void) { bpf_register_prog_type(&kprobe_tl); bpf_register_prog_type(&tracepoint_tl); + bpf_register_prog_type(&perf_event_tl); return 0; } late_initcall(register_kprobe_prog_ops); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 84752c8e28b5..33dd57f53f88 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -872,7 +872,13 @@ function_profile_call(unsigned long ip, unsigned long parent_ip, #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int profile_graph_entry(struct ftrace_graph_ent *trace) { + int index = trace->depth; + function_profile_call(trace->func, 0, NULL, NULL); + + if (index >= 0 && index < FTRACE_RETFUNC_DEPTH) + current->ret_stack[index].subtime = 0; + return 1; } @@ -1856,6 +1862,10 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, /* Update rec->flags */ do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_DISABLED) + continue; + /* We need to update only differences of filter_hash */ in_old = !!ftrace_lookup_ip(old_hash, rec->ip); in_new = !!ftrace_lookup_ip(new_hash, rec->ip); @@ -1878,6 +1888,10 @@ rollback: /* Roll back what we did above */ do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_DISABLED) + continue; + if (rec == end) goto err_out; @@ -2391,6 +2405,10 @@ void __weak ftrace_replace_code(int enable) return; do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_DISABLED) + continue; + failed = __ftrace_replace_code(rec, enable); if (failed) { ftrace_bug(failed, rec); @@ -2757,7 +2775,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) struct dyn_ftrace *rec; do_for_each_ftrace_rec(pg, rec) { - if (FTRACE_WARN_ON_ONCE(rec->flags)) + if (FTRACE_WARN_ON_ONCE(rec->flags & ~FTRACE_FL_DISABLED)) pr_warn(" %pS flags:%lx\n", (void *)rec->ip, rec->flags); } while_for_each_ftrace_rec(); @@ -3592,6 +3610,10 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) goto out_unlock; do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_DISABLED) + continue; + if (ftrace_match_record(rec, &func_g, mod_match, exclude_mod)) { ret = enter_record(hash, rec, clear_filter); if (ret < 0) { @@ -3787,6 +3809,9 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, do_for_each_ftrace_rec(pg, rec) { + if (rec->flags & FTRACE_FL_DISABLED) + continue; + if (!ftrace_match_record(rec, &func_g, NULL, 0)) continue; @@ -4233,6 +4258,23 @@ int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, } EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); +/** + * ftrace_ops_set_global_filter - setup ops to use global filters + * @ops - the ops which will use the global filters + * + * ftrace users who need global function trace filtering should call this. + * It can set the global filter only if ops were not initialized before. + */ +void ftrace_ops_set_global_filter(struct ftrace_ops *ops) +{ + if (ops->flags & FTRACE_OPS_FL_INITIALIZED) + return; + + ftrace_ops_init(ops); + ops->func_hash = &global_ops.local_hash; +} +EXPORT_SYMBOL_GPL(ftrace_ops_set_global_filter); + static int ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, int reset, int enable) @@ -4679,6 +4721,9 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) do_for_each_ftrace_rec(pg, rec) { + if (rec->flags & FTRACE_FL_DISABLED) + continue; + if (ftrace_match_record(rec, &func_g, NULL, 0)) { /* if it is in the array */ exists = false; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9c143739b8d7..89a2611a1635 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -479,9 +479,7 @@ struct ring_buffer { struct ring_buffer_per_cpu **buffers; -#ifdef CONFIG_HOTPLUG_CPU - struct notifier_block cpu_notify; -#endif + struct hlist_node node; u64 (*clock)(void); struct rb_irq_work irq_work; @@ -1274,11 +1272,6 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) kfree(cpu_buffer); } -#ifdef CONFIG_HOTPLUG_CPU -static int rb_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu); -#endif - /** * __ring_buffer_alloc - allocate a new ring_buffer * @size: the size in bytes per cpu that is needed. @@ -1296,6 +1289,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, long nr_pages; int bsize; int cpu; + int ret; /* keep it in its own cache line */ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), @@ -1303,7 +1297,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (!buffer) return NULL; - if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) goto fail_free_buffer; nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); @@ -1318,17 +1312,6 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (nr_pages < 2) nr_pages = 2; - /* - * In case of non-hotplug cpu, if the ring-buffer is allocated - * in early initcall, it will not be notified of secondary cpus. - * In that off case, we need to allocate for all possible cpus. - */ -#ifdef CONFIG_HOTPLUG_CPU - cpu_notifier_register_begin(); - cpumask_copy(buffer->cpumask, cpu_online_mask); -#else - cpumask_copy(buffer->cpumask, cpu_possible_mask); -#endif buffer->cpus = nr_cpu_ids; bsize = sizeof(void *) * nr_cpu_ids; @@ -1337,19 +1320,15 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (!buffer->buffers) goto fail_free_cpumask; - for_each_buffer_cpu(buffer, cpu) { - buffer->buffers[cpu] = - rb_allocate_cpu_buffer(buffer, nr_pages, cpu); - if (!buffer->buffers[cpu]) - goto fail_free_buffers; - } + cpu = raw_smp_processor_id(); + cpumask_set_cpu(cpu, buffer->cpumask); + buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); + if (!buffer->buffers[cpu]) + goto fail_free_buffers; -#ifdef CONFIG_HOTPLUG_CPU - buffer->cpu_notify.notifier_call = rb_cpu_notify; - buffer->cpu_notify.priority = 0; - __register_cpu_notifier(&buffer->cpu_notify); - cpu_notifier_register_done(); -#endif + ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); + if (ret < 0) + goto fail_free_buffers; mutex_init(&buffer->mutex); @@ -1364,9 +1343,6 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, fail_free_cpumask: free_cpumask_var(buffer->cpumask); -#ifdef CONFIG_HOTPLUG_CPU - cpu_notifier_register_done(); -#endif fail_free_buffer: kfree(buffer); @@ -1383,18 +1359,11 @@ ring_buffer_free(struct ring_buffer *buffer) { int cpu; -#ifdef CONFIG_HOTPLUG_CPU - cpu_notifier_register_begin(); - __unregister_cpu_notifier(&buffer->cpu_notify); -#endif + cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); for_each_buffer_cpu(buffer, cpu) rb_free_cpu_buffer(buffer->buffers[cpu]); -#ifdef CONFIG_HOTPLUG_CPU - cpu_notifier_register_done(); -#endif - kfree(buffer->buffers); free_cpumask_var(buffer->cpumask); @@ -4633,62 +4602,48 @@ int ring_buffer_read_page(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_read_page); -#ifdef CONFIG_HOTPLUG_CPU -static int rb_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +/* + * We only allocate new buffers, never free them if the CPU goes down. + * If we were to free the buffer, then the user would lose any trace that was in + * the buffer. + */ +int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node) { - struct ring_buffer *buffer = - container_of(self, struct ring_buffer, cpu_notify); - long cpu = (long)hcpu; + struct ring_buffer *buffer; long nr_pages_same; int cpu_i; unsigned long nr_pages; - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - if (cpumask_test_cpu(cpu, buffer->cpumask)) - return NOTIFY_OK; - - nr_pages = 0; - nr_pages_same = 1; - /* check if all cpu sizes are same */ - for_each_buffer_cpu(buffer, cpu_i) { - /* fill in the size from first enabled cpu */ - if (nr_pages == 0) - nr_pages = buffer->buffers[cpu_i]->nr_pages; - if (nr_pages != buffer->buffers[cpu_i]->nr_pages) { - nr_pages_same = 0; - break; - } - } - /* allocate minimum pages, user can later expand it */ - if (!nr_pages_same) - nr_pages = 2; - buffer->buffers[cpu] = - rb_allocate_cpu_buffer(buffer, nr_pages, cpu); - if (!buffer->buffers[cpu]) { - WARN(1, "failed to allocate ring buffer on CPU %ld\n", - cpu); - return NOTIFY_OK; + buffer = container_of(node, struct ring_buffer, node); + if (cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + nr_pages = 0; + nr_pages_same = 1; + /* check if all cpu sizes are same */ + for_each_buffer_cpu(buffer, cpu_i) { + /* fill in the size from first enabled cpu */ + if (nr_pages == 0) + nr_pages = buffer->buffers[cpu_i]->nr_pages; + if (nr_pages != buffer->buffers[cpu_i]->nr_pages) { + nr_pages_same = 0; + break; } - smp_wmb(); - cpumask_set_cpu(cpu, buffer->cpumask); - break; - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - /* - * Do nothing. - * If we were to free the buffer, then the user would - * lose any trace that was in the buffer. - */ - break; - default: - break; } - return NOTIFY_OK; + /* allocate minimum pages, user can later expand it */ + if (!nr_pages_same) + nr_pages = 2; + buffer->buffers[cpu] = + rb_allocate_cpu_buffer(buffer, nr_pages, cpu); + if (!buffer->buffers[cpu]) { + WARN(1, "failed to allocate ring buffer on CPU %u\n", + cpu); + return -ENOMEM; + } + smp_wmb(); + cpumask_set_cpu(cpu, buffer->cpumask); + return 0; } -#endif #ifdef CONFIG_RING_BUFFER_STARTUP_TEST /* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7bc56762ca35..54d5270a5042 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1047,7 +1047,7 @@ void disable_trace_on_warning(void) * * Shows real state of the ring buffer if it is enabled or not. */ -static int tracer_tracing_is_on(struct trace_array *tr) +int tracer_tracing_is_on(struct trace_array *tr) { if (tr->trace_buffer.buffer) return ring_buffer_record_is_on(tr->trace_buffer.buffer); @@ -1125,6 +1125,7 @@ static struct { { trace_clock, "perf", 1 }, { ktime_get_mono_fast_ns, "mono", 1 }, { ktime_get_raw_fast_ns, "mono_raw", 1 }, + { ktime_get_boot_fast_ns, "boot", 1 }, ARCH_TRACE_CLOCKS }; @@ -4123,6 +4124,30 @@ static const char readme_msg[] = "\t\t\t traces\n" #endif #endif /* CONFIG_STACK_TRACER */ +#ifdef CONFIG_KPROBE_EVENT + " kprobe_events\t\t- Add/remove/show the kernel dynamic events\n" + "\t\t\t Write into this file to define/undefine new trace events.\n" +#endif +#ifdef CONFIG_UPROBE_EVENT + " uprobe_events\t\t- Add/remove/show the userspace dynamic events\n" + "\t\t\t Write into this file to define/undefine new trace events.\n" +#endif +#if defined(CONFIG_KPROBE_EVENT) || defined(CONFIG_UPROBE_EVENT) + "\t accepts: event-definitions (one definition per line)\n" + "\t Format: p|r[:[<group>/]<event>] <place> [<args>]\n" + "\t -:[<group>/]<event>\n" +#ifdef CONFIG_KPROBE_EVENT + "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" +#endif +#ifdef CONFIG_UPROBE_EVENT + "\t place: <path>:<offset>\n" +#endif + "\t args: <name>=fetcharg[:type]\n" + "\t fetcharg: %<register>, @<address>, @<symbol>[+|-<offset>],\n" + "\t $stack<index>, $stack, $retval, $comm\n" + "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string,\n" + "\t b<bit-width>@<bit-offset>/<container-size>\n" +#endif " events/\t\t- Directory containing all trace event subsystems:\n" " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" " events/<system>/\t- Directory containing all trace events for <system>:\n" @@ -4945,7 +4970,7 @@ out: return ret; } -#ifdef CONFIG_TRACER_MAX_TRACE +#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) static ssize_t tracing_max_lat_read(struct file *filp, char __user *ubuf, @@ -5868,7 +5893,7 @@ static const struct file_operations tracing_thresh_fops = { .llseek = generic_file_llseek, }; -#ifdef CONFIG_TRACER_MAX_TRACE +#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -7198,7 +7223,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) create_trace_options_dir(tr); -#ifdef CONFIG_TRACER_MAX_TRACE +#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) trace_create_file("tracing_max_latency", 0644, d_tracer, &tr->max_latency, &tracing_max_lat_fops); #endif @@ -7635,10 +7660,21 @@ __init static int tracer_alloc_buffers(void) raw_spin_lock_init(&global_trace.start_lock); + /* + * The prepare callbacks allocates some memory for the ring buffer. We + * don't free the buffer if the if the CPU goes down. If we were to free + * the buffer, then the user would lose any trace that was in the + * buffer. The memory will be removed once the "instance" is removed. + */ + ret = cpuhp_setup_state_multi(CPUHP_TRACE_RB_PREPARE, + "trace/RB:preapre", trace_rb_cpu_prepare, + NULL); + if (ret < 0) + goto out_free_cpumask; /* Used for event triggers */ temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE); if (!temp_buffer) - goto out_free_cpumask; + goto out_rm_hp_state; if (trace_create_savedcmd() < 0) goto out_free_temp_buffer; @@ -7699,6 +7735,8 @@ out_free_savedcmd: free_saved_cmdlines_buffer(savedcmd); out_free_temp_buffer: ring_buffer_free(temp_buffer); +out_rm_hp_state: + cpuhp_remove_multi_state(CPUHP_TRACE_RB_PREPARE); out_free_cpumask: free_cpumask_var(global_trace.tracing_cpumask); out_free_buffer_mask: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f783df416726..fd24b1f9ac43 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -38,6 +38,7 @@ enum trace_type { TRACE_USER_STACK, TRACE_BLK, TRACE_BPUTS, + TRACE_HWLAT, __TRACE_LAST_TYPE, }; @@ -213,6 +214,8 @@ struct trace_array { */ struct trace_buffer max_buffer; bool allocated_snapshot; +#endif +#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) unsigned long max_latency; #endif struct trace_pid_list __rcu *filtered_pids; @@ -326,6 +329,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ + IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ @@ -571,6 +575,7 @@ void tracing_reset_current(int cpu); void tracing_reset_all_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); bool tracing_is_disabled(void); +int tracer_tracing_is_on(struct trace_array *tr); struct dentry *trace_create_file(const char *name, umode_t mode, struct dentry *parent, diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 5c30efcda5e6..d1cc37e78f99 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -322,3 +322,30 @@ FTRACE_ENTRY(branch, trace_branch, FILTER_OTHER ); + +FTRACE_ENTRY(hwlat, hwlat_entry, + + TRACE_HWLAT, + + F_STRUCT( + __field( u64, duration ) + __field( u64, outer_duration ) + __field( u64, nmi_total_ts ) + __field_struct( struct timespec, timestamp ) + __field_desc( long, timestamp, tv_sec ) + __field_desc( long, timestamp, tv_nsec ) + __field( unsigned int, nmi_count ) + __field( unsigned int, seqnum ) + ), + + F_printk("cnt:%u\tts:%010lu.%010lu\tinner:%llu\touter:%llunmi-ts:%llu\tnmi-count:%u\n", + __entry->seqnum, + __entry->tv_sec, + __entry->tv_nsec, + __entry->duration, + __entry->outer_duration, + __entry->nmi_total_ts, + __entry->nmi_count), + + FILTER_OTHER +); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index a975571cde24..6721a1e89f39 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1028,6 +1028,7 @@ static struct event_command trigger_traceon_cmd = { static struct event_command trigger_traceoff_cmd = { .name = "traceoff", .trigger_type = ETT_TRACE_ONOFF, + .flags = EVENT_CMD_FL_POST_TRIGGER, .func = event_trigger_callback, .reg = register_trigger, .unreg = unregister_trigger, diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 7363ccf79512..4e480e870474 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -119,7 +119,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, /* Add a function return address to the trace stack on thread info.*/ int ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, - unsigned long frame_pointer) + unsigned long frame_pointer, unsigned long *retp) { unsigned long long calltime; int index; @@ -170,8 +170,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, current->ret_stack[index].ret = ret; current->ret_stack[index].func = func; current->ret_stack[index].calltime = calltime; - current->ret_stack[index].subtime = 0; +#ifdef HAVE_FUNCTION_GRAPH_FP_TEST current->ret_stack[index].fp = frame_pointer; +#endif +#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR + current->ret_stack[index].retp = retp; +#endif *depth = current->curr_ret_stack; return 0; @@ -204,7 +208,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, return; } -#if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY) +#ifdef HAVE_FUNCTION_GRAPH_FP_TEST /* * The arch may choose to record the frame pointer used * and check it here to make sure that it is what we expect it @@ -279,6 +283,64 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } +/** + * ftrace_graph_ret_addr - convert a potentially modified stack return address + * to its original value + * + * This function can be called by stack unwinding code to convert a found stack + * return address ('ret') to its original value, in case the function graph + * tracer has modified it to be 'return_to_handler'. If the address hasn't + * been modified, the unchanged value of 'ret' is returned. + * + * 'idx' is a state variable which should be initialized by the caller to zero + * before the first call. + * + * 'retp' is a pointer to the return address on the stack. It's ignored if + * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined. + */ +#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR +unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, + unsigned long ret, unsigned long *retp) +{ + int index = task->curr_ret_stack; + int i; + + if (ret != (unsigned long)return_to_handler) + return ret; + + if (index < -1) + index += FTRACE_NOTRACE_DEPTH; + + if (index < 0) + return ret; + + for (i = 0; i <= index; i++) + if (task->ret_stack[i].retp == retp) + return task->ret_stack[i].ret; + + return ret; +} +#else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ +unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, + unsigned long ret, unsigned long *retp) +{ + int task_idx; + + if (ret != (unsigned long)return_to_handler) + return ret; + + task_idx = task->curr_ret_stack; + + if (!task->ret_stack || task_idx < *idx) + return ret; + + task_idx -= *idx; + (*idx)++; + + return task->ret_stack[task_idx].ret; +} +#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ + int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned long flags, @@ -1120,6 +1182,11 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, trace_seq_puts(s, "/* "); switch (iter->ent->type) { + case TRACE_BPUTS: + ret = trace_print_bputs_msg_only(iter); + if (ret != TRACE_TYPE_HANDLED) + return ret; + break; case TRACE_BPRINT: ret = trace_print_bprintk_msg_only(iter); if (ret != TRACE_TYPE_HANDLED) diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c new file mode 100644 index 000000000000..b97286c48735 --- /dev/null +++ b/kernel/trace/trace_hwlat.c @@ -0,0 +1,633 @@ +/* + * trace_hwlatdetect.c - A simple Hardware Latency detector. + * + * Use this tracer to detect large system latencies induced by the behavior of + * certain underlying system hardware or firmware, independent of Linux itself. + * The code was developed originally to detect the presence of SMIs on Intel + * and AMD systems, although there is no dependency upon x86 herein. + * + * The classical example usage of this tracer is in detecting the presence of + * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a + * somewhat special form of hardware interrupt spawned from earlier CPU debug + * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge + * LPC (or other device) to generate a special interrupt under certain + * circumstances, for example, upon expiration of a special SMI timer device, + * due to certain external thermal readings, on certain I/O address accesses, + * and other situations. An SMI hits a special CPU pin, triggers a special + * SMI mode (complete with special memory map), and the OS is unaware. + * + * Although certain hardware-inducing latencies are necessary (for example, + * a modern system often requires an SMI handler for correct thermal control + * and remote management) they can wreak havoc upon any OS-level performance + * guarantees toward low-latency, especially when the OS is not even made + * aware of the presence of these interrupts. For this reason, we need a + * somewhat brute force mechanism to detect these interrupts. In this case, + * we do it by hogging all of the CPU(s) for configurable timer intervals, + * sampling the built-in CPU timer, looking for discontiguous readings. + * + * WARNING: This implementation necessarily introduces latencies. Therefore, + * you should NEVER use this tracer while running in a production + * environment requiring any kind of low-latency performance + * guarantee(s). + * + * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com> + * Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. <srostedt@redhat.com> + * + * Includes useful feedback from Clark Williams <clark@redhat.com> + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ +#include <linux/kthread.h> +#include <linux/tracefs.h> +#include <linux/uaccess.h> +#include <linux/cpumask.h> +#include <linux/delay.h> +#include "trace.h" + +static struct trace_array *hwlat_trace; + +#define U64STR_SIZE 22 /* 20 digits max */ + +#define BANNER "hwlat_detector: " +#define DEFAULT_SAMPLE_WINDOW 1000000 /* 1s */ +#define DEFAULT_SAMPLE_WIDTH 500000 /* 0.5s */ +#define DEFAULT_LAT_THRESHOLD 10 /* 10us */ + +/* sampling thread*/ +static struct task_struct *hwlat_kthread; + +static struct dentry *hwlat_sample_width; /* sample width us */ +static struct dentry *hwlat_sample_window; /* sample window us */ + +/* Save the previous tracing_thresh value */ +static unsigned long save_tracing_thresh; + +/* NMI timestamp counters */ +static u64 nmi_ts_start; +static u64 nmi_total_ts; +static int nmi_count; +static int nmi_cpu; + +/* Tells NMIs to call back to the hwlat tracer to record timestamps */ +bool trace_hwlat_callback_enabled; + +/* If the user changed threshold, remember it */ +static u64 last_tracing_thresh = DEFAULT_LAT_THRESHOLD * NSEC_PER_USEC; + +/* Individual latency samples are stored here when detected. */ +struct hwlat_sample { + u64 seqnum; /* unique sequence */ + u64 duration; /* delta */ + u64 outer_duration; /* delta (outer loop) */ + u64 nmi_total_ts; /* Total time spent in NMIs */ + struct timespec timestamp; /* wall time */ + int nmi_count; /* # NMIs during this sample */ +}; + +/* keep the global state somewhere. */ +static struct hwlat_data { + + struct mutex lock; /* protect changes */ + + u64 count; /* total since reset */ + + u64 sample_window; /* total sampling window (on+off) */ + u64 sample_width; /* active sampling portion of window */ + +} hwlat_data = { + .sample_window = DEFAULT_SAMPLE_WINDOW, + .sample_width = DEFAULT_SAMPLE_WIDTH, +}; + +static void trace_hwlat_sample(struct hwlat_sample *sample) +{ + struct trace_array *tr = hwlat_trace; + struct trace_event_call *call = &event_hwlat; + struct ring_buffer *buffer = tr->trace_buffer.buffer; + struct ring_buffer_event *event; + struct hwlat_entry *entry; + unsigned long flags; + int pc; + + pc = preempt_count(); + local_save_flags(flags); + + event = trace_buffer_lock_reserve(buffer, TRACE_HWLAT, sizeof(*entry), + flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->seqnum = sample->seqnum; + entry->duration = sample->duration; + entry->outer_duration = sample->outer_duration; + entry->timestamp = sample->timestamp; + entry->nmi_total_ts = sample->nmi_total_ts; + entry->nmi_count = sample->nmi_count; + + if (!call_filter_check_discard(call, entry, buffer, event)) + __buffer_unlock_commit(buffer, event); +} + +/* Macros to encapsulate the time capturing infrastructure */ +#define time_type u64 +#define time_get() trace_clock_local() +#define time_to_us(x) div_u64(x, 1000) +#define time_sub(a, b) ((a) - (b)) +#define init_time(a, b) (a = b) +#define time_u64(a) a + +void trace_hwlat_callback(bool enter) +{ + if (smp_processor_id() != nmi_cpu) + return; + + /* + * Currently trace_clock_local() calls sched_clock() and the + * generic version is not NMI safe. + */ + if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) { + if (enter) + nmi_ts_start = time_get(); + else + nmi_total_ts = time_get() - nmi_ts_start; + } + + if (enter) + nmi_count++; +} + +/** + * get_sample - sample the CPU TSC and look for likely hardware latencies + * + * Used to repeatedly capture the CPU TSC (or similar), looking for potential + * hardware-induced latency. Called with interrupts disabled and with + * hwlat_data.lock held. + */ +static int get_sample(void) +{ + struct trace_array *tr = hwlat_trace; + time_type start, t1, t2, last_t2; + s64 diff, total, last_total = 0; + u64 sample = 0; + u64 thresh = tracing_thresh; + u64 outer_sample = 0; + int ret = -1; + + do_div(thresh, NSEC_PER_USEC); /* modifies interval value */ + + nmi_cpu = smp_processor_id(); + nmi_total_ts = 0; + nmi_count = 0; + /* Make sure NMIs see this first */ + barrier(); + + trace_hwlat_callback_enabled = true; + + init_time(last_t2, 0); + start = time_get(); /* start timestamp */ + + do { + + t1 = time_get(); /* we'll look for a discontinuity */ + t2 = time_get(); + + if (time_u64(last_t2)) { + /* Check the delta from outer loop (t2 to next t1) */ + diff = time_to_us(time_sub(t1, last_t2)); + /* This shouldn't happen */ + if (diff < 0) { + pr_err(BANNER "time running backwards\n"); + goto out; + } + if (diff > outer_sample) + outer_sample = diff; + } + last_t2 = t2; + + total = time_to_us(time_sub(t2, start)); /* sample width */ + + /* Check for possible overflows */ + if (total < last_total) { + pr_err("Time total overflowed\n"); + break; + } + last_total = total; + + /* This checks the inner loop (t1 to t2) */ + diff = time_to_us(time_sub(t2, t1)); /* current diff */ + + /* This shouldn't happen */ + if (diff < 0) { + pr_err(BANNER "time running backwards\n"); + goto out; + } + + if (diff > sample) + sample = diff; /* only want highest value */ + + } while (total <= hwlat_data.sample_width); + + barrier(); /* finish the above in the view for NMIs */ + trace_hwlat_callback_enabled = false; + barrier(); /* Make sure nmi_total_ts is no longer updated */ + + ret = 0; + + /* If we exceed the threshold value, we have found a hardware latency */ + if (sample > thresh || outer_sample > thresh) { + struct hwlat_sample s; + + ret = 1; + + /* We read in microseconds */ + if (nmi_total_ts) + do_div(nmi_total_ts, NSEC_PER_USEC); + + hwlat_data.count++; + s.seqnum = hwlat_data.count; + s.duration = sample; + s.outer_duration = outer_sample; + s.timestamp = CURRENT_TIME; + s.nmi_total_ts = nmi_total_ts; + s.nmi_count = nmi_count; + trace_hwlat_sample(&s); + + /* Keep a running maximum ever recorded hardware latency */ + if (sample > tr->max_latency) + tr->max_latency = sample; + } + +out: + return ret; +} + +static struct cpumask save_cpumask; +static bool disable_migrate; + +static void move_to_next_cpu(void) +{ + static struct cpumask *current_mask; + int next_cpu; + + if (disable_migrate) + return; + + /* Just pick the first CPU on first iteration */ + if (!current_mask) { + current_mask = &save_cpumask; + get_online_cpus(); + cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); + put_online_cpus(); + next_cpu = cpumask_first(current_mask); + goto set_affinity; + } + + /* + * If for some reason the user modifies the CPU affinity + * of this thread, than stop migrating for the duration + * of the current test. + */ + if (!cpumask_equal(current_mask, ¤t->cpus_allowed)) + goto disable; + + get_online_cpus(); + cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); + next_cpu = cpumask_next(smp_processor_id(), current_mask); + put_online_cpus(); + + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(current_mask); + + set_affinity: + if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */ + goto disable; + + cpumask_clear(current_mask); + cpumask_set_cpu(next_cpu, current_mask); + + sched_setaffinity(0, current_mask); + return; + + disable: + disable_migrate = true; +} + +/* + * kthread_fn - The CPU time sampling/hardware latency detection kernel thread + * + * Used to periodically sample the CPU TSC via a call to get_sample. We + * disable interrupts, which does (intentionally) introduce latency since we + * need to ensure nothing else might be running (and thus preempting). + * Obviously this should never be used in production environments. + * + * Currently this runs on which ever CPU it was scheduled on, but most + * real-world hardware latency situations occur across several CPUs, + * but we might later generalize this if we find there are any actualy + * systems with alternate SMI delivery or other hardware latencies. + */ +static int kthread_fn(void *data) +{ + u64 interval; + + while (!kthread_should_stop()) { + + move_to_next_cpu(); + + local_irq_disable(); + get_sample(); + local_irq_enable(); + + mutex_lock(&hwlat_data.lock); + interval = hwlat_data.sample_window - hwlat_data.sample_width; + mutex_unlock(&hwlat_data.lock); + + do_div(interval, USEC_PER_MSEC); /* modifies interval value */ + + /* Always sleep for at least 1ms */ + if (interval < 1) + interval = 1; + + if (msleep_interruptible(interval)) + break; + } + + return 0; +} + +/** + * start_kthread - Kick off the hardware latency sampling/detector kthread + * + * This starts the kernel thread that will sit and sample the CPU timestamp + * counter (TSC or similar) and look for potential hardware latencies. + */ +static int start_kthread(struct trace_array *tr) +{ + struct task_struct *kthread; + + kthread = kthread_create(kthread_fn, NULL, "hwlatd"); + if (IS_ERR(kthread)) { + pr_err(BANNER "could not start sampling thread\n"); + return -ENOMEM; + } + hwlat_kthread = kthread; + wake_up_process(kthread); + + return 0; +} + +/** + * stop_kthread - Inform the hardware latency samping/detector kthread to stop + * + * This kicks the running hardware latency sampling/detector kernel thread and + * tells it to stop sampling now. Use this on unload and at system shutdown. + */ +static void stop_kthread(void) +{ + if (!hwlat_kthread) + return; + kthread_stop(hwlat_kthread); + hwlat_kthread = NULL; +} + +/* + * hwlat_read - Wrapper read function for reading both window and width + * @filp: The active open file structure + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a generic read implementation for the global state + * "hwlat_data" structure filesystem entries. + */ +static ssize_t hwlat_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[U64STR_SIZE]; + u64 *entry = filp->private_data; + u64 val; + int len; + + if (!entry) + return -EFAULT; + + if (cnt > sizeof(buf)) + cnt = sizeof(buf); + + val = *entry; + + len = snprintf(buf, sizeof(buf), "%llu\n", val); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); +} + +/** + * hwlat_width_write - Write function for "width" entry + * @filp: The active open file structure + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in @file + * + * This function provides a write implementation for the "width" interface + * to the hardware latency detector. It can be used to configure + * for how many us of the total window us we will actively sample for any + * hardware-induced latency periods. Obviously, it is not possible to + * sample constantly and have the system respond to a sample reader, or, + * worse, without having the system appear to have gone out to lunch. It + * is enforced that width is less that the total window size. + */ +static ssize_t +hwlat_width_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + u64 val; + int err; + + err = kstrtoull_from_user(ubuf, cnt, 10, &val); + if (err) + return err; + + mutex_lock(&hwlat_data.lock); + if (val < hwlat_data.sample_window) + hwlat_data.sample_width = val; + else + err = -EINVAL; + mutex_unlock(&hwlat_data.lock); + + if (err) + return err; + + return cnt; +} + +/** + * hwlat_window_write - Write function for "window" entry + * @filp: The active open file structure + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in @file + * + * This function provides a write implementation for the "window" interface + * to the hardware latency detetector. The window is the total time + * in us that will be considered one sample period. Conceptually, windows + * occur back-to-back and contain a sample width period during which + * actual sampling occurs. Can be used to write a new total window size. It + * is enfoced that any value written must be greater than the sample width + * size, or an error results. + */ +static ssize_t +hwlat_window_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + u64 val; + int err; + + err = kstrtoull_from_user(ubuf, cnt, 10, &val); + if (err) + return err; + + mutex_lock(&hwlat_data.lock); + if (hwlat_data.sample_width < val) + hwlat_data.sample_window = val; + else + err = -EINVAL; + mutex_unlock(&hwlat_data.lock); + + if (err) + return err; + + return cnt; +} + +static const struct file_operations width_fops = { + .open = tracing_open_generic, + .read = hwlat_read, + .write = hwlat_width_write, +}; + +static const struct file_operations window_fops = { + .open = tracing_open_generic, + .read = hwlat_read, + .write = hwlat_window_write, +}; + +/** + * init_tracefs - A function to initialize the tracefs interface files + * + * This function creates entries in tracefs for "hwlat_detector". + * It creates the hwlat_detector directory in the tracing directory, + * and within that directory is the count, width and window files to + * change and view those values. + */ +static int init_tracefs(void) +{ + struct dentry *d_tracer; + struct dentry *top_dir; + + d_tracer = tracing_init_dentry(); + if (IS_ERR(d_tracer)) + return -ENOMEM; + + top_dir = tracefs_create_dir("hwlat_detector", d_tracer); + if (!top_dir) + return -ENOMEM; + + hwlat_sample_window = tracefs_create_file("window", 0640, + top_dir, + &hwlat_data.sample_window, + &window_fops); + if (!hwlat_sample_window) + goto err; + + hwlat_sample_width = tracefs_create_file("width", 0644, + top_dir, + &hwlat_data.sample_width, + &width_fops); + if (!hwlat_sample_width) + goto err; + + return 0; + + err: + tracefs_remove_recursive(top_dir); + return -ENOMEM; +} + +static void hwlat_tracer_start(struct trace_array *tr) +{ + int err; + + err = start_kthread(tr); + if (err) + pr_err(BANNER "Cannot start hwlat kthread\n"); +} + +static void hwlat_tracer_stop(struct trace_array *tr) +{ + stop_kthread(); +} + +static bool hwlat_busy; + +static int hwlat_tracer_init(struct trace_array *tr) +{ + /* Only allow one instance to enable this */ + if (hwlat_busy) + return -EBUSY; + + hwlat_trace = tr; + + disable_migrate = false; + hwlat_data.count = 0; + tr->max_latency = 0; + save_tracing_thresh = tracing_thresh; + + /* tracing_thresh is in nsecs, we speak in usecs */ + if (!tracing_thresh) + tracing_thresh = last_tracing_thresh; + + if (tracer_tracing_is_on(tr)) + hwlat_tracer_start(tr); + + hwlat_busy = true; + + return 0; +} + +static void hwlat_tracer_reset(struct trace_array *tr) +{ + stop_kthread(); + + /* the tracing threshold is static between runs */ + last_tracing_thresh = tracing_thresh; + + tracing_thresh = save_tracing_thresh; + hwlat_busy = false; +} + +static struct tracer hwlat_tracer __read_mostly = +{ + .name = "hwlat", + .init = hwlat_tracer_init, + .reset = hwlat_tracer_reset, + .start = hwlat_tracer_start, + .stop = hwlat_tracer_stop, + .allow_instances = true, +}; + +__init static int init_hwlat_tracer(void) +{ + int ret; + + mutex_init(&hwlat_data.lock); + + ret = register_tracer(&hwlat_tracer); + if (ret) + return ret; + + init_tracefs(); + + return 0; +} +late_initcall(init_hwlat_tracer); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 9aedb0b06683..eb6c9f1d3a93 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -253,6 +253,10 @@ static const struct fetch_type kprobes_fetch_type_table[] = { ASSIGN_FETCH_TYPE(s16, u16, 1), ASSIGN_FETCH_TYPE(s32, u32, 1), ASSIGN_FETCH_TYPE(s64, u64, 1), + ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0), + ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), + ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), + ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), ASSIGN_FETCH_TYPE_END }; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0bb9cf2d53e6..3fc20422c166 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1098,6 +1098,71 @@ static struct trace_event trace_user_stack_event = { .funcs = &trace_user_stack_funcs, }; +/* TRACE_HWLAT */ +static enum print_line_t +trace_hwlat_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct hwlat_entry *field; + + trace_assign_type(field, entry); + + trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ld.%09ld", + field->seqnum, + field->duration, + field->outer_duration, + field->timestamp.tv_sec, + field->timestamp.tv_nsec); + + if (field->nmi_count) { + /* + * The generic sched_clock() is not NMI safe, thus + * we only record the count and not the time. + */ + if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) + trace_seq_printf(s, " nmi-total:%llu", + field->nmi_total_ts); + trace_seq_printf(s, " nmi-count:%u", + field->nmi_count); + } + + trace_seq_putc(s, '\n'); + + return trace_handle_return(s); +} + + +static enum print_line_t +trace_hwlat_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct hwlat_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(s, "%llu %lld %ld %09ld %u\n", + field->duration, + field->outer_duration, + field->timestamp.tv_sec, + field->timestamp.tv_nsec, + field->seqnum); + + return trace_handle_return(s); +} + +static struct trace_event_functions trace_hwlat_funcs = { + .trace = trace_hwlat_print, + .raw = trace_hwlat_raw, +}; + +static struct trace_event trace_hwlat_event = { + .type = TRACE_HWLAT, + .funcs = &trace_hwlat_funcs, +}; + /* TRACE_BPUTS */ static enum print_line_t trace_bputs_print(struct trace_iterator *iter, int flags, @@ -1233,6 +1298,7 @@ static struct trace_event *events[] __initdata = { &trace_bputs_event, &trace_bprint_event, &trace_print_event, + &trace_hwlat_event, NULL }; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 74e80a582c28..8c0553d9afd3 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -36,24 +36,28 @@ const char *reserved_field_names[] = { }; /* Printing in basic type function template */ -#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \ -int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ +#define DEFINE_BASIC_PRINT_TYPE_FUNC(tname, type, fmt) \ +int PRINT_TYPE_FUNC_NAME(tname)(struct trace_seq *s, const char *name, \ void *data, void *ent) \ { \ trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ return !trace_seq_has_overflowed(s); \ } \ -const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \ -NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type)); - -DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x") -DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x") -DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "0x%x") -DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "0x%Lx") -DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d") -DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d") -DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d") -DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld") +const char PRINT_TYPE_FMT_NAME(tname)[] = fmt; \ +NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(tname)); + +DEFINE_BASIC_PRINT_TYPE_FUNC(u8, u8, "%u") +DEFINE_BASIC_PRINT_TYPE_FUNC(u16, u16, "%u") +DEFINE_BASIC_PRINT_TYPE_FUNC(u32, u32, "%u") +DEFINE_BASIC_PRINT_TYPE_FUNC(u64, u64, "%Lu") +DEFINE_BASIC_PRINT_TYPE_FUNC(s8, s8, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s16, s16, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s32, s32, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s64, s64, "%Ld") +DEFINE_BASIC_PRINT_TYPE_FUNC(x8, u8, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(x16, u16, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(x32, u32, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(x64, u64, "0x%Lx") /* Print type function for string type */ int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 45400ca5ded1..0c0ae54d44c6 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -149,6 +149,11 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(s8); DECLARE_BASIC_PRINT_TYPE_FUNC(s16); DECLARE_BASIC_PRINT_TYPE_FUNC(s32); DECLARE_BASIC_PRINT_TYPE_FUNC(s64); +DECLARE_BASIC_PRINT_TYPE_FUNC(x8); +DECLARE_BASIC_PRINT_TYPE_FUNC(x16); +DECLARE_BASIC_PRINT_TYPE_FUNC(x32); +DECLARE_BASIC_PRINT_TYPE_FUNC(x64); + DECLARE_BASIC_PRINT_TYPE_FUNC(string); #define FETCH_FUNC_NAME(method, type) fetch_##method##_##type @@ -203,7 +208,7 @@ DEFINE_FETCH_##method(u32) \ DEFINE_FETCH_##method(u64) /* Default (unsigned long) fetch type */ -#define __DEFAULT_FETCH_TYPE(t) u##t +#define __DEFAULT_FETCH_TYPE(t) x##t #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) @@ -234,6 +239,10 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \ #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) +/* If ptype is an alias of atype, use this macro (show atype in format) */ +#define ASSIGN_FETCH_TYPE_ALIAS(ptype, atype, ftype, sign) \ + __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #atype) + #define ASSIGN_FETCH_TYPE_END {} #define FETCH_TYPE_STRING 0 diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index b2b6efc083a4..5e10395da88e 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -610,8 +610,7 @@ static int perf_sysenter_enable(struct trace_event_call *call) if (!sys_perf_refcount_enter) ret = register_trace_sys_enter(perf_syscall_enter, NULL); if (ret) { - pr_info("event trace: Could not activate" - "syscall entry trace point"); + pr_info("event trace: Could not activate syscall entry trace point"); } else { set_bit(num, enabled_perf_enter_syscalls); sys_perf_refcount_enter++; @@ -682,8 +681,7 @@ static int perf_sysexit_enable(struct trace_event_call *call) if (!sys_perf_refcount_exit) ret = register_trace_sys_exit(perf_syscall_exit, NULL); if (ret) { - pr_info("event trace: Could not activate" - "syscall exit trace point"); + pr_info("event trace: Could not activate syscall exit trace point"); } else { set_bit(num, enabled_perf_exit_syscalls); sys_perf_refcount_exit++; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c53485441c88..0913693caf6e 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -211,6 +211,10 @@ static const struct fetch_type uprobes_fetch_type_table[] = { ASSIGN_FETCH_TYPE(s16, u16, 1), ASSIGN_FETCH_TYPE(s32, u32, 1), ASSIGN_FETCH_TYPE(s64, u64, 1), + ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0), + ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), + ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), + ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), ASSIGN_FETCH_TYPE_END }; @@ -427,10 +431,6 @@ static int create_trace_uprobe(int argc, char **argv) pr_info("Probe point is not specified.\n"); return -EINVAL; } - if (isdigit(argv[1][0])) { - pr_info("probe point must be have a filename.\n"); - return -EINVAL; - } arg = strchr(argv[1], ':'); if (!arg) { ret = -EINVAL; diff --git a/kernel/ucount.c b/kernel/ucount.c new file mode 100644 index 000000000000..9d20d5dd298a --- /dev/null +++ b/kernel/ucount.c @@ -0,0 +1,235 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +#include <linux/stat.h> +#include <linux/sysctl.h> +#include <linux/slab.h> +#include <linux/hash.h> +#include <linux/user_namespace.h> + +#define UCOUNTS_HASHTABLE_BITS 10 +static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)]; +static DEFINE_SPINLOCK(ucounts_lock); + +#define ucounts_hashfn(ns, uid) \ + hash_long((unsigned long)__kuid_val(uid) + (unsigned long)(ns), \ + UCOUNTS_HASHTABLE_BITS) +#define ucounts_hashentry(ns, uid) \ + (ucounts_hashtable + ucounts_hashfn(ns, uid)) + + +#ifdef CONFIG_SYSCTL +static struct ctl_table_set * +set_lookup(struct ctl_table_root *root) +{ + return ¤t_user_ns()->set; +} + +static int set_is_seen(struct ctl_table_set *set) +{ + return ¤t_user_ns()->set == set; +} + +static int set_permissions(struct ctl_table_header *head, + struct ctl_table *table) +{ + struct user_namespace *user_ns = + container_of(head->set, struct user_namespace, set); + int mode; + + /* Allow users with CAP_SYS_RESOURCE unrestrained access */ + if (ns_capable(user_ns, CAP_SYS_RESOURCE)) + mode = (table->mode & S_IRWXU) >> 6; + else + /* Allow all others at most read-only access */ + mode = table->mode & S_IROTH; + return (mode << 6) | (mode << 3) | mode; +} + +static struct ctl_table_root set_root = { + .lookup = set_lookup, + .permissions = set_permissions, +}; + +static int zero = 0; +static int int_max = INT_MAX; +#define UCOUNT_ENTRY(name) \ + { \ + .procname = name, \ + .maxlen = sizeof(int), \ + .mode = 0644, \ + .proc_handler = proc_dointvec_minmax, \ + .extra1 = &zero, \ + .extra2 = &int_max, \ + } +static struct ctl_table user_table[] = { + UCOUNT_ENTRY("max_user_namespaces"), + UCOUNT_ENTRY("max_pid_namespaces"), + UCOUNT_ENTRY("max_uts_namespaces"), + UCOUNT_ENTRY("max_ipc_namespaces"), + UCOUNT_ENTRY("max_net_namespaces"), + UCOUNT_ENTRY("max_mnt_namespaces"), + UCOUNT_ENTRY("max_cgroup_namespaces"), + { } +}; +#endif /* CONFIG_SYSCTL */ + +bool setup_userns_sysctls(struct user_namespace *ns) +{ +#ifdef CONFIG_SYSCTL + struct ctl_table *tbl; + setup_sysctl_set(&ns->set, &set_root, set_is_seen); + tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL); + if (tbl) { + int i; + for (i = 0; i < UCOUNT_COUNTS; i++) { + tbl[i].data = &ns->ucount_max[i]; + } + ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl); + } + if (!ns->sysctls) { + kfree(tbl); + retire_sysctl_set(&ns->set); + return false; + } +#endif + return true; +} + +void retire_userns_sysctls(struct user_namespace *ns) +{ +#ifdef CONFIG_SYSCTL + struct ctl_table *tbl; + + tbl = ns->sysctls->ctl_table_arg; + unregister_sysctl_table(ns->sysctls); + retire_sysctl_set(&ns->set); + kfree(tbl); +#endif +} + +static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent) +{ + struct ucounts *ucounts; + + hlist_for_each_entry(ucounts, hashent, node) { + if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns)) + return ucounts; + } + return NULL; +} + +static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) +{ + struct hlist_head *hashent = ucounts_hashentry(ns, uid); + struct ucounts *ucounts, *new; + + spin_lock(&ucounts_lock); + ucounts = find_ucounts(ns, uid, hashent); + if (!ucounts) { + spin_unlock(&ucounts_lock); + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NULL; + + new->ns = ns; + new->uid = uid; + atomic_set(&new->count, 0); + + spin_lock(&ucounts_lock); + ucounts = find_ucounts(ns, uid, hashent); + if (ucounts) { + kfree(new); + } else { + hlist_add_head(&new->node, hashent); + ucounts = new; + } + } + if (!atomic_add_unless(&ucounts->count, 1, INT_MAX)) + ucounts = NULL; + spin_unlock(&ucounts_lock); + return ucounts; +} + +static void put_ucounts(struct ucounts *ucounts) +{ + if (atomic_dec_and_test(&ucounts->count)) { + spin_lock(&ucounts_lock); + hlist_del_init(&ucounts->node); + spin_unlock(&ucounts_lock); + + kfree(ucounts); + } +} + +static inline bool atomic_inc_below(atomic_t *v, int u) +{ + int c, old; + c = atomic_read(v); + for (;;) { + if (unlikely(c >= u)) + return false; + old = atomic_cmpxchg(v, c, c+1); + if (likely(old == c)) + return true; + c = old; + } +} + +struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, + enum ucount_type type) +{ + struct ucounts *ucounts, *iter, *bad; + struct user_namespace *tns; + ucounts = get_ucounts(ns, uid); + for (iter = ucounts; iter; iter = tns->ucounts) { + int max; + tns = iter->ns; + max = READ_ONCE(tns->ucount_max[type]); + if (!atomic_inc_below(&iter->ucount[type], max)) + goto fail; + } + return ucounts; +fail: + bad = iter; + for (iter = ucounts; iter != bad; iter = iter->ns->ucounts) + atomic_dec(&iter->ucount[type]); + + put_ucounts(ucounts); + return NULL; +} + +void dec_ucount(struct ucounts *ucounts, enum ucount_type type) +{ + struct ucounts *iter; + for (iter = ucounts; iter; iter = iter->ns->ucounts) { + int dec = atomic_dec_if_positive(&iter->ucount[type]); + WARN_ON_ONCE(dec < 0); + } + put_ucounts(ucounts); +} + +static __init int user_namespace_sysctl_init(void) +{ +#ifdef CONFIG_SYSCTL + static struct ctl_table_header *user_header; + static struct ctl_table empty[1]; + /* + * It is necessary to register the user directory in the + * default set so that registrations in the child sets work + * properly. + */ + user_header = register_sysctl("user", empty); + BUG_ON(!user_header); + BUG_ON(!setup_userns_sysctls(&init_user_ns)); +#endif + return 0; +} +subsys_initcall(user_namespace_sysctl_init); + + diff --git a/kernel/uid16.c b/kernel/uid16.c index d58cc4d8f0d1..cc40793464e3 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -117,7 +117,7 @@ static int groups16_to_user(old_gid_t __user *grouplist, kgid_t kgid; for (i = 0; i < group_info->ngroups; i++) { - kgid = GROUP_AT(group_info, i); + kgid = group_info->gid[i]; group = high2lowgid(from_kgid_munged(user_ns, kgid)); if (put_user(group, grouplist+i)) return -EFAULT; @@ -142,7 +142,7 @@ static int groups16_from_user(struct group_info *group_info, if (!gid_valid(kgid)) return -EINVAL; - GROUP_AT(group_info, i) = kgid; + group_info->gid[i] = kgid; } return 0; diff --git a/kernel/up.c b/kernel/up.c index 1760bf3d1463..ee81ac9af4ca 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -6,6 +6,7 @@ #include <linux/kernel.h> #include <linux/export.h> #include <linux/smp.h> +#include <linux/hypervisor.h> int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int wait) @@ -82,3 +83,20 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), preempt_enable(); } EXPORT_SYMBOL(on_each_cpu_cond); + +int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) +{ + int ret; + + if (cpu != 0) + return -ENXIO; + + if (phys) + hypervisor_pin_vcpu(0); + ret = func(par); + if (phys) + hypervisor_pin_vcpu(-1); + + return ret; +} +EXPORT_SYMBOL_GPL(smp_call_on_cpu); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 68f594212759..86b7854fec8e 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -29,6 +29,17 @@ static DEFINE_MUTEX(userns_state_mutex); static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); +static void free_user_ns(struct work_struct *work); + +static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid) +{ + return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES); +} + +static void dec_user_namespaces(struct ucounts *ucounts) +{ + return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES); +} static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) { @@ -62,10 +73,16 @@ int create_user_ns(struct cred *new) struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; - int ret; + struct ucounts *ucounts; + int ret, i; + ret = -ENOSPC; if (parent_ns->level > 32) - return -EUSERS; + goto fail; + + ucounts = inc_user_namespaces(parent_ns, owner); + if (!ucounts) + goto fail; /* * Verify that we can not violate the policy of which files @@ -73,26 +90,27 @@ int create_user_ns(struct cred *new) * by verifing that the root directory is at the root of the * mount namespace which allows all files to be accessed. */ + ret = -EPERM; if (current_chrooted()) - return -EPERM; + goto fail_dec; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. */ + ret = -EPERM; if (!kuid_has_mapping(parent_ns, owner) || !kgid_has_mapping(parent_ns, group)) - return -EPERM; + goto fail_dec; + ret = -ENOMEM; ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); if (!ns) - return -ENOMEM; + goto fail_dec; ret = ns_alloc_inum(&ns->ns); - if (ret) { - kmem_cache_free(user_ns_cachep, ns); - return ret; - } + if (ret) + goto fail_free; ns->ns.ops = &userns_operations; atomic_set(&ns->count, 1); @@ -101,18 +119,37 @@ int create_user_ns(struct cred *new) ns->level = parent_ns->level + 1; ns->owner = owner; ns->group = group; + INIT_WORK(&ns->work, free_user_ns); + for (i = 0; i < UCOUNT_COUNTS; i++) { + ns->ucount_max[i] = INT_MAX; + } + ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ mutex_lock(&userns_state_mutex); ns->flags = parent_ns->flags; mutex_unlock(&userns_state_mutex); - set_cred_user_ns(new, ns); - #ifdef CONFIG_PERSISTENT_KEYRINGS init_rwsem(&ns->persistent_keyring_register_sem); #endif + ret = -ENOMEM; + if (!setup_userns_sysctls(ns)) + goto fail_keyring; + + set_cred_user_ns(new, ns); return 0; +fail_keyring: +#ifdef CONFIG_PERSISTENT_KEYRINGS + key_put(ns->persistent_keyring_register); +#endif + ns_free_inum(&ns->ns); +fail_free: + kmem_cache_free(user_ns_cachep, ns); +fail_dec: + dec_user_namespaces(ucounts); +fail: + return ret; } int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) @@ -135,21 +172,30 @@ int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) return err; } -void free_user_ns(struct user_namespace *ns) +static void free_user_ns(struct work_struct *work) { - struct user_namespace *parent; + struct user_namespace *parent, *ns = + container_of(work, struct user_namespace, work); do { + struct ucounts *ucounts = ns->ucounts; parent = ns->parent; + retire_userns_sysctls(ns); #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif ns_free_inum(&ns->ns); kmem_cache_free(user_ns_cachep, ns); + dec_user_namespaces(ucounts); ns = parent; } while (atomic_dec_and_test(&parent->count)); } -EXPORT_SYMBOL(free_user_ns); + +void __put_user_ns(struct user_namespace *ns) +{ + schedule_work(&ns->work); +} +EXPORT_SYMBOL(__put_user_ns); static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) { @@ -1004,12 +1050,37 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) return commit_creds(cred); } +struct ns_common *ns_get_owner(struct ns_common *ns) +{ + struct user_namespace *my_user_ns = current_user_ns(); + struct user_namespace *owner, *p; + + /* See if the owner is in the current user namespace */ + owner = p = ns->ops->owner(ns); + for (;;) { + if (!p) + return ERR_PTR(-EPERM); + if (p == my_user_ns) + break; + p = p->parent; + } + + return &get_user_ns(owner)->ns; +} + +static struct user_namespace *userns_owner(struct ns_common *ns) +{ + return to_user_ns(ns)->parent; +} + const struct proc_ns_operations userns_operations = { .name = "user", .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, + .owner = userns_owner, + .get_parent = ns_get_owner, }; static __init int user_namespaces_init(void) diff --git a/kernel/utsname.c b/kernel/utsname.c index 831ea7108232..6976cd47dcf6 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -17,6 +17,16 @@ #include <linux/user_namespace.h> #include <linux/proc_ns.h> +static struct ucounts *inc_uts_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_UTS_NAMESPACES); +} + +static void dec_uts_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES); +} + static struct uts_namespace *create_uts_ns(void) { struct uts_namespace *uts_ns; @@ -36,18 +46,24 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, struct uts_namespace *old_ns) { struct uts_namespace *ns; + struct ucounts *ucounts; int err; + err = -ENOSPC; + ucounts = inc_uts_namespaces(user_ns); + if (!ucounts) + goto fail; + + err = -ENOMEM; ns = create_uts_ns(); if (!ns) - return ERR_PTR(-ENOMEM); + goto fail_dec; err = ns_alloc_inum(&ns->ns); - if (err) { - kfree(ns); - return ERR_PTR(err); - } + if (err) + goto fail_free; + ns->ucounts = ucounts; ns->ns.ops = &utsns_operations; down_read(&uts_sem); @@ -55,6 +71,13 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, ns->user_ns = get_user_ns(user_ns); up_read(&uts_sem); return ns; + +fail_free: + kfree(ns); +fail_dec: + dec_uts_namespaces(ucounts); +fail: + return ERR_PTR(err); } /* @@ -85,6 +108,7 @@ void free_uts_ns(struct kref *kref) struct uts_namespace *ns; ns = container_of(kref, struct uts_namespace, kref); + dec_uts_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); kfree(ns); @@ -130,10 +154,16 @@ static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new) return 0; } +static struct user_namespace *utsns_owner(struct ns_common *ns) +{ + return to_uts_ns(ns)->user_ns; +} + const struct proc_ns_operations utsns_operations = { .name = "uts", .type = CLONE_NEWUTS, .get = utsns_get, .put = utsns_put, .install = utsns_install, + .owner = utsns_owner, }; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ef071ca73fc3..1d9fb6543a66 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -290,6 +290,8 @@ module_param_named(disable_numa, wq_disable_numa, bool, 0444); static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT); module_param_named(power_efficient, wq_power_efficient, bool, 0444); +static bool wq_online; /* can kworkers be created yet? */ + static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ @@ -2583,6 +2585,9 @@ void flush_workqueue(struct workqueue_struct *wq) }; int next_color; + if (WARN_ON(!wq_online)) + return; + lock_map_acquire(&wq->lockdep_map); lock_map_release(&wq->lockdep_map); @@ -2843,6 +2848,9 @@ bool flush_work(struct work_struct *work) { struct wq_barrier barr; + if (WARN_ON(!wq_online)) + return false; + lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); @@ -2913,7 +2921,13 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) mark_work_canceling(work); local_irq_restore(flags); - flush_work(work); + /* + * This allows canceling during early boot. We know that @work + * isn't executing. + */ + if (wq_online) + flush_work(work); + clear_work_data(work); /* @@ -2974,6 +2988,31 @@ bool flush_delayed_work(struct delayed_work *dwork) } EXPORT_SYMBOL(flush_delayed_work); +static bool __cancel_work(struct work_struct *work, bool is_dwork) +{ + unsigned long flags; + int ret; + + do { + ret = try_to_grab_pending(work, is_dwork, &flags); + } while (unlikely(ret == -EAGAIN)); + + if (unlikely(ret < 0)) + return false; + + set_work_pool_and_clear_pending(work, get_work_pool_id(work)); + local_irq_restore(flags); + return ret; +} + +/* + * See cancel_delayed_work() + */ +bool cancel_work(struct work_struct *work) +{ + return __cancel_work(work, false); +} + /** * cancel_delayed_work - cancel a delayed work * @dwork: delayed_work to cancel @@ -2992,20 +3031,7 @@ EXPORT_SYMBOL(flush_delayed_work); */ bool cancel_delayed_work(struct delayed_work *dwork) { - unsigned long flags; - int ret; - - do { - ret = try_to_grab_pending(&dwork->work, true, &flags); - } while (unlikely(ret == -EAGAIN)); - - if (unlikely(ret < 0)) - return false; - - set_work_pool_and_clear_pending(&dwork->work, - get_work_pool_id(&dwork->work)); - local_irq_restore(flags); - return ret; + return __cancel_work(&dwork->work, true); } EXPORT_SYMBOL(cancel_delayed_work); @@ -3352,7 +3378,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) goto fail; /* create and start the initial worker */ - if (!create_worker(pool)) + if (wq_online && !create_worker(pool)) goto fail; /* install */ @@ -3417,6 +3443,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) { struct workqueue_struct *wq = pwq->wq; bool freezable = wq->flags & WQ_FREEZABLE; + unsigned long flags; /* for @wq->saved_max_active */ lockdep_assert_held(&wq->mutex); @@ -3425,7 +3452,8 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) if (!freezable && pwq->max_active == wq->saved_max_active) return; - spin_lock_irq(&pwq->pool->lock); + /* this function can be called during early boot w/ irq disabled */ + spin_lock_irqsave(&pwq->pool->lock, flags); /* * During [un]freezing, the caller is responsible for ensuring that @@ -3448,7 +3476,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) pwq->max_active = 0; } - spin_unlock_irq(&pwq->pool->lock); + spin_unlock_irqrestore(&pwq->pool->lock, flags); } /* initialize newly alloced @pwq which is associated with @wq and @pool */ @@ -4021,6 +4049,7 @@ void destroy_workqueue(struct workqueue_struct *wq) for (i = 0; i < WORK_NR_COLORS; i++) { if (WARN_ON(pwq->nr_in_flight[i])) { mutex_unlock(&wq->mutex); + show_workqueue_state(); return; } } @@ -4029,6 +4058,7 @@ void destroy_workqueue(struct workqueue_struct *wq) WARN_ON(pwq->nr_active) || WARN_ON(!list_empty(&pwq->delayed_works))) { mutex_unlock(&wq->mutex); + show_workqueue_state(); return; } } @@ -4249,7 +4279,7 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) * This function is called without any synchronization and @task * could be in any state. Be careful with dereferences. */ - worker = probe_kthread_data(task); + worker = kthread_probe_data(task); /* * Carefully copy the associated workqueue's workfn and name. Keep @@ -5455,7 +5485,17 @@ static void __init wq_numa_init(void) wq_numa_enabled = true; } -static int __init init_workqueues(void) +/** + * workqueue_init_early - early init for workqueue subsystem + * + * This is the first half of two-staged workqueue subsystem initialization + * and invoked as soon as the bare basics - memory allocation, cpumasks and + * idr are up. It sets up all the data structures and system workqueues + * and allows early boot code to create workqueues and queue/cancel work + * items. Actual work item execution starts only after kthreads can be + * created and scheduled right before early initcalls. + */ +int __init workqueue_init_early(void) { int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; int i, cpu; @@ -5467,8 +5507,6 @@ static int __init init_workqueues(void) pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); - wq_numa_init(); - /* initialize CPU pools */ for_each_possible_cpu(cpu) { struct worker_pool *pool; @@ -5488,16 +5526,6 @@ static int __init init_workqueues(void) } } - /* create the initial worker */ - for_each_online_cpu(cpu) { - struct worker_pool *pool; - - for_each_cpu_worker_pool(pool, cpu) { - pool->flags &= ~POOL_DISASSOCIATED; - BUG_ON(!create_worker(pool)); - } - } - /* create default unbound and ordered wq attrs */ for (i = 0; i < NR_STD_WORKER_POOLS; i++) { struct workqueue_attrs *attrs; @@ -5534,8 +5562,59 @@ static int __init init_workqueues(void) !system_power_efficient_wq || !system_freezable_power_efficient_wq); + return 0; +} + +/** + * workqueue_init - bring workqueue subsystem fully online + * + * This is the latter half of two-staged workqueue subsystem initialization + * and invoked as soon as kthreads can be created and scheduled. + * Workqueues have been created and work items queued on them, but there + * are no kworkers executing the work items yet. Populate the worker pools + * with the initial workers and enable future kworker creations. + */ +int __init workqueue_init(void) +{ + struct workqueue_struct *wq; + struct worker_pool *pool; + int cpu, bkt; + + /* + * It'd be simpler to initialize NUMA in workqueue_init_early() but + * CPU to node mapping may not be available that early on some + * archs such as power and arm64. As per-cpu pools created + * previously could be missing node hint and unbound pools NUMA + * affinity, fix them up. + */ + wq_numa_init(); + + mutex_lock(&wq_pool_mutex); + + for_each_possible_cpu(cpu) { + for_each_cpu_worker_pool(pool, cpu) { + pool->node = cpu_to_node(cpu); + } + } + + list_for_each_entry(wq, &workqueues, list) + wq_update_unbound_numa(wq, smp_processor_id(), true); + + mutex_unlock(&wq_pool_mutex); + + /* create the initial workers */ + for_each_online_cpu(cpu) { + for_each_cpu_worker_pool(pool, cpu) { + pool->flags &= ~POOL_DISASSOCIATED; + BUG_ON(!create_worker(pool)); + } + } + + hash_for_each(unbound_pool_hash, bkt, pool, hash_node) + BUG_ON(!create_worker(pool)); + + wq_online = true; wq_watchdog_init(); return 0; } -early_initcall(init_workqueues); |