diff options
Diffstat (limited to 'kernel')
34 files changed, 590 insertions, 173 deletions
diff --git a/kernel/acct.c b/kernel/acct.c index d15c0ee4d955..addf7732fb56 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -102,7 +102,7 @@ static int check_free_space(struct bsd_acct_struct *acct) { struct kstatfs sbuf; - if (time_is_before_jiffies(acct->needcheck)) + if (time_is_after_jiffies(acct->needcheck)) goto out; /* May block */ diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 7c25426d3cf5..ab94d304a634 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -53,9 +53,10 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); + u32 elem_size, index_mask, max_entries; + bool unpriv = !capable(CAP_SYS_ADMIN); struct bpf_array *array; - u64 array_size; - u32 elem_size; + u64 array_size, mask64; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -72,11 +73,32 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) elem_size = round_up(attr->value_size, 8); + max_entries = attr->max_entries; + + /* On 32 bit archs roundup_pow_of_two() with max_entries that has + * upper most bit set in u32 space is undefined behavior due to + * resulting 1U << 32, so do it manually here in u64 space. + */ + mask64 = fls_long(max_entries - 1); + mask64 = 1ULL << mask64; + mask64 -= 1; + + index_mask = mask64; + if (unpriv) { + /* round up array size to nearest power of 2, + * since cpu will speculate within index_mask limits + */ + max_entries = index_mask + 1; + /* Check for overflows. */ + if (max_entries < attr->max_entries) + return ERR_PTR(-E2BIG); + } + array_size = sizeof(*array); if (percpu) - array_size += (u64) attr->max_entries * sizeof(void *); + array_size += (u64) max_entries * sizeof(void *); else - array_size += (u64) attr->max_entries * elem_size; + array_size += (u64) max_entries * elem_size; /* make sure there is no u32 overflow later in round_up() */ if (array_size >= U32_MAX - PAGE_SIZE) @@ -86,6 +108,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array = bpf_map_area_alloc(array_size, numa_node); if (!array) return ERR_PTR(-ENOMEM); + array->index_mask = index_mask; + array->map.unpriv_array = unpriv; /* copy mandatory map attributes */ array->map.map_type = attr->map_type; @@ -121,12 +145,13 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) if (unlikely(index >= array->map.max_entries)) return NULL; - return array->value + array->elem_size * index; + return array->value + array->elem_size * (index & array->index_mask); } /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { + struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; u32 elem_size = round_up(map->value_size, 8); const int ret = BPF_REG_0; @@ -135,7 +160,12 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); + if (map->unpriv_array) { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); + *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); + } else { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); + } if (is_power_of_2(elem_size)) { *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); @@ -157,7 +187,7 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) if (unlikely(index >= array->map.max_entries)) return NULL; - return this_cpu_ptr(array->pptrs[index]); + return this_cpu_ptr(array->pptrs[index & array->index_mask]); } int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) @@ -177,7 +207,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) */ size = round_up(map->value_size, 8); rcu_read_lock(); - pptr = array->pptrs[index]; + pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); off += size; @@ -225,10 +255,11 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, return -EEXIST; if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) - memcpy(this_cpu_ptr(array->pptrs[index]), + memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), value, map->value_size); else - memcpy(array->value + array->elem_size * index, + memcpy(array->value + + array->elem_size * (index & array->index_mask), value, map->value_size); return 0; } @@ -262,7 +293,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, */ size = round_up(map->value_size, 8); rcu_read_lock(); - pptr = array->pptrs[index]; + pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); off += size; @@ -613,6 +644,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) static u32 array_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { + struct bpf_array *array = container_of(map, struct bpf_array, map); u32 elem_size = round_up(map->value_size, 8); struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; @@ -621,7 +653,12 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); + if (map->unpriv_array) { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); + *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); + } else { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); + } if (is_power_of_2(elem_size)) *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); else diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 86b50aa26ee8..7949e8b8f94e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -767,6 +767,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) } EXPORT_SYMBOL_GPL(__bpf_call_base); +#ifndef CONFIG_BPF_JIT_ALWAYS_ON /** * __bpf_prog_run - run eBPF program on a given context * @ctx: is the data we are operating on @@ -955,7 +956,7 @@ select_insn: DST = tmp; CONT; ALU_MOD_X: - if (unlikely(SRC == 0)) + if (unlikely((u32)SRC == 0)) return 0; tmp = (u32) DST; DST = do_div(tmp, (u32) SRC); @@ -974,7 +975,7 @@ select_insn: DST = div64_u64(DST, SRC); CONT; ALU_DIV_X: - if (unlikely(SRC == 0)) + if (unlikely((u32)SRC == 0)) return 0; tmp = (u32) DST; do_div(tmp, (u32) SRC); @@ -1317,6 +1318,14 @@ EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; +#else +static unsigned int __bpf_prog_ret0(const void *ctx, + const struct bpf_insn *insn) +{ + return 0; +} +#endif + bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { @@ -1364,9 +1373,13 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { +#ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; +#else + fp->bpf_func = __bpf_prog_ret0; +#endif /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during @@ -1376,6 +1389,12 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) */ if (!bpf_prog_is_dev_bound(fp->aux)) { fp = bpf_int_jit_compile(fp); +#ifdef CONFIG_BPF_JIT_ALWAYS_ON + if (!fp->jited) { + *err = -ENOTSUPP; + return fp; + } +#endif } else { *err = bpf_prog_offload_compile(fp); if (*err) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 01aaef1a77c5..5bb5e49ef4c3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -368,7 +368,45 @@ out: putname(pname); return ret; } -EXPORT_SYMBOL_GPL(bpf_obj_get_user); + +static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) +{ + struct bpf_prog *prog; + int ret = inode_permission(inode, MAY_READ | MAY_WRITE); + if (ret) + return ERR_PTR(ret); + + if (inode->i_op == &bpf_map_iops) + return ERR_PTR(-EINVAL); + if (inode->i_op != &bpf_prog_iops) + return ERR_PTR(-EACCES); + + prog = inode->i_private; + + ret = security_bpf_prog(prog); + if (ret < 0) + return ERR_PTR(ret); + + if (!bpf_prog_get_ok(prog, &type, false)) + return ERR_PTR(-EINVAL); + + return bpf_prog_inc(prog); +} + +struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) +{ + struct bpf_prog *prog; + struct path path; + int ret = kern_path(name, LOOKUP_FOLLOW, &path); + if (ret) + return ERR_PTR(ret); + prog = __get_prog_inode(d_backing_inode(path.dentry), type); + if (!IS_ERR(prog)) + touch_atime(&path); + path_put(&path); + return prog; +} +EXPORT_SYMBOL(bpf_prog_get_type_path); static void bpf_evict_inode(struct inode *inode) { diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 5ee2e41893d9..1712d319c2d8 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -591,8 +591,15 @@ static void sock_map_free(struct bpf_map *map) write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); - smap_list_remove(psock, &stab->sock_map[i]); - smap_release_sock(psock, sock); + /* This check handles a racing sock event that can get the + * sk_callback_lock before this case but after xchg happens + * causing the refcnt to hit zero and sock user data (psock) + * to be null and queued for garbage collection. + */ + if (likely(psock)) { + smap_list_remove(psock, &stab->sock_map[i]); + smap_release_sock(psock, sock); + } write_unlock_bh(&sock->sk_callback_lock); } rcu_read_unlock(); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2c4cfeaa8d5e..5cb783fc8224 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1057,7 +1057,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); -static bool bpf_prog_get_ok(struct bpf_prog *prog, +bool bpf_prog_get_ok(struct bpf_prog *prog, enum bpf_prog_type *attach_type, bool attach_drv) { /* not an attachment, just a refcount inc, always allow */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 04b24876cd23..13551e623501 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -978,6 +978,13 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno); } +static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) +{ + const struct bpf_reg_state *reg = cur_regs(env) + regno; + + return reg->type == PTR_TO_CTX; +} + static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int off, int size, bool strict) @@ -1258,6 +1265,12 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins return -EACCES; } + if (is_ctx_reg(env, insn->dst_reg)) { + verbose(env, "BPF_XADD stores into R%d context is not allowed\n", + insn->dst_reg); + return -EACCES; + } + /* check whether atomic_add can read the memory */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1); @@ -1729,6 +1742,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); if (err) return err; + if (func_id == BPF_FUNC_tail_call) { + if (meta.map_ptr == NULL) { + verbose(env, "verifier bug\n"); + return -EINVAL; + } + env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr; + } err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); if (err) return err; @@ -1875,17 +1895,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg = ®s[dst]; - if (WARN_ON_ONCE(known && (smin_val != smax_val))) { - print_verifier_state(env, env->cur_state); - verbose(env, - "verifier internal error: known but bad sbounds\n"); - return -EINVAL; - } - if (WARN_ON_ONCE(known && (umin_val != umax_val))) { - print_verifier_state(env, env->cur_state); - verbose(env, - "verifier internal error: known but bad ubounds\n"); - return -EINVAL; + if ((known && (smin_val != smax_val || umin_val != umax_val)) || + smin_val > smax_val || umin_val > umax_val) { + /* Taint dst register if offset had invalid bounds derived from + * e.g. dead branches. + */ + __mark_reg_unknown(dst_reg); + return 0; } if (BPF_CLASS(insn->code) != BPF_ALU64) { @@ -2077,6 +2093,15 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, src_known = tnum_is_const(src_reg.var_off); dst_known = tnum_is_const(dst_reg->var_off); + if ((src_known && (smin_val != smax_val || umin_val != umax_val)) || + smin_val > smax_val || umin_val > umax_val) { + /* Taint dst register if offset had invalid bounds derived from + * e.g. dead branches. + */ + __mark_reg_unknown(dst_reg); + return 0; + } + if (!src_known && opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) { __mark_reg_unknown(dst_reg); @@ -2486,6 +2511,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) { + verbose(env, "BPF_ARSH not supported for 32 bit ALU\n"); + return -EINVAL; + } + if ((opcode == BPF_LSH || opcode == BPF_RSH || opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; @@ -3981,6 +4011,12 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; + if (is_ctx_reg(env, insn->dst_reg)) { + verbose(env, "BPF_ST stores into R%d context is not allowed\n", + insn->dst_reg); + return -EACCES; + } + /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, @@ -4433,6 +4469,24 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) int i, cnt, delta = 0; for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) || + insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { + /* due to JIT bugs clear upper 32-bits of src register + * before div/mod operation + */ + insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg); + insn_buf[1] = *insn; + cnt = 2; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + if (insn->code != (BPF_JMP | BPF_CALL)) continue; @@ -4456,6 +4510,35 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) */ insn->imm = 0; insn->code = BPF_JMP | BPF_TAIL_CALL; + + /* instead of changing every JIT dealing with tail_call + * emit two extra insns: + * if (index >= max_entries) goto out; + * index &= array->index_mask; + * to avoid out-of-bounds cpu speculation + */ + map_ptr = env->insn_aux_data[i + delta].map_ptr; + if (map_ptr == BPF_MAP_PTR_POISON) { + verbose(env, "tail_call abusing map_ptr\n"); + return -EINVAL; + } + if (!map_ptr->unpriv_array) + continue; + insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, + map_ptr->max_entries, 2); + insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, + container_of(map_ptr, + struct bpf_array, + map)->index_mask); + insn_buf[2] = *insn; + cnt = 3; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; continue; } diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 024085daab1a..a2c05d2476ac 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -123,7 +123,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) */ do { css_task_iter_start(&from->self, 0, &it); - task = css_task_iter_next(&it); + + do { + task = css_task_iter_next(&it); + } while (task && (task->flags & PF_EXITING)); + if (task) get_task_struct(task); css_task_iter_end(&it); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 0b1ffe147f24..7e4c44538119 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1397,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, cft->name); else - strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); + strlcpy(buf, cft->name, CGROUP_FILE_NAME_MAX); return buf; } @@ -1864,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) root->flags = opts->flags; if (opts->release_agent) - strcpy(root->release_agent_path, opts->release_agent); + strlcpy(root->release_agent_path, opts->release_agent, PATH_MAX); if (opts->name) - strcpy(root->name, opts->name); + strlcpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); if (opts->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } @@ -4125,26 +4125,24 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) static void css_task_iter_advance(struct css_task_iter *it) { - struct list_head *l = it->task_pos; + struct list_head *next; lockdep_assert_held(&css_set_lock); - WARN_ON_ONCE(!l); - repeat: /* * Advance iterator to find next entry. cset->tasks is consumed * first and then ->mg_tasks. After ->mg_tasks, we move onto the * next cset. */ - l = l->next; + next = it->task_pos->next; - if (l == it->tasks_head) - l = it->mg_tasks_head->next; + if (next == it->tasks_head) + next = it->mg_tasks_head->next; - if (l == it->mg_tasks_head) + if (next == it->mg_tasks_head) css_task_iter_advance_css_set(it); else - it->task_pos = l; + it->task_pos = next; /* if PROCS, skip over tasks which aren't group leaders */ if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && @@ -4449,6 +4447,7 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cgroup.threads", + .flags = CFTYPE_NS_DELEGATABLE, .release = cgroup_procs_release, .seq_start = cgroup_threads_start, .seq_next = cgroup_procs_next, diff --git a/kernel/crash_core.c b/kernel/crash_core.c index b3663896278e..4f63597c824d 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -410,7 +410,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(contig_page_data); #endif #ifdef CONFIG_SPARSEMEM - VMCOREINFO_SYMBOL(mem_section); + VMCOREINFO_SYMBOL_ARRAY(mem_section); VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); VMCOREINFO_STRUCT_SIZE(mem_section); VMCOREINFO_OFFSET(mem_section, section_mem_map); diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 4a1c33416b6a..e2764d767f18 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -51,16 +51,16 @@ void __delayacct_tsk_init(struct task_struct *tsk) * Finish delay accounting for a statistic using its timestamps (@start), * accumalator (@total) and @count */ -static void delayacct_end(u64 *start, u64 *total, u32 *count) +static void delayacct_end(spinlock_t *lock, u64 *start, u64 *total, u32 *count) { s64 ns = ktime_get_ns() - *start; unsigned long flags; if (ns > 0) { - spin_lock_irqsave(¤t->delays->lock, flags); + spin_lock_irqsave(lock, flags); *total += ns; (*count)++; - spin_unlock_irqrestore(¤t->delays->lock, flags); + spin_unlock_irqrestore(lock, flags); } } @@ -69,17 +69,25 @@ void __delayacct_blkio_start(void) current->delays->blkio_start = ktime_get_ns(); } -void __delayacct_blkio_end(void) +/* + * We cannot rely on the `current` macro, as we haven't yet switched back to + * the process being woken. + */ +void __delayacct_blkio_end(struct task_struct *p) { - if (current->delays->flags & DELAYACCT_PF_SWAPIN) - /* Swapin block I/O */ - delayacct_end(¤t->delays->blkio_start, - ¤t->delays->swapin_delay, - ¤t->delays->swapin_count); - else /* Other block I/O */ - delayacct_end(¤t->delays->blkio_start, - ¤t->delays->blkio_delay, - ¤t->delays->blkio_count); + struct task_delay_info *delays = p->delays; + u64 *total; + u32 *count; + + if (p->delays->flags & DELAYACCT_PF_SWAPIN) { + total = &delays->swapin_delay; + count = &delays->swapin_count; + } else { + total = &delays->blkio_delay; + count = &delays->blkio_count; + } + + delayacct_end(&delays->lock, &delays->blkio_start, total, count); } int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) @@ -153,8 +161,10 @@ void __delayacct_freepages_start(void) void __delayacct_freepages_end(void) { - delayacct_end(¤t->delays->freepages_start, - ¤t->delays->freepages_delay, - ¤t->delays->freepages_count); + delayacct_end( + ¤t->delays->lock, + ¤t->delays->freepages_start, + ¤t->delays->freepages_delay, + ¤t->delays->freepages_count); } diff --git a/kernel/events/core.c b/kernel/events/core.c index 4df5b695bf0d..5d8f4031f8d5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1231,6 +1231,10 @@ static void put_ctx(struct perf_event_context *ctx) * perf_event_context::lock * perf_event::mmap_mutex * mmap_sem + * + * cpu_hotplug_lock + * pmus_lock + * cpuctx->mutex / perf_event_context::mutex */ static struct perf_event_context * perf_event_ctx_lock_nested(struct perf_event *event, int nesting) @@ -4196,6 +4200,7 @@ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct perf_event *child, *tmp; + LIST_HEAD(free_list); /* * If we got here through err_file: fput(event_file); we will not have @@ -4268,8 +4273,7 @@ again: struct perf_event, child_list); if (tmp == child) { perf_remove_from_context(child, DETACH_GROUP); - list_del(&child->child_list); - free_event(child); + list_move(&child->child_list, &free_list); /* * This matches the refcount bump in inherit_event(); * this can't be the last reference. @@ -4284,6 +4288,11 @@ again: } mutex_unlock(&event->child_mutex); + list_for_each_entry_safe(child, tmp, &free_list, child_list) { + list_del(&child->child_list); + free_event(child); + } + no_ctx: put_event(event); /* Must be the 'last' reference */ return 0; @@ -8516,6 +8525,29 @@ fail_clear_files: return ret; } +static int +perf_tracepoint_set_filter(struct perf_event *event, char *filter_str) +{ + struct perf_event_context *ctx = event->ctx; + int ret; + + /* + * Beware, here be dragons!! + * + * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint + * stuff does not actually need it. So temporarily drop ctx->mutex. As per + * perf_event_ctx_lock() we already have a reference on ctx. + * + * This can result in event getting moved to a different ctx, but that + * does not affect the tracepoint state. + */ + mutex_unlock(&ctx->mutex); + ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); + mutex_lock(&ctx->mutex); + + return ret; +} + static int perf_event_set_filter(struct perf_event *event, void __user *arg) { char *filter_str; @@ -8532,8 +8564,7 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg) if (IS_ENABLED(CONFIG_EVENT_TRACING) && event->attr.type == PERF_TYPE_TRACEPOINT) - ret = ftrace_profile_set_filter(event, event->attr.config, - filter_str); + ret = perf_tracepoint_set_filter(event, filter_str); else if (has_addr_filter(event)) ret = perf_event_set_addr_filter(event, filter_str); @@ -9168,7 +9199,13 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) if (!try_module_get(pmu->module)) return -ENODEV; - if (event->group_leader != event) { + /* + * A number of pmu->event_init() methods iterate the sibling_list to, + * for example, validate if the group fits on the PMU. Therefore, + * if this is a sibling event, acquire the ctx->mutex to protect + * the sibling_list. + */ + if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) { /* * This ctx->mutex can nest when we're called through * inheritance. See the perf_event_ctx_lock_nested() comment. diff --git a/kernel/exit.c b/kernel/exit.c index df0c91d5606c..995453d9fb55 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1763,3 +1763,4 @@ __weak void abort(void) /* if that doesn't kill us, halt */ panic("Oops failed to kill thread"); } +EXPORT_SYMBOL(abort); diff --git a/kernel/futex.c b/kernel/futex.c index 57d0b3657e16..7f719d110908 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1878,6 +1878,9 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, struct futex_q *this, *next; DEFINE_WAKE_Q(wake_q); + if (nr_wake < 0 || nr_requeue < 0) + return -EINVAL; + /* * When PI not supported: return -ENOSYS if requeue_pi is true, * consequently the compiler knows requeue_pi is always false past @@ -2294,34 +2297,33 @@ static void unqueue_me_pi(struct futex_q *q) spin_unlock(q->lock_ptr); } -/* - * Fixup the pi_state owner with the new owner. - * - * Must be called with hash bucket lock held and mm->sem held for non - * private futexes. - */ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *newowner) + struct task_struct *argowner) { - u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; u32 uval, uninitialized_var(curval), newval; - struct task_struct *oldowner; + struct task_struct *oldowner, *newowner; + u32 newtid; int ret; + lockdep_assert_held(q->lock_ptr); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); oldowner = pi_state->owner; - /* Owner died? */ - if (!pi_state->owner) - newtid |= FUTEX_OWNER_DIED; /* - * We are here either because we stole the rtmutex from the - * previous highest priority waiter or we are the highest priority - * waiter but have failed to get the rtmutex the first time. + * We are here because either: + * + * - we stole the lock and pi_state->owner needs updating to reflect + * that (@argowner == current), + * + * or: * - * We have to replace the newowner TID in the user space variable. + * - someone stole our lock and we need to fix things to point to the + * new owner (@argowner == NULL). + * + * Either way, we have to replace the TID in the user space variable. * This must be atomic as we have to preserve the owner died bit here. * * Note: We write the user space value _before_ changing the pi_state @@ -2334,6 +2336,45 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, * in the PID check in lookup_pi_state. */ retry: + if (!argowner) { + if (oldowner != current) { + /* + * We raced against a concurrent self; things are + * already fixed up. Nothing to do. + */ + ret = 0; + goto out_unlock; + } + + if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) { + /* We got the lock after all, nothing to fix. */ + ret = 0; + goto out_unlock; + } + + /* + * Since we just failed the trylock; there must be an owner. + */ + newowner = rt_mutex_owner(&pi_state->pi_mutex); + BUG_ON(!newowner); + } else { + WARN_ON_ONCE(argowner != current); + if (oldowner == current) { + /* + * We raced against a concurrent self; things are + * already fixed up. Nothing to do. + */ + ret = 0; + goto out_unlock; + } + newowner = argowner; + } + + newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; + /* Owner died? */ + if (!pi_state->owner) + newtid |= FUTEX_OWNER_DIED; + if (get_futex_value_locked(&uval, uaddr)) goto handle_fault; @@ -2434,9 +2475,9 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) * Got the lock. We might not be the anticipated owner if we * did a lock-steal - fix up the PI-state in that case: * - * We can safely read pi_state->owner without holding wait_lock - * because we now own the rt_mutex, only the owner will attempt - * to change it. + * Speculative pi_state->owner read (we don't hold wait_lock); + * since we own the lock pi_state->owner == current is the + * stable state, anything else needs more attention. */ if (q->pi_state->owner != current) ret = fixup_pi_state_owner(uaddr, q, current); @@ -2444,6 +2485,19 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) } /* + * If we didn't get the lock; check if anybody stole it from us. In + * that case, we need to fix up the uval to point to them instead of + * us, otherwise bad things happen. [10] + * + * Another speculative read; pi_state->owner == current is unstable + * but needs our attention. + */ + if (q->pi_state->owner == current) { + ret = fixup_pi_state_owner(uaddr, q, NULL); + goto out; + } + + /* * Paranoia check. If we did not take the lock, then we should not be * the owner of the rt_mutex. */ diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 0ba0dd8863a7..5187dfe809ac 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -321,15 +321,23 @@ void irq_matrix_remove_reserved(struct irq_matrix *m) int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, bool reserved, unsigned int *mapped_cpu) { - unsigned int cpu; + unsigned int cpu, best_cpu, maxavl = 0; + struct cpumap *cm; + unsigned int bit; + best_cpu = UINT_MAX; for_each_cpu(cpu, msk) { - struct cpumap *cm = per_cpu_ptr(m->maps, cpu); - unsigned int bit; + cm = per_cpu_ptr(m->maps, cpu); - if (!cm->online) + if (!cm->online || cm->available <= maxavl) continue; + best_cpu = cpu; + maxavl = cm->available; + } + + if (maxavl) { + cm = per_cpu_ptr(m->maps, best_cpu); bit = matrix_alloc_area(m, cm, 1, false); if (bit < m->alloc_end) { cm->allocated++; @@ -338,8 +346,8 @@ int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, m->global_available--; if (reserved) m->global_reserved--; - *mapped_cpu = cpu; - trace_irq_matrix_alloc(bit, cpu, m, cm); + *mapped_cpu = best_cpu; + trace_irq_matrix_alloc(bit, best_cpu, m, cm); return bit; } } diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 8594d24e4adc..b4517095db6a 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -79,7 +79,7 @@ int static_key_count(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_count); -static void static_key_slow_inc_cpuslocked(struct static_key *key) +void static_key_slow_inc_cpuslocked(struct static_key *key) { int v, v1; @@ -180,7 +180,7 @@ void static_key_disable(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_disable); -static void static_key_slow_dec_cpuslocked(struct static_key *key, +static void __static_key_slow_dec_cpuslocked(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { @@ -211,7 +211,7 @@ static void __static_key_slow_dec(struct static_key *key, struct delayed_work *work) { cpus_read_lock(); - static_key_slow_dec_cpuslocked(key, rate_limit, work); + __static_key_slow_dec_cpuslocked(key, rate_limit, work); cpus_read_unlock(); } @@ -229,6 +229,12 @@ void static_key_slow_dec(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_slow_dec); +void static_key_slow_dec_cpuslocked(struct static_key *key) +{ + STATIC_KEY_CHECK_USE(key); + __static_key_slow_dec_cpuslocked(key, 0, NULL); +} + void static_key_slow_dec_deferred(struct static_key_deferred *key) { STATIC_KEY_CHECK_USE(key); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 5fa1324a4f29..521659044719 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -49,6 +49,7 @@ #include <linux/gfp.h> #include <linux/random.h> #include <linux/jhash.h> +#include <linux/nmi.h> #include <asm/sections.h> @@ -4490,6 +4491,7 @@ retry: if (!unlock) if (read_trylock(&tasklist_lock)) unlock = 1; + touch_nmi_watchdog(); } while_each_thread(g, p); pr_warn("\n"); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 6f3dba6e4e9e..65cc0cb984e6 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1290,6 +1290,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, return ret; } +static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock) +{ + int ret = try_to_take_rt_mutex(lock, current, NULL); + + /* + * try_to_take_rt_mutex() sets the lock waiters bit + * unconditionally. Clean this up. + */ + fixup_rt_mutex_waiters(lock); + + return ret; +} + /* * Slow path try-lock function: */ @@ -1312,13 +1325,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) */ raw_spin_lock_irqsave(&lock->wait_lock, flags); - ret = try_to_take_rt_mutex(lock, current, NULL); - - /* - * try_to_take_rt_mutex() sets the lock waiters bit - * unconditionally. Clean this up. - */ - fixup_rt_mutex_waiters(lock); + ret = __rt_mutex_slowtrylock(lock); raw_spin_unlock_irqrestore(&lock->wait_lock, flags); @@ -1505,6 +1512,11 @@ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock) return rt_mutex_slowtrylock(lock); } +int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock) +{ + return __rt_mutex_slowtrylock(lock); +} + /** * rt_mutex_timed_lock - lock a rt_mutex interruptible * the timeout structure is provided diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 124e98ca0b17..68686b3ec3c1 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -148,6 +148,7 @@ extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter); extern int rt_mutex_futex_trylock(struct rt_mutex *l); +extern int __rt_mutex_futex_trylock(struct rt_mutex *l); extern void rt_mutex_futex_unlock(struct rt_mutex *lock); extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, diff --git a/kernel/pid.c b/kernel/pid.c index 161af2eda943..5d30c87e3c42 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -205,10 +205,8 @@ struct pid *alloc_pid(struct pid_namespace *ns) } if (unlikely(is_child_reaper(pid))) { - if (pid_ns_prepare_proc(ns)) { - disable_pid_allocation(ns); + if (pid_ns_prepare_proc(ns)) goto out_free; - } } get_pid_ns(ns); @@ -238,6 +236,10 @@ out_free: while (++i <= ns->level) idr_remove(&ns->idr, (pid->numbers + i)->nr); + /* On failure to allocate the first pid, reset the state */ + if (ns->pid_allocated == PIDNS_ADDING) + idr_set_cursor(&ns->idr, 0); + spin_unlock_irq(&pidmap_lock); kmem_cache_free(ns->pid_cachep, pid); diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 2ddaec40956f..0926aef10dad 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -34,11 +34,6 @@ void complete(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); - /* - * Perform commit of crossrelease here. - */ - complete_release_commit(x); - if (x->done != UINT_MAX) x->done++; __wake_up_locked(&x->wait, TASK_NORMAL, 1); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 644fa2e3d993..a7bf32aabfda 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2056,7 +2056,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->state = TASK_WAKING; if (p->in_iowait) { - delayacct_blkio_end(); + delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_iowait); } @@ -2069,7 +2069,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) #else /* CONFIG_SMP */ if (p->in_iowait) { - delayacct_blkio_end(); + delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_iowait); } @@ -2122,7 +2122,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) if (!task_on_rq_queued(p)) { if (p->in_iowait) { - delayacct_blkio_end(); + delayacct_blkio_end(p); atomic_dec(&rq->nr_iowait); } ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2fe3aa853e4d..26a71ebcd3c2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4365,12 +4365,12 @@ static inline bool cfs_bandwidth_used(void) void cfs_bandwidth_usage_inc(void) { - static_key_slow_inc(&__cfs_bandwidth_used); + static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used); } void cfs_bandwidth_usage_dec(void) { - static_key_slow_dec(&__cfs_bandwidth_used); + static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used); } #else /* HAVE_JUMP_LABEL */ static bool cfs_bandwidth_used(void) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index dd7908743dab..9bcbacba82a8 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -89,7 +89,9 @@ static int membarrier_private_expedited(void) rcu_read_unlock(); } if (!fallback) { + preempt_disable(); smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + preempt_enable(); free_cpumask_var(tmpmask); } cpus_read_unlock(); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d32520840fde..aa9d2a2b1210 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -655,7 +655,9 @@ static void hrtimer_reprogram(struct hrtimer *timer, static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { base->expires_next = KTIME_MAX; + base->hang_detected = 0; base->hres_active = 0; + base->next_timer = NULL; } /* @@ -1589,6 +1591,7 @@ int hrtimers_prepare_cpu(unsigned int cpu) timerqueue_init_head(&cpu_base->clock_base[i].active); } + cpu_base->active_bases = 0; cpu_base->cpu = cpu; hrtimer_init_hres(cpu_base); return 0; diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 89a9e1b4264a..0bcf00e3ce48 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1696,7 +1696,7 @@ void run_local_timers(void) hrtimer_run_queues(); /* Raise the softirq only if required. */ if (time_before(jiffies, base->clk)) { - if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) + if (!IS_ENABLED(CONFIG_NO_HZ_COMMON)) return; /* CPU is awake, so check the deferrable base. */ base++; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 904c952ac383..f54dc62b599c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -355,7 +355,7 @@ config PROFILE_ANNOTATED_BRANCHES on if you need to profile the system's use of these macros. config PROFILE_ALL_BRANCHES - bool "Profile all if conditionals" + bool "Profile all if conditionals" if !FORTIFY_SOURCE select TRACE_BRANCH_PROFILING help This tracer profiles all branch conditions. Every if () diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ccdf3664e4a9..554b517c61a0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1119,15 +1119,11 @@ static struct ftrace_ops global_ops = { }; /* - * This is used by __kernel_text_address() to return true if the - * address is on a dynamically allocated trampoline that would - * not return true for either core_kernel_text() or - * is_module_text_address(). + * Used by the stack undwinder to know about dynamic ftrace trampolines. */ -bool is_ftrace_trampoline(unsigned long addr) +struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr) { - struct ftrace_ops *op; - bool ret = false; + struct ftrace_ops *op = NULL; /* * Some of the ops may be dynamically allocated, @@ -1144,15 +1140,24 @@ bool is_ftrace_trampoline(unsigned long addr) if (op->trampoline && op->trampoline_size) if (addr >= op->trampoline && addr < op->trampoline + op->trampoline_size) { - ret = true; - goto out; + preempt_enable_notrace(); + return op; } } while_for_each_ftrace_op(op); - - out: preempt_enable_notrace(); - return ret; + return NULL; +} + +/* + * This is used by __kernel_text_address() to return true if the + * address is on a dynamically allocated trampoline that would + * not return true for either core_kernel_text() or + * is_module_text_address(). + */ +bool is_ftrace_trampoline(unsigned long addr) +{ + return ftrace_ops_trampoline(addr) != NULL; } struct ftrace_page { diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9ab18995ff1e..5af2842dea96 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2534,29 +2534,58 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) * The lock and unlock are done within a preempt disable section. * The current_context per_cpu variable can only be modified * by the current task between lock and unlock. But it can - * be modified more than once via an interrupt. There are four - * different contexts that we need to consider. + * be modified more than once via an interrupt. To pass this + * information from the lock to the unlock without having to + * access the 'in_interrupt()' functions again (which do show + * a bit of overhead in something as critical as function tracing, + * we use a bitmask trick. * - * Normal context. - * SoftIRQ context - * IRQ context - * NMI context + * bit 0 = NMI context + * bit 1 = IRQ context + * bit 2 = SoftIRQ context + * bit 3 = normal context. * - * If for some reason the ring buffer starts to recurse, we - * only allow that to happen at most 4 times (one for each - * context). If it happens 5 times, then we consider this a - * recusive loop and do not let it go further. + * This works because this is the order of contexts that can + * preempt other contexts. A SoftIRQ never preempts an IRQ + * context. + * + * When the context is determined, the corresponding bit is + * checked and set (if it was set, then a recursion of that context + * happened). + * + * On unlock, we need to clear this bit. To do so, just subtract + * 1 from the current_context and AND it to itself. + * + * (binary) + * 101 - 1 = 100 + * 101 & 100 = 100 (clearing bit zero) + * + * 1010 - 1 = 1001 + * 1010 & 1001 = 1000 (clearing bit 1) + * + * The least significant bit can be cleared this way, and it + * just so happens that it is the same bit corresponding to + * the current context. */ static __always_inline int trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) { - if (cpu_buffer->current_context >= 4) + unsigned int val = cpu_buffer->current_context; + unsigned long pc = preempt_count(); + int bit; + + if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) + bit = RB_CTX_NORMAL; + else + bit = pc & NMI_MASK ? RB_CTX_NMI : + pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ; + + if (unlikely(val & (1 << bit))) return 1; - cpu_buffer->current_context++; - /* Interrupts must see this update */ - barrier(); + val |= (1 << bit); + cpu_buffer->current_context = val; return 0; } @@ -2564,9 +2593,7 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) static __always_inline void trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) { - /* Don't let the dec leak out */ - barrier(); - cpu_buffer->current_context--; + cpu_buffer->current_context &= cpu_buffer->current_context - 1; } /** diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2a8d8a294345..8e3f20a18a06 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2374,6 +2374,15 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) } EXPORT_SYMBOL_GPL(trace_event_buffer_commit); +/* + * Skip 3: + * + * trace_buffer_unlock_commit_regs() + * trace_event_buffer_commit() + * trace_event_raw_event_xxx() +*/ +# define STACK_SKIP 3 + void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct ring_buffer *buffer, struct ring_buffer_event *event, @@ -2383,16 +2392,12 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, __buffer_unlock_commit(buffer, event); /* - * If regs is not set, then skip the following callers: - * trace_buffer_unlock_commit_regs - * event_trigger_unlock_commit - * trace_event_buffer_commit - * trace_event_raw_event_sched_switch + * If regs is not set, then skip the necessary functions. * Note, we can still get here via blktrace, wakeup tracer * and mmiotrace, but that's ok if they lose a function or - * two. They are that meaningful. + * two. They are not that meaningful. */ - ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs); + ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } @@ -2579,11 +2584,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, trace.skip = skip; /* - * Add two, for this function and the call to save_stack_trace() + * Add one, for this function and the call to save_stack_trace() * If regs is set, then these functions will not be in the way. */ +#ifndef CONFIG_UNWINDER_ORC if (!regs) - trace.skip += 2; + trace.skip++; +#endif /* * Since events can happen in NMIs there's no safe way to @@ -2711,11 +2718,10 @@ void trace_dump_stack(int skip) local_save_flags(flags); - /* - * Skip 3 more, seems to get us at the caller of - * this function. - */ - skip += 3; +#ifndef CONFIG_UNWINDER_ORC + /* Skip 1 to skip this function. */ + skip++; +#endif __ftrace_trace_stack(global_trace.trace_buffer.buffer, flags, skip, preempt_count(), NULL); } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ec0f9aa4e151..1b87157edbff 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2213,6 +2213,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) { struct trace_event_call *call, *p; const char *last_system = NULL; + bool first = false; int last_i; int i; @@ -2220,15 +2221,28 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) list_for_each_entry_safe(call, p, &ftrace_events, list) { /* events are usually grouped together with systems */ if (!last_system || call->class->system != last_system) { + first = true; last_i = 0; last_system = call->class->system; } + /* + * Since calls are grouped by systems, the likelyhood that the + * next call in the iteration belongs to the same system as the + * previous call is high. As an optimization, we skip seaching + * for a map[] that matches the call's system if the last call + * was from the same system. That's what last_i is for. If the + * call has the same system as the previous call, then last_i + * will be the index of the first map[] that has a matching + * system. + */ for (i = last_i; i < len; i++) { if (call->class->system == map[i]->system) { /* Save the first system if need be */ - if (!last_i) + if (first) { last_i = i; + first = false; + } update_event_printk(call, map[i]); } } diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index f2ac9d44f6c4..87411482a46f 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1123,13 +1123,22 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; } #endif /* CONFIG_TRACER_SNAPSHOT */ #ifdef CONFIG_STACKTRACE +#ifdef CONFIG_UNWINDER_ORC +/* Skip 2: + * event_triggers_post_call() + * trace_event_raw_event_xxx() + */ +# define STACK_SKIP 2 +#else /* - * Skip 3: + * Skip 4: * stacktrace_trigger() * event_triggers_post_call() + * trace_event_buffer_commit() * trace_event_raw_event_xxx() */ -#define STACK_SKIP 3 +#define STACK_SKIP 4 +#endif static void stacktrace_trigger(struct event_trigger_data *data, void *rec) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 27f7ad12c4b1..b611cd36e22d 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -154,6 +154,24 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, preempt_enable_notrace(); } +#ifdef CONFIG_UNWINDER_ORC +/* + * Skip 2: + * + * function_stack_trace_call() + * ftrace_call() + */ +#define STACK_SKIP 2 +#else +/* + * Skip 3: + * __trace_stack() + * function_stack_trace_call() + * ftrace_call() + */ +#define STACK_SKIP 3 +#endif + static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) @@ -180,15 +198,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, if (likely(disabled == 1)) { pc = preempt_count(); trace_function(tr, ip, parent_ip, flags, pc); - /* - * skip over 5 funcs: - * __ftrace_trace_stack, - * __trace_stack, - * function_stack_trace_call - * ftrace_list_func - * ftrace_call - */ - __trace_stack(tr, flags, 5, pc); + __trace_stack(tr, flags, STACK_SKIP, pc); } atomic_dec(&data->disabled); @@ -367,14 +377,27 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, tracer_tracing_off(tr); } +#ifdef CONFIG_UNWINDER_ORC /* - * Skip 4: + * Skip 3: + * + * function_trace_probe_call() + * ftrace_ops_assist_func() + * ftrace_call() + */ +#define FTRACE_STACK_SKIP 3 +#else +/* + * Skip 5: + * + * __trace_stack() * ftrace_stacktrace() * function_trace_probe_call() - * ftrace_ops_list_func() + * ftrace_ops_assist_func() * ftrace_call() */ -#define STACK_SKIP 4 +#define FTRACE_STACK_SKIP 5 +#endif static __always_inline void trace_stack(struct trace_array *tr) { @@ -384,7 +407,7 @@ static __always_inline void trace_stack(struct trace_array *tr) local_save_flags(flags); pc = preempt_count(); - __trace_stack(tr, flags, STACK_SKIP, pc); + __trace_stack(tr, flags, FTRACE_STACK_SKIP, pc); } static void diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 43d18cb46308..f699122dab32 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -48,6 +48,7 @@ #include <linux/moduleparam.h> #include <linux/uaccess.h> #include <linux/sched/isolation.h> +#include <linux/nmi.h> #include "workqueue_internal.h" @@ -4463,6 +4464,12 @@ void show_workqueue_state(void) if (pwq->nr_active || !list_empty(&pwq->delayed_works)) show_pwq(pwq); spin_unlock_irqrestore(&pwq->pool->lock, flags); + /* + * We could be printing a lot from atomic context, e.g. + * sysrq-t -> show_workqueue_state(). Avoid triggering + * hard lockup. + */ + touch_nmi_watchdog(); } } @@ -4490,6 +4497,12 @@ void show_workqueue_state(void) pr_cont("\n"); next_pool: spin_unlock_irqrestore(&pool->lock, flags); + /* + * We could be printing a lot from atomic context, e.g. + * sysrq-t -> show_workqueue_state(). Avoid triggering + * hard lockup. + */ + touch_nmi_watchdog(); } rcu_read_unlock_sched(); |