diff options
Diffstat (limited to 'kernel')
120 files changed, 5511 insertions, 2521 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index bf82259cff96..416017301660 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -40,6 +40,7 @@ config PREEMPT depends on !ARCH_NO_PREEMPT select PREEMPTION select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK + select PREEMPT_DYNAMIC if HAVE_PREEMPT_DYNAMIC help This option reduces the latency of the kernel by making all kernel code (that is not executing in a critical section) @@ -80,3 +81,21 @@ config PREEMPT_COUNT config PREEMPTION bool select PREEMPT_COUNT + +config PREEMPT_DYNAMIC + bool + help + This option allows to define the preemption model on the kernel + command line parameter and thus override the default preemption + model defined during compile time. + + The feature is primarily interesting for Linux distributions which + provide a pre-built kernel binary to reduce the number of kernel + flavors they offer while still offering different usecases. + + The runtime overhead is negligible with HAVE_STATIC_CALL_INLINE enabled + but if runtime patching is not available for the specific architecture + then the potential overhead should be considered. + + Interesting if you want the same pre-built kernel should be used for + both Server and Desktop workloads. diff --git a/kernel/Makefile b/kernel/Makefile index aa7368c7eabf..320f1f3941b7 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -51,7 +51,7 @@ obj-y += livepatch/ obj-y += dma/ obj-y += entry/ -obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o +obj-$(CONFIG_KCMP) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_STACKTRACE) += stacktrace.o diff --git a/kernel/audit.c b/kernel/audit.c index 1ffc2e059027..551a394bc8f4 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -2285,7 +2285,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, uid = from_kuid(&init_user_ns, task_uid(current)); oldloginuid = from_kuid(&init_user_ns, koldloginuid); - loginuid = from_kuid(&init_user_ns, kloginuid), + loginuid = from_kuid(&init_user_ns, kloginuid); tty = audit_get_tty(); audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); @@ -2365,7 +2365,7 @@ int audit_signal_info(int sig, struct task_struct *t) * * We can not do a netlink send inside an irq context because it blocks (last * arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed on a - * queue and a tasklet is scheduled to remove them from the queue outside the + * queue and a kthread is scheduled to remove them from the queue outside the * irq context. May be called in any context. */ void audit_log_end(struct audit_buffer *ab) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ce8c9e2279ba..47fb48f42c93 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -799,12 +799,12 @@ static int audit_in_mask(const struct audit_krule *rule, unsigned long val) return rule->mask[word] & bit; } -/* At syscall entry and exit time, this filter is called if the - * audit_state is not low enough that auditing cannot take place, but is - * also not high enough that we already know we have to write an audit - * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT). +/* At syscall exit time, this filter is called if the audit_state is + * not low enough that auditing cannot take place, but is also not + * high enough that we already know we have to write an audit record + * (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT). */ -static enum audit_state audit_filter_syscall(struct task_struct *tsk, +static void audit_filter_syscall(struct task_struct *tsk, struct audit_context *ctx, struct list_head *list) { @@ -812,7 +812,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, enum audit_state state; if (auditd_test_task(tsk)) - return AUDIT_DISABLED; + return; rcu_read_lock(); list_for_each_entry_rcu(e, list, list) { @@ -821,11 +821,11 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, &state, false)) { rcu_read_unlock(); ctx->current_state = state; - return state; + return; } } rcu_read_unlock(); - return AUDIT_BUILD_CONTEXT; + return; } /* @@ -1930,7 +1930,7 @@ static inline int audit_copy_fcaps(struct audit_names *name, if (!dentry) return 0; - rc = get_vfs_caps_from_disk(dentry, &caps); + rc = get_vfs_caps_from_disk(&init_user_ns, dentry, &caps); if (rc) return rc; @@ -2481,7 +2481,8 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, ax->d.next = context->aux; context->aux = (void *)ax; - get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps); + get_vfs_caps_from_disk(&init_user_ns, + bprm->file->f_path.dentry, &vcaps); ax->fcap.permitted = vcaps.permitted; ax->fcap.inheritable = vcaps.inheritable; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 5454161407f1..a0d9eade9c80 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -287,7 +287,7 @@ int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; - tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); + tinfo = kzalloc(sizeof(*tinfo), GFP_KERNEL); if (!tinfo) return -ENOMEM; diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index 1b6b9349cb85..d99e89f113c4 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -502,13 +502,14 @@ struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash) static void bpf_common_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) { + u8 node_type = READ_ONCE(node->type); unsigned long flags; - if (WARN_ON_ONCE(node->type == BPF_LRU_LIST_T_FREE) || - WARN_ON_ONCE(node->type == BPF_LRU_LOCAL_LIST_T_FREE)) + if (WARN_ON_ONCE(node_type == BPF_LRU_LIST_T_FREE) || + WARN_ON_ONCE(node_type == BPF_LRU_LOCAL_LIST_T_FREE)) return; - if (node->type == BPF_LRU_LOCAL_LIST_T_PENDING) { + if (node_type == BPF_LRU_LOCAL_LIST_T_PENDING) { struct bpf_lru_locallist *loc_l; loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 84a36ee4a4c2..2efeb5f4b343 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -458,7 +458,7 @@ static bool btf_type_is_datasec(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; } -static u32 btf_nr_types_total(const struct btf *btf) +u32 btf_nr_types(const struct btf *btf) { u32 total = 0; @@ -476,7 +476,7 @@ s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) const char *tname; u32 i, total; - total = btf_nr_types_total(btf); + total = btf_nr_types(btf); for (i = 1; i < total; i++) { t = btf_type_by_id(btf, i); if (BTF_INFO_KIND(t->info) != kind) @@ -3540,11 +3540,6 @@ static s32 btf_datasec_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (!btf_type_vlen(t)) { - btf_verifier_log_type(env, t, "vlen == 0"); - return -EINVAL; - } - if (!t->size) { btf_verifier_log_type(env, t, "size == 0"); return -EINVAL; @@ -5296,15 +5291,16 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. */ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *reg) + struct bpf_reg_state *regs) { struct bpf_verifier_log *log = &env->log; struct bpf_prog *prog = env->prog; struct btf *btf = prog->aux->btf; const struct btf_param *args; - const struct btf_type *t; - u32 i, nargs, btf_id; + const struct btf_type *t, *ref_t; + u32 i, nargs, btf_id, type_size; const char *tname; + bool is_global; if (!prog->aux->func_info) return -EINVAL; @@ -5338,38 +5334,57 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, bpf_log(log, "Function %s has %d > 5 args\n", tname, nargs); goto out; } + + is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; /* check that BTF function arguments match actual types that the * verifier sees. */ for (i = 0; i < nargs; i++) { + struct bpf_reg_state *reg = ®s[i + 1]; + t = btf_type_by_id(btf, args[i].type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (btf_type_is_int(t) || btf_type_is_enum(t)) { - if (reg[i + 1].type == SCALAR_VALUE) + if (reg->type == SCALAR_VALUE) continue; bpf_log(log, "R%d is not a scalar\n", i + 1); goto out; } if (btf_type_is_ptr(t)) { - if (reg[i + 1].type == SCALAR_VALUE) { - bpf_log(log, "R%d is not a pointer\n", i + 1); - goto out; - } /* If function expects ctx type in BTF check that caller * is passing PTR_TO_CTX. */ if (btf_get_prog_ctx_type(log, btf, t, prog->type, i)) { - if (reg[i + 1].type != PTR_TO_CTX) { + if (reg->type != PTR_TO_CTX) { bpf_log(log, "arg#%d expected pointer to ctx, but got %s\n", i, btf_kind_str[BTF_INFO_KIND(t->info)]); goto out; } - if (check_ctx_reg(env, ®[i + 1], i + 1)) + if (check_ctx_reg(env, reg, i + 1)) goto out; continue; } + + if (!is_global) + goto out; + + t = btf_type_skip_modifiers(btf, t->type, NULL); + + ref_t = btf_resolve_size(btf, t, &type_size); + if (IS_ERR(ref_t)) { + bpf_log(log, + "arg#%d reference type('%s %s') size cannot be determined: %ld\n", + i, btf_type_str(t), btf_name_by_offset(btf, t->name_off), + PTR_ERR(ref_t)); + goto out; + } + + if (check_mem_reg(env, reg, i + 1, type_size)) + goto out; + + continue; } bpf_log(log, "Unrecognized arg#%d type %s\n", i, btf_kind_str[BTF_INFO_KIND(t->info)]); @@ -5393,14 +5408,14 @@ out: * (either PTR_TO_CTX or SCALAR_VALUE). */ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *reg) + struct bpf_reg_state *regs) { struct bpf_verifier_log *log = &env->log; struct bpf_prog *prog = env->prog; enum bpf_prog_type prog_type = prog->type; struct btf *btf = prog->aux->btf; const struct btf_param *args; - const struct btf_type *t; + const struct btf_type *t, *ref_t; u32 i, nargs, btf_id; const char *tname; @@ -5464,16 +5479,35 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, * Only PTR_TO_CTX and SCALAR are supported atm. */ for (i = 0; i < nargs; i++) { + struct bpf_reg_state *reg = ®s[i + 1]; + t = btf_type_by_id(btf, args[i].type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (btf_type_is_int(t) || btf_type_is_enum(t)) { - reg[i + 1].type = SCALAR_VALUE; + reg->type = SCALAR_VALUE; continue; } - if (btf_type_is_ptr(t) && - btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { - reg[i + 1].type = PTR_TO_CTX; + if (btf_type_is_ptr(t)) { + if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { + reg->type = PTR_TO_CTX; + continue; + } + + t = btf_type_skip_modifiers(btf, t->type, NULL); + + ref_t = btf_resolve_size(btf, t, ®->mem_size); + if (IS_ERR(ref_t)) { + bpf_log(log, + "arg#%d reference type('%s %s') size cannot be determined: %ld\n", + i, btf_type_str(t), btf_name_by_offset(btf, t->name_off), + PTR_ERR(ref_t)); + return -EINVAL; + } + + reg->type = PTR_TO_MEM_OR_NULL; + reg->id = ++env->id_gen; + continue; } bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n", @@ -5743,6 +5777,11 @@ bool btf_is_kernel(const struct btf *btf) return btf->kernel_btf; } +bool btf_is_module(const struct btf *btf) +{ + return btf->kernel_btf && strcmp(btf->name, "vmlinux") != 0; +} + static int btf_id_cmp_func(const void *a, const void *b) { const int *pa = a, *pb = b; @@ -5877,3 +5916,25 @@ static int __init btf_module_init(void) fs_initcall(btf_module_init); #endif /* CONFIG_DEBUG_INFO_BTF_MODULES */ + +struct module *btf_try_get_module(const struct btf *btf) +{ + struct module *res = NULL; +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + struct btf_module *btf_mod, *tmp; + + mutex_lock(&btf_module_mutex); + list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { + if (btf_mod->btf != btf) + continue; + + if (try_module_get(btf_mod->module)) + res = btf_mod->module; + + break; + } + mutex_unlock(&btf_module_mutex); +#endif + + return res; +} diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 6aa9e10c6335..b567ca46555c 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -19,7 +19,7 @@ #include "../cgroup/cgroup-internal.h" -DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); +DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_BPF_ATTACH_TYPE); EXPORT_SYMBOL(cgroup_bpf_enabled_key); void cgroup_bpf_offline(struct cgroup *cgrp) @@ -128,7 +128,7 @@ static void cgroup_bpf_release(struct work_struct *work) if (pl->link) bpf_cgroup_link_auto_detach(pl->link); kfree(pl); - static_branch_dec(&cgroup_bpf_enabled_key); + static_branch_dec(&cgroup_bpf_enabled_key[type]); } old_array = rcu_dereference_protected( cgrp->bpf.effective[type], @@ -499,7 +499,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, if (old_prog) bpf_prog_put(old_prog); else - static_branch_inc(&cgroup_bpf_enabled_key); + static_branch_inc(&cgroup_bpf_enabled_key[type]); bpf_cgroup_storages_link(new_storage, cgrp, type); return 0; @@ -698,7 +698,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, cgrp->bpf.flags[type] = 0; if (old_prog) bpf_prog_put(old_prog); - static_branch_dec(&cgroup_bpf_enabled_key); + static_branch_dec(&cgroup_bpf_enabled_key[type]); return 0; cleanup: @@ -1055,6 +1055,8 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); * @uaddr: sockaddr struct provided by user * @type: The type of program to be exectuted * @t_ctx: Pointer to attach type specific context + * @flags: Pointer to u32 which contains higher bits of BPF program + * return value (OR'ed together). * * socket is expected to be of type INET or INET6. * @@ -1064,7 +1066,8 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, enum bpf_attach_type type, - void *t_ctx) + void *t_ctx, + u32 *flags) { struct bpf_sock_addr_kern ctx = { .sk = sk, @@ -1087,7 +1090,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, } cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); + ret = BPF_PROG_RUN_ARRAY_FLAGS(cgrp->bpf.effective[type], &ctx, + BPF_PROG_RUN, flags); return ret == 1 ? 0 : -EPERM; } @@ -1298,7 +1302,8 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, return empty; } -static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) +static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen, + struct bpf_sockopt_buf *buf) { if (unlikely(max_optlen < 0)) return -EINVAL; @@ -1310,6 +1315,15 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) max_optlen = PAGE_SIZE; } + if (max_optlen <= sizeof(buf->data)) { + /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE + * bytes avoid the cost of kzalloc. + */ + ctx->optval = buf->data; + ctx->optval_end = ctx->optval + max_optlen; + return max_optlen; + } + ctx->optval = kzalloc(max_optlen, GFP_USER); if (!ctx->optval) return -ENOMEM; @@ -1319,16 +1333,26 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) return max_optlen; } -static void sockopt_free_buf(struct bpf_sockopt_kern *ctx) +static void sockopt_free_buf(struct bpf_sockopt_kern *ctx, + struct bpf_sockopt_buf *buf) { + if (ctx->optval == buf->data) + return; kfree(ctx->optval); } +static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx, + struct bpf_sockopt_buf *buf) +{ + return ctx->optval != buf->data; +} + int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, int *optname, char __user *optval, int *optlen, char **kernel_optval) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_buf buf = {}; struct bpf_sockopt_kern ctx = { .sk = sk, .level = *level, @@ -1340,8 +1364,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, * attached to the hook so we don't waste time allocating * memory and locking the socket. */ - if (!cgroup_bpf_enabled || - __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) return 0; /* Allocate a bit more than the initial user buffer for @@ -1350,7 +1373,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, */ max_optlen = max_t(int, 16, *optlen); - max_optlen = sockopt_alloc_buf(&ctx, max_optlen); + max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen; @@ -1390,14 +1413,31 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, */ if (ctx.optlen != 0) { *optlen = ctx.optlen; - *kernel_optval = ctx.optval; + /* We've used bpf_sockopt_kern->buf as an intermediary + * storage, but the BPF program indicates that we need + * to pass this data to the kernel setsockopt handler. + * No way to export on-stack buf, have to allocate a + * new buffer. + */ + if (!sockopt_buf_allocated(&ctx, &buf)) { + void *p = kmalloc(ctx.optlen, GFP_USER); + + if (!p) { + ret = -ENOMEM; + goto out; + } + memcpy(p, ctx.optval, ctx.optlen); + *kernel_optval = p; + } else { + *kernel_optval = ctx.optval; + } /* export and don't free sockopt buf */ return 0; } } out: - sockopt_free_buf(&ctx); + sockopt_free_buf(&ctx, &buf); return ret; } @@ -1407,6 +1447,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, int retval) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_buf buf = {}; struct bpf_sockopt_kern ctx = { .sk = sk, .level = level, @@ -1419,13 +1460,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, * attached to the hook so we don't waste time allocating * memory and locking the socket. */ - if (!cgroup_bpf_enabled || - __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) return retval; ctx.optlen = max_optlen; - max_optlen = sockopt_alloc_buf(&ctx, max_optlen); + max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen; @@ -1488,9 +1528,55 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, ret = ctx.retval; out: - sockopt_free_buf(&ctx); + sockopt_free_buf(&ctx, &buf); return ret; } + +int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, + int optname, void *optval, + int *optlen, int retval) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_kern ctx = { + .sk = sk, + .level = level, + .optname = optname, + .retval = retval, + .optlen = *optlen, + .optval = optval, + .optval_end = optval + *optlen, + }; + int ret; + + /* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy + * user data back into BPF buffer when reval != 0. This is + * done as an optimization to avoid extra copy, assuming + * kernel won't populate the data in case of an error. + * Here we always pass the data and memset() should + * be called if that data shouldn't be "exported". + */ + + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + &ctx, BPF_PROG_RUN); + if (!ret) + return -EPERM; + + if (ctx.optlen > *optlen) + return -EFAULT; + + /* BPF programs only allowed to set retval to 0, not some + * arbitrary value. + */ + if (ctx.retval != 0 && ctx.retval != retval) + return -EFAULT; + + /* BPF programs can shrink the buffer, export the modifications. + */ + if (ctx.optlen != 0) + *optlen = ctx.optlen; + + return ctx.retval; +} #endif static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 261f8692d0d2..0ae015ad1e05 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -91,6 +91,12 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag vfree(fp); return NULL; } + fp->active = alloc_percpu_gfp(int, GFP_KERNEL_ACCOUNT | gfp_extra_flags); + if (!fp->active) { + vfree(fp); + kfree(aux); + return NULL; + } fp->pages = size / PAGE_SIZE; fp->aux = aux; @@ -114,8 +120,9 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) if (!prog) return NULL; - prog->aux->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); - if (!prog->aux->stats) { + prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); + if (!prog->stats) { + free_percpu(prog->active); kfree(prog->aux); vfree(prog); return NULL; @@ -124,7 +131,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) for_each_possible_cpu(cpu) { struct bpf_prog_stats *pstats; - pstats = per_cpu_ptr(prog->aux->stats, cpu); + pstats = per_cpu_ptr(prog->stats, cpu); u64_stats_init(&pstats->syncp); } return prog; @@ -238,6 +245,8 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, * reallocated structure. */ fp_old->aux = NULL; + fp_old->stats = NULL; + fp_old->active = NULL; __bpf_prog_free(fp_old); } @@ -249,10 +258,11 @@ void __bpf_prog_free(struct bpf_prog *fp) if (fp->aux) { mutex_destroy(&fp->aux->used_maps_mutex); mutex_destroy(&fp->aux->dst_mutex); - free_percpu(fp->aux->stats); kfree(fp->aux->poke_tab); kfree(fp->aux); } + free_percpu(fp->stats); + free_percpu(fp->active); vfree(fp); } @@ -1309,8 +1319,8 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_3(STX, MEM, H), \ INSN_3(STX, MEM, W), \ INSN_3(STX, MEM, DW), \ - INSN_3(STX, XADD, W), \ - INSN_3(STX, XADD, DW), \ + INSN_3(STX, ATOMIC, W), \ + INSN_3(STX, ATOMIC, DW), \ /* Immediate based. */ \ INSN_3(ST, MEM, B), \ INSN_3(ST, MEM, H), \ @@ -1618,13 +1628,59 @@ out: LDX_PROBE(DW, 8) #undef LDX_PROBE - STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ - atomic_add((u32) SRC, (atomic_t *)(unsigned long) - (DST + insn->off)); - CONT; - STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ - atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) - (DST + insn->off)); +#define ATOMIC_ALU_OP(BOP, KOP) \ + case BOP: \ + if (BPF_SIZE(insn->code) == BPF_W) \ + atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \ + (DST + insn->off)); \ + else \ + atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \ + (DST + insn->off)); \ + break; \ + case BOP | BPF_FETCH: \ + if (BPF_SIZE(insn->code) == BPF_W) \ + SRC = (u32) atomic_fetch_##KOP( \ + (u32) SRC, \ + (atomic_t *)(unsigned long) (DST + insn->off)); \ + else \ + SRC = (u64) atomic64_fetch_##KOP( \ + (u64) SRC, \ + (atomic64_t *)(unsigned long) (DST + insn->off)); \ + break; + + STX_ATOMIC_DW: + STX_ATOMIC_W: + switch (IMM) { + ATOMIC_ALU_OP(BPF_ADD, add) + ATOMIC_ALU_OP(BPF_AND, and) + ATOMIC_ALU_OP(BPF_OR, or) + ATOMIC_ALU_OP(BPF_XOR, xor) +#undef ATOMIC_ALU_OP + + case BPF_XCHG: + if (BPF_SIZE(insn->code) == BPF_W) + SRC = (u32) atomic_xchg( + (atomic_t *)(unsigned long) (DST + insn->off), + (u32) SRC); + else + SRC = (u64) atomic64_xchg( + (atomic64_t *)(unsigned long) (DST + insn->off), + (u64) SRC); + break; + case BPF_CMPXCHG: + if (BPF_SIZE(insn->code) == BPF_W) + BPF_R0 = (u32) atomic_cmpxchg( + (atomic_t *)(unsigned long) (DST + insn->off), + (u32) BPF_R0, (u32) SRC); + else + BPF_R0 = (u64) atomic64_cmpxchg( + (atomic64_t *)(unsigned long) (DST + insn->off), + (u64) BPF_R0, (u64) SRC); + break; + + default: + goto default_label; + } CONT; default_label: @@ -1634,7 +1690,8 @@ out: * * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable(). */ - pr_warn("BPF interpreter: unknown opcode %02x\n", insn->code); + pr_warn("BPF interpreter: unknown opcode %02x (imm: 0x%x)\n", + insn->code, insn->imm); BUG_ON(1); return 0; } @@ -2119,6 +2176,28 @@ static void bpf_free_used_maps(struct bpf_prog_aux *aux) kfree(aux->used_maps); } +void __bpf_free_used_btfs(struct bpf_prog_aux *aux, + struct btf_mod_pair *used_btfs, u32 len) +{ +#ifdef CONFIG_BPF_SYSCALL + struct btf_mod_pair *btf_mod; + u32 i; + + for (i = 0; i < len; i++) { + btf_mod = &used_btfs[i]; + if (btf_mod->module) + module_put(btf_mod->module); + btf_put(btf_mod->btf); + } +#endif +} + +static void bpf_free_used_btfs(struct bpf_prog_aux *aux) +{ + __bpf_free_used_btfs(aux, aux->used_btfs, aux->used_btf_cnt); + kfree(aux->used_btfs); +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; @@ -2126,6 +2205,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) aux = container_of(work, struct bpf_prog_aux, work); bpf_free_used_maps(aux); + bpf_free_used_btfs(aux); if (bpf_prog_is_dev_bound(aux)) bpf_prog_offload_destroy(aux->prog); #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 747313698178..5d1469de6921 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -141,49 +141,6 @@ static void cpu_map_kthread_stop(struct work_struct *work) kthread_stop(rcpu->kthread); } -static struct sk_buff *cpu_map_build_skb(struct xdp_frame *xdpf, - struct sk_buff *skb) -{ - unsigned int hard_start_headroom; - unsigned int frame_size; - void *pkt_data_start; - - /* Part of headroom was reserved to xdpf */ - hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom; - - /* Memory size backing xdp_frame data already have reserved - * room for build_skb to place skb_shared_info in tailroom. - */ - frame_size = xdpf->frame_sz; - - pkt_data_start = xdpf->data - hard_start_headroom; - skb = build_skb_around(skb, pkt_data_start, frame_size); - if (unlikely(!skb)) - return NULL; - - skb_reserve(skb, hard_start_headroom); - __skb_put(skb, xdpf->len); - if (xdpf->metasize) - skb_metadata_set(skb, xdpf->metasize); - - /* Essential SKB info: protocol and skb->dev */ - skb->protocol = eth_type_trans(skb, xdpf->dev_rx); - - /* Optional SKB info, currently missing: - * - HW checksum info (skb->ip_summed) - * - HW RX hash (skb_set_hash) - * - RX ring dev queue index (skb_record_rx_queue) - */ - - /* Until page_pool get SKB return path, release DMA here */ - xdp_release_frame(xdpf); - - /* Allow SKB to reuse area used by xdp_frame */ - xdp_scrub_frame(xdpf); - - return skb; -} - static void __cpu_map_ring_cleanup(struct ptr_ring *ring) { /* The tear-down procedure should have made sure that queue is @@ -350,7 +307,8 @@ static int cpu_map_kthread_run(void *data) struct sk_buff *skb = skbs[i]; int ret; - skb = cpu_map_build_skb(xdpf, skb); + skb = __xdp_build_skb_from_frame(xdpf, skb, + xdpf->dev_rx); if (!skb) { xdp_return_frame(xdpf); continue; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index f6e9c68afdd4..85d9d1b72a33 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -802,9 +802,7 @@ static int dev_map_notification(struct notifier_block *notifier, break; /* will be freed in free_netdev() */ - netdev->xdp_bulkq = - __alloc_percpu_gfp(sizeof(struct xdp_dev_bulk_queue), - sizeof(void *), GFP_ATOMIC); + netdev->xdp_bulkq = alloc_percpu(struct xdp_dev_bulk_queue); if (!netdev->xdp_bulkq) return NOTIFY_BAD; diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index b44d8c447afd..3acc7e0b6916 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -80,6 +80,13 @@ const char *const bpf_alu_string[16] = { [BPF_END >> 4] = "endian", }; +static const char *const bpf_atomic_alu_string[16] = { + [BPF_ADD >> 4] = "add", + [BPF_AND >> 4] = "and", + [BPF_OR >> 4] = "or", + [BPF_XOR >> 4] = "or", +}; + static const char *const bpf_ldst_string[] = { [BPF_W >> 3] = "u32", [BPF_H >> 3] = "u16", @@ -153,14 +160,44 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); - else if (BPF_MODE(insn->code) == BPF_XADD) - verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", + else if (BPF_MODE(insn->code) == BPF_ATOMIC && + (insn->imm == BPF_ADD || insn->imm == BPF_AND || + insn->imm == BPF_OR || insn->imm == BPF_XOR)) { + verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) %s r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, + bpf_alu_string[BPF_OP(insn->imm) >> 4], + insn->src_reg); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + (insn->imm == (BPF_ADD | BPF_FETCH) || + insn->imm == (BPF_AND | BPF_FETCH) || + insn->imm == (BPF_OR | BPF_FETCH) || + insn->imm == (BPF_XOR | BPF_FETCH))) { + verbose(cbs->private_data, "(%02x) r%d = atomic%s_fetch_%s((%s *)(r%d %+d), r%d)\n", + insn->code, insn->src_reg, + BPF_SIZE(insn->code) == BPF_DW ? "64" : "", + bpf_atomic_alu_string[BPF_OP(insn->imm) >> 4], + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, insn->src_reg); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_CMPXCHG) { + verbose(cbs->private_data, "(%02x) r0 = atomic%s_cmpxchg((%s *)(r%d %+d), r0, r%d)\n", insn->code, + BPF_SIZE(insn->code) == BPF_DW ? "64" : "", bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); - else + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_XCHG) { + verbose(cbs->private_data, "(%02x) r%d = atomic%s_xchg((%s *)(r%d %+d), r%d)\n", + insn->code, insn->src_reg, + BPF_SIZE(insn->code) == BPF_DW ? "64" : "", + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, insn->src_reg); + } else { verbose(cbs->private_data, "BUG_%02x\n", insn->code); + } } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM) { verbose(cbs->private_data, "BUG_st_%02x\n", insn->code); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c1ac7f964bc9..d63912e73ad9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1148,7 +1148,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -1202,7 +1202,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 41ca280b1dc1..308427fe03a3 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -720,14 +720,6 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_spin_lock_proto; case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; - case BPF_FUNC_trace_printk: - if (!perfmon_capable()) - return NULL; - return bpf_get_trace_printk_proto(); - case BPF_FUNC_snprintf_btf: - if (!perfmon_capable()) - return NULL; - return &bpf_snprintf_btf_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; case BPF_FUNC_per_cpu_ptr: @@ -742,6 +734,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return NULL; switch (func_id) { + case BPF_FUNC_trace_printk: + return bpf_get_trace_printk_proto(); case BPF_FUNC_get_current_task: return &bpf_get_current_task_proto; case BPF_FUNC_probe_read_user: @@ -752,6 +746,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_probe_read_user_str_proto; case BPF_FUNC_probe_read_kernel_str: return &bpf_probe_read_kernel_str_proto; + case BPF_FUNC_snprintf_btf: + return &bpf_snprintf_btf_proto; default: return NULL; } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index dd4b7fd60ee7..1576ff331ee4 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -122,7 +122,7 @@ static struct inode *bpf_get_inode(struct super_block *sb, inode->i_mtime = inode->i_atime; inode->i_ctime = inode->i_atime; - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); return inode; } @@ -152,7 +152,8 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, dir->i_ctime = dir->i_mtime; } -static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int bpf_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -381,8 +382,8 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) return simple_lookup(dir, dentry, flags); } -static int bpf_symlink(struct inode *dir, struct dentry *dentry, - const char *target) +static int bpf_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *target) { char *link = kstrdup(target, GFP_USER | __GFP_NOWARN); struct inode *inode; @@ -507,7 +508,7 @@ static void *bpf_obj_do_get(const char __user *pathname, return ERR_PTR(ret); inode = d_backing_inode(path.dentry); - ret = inode_permission(inode, ACC_MODE(flags)); + ret = path_permission(&path, ACC_MODE(flags)); if (ret) goto out; @@ -558,7 +559,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags) static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) { struct bpf_prog *prog; - int ret = inode_permission(inode, MAY_READ); + int ret = inode_permission(&init_user_ns, inode, MAY_READ); if (ret) return ERR_PTR(ret); diff --git a/kernel/bpf/preload/iterators/iterators.c b/kernel/bpf/preload/iterators/iterators.c index b7ff87939172..5d872a705470 100644 --- a/kernel/bpf/preload/iterators/iterators.c +++ b/kernel/bpf/preload/iterators/iterators.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ -#include <argp.h> +#include <errno.h> #include <stdio.h> #include <stdlib.h> #include <string.h> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index aea96b638473..be35bfb7fb13 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -7,10 +7,9 @@ #include <linux/kernel.h> #include <linux/stacktrace.h> #include <linux/perf_event.h> -#include <linux/elf.h> -#include <linux/pagemap.h> #include <linux/irq_work.h> #include <linux/btf_ids.h> +#include <linux/buildid.h> #include "percpu_freelist.h" #define STACK_CREATE_FLAG_MASK \ @@ -115,6 +114,8 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) /* hash table size must be power of 2 */ n_buckets = roundup_pow_of_two(attr->max_entries); + if (!n_buckets) + return ERR_PTR(-E2BIG); cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); @@ -143,140 +144,6 @@ free_smap: return ERR_PTR(err); } -#define BPF_BUILD_ID 3 -/* - * Parse build id from the note segment. This logic can be shared between - * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are - * identical. - */ -static inline int stack_map_parse_build_id(void *page_addr, - unsigned char *build_id, - void *note_start, - Elf32_Word note_size) -{ - Elf32_Word note_offs = 0, new_offs; - - /* check for overflow */ - if (note_start < page_addr || note_start + note_size < note_start) - return -EINVAL; - - /* only supports note that fits in the first page */ - if (note_start + note_size > page_addr + PAGE_SIZE) - return -EINVAL; - - while (note_offs + sizeof(Elf32_Nhdr) < note_size) { - Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs); - - if (nhdr->n_type == BPF_BUILD_ID && - nhdr->n_namesz == sizeof("GNU") && - nhdr->n_descsz > 0 && - nhdr->n_descsz <= BPF_BUILD_ID_SIZE) { - memcpy(build_id, - note_start + note_offs + - ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), - nhdr->n_descsz); - memset(build_id + nhdr->n_descsz, 0, - BPF_BUILD_ID_SIZE - nhdr->n_descsz); - return 0; - } - new_offs = note_offs + sizeof(Elf32_Nhdr) + - ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4); - if (new_offs <= note_offs) /* overflow */ - break; - note_offs = new_offs; - } - return -EINVAL; -} - -/* Parse build ID from 32-bit ELF */ -static int stack_map_get_build_id_32(void *page_addr, - unsigned char *build_id) -{ - Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr; - Elf32_Phdr *phdr; - int i; - - /* only supports phdr that fits in one page */ - if (ehdr->e_phnum > - (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr)) - return -EINVAL; - - phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr)); - - for (i = 0; i < ehdr->e_phnum; ++i) { - if (phdr[i].p_type == PT_NOTE && - !stack_map_parse_build_id(page_addr, build_id, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz)) - return 0; - } - return -EINVAL; -} - -/* Parse build ID from 64-bit ELF */ -static int stack_map_get_build_id_64(void *page_addr, - unsigned char *build_id) -{ - Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr; - Elf64_Phdr *phdr; - int i; - - /* only supports phdr that fits in one page */ - if (ehdr->e_phnum > - (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) - return -EINVAL; - - phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr)); - - for (i = 0; i < ehdr->e_phnum; ++i) { - if (phdr[i].p_type == PT_NOTE && - !stack_map_parse_build_id(page_addr, build_id, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz)) - return 0; - } - return -EINVAL; -} - -/* Parse build ID of ELF file mapped to vma */ -static int stack_map_get_build_id(struct vm_area_struct *vma, - unsigned char *build_id) -{ - Elf32_Ehdr *ehdr; - struct page *page; - void *page_addr; - int ret; - - /* only works for page backed storage */ - if (!vma->vm_file) - return -EINVAL; - - page = find_get_page(vma->vm_file->f_mapping, 0); - if (!page) - return -EFAULT; /* page not mapped */ - - ret = -EINVAL; - page_addr = kmap_atomic(page); - ehdr = (Elf32_Ehdr *)page_addr; - - /* compare magic x7f "ELF" */ - if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0) - goto out; - - /* only support executable file and shared object file */ - if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) - goto out; - - if (ehdr->e_ident[EI_CLASS] == ELFCLASS32) - ret = stack_map_get_build_id_32(page_addr, build_id); - else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) - ret = stack_map_get_build_id_64(page_addr, build_id); -out: - kunmap_atomic(page_addr); - put_page(page); - return ret; -} - static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, u64 *ips, u32 trace_nr, bool user) { @@ -317,18 +184,18 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; - memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE); + memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); } return; } for (i = 0; i < trace_nr; i++) { vma = find_vma(current->mm, ips[i]); - if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) { + if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) { /* per entry fall back to ips */ id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; - memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE); + memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); continue; } id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i] diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e5999d86c76e..c859bc46d06c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1731,25 +1731,28 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) static void bpf_prog_get_stats(const struct bpf_prog *prog, struct bpf_prog_stats *stats) { - u64 nsecs = 0, cnt = 0; + u64 nsecs = 0, cnt = 0, misses = 0; int cpu; for_each_possible_cpu(cpu) { const struct bpf_prog_stats *st; unsigned int start; - u64 tnsecs, tcnt; + u64 tnsecs, tcnt, tmisses; - st = per_cpu_ptr(prog->aux->stats, cpu); + st = per_cpu_ptr(prog->stats, cpu); do { start = u64_stats_fetch_begin_irq(&st->syncp); tnsecs = st->nsecs; tcnt = st->cnt; + tmisses = st->misses; } while (u64_stats_fetch_retry_irq(&st->syncp, start)); nsecs += tnsecs; cnt += tcnt; + misses += tmisses; } stats->nsecs = nsecs; stats->cnt = cnt; + stats->misses = misses; } #ifdef CONFIG_PROC_FS @@ -1768,14 +1771,16 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) "memlock:\t%llu\n" "prog_id:\t%u\n" "run_time_ns:\t%llu\n" - "run_cnt:\t%llu\n", + "run_cnt:\t%llu\n" + "recursion_misses:\t%llu\n", prog->type, prog->jited, prog_tag, prog->pages * 1ULL << PAGE_SHIFT, prog->aux->id, stats.nsecs, - stats.cnt); + stats.cnt, + stats.misses); } #endif @@ -3438,6 +3443,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, bpf_prog_get_stats(prog, &stats); info.run_time_ns = stats.nsecs; info.run_cnt = stats.cnt; + info.recursion_misses = stats.misses; if (!bpf_capable()) { info.jited_prog_len = 0; diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 175b7b42bfc4..b68cb5d6d6eb 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -286,9 +286,248 @@ static const struct seq_operations task_file_seq_ops = { .show = task_file_seq_show, }; +struct bpf_iter_seq_task_vma_info { + /* The first field must be struct bpf_iter_seq_task_common. + * this is assumed by {init, fini}_seq_pidns() callback functions. + */ + struct bpf_iter_seq_task_common common; + struct task_struct *task; + struct vm_area_struct *vma; + u32 tid; + unsigned long prev_vm_start; + unsigned long prev_vm_end; +}; + +enum bpf_task_vma_iter_find_op { + task_vma_iter_first_vma, /* use mm->mmap */ + task_vma_iter_next_vma, /* use curr_vma->vm_next */ + task_vma_iter_find_vma, /* use find_vma() to find next vma */ +}; + +static struct vm_area_struct * +task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) +{ + struct pid_namespace *ns = info->common.ns; + enum bpf_task_vma_iter_find_op op; + struct vm_area_struct *curr_vma; + struct task_struct *curr_task; + u32 curr_tid = info->tid; + + /* If this function returns a non-NULL vma, it holds a reference to + * the task_struct, and holds read lock on vma->mm->mmap_lock. + * If this function returns NULL, it does not hold any reference or + * lock. + */ + if (info->task) { + curr_task = info->task; + curr_vma = info->vma; + /* In case of lock contention, drop mmap_lock to unblock + * the writer. + * + * After relock, call find(mm, prev_vm_end - 1) to find + * new vma to process. + * + * +------+------+-----------+ + * | VMA1 | VMA2 | VMA3 | + * +------+------+-----------+ + * | | | | + * 4k 8k 16k 400k + * + * For example, curr_vma == VMA2. Before unlock, we set + * + * prev_vm_start = 8k + * prev_vm_end = 16k + * + * There are a few cases: + * + * 1) VMA2 is freed, but VMA3 exists. + * + * find_vma() will return VMA3, just process VMA3. + * + * 2) VMA2 still exists. + * + * find_vma() will return VMA2, process VMA2->next. + * + * 3) no more vma in this mm. + * + * Process the next task. + * + * 4) find_vma() returns a different vma, VMA2'. + * + * 4.1) If VMA2 covers same range as VMA2', skip VMA2', + * because we already covered the range; + * 4.2) VMA2 and VMA2' covers different ranges, process + * VMA2'. + */ + if (mmap_lock_is_contended(curr_task->mm)) { + info->prev_vm_start = curr_vma->vm_start; + info->prev_vm_end = curr_vma->vm_end; + op = task_vma_iter_find_vma; + mmap_read_unlock(curr_task->mm); + if (mmap_read_lock_killable(curr_task->mm)) + goto finish; + } else { + op = task_vma_iter_next_vma; + } + } else { +again: + curr_task = task_seq_get_next(ns, &curr_tid, true); + if (!curr_task) { + info->tid = curr_tid + 1; + goto finish; + } + + if (curr_tid != info->tid) { + info->tid = curr_tid; + /* new task, process the first vma */ + op = task_vma_iter_first_vma; + } else { + /* Found the same tid, which means the user space + * finished data in previous buffer and read more. + * We dropped mmap_lock before returning to user + * space, so it is necessary to use find_vma() to + * find the next vma to process. + */ + op = task_vma_iter_find_vma; + } + + if (!curr_task->mm) + goto next_task; + + if (mmap_read_lock_killable(curr_task->mm)) + goto finish; + } + + switch (op) { + case task_vma_iter_first_vma: + curr_vma = curr_task->mm->mmap; + break; + case task_vma_iter_next_vma: + curr_vma = curr_vma->vm_next; + break; + case task_vma_iter_find_vma: + /* We dropped mmap_lock so it is necessary to use find_vma + * to find the next vma. This is similar to the mechanism + * in show_smaps_rollup(). + */ + curr_vma = find_vma(curr_task->mm, info->prev_vm_end - 1); + /* case 1) and 4.2) above just use curr_vma */ + + /* check for case 2) or case 4.1) above */ + if (curr_vma && + curr_vma->vm_start == info->prev_vm_start && + curr_vma->vm_end == info->prev_vm_end) + curr_vma = curr_vma->vm_next; + break; + } + if (!curr_vma) { + /* case 3) above, or case 2) 4.1) with vma->next == NULL */ + mmap_read_unlock(curr_task->mm); + goto next_task; + } + info->task = curr_task; + info->vma = curr_vma; + return curr_vma; + +next_task: + put_task_struct(curr_task); + info->task = NULL; + curr_tid++; + goto again; + +finish: + if (curr_task) + put_task_struct(curr_task); + info->task = NULL; + info->vma = NULL; + return NULL; +} + +static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_task_vma_info *info = seq->private; + struct vm_area_struct *vma; + + vma = task_vma_seq_get_next(info); + if (vma && *pos == 0) + ++*pos; + + return vma; +} + +static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_task_vma_info *info = seq->private; + + ++*pos; + return task_vma_seq_get_next(info); +} + +struct bpf_iter__task_vma { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct task_struct *, task); + __bpf_md_ptr(struct vm_area_struct *, vma); +}; + +DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta, + struct task_struct *task, struct vm_area_struct *vma) + +static int __task_vma_seq_show(struct seq_file *seq, bool in_stop) +{ + struct bpf_iter_seq_task_vma_info *info = seq->private; + struct bpf_iter__task_vma ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.task = info->task; + ctx.vma = info->vma; + return bpf_iter_run_prog(prog, &ctx); +} + +static int task_vma_seq_show(struct seq_file *seq, void *v) +{ + return __task_vma_seq_show(seq, false); +} + +static void task_vma_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_seq_task_vma_info *info = seq->private; + + if (!v) { + (void)__task_vma_seq_show(seq, true); + } else { + /* info->vma has not been seen by the BPF program. If the + * user space reads more, task_vma_seq_get_next should + * return this vma again. Set prev_vm_start to ~0UL, + * so that we don't skip the vma returned by the next + * find_vma() (case task_vma_iter_find_vma in + * task_vma_seq_get_next()). + */ + info->prev_vm_start = ~0UL; + info->prev_vm_end = info->vma->vm_end; + mmap_read_unlock(info->task->mm); + put_task_struct(info->task); + info->task = NULL; + } +} + +static const struct seq_operations task_vma_seq_ops = { + .start = task_vma_seq_start, + .next = task_vma_seq_next, + .stop = task_vma_seq_stop, + .show = task_vma_seq_show, +}; + BTF_ID_LIST(btf_task_file_ids) BTF_ID(struct, task_struct) BTF_ID(struct, file) +BTF_ID(struct, vm_area_struct) static const struct bpf_iter_seq_info task_seq_info = { .seq_ops = &task_seq_ops, @@ -328,6 +567,26 @@ static struct bpf_iter_reg task_file_reg_info = { .seq_info = &task_file_seq_info, }; +static const struct bpf_iter_seq_info task_vma_seq_info = { + .seq_ops = &task_vma_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = sizeof(struct bpf_iter_seq_task_vma_info), +}; + +static struct bpf_iter_reg task_vma_reg_info = { + .target = "task_vma", + .feature = BPF_ITER_RESCHED, + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__task_vma, task), + PTR_TO_BTF_ID_OR_NULL }, + { offsetof(struct bpf_iter__task_vma, vma), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &task_vma_seq_info, +}; + static int __init task_iter_init(void) { int ret; @@ -339,6 +598,12 @@ static int __init task_iter_init(void) task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0]; task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1]; - return bpf_iter_reg_target(&task_file_reg_info); + ret = bpf_iter_reg_target(&task_file_reg_info); + if (ret) + return ret; + + task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0]; + task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[2]; + return bpf_iter_reg_target(&task_vma_reg_info); } late_initcall(task_iter_init); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 35c5887d82ff..7bc3b3209224 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -381,55 +381,100 @@ out: mutex_unlock(&trampoline_mutex); } +#define NO_START_TIME 1 +static u64 notrace bpf_prog_start_time(void) +{ + u64 start = NO_START_TIME; + + if (static_branch_unlikely(&bpf_stats_enabled_key)) { + start = sched_clock(); + if (unlikely(!start)) + start = NO_START_TIME; + } + return start; +} + +static void notrace inc_misses_counter(struct bpf_prog *prog) +{ + struct bpf_prog_stats *stats; + + stats = this_cpu_ptr(prog->stats); + u64_stats_update_begin(&stats->syncp); + stats->misses++; + u64_stats_update_end(&stats->syncp); +} + /* The logic is similar to BPF_PROG_RUN, but with an explicit * rcu_read_lock() and migrate_disable() which are required * for the trampoline. The macro is split into - * call _bpf_prog_enter + * call __bpf_prog_enter * call prog->bpf_func * call __bpf_prog_exit + * + * __bpf_prog_enter returns: + * 0 - skip execution of the bpf prog + * 1 - execute bpf prog + * [2..MAX_U64] - excute bpf prog and record execution time. + * This is start time. */ -u64 notrace __bpf_prog_enter(void) +u64 notrace __bpf_prog_enter(struct bpf_prog *prog) __acquires(RCU) { - u64 start = 0; - rcu_read_lock(); migrate_disable(); - if (static_branch_unlikely(&bpf_stats_enabled_key)) - start = sched_clock(); - return start; + if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) { + inc_misses_counter(prog); + return 0; + } + return bpf_prog_start_time(); } -void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) - __releases(RCU) +static void notrace update_prog_stats(struct bpf_prog *prog, + u64 start) { struct bpf_prog_stats *stats; if (static_branch_unlikely(&bpf_stats_enabled_key) && - /* static_key could be enabled in __bpf_prog_enter - * and disabled in __bpf_prog_exit. + /* static_key could be enabled in __bpf_prog_enter* + * and disabled in __bpf_prog_exit*. * And vice versa. - * Hence check that 'start' is not zero. + * Hence check that 'start' is valid. */ - start) { - stats = this_cpu_ptr(prog->aux->stats); + start > NO_START_TIME) { + stats = this_cpu_ptr(prog->stats); u64_stats_update_begin(&stats->syncp); stats->cnt++; stats->nsecs += sched_clock() - start; u64_stats_update_end(&stats->syncp); } +} + +void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) + __releases(RCU) +{ + update_prog_stats(prog, start); + __this_cpu_dec(*(prog->active)); migrate_enable(); rcu_read_unlock(); } -void notrace __bpf_prog_enter_sleepable(void) +u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog) { rcu_read_lock_trace(); + migrate_disable(); might_fault(); + if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) { + inc_misses_counter(prog); + return 0; + } + return bpf_prog_start_time(); } -void notrace __bpf_prog_exit_sleepable(void) +void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start) { + update_prog_stats(prog, start); + __this_cpu_dec(*(prog->active)); + migrate_enable(); rcu_read_unlock_trace(); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e7368c5eacb7..1dda9d81f12c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -228,6 +228,12 @@ static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state) (poisoned ? BPF_MAP_KEY_POISON : 0ULL); } +static bool bpf_pseudo_call(const struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == BPF_PSEUDO_CALL; +} + struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; @@ -1073,6 +1079,51 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, __mark_reg_known_zero(regs + regno); } +static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) +{ + switch (reg->type) { + case PTR_TO_MAP_VALUE_OR_NULL: { + const struct bpf_map *map = reg->map_ptr; + + if (map->inner_map_meta) { + reg->type = CONST_PTR_TO_MAP; + reg->map_ptr = map->inner_map_meta; + } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { + reg->type = PTR_TO_XDP_SOCK; + } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || + map->map_type == BPF_MAP_TYPE_SOCKHASH) { + reg->type = PTR_TO_SOCKET; + } else { + reg->type = PTR_TO_MAP_VALUE; + } + break; + } + case PTR_TO_SOCKET_OR_NULL: + reg->type = PTR_TO_SOCKET; + break; + case PTR_TO_SOCK_COMMON_OR_NULL: + reg->type = PTR_TO_SOCK_COMMON; + break; + case PTR_TO_TCP_SOCK_OR_NULL: + reg->type = PTR_TO_TCP_SOCK; + break; + case PTR_TO_BTF_ID_OR_NULL: + reg->type = PTR_TO_BTF_ID; + break; + case PTR_TO_MEM_OR_NULL: + reg->type = PTR_TO_MEM; + break; + case PTR_TO_RDONLY_BUF_OR_NULL: + reg->type = PTR_TO_RDONLY_BUF; + break; + case PTR_TO_RDWR_BUF_OR_NULL: + reg->type = PTR_TO_RDWR_BUF; + break; + default: + WARN_ON("unknown nullable register type"); + } +} + static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) { return type_is_pkt_pointer(reg->type); @@ -1486,9 +1537,7 @@ static int check_subprogs(struct bpf_verifier_env *env) /* determine subprog starts. The end is one before the next starts */ for (i = 0; i < insn_cnt; i++) { - if (insn[i].code != (BPF_JMP | BPF_CALL)) - continue; - if (insn[i].src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn + i)) continue; if (!env->bpf_capable) { verbose(env, @@ -2271,12 +2320,14 @@ static void save_register_state(struct bpf_func_state *state, state->stack[spi].slot_type[i] = STACK_SPILL; } -/* check_stack_read/write functions track spill/fill of registers, +/* check_stack_{read,write}_fixed_off functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ -static int check_stack_write(struct bpf_verifier_env *env, - struct bpf_func_state *state, /* func where register points to */ - int off, int size, int value_regno, int insn_idx) +static int check_stack_write_fixed_off(struct bpf_verifier_env *env, + /* stack frame we're writing to */ + struct bpf_func_state *state, + int off, int size, int value_regno, + int insn_idx) { struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; @@ -2402,9 +2453,175 @@ static int check_stack_write(struct bpf_verifier_env *env, return 0; } -static int check_stack_read(struct bpf_verifier_env *env, - struct bpf_func_state *reg_state /* func where register points to */, - int off, int size, int value_regno) +/* Write the stack: 'stack[ptr_regno + off] = value_regno'. 'ptr_regno' is + * known to contain a variable offset. + * This function checks whether the write is permitted and conservatively + * tracks the effects of the write, considering that each stack slot in the + * dynamic range is potentially written to. + * + * 'off' includes 'regno->off'. + * 'value_regno' can be -1, meaning that an unknown value is being written to + * the stack. + * + * Spilled pointers in range are not marked as written because we don't know + * what's going to be actually written. This means that read propagation for + * future reads cannot be terminated by this write. + * + * For privileged programs, uninitialized stack slots are considered + * initialized by this write (even though we don't know exactly what offsets + * are going to be written to). The idea is that we don't want the verifier to + * reject future reads that access slots written to through variable offsets. + */ +static int check_stack_write_var_off(struct bpf_verifier_env *env, + /* func where register points to */ + struct bpf_func_state *state, + int ptr_regno, int off, int size, + int value_regno, int insn_idx) +{ + struct bpf_func_state *cur; /* state of the current function */ + int min_off, max_off; + int i, err; + struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL; + bool writing_zero = false; + /* set if the fact that we're writing a zero is used to let any + * stack slots remain STACK_ZERO + */ + bool zero_used = false; + + cur = env->cur_state->frame[env->cur_state->curframe]; + ptr_reg = &cur->regs[ptr_regno]; + min_off = ptr_reg->smin_value + off; + max_off = ptr_reg->smax_value + off + size; + if (value_regno >= 0) + value_reg = &cur->regs[value_regno]; + if (value_reg && register_is_null(value_reg)) + writing_zero = true; + + err = realloc_func_state(state, round_up(-min_off, BPF_REG_SIZE), + state->acquired_refs, true); + if (err) + return err; + + + /* Variable offset writes destroy any spilled pointers in range. */ + for (i = min_off; i < max_off; i++) { + u8 new_type, *stype; + int slot, spi; + + slot = -i - 1; + spi = slot / BPF_REG_SIZE; + stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; + + if (!env->allow_ptr_leaks + && *stype != NOT_INIT + && *stype != SCALAR_VALUE) { + /* Reject the write if there's are spilled pointers in + * range. If we didn't reject here, the ptr status + * would be erased below (even though not all slots are + * actually overwritten), possibly opening the door to + * leaks. + */ + verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d", + insn_idx, i); + return -EINVAL; + } + + /* Erase all spilled pointers. */ + state->stack[spi].spilled_ptr.type = NOT_INIT; + + /* Update the slot type. */ + new_type = STACK_MISC; + if (writing_zero && *stype == STACK_ZERO) { + new_type = STACK_ZERO; + zero_used = true; + } + /* If the slot is STACK_INVALID, we check whether it's OK to + * pretend that it will be initialized by this write. The slot + * might not actually be written to, and so if we mark it as + * initialized future reads might leak uninitialized memory. + * For privileged programs, we will accept such reads to slots + * that may or may not be written because, if we're reject + * them, the error would be too confusing. + */ + if (*stype == STACK_INVALID && !env->allow_uninit_stack) { + verbose(env, "uninit stack in range of var-offset write prohibited for !root; insn %d, off: %d", + insn_idx, i); + return -EINVAL; + } + *stype = new_type; + } + if (zero_used) { + /* backtracking doesn't work for STACK_ZERO yet. */ + err = mark_chain_precision(env, value_regno); + if (err) + return err; + } + return 0; +} + +/* When register 'dst_regno' is assigned some values from stack[min_off, + * max_off), we set the register's type according to the types of the + * respective stack slots. If all the stack values are known to be zeros, then + * so is the destination reg. Otherwise, the register is considered to be + * SCALAR. This function does not deal with register filling; the caller must + * ensure that all spilled registers in the stack range have been marked as + * read. + */ +static void mark_reg_stack_read(struct bpf_verifier_env *env, + /* func where src register points to */ + struct bpf_func_state *ptr_state, + int min_off, int max_off, int dst_regno) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + int i, slot, spi; + u8 *stype; + int zeros = 0; + + for (i = min_off; i < max_off; i++) { + slot = -i - 1; + spi = slot / BPF_REG_SIZE; + stype = ptr_state->stack[spi].slot_type; + if (stype[slot % BPF_REG_SIZE] != STACK_ZERO) + break; + zeros++; + } + if (zeros == max_off - min_off) { + /* any access_size read into register is zero extended, + * so the whole register == const_zero + */ + __mark_reg_const_zero(&state->regs[dst_regno]); + /* backtracking doesn't support STACK_ZERO yet, + * so mark it precise here, so that later + * backtracking can stop here. + * Backtracking may not need this if this register + * doesn't participate in pointer adjustment. + * Forward propagation of precise flag is not + * necessary either. This mark is only to stop + * backtracking. Any register that contributed + * to const 0 was marked precise before spill. + */ + state->regs[dst_regno].precise = true; + } else { + /* have read misc data from the stack */ + mark_reg_unknown(env, state->regs, dst_regno); + } + state->regs[dst_regno].live |= REG_LIVE_WRITTEN; +} + +/* Read the stack at 'off' and put the results into the register indicated by + * 'dst_regno'. It handles reg filling if the addressed stack slot is a + * spilled reg. + * + * 'dst_regno' can be -1, meaning that the read value is not going to a + * register. + * + * The access is assumed to be within the current stack bounds. + */ +static int check_stack_read_fixed_off(struct bpf_verifier_env *env, + /* func where src register points to */ + struct bpf_func_state *reg_state, + int off, int size, int dst_regno) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; @@ -2412,11 +2629,6 @@ static int check_stack_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg; u8 *stype; - if (reg_state->allocated_stack <= slot) { - verbose(env, "invalid read from stack off %d+0 size %d\n", - off, size); - return -EACCES; - } stype = reg_state->stack[spi].slot_type; reg = ®_state->stack[spi].spilled_ptr; @@ -2427,9 +2639,9 @@ static int check_stack_read(struct bpf_verifier_env *env, verbose(env, "invalid size of register fill\n"); return -EACCES; } - if (value_regno >= 0) { - mark_reg_unknown(env, state->regs, value_regno); - state->regs[value_regno].live |= REG_LIVE_WRITTEN; + if (dst_regno >= 0) { + mark_reg_unknown(env, state->regs, dst_regno); + state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); return 0; @@ -2441,16 +2653,16 @@ static int check_stack_read(struct bpf_verifier_env *env, } } - if (value_regno >= 0) { + if (dst_regno >= 0) { /* restore register state from stack */ - state->regs[value_regno] = *reg; + state->regs[dst_regno] = *reg; /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions */ - state->regs[value_regno].live |= REG_LIVE_WRITTEN; + state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } else if (__is_pointer_value(env->allow_ptr_leaks, reg)) { - /* If value_regno==-1, the caller is asking us whether + /* If dst_regno==-1, the caller is asking us whether * it is acceptable to use this value as a SCALAR_VALUE * (e.g. for XADD). * We must not allow unprivileged callers to do that @@ -2462,70 +2674,167 @@ static int check_stack_read(struct bpf_verifier_env *env, } mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); } else { - int zeros = 0; + u8 type; for (i = 0; i < size; i++) { - if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC) + type = stype[(slot - i) % BPF_REG_SIZE]; + if (type == STACK_MISC) continue; - if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) { - zeros++; + if (type == STACK_ZERO) continue; - } verbose(env, "invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; } mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); - if (value_regno >= 0) { - if (zeros == size) { - /* any size read into register is zero extended, - * so the whole register == const_zero - */ - __mark_reg_const_zero(&state->regs[value_regno]); - /* backtracking doesn't support STACK_ZERO yet, - * so mark it precise here, so that later - * backtracking can stop here. - * Backtracking may not need this if this register - * doesn't participate in pointer adjustment. - * Forward propagation of precise flag is not - * necessary either. This mark is only to stop - * backtracking. Any register that contributed - * to const 0 was marked precise before spill. - */ - state->regs[value_regno].precise = true; - } else { - /* have read misc data from the stack */ - mark_reg_unknown(env, state->regs, value_regno); - } - state->regs[value_regno].live |= REG_LIVE_WRITTEN; - } + if (dst_regno >= 0) + mark_reg_stack_read(env, reg_state, off, off + size, dst_regno); } return 0; } -static int check_stack_access(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, - int off, int size) +enum stack_access_src { + ACCESS_DIRECT = 1, /* the access is performed by an instruction */ + ACCESS_HELPER = 2, /* the access is performed by a helper */ +}; + +static int check_stack_range_initialized(struct bpf_verifier_env *env, + int regno, int off, int access_size, + bool zero_size_allowed, + enum stack_access_src type, + struct bpf_call_arg_meta *meta); + +static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) { - /* Stack accesses must be at a fixed offset, so that we - * can determine what type of data were returned. See - * check_stack_read(). + return cur_regs(env) + regno; +} + +/* Read the stack at 'ptr_regno + off' and put the result into the register + * 'dst_regno'. + * 'off' includes the pointer register's fixed offset(i.e. 'ptr_regno.off'), + * but not its variable offset. + * 'size' is assumed to be <= reg size and the access is assumed to be aligned. + * + * As opposed to check_stack_read_fixed_off, this function doesn't deal with + * filling registers (i.e. reads of spilled register cannot be detected when + * the offset is not fixed). We conservatively mark 'dst_regno' as containing + * SCALAR_VALUE. That's why we assert that the 'ptr_regno' has a variable + * offset; for a fixed offset check_stack_read_fixed_off should be used + * instead. + */ +static int check_stack_read_var_off(struct bpf_verifier_env *env, + int ptr_regno, int off, int size, int dst_regno) +{ + /* The state of the source register. */ + struct bpf_reg_state *reg = reg_state(env, ptr_regno); + struct bpf_func_state *ptr_state = func(env, reg); + int err; + int min_off, max_off; + + /* Note that we pass a NULL meta, so raw access will not be permitted. */ - if (!tnum_is_const(reg->var_off)) { + err = check_stack_range_initialized(env, ptr_regno, off, size, + false, ACCESS_DIRECT, NULL); + if (err) + return err; + + min_off = reg->smin_value + off; + max_off = reg->smax_value + off; + mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno); + return 0; +} + +/* check_stack_read dispatches to check_stack_read_fixed_off or + * check_stack_read_var_off. + * + * The caller must ensure that the offset falls within the allocated stack + * bounds. + * + * 'dst_regno' is a register which will receive the value from the stack. It + * can be -1, meaning that the read value is not going to a register. + */ +static int check_stack_read(struct bpf_verifier_env *env, + int ptr_regno, int off, int size, + int dst_regno) +{ + struct bpf_reg_state *reg = reg_state(env, ptr_regno); + struct bpf_func_state *state = func(env, reg); + int err; + /* Some accesses are only permitted with a static offset. */ + bool var_off = !tnum_is_const(reg->var_off); + + /* The offset is required to be static when reads don't go to a + * register, in order to not leak pointers (see + * check_stack_read_fixed_off). + */ + if (dst_regno < 0 && var_off) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable stack access var_off=%s off=%d size=%d\n", + verbose(env, "variable offset stack pointer cannot be passed into helper function; var_off=%s off=%d size=%d\n", tn_buf, off, size); return -EACCES; } + /* Variable offset is prohibited for unprivileged mode for simplicity + * since it requires corresponding support in Spectre masking for stack + * ALU. See also retrieve_ptr_limit(). + */ + if (!env->bypass_spec_v1 && var_off) { + char tn_buf[48]; - if (off >= 0 || off < -MAX_BPF_STACK) { - verbose(env, "invalid stack off=%d size=%d\n", off, size); + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n", + ptr_regno, tn_buf); return -EACCES; } - return 0; + if (!var_off) { + off += reg->var_off.value; + err = check_stack_read_fixed_off(env, state, off, size, + dst_regno); + } else { + /* Variable offset stack reads need more conservative handling + * than fixed offset ones. Note that dst_regno >= 0 on this + * branch. + */ + err = check_stack_read_var_off(env, ptr_regno, off, size, + dst_regno); + } + return err; +} + + +/* check_stack_write dispatches to check_stack_write_fixed_off or + * check_stack_write_var_off. + * + * 'ptr_regno' is the register used as a pointer into the stack. + * 'off' includes 'ptr_regno->off', but not its variable offset (if any). + * 'value_regno' is the register whose value we're writing to the stack. It can + * be -1, meaning that we're not writing from a register. + * + * The caller must ensure that the offset falls within the maximum stack size. + */ +static int check_stack_write(struct bpf_verifier_env *env, + int ptr_regno, int off, int size, + int value_regno, int insn_idx) +{ + struct bpf_reg_state *reg = reg_state(env, ptr_regno); + struct bpf_func_state *state = func(env, reg); + int err; + + if (tnum_is_const(reg->var_off)) { + off += reg->var_off.value; + err = check_stack_write_fixed_off(env, state, off, size, + value_regno, insn_idx); + } else { + /* Variable offset stack reads need more conservative handling + * than fixed offset ones. + */ + err = check_stack_write_var_off(env, state, + ptr_regno, off, size, + value_regno, insn_idx); + } + return err; } static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, @@ -2858,11 +3167,6 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, return -EACCES; } -static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) -{ - return cur_regs(env) + regno; -} - static bool is_pointer_value(struct bpf_verifier_env *env, int regno) { return __is_pointer_value(env->allow_ptr_leaks, reg_state(env, regno)); @@ -2981,8 +3285,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, break; case PTR_TO_STACK: pointer_desc = "stack "; - /* The stack spill tracking logic in check_stack_write() - * and check_stack_read() relies on stack accesses being + /* The stack spill tracking logic in check_stack_write_fixed_off() + * and check_stack_read_fixed_off() relies on stack accesses being * aligned. */ strict = true; @@ -3074,9 +3378,7 @@ process_func: continue_func: subprog_end = subprog[idx + 1].start; for (; i < subprog_end; i++) { - if (insn[i].code != (BPF_JMP | BPF_CALL)) - continue; - if (insn[i].src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn + i)) continue; /* remember insn and function to return to */ ret_insn[frame] = i + 1; @@ -3400,6 +3702,91 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, return 0; } +/* Check that the stack access at the given offset is within bounds. The + * maximum valid offset is -1. + * + * The minimum valid offset is -MAX_BPF_STACK for writes, and + * -state->allocated_stack for reads. + */ +static int check_stack_slot_within_bounds(int off, + struct bpf_func_state *state, + enum bpf_access_type t) +{ + int min_valid_off; + + if (t == BPF_WRITE) + min_valid_off = -MAX_BPF_STACK; + else + min_valid_off = -state->allocated_stack; + + if (off < min_valid_off || off > -1) + return -EACCES; + return 0; +} + +/* Check that the stack access at 'regno + off' falls within the maximum stack + * bounds. + * + * 'off' includes `regno->offset`, but not its dynamic part (if any). + */ +static int check_stack_access_within_bounds( + struct bpf_verifier_env *env, + int regno, int off, int access_size, + enum stack_access_src src, enum bpf_access_type type) +{ + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = regs + regno; + struct bpf_func_state *state = func(env, reg); + int min_off, max_off; + int err; + char *err_extra; + + if (src == ACCESS_HELPER) + /* We don't know if helpers are reading or writing (or both). */ + err_extra = " indirect access to"; + else if (type == BPF_READ) + err_extra = " read from"; + else + err_extra = " write to"; + + if (tnum_is_const(reg->var_off)) { + min_off = reg->var_off.value + off; + if (access_size > 0) + max_off = min_off + access_size - 1; + else + max_off = min_off; + } else { + if (reg->smax_value >= BPF_MAX_VAR_OFF || + reg->smin_value <= -BPF_MAX_VAR_OFF) { + verbose(env, "invalid unbounded variable-offset%s stack R%d\n", + err_extra, regno); + return -EACCES; + } + min_off = reg->smin_value + off; + if (access_size > 0) + max_off = reg->smax_value + off + access_size - 1; + else + max_off = min_off; + } + + err = check_stack_slot_within_bounds(min_off, state, type); + if (!err) + err = check_stack_slot_within_bounds(max_off, state, type); + + if (err) { + if (tnum_is_const(reg->var_off)) { + verbose(env, "invalid%s stack R%d off=%d size=%d\n", + err_extra, regno, off, access_size); + } else { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "invalid variable-offset%s stack R%d var_off=%s size=%d\n", + err_extra, regno, tn_buf, access_size); + } + } + return err; +} /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory @@ -3515,8 +3902,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } } else if (reg->type == PTR_TO_STACK) { - off += reg->var_off.value; - err = check_stack_access(env, reg, off, size); + /* Basic bounds checks. */ + err = check_stack_access_within_bounds(env, regno, off, size, ACCESS_DIRECT, t); if (err) return err; @@ -3525,12 +3912,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err) return err; - if (t == BPF_WRITE) - err = check_stack_write(env, state, off, size, - value_regno, insn_idx); - else - err = check_stack_read(env, state, off, size, + if (t == BPF_READ) + err = check_stack_read(env, regno, off, size, value_regno); + else + err = check_stack_write(env, regno, off, size, + value_regno, insn_idx); } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose(env, "cannot write into packet\n"); @@ -3606,13 +3993,30 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return err; } -static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) +static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) { + int load_reg; int err; - if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || - insn->imm != 0) { - verbose(env, "BPF_XADD uses reserved fields\n"); + switch (insn->imm) { + case BPF_ADD: + case BPF_ADD | BPF_FETCH: + case BPF_AND: + case BPF_AND | BPF_FETCH: + case BPF_OR: + case BPF_OR | BPF_FETCH: + case BPF_XOR: + case BPF_XOR | BPF_FETCH: + case BPF_XCHG: + case BPF_CMPXCHG: + break; + default: + verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm); + return -EINVAL; + } + + if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) { + verbose(env, "invalid atomic operand size\n"); return -EINVAL; } @@ -3626,6 +4030,13 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins if (err) return err; + if (insn->imm == BPF_CMPXCHG) { + /* Check comparison of R0 with memory location */ + err = check_reg_arg(env, BPF_REG_0, SRC_OP); + if (err) + return err; + } + if (is_pointer_value(env, insn->src_reg)) { verbose(env, "R%d leaks addr into mem\n", insn->src_reg); return -EACCES; @@ -3635,66 +4046,91 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins is_pkt_reg(env, insn->dst_reg) || is_flow_key_reg(env, insn->dst_reg) || is_sk_reg(env, insn->dst_reg)) { - verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", + verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str[reg_state(env, insn->dst_reg)->type]); return -EACCES; } - /* check whether atomic_add can read the memory */ + if (insn->imm & BPF_FETCH) { + if (insn->imm == BPF_CMPXCHG) + load_reg = BPF_REG_0; + else + load_reg = insn->src_reg; + + /* check and record load of old value */ + err = check_reg_arg(env, load_reg, DST_OP); + if (err) + return err; + } else { + /* This instruction accesses a memory location but doesn't + * actually load it into a register. + */ + load_reg = -1; + } + + /* check whether we can read the memory */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, -1, true); + BPF_SIZE(insn->code), BPF_READ, load_reg, true); if (err) return err; - /* check whether atomic_add can write into the same memory */ - return check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_WRITE, -1, true); -} - -static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno, - int off, int access_size, - bool zero_size_allowed) -{ - struct bpf_reg_state *reg = reg_state(env, regno); - - if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || - access_size < 0 || (access_size == 0 && !zero_size_allowed)) { - if (tnum_is_const(reg->var_off)) { - verbose(env, "invalid stack type R%d off=%d access_size=%d\n", - regno, off, access_size); - } else { - char tn_buf[48]; + /* check whether we can write into the same memory */ + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, -1, true); + if (err) + return err; - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid stack type R%d var_off=%s access_size=%d\n", - regno, tn_buf, access_size); - } - return -EACCES; - } return 0; } -/* when register 'regno' is passed into function that will read 'access_size' - * bytes from that pointer, make sure that it's within stack boundary - * and all elements of stack are initialized. - * Unlike most pointer bounds-checking functions, this one doesn't take an - * 'off' argument, so it has to add in reg->off itself. +/* When register 'regno' is used to read the stack (either directly or through + * a helper function) make sure that it's within stack boundary and, depending + * on the access type, that all elements of the stack are initialized. + * + * 'off' includes 'regno->off', but not its dynamic part (if any). + * + * All registers that have been spilled on the stack in the slots within the + * read offsets are marked as read. */ -static int check_stack_boundary(struct bpf_verifier_env *env, int regno, - int access_size, bool zero_size_allowed, - struct bpf_call_arg_meta *meta) +static int check_stack_range_initialized( + struct bpf_verifier_env *env, int regno, int off, + int access_size, bool zero_size_allowed, + enum stack_access_src type, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = func(env, reg); int err, min_off, max_off, i, j, slot, spi; + char *err_extra = type == ACCESS_HELPER ? " indirect" : ""; + enum bpf_access_type bounds_check_type; + /* Some accesses can write anything into the stack, others are + * read-only. + */ + bool clobber = false; + + if (access_size == 0 && !zero_size_allowed) { + verbose(env, "invalid zero-sized read\n"); + return -EACCES; + } + + if (type == ACCESS_HELPER) { + /* The bounds checks for writes are more permissive than for + * reads. However, if raw_mode is not set, we'll do extra + * checks below. + */ + bounds_check_type = BPF_WRITE; + clobber = true; + } else { + bounds_check_type = BPF_READ; + } + err = check_stack_access_within_bounds(env, regno, off, access_size, + type, bounds_check_type); + if (err) + return err; + if (tnum_is_const(reg->var_off)) { - min_off = max_off = reg->var_off.value + reg->off; - err = __check_stack_boundary(env, regno, min_off, access_size, - zero_size_allowed); - if (err) - return err; + min_off = max_off = reg->var_off.value + off; } else { /* Variable offset is prohibited for unprivileged mode for * simplicity since it requires corresponding support in @@ -3705,8 +4141,8 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "R%d indirect variable offset stack access prohibited for !root, var_off=%s\n", - regno, tn_buf); + verbose(env, "R%d%s variable offset stack access prohibited for !root, var_off=%s\n", + regno, err_extra, tn_buf); return -EACCES; } /* Only initialized buffer on stack is allowed to be accessed @@ -3718,28 +4154,8 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, if (meta && meta->raw_mode) meta = NULL; - if (reg->smax_value >= BPF_MAX_VAR_OFF || - reg->smax_value <= -BPF_MAX_VAR_OFF) { - verbose(env, "R%d unbounded indirect variable offset stack access\n", - regno); - return -EACCES; - } - min_off = reg->smin_value + reg->off; - max_off = reg->smax_value + reg->off; - err = __check_stack_boundary(env, regno, min_off, access_size, - zero_size_allowed); - if (err) { - verbose(env, "R%d min value is outside of stack bound\n", - regno); - return err; - } - err = __check_stack_boundary(env, regno, max_off, access_size, - zero_size_allowed); - if (err) { - verbose(env, "R%d max value is outside of stack bound\n", - regno); - return err; - } + min_off = reg->smin_value + off; + max_off = reg->smax_value + off; } if (meta && meta->raw_mode) { @@ -3759,8 +4175,10 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, if (*stype == STACK_MISC) goto mark; if (*stype == STACK_ZERO) { - /* helper can write anything into the stack */ - *stype = STACK_MISC; + if (clobber) { + /* helper can write anything into the stack */ + *stype = STACK_MISC; + } goto mark; } @@ -3771,22 +4189,24 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, if (state->stack[spi].slot_type[0] == STACK_SPILL && (state->stack[spi].spilled_ptr.type == SCALAR_VALUE || env->allow_ptr_leaks)) { - __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); - for (j = 0; j < BPF_REG_SIZE; j++) - state->stack[spi].slot_type[j] = STACK_MISC; + if (clobber) { + __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); + for (j = 0; j < BPF_REG_SIZE; j++) + state->stack[spi].slot_type[j] = STACK_MISC; + } goto mark; } err: if (tnum_is_const(reg->var_off)) { - verbose(env, "invalid indirect read from stack off %d+%d size %d\n", - min_off, i - min_off, access_size); + verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n", + err_extra, regno, min_off, i - min_off, access_size); } else { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid indirect read from stack var_off %s+%d size %d\n", - tn_buf, i - min_off, access_size); + verbose(env, "invalid%s read from stack R%d var_off %s+%d size %d\n", + err_extra, regno, tn_buf, i - min_off, access_size); } return -EACCES; mark: @@ -3835,8 +4255,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, "rdwr", &env->prog->aux->max_rdwr_access); case PTR_TO_STACK: - return check_stack_boundary(env, regno, access_size, - zero_size_allowed, meta); + return check_stack_range_initialized( + env, + regno, reg->off, access_size, + zero_size_allowed, ACCESS_HELPER, meta); default: /* scalar_value or invalid ptr */ /* Allow zero-byte read from NULL, regardless of pointer type */ if (zero_size_allowed && access_size == 0 && @@ -3850,6 +4272,29 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } } +int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + u32 regno, u32 mem_size) +{ + if (register_is_null(reg)) + return 0; + + if (reg_type_may_be_null(reg->type)) { + /* Assuming that the register contains a value check if the memory + * access is safe. Temporarily save and restore the register's state as + * the conversion shouldn't be visible to a caller. + */ + const struct bpf_reg_state saved_reg = *reg; + int rv; + + mark_ptr_not_null_reg(reg); + rv = check_helper_mem_access(env, regno, mem_size, true, NULL); + *reg = saved_reg; + return rv; + } + + return check_helper_mem_access(env, regno, mem_size, true, NULL); +} + /* Implementation details: * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL * Two bpf_map_lookups (even with the same key) will have different reg->id. @@ -4321,7 +4766,7 @@ skip_type_check: err = mark_chain_precision(env, regno); } else if (arg_type_is_alloc_size(arg_type)) { if (!tnum_is_const(reg->var_off)) { - verbose(env, "R%d unbounded size, use 'var &= const' or 'if (var < const)'\n", + verbose(env, "R%d is not a known constant'\n", regno); return -EACCES; } @@ -4834,8 +5279,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, subprog); clear_caller_saved_regs(env, caller->regs); - /* All global functions return SCALAR_VALUE */ + /* All global functions return a 64-bit SCALAR_VALUE */ mark_reg_unknown(env, caller->regs, BPF_REG_0); + caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; /* continue with next insn after call */ return 0; @@ -5500,6 +5946,41 @@ do_sim: return !ret ? -EFAULT : 0; } +/* check that stack access falls within stack limits and that 'reg' doesn't + * have a variable offset. + * + * Variable offset is prohibited for unprivileged mode for simplicity since it + * requires corresponding support in Spectre masking for stack ALU. See also + * retrieve_ptr_limit(). + * + * + * 'off' includes 'reg->off'. + */ +static int check_stack_access_for_ptr_arithmetic( + struct bpf_verifier_env *env, + int regno, + const struct bpf_reg_state *reg, + int off) +{ + if (!tnum_is_const(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "R%d variable stack access prohibited for !root, var_off=%s off=%d\n", + regno, tn_buf, off); + return -EACCES; + } + + if (off >= 0 || off < -MAX_BPF_STACK) { + verbose(env, "R%d stack pointer arithmetic goes out of range, " + "prohibited for !root; off=%d\n", regno, off); + return -EACCES; + } + + return 0; +} + + /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. * Caller should also handle BPF_MOV case separately. * If we return -EACCES, caller may want to try again treating pointer as a @@ -5743,10 +6224,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, "prohibited for !root\n", dst); return -EACCES; } else if (dst_reg->type == PTR_TO_STACK && - check_stack_access(env, dst_reg, dst_reg->off + - dst_reg->var_off.value, 1)) { - verbose(env, "R%d stack pointer arithmetic goes out of range, " - "prohibited for !root\n", dst); + check_stack_access_for_ptr_arithmetic( + env, dst, dst_reg, dst_reg->off + + dst_reg->var_off.value)) { return -EACCES; } } @@ -6225,7 +6705,7 @@ static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg, * 3) the signed bounds cross zero, so they tell us nothing * about the result * If the value in dst_reg is known nonnegative, then again the - * unsigned bounts capture the signed bounds. + * unsigned bounds capture the signed bounds. * Thus, in all cases it suffices to blow away our signed bounds * and rely on inferring new ones from the unsigned bounds and * var_off of the result. @@ -6256,7 +6736,7 @@ static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg, * 3) the signed bounds cross zero, so they tell us nothing * about the result * If the value in dst_reg is known nonnegative, then again the - * unsigned bounts capture the signed bounds. + * unsigned bounds capture the signed bounds. * Thus, in all cases it suffices to blow away our signed bounds * and rely on inferring new ones from the unsigned bounds and * var_off of the result. @@ -6877,7 +7357,7 @@ static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode) case BPF_JSGT: if (reg->s32_min_value > sval) return 1; - else if (reg->s32_max_value < sval) + else if (reg->s32_max_value <= sval) return 0; break; case BPF_JLT: @@ -6950,7 +7430,7 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) case BPF_JSGT: if (reg->smin_value > sval) return 1; - else if (reg->smax_value < sval) + else if (reg->smax_value <= sval) return 0; break; case BPF_JLT: @@ -7326,43 +7806,19 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, } if (is_null) { reg->type = SCALAR_VALUE; - } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - const struct bpf_map *map = reg->map_ptr; - - if (map->inner_map_meta) { - reg->type = CONST_PTR_TO_MAP; - reg->map_ptr = map->inner_map_meta; - } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { - reg->type = PTR_TO_XDP_SOCK; - } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || - map->map_type == BPF_MAP_TYPE_SOCKHASH) { - reg->type = PTR_TO_SOCKET; - } else { - reg->type = PTR_TO_MAP_VALUE; - } - } else if (reg->type == PTR_TO_SOCKET_OR_NULL) { - reg->type = PTR_TO_SOCKET; - } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) { - reg->type = PTR_TO_SOCK_COMMON; - } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { - reg->type = PTR_TO_TCP_SOCK; - } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) { - reg->type = PTR_TO_BTF_ID; - } else if (reg->type == PTR_TO_MEM_OR_NULL) { - reg->type = PTR_TO_MEM; - } else if (reg->type == PTR_TO_RDONLY_BUF_OR_NULL) { - reg->type = PTR_TO_RDONLY_BUF; - } else if (reg->type == PTR_TO_RDWR_BUF_OR_NULL) { - reg->type = PTR_TO_RDWR_BUF; - } - if (is_null) { /* We don't need id and ref_obj_id from this point * onwards anymore, thus we should better reset it, * so that state pruning has chances to take effect. */ reg->id = 0; reg->ref_obj_id = 0; - } else if (!reg_may_point_to_spin_lock(reg)) { + + return; + } + + mark_ptr_not_null_reg(reg); + + if (!reg_may_point_to_spin_lock(reg)) { /* For not-NULL ptr, reg->ref_obj_id will be reset * in release_reg_references(). * @@ -7945,6 +8401,9 @@ static int check_return_code(struct bpf_verifier_env *env) env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME) range = tnum_range(1, 1); + if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND || + env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND) + range = tnum_range(0, 3); break; case BPF_PROG_TYPE_CGROUP_SKB: if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { @@ -8590,7 +9049,11 @@ static bool range_within(struct bpf_reg_state *old, return old->umin_value <= cur->umin_value && old->umax_value >= cur->umax_value && old->smin_value <= cur->smin_value && - old->smax_value >= cur->smax_value; + old->smax_value >= cur->smax_value && + old->u32_min_value <= cur->u32_min_value && + old->u32_max_value >= cur->u32_max_value && + old->s32_min_value <= cur->s32_min_value && + old->s32_max_value >= cur->s32_max_value; } /* Maximum number of register states that can exist at once */ @@ -9526,14 +9989,19 @@ static int do_check(struct bpf_verifier_env *env) } else if (class == BPF_STX) { enum bpf_reg_type *prev_dst_type, dst_reg_type; - if (BPF_MODE(insn->code) == BPF_XADD) { - err = check_xadd(env, env->insn_idx, insn); + if (BPF_MODE(insn->code) == BPF_ATOMIC) { + err = check_atomic(env, env->insn_idx, insn); if (err) return err; env->insn_idx++; continue; } + if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) { + verbose(env, "BPF_STX uses reserved fields\n"); + return -EINVAL; + } + /* check src1 operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) @@ -9705,6 +10173,36 @@ process_bpf_exit: return 0; } +static int find_btf_percpu_datasec(struct btf *btf) +{ + const struct btf_type *t; + const char *tname; + int i, n; + + /* + * Both vmlinux and module each have their own ".data..percpu" + * DATASECs in BTF. So for module's case, we need to skip vmlinux BTF + * types to look at only module's own BTF types. + */ + n = btf_nr_types(btf); + if (btf_is_module(btf)) + i = btf_nr_types(btf_vmlinux); + else + i = 1; + + for(; i < n; i++) { + t = btf_type_by_id(btf, i); + if (BTF_INFO_KIND(t->info) != BTF_KIND_DATASEC) + continue; + + tname = btf_name_by_offset(btf, t->name_off); + if (!strcmp(tname, ".data..percpu")) + return i; + } + + return -ENOENT; +} + /* replace pseudo btf_id with kernel symbol address */ static int check_pseudo_btf_id(struct bpf_verifier_env *env, struct bpf_insn *insn, @@ -9712,48 +10210,57 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, { const struct btf_var_secinfo *vsi; const struct btf_type *datasec; + struct btf_mod_pair *btf_mod; const struct btf_type *t; const char *sym_name; bool percpu = false; u32 type, id = insn->imm; + struct btf *btf; s32 datasec_id; u64 addr; - int i; + int i, btf_fd, err; - if (!btf_vmlinux) { - verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); - return -EINVAL; - } - - if (insn[1].imm != 0) { - verbose(env, "reserved field (insn[1].imm) is used in pseudo_btf_id ldimm64 insn.\n"); - return -EINVAL; + btf_fd = insn[1].imm; + if (btf_fd) { + btf = btf_get_by_fd(btf_fd); + if (IS_ERR(btf)) { + verbose(env, "invalid module BTF object FD specified.\n"); + return -EINVAL; + } + } else { + if (!btf_vmlinux) { + verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); + return -EINVAL; + } + btf = btf_vmlinux; + btf_get(btf); } - t = btf_type_by_id(btf_vmlinux, id); + t = btf_type_by_id(btf, id); if (!t) { verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id); - return -ENOENT; + err = -ENOENT; + goto err_put; } if (!btf_type_is_var(t)) { - verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", - id); - return -EINVAL; + verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", id); + err = -EINVAL; + goto err_put; } - sym_name = btf_name_by_offset(btf_vmlinux, t->name_off); + sym_name = btf_name_by_offset(btf, t->name_off); addr = kallsyms_lookup_name(sym_name); if (!addr) { verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n", sym_name); - return -ENOENT; + err = -ENOENT; + goto err_put; } - datasec_id = btf_find_by_name_kind(btf_vmlinux, ".data..percpu", - BTF_KIND_DATASEC); + datasec_id = find_btf_percpu_datasec(btf); if (datasec_id > 0) { - datasec = btf_type_by_id(btf_vmlinux, datasec_id); + datasec = btf_type_by_id(btf, datasec_id); for_each_vsi(i, datasec, vsi) { if (vsi->type == id) { percpu = true; @@ -9766,10 +10273,10 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, insn[1].imm = addr >> 32; type = t->type; - t = btf_type_skip_modifiers(btf_vmlinux, type, NULL); + t = btf_type_skip_modifiers(btf, type, NULL); if (percpu) { aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID; - aux->btf_var.btf = btf_vmlinux; + aux->btf_var.btf = btf; aux->btf_var.btf_id = type; } else if (!btf_type_is_struct(t)) { const struct btf_type *ret; @@ -9777,21 +10284,54 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, u32 tsize; /* resolve the type size of ksym. */ - ret = btf_resolve_size(btf_vmlinux, t, &tsize); + ret = btf_resolve_size(btf, t, &tsize); if (IS_ERR(ret)) { - tname = btf_name_by_offset(btf_vmlinux, t->name_off); + tname = btf_name_by_offset(btf, t->name_off); verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n", tname, PTR_ERR(ret)); - return -EINVAL; + err = -EINVAL; + goto err_put; } aux->btf_var.reg_type = PTR_TO_MEM; aux->btf_var.mem_size = tsize; } else { aux->btf_var.reg_type = PTR_TO_BTF_ID; - aux->btf_var.btf = btf_vmlinux; + aux->btf_var.btf = btf; aux->btf_var.btf_id = type; } + + /* check whether we recorded this BTF (and maybe module) already */ + for (i = 0; i < env->used_btf_cnt; i++) { + if (env->used_btfs[i].btf == btf) { + btf_put(btf); + return 0; + } + } + + if (env->used_btf_cnt >= MAX_USED_BTFS) { + err = -E2BIG; + goto err_put; + } + + btf_mod = &env->used_btfs[env->used_btf_cnt]; + btf_mod->btf = btf; + btf_mod->module = NULL; + + /* if we reference variables from kernel module, bump its refcount */ + if (btf_is_module(btf)) { + btf_mod->module = btf_try_get_module(btf); + if (!btf_mod->module) { + err = -ENXIO; + goto err_put; + } + } + + env->used_btf_cnt++; + return 0; +err_put: + btf_put(btf); + return err; } static int check_map_prealloc(struct bpf_map *map) @@ -9893,15 +10433,22 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_HASH: case BPF_MAP_TYPE_LRU_HASH: case BPF_MAP_TYPE_ARRAY: + case BPF_MAP_TYPE_PERCPU_HASH: + case BPF_MAP_TYPE_PERCPU_ARRAY: + case BPF_MAP_TYPE_LRU_PERCPU_HASH: + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH_OF_MAPS: if (!is_preallocated_map(map)) { verbose(env, - "Sleepable programs can only use preallocated hash maps\n"); + "Sleepable programs can only use preallocated maps\n"); return -EINVAL; } break; + case BPF_MAP_TYPE_RINGBUF: + break; default: verbose(env, - "Sleepable programs can only use array and hash maps\n"); + "Sleepable programs can only use array, hash, and ringbuf maps\n"); return -EINVAL; } @@ -9938,13 +10485,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) return -EINVAL; } - if (BPF_CLASS(insn->code) == BPF_STX && - ((BPF_MODE(insn->code) != BPF_MEM && - BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) { - verbose(env, "BPF_STX uses reserved fields\n"); - return -EINVAL; - } - if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { struct bpf_insn_aux_data *aux; struct bpf_map *map; @@ -10088,6 +10628,13 @@ static void release_maps(struct bpf_verifier_env *env) env->used_map_cnt); } +/* drop refcnt of maps used by the rejected program */ +static void release_btfs(struct bpf_verifier_env *env) +{ + __bpf_free_used_btfs(env->prog->aux, env->used_btfs, + env->used_btf_cnt); +} + /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) { @@ -10459,6 +11006,7 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, for (i = 0; i < len; i++) { int adj_idx = i + delta; struct bpf_insn insn; + u8 load_reg; insn = insns[adj_idx]; if (!aux[adj_idx].zext_dst) { @@ -10501,9 +11049,27 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, if (!bpf_jit_needs_zext()) continue; + /* zext_dst means that we want to zero-extend whatever register + * the insn defines, which is dst_reg most of the time, with + * the notable exception of BPF_STX + BPF_ATOMIC + BPF_FETCH. + */ + if (BPF_CLASS(insn.code) == BPF_STX && + BPF_MODE(insn.code) == BPF_ATOMIC) { + /* BPF_STX + BPF_ATOMIC insns without BPF_FETCH do not + * define any registers, therefore zext_dst cannot be + * set. + */ + if (WARN_ON(!(insn.imm & BPF_FETCH))) + return -EINVAL; + load_reg = insn.imm == BPF_CMPXCHG ? BPF_REG_0 + : insn.src_reg; + } else { + load_reg = insn.dst_reg; + } + zext_patch[0] = insn; - zext_patch[1].dst_reg = insn.dst_reg; - zext_patch[1].src_reg = insn.dst_reg; + zext_patch[1].dst_reg = load_reg; + zext_patch[1].src_reg = load_reg; patch = zext_patch; patch_len = 2; apply_patch_buffer: @@ -10719,8 +11285,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) return 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; /* Upon error here we cannot fall back to interpreter but * need a hard reject of the program. Thus -EFAULT is @@ -10761,7 +11326,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) /* BPF_PROG_RUN doesn't call subprogs directly, * hence main prog stats include the runtime of subprogs. * subprogs don't have IDs and not reachable via prog_get_next_id - * func[i]->aux->stats will never be accessed and stays NULL + * func[i]->stats will never be accessed and stays NULL */ func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER); if (!func[i]) @@ -10849,8 +11414,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) for (i = 0; i < env->subprog_cnt; i++) { insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; subprog = insn->off; insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) - @@ -10895,8 +11459,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) * later look the same as if they were interpreted only. */ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; insn->off = env->insn_aux_data[i].call_imm; subprog = find_subprog(env, i + insn->off + 1); @@ -10925,8 +11488,7 @@ out_undo_insn: /* cleanup main prog to be interpreted */ prog->jit_requested = 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; insn->off = 0; insn->imm = env->insn_aux_data[i].call_imm; @@ -10961,8 +11523,7 @@ static int fixup_call_args(struct bpf_verifier_env *env) return -EINVAL; } for (i = 0; i < prog->len; i++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; depth = get_callee_stack_depth(env, insn, i); if (depth < 0) @@ -10999,30 +11560,30 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->code == (BPF_ALU | BPF_MOD | BPF_X) || insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; - struct bpf_insn mask_and_div[] = { - BPF_MOV32_REG(insn->src_reg, insn->src_reg), - /* Rx div 0 -> 0 */ - BPF_JMP_IMM(BPF_JNE, insn->src_reg, 0, 2), + bool isdiv = BPF_OP(insn->code) == BPF_DIV; + struct bpf_insn *patchlet; + struct bpf_insn chk_and_div[] = { + /* [R,W]x div 0 -> 0 */ + BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JNE | BPF_K, insn->src_reg, + 0, 2, 0), BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg), BPF_JMP_IMM(BPF_JA, 0, 0, 1), *insn, }; - struct bpf_insn mask_and_mod[] = { - BPF_MOV32_REG(insn->src_reg, insn->src_reg), - /* Rx mod 0 -> Rx */ - BPF_JMP_IMM(BPF_JEQ, insn->src_reg, 0, 1), + struct bpf_insn chk_and_mod[] = { + /* [R,W]x mod 0 -> [R,W]x */ + BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JEQ | BPF_K, insn->src_reg, + 0, 1 + (is64 ? 0 : 1), 0), *insn, + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + BPF_MOV32_REG(insn->dst_reg, insn->dst_reg), }; - struct bpf_insn *patchlet; - if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || - insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { - patchlet = mask_and_div + (is64 ? 1 : 0); - cnt = ARRAY_SIZE(mask_and_div) - (is64 ? 1 : 0); - } else { - patchlet = mask_and_mod + (is64 ? 1 : 0); - cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 1 : 0); - } + patchlet = isdiv ? chk_and_div : chk_and_mod; + cnt = isdiv ? ARRAY_SIZE(chk_and_div) : + ARRAY_SIZE(chk_and_mod) - (is64 ? 2 : 0); new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); if (!new_prog) @@ -11427,6 +11988,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) mark_reg_known_zero(env, regs, i); else if (regs[i].type == SCALAR_VALUE) mark_reg_unknown(env, regs, i); + else if (regs[i].type == PTR_TO_MEM_OR_NULL) { + const u32 mem_size = regs[i].mem_size; + + mark_reg_known_zero(env, regs, i); + regs[i].mem_size = mem_size; + regs[i].id = ++env->id_gen; + } } } else { /* 1st arg to a function */ @@ -12005,6 +12573,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->strict_alignment = false; env->allow_ptr_leaks = bpf_allow_ptr_leaks(); + env->allow_uninit_stack = bpf_allow_uninit_stack(); env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access(); env->bypass_spec_v1 = bpf_bypass_spec_v1(); env->bypass_spec_v4 = bpf_bypass_spec_v4(); @@ -12100,7 +12669,10 @@ skip_full_check: goto err_release_maps; } - if (ret == 0 && env->used_map_cnt) { + if (ret) + goto err_release_maps; + + if (env->used_map_cnt) { /* if program passed verifier, update used_maps in bpf_prog_info */ env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt, sizeof(env->used_maps[0]), @@ -12114,15 +12686,29 @@ skip_full_check: memcpy(env->prog->aux->used_maps, env->used_maps, sizeof(env->used_maps[0]) * env->used_map_cnt); env->prog->aux->used_map_cnt = env->used_map_cnt; + } + if (env->used_btf_cnt) { + /* if program passed verifier, update used_btfs in bpf_prog_aux */ + env->prog->aux->used_btfs = kmalloc_array(env->used_btf_cnt, + sizeof(env->used_btfs[0]), + GFP_KERNEL); + if (!env->prog->aux->used_btfs) { + ret = -ENOMEM; + goto err_release_maps; + } + memcpy(env->prog->aux->used_btfs, env->used_btfs, + sizeof(env->used_btfs[0]) * env->used_btf_cnt); + env->prog->aux->used_btf_cnt = env->used_btf_cnt; + } + if (env->used_map_cnt || env->used_btf_cnt) { /* program is valid. Convert pseudo bpf_ld_imm64 into generic * bpf_ld_imm64 instructions */ convert_pseudo_ld_imm64(env); } - if (ret == 0) - adjust_btf_func(env); + adjust_btf_func(env); err_release_maps: if (!env->prog->aux->used_maps) @@ -12130,6 +12716,8 @@ err_release_maps: * them now. Otherwise free_used_maps() will release them. */ release_maps(env); + if (!env->prog->aux->used_btfs) + release_btfs(env); /* extension progs temporarily inherit the attach_type of their targets for verification purposes, so set it back to zero before returning diff --git a/kernel/capability.c b/kernel/capability.c index de7eac903a2a..46a361dde042 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -484,10 +484,12 @@ EXPORT_SYMBOL(file_ns_capable); * * Return true if the inode uid and gid are within the namespace. */ -bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode) +bool privileged_wrt_inode_uidgid(struct user_namespace *ns, + struct user_namespace *mnt_userns, + const struct inode *inode) { - return kuid_has_mapping(ns, inode->i_uid) && - kgid_has_mapping(ns, inode->i_gid); + return kuid_has_mapping(ns, i_uid_into_mnt(mnt_userns, inode)) && + kgid_has_mapping(ns, i_gid_into_mnt(mnt_userns, inode)); } /** @@ -499,11 +501,13 @@ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode * * its own user namespace and that the given inode's uid and gid are * mapped into the current user namespace. */ -bool capable_wrt_inode_uidgid(const struct inode *inode, int cap) +bool capable_wrt_inode_uidgid(struct user_namespace *mnt_userns, + const struct inode *inode, int cap) { struct user_namespace *ns = current_user_ns(); - return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, inode); + return ns_capable(ns, cap) && + privileged_wrt_inode_uidgid(ns, mnt_userns, inode); } EXPORT_SYMBOL(capable_wrt_inode_uidgid); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 32596fdbcd5b..a5751784ad74 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -917,6 +917,9 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) for_each_subsys(ss, i) { if (strcmp(param->key, ss->legacy_name)) continue; + if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i)) + return invalfc(fc, "Disabled controller '%s'", + param->key); ctx->subsys_mask |= (1 << i); return 0; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 613845769103..9153b20e5cc6 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3564,6 +3564,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, { struct psi_trigger *new; struct cgroup *cgrp; + struct psi_group *psi; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) @@ -3572,7 +3573,8 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, cgroup_get(cgrp); cgroup_kn_unlock(of->kn); - new = psi_trigger_create(&cgrp->psi, buf, nbytes, res); + psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; + new = psi_trigger_create(psi, buf, nbytes, res); if (IS_ERR(new)) { cgroup_put(cgrp); return PTR_ERR(new); @@ -4670,7 +4672,7 @@ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb) if (!inode) return -ENOMEM; - ret = inode_permission(inode, MAY_WRITE); + ret = inode_permission(&init_user_ns, inode, MAY_WRITE); iput(inode); return ret; } @@ -4726,8 +4728,8 @@ static int cgroup_attach_permissions(struct cgroup *src_cgrp, return ret; } -static ssize_t cgroup_procs_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) +static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, + bool threadgroup) { struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; @@ -4738,7 +4740,7 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, true, &locked); + task = cgroup_procs_write_start(buf, threadgroup, &locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -4748,19 +4750,26 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); spin_unlock_irq(&css_set_lock); + /* process and thread migrations follow same delegation rule */ ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb, true); + of->file->f_path.dentry->d_sb, threadgroup); if (ret) goto out_finish; - ret = cgroup_attach_task(dst_cgrp, task, true); + ret = cgroup_attach_task(dst_cgrp, task, threadgroup); out_finish: cgroup_procs_write_finish(task, locked); out_unlock: cgroup_kn_unlock(of->kn); - return ret ?: nbytes; + return ret; +} + +static ssize_t cgroup_procs_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return __cgroup_procs_write(of, buf, true) ?: nbytes; } static void *cgroup_threads_start(struct seq_file *s, loff_t *pos) @@ -4771,41 +4780,7 @@ static void *cgroup_threads_start(struct seq_file *s, loff_t *pos) static ssize_t cgroup_threads_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - struct cgroup *src_cgrp, *dst_cgrp; - struct task_struct *task; - ssize_t ret; - bool locked; - - buf = strstrip(buf); - - dst_cgrp = cgroup_kn_lock_live(of->kn, false); - if (!dst_cgrp) - return -ENODEV; - - task = cgroup_procs_write_start(buf, false, &locked); - ret = PTR_ERR_OR_ZERO(task); - if (ret) - goto out_unlock; - - /* find the source cgroup */ - spin_lock_irq(&css_set_lock); - src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); - spin_unlock_irq(&css_set_lock); - - /* thread migrations follow the cgroup.procs delegation rule */ - ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb, false); - if (ret) - goto out_finish; - - ret = cgroup_attach_task(dst_cgrp, task, false); - -out_finish: - cgroup_procs_write_finish(task, locked); -out_unlock: - cgroup_kn_unlock(of->kn); - - return ret ?: nbytes; + return __cgroup_procs_write(of, buf, false) ?: nbytes; } /* cgroup core interface files for the default hierarchy */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 53c70c470a38..5258b68153e0 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -98,7 +98,7 @@ struct cpuset { * and if it ends up empty, it will inherit the parent's mask. * * - * On legacy hierachy: + * On legacy hierarchy: * * The user-configured masks are always the same with effective masks. */ @@ -1309,10 +1309,10 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, * @cs: the cpuset to consider * @tmp: temp variables for calculating effective_cpus & partition setup * - * When congifured cpumask is changed, the effective cpumasks of this cpuset + * When configured cpumask is changed, the effective cpumasks of this cpuset * and all its descendants need to be updated. * - * On legacy hierachy, effective_cpus will be the same with cpu_allowed. + * On legacy hierarchy, effective_cpus will be the same with cpu_allowed. * * Called with cpuset_mutex held */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 4e11e91010e1..1b6302ecbabe 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -330,6 +330,13 @@ void lockdep_assert_cpus_held(void) percpu_rwsem_assert_held(&cpu_hotplug_lock); } +#ifdef CONFIG_LOCKDEP +int lockdep_is_cpus_held(void) +{ + return percpu_rwsem_is_held(&cpu_hotplug_lock); +} +#endif + static void lockdep_acquire_cpus_lock(void) { rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_); diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index af6e8b4fb359..b636d517c02c 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -119,7 +119,6 @@ static DEFINE_RAW_SPINLOCK(dbg_slave_lock); */ static atomic_t masters_in_kgdb; static atomic_t slaves_in_kgdb; -static atomic_t kgdb_break_tasklet_var; atomic_t kgdb_setting_breakpoint; struct task_struct *kgdb_usethread; @@ -1084,31 +1083,6 @@ static void kgdb_unregister_callbacks(void) } } -/* - * There are times a tasklet needs to be used vs a compiled in - * break point so as to cause an exception outside a kgdb I/O module, - * such as is the case with kgdboe, where calling a breakpoint in the - * I/O driver itself would be fatal. - */ -static void kgdb_tasklet_bpt(unsigned long ing) -{ - kgdb_breakpoint(); - atomic_set(&kgdb_break_tasklet_var, 0); -} - -static DECLARE_TASKLET_OLD(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt); - -void kgdb_schedule_breakpoint(void) -{ - if (atomic_read(&kgdb_break_tasklet_var) || - atomic_read(&kgdb_active) != -1 || - atomic_read(&kgdb_setting_breakpoint)) - return; - atomic_inc(&kgdb_break_tasklet_var); - tasklet_schedule(&kgdb_tasklet_breakpoint); -} -EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint); - /** * kgdb_register_io_module - register KGDB IO module * @new_dbg_io_ops: the io ops vector @@ -1166,7 +1140,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) EXPORT_SYMBOL_GPL(kgdb_register_io_module); /** - * kkgdb_unregister_io_module - unregister KGDB IO module + * kgdb_unregister_io_module - unregister KGDB IO module * @old_dbg_io_ops: the io ops vector * * Unregister it with the KGDB core. diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index a77df59d9ca5..e149a0ac9e9e 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -595,7 +595,7 @@ static char *gdb_hex_reg_helper(int regnum, char *out) dbg_reg_def[i].size); } -/* Handle the 'p' individual regster get */ +/* Handle the 'p' individual register get */ static void gdb_cmd_reg_get(struct kgdb_state *ks) { unsigned long regnum; @@ -610,7 +610,7 @@ static void gdb_cmd_reg_get(struct kgdb_state *ks) gdb_hex_reg_helper(regnum, remcom_out_buffer); } -/* Handle the 'P' individual regster set */ +/* Handle the 'P' individual register set */ static void gdb_cmd_reg_set(struct kgdb_state *ks) { unsigned long regnum; diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index a4281fb99299..6cb92f7bbbd0 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -230,7 +230,7 @@ extern struct task_struct *kdb_curr_task(int); #define kdb_task_has_cpu(p) (task_curr(p)) -#define GFP_KDB (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL) +#define GFP_KDB (in_dbg_master() ? GFP_ATOMIC : GFP_KERNEL) extern void *debug_kmalloc(size_t size, gfp_t flags); extern void debug_kfree(void *); @@ -254,4 +254,14 @@ extern char kdb_prompt_str[]; #define KDB_WORD_SIZE ((int)sizeof(unsigned long)) #endif /* CONFIG_KGDB_KDB */ + +#define kdb_func_printf(format, args...) \ + kdb_printf("%s: " format, __func__, ## args) + +#define kdb_dbg_printf(mask, format, args...) \ + do { \ + if (KDB_DEBUG(mask)) \ + kdb_func_printf(format, ## args); \ + } while (0) + #endif /* !_KDBPRIVATE_H */ diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 6226502ce049..f7c1885abeb6 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -39,20 +39,15 @@ */ int kdbgetsymval(const char *symname, kdb_symtab_t *symtab) { - if (KDB_DEBUG(AR)) - kdb_printf("kdbgetsymval: symname=%s, symtab=%px\n", symname, - symtab); + kdb_dbg_printf(AR, "symname=%s, symtab=%px\n", symname, symtab); memset(symtab, 0, sizeof(*symtab)); symtab->sym_start = kallsyms_lookup_name(symname); if (symtab->sym_start) { - if (KDB_DEBUG(AR)) - kdb_printf("kdbgetsymval: returns 1, " - "symtab->sym_start=0x%lx\n", - symtab->sym_start); + kdb_dbg_printf(AR, "returns 1, symtab->sym_start=0x%lx\n", + symtab->sym_start); return 1; } - if (KDB_DEBUG(AR)) - kdb_printf("kdbgetsymval: returns 0\n"); + kdb_dbg_printf(AR, "returns 0\n"); return 0; } EXPORT_SYMBOL(kdbgetsymval); @@ -87,16 +82,14 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) #define knt1_size 128 /* must be >= kallsyms table size */ char *knt1 = NULL; - if (KDB_DEBUG(AR)) - kdb_printf("kdbnearsym: addr=0x%lx, symtab=%px\n", addr, symtab); + kdb_dbg_printf(AR, "addr=0x%lx, symtab=%px\n", addr, symtab); memset(symtab, 0, sizeof(*symtab)); if (addr < 4096) goto out; knt1 = debug_kmalloc(knt1_size, GFP_ATOMIC); if (!knt1) { - kdb_printf("kdbnearsym: addr=0x%lx cannot kmalloc knt1\n", - addr); + kdb_func_printf("addr=0x%lx cannot kmalloc knt1\n", addr); goto out; } symtab->sym_name = kallsyms_lookup(addr, &symbolsize , &offset, @@ -147,11 +140,8 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) if (symtab->mod_name == NULL) symtab->mod_name = "kernel"; - if (KDB_DEBUG(AR)) - kdb_printf("kdbnearsym: returns %d symtab->sym_start=0x%lx, " - "symtab->mod_name=%px, symtab->sym_name=%px (%s)\n", ret, - symtab->sym_start, symtab->mod_name, symtab->sym_name, - symtab->sym_name); + kdb_dbg_printf(AR, "returns %d symtab->sym_start=0x%lx, symtab->mod_name=%px, symtab->sym_name=%px (%s)\n", + ret, symtab->sym_start, symtab->mod_name, symtab->sym_name, symtab->sym_name); out: debug_kfree(knt1); @@ -328,7 +318,7 @@ int kdb_getarea_size(void *res, unsigned long addr, size_t size) int ret = copy_from_kernel_nofault((char *)res, (char *)addr, size); if (ret) { if (!KDB_STATE(SUPPRESS)) { - kdb_printf("kdb_getarea: Bad address 0x%lx\n", addr); + kdb_func_printf("Bad address 0x%lx\n", addr); KDB_STATE_SET(SUPPRESS); } ret = KDB_BADADDR; @@ -353,7 +343,7 @@ int kdb_putarea_size(unsigned long addr, void *res, size_t size) int ret = copy_from_kernel_nofault((char *)addr, (char *)res, size); if (ret) { if (!KDB_STATE(SUPPRESS)) { - kdb_printf("kdb_putarea: Bad address 0x%lx\n", addr); + kdb_func_printf("Bad address 0x%lx\n", addr); KDB_STATE_SET(SUPPRESS); } ret = KDB_BADADDR; @@ -435,7 +425,7 @@ int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size) fallthrough; default: diag = KDB_BADWIDTH; - kdb_printf("kdb_getphysword: bad width %ld\n", (long) size); + kdb_func_printf("bad width %zu\n", size); } return diag; } @@ -484,7 +474,7 @@ int kdb_getword(unsigned long *word, unsigned long addr, size_t size) fallthrough; default: diag = KDB_BADWIDTH; - kdb_printf("kdb_getword: bad width %ld\n", (long) size); + kdb_func_printf("bad width %zu\n", size); } return diag; } @@ -528,7 +518,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size) fallthrough; default: diag = KDB_BADWIDTH; - kdb_printf("kdb_putword: bad width %ld\n", (long) size); + kdb_func_printf("bad width %zu\n", size); } return diag; } @@ -602,8 +592,7 @@ unsigned long kdb_task_state_string(const char *s) res = ~0UL; break; default: - kdb_printf("%s: unknown flag '%c' ignored\n", - __func__, *s); + kdb_func_printf("unknown flag '%c' ignored\n", *s); break; } ++s; @@ -884,18 +873,16 @@ void debug_kusage(void) if (!debug_kusage_one_time) goto out; debug_kusage_one_time = 0; - kdb_printf("%s: debug_kmalloc memory leak dah_first %d\n", - __func__, dah_first); + kdb_func_printf("debug_kmalloc memory leak dah_first %d\n", dah_first); if (dah_first) { h_used = (struct debug_alloc_header *)debug_alloc_pool; - kdb_printf("%s: h_used %px size %d\n", __func__, h_used, - h_used->size); + kdb_func_printf("h_used %px size %d\n", h_used, h_used->size); } do { h_used = (struct debug_alloc_header *) ((char *)h_free + dah_overhead + h_free->size); - kdb_printf("%s: h_used %px size %d caller %px\n", - __func__, h_used, h_used->size, h_used->caller); + kdb_func_printf("h_used %px size %d caller %px\n", + h_used, h_used->size, h_used->caller); h_free = (struct debug_alloc_header *) (debug_alloc_pool + h_free->next); } while (h_free->next); @@ -903,8 +890,8 @@ void debug_kusage(void) ((char *)h_free + dah_overhead + h_free->size); if ((char *)h_used - debug_alloc_pool != sizeof(debug_alloc_pool_aligned)) - kdb_printf("%s: h_used %px size %d caller %px\n", - __func__, h_used, h_used->size, h_used->caller); + kdb_func_printf("h_used %px size %d caller %px\n", + h_used, h_used->size, h_used->caller); out: spin_unlock(&dap_lock); } diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 479fc145acfc..77b405508743 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -33,9 +33,6 @@ config NEED_DMA_MAP_STATE config ARCH_DMA_ADDR_T_64BIT def_bool 64BIT || PHYS_ADDR_T_64BIT -config ARCH_HAS_DMA_COHERENCE_H - bool - config ARCH_HAS_DMA_SET_MASK bool diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index f87a89d08654..84de6b1c5fab 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -16,6 +16,8 @@ #include "debug.h" #include "direct.h" +bool dma_default_coherent; + /* * Managed DMA API */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index f9d491b17b78..8442e5c9cfa2 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -184,6 +184,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, * enabled above. */ local_irq_disable_exit_to_user(); + + /* Check if any of the above work has queued a deferred wakeup */ + rcu_nocb_flush_deferred_wakeup(); + ti_work = READ_ONCE(current_thread_info()->flags); } @@ -197,6 +201,9 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) lockdep_assert_irqs_disabled(); + /* Flush pending rcuog wakeup before the last need_resched() check */ + rcu_nocb_flush_deferred_wakeup(); + if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) ti_work = exit_to_user_mode_loop(regs, ti_work); @@ -385,6 +392,9 @@ void irqentry_exit_cond_resched(void) preempt_schedule_irq(); } } +#ifdef CONFIG_PREEMPT_DYNAMIC +DEFINE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched); +#endif noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) { @@ -411,8 +421,13 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) } instrumentation_begin(); - if (IS_ENABLED(CONFIG_PREEMPTION)) + if (IS_ENABLED(CONFIG_PREEMPTION)) { +#ifdef CONFIG_PREEMT_DYNAMIC + static_call(irqentry_exit_cond_resched)(); +#else irqentry_exit_cond_resched(); +#endif + } /* Covers both tracing and lockdep */ trace_hardirqs_on(); instrumentation_end(); diff --git a/kernel/events/core.c b/kernel/events/core.c index 55d18791a72d..129dee540a8b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -53,6 +53,7 @@ #include <linux/min_heap.h> #include <linux/highmem.h> #include <linux/pgtable.h> +#include <linux/buildid.h> #include "internal.h" @@ -397,6 +398,7 @@ static atomic_t nr_ksymbol_events __read_mostly; static atomic_t nr_bpf_events __read_mostly; static atomic_t nr_cgroup_events __read_mostly; static atomic_t nr_text_poke_events __read_mostly; +static atomic_t nr_build_id_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -1595,50 +1597,91 @@ static void perf_event_groups_init(struct perf_event_groups *groups) groups->index = 0; } +static inline struct cgroup *event_cgroup(const struct perf_event *event) +{ + struct cgroup *cgroup = NULL; + +#ifdef CONFIG_CGROUP_PERF + if (event->cgrp) + cgroup = event->cgrp->css.cgroup; +#endif + + return cgroup; +} + /* * Compare function for event groups; * * Implements complex key that first sorts by CPU and then by virtual index * which provides ordering when rotating groups for the same CPU. */ -static bool -perf_event_groups_less(struct perf_event *left, struct perf_event *right) +static __always_inline int +perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup, + const u64 left_group_index, const struct perf_event *right) { - if (left->cpu < right->cpu) - return true; - if (left->cpu > right->cpu) - return false; + if (left_cpu < right->cpu) + return -1; + if (left_cpu > right->cpu) + return 1; #ifdef CONFIG_CGROUP_PERF - if (left->cgrp != right->cgrp) { - if (!left->cgrp || !left->cgrp->css.cgroup) { - /* - * Left has no cgroup but right does, no cgroups come - * first. - */ - return true; - } - if (!right->cgrp || !right->cgrp->css.cgroup) { - /* - * Right has no cgroup but left does, no cgroups come - * first. - */ - return false; - } - /* Two dissimilar cgroups, order by id. */ - if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id) - return true; + { + const struct cgroup *right_cgroup = event_cgroup(right); - return false; + if (left_cgroup != right_cgroup) { + if (!left_cgroup) { + /* + * Left has no cgroup but right does, no + * cgroups come first. + */ + return -1; + } + if (!right_cgroup) { + /* + * Right has no cgroup but left does, no + * cgroups come first. + */ + return 1; + } + /* Two dissimilar cgroups, order by id. */ + if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup)) + return -1; + + return 1; + } } #endif - if (left->group_index < right->group_index) - return true; - if (left->group_index > right->group_index) - return false; + if (left_group_index < right->group_index) + return -1; + if (left_group_index > right->group_index) + return 1; - return false; + return 0; +} + +#define __node_2_pe(node) \ + rb_entry((node), struct perf_event, group_node) + +static inline bool __group_less(struct rb_node *a, const struct rb_node *b) +{ + struct perf_event *e = __node_2_pe(a); + return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index, + __node_2_pe(b)) < 0; +} + +struct __group_key { + int cpu; + struct cgroup *cgroup; +}; + +static inline int __group_cmp(const void *key, const struct rb_node *node) +{ + const struct __group_key *a = key; + const struct perf_event *b = __node_2_pe(node); + + /* partial/subtree match: @cpu, @cgroup; ignore: @group_index */ + return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b); } /* @@ -1650,27 +1693,9 @@ static void perf_event_groups_insert(struct perf_event_groups *groups, struct perf_event *event) { - struct perf_event *node_event; - struct rb_node *parent; - struct rb_node **node; - event->group_index = ++groups->index; - node = &groups->tree.rb_node; - parent = *node; - - while (*node) { - parent = *node; - node_event = container_of(*node, struct perf_event, group_node); - - if (perf_event_groups_less(event, node_event)) - node = &parent->rb_left; - else - node = &parent->rb_right; - } - - rb_link_node(&event->group_node, parent, node); - rb_insert_color(&event->group_node, &groups->tree); + rb_add(&event->group_node, &groups->tree, __group_less); } /* @@ -1718,45 +1743,17 @@ static struct perf_event * perf_event_groups_first(struct perf_event_groups *groups, int cpu, struct cgroup *cgrp) { - struct perf_event *node_event = NULL, *match = NULL; - struct rb_node *node = groups->tree.rb_node; -#ifdef CONFIG_CGROUP_PERF - u64 node_cgrp_id, cgrp_id = 0; - - if (cgrp) - cgrp_id = cgrp->kn->id; -#endif + struct __group_key key = { + .cpu = cpu, + .cgroup = cgrp, + }; + struct rb_node *node; - while (node) { - node_event = container_of(node, struct perf_event, group_node); + node = rb_find_first(&key, &groups->tree, __group_cmp); + if (node) + return __node_2_pe(node); - if (cpu < node_event->cpu) { - node = node->rb_left; - continue; - } - if (cpu > node_event->cpu) { - node = node->rb_right; - continue; - } -#ifdef CONFIG_CGROUP_PERF - node_cgrp_id = 0; - if (node_event->cgrp && node_event->cgrp->css.cgroup) - node_cgrp_id = node_event->cgrp->css.cgroup->kn->id; - - if (cgrp_id < node_cgrp_id) { - node = node->rb_left; - continue; - } - if (cgrp_id > node_cgrp_id) { - node = node->rb_right; - continue; - } -#endif - match = node_event; - node = node->rb_left; - } - - return match; + return NULL; } /* @@ -1765,27 +1762,17 @@ perf_event_groups_first(struct perf_event_groups *groups, int cpu, static struct perf_event * perf_event_groups_next(struct perf_event *event) { - struct perf_event *next; -#ifdef CONFIG_CGROUP_PERF - u64 curr_cgrp_id = 0; - u64 next_cgrp_id = 0; -#endif - - next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node); - if (next == NULL || next->cpu != event->cpu) - return NULL; - -#ifdef CONFIG_CGROUP_PERF - if (event->cgrp && event->cgrp->css.cgroup) - curr_cgrp_id = event->cgrp->css.cgroup->kn->id; + struct __group_key key = { + .cpu = event->cpu, + .cgroup = event_cgroup(event), + }; + struct rb_node *next; - if (next->cgrp && next->cgrp->css.cgroup) - next_cgrp_id = next->cgrp->css.cgroup->kn->id; + next = rb_next_match(&key, &event->group_node, __group_cmp); + if (next) + return __node_2_pe(next); - if (curr_cgrp_id != next_cgrp_id) - return NULL; -#endif - return next; + return NULL; } /* @@ -1879,8 +1866,8 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type & PERF_SAMPLE_PERIOD) size += sizeof(data->period); - if (sample_type & PERF_SAMPLE_WEIGHT) - size += sizeof(data->weight); + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) + size += sizeof(data->weight.full); if (sample_type & PERF_SAMPLE_READ) size += event->read_size; @@ -4673,6 +4660,8 @@ static void unaccount_event(struct perf_event *event) dec = true; if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); + if (event->attr.build_id) + atomic_dec(&nr_build_id_events); if (event->attr.comm) atomic_dec(&nr_comm_events); if (event->attr.namespaces) @@ -6907,8 +6896,8 @@ void perf_output_sample(struct perf_output_handle *handle, data->regs_user.regs); } - if (sample_type & PERF_SAMPLE_WEIGHT) - perf_output_put(handle, data->weight); + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) + perf_output_put(handle, data->weight.full); if (sample_type & PERF_SAMPLE_DATA_SRC) perf_output_put(handle, data->data_src.val); @@ -8046,6 +8035,8 @@ struct perf_mmap_event { u64 ino; u64 ino_generation; u32 prot, flags; + u8 build_id[BUILD_ID_SIZE_MAX]; + u32 build_id_size; struct { struct perf_event_header header; @@ -8077,6 +8068,7 @@ static void perf_event_mmap_output(struct perf_event *event, struct perf_sample_data sample; int size = mmap_event->event_id.header.size; u32 type = mmap_event->event_id.header.type; + bool use_build_id; int ret; if (!perf_event_mmap_match(event, data)) @@ -8101,13 +8093,25 @@ static void perf_event_mmap_output(struct perf_event *event, mmap_event->event_id.pid = perf_event_pid(event, current); mmap_event->event_id.tid = perf_event_tid(event, current); + use_build_id = event->attr.build_id && mmap_event->build_id_size; + + if (event->attr.mmap2 && use_build_id) + mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID; + perf_output_put(&handle, mmap_event->event_id); if (event->attr.mmap2) { - perf_output_put(&handle, mmap_event->maj); - perf_output_put(&handle, mmap_event->min); - perf_output_put(&handle, mmap_event->ino); - perf_output_put(&handle, mmap_event->ino_generation); + if (use_build_id) { + u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 }; + + __output_copy(&handle, size, 4); + __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX); + } else { + perf_output_put(&handle, mmap_event->maj); + perf_output_put(&handle, mmap_event->min); + perf_output_put(&handle, mmap_event->ino); + perf_output_put(&handle, mmap_event->ino_generation); + } perf_output_put(&handle, mmap_event->prot); perf_output_put(&handle, mmap_event->flags); } @@ -8236,6 +8240,9 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; + if (atomic_read(&nr_build_id_events)) + build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size); + perf_iterate_sb(perf_event_mmap_output, mmap_event, NULL); @@ -11172,6 +11179,8 @@ static void account_event(struct perf_event *event) inc = true; if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); + if (event->attr.build_id) + atomic_inc(&nr_build_id_events); if (event->attr.comm) atomic_inc(&nr_comm_events); if (event->attr.namespaces) @@ -11564,6 +11573,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->sample_type & PERF_SAMPLE_CGROUP) return -EINVAL; #endif + if ((attr->sample_type & PERF_SAMPLE_WEIGHT) && + (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT)) + return -EINVAL; out: return ret; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index bf9edd8d75be..3ea7f8f92f1d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -613,41 +613,56 @@ static void put_uprobe(struct uprobe *uprobe) } } -static int match_uprobe(struct uprobe *l, struct uprobe *r) +static __always_inline +int uprobe_cmp(const struct inode *l_inode, const loff_t l_offset, + const struct uprobe *r) { - if (l->inode < r->inode) + if (l_inode < r->inode) return -1; - if (l->inode > r->inode) + if (l_inode > r->inode) return 1; - if (l->offset < r->offset) + if (l_offset < r->offset) return -1; - if (l->offset > r->offset) + if (l_offset > r->offset) return 1; return 0; } +#define __node_2_uprobe(node) \ + rb_entry((node), struct uprobe, rb_node) + +struct __uprobe_key { + struct inode *inode; + loff_t offset; +}; + +static inline int __uprobe_cmp_key(const void *key, const struct rb_node *b) +{ + const struct __uprobe_key *a = key; + return uprobe_cmp(a->inode, a->offset, __node_2_uprobe(b)); +} + +static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b) +{ + struct uprobe *u = __node_2_uprobe(a); + return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b)); +} + static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) { - struct uprobe u = { .inode = inode, .offset = offset }; - struct rb_node *n = uprobes_tree.rb_node; - struct uprobe *uprobe; - int match; + struct __uprobe_key key = { + .inode = inode, + .offset = offset, + }; + struct rb_node *node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key); - while (n) { - uprobe = rb_entry(n, struct uprobe, rb_node); - match = match_uprobe(&u, uprobe); - if (!match) - return get_uprobe(uprobe); + if (node) + return get_uprobe(__node_2_uprobe(node)); - if (match < 0) - n = n->rb_left; - else - n = n->rb_right; - } return NULL; } @@ -668,32 +683,15 @@ static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) static struct uprobe *__insert_uprobe(struct uprobe *uprobe) { - struct rb_node **p = &uprobes_tree.rb_node; - struct rb_node *parent = NULL; - struct uprobe *u; - int match; + struct rb_node *node; - while (*p) { - parent = *p; - u = rb_entry(parent, struct uprobe, rb_node); - match = match_uprobe(uprobe, u); - if (!match) - return get_uprobe(u); + node = rb_find_add(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp); + if (node) + return get_uprobe(__node_2_uprobe(node)); - if (match < 0) - p = &parent->rb_left; - else - p = &parent->rb_right; - - } - - u = NULL; - rb_link_node(&uprobe->rb_node, parent, p); - rb_insert_color(&uprobe->rb_node, &uprobes_tree); /* get access + creation ref */ refcount_set(&uprobe->ref, 2); - - return u; + return NULL; } /* diff --git a/kernel/futex.c b/kernel/futex.c index 45a13eb8894e..e68db7745039 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3012,7 +3012,7 @@ retry: * Success, we're done! No tricky corner cases. */ if (!ret) - goto out_putkey; + return ret; /* * The atomic access to the futex value generated a * pagefault, so retry the user-access and the wakeup: @@ -3029,7 +3029,7 @@ retry: * wake_futex_pi has detected invalid state. Tell user * space. */ - goto out_putkey; + return ret; } /* @@ -3050,7 +3050,7 @@ retry: default: WARN_ON_ONCE(1); - goto out_putkey; + return ret; } } @@ -3061,7 +3061,6 @@ retry: out_unlock: spin_unlock(&hb->lock); -out_putkey: return ret; pi_retry: @@ -3763,8 +3762,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, - u32, val3) + const struct __kernel_timespec __user *, utime, + u32 __user *, uaddr2, u32, val3) { struct timespec64 ts; ktime_t t, *tp = NULL; @@ -3959,7 +3958,7 @@ err_unlock: #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, - struct old_timespec32 __user *, utime, u32 __user *, uaddr2, + const struct old_timespec32 __user *, utime, u32 __user *, uaddr2, u32, val3) { struct timespec64 ts; diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 8ccd32a0cc80..bd1d85c610aa 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -27,7 +27,7 @@ static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS); /* * Run software resends of IRQ's */ -static void resend_irqs(unsigned long arg) +static void resend_irqs(struct tasklet_struct *unused) { struct irq_desc *desc; int irq; @@ -45,7 +45,7 @@ static void resend_irqs(unsigned long arg) } /* Tasklet to handle resend: */ -static DECLARE_TASKLET_OLD(resend_tasklet, resend_irqs); +static DECLARE_TASKLET(resend_tasklet, resend_irqs); static int irq_sw_resend(struct irq_desc *desc) { diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index fe9de067771c..8043a90aa50e 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -177,6 +177,11 @@ unsigned long kallsyms_lookup_name(const char *name) return module_kallsyms_lookup_name(name); } +#ifdef CONFIG_LIVEPATCH +/* + * Iterate over all symbols in vmlinux. For symbols from modules use + * module_kallsyms_on_each_symbol instead. + */ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), void *data) @@ -192,8 +197,9 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, if (ret != 0) return ret; } - return module_kallsyms_on_each_symbol(fn, data); + return 0; } +#endif /* CONFIG_LIVEPATCH */ static unsigned long get_symbol_pos(unsigned long addr, unsigned long *symbolsize, diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 3994a217bde7..3bf98db9c702 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -12,7 +12,6 @@ #include <linux/moduleparam.h> #include <linux/percpu.h> #include <linux/preempt.h> -#include <linux/random.h> #include <linux/sched.h> #include <linux/uaccess.h> @@ -101,7 +100,7 @@ static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1]; static DEFINE_PER_CPU(long, kcsan_skip); /* For kcsan_prandom_u32_max(). */ -static DEFINE_PER_CPU(struct rnd_state, kcsan_rand_state); +static DEFINE_PER_CPU(u32, kcsan_rand_state); static __always_inline atomic_long_t *find_watchpoint(unsigned long addr, size_t size, @@ -275,20 +274,17 @@ should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx * } /* - * Returns a pseudo-random number in interval [0, ep_ro). See prandom_u32_max() - * for more details. - * - * The open-coded version here is using only safe primitives for all contexts - * where we can have KCSAN instrumentation. In particular, we cannot use - * prandom_u32() directly, as its tracepoint could cause recursion. + * Returns a pseudo-random number in interval [0, ep_ro). Simple linear + * congruential generator, using constants from "Numerical Recipes". */ static u32 kcsan_prandom_u32_max(u32 ep_ro) { - struct rnd_state *state = &get_cpu_var(kcsan_rand_state); - const u32 res = prandom_u32_state(state); + u32 state = this_cpu_read(kcsan_rand_state); + + state = 1664525 * state + 1013904223; + this_cpu_write(kcsan_rand_state, state); - put_cpu_var(kcsan_rand_state); - return (u32)(((u64) res * ep_ro) >> 32); + return state % ep_ro; } static inline void reset_kcsan_skip(void) @@ -639,10 +635,14 @@ static __always_inline void check_access(const volatile void *ptr, size_t size, void __init kcsan_init(void) { + int cpu; + BUG_ON(!in_task()); kcsan_debugfs_init(); - prandom_seed_full_state(&kcsan_rand_state); + + for_each_possible_cpu(cpu) + per_cpu(kcsan_rand_state, cpu) = (u32)get_cycles(); /* * We are in the init task, and no other tasks should be running; diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index aa919585c24b..a0b6780740c8 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1076,7 +1076,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) if (!buf) return; memset(&prstatus, 0, sizeof(prstatus)); - prstatus.pr_pid = current->pid; + prstatus.common.pr_pid = current->pid; elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, &prstatus, sizeof(prstatus)); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b02086d70492..5c3447cf7ad5 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -166,6 +166,11 @@ void kimage_file_post_load_cleanup(struct kimage *image) vfree(pi->sechdrs); pi->sechdrs = NULL; +#ifdef CONFIG_IMA_KEXEC + vfree(image->ima_buffer); + image->ima_buffer = NULL; +#endif /* CONFIG_IMA_KEXEC */ + /* See if architecture has anything to cleanup post load */ arch_kimage_file_post_load_cleanup(image); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d5a3eb74a657..745f08fdd7a6 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -861,7 +861,6 @@ out: cpus_read_unlock(); } -#ifdef CONFIG_SYSCTL static void optimize_all_kprobes(void) { struct hlist_head *head; @@ -887,6 +886,7 @@ out: mutex_unlock(&kprobe_mutex); } +#ifdef CONFIG_SYSCTL static void unoptimize_all_kprobes(void) { struct hlist_head *head; @@ -1520,13 +1520,16 @@ valid: return ap; } -/* Return error if the kprobe is being re-registered */ -static inline int check_kprobe_rereg(struct kprobe *p) +/* + * Warn and return error if the kprobe is being re-registered since + * there must be a software bug. + */ +static inline int warn_kprobe_rereg(struct kprobe *p) { int ret = 0; mutex_lock(&kprobe_mutex); - if (__get_valid_kprobe(p)) + if (WARN_ON_ONCE(__get_valid_kprobe(p))) ret = -EINVAL; mutex_unlock(&kprobe_mutex); @@ -1614,7 +1617,7 @@ int register_kprobe(struct kprobe *p) return PTR_ERR(addr); p->addr = addr; - ret = check_kprobe_rereg(p); + ret = warn_kprobe_rereg(p); if (ret) return ret; @@ -1995,7 +1998,7 @@ int register_kretprobe(struct kretprobe *rp) return ret; /* If only rp->kp.addr is specified, check reregistering kprobes */ - if (rp->kp.addr && check_kprobe_rereg(&rp->kp)) + if (rp->kp.addr && warn_kprobe_rereg(&rp->kp)) return -EINVAL; if (kretprobe_blacklist_size) { @@ -2497,18 +2500,14 @@ static int __init init_kprobes(void) } } -#if defined(CONFIG_OPTPROBES) -#if defined(__ARCH_WANT_KPROBES_INSN_SLOT) - /* Init kprobe_optinsn_slots */ - kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE; -#endif - /* By default, kprobes can be optimized */ - kprobes_allow_optimization = true; -#endif - /* By default, kprobes are armed */ kprobes_all_disarmed = false; +#if defined(CONFIG_OPTPROBES) && defined(__ARCH_WANT_KPROBES_INSN_SLOT) + /* Init kprobe_optinsn_slots for allocation */ + kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE; +#endif + err = arch_init_kprobes(); if (!err) err = register_die_notifier(&kprobe_exceptions_nb); @@ -2523,6 +2522,21 @@ static int __init init_kprobes(void) } early_initcall(init_kprobes); +#if defined(CONFIG_OPTPROBES) +static int __init init_optprobes(void) +{ + /* + * Enable kprobe optimization - this kicks the optimizer which + * depends on synchronize_rcu_tasks() and ksoftirqd, that is + * not spawned in early initcall. So delay the optimization. + */ + optimize_all_kprobes(); + + return 0; +} +subsys_initcall(init_optprobes); +#endif + #ifdef CONFIG_DEBUG_FS static void report_probe(struct seq_file *pi, struct kprobe *p, const char *sym, int offset, char *modname, struct kprobe *pp) diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index f76fdb925532..335d988bd811 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -19,6 +19,7 @@ #include <linux/moduleloader.h> #include <linux/completion.h> #include <linux/memory.h> +#include <linux/rcupdate.h> #include <asm/cacheflush.h> #include "core.h" #include "patch.h" @@ -57,7 +58,7 @@ static void klp_find_object_module(struct klp_object *obj) if (!klp_is_module(obj)) return; - mutex_lock(&module_mutex); + rcu_read_lock_sched(); /* * We do not want to block removal of patched modules and therefore * we do not take a reference here. The patches are removed by @@ -74,7 +75,7 @@ static void klp_find_object_module(struct klp_object *obj) if (mod && mod->klp_alive) obj->mod = mod; - mutex_unlock(&module_mutex); + rcu_read_unlock_sched(); } static bool klp_initialized(void) @@ -163,12 +164,10 @@ static int klp_find_object_symbol(const char *objname, const char *name, .pos = sympos, }; - mutex_lock(&module_mutex); if (objname) module_kallsyms_on_each_symbol(klp_find_callback, &args); else kallsyms_on_each_symbol(klp_find_callback, &args); - mutex_unlock(&module_mutex); /* * Ensure an address was found. If sympos is 0, ensure symbol is unique; diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 6d11cfb9b41f..8838f1d7c4a2 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -15,6 +15,7 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE) endif +obj-$(CONFIG_DEBUG_IRQFLAGS) += irqflag-debug.o obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o obj-$(CONFIG_LOCKDEP) += lockdep.o ifeq ($(CONFIG_PROC_FS),y) diff --git a/kernel/locking/irqflag-debug.c b/kernel/locking/irqflag-debug.c new file mode 100644 index 000000000000..810b50344d35 --- /dev/null +++ b/kernel/locking/irqflag-debug.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/bug.h> +#include <linux/export.h> +#include <linux/irqflags.h> + +noinstr void warn_bogus_irq_restore(void) +{ + instrumentation_begin(); + WARN_ONCE(1, "raw_local_irq_restore() called with IRQs enabled\n"); + instrumentation_end(); +} +EXPORT_SYMBOL(warn_bogus_irq_restore); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index bdaf4829098c..c6d0c1dc6253 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1290,6 +1290,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) class->name_version = count_matching_names(class); class->wait_type_inner = lock->wait_type_inner; class->wait_type_outer = lock->wait_type_outer; + class->lock_type = lock->lock_type; /* * We use RCU's safe list-add method to make * parallel walking of the hash-list safe: @@ -1671,6 +1672,7 @@ static inline struct lock_list *__bfs_next(struct lock_list *lock, int offset) static enum bfs_result __bfs(struct lock_list *source_entry, void *data, bool (*match)(struct lock_list *entry, void *data), + bool (*skip)(struct lock_list *entry, void *data), struct lock_list **target_entry, int offset) { @@ -1731,7 +1733,12 @@ static enum bfs_result __bfs(struct lock_list *source_entry, /* * Step 3: we haven't visited this and there is a strong * dependency path to this, so check with @match. + * If @skip is provide and returns true, we skip this + * lock (and any path this lock is in). */ + if (skip && skip(lock, data)) + continue; + if (match(lock, data)) { *target_entry = lock; return BFS_RMATCH; @@ -1774,9 +1781,10 @@ static inline enum bfs_result __bfs_forwards(struct lock_list *src_entry, void *data, bool (*match)(struct lock_list *entry, void *data), + bool (*skip)(struct lock_list *entry, void *data), struct lock_list **target_entry) { - return __bfs(src_entry, data, match, target_entry, + return __bfs(src_entry, data, match, skip, target_entry, offsetof(struct lock_class, locks_after)); } @@ -1785,9 +1793,10 @@ static inline enum bfs_result __bfs_backwards(struct lock_list *src_entry, void *data, bool (*match)(struct lock_list *entry, void *data), + bool (*skip)(struct lock_list *entry, void *data), struct lock_list **target_entry) { - return __bfs(src_entry, data, match, target_entry, + return __bfs(src_entry, data, match, skip, target_entry, offsetof(struct lock_class, locks_before)); } @@ -2018,7 +2027,7 @@ static unsigned long __lockdep_count_forward_deps(struct lock_list *this) unsigned long count = 0; struct lock_list *target_entry; - __bfs_forwards(this, (void *)&count, noop_count, &target_entry); + __bfs_forwards(this, (void *)&count, noop_count, NULL, &target_entry); return count; } @@ -2043,7 +2052,7 @@ static unsigned long __lockdep_count_backward_deps(struct lock_list *this) unsigned long count = 0; struct lock_list *target_entry; - __bfs_backwards(this, (void *)&count, noop_count, &target_entry); + __bfs_backwards(this, (void *)&count, noop_count, NULL, &target_entry); return count; } @@ -2071,11 +2080,12 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) static noinline enum bfs_result check_path(struct held_lock *target, struct lock_list *src_entry, bool (*match)(struct lock_list *entry, void *data), + bool (*skip)(struct lock_list *entry, void *data), struct lock_list **target_entry) { enum bfs_result ret; - ret = __bfs_forwards(src_entry, target, match, target_entry); + ret = __bfs_forwards(src_entry, target, match, skip, target_entry); if (unlikely(bfs_error(ret))) print_bfs_bug(ret); @@ -2102,7 +2112,7 @@ check_noncircular(struct held_lock *src, struct held_lock *target, debug_atomic_inc(nr_cyclic_checks); - ret = check_path(target, &src_entry, hlock_conflict, &target_entry); + ret = check_path(target, &src_entry, hlock_conflict, NULL, &target_entry); if (unlikely(ret == BFS_RMATCH)) { if (!*trace) { @@ -2120,46 +2130,6 @@ check_noncircular(struct held_lock *src, struct held_lock *target, return ret; } -#ifdef CONFIG_LOCKDEP_SMALL -/* - * Check that the dependency graph starting at <src> can lead to - * <target> or not. If it can, <src> -> <target> dependency is already - * in the graph. - * - * Return BFS_RMATCH if it does, or BFS_RMATCH if it does not, return BFS_E* if - * any error appears in the bfs search. - */ -static noinline enum bfs_result -check_redundant(struct held_lock *src, struct held_lock *target) -{ - enum bfs_result ret; - struct lock_list *target_entry; - struct lock_list src_entry; - - bfs_init_root(&src_entry, src); - /* - * Special setup for check_redundant(). - * - * To report redundant, we need to find a strong dependency path that - * is equal to or stronger than <src> -> <target>. So if <src> is E, - * we need to let __bfs() only search for a path starting at a -(E*)->, - * we achieve this by setting the initial node's ->only_xr to true in - * that case. And if <prev> is S, we set initial ->only_xr to false - * because both -(S*)-> (equal) and -(E*)-> (stronger) are redundant. - */ - src_entry.only_xr = src->read == 0; - - debug_atomic_inc(nr_redundant_checks); - - ret = check_path(target, &src_entry, hlock_equal, &target_entry); - - if (ret == BFS_RMATCH) - debug_atomic_inc(nr_redundant); - - return ret; -} -#endif - #ifdef CONFIG_TRACE_IRQFLAGS /* @@ -2230,6 +2200,44 @@ static inline bool usage_match(struct lock_list *entry, void *mask) return !!((entry->class->usage_mask & LOCKF_IRQ) & *(unsigned long *)mask); } +static inline bool usage_skip(struct lock_list *entry, void *mask) +{ + /* + * Skip local_lock() for irq inversion detection. + * + * For !RT, local_lock() is not a real lock, so it won't carry any + * dependency. + * + * For RT, an irq inversion happens when we have lock A and B, and on + * some CPU we can have: + * + * lock(A); + * <interrupted> + * lock(B); + * + * where lock(B) cannot sleep, and we have a dependency B -> ... -> A. + * + * Now we prove local_lock() cannot exist in that dependency. First we + * have the observation for any lock chain L1 -> ... -> Ln, for any + * 1 <= i <= n, Li.inner_wait_type <= L1.inner_wait_type, otherwise + * wait context check will complain. And since B is not a sleep lock, + * therefore B.inner_wait_type >= 2, and since the inner_wait_type of + * local_lock() is 3, which is greater than 2, therefore there is no + * way the local_lock() exists in the dependency B -> ... -> A. + * + * As a result, we will skip local_lock(), when we search for irq + * inversion bugs. + */ + if (entry->class->lock_type == LD_LOCK_PERCPU) { + if (DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG)) + return false; + + return true; + } + + return false; +} + /* * Find a node in the forwards-direction dependency sub-graph starting * at @root->class that matches @bit. @@ -2245,7 +2253,7 @@ find_usage_forwards(struct lock_list *root, unsigned long usage_mask, debug_atomic_inc(nr_find_usage_forwards_checks); - result = __bfs_forwards(root, &usage_mask, usage_match, target_entry); + result = __bfs_forwards(root, &usage_mask, usage_match, usage_skip, target_entry); return result; } @@ -2262,7 +2270,7 @@ find_usage_backwards(struct lock_list *root, unsigned long usage_mask, debug_atomic_inc(nr_find_usage_backwards_checks); - result = __bfs_backwards(root, &usage_mask, usage_match, target_entry); + result = __bfs_backwards(root, &usage_mask, usage_match, usage_skip, target_entry); return result; } @@ -2627,7 +2635,7 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, */ bfs_init_rootb(&this, prev); - ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL); + ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, usage_skip, NULL); if (bfs_error(ret)) { print_bfs_bug(ret); return 0; @@ -2694,8 +2702,68 @@ static inline int check_irq_usage(struct task_struct *curr, { return 1; } + +static inline bool usage_skip(struct lock_list *entry, void *mask) +{ + return false; +} + #endif /* CONFIG_TRACE_IRQFLAGS */ +#ifdef CONFIG_LOCKDEP_SMALL +/* + * Check that the dependency graph starting at <src> can lead to + * <target> or not. If it can, <src> -> <target> dependency is already + * in the graph. + * + * Return BFS_RMATCH if it does, or BFS_RMATCH if it does not, return BFS_E* if + * any error appears in the bfs search. + */ +static noinline enum bfs_result +check_redundant(struct held_lock *src, struct held_lock *target) +{ + enum bfs_result ret; + struct lock_list *target_entry; + struct lock_list src_entry; + + bfs_init_root(&src_entry, src); + /* + * Special setup for check_redundant(). + * + * To report redundant, we need to find a strong dependency path that + * is equal to or stronger than <src> -> <target>. So if <src> is E, + * we need to let __bfs() only search for a path starting at a -(E*)->, + * we achieve this by setting the initial node's ->only_xr to true in + * that case. And if <prev> is S, we set initial ->only_xr to false + * because both -(S*)-> (equal) and -(E*)-> (stronger) are redundant. + */ + src_entry.only_xr = src->read == 0; + + debug_atomic_inc(nr_redundant_checks); + + /* + * Note: we skip local_lock() for redundant check, because as the + * comment in usage_skip(), A -> local_lock() -> B and A -> B are not + * the same. + */ + ret = check_path(target, &src_entry, hlock_equal, usage_skip, &target_entry); + + if (ret == BFS_RMATCH) + debug_atomic_inc(nr_redundant); + + return ret; +} + +#else + +static inline enum bfs_result +check_redundant(struct held_lock *src, struct held_lock *target) +{ + return BFS_RNOMATCH; +} + +#endif + static void inc_chains(int irq_context) { if (irq_context & LOCK_CHAIN_HARDIRQ_CONTEXT) @@ -2916,7 +2984,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, } } -#ifdef CONFIG_LOCKDEP_SMALL /* * Is the <prev> -> <next> link redundant? */ @@ -2925,7 +2992,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, return 0; else if (ret == BFS_RMATCH) return 2; -#endif if (!*trace) { *trace = save_trace(); @@ -3707,7 +3773,7 @@ static void print_usage_bug(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) { - if (!debug_locks_off_graph_unlock() || debug_locks_silent) + if (!debug_locks_off() || debug_locks_silent) return; pr_warn("\n"); @@ -3748,6 +3814,7 @@ valid_state(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) { if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) { + graph_unlock(); print_usage_bug(curr, this, bad_bit, new_bit); return 0; } @@ -4503,9 +4570,9 @@ print_lock_invalid_wait_context(struct task_struct *curr, */ static int check_wait_context(struct task_struct *curr, struct held_lock *next) { - short next_inner = hlock_class(next)->wait_type_inner; - short next_outer = hlock_class(next)->wait_type_outer; - short curr_inner; + u8 next_inner = hlock_class(next)->wait_type_inner; + u8 next_outer = hlock_class(next)->wait_type_outer; + u8 curr_inner; int depth; if (!curr->lockdep_depth || !next_inner || next->trylock) @@ -4528,7 +4595,7 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next) for (; depth < curr->lockdep_depth; depth++) { struct held_lock *prev = curr->held_locks + depth; - short prev_inner = hlock_class(prev)->wait_type_inner; + u8 prev_inner = hlock_class(prev)->wait_type_inner; if (prev_inner) { /* @@ -4577,9 +4644,9 @@ static inline int check_wait_context(struct task_struct *curr, /* * Initialize a lock instance's lock-class mapping info: */ -void lockdep_init_map_waits(struct lockdep_map *lock, const char *name, +void lockdep_init_map_type(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass, - short inner, short outer) + u8 inner, u8 outer, u8 lock_type) { int i; @@ -4602,6 +4669,7 @@ void lockdep_init_map_waits(struct lockdep_map *lock, const char *name, lock->wait_type_outer = outer; lock->wait_type_inner = inner; + lock->lock_type = lock_type; /* * No key, no joy, we need to hash something. @@ -4636,7 +4704,7 @@ void lockdep_init_map_waits(struct lockdep_map *lock, const char *name, raw_local_irq_restore(flags); } } -EXPORT_SYMBOL_GPL(lockdep_init_map_waits); +EXPORT_SYMBOL_GPL(lockdep_init_map_type); struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index fd838cea3934..0ab94e1f1276 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -27,7 +27,6 @@ #include <linux/moduleparam.h> #include <linux/delay.h> #include <linux/slab.h> -#include <linux/percpu-rwsem.h> #include <linux/torture.h> #include <linux/reboot.h> diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 5352ce50a97e..adb935090768 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -86,16 +86,6 @@ bool mutex_is_locked(struct mutex *lock) } EXPORT_SYMBOL(mutex_is_locked); -__must_check enum mutex_trylock_recursive_enum -mutex_trylock_recursive(struct mutex *lock) -{ - if (unlikely(__mutex_owner(lock) == current)) - return MUTEX_TRYLOCK_RECURSIVE; - - return mutex_trylock(lock); -} -EXPORT_SYMBOL(mutex_trylock_recursive); - static inline unsigned long __owner_flags(unsigned long owner) { return owner & MUTEX_FLAGS; diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index fe9ca92faa2a..4786dd271b45 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -12,7 +12,6 @@ #include <linux/percpu.h> #include <linux/hardirq.h> #include <linux/spinlock.h> -#include <asm/qrwlock.h> /** * queued_read_lock_slowpath - acquire read lock of a queue rwlock diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 2f8cd616d3b2..03b21135313c 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -267,27 +267,18 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left, return 1; } +#define __node_2_waiter(node) \ + rb_entry((node), struct rt_mutex_waiter, tree_entry) + +static inline bool __waiter_less(struct rb_node *a, const struct rb_node *b) +{ + return rt_mutex_waiter_less(__node_2_waiter(a), __node_2_waiter(b)); +} + static void rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) { - struct rb_node **link = &lock->waiters.rb_root.rb_node; - struct rb_node *parent = NULL; - struct rt_mutex_waiter *entry; - bool leftmost = true; - - while (*link) { - parent = *link; - entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry); - if (rt_mutex_waiter_less(waiter, entry)) { - link = &parent->rb_left; - } else { - link = &parent->rb_right; - leftmost = false; - } - } - - rb_link_node(&waiter->tree_entry, parent, link); - rb_insert_color_cached(&waiter->tree_entry, &lock->waiters, leftmost); + rb_add_cached(&waiter->tree_entry, &lock->waiters, __waiter_less); } static void @@ -300,27 +291,18 @@ rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) RB_CLEAR_NODE(&waiter->tree_entry); } +#define __node_2_pi_waiter(node) \ + rb_entry((node), struct rt_mutex_waiter, pi_tree_entry) + +static inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b) +{ + return rt_mutex_waiter_less(__node_2_pi_waiter(a), __node_2_pi_waiter(b)); +} + static void rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) { - struct rb_node **link = &task->pi_waiters.rb_root.rb_node; - struct rb_node *parent = NULL; - struct rt_mutex_waiter *entry; - bool leftmost = true; - - while (*link) { - parent = *link; - entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry); - if (rt_mutex_waiter_less(waiter, entry)) { - link = &parent->rb_left; - } else { - link = &parent->rb_right; - leftmost = false; - } - } - - rb_link_node(&waiter->pi_tree_entry, parent, link); - rb_insert_color_cached(&waiter->pi_tree_entry, &task->pi_waiters, leftmost); + rb_add_cached(&waiter->pi_tree_entry, &task->pi_waiters, __pi_waiter_less); } static void @@ -1604,8 +1586,11 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock) EXPORT_SYMBOL_GPL(rt_mutex_unlock); /** - * Futex variant, that since futex variants do not use the fast-path, can be - * simple and will not need to retry. + * __rt_mutex_futex_unlock - Futex variant, that since futex variants + * do not use the fast-path, can be simple and will not need to retry. + * + * @lock: The rt_mutex to be unlocked + * @wake_q: The wake queue head from which to get the next lock waiter */ bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, struct wake_q_head *wake_q) @@ -1662,13 +1647,15 @@ void rt_mutex_destroy(struct rt_mutex *lock) EXPORT_SYMBOL_GPL(rt_mutex_destroy); /** - * __rt_mutex_init - initialize the rt lock + * __rt_mutex_init - initialize the rt_mutex * - * @lock: the rt lock to be initialized + * @lock: The rt_mutex to be initialized + * @name: The lock name used for debugging + * @key: The lock class key used for debugging * - * Initialize the rt lock to unlocked state. + * Initialize the rt_mutex to unlocked state. * - * Initializing of a locked rt lock is not allowed + * Initializing of a locked rt_mutex is not allowed */ void __rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key) diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h deleted file mode 100644 index e69de29bb2d1..000000000000 --- a/kernel/locking/rwsem.h +++ /dev/null diff --git a/kernel/module.c b/kernel/module.c index 4bf30e4b3eaa..30479355ab85 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -87,8 +87,7 @@ * 3) module_addr_min/module_addr_max. * (delete and add uses RCU list operations). */ -DEFINE_MUTEX(module_mutex); -EXPORT_SYMBOL_GPL(module_mutex); +static DEFINE_MUTEX(module_mutex); static LIST_HEAD(modules); /* Work queue for freeing init sections in success case */ @@ -256,11 +255,6 @@ static void mod_update_bounds(struct module *mod) struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ #endif /* CONFIG_KGDB_KDB */ -static void module_assert_mutex(void) -{ - lockdep_assert_held(&module_mutex); -} - static void module_assert_mutex_or_preempt(void) { #ifdef CONFIG_LOCKDEP @@ -414,19 +408,8 @@ extern const struct kernel_symbol __start___ksymtab[]; extern const struct kernel_symbol __stop___ksymtab[]; extern const struct kernel_symbol __start___ksymtab_gpl[]; extern const struct kernel_symbol __stop___ksymtab_gpl[]; -extern const struct kernel_symbol __start___ksymtab_gpl_future[]; -extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; extern const s32 __start___kcrctab[]; extern const s32 __start___kcrctab_gpl[]; -extern const s32 __start___kcrctab_gpl_future[]; -#ifdef CONFIG_UNUSED_SYMBOLS -extern const struct kernel_symbol __start___ksymtab_unused[]; -extern const struct kernel_symbol __stop___ksymtab_unused[]; -extern const struct kernel_symbol __start___ksymtab_unused_gpl[]; -extern const struct kernel_symbol __stop___ksymtab_unused_gpl[]; -extern const s32 __start___kcrctab_unused[]; -extern const s32 __start___kcrctab_unused_gpl[]; -#endif #ifndef CONFIG_MODVERSIONS #define symversion(base, idx) NULL @@ -434,87 +417,14 @@ extern const s32 __start___kcrctab_unused_gpl[]; #define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL) #endif -static bool each_symbol_in_section(const struct symsearch *arr, - unsigned int arrsize, - struct module *owner, - bool (*fn)(const struct symsearch *syms, - struct module *owner, - void *data), - void *data) -{ - unsigned int j; - - for (j = 0; j < arrsize; j++) { - if (fn(&arr[j], owner, data)) - return true; - } - - return false; -} - -/* Returns true as soon as fn returns true, otherwise false. */ -static bool each_symbol_section(bool (*fn)(const struct symsearch *arr, - struct module *owner, - void *data), - void *data) -{ - struct module *mod; - static const struct symsearch arr[] = { - { __start___ksymtab, __stop___ksymtab, __start___kcrctab, - NOT_GPL_ONLY, false }, - { __start___ksymtab_gpl, __stop___ksymtab_gpl, - __start___kcrctab_gpl, - GPL_ONLY, false }, - { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future, - __start___kcrctab_gpl_future, - WILL_BE_GPL_ONLY, false }, -#ifdef CONFIG_UNUSED_SYMBOLS - { __start___ksymtab_unused, __stop___ksymtab_unused, - __start___kcrctab_unused, - NOT_GPL_ONLY, true }, - { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl, - __start___kcrctab_unused_gpl, - GPL_ONLY, true }, -#endif - }; - - module_assert_mutex_or_preempt(); - - if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data)) - return true; - - list_for_each_entry_rcu(mod, &modules, list, - lockdep_is_held(&module_mutex)) { - struct symsearch arr[] = { - { mod->syms, mod->syms + mod->num_syms, mod->crcs, - NOT_GPL_ONLY, false }, - { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms, - mod->gpl_crcs, - GPL_ONLY, false }, - { mod->gpl_future_syms, - mod->gpl_future_syms + mod->num_gpl_future_syms, - mod->gpl_future_crcs, - WILL_BE_GPL_ONLY, false }, -#ifdef CONFIG_UNUSED_SYMBOLS - { mod->unused_syms, - mod->unused_syms + mod->num_unused_syms, - mod->unused_crcs, - NOT_GPL_ONLY, true }, - { mod->unused_gpl_syms, - mod->unused_gpl_syms + mod->num_unused_gpl_syms, - mod->unused_gpl_crcs, - GPL_ONLY, true }, -#endif - }; - - if (mod->state == MODULE_STATE_UNFORMED) - continue; - - if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data)) - return true; - } - return false; -} +struct symsearch { + const struct kernel_symbol *start, *stop; + const s32 *crcs; + enum mod_license { + NOT_GPL_ONLY, + GPL_ONLY, + } license; +}; struct find_symbol_arg { /* Input */ @@ -535,28 +445,8 @@ static bool check_exported_symbol(const struct symsearch *syms, { struct find_symbol_arg *fsa = data; - if (!fsa->gplok) { - if (syms->license == GPL_ONLY) - return false; - if (syms->license == WILL_BE_GPL_ONLY && fsa->warn) { - pr_warn("Symbol %s is being used by a non-GPL module, " - "which will not be allowed in the future\n", - fsa->name); - } - } - -#ifdef CONFIG_UNUSED_SYMBOLS - if (syms->unused && fsa->warn) { - pr_warn("Symbol %s is marked as UNUSED, however this module is " - "using it.\n", fsa->name); - pr_warn("This symbol will go away in the future.\n"); - pr_warn("Please evaluate if this is the right api to use and " - "if it really is, submit a report to the linux kernel " - "mailing list together with submitting your code for " - "inclusion.\n"); - } -#endif - + if (!fsa->gplok && syms->license == GPL_ONLY) + return false; fsa->owner = owner; fsa->crc = symversion(syms->crcs, symnum); fsa->sym = &syms->start[symnum]; @@ -619,31 +509,44 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms, * Find an exported symbol and return it, along with, (optional) crc and * (optional) module which owns it. Needs preempt disabled or module_mutex. */ -static const struct kernel_symbol *find_symbol(const char *name, - struct module **owner, - const s32 **crc, - enum mod_license *license, - bool gplok, - bool warn) -{ - struct find_symbol_arg fsa; - - fsa.name = name; - fsa.gplok = gplok; - fsa.warn = warn; - - if (each_symbol_section(find_exported_symbol_in_section, &fsa)) { - if (owner) - *owner = fsa.owner; - if (crc) - *crc = fsa.crc; - if (license) - *license = fsa.license; - return fsa.sym; +static bool find_symbol(struct find_symbol_arg *fsa) +{ + static const struct symsearch arr[] = { + { __start___ksymtab, __stop___ksymtab, __start___kcrctab, + NOT_GPL_ONLY }, + { __start___ksymtab_gpl, __stop___ksymtab_gpl, + __start___kcrctab_gpl, + GPL_ONLY }, + }; + struct module *mod; + unsigned int i; + + module_assert_mutex_or_preempt(); + + for (i = 0; i < ARRAY_SIZE(arr); i++) + if (find_exported_symbol_in_section(&arr[i], NULL, fsa)) + return true; + + list_for_each_entry_rcu(mod, &modules, list, + lockdep_is_held(&module_mutex)) { + struct symsearch arr[] = { + { mod->syms, mod->syms + mod->num_syms, mod->crcs, + NOT_GPL_ONLY }, + { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms, + mod->gpl_crcs, + GPL_ONLY }, + }; + + if (mod->state == MODULE_STATE_UNFORMED) + continue; + + for (i = 0; i < ARRAY_SIZE(arr); i++) + if (find_exported_symbol_in_section(&arr[i], mod, fsa)) + return true; } - pr_debug("Failed to find symbol %s\n", name); - return NULL; + pr_debug("Failed to find symbol %s\n", fsa->name); + return false; } /* @@ -669,10 +572,8 @@ static struct module *find_module_all(const char *name, size_t len, struct module *find_module(const char *name) { - module_assert_mutex(); return find_module_all(name, strlen(name), false); } -EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP @@ -1107,12 +1008,15 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) void __symbol_put(const char *symbol) { - struct module *owner; + struct find_symbol_arg fsa = { + .name = symbol, + .gplok = true, + }; preempt_disable(); - if (!find_symbol(symbol, &owner, NULL, NULL, true, false)) + if (!find_symbol(&fsa)) BUG(); - module_put(owner); + module_put(fsa.owner); preempt_enable(); } EXPORT_SYMBOL(__symbol_put); @@ -1381,19 +1285,22 @@ bad_version: static inline int check_modstruct_version(const struct load_info *info, struct module *mod) { - const s32 *crc; + struct find_symbol_arg fsa = { + .name = "module_layout", + .gplok = true, + }; /* * Since this should be found in kernel (which can't be removed), no * locking is necessary -- use preempt_disable() to placate lockdep. */ preempt_disable(); - if (!find_symbol("module_layout", NULL, &crc, NULL, true, false)) { + if (!find_symbol(&fsa)) { preempt_enable(); BUG(); } preempt_enable(); - return check_version(info, "module_layout", mod, crc); + return check_version(info, "module_layout", mod, fsa.crc); } /* First part is kernel version, which we ignore if module has crcs. */ @@ -1487,10 +1394,11 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, const char *name, char ownername[]) { - struct module *owner; - const struct kernel_symbol *sym; - const s32 *crc; - enum mod_license license; + struct find_symbol_arg fsa = { + .name = name, + .gplok = !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), + .warn = true, + }; int err; /* @@ -1500,42 +1408,40 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, */ sched_annotate_sleep(); mutex_lock(&module_mutex); - sym = find_symbol(name, &owner, &crc, &license, - !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); - if (!sym) + if (!find_symbol(&fsa)) goto unlock; - if (license == GPL_ONLY) + if (fsa.license == GPL_ONLY) mod->using_gplonly_symbols = true; - if (!inherit_taint(mod, owner)) { - sym = NULL; + if (!inherit_taint(mod, fsa.owner)) { + fsa.sym = NULL; goto getname; } - if (!check_version(info, name, mod, crc)) { - sym = ERR_PTR(-EINVAL); + if (!check_version(info, name, mod, fsa.crc)) { + fsa.sym = ERR_PTR(-EINVAL); goto getname; } - err = verify_namespace_is_imported(info, sym, mod); + err = verify_namespace_is_imported(info, fsa.sym, mod); if (err) { - sym = ERR_PTR(err); + fsa.sym = ERR_PTR(err); goto getname; } - err = ref_module(mod, owner); + err = ref_module(mod, fsa.owner); if (err) { - sym = ERR_PTR(err); + fsa.sym = ERR_PTR(err); goto getname; } getname: /* We must make copy under the lock if we failed to get ref. */ - strncpy(ownername, module_name(owner), MODULE_NAME_LEN); + strncpy(ownername, module_name(fsa.owner), MODULE_NAME_LEN); unlock: mutex_unlock(&module_mutex); - return sym; + return fsa.sym; } static const struct kernel_symbol * @@ -2296,16 +2202,19 @@ static void free_module(struct module *mod) void *__symbol_get(const char *symbol) { - struct module *owner; - const struct kernel_symbol *sym; + struct find_symbol_arg fsa = { + .name = symbol, + .gplok = true, + .warn = true, + }; preempt_disable(); - sym = find_symbol(symbol, &owner, NULL, NULL, true, true); - if (sym && strong_try_module_get(owner)) - sym = NULL; + if (!find_symbol(&fsa) || strong_try_module_get(fsa.owner)) { + preempt_enable(); + return NULL; + } preempt_enable(); - - return sym ? (void *)kernel_symbol_value(sym) : NULL; + return (void *)kernel_symbol_value(fsa.sym); } EXPORT_SYMBOL_GPL(__symbol_get); @@ -2318,7 +2227,6 @@ EXPORT_SYMBOL_GPL(__symbol_get); static int verify_exported_symbols(struct module *mod) { unsigned int i; - struct module *owner; const struct kernel_symbol *s; struct { const struct kernel_symbol *sym; @@ -2326,21 +2234,19 @@ static int verify_exported_symbols(struct module *mod) } arr[] = { { mod->syms, mod->num_syms }, { mod->gpl_syms, mod->num_gpl_syms }, - { mod->gpl_future_syms, mod->num_gpl_future_syms }, -#ifdef CONFIG_UNUSED_SYMBOLS - { mod->unused_syms, mod->num_unused_syms }, - { mod->unused_gpl_syms, mod->num_unused_gpl_syms }, -#endif }; for (i = 0; i < ARRAY_SIZE(arr); i++) { for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { - if (find_symbol(kernel_symbol_name(s), &owner, NULL, - NULL, true, false)) { + struct find_symbol_arg fsa = { + .name = kernel_symbol_name(s), + .gplok = true, + }; + if (find_symbol(&fsa)) { pr_err("%s: exports duplicate symbol %s" " (owned by %s)\n", mod->name, kernel_symbol_name(s), - module_name(owner)); + module_name(fsa.owner)); return -ENOEXEC; } } @@ -2348,6 +2254,21 @@ static int verify_exported_symbols(struct module *mod) return 0; } +static bool ignore_undef_symbol(Elf_Half emachine, const char *name) +{ + /* + * On x86, PIC code and Clang non-PIC code may have call foo@PLT. GNU as + * before 2.37 produces an unreferenced _GLOBAL_OFFSET_TABLE_ on x86-64. + * i386 has a similar problem but may not deserve a fix. + * + * If we ever have to ignore many symbols, consider refactoring the code to + * only warn if referenced by a relocation. + */ + if (emachine == EM_386 || emachine == EM_X86_64) + return !strcmp(name, "_GLOBAL_OFFSET_TABLE_"); + return false; +} + /* Change all symbols so that st_value encodes the pointer directly. */ static int simplify_symbols(struct module *mod, const struct load_info *info) { @@ -2395,8 +2316,10 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) break; } - /* Ok if weak. */ - if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK) + /* Ok if weak or ignored. */ + if (!ksym && + (ELF_ST_BIND(sym[i].st_info) == STB_WEAK || + ignore_undef_symbol(info->hdr->e_machine, name))) break; ret = PTR_ERR(ksym) ?: -ENOENT; @@ -2964,7 +2887,7 @@ static int module_sig_check(struct load_info *info, int flags) } if (is_module_sig_enforced()) { - pr_notice("%s: loading of %s is rejected\n", info->name, reason); + pr_notice("Loading of %s is rejected\n", reason); return -EKEYREJECTED; } @@ -2977,9 +2900,33 @@ static int module_sig_check(struct load_info *info, int flags) } #endif /* !CONFIG_MODULE_SIG */ -/* Sanity checks against invalid binaries, wrong arch, weird elf version. */ -static int elf_header_check(struct load_info *info) +static int validate_section_offset(struct load_info *info, Elf_Shdr *shdr) +{ + unsigned long secend; + + /* + * Check for both overflow and offset/size being + * too large. + */ + secend = shdr->sh_offset + shdr->sh_size; + if (secend < shdr->sh_offset || secend > info->len) + return -ENOEXEC; + + return 0; +} + +/* + * Sanity checks against invalid binaries, wrong arch, weird elf version. + * + * Also do basic validity checks against section offsets and sizes, the + * section name string table, and the indices used for it (sh_name). + */ +static int elf_validity_check(struct load_info *info) { + unsigned int i; + Elf_Shdr *shdr, *strhdr; + int err; + if (info->len < sizeof(*(info->hdr))) return -ENOEXEC; @@ -2989,11 +2936,78 @@ static int elf_header_check(struct load_info *info) || info->hdr->e_shentsize != sizeof(Elf_Shdr)) return -ENOEXEC; + /* + * e_shnum is 16 bits, and sizeof(Elf_Shdr) is + * known and small. So e_shnum * sizeof(Elf_Shdr) + * will not overflow unsigned long on any platform. + */ if (info->hdr->e_shoff >= info->len || (info->hdr->e_shnum * sizeof(Elf_Shdr) > info->len - info->hdr->e_shoff)) return -ENOEXEC; + info->sechdrs = (void *)info->hdr + info->hdr->e_shoff; + + /* + * Verify if the section name table index is valid. + */ + if (info->hdr->e_shstrndx == SHN_UNDEF + || info->hdr->e_shstrndx >= info->hdr->e_shnum) + return -ENOEXEC; + + strhdr = &info->sechdrs[info->hdr->e_shstrndx]; + err = validate_section_offset(info, strhdr); + if (err < 0) + return err; + + /* + * The section name table must be NUL-terminated, as required + * by the spec. This makes strcmp and pr_* calls that access + * strings in the section safe. + */ + info->secstrings = (void *)info->hdr + strhdr->sh_offset; + if (info->secstrings[strhdr->sh_size - 1] != '\0') + return -ENOEXEC; + + /* + * The code assumes that section 0 has a length of zero and + * an addr of zero, so check for it. + */ + if (info->sechdrs[0].sh_type != SHT_NULL + || info->sechdrs[0].sh_size != 0 + || info->sechdrs[0].sh_addr != 0) + return -ENOEXEC; + + for (i = 1; i < info->hdr->e_shnum; i++) { + shdr = &info->sechdrs[i]; + switch (shdr->sh_type) { + case SHT_NULL: + case SHT_NOBITS: + continue; + case SHT_SYMTAB: + if (shdr->sh_link == SHN_UNDEF + || shdr->sh_link >= info->hdr->e_shnum) + return -ENOEXEC; + fallthrough; + default: + err = validate_section_offset(info, shdr); + if (err < 0) { + pr_err("Invalid ELF section in module (section %u type %u)\n", + i, shdr->sh_type); + return err; + } + + if (shdr->sh_flags & SHF_ALLOC) { + if (shdr->sh_name >= strhdr->sh_size) { + pr_err("Invalid ELF section name in module (section %u type %u)\n", + i, shdr->sh_type); + return -ENOEXEC; + } + } + break; + } + } + return 0; } @@ -3095,11 +3109,6 @@ static int rewrite_section_headers(struct load_info *info, int flags) for (i = 1; i < info->hdr->e_shnum; i++) { Elf_Shdr *shdr = &info->sechdrs[i]; - if (shdr->sh_type != SHT_NOBITS - && info->len < shdr->sh_offset + shdr->sh_size) { - pr_err("Module len %lu truncated\n", info->len); - return -ENOEXEC; - } /* * Mark all sections sh_addr with their address in the @@ -3133,11 +3142,6 @@ static int setup_load_info(struct load_info *info, int flags) { unsigned int i; - /* Set up the convenience variables */ - info->sechdrs = (void *)info->hdr + info->hdr->e_shoff; - info->secstrings = (void *)info->hdr - + info->sechdrs[info->hdr->e_shstrndx].sh_offset; - /* Try to find a name early so we can log errors with a module name */ info->index.info = find_sec(info, ".modinfo"); if (info->index.info) @@ -3241,22 +3245,7 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->gpl_syms), &mod->num_gpl_syms); mod->gpl_crcs = section_addr(info, "__kcrctab_gpl"); - mod->gpl_future_syms = section_objs(info, - "__ksymtab_gpl_future", - sizeof(*mod->gpl_future_syms), - &mod->num_gpl_future_syms); - mod->gpl_future_crcs = section_addr(info, "__kcrctab_gpl_future"); - -#ifdef CONFIG_UNUSED_SYMBOLS - mod->unused_syms = section_objs(info, "__ksymtab_unused", - sizeof(*mod->unused_syms), - &mod->num_unused_syms); - mod->unused_crcs = section_addr(info, "__kcrctab_unused"); - mod->unused_gpl_syms = section_objs(info, "__ksymtab_unused_gpl", - sizeof(*mod->unused_gpl_syms), - &mod->num_unused_gpl_syms); - mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl"); -#endif + #ifdef CONFIG_CONSTRUCTORS mod->ctors = section_objs(info, ".ctors", sizeof(*mod->ctors), &mod->num_ctors); @@ -3437,14 +3426,8 @@ static int check_module_license_and_versions(struct module *mod) pr_warn("%s: module license taints kernel.\n", mod->name); #ifdef CONFIG_MODVERSIONS - if ((mod->num_syms && !mod->crcs) - || (mod->num_gpl_syms && !mod->gpl_crcs) - || (mod->num_gpl_future_syms && !mod->gpl_future_crcs) -#ifdef CONFIG_UNUSED_SYMBOLS - || (mod->num_unused_syms && !mod->unused_crcs) - || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs) -#endif - ) { + if ((mod->num_syms && !mod->crcs) || + (mod->num_gpl_syms && !mod->gpl_crcs)) { return try_to_force_load(mod, "no versions for exported symbols"); } @@ -3894,26 +3877,50 @@ static int load_module(struct load_info *info, const char __user *uargs, long err = 0; char *after_dashes; - err = elf_header_check(info); + /* + * Do the signature check (if any) first. All that + * the signature check needs is info->len, it does + * not need any of the section info. That can be + * set up later. This will minimize the chances + * of a corrupt module causing problems before + * we even get to the signature check. + * + * The check will also adjust info->len by stripping + * off the sig length at the end of the module, making + * checks against info->len more correct. + */ + err = module_sig_check(info, flags); + if (err) + goto free_copy; + + /* + * Do basic sanity checks against the ELF header and + * sections. + */ + err = elf_validity_check(info); if (err) { - pr_err("Module has invalid ELF header\n"); + pr_err("Module has invalid ELF structures\n"); goto free_copy; } + /* + * Everything checks out, so set up the section info + * in the info structure. + */ err = setup_load_info(info, flags); if (err) goto free_copy; + /* + * Now that we know we have the correct module name, check + * if it's blacklisted. + */ if (blacklisted(info->name)) { err = -EPERM; pr_err("Module %s is blacklisted\n", info->name); goto free_copy; } - err = module_sig_check(info, flags); - if (err) - goto free_copy; - err = rewrite_section_headers(info, flags); if (err) goto free_copy; @@ -4374,16 +4381,16 @@ unsigned long module_kallsyms_lookup_name(const char *name) return ret; } +#ifdef CONFIG_LIVEPATCH int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), void *data) { struct module *mod; unsigned int i; - int ret; - - module_assert_mutex(); + int ret = 0; + mutex_lock(&module_mutex); list_for_each_entry(mod, &modules, list) { /* We hold module_mutex: no need for rcu_dereference_sched */ struct mod_kallsyms *kallsyms = mod->kallsyms; @@ -4399,11 +4406,13 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, ret = fn(data, kallsyms_symbol_name(kallsyms, i), mod, kallsyms_symbol_value(sym)); if (ret != 0) - return ret; + break; } } - return 0; + mutex_unlock(&module_mutex); + return ret; } +#endif /* CONFIG_LIVEPATCH */ #endif /* CONFIG_KALLSYMS */ /* Maximum number of characters written by module_flags() */ diff --git a/kernel/module_signature.c b/kernel/module_signature.c index 4224a1086b7d..00132d12487c 100644 --- a/kernel/module_signature.c +++ b/kernel/module_signature.c @@ -25,7 +25,7 @@ int mod_check_sig(const struct module_signature *ms, size_t file_len, return -EBADMSG; if (ms->id_type != PKEY_ID_PKCS7) { - pr_err("%s: Module is not signed with expected PKCS#7 message\n", + pr_err("%s: not signed with expected PKCS#7 message\n", name); return -ENOPKG; } diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 9d9fc678c91d..8723ae70ea1f 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -30,7 +30,7 @@ int mod_verify_sig(const void *mod, struct load_info *info) memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms)); - ret = mod_check_sig(&ms, modlen, info->name); + ret = mod_check_sig(&ms, modlen, "module"); if (ret) return ret; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5a95c688621f..575a34b88936 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -735,9 +735,9 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, logbuf_lock_irq(); } - if (user->seq < prb_first_valid_seq(prb)) { + if (r->info->seq != user->seq) { /* our last seen message is gone, return error and reset */ - user->seq = prb_first_valid_seq(prb); + user->seq = r->info->seq; ret = -EPIPE; logbuf_unlock_irq(); goto out; @@ -812,6 +812,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) static __poll_t devkmsg_poll(struct file *file, poll_table *wait) { struct devkmsg_user *user = file->private_data; + struct printk_info info; __poll_t ret = 0; if (!user) @@ -820,9 +821,9 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait) poll_wait(file, &log_wait, wait); logbuf_lock_irq(); - if (prb_read_valid(prb, user->seq, NULL)) { + if (prb_read_valid_info(prb, user->seq, &info, NULL)) { /* return error when data has vanished underneath us */ - if (user->seq < prb_first_valid_seq(prb)) + if (info.seq != user->seq) ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; else ret = EPOLLIN|EPOLLRDNORM; @@ -1559,6 +1560,7 @@ static void syslog_clear(void) int do_syslog(int type, char __user *buf, int len, int source) { + struct printk_info info; bool clear = false; static int saved_console_loglevel = LOGLEVEL_DEFAULT; int error; @@ -1629,9 +1631,14 @@ int do_syslog(int type, char __user *buf, int len, int source) /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: logbuf_lock_irq(); - if (syslog_seq < prb_first_valid_seq(prb)) { + if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) { + /* No unread messages. */ + logbuf_unlock_irq(); + return 0; + } + if (info.seq != syslog_seq) { /* messages are gone, move to first one */ - syslog_seq = prb_first_valid_seq(prb); + syslog_seq = info.seq; syslog_partial = 0; } if (source == SYSLOG_FROM_PROC) { @@ -1643,7 +1650,6 @@ int do_syslog(int type, char __user *buf, int len, int source) error = prb_next_seq(prb) - syslog_seq; } else { bool time = syslog_partial ? syslog_time : printk_time; - struct printk_info info; unsigned int line_count; u64 seq; @@ -3429,9 +3435,11 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, goto out; logbuf_lock_irqsave(flags); - if (dumper->cur_seq < prb_first_valid_seq(prb)) { - /* messages are gone, move to first available one */ - dumper->cur_seq = prb_first_valid_seq(prb); + if (prb_read_valid_info(prb, dumper->cur_seq, &info, NULL)) { + if (info.seq != dumper->cur_seq) { + /* messages are gone, move to first available one */ + dumper->cur_seq = info.seq; + } } /* last entry */ diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index 5dc9d022db07..73cc80e01cef 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -287,7 +287,7 @@ _DEFINE_PRINTKRB(name, descbits, avgtextbits, &_##name##_text[0]) /* Writer Interface */ /** - * prb_rec_init_wd() - Initialize a buffer for writing records. + * prb_rec_init_wr() - Initialize a buffer for writing records. * * @r: The record to initialize. * @text_buf_size: The needed text buffer size. diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index a0e6f746de6c..2e9e3ed7d63e 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -45,6 +45,8 @@ struct printk_safe_seq_buf { static DEFINE_PER_CPU(struct printk_safe_seq_buf, safe_print_seq); static DEFINE_PER_CPU(int, printk_context); +static DEFINE_RAW_SPINLOCK(safe_read_lock); + #ifdef CONFIG_PRINTK_NMI static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq); #endif @@ -180,8 +182,6 @@ static void report_message_lost(struct printk_safe_seq_buf *s) */ static void __printk_safe_flush(struct irq_work *work) { - static raw_spinlock_t read_lock = - __RAW_SPIN_LOCK_INITIALIZER(read_lock); struct printk_safe_seq_buf *s = container_of(work, struct printk_safe_seq_buf, work); unsigned long flags; @@ -195,7 +195,7 @@ static void __printk_safe_flush(struct irq_work *work) * different CPUs. This is especially important when printing * a backtrace. */ - raw_spin_lock_irqsave(&read_lock, flags); + raw_spin_lock_irqsave(&safe_read_lock, flags); i = 0; more: @@ -232,7 +232,7 @@ more: out: report_message_lost(s); - raw_spin_unlock_irqrestore(&read_lock, flags); + raw_spin_unlock_irqrestore(&safe_read_lock, flags); } /** @@ -278,6 +278,14 @@ void printk_safe_flush_on_panic(void) raw_spin_lock_init(&logbuf_lock); } + if (raw_spin_is_locked(&safe_read_lock)) { + if (num_online_cpus() > 1) + return; + + debug_locks_off(); + raw_spin_lock_init(&safe_read_lock); + } + printk_safe_flush(); } diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index cdc57b4f6d48..3128b7cf8e1f 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -95,6 +95,7 @@ config TASKS_RUDE_RCU config TASKS_TRACE_RCU def_bool 0 + select IRQ_WORK help This option enables a task-based RCU implementation that uses explicit rcu_read_lock_trace() read-side markers, and allows @@ -188,8 +189,8 @@ config RCU_FAST_NO_HZ config RCU_BOOST bool "Enable RCU priority boosting" - depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT - default n + depends on (RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT) || PREEMPT_RT + default y if PREEMPT_RT help This option boosts the priority of preempted RCU readers that block the current preemptible RCU grace period for too long. diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 59ef1ae6dc37..bf0827d4b659 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -378,7 +378,11 @@ do { \ smp_mb__after_unlock_lock(); \ } while (0) -#define raw_spin_unlock_rcu_node(p) raw_spin_unlock(&ACCESS_PRIVATE(p, lock)) +#define raw_spin_unlock_rcu_node(p) \ +do { \ + lockdep_assert_irqs_disabled(); \ + raw_spin_unlock(&ACCESS_PRIVATE(p, lock)); \ +} while (0) #define raw_spin_lock_irq_rcu_node(p) \ do { \ @@ -387,7 +391,10 @@ do { \ } while (0) #define raw_spin_unlock_irq_rcu_node(p) \ - raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) +do { \ + lockdep_assert_irqs_disabled(); \ + raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock)); \ +} while (0) #define raw_spin_lock_irqsave_rcu_node(p, flags) \ do { \ @@ -396,7 +403,10 @@ do { \ } while (0) #define raw_spin_unlock_irqrestore_rcu_node(p, flags) \ - raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) +do { \ + lockdep_assert_irqs_disabled(); \ + raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags); \ +} while (0) #define raw_spin_trylock_rcu_node(p) \ ({ \ diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 2d2a6b6b9dfb..7f181c9675f7 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -7,10 +7,10 @@ * Authors: Paul E. McKenney <paulmck@linux.ibm.com> */ -#include <linux/types.h> -#include <linux/kernel.h> +#include <linux/cpu.h> #include <linux/interrupt.h> -#include <linux/rcupdate.h> +#include <linux/kernel.h> +#include <linux/types.h> #include "rcu_segcblist.h" @@ -88,23 +88,135 @@ static void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v) #endif } +/* Get the length of a segment of the rcu_segcblist structure. */ +static long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg) +{ + return READ_ONCE(rsclp->seglen[seg]); +} + +/* Return number of callbacks in segmented callback list by summing seglen. */ +long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp) +{ + long len = 0; + int i; + + for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) + len += rcu_segcblist_get_seglen(rsclp, i); + + return len; +} + +/* Set the length of a segment of the rcu_segcblist structure. */ +static void rcu_segcblist_set_seglen(struct rcu_segcblist *rsclp, int seg, long v) +{ + WRITE_ONCE(rsclp->seglen[seg], v); +} + +/* Increase the numeric length of a segment by a specified amount. */ +static void rcu_segcblist_add_seglen(struct rcu_segcblist *rsclp, int seg, long v) +{ + WRITE_ONCE(rsclp->seglen[seg], rsclp->seglen[seg] + v); +} + +/* Move from's segment length to to's segment. */ +static void rcu_segcblist_move_seglen(struct rcu_segcblist *rsclp, int from, int to) +{ + long len; + + if (from == to) + return; + + len = rcu_segcblist_get_seglen(rsclp, from); + if (!len) + return; + + rcu_segcblist_add_seglen(rsclp, to, len); + rcu_segcblist_set_seglen(rsclp, from, 0); +} + +/* Increment segment's length. */ +static void rcu_segcblist_inc_seglen(struct rcu_segcblist *rsclp, int seg) +{ + rcu_segcblist_add_seglen(rsclp, seg, 1); +} + /* * Increase the numeric length of an rcu_segcblist structure by the * specified amount, which can be negative. This can cause the ->len * field to disagree with the actual number of callbacks on the structure. * This increase is fully ordered with respect to the callers accesses * both before and after. + * + * So why on earth is a memory barrier required both before and after + * the update to the ->len field??? + * + * The reason is that rcu_barrier() locklessly samples each CPU's ->len + * field, and if a given CPU's field is zero, avoids IPIing that CPU. + * This can of course race with both queuing and invoking of callbacks. + * Failing to correctly handle either of these races could result in + * rcu_barrier() failing to IPI a CPU that actually had callbacks queued + * which rcu_barrier() was obligated to wait on. And if rcu_barrier() + * failed to wait on such a callback, unloading certain kernel modules + * would result in calls to functions whose code was no longer present in + * the kernel, for but one example. + * + * Therefore, ->len transitions from 1->0 and 0->1 have to be carefully + * ordered with respect with both list modifications and the rcu_barrier(). + * + * The queuing case is CASE 1 and the invoking case is CASE 2. + * + * CASE 1: Suppose that CPU 0 has no callbacks queued, but invokes + * call_rcu() just as CPU 1 invokes rcu_barrier(). CPU 0's ->len field + * will transition from 0->1, which is one of the transitions that must + * be handled carefully. Without the full memory barriers after the ->len + * update and at the beginning of rcu_barrier(), the following could happen: + * + * CPU 0 CPU 1 + * + * call_rcu(). + * rcu_barrier() sees ->len as 0. + * set ->len = 1. + * rcu_barrier() does nothing. + * module is unloaded. + * callback invokes unloaded function! + * + * With the full barriers, any case where rcu_barrier() sees ->len as 0 will + * have unambiguously preceded the return from the racing call_rcu(), which + * means that this call_rcu() invocation is OK to not wait on. After all, + * you are supposed to make sure that any problematic call_rcu() invocations + * happen before the rcu_barrier(). + * + * + * CASE 2: Suppose that CPU 0 is invoking its last callback just as + * CPU 1 invokes rcu_barrier(). CPU 0's ->len field will transition from + * 1->0, which is one of the transitions that must be handled carefully. + * Without the full memory barriers before the ->len update and at the + * end of rcu_barrier(), the following could happen: + * + * CPU 0 CPU 1 + * + * start invoking last callback + * set ->len = 0 (reordered) + * rcu_barrier() sees ->len as 0 + * rcu_barrier() does nothing. + * module is unloaded + * callback executing after unloaded! + * + * With the full barriers, any case where rcu_barrier() sees ->len as 0 + * will be fully ordered after the completion of the callback function, + * so that the module unloading operation is completely safe. + * */ -static void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v) +void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v) { #ifdef CONFIG_RCU_NOCB_CPU - smp_mb__before_atomic(); /* Up to the caller! */ + smp_mb__before_atomic(); // Read header comment above. atomic_long_add(v, &rsclp->len); - smp_mb__after_atomic(); /* Up to the caller! */ + smp_mb__after_atomic(); // Read header comment above. #else - smp_mb(); /* Up to the caller! */ + smp_mb(); // Read header comment above. WRITE_ONCE(rsclp->len, rsclp->len + v); - smp_mb(); /* Up to the caller! */ + smp_mb(); // Read header comment above. #endif } @@ -120,26 +232,6 @@ void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp) } /* - * Exchange the numeric length of the specified rcu_segcblist structure - * with the specified value. This can cause the ->len field to disagree - * with the actual number of callbacks on the structure. This exchange is - * fully ordered with respect to the callers accesses both before and after. - */ -static long rcu_segcblist_xchg_len(struct rcu_segcblist *rsclp, long v) -{ -#ifdef CONFIG_RCU_NOCB_CPU - return atomic_long_xchg(&rsclp->len, v); -#else - long ret = rsclp->len; - - smp_mb(); /* Up to the caller! */ - WRITE_ONCE(rsclp->len, v); - smp_mb(); /* Up to the caller! */ - return ret; -#endif -} - -/* * Initialize an rcu_segcblist structure. */ void rcu_segcblist_init(struct rcu_segcblist *rsclp) @@ -149,10 +241,12 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp) BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq)); BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq)); rsclp->head = NULL; - for (i = 0; i < RCU_CBLIST_NSEGS; i++) + for (i = 0; i < RCU_CBLIST_NSEGS; i++) { rsclp->tails[i] = &rsclp->head; + rcu_segcblist_set_seglen(rsclp, i, 0); + } rcu_segcblist_set_len(rsclp, 0); - rsclp->enabled = 1; + rcu_segcblist_set_flags(rsclp, SEGCBLIST_ENABLED); } /* @@ -163,16 +257,21 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) { WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); - rsclp->enabled = 0; + rcu_segcblist_clear_flags(rsclp, SEGCBLIST_ENABLED); } /* * Mark the specified rcu_segcblist structure as offloaded. This * structure must be empty. */ -void rcu_segcblist_offload(struct rcu_segcblist *rsclp) +void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload) { - rsclp->offloaded = 1; + if (offload) { + rcu_segcblist_clear_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY); + rcu_segcblist_set_flags(rsclp, SEGCBLIST_OFFLOADED); + } else { + rcu_segcblist_clear_flags(rsclp, SEGCBLIST_OFFLOADED); + } } /* @@ -245,7 +344,7 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, struct rcu_head *rhp) { rcu_segcblist_inc_len(rsclp); - smp_mb(); /* Ensure counts are updated before callback is enqueued. */ + rcu_segcblist_inc_seglen(rsclp, RCU_NEXT_TAIL); rhp->next = NULL; WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp); WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], &rhp->next); @@ -274,6 +373,7 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) if (rsclp->tails[i] != rsclp->tails[i - 1]) break; + rcu_segcblist_inc_seglen(rsclp, i); WRITE_ONCE(*rsclp->tails[i], rhp); for (; i <= RCU_NEXT_TAIL; i++) WRITE_ONCE(rsclp->tails[i], &rhp->next); @@ -281,21 +381,6 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, } /* - * Extract only the counts from the specified rcu_segcblist structure, - * and place them in the specified rcu_cblist structure. This function - * supports both callback orphaning and invocation, hence the separation - * of counts and callbacks. (Callbacks ready for invocation must be - * orphaned and adopted separately from pending callbacks, but counts - * apply to all callbacks. Locking must be used to make sure that - * both orphaned-callbacks lists are consistent.) - */ -void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - rclp->len = rcu_segcblist_xchg_len(rsclp, 0); -} - -/* * Extract only those callbacks ready to be invoked from the specified * rcu_segcblist structure and place them in the specified rcu_cblist * structure. @@ -307,6 +392,7 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, if (!rcu_segcblist_ready_cbs(rsclp)) return; /* Nothing to do. */ + rclp->len = rcu_segcblist_get_seglen(rsclp, RCU_DONE_TAIL); *rclp->tail = rsclp->head; WRITE_ONCE(rsclp->head, *rsclp->tails[RCU_DONE_TAIL]); WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL); @@ -314,6 +400,7 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) WRITE_ONCE(rsclp->tails[i], &rsclp->head); + rcu_segcblist_set_seglen(rsclp, RCU_DONE_TAIL, 0); } /* @@ -330,11 +417,15 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, if (!rcu_segcblist_pend_cbs(rsclp)) return; /* Nothing to do. */ + rclp->len = 0; *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL); - for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) + for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) { + rclp->len += rcu_segcblist_get_seglen(rsclp, i); WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_DONE_TAIL]); + rcu_segcblist_set_seglen(rsclp, i, 0); + } } /* @@ -345,7 +436,6 @@ void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp) { rcu_segcblist_add_len(rsclp, rclp->len); - rclp->len = 0; } /* @@ -359,6 +449,7 @@ void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, if (!rclp->head) return; /* No callbacks to move. */ + rcu_segcblist_add_seglen(rsclp, RCU_DONE_TAIL, rclp->len); *rclp->tail = rsclp->head; WRITE_ONCE(rsclp->head, rclp->head); for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) @@ -379,6 +470,8 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, { if (!rclp->head) return; /* Nothing to do. */ + + rcu_segcblist_add_seglen(rsclp, RCU_NEXT_TAIL, rclp->len); WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rclp->head); WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], rclp->tail); } @@ -403,6 +496,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) break; WRITE_ONCE(rsclp->tails[RCU_DONE_TAIL], rsclp->tails[i]); + rcu_segcblist_move_seglen(rsclp, i, RCU_DONE_TAIL); } /* If no callbacks moved, nothing more need be done. */ @@ -423,6 +517,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) break; /* No more callbacks. */ WRITE_ONCE(rsclp->tails[j], rsclp->tails[i]); + rcu_segcblist_move_seglen(rsclp, i, j); rsclp->gp_seq[j] = rsclp->gp_seq[i]; } } @@ -444,7 +539,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) */ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) { - int i; + int i, j; WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) @@ -487,6 +582,10 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) if (rcu_segcblist_restempty(rsclp, i) || ++i >= RCU_NEXT_TAIL) return false; + /* Accounting: everything below i is about to get merged into i. */ + for (j = i + 1; j <= RCU_NEXT_TAIL; j++) + rcu_segcblist_move_seglen(rsclp, j, i); + /* * Merge all later callbacks, including newly arrived callbacks, * into the segment located by the for-loop above. Assign "seq" @@ -514,13 +613,24 @@ void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp, struct rcu_cblist donecbs; struct rcu_cblist pendcbs; + lockdep_assert_cpus_held(); + rcu_cblist_init(&donecbs); rcu_cblist_init(&pendcbs); - rcu_segcblist_extract_count(src_rsclp, &donecbs); + rcu_segcblist_extract_done_cbs(src_rsclp, &donecbs); rcu_segcblist_extract_pend_cbs(src_rsclp, &pendcbs); + + /* + * No need smp_mb() before setting length to 0, because CPU hotplug + * lock excludes rcu_barrier. + */ + rcu_segcblist_set_len(src_rsclp, 0); + rcu_segcblist_insert_count(dst_rsclp, &donecbs); + rcu_segcblist_insert_count(dst_rsclp, &pendcbs); rcu_segcblist_insert_done_cbs(dst_rsclp, &donecbs); rcu_segcblist_insert_pend_cbs(dst_rsclp, &pendcbs); + rcu_segcblist_init(src_rsclp); } diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 492262bcb591..9a19328ff251 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -15,6 +15,9 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) return READ_ONCE(rclp->len); } +/* Return number of callbacks in segmented callback list by summing seglen. */ +long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp); + void rcu_cblist_init(struct rcu_cblist *rclp); void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp); void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, @@ -50,19 +53,51 @@ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) #endif } +static inline void rcu_segcblist_set_flags(struct rcu_segcblist *rsclp, + int flags) +{ + rsclp->flags |= flags; +} + +static inline void rcu_segcblist_clear_flags(struct rcu_segcblist *rsclp, + int flags) +{ + rsclp->flags &= ~flags; +} + +static inline bool rcu_segcblist_test_flags(struct rcu_segcblist *rsclp, + int flags) +{ + return READ_ONCE(rsclp->flags) & flags; +} + /* * Is the specified rcu_segcblist enabled, for example, not corresponding * to an offline CPU? */ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) { - return rsclp->enabled; + return rcu_segcblist_test_flags(rsclp, SEGCBLIST_ENABLED); } -/* Is the specified rcu_segcblist offloaded? */ +/* Is the specified rcu_segcblist offloaded, or is SEGCBLIST_SOFTIRQ_ONLY set? */ static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) { - return IS_ENABLED(CONFIG_RCU_NOCB_CPU) && rsclp->offloaded; + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && + !rcu_segcblist_test_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY)) + return true; + + return false; +} + +static inline bool rcu_segcblist_completely_offloaded(struct rcu_segcblist *rsclp) +{ + int flags = SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP | SEGCBLIST_OFFLOADED; + + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && (rsclp->flags & flags) == flags) + return true; + + return false; } /* @@ -75,10 +110,22 @@ static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) return !READ_ONCE(*READ_ONCE(rsclp->tails[seg])); } +/* + * Is the specified segment of the specified rcu_segcblist structure + * empty of callbacks? + */ +static inline bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) +{ + if (seg == RCU_DONE_TAIL) + return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; + return rsclp->tails[seg - 1] == rsclp->tails[seg]; +} + void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp); +void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v); void rcu_segcblist_init(struct rcu_segcblist *rsclp); void rcu_segcblist_disable(struct rcu_segcblist *rsclp); -void rcu_segcblist_offload(struct rcu_segcblist *rsclp); +void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload); bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); @@ -88,8 +135,6 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, struct rcu_head *rhp); bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, struct rcu_head *rhp); -void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp); void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp); void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 528ed10b78fd..99657ffa6688 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -85,6 +85,7 @@ torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives"); +torture_param(bool, gp_poll, false, "Use polling GP wait primitives"); torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers"); @@ -97,6 +98,8 @@ torture_param(int, object_debug, 0, torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable"); +torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disable"); +torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)"); torture_param(int, read_exit_delay, 13, "Delay between read-then-exit episodes (s)"); torture_param(int, read_exit_burst, 16, @@ -127,10 +130,12 @@ static char *torture_type = "rcu"; module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, srcu, ...)"); +static int nrealnocbers; static int nrealreaders; static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; +static struct task_struct **nocb_tasks; static struct task_struct *stats_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; @@ -142,11 +147,22 @@ static struct task_struct *read_exit_task; #define RCU_TORTURE_PIPE_LEN 10 +// Mailbox-like structure to check RCU global memory ordering. +struct rcu_torture_reader_check { + unsigned long rtc_myloops; + int rtc_chkrdr; + unsigned long rtc_chkloops; + int rtc_ready; + struct rcu_torture_reader_check *rtc_assigner; +} ____cacheline_internodealigned_in_smp; + +// Update-side data structure used to check RCU readers. struct rcu_torture { struct rcu_head rtort_rcu; int rtort_pipe_count; struct list_head rtort_free; int rtort_mbtest; + struct rcu_torture_reader_check *rtort_chkp; }; static LIST_HEAD(rcu_torture_freelist); @@ -157,10 +173,13 @@ static DEFINE_SPINLOCK(rcu_torture_lock); static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count); static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch); static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; +static struct rcu_torture_reader_check *rcu_torture_reader_mbchk; static atomic_t n_rcu_torture_alloc; static atomic_t n_rcu_torture_alloc_fail; static atomic_t n_rcu_torture_free; static atomic_t n_rcu_torture_mberror; +static atomic_t n_rcu_torture_mbchk_fail; +static atomic_t n_rcu_torture_mbchk_tries; static atomic_t n_rcu_torture_error; static long n_rcu_torture_barrier_error; static long n_rcu_torture_boost_ktrerror; @@ -174,6 +193,8 @@ static unsigned long n_read_exits; static struct list_head rcu_torture_removed; static unsigned long shutdown_jiffies; static unsigned long start_gp_seq; +static atomic_long_t n_nocb_offload; +static atomic_long_t n_nocb_deoffload; static int rcu_torture_writer_state; #define RTWS_FIXED_DELAY 0 @@ -183,9 +204,11 @@ static int rcu_torture_writer_state; #define RTWS_EXP_SYNC 4 #define RTWS_COND_GET 5 #define RTWS_COND_SYNC 6 -#define RTWS_SYNC 7 -#define RTWS_STUTTER 8 -#define RTWS_STOPPING 9 +#define RTWS_POLL_GET 7 +#define RTWS_POLL_WAIT 8 +#define RTWS_SYNC 9 +#define RTWS_STUTTER 10 +#define RTWS_STOPPING 11 static const char * const rcu_torture_writer_state_names[] = { "RTWS_FIXED_DELAY", "RTWS_DELAY", @@ -194,6 +217,8 @@ static const char * const rcu_torture_writer_state_names[] = { "RTWS_EXP_SYNC", "RTWS_COND_GET", "RTWS_COND_SYNC", + "RTWS_POLL_GET", + "RTWS_POLL_WAIT", "RTWS_SYNC", "RTWS_STUTTER", "RTWS_STOPPING", @@ -311,7 +336,9 @@ struct rcu_torture_ops { void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); - unsigned long (*get_state)(void); + unsigned long (*get_gp_state)(void); + unsigned long (*start_gp_poll)(void); + bool (*poll_gp_state)(unsigned long oldstate); void (*cond_sync)(unsigned long oldstate); call_rcu_func_t call; void (*cb_barrier)(void); @@ -386,7 +413,12 @@ static bool rcu_torture_pipe_update_one(struct rcu_torture *rp) { int i; + struct rcu_torture_reader_check *rtrcp = READ_ONCE(rp->rtort_chkp); + if (rtrcp) { + WRITE_ONCE(rp->rtort_chkp, NULL); + smp_store_release(&rtrcp->rtc_ready, 1); // Pair with smp_load_acquire(). + } i = READ_ONCE(rp->rtort_pipe_count); if (i > RCU_TORTURE_PIPE_LEN) i = RCU_TORTURE_PIPE_LEN; @@ -461,7 +493,7 @@ static struct rcu_torture_ops rcu_ops = { .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, - .get_state = get_state_synchronize_rcu, + .get_gp_state = get_state_synchronize_rcu, .cond_sync = cond_synchronize_rcu, .call = call_rcu, .cb_barrier = rcu_barrier, @@ -570,6 +602,21 @@ static void srcu_torture_synchronize(void) synchronize_srcu(srcu_ctlp); } +static unsigned long srcu_torture_get_gp_state(void) +{ + return get_state_synchronize_srcu(srcu_ctlp); +} + +static unsigned long srcu_torture_start_gp_poll(void) +{ + return start_poll_synchronize_srcu(srcu_ctlp); +} + +static bool srcu_torture_poll_gp_state(unsigned long oldstate) +{ + return poll_state_synchronize_srcu(srcu_ctlp, oldstate); +} + static void srcu_torture_call(struct rcu_head *head, rcu_callback_t func) { @@ -601,6 +648,9 @@ static struct rcu_torture_ops srcu_ops = { .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, + .get_gp_state = srcu_torture_get_gp_state, + .start_gp_poll = srcu_torture_start_gp_poll, + .poll_gp_state = srcu_torture_poll_gp_state, .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, @@ -1018,42 +1068,26 @@ rcu_torture_fqs(void *arg) return 0; } +// Used by writers to randomly choose from the available grace-period +// primitives. The only purpose of the initialization is to size the array. +static int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, RTWS_COND_GET, RTWS_POLL_GET, RTWS_SYNC }; +static int nsynctypes; + /* - * RCU torture writer kthread. Repeatedly substitutes a new structure - * for that pointed to by rcu_torture_current, freeing the old structure - * after a series of grace periods (the "pipeline"). + * Determine which grace-period primitives are available. */ -static int -rcu_torture_writer(void *arg) +static void rcu_torture_write_types(void) { - bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal(); - int expediting = 0; - unsigned long gp_snap; bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; - bool gp_sync1 = gp_sync; - int i; - int oldnice = task_nice(current); - struct rcu_torture *rp; - struct rcu_torture *old_rp; - static DEFINE_TORTURE_RANDOM(rand); - bool stutter_waited; - int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, - RTWS_COND_GET, RTWS_SYNC }; - int nsynctypes = 0; - - VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); - if (!can_expedite) - pr_alert("%s" TORTURE_FLAG - " GP expediting controlled from boot/sysfs for %s.\n", - torture_type, cur_ops->name); + bool gp_poll1 = gp_poll, gp_sync1 = gp_sync; /* Initialize synctype[] array. If none set, take default. */ - if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) - gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; - if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) { + if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_poll1 && !gp_sync1) + gp_cond1 = gp_exp1 = gp_normal1 = gp_poll1 = gp_sync1 = true; + if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) { synctype[nsynctypes++] = RTWS_COND_GET; pr_info("%s: Testing conditional GPs.\n", __func__); - } else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) { + } else if (gp_cond && (!cur_ops->get_gp_state || !cur_ops->cond_sync)) { pr_alert("%s: gp_cond without primitives.\n", __func__); } if (gp_exp1 && cur_ops->exp_sync) { @@ -1068,12 +1102,46 @@ rcu_torture_writer(void *arg) } else if (gp_normal && !cur_ops->deferred_free) { pr_alert("%s: gp_normal without primitives.\n", __func__); } + if (gp_poll1 && cur_ops->start_gp_poll && cur_ops->poll_gp_state) { + synctype[nsynctypes++] = RTWS_POLL_GET; + pr_info("%s: Testing polling GPs.\n", __func__); + } else if (gp_poll && (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)) { + pr_alert("%s: gp_poll without primitives.\n", __func__); + } if (gp_sync1 && cur_ops->sync) { synctype[nsynctypes++] = RTWS_SYNC; pr_info("%s: Testing normal GPs.\n", __func__); } else if (gp_sync && !cur_ops->sync) { pr_alert("%s: gp_sync without primitives.\n", __func__); } +} + +/* + * RCU torture writer kthread. Repeatedly substitutes a new structure + * for that pointed to by rcu_torture_current, freeing the old structure + * after a series of grace periods (the "pipeline"). + */ +static int +rcu_torture_writer(void *arg) +{ + bool boot_ended; + bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal(); + unsigned long cookie; + int expediting = 0; + unsigned long gp_snap; + int i; + int idx; + int oldnice = task_nice(current); + struct rcu_torture *rp; + struct rcu_torture *old_rp; + static DEFINE_TORTURE_RANDOM(rand); + bool stutter_waited; + + VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); + if (!can_expedite) + pr_alert("%s" TORTURE_FLAG + " GP expediting controlled from boot/sysfs for %s.\n", + torture_type, cur_ops->name); if (WARN_ONCE(nsynctypes == 0, "rcu_torture_writer: No update-side primitives.\n")) { /* @@ -1087,7 +1155,7 @@ rcu_torture_writer(void *arg) do { rcu_torture_writer_state = RTWS_FIXED_DELAY; - schedule_timeout_uninterruptible(1); + torture_hrtimeout_us(500, 1000, &rand); rp = rcu_torture_alloc(); if (rp == NULL) continue; @@ -1107,6 +1175,18 @@ rcu_torture_writer(void *arg) atomic_inc(&rcu_torture_wcount[i]); WRITE_ONCE(old_rp->rtort_pipe_count, old_rp->rtort_pipe_count + 1); + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) { + idx = cur_ops->readlock(); + cookie = cur_ops->get_gp_state(); + WARN_ONCE(rcu_torture_writer_state != RTWS_DEF_FREE && + cur_ops->poll_gp_state(cookie), + "%s: Cookie check 1 failed %s(%d) %lu->%lu\n", + __func__, + rcu_torture_writer_state_getname(), + rcu_torture_writer_state, + cookie, cur_ops->get_gp_state()); + cur_ops->readunlock(idx); + } switch (synctype[torture_random(&rand) % nsynctypes]) { case RTWS_DEF_FREE: rcu_torture_writer_state = RTWS_DEF_FREE; @@ -1119,15 +1199,21 @@ rcu_torture_writer(void *arg) break; case RTWS_COND_GET: rcu_torture_writer_state = RTWS_COND_GET; - gp_snap = cur_ops->get_state(); - i = torture_random(&rand) % 16; - if (i != 0) - schedule_timeout_interruptible(i); - udelay(torture_random(&rand) % 1000); + gp_snap = cur_ops->get_gp_state(); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); rcu_torture_writer_state = RTWS_COND_SYNC; cur_ops->cond_sync(gp_snap); rcu_torture_pipe_update(old_rp); break; + case RTWS_POLL_GET: + rcu_torture_writer_state = RTWS_POLL_GET; + gp_snap = cur_ops->start_gp_poll(); + rcu_torture_writer_state = RTWS_POLL_WAIT; + while (!cur_ops->poll_gp_state(gp_snap)) + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); + rcu_torture_pipe_update(old_rp); + break; case RTWS_SYNC: rcu_torture_writer_state = RTWS_SYNC; cur_ops->sync(); @@ -1137,6 +1223,14 @@ rcu_torture_writer(void *arg) WARN_ON_ONCE(1); break; } + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + WARN_ONCE(rcu_torture_writer_state != RTWS_DEF_FREE && + !cur_ops->poll_gp_state(cookie), + "%s: Cookie check 2 failed %s(%d) %lu->%lu\n", + __func__, + rcu_torture_writer_state_getname(), + rcu_torture_writer_state, + cookie, cur_ops->get_gp_state()); } WRITE_ONCE(rcu_torture_current_version, rcu_torture_current_version + 1); @@ -1155,12 +1249,13 @@ rcu_torture_writer(void *arg) !rcu_gp_is_normal(); } rcu_torture_writer_state = RTWS_STUTTER; + boot_ended = rcu_inkernel_boot_has_ended(); stutter_waited = stutter_wait("rcu_torture_writer"); if (stutter_waited && !READ_ONCE(rcu_fwd_cb_nodelay) && !cur_ops->slow_gps && !torture_must_stop() && - rcu_inkernel_boot_has_ended()) + boot_ended) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) if (list_empty(&rcu_tortures[i].rtort_free) && rcu_access_pointer(rcu_torture_current) != @@ -1194,26 +1289,43 @@ rcu_torture_writer(void *arg) static int rcu_torture_fakewriter(void *arg) { + unsigned long gp_snap; DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started"); set_user_nice(current, MAX_NICE); do { - schedule_timeout_uninterruptible(1 + torture_random(&rand)%10); - udelay(torture_random(&rand) & 0x3ff); + torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand); if (cur_ops->cb_barrier != NULL && torture_random(&rand) % (nfakewriters * 8) == 0) { cur_ops->cb_barrier(); - } else if (gp_normal == gp_exp) { - if (cur_ops->sync && torture_random(&rand) & 0x80) - cur_ops->sync(); - else if (cur_ops->exp_sync) + } else { + switch (synctype[torture_random(&rand) % nsynctypes]) { + case RTWS_DEF_FREE: + break; + case RTWS_EXP_SYNC: cur_ops->exp_sync(); - } else if (gp_normal && cur_ops->sync) { - cur_ops->sync(); - } else if (cur_ops->exp_sync) { - cur_ops->exp_sync(); + break; + case RTWS_COND_GET: + gp_snap = cur_ops->get_gp_state(); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + cur_ops->cond_sync(gp_snap); + break; + case RTWS_POLL_GET: + gp_snap = cur_ops->start_gp_poll(); + while (!cur_ops->poll_gp_state(gp_snap)) { + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); + } + break; + case RTWS_SYNC: + cur_ops->sync(); + break; + default: + WARN_ON_ONCE(1); + break; + } } stutter_wait("rcu_torture_fakewriter"); } while (!torture_must_stop()); @@ -1227,6 +1339,62 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp) kfree(rhp); } +// Set up and carry out testing of RCU's global memory ordering +static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp, + struct torture_random_state *trsp) +{ + unsigned long loops; + int noc = torture_num_online_cpus(); + int rdrchked; + int rdrchker; + struct rcu_torture_reader_check *rtrcp; // Me. + struct rcu_torture_reader_check *rtrcp_assigner; // Assigned us to do checking. + struct rcu_torture_reader_check *rtrcp_chked; // Reader being checked. + struct rcu_torture_reader_check *rtrcp_chker; // Reader doing checking when not me. + + if (myid < 0) + return; // Don't try this from timer handlers. + + // Increment my counter. + rtrcp = &rcu_torture_reader_mbchk[myid]; + WRITE_ONCE(rtrcp->rtc_myloops, rtrcp->rtc_myloops + 1); + + // Attempt to assign someone else some checking work. + rdrchked = torture_random(trsp) % nrealreaders; + rtrcp_chked = &rcu_torture_reader_mbchk[rdrchked]; + rdrchker = torture_random(trsp) % nrealreaders; + rtrcp_chker = &rcu_torture_reader_mbchk[rdrchker]; + if (rdrchked != myid && rdrchked != rdrchker && noc >= rdrchked && noc >= rdrchker && + smp_load_acquire(&rtrcp->rtc_chkrdr) < 0 && // Pairs with smp_store_release below. + !READ_ONCE(rtp->rtort_chkp) && + !smp_load_acquire(&rtrcp_chker->rtc_assigner)) { // Pairs with smp_store_release below. + rtrcp->rtc_chkloops = READ_ONCE(rtrcp_chked->rtc_myloops); + WARN_ON_ONCE(rtrcp->rtc_chkrdr >= 0); + rtrcp->rtc_chkrdr = rdrchked; + WARN_ON_ONCE(rtrcp->rtc_ready); // This gets set after the grace period ends. + if (cmpxchg_relaxed(&rtrcp_chker->rtc_assigner, NULL, rtrcp) || + cmpxchg_relaxed(&rtp->rtort_chkp, NULL, rtrcp)) + (void)cmpxchg_relaxed(&rtrcp_chker->rtc_assigner, rtrcp, NULL); // Back out. + } + + // If assigned some completed work, do it! + rtrcp_assigner = READ_ONCE(rtrcp->rtc_assigner); + if (!rtrcp_assigner || !smp_load_acquire(&rtrcp_assigner->rtc_ready)) + return; // No work or work not yet ready. + rdrchked = rtrcp_assigner->rtc_chkrdr; + if (WARN_ON_ONCE(rdrchked < 0)) + return; + rtrcp_chked = &rcu_torture_reader_mbchk[rdrchked]; + loops = READ_ONCE(rtrcp_chked->rtc_myloops); + atomic_inc(&n_rcu_torture_mbchk_tries); + if (ULONG_CMP_LT(loops, rtrcp_assigner->rtc_chkloops)) + atomic_inc(&n_rcu_torture_mbchk_fail); + rtrcp_assigner->rtc_chkloops = loops + ULONG_MAX / 2; + rtrcp_assigner->rtc_ready = 0; + smp_store_release(&rtrcp->rtc_assigner, NULL); // Someone else can assign us work. + smp_store_release(&rtrcp_assigner->rtc_chkrdr, -1); // Assigner can again assign. +} + /* * Do one extension of an RCU read-side critical section using the * current reader state in readstate (set to zero for initial entry @@ -1362,8 +1530,9 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, * no data to read. Can be invoked both from process context and * from a timer handler. */ -static bool rcu_torture_one_read(struct torture_random_state *trsp) +static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) { + unsigned long cookie; int i; unsigned long started; unsigned long completed; @@ -1379,6 +1548,8 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) WARN_ON_ONCE(!rcu_is_watching()); newstate = rcutorture_extend_mask(readstate, trsp); rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++); + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + cookie = cur_ops->get_gp_state(); started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, @@ -1394,6 +1565,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) } if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); + rcu_torture_reader_do_mbchk(myid, p, trsp); rtrsp = rcutorture_loop_extend(&readstate, trsp, rtrsp); preempt_disable(); pipe_count = READ_ONCE(p->rtort_pipe_count); @@ -1415,6 +1587,13 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) } __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + WARN_ONCE(cur_ops->poll_gp_state(cookie), + "%s: Cookie check 3 failed %s(%d) %lu->%lu\n", + __func__, + rcu_torture_writer_state_getname(), + rcu_torture_writer_state, + cookie, cur_ops->get_gp_state()); rcutorture_one_extend(&readstate, 0, trsp, rtrsp); WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK); // This next splat is expected behavior if leakpointer, especially @@ -1443,7 +1622,7 @@ static DEFINE_TORTURE_RANDOM_PERCPU(rcu_torture_timer_rand); static void rcu_torture_timer(struct timer_list *unused) { atomic_long_inc(&n_rcu_torture_timers); - (void)rcu_torture_one_read(this_cpu_ptr(&rcu_torture_timer_rand)); + (void)rcu_torture_one_read(this_cpu_ptr(&rcu_torture_timer_rand), -1); /* Test call_rcu() invocation from interrupt handler. */ if (cur_ops->call) { @@ -1479,13 +1658,13 @@ rcu_torture_reader(void *arg) if (!timer_pending(&t)) mod_timer(&t, jiffies + 1); } - if (!rcu_torture_one_read(&rand) && !torture_must_stop()) + if (!rcu_torture_one_read(&rand, myid) && !torture_must_stop()) schedule_timeout_interruptible(HZ); if (time_after(jiffies, lastsleep) && !torture_must_stop()) { - schedule_timeout_interruptible(1); + torture_hrtimeout_us(500, 1000, &rand); lastsleep = jiffies + 10; } - while (num_online_cpus() < mynumonline && !torture_must_stop()) + while (torture_num_online_cpus() < mynumonline && !torture_must_stop()) schedule_timeout_interruptible(HZ / 5); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); @@ -1499,6 +1678,53 @@ rcu_torture_reader(void *arg) } /* + * Randomly Toggle CPUs' callback-offload state. This uses hrtimers to + * increase race probabilities and fuzzes the interval between toggling. + */ +static int rcu_nocb_toggle(void *arg) +{ + int cpu; + int maxcpu = -1; + int oldnice = task_nice(current); + long r; + DEFINE_TORTURE_RANDOM(rand); + ktime_t toggle_delay; + unsigned long toggle_fuzz; + ktime_t toggle_interval = ms_to_ktime(nocbs_toggle); + + VERBOSE_TOROUT_STRING("rcu_nocb_toggle task started"); + while (!rcu_inkernel_boot_has_ended()) + schedule_timeout_interruptible(HZ / 10); + for_each_online_cpu(cpu) + maxcpu = cpu; + WARN_ON(maxcpu < 0); + if (toggle_interval > ULONG_MAX) + toggle_fuzz = ULONG_MAX >> 3; + else + toggle_fuzz = toggle_interval >> 3; + if (toggle_fuzz <= 0) + toggle_fuzz = NSEC_PER_USEC; + do { + r = torture_random(&rand); + cpu = (r >> 4) % (maxcpu + 1); + if (r & 0x1) { + rcu_nocb_cpu_offload(cpu); + atomic_long_inc(&n_nocb_offload); + } else { + rcu_nocb_cpu_deoffload(cpu); + atomic_long_inc(&n_nocb_deoffload); + } + toggle_delay = torture_random(&rand) % toggle_fuzz + toggle_interval; + set_current_state(TASK_INTERRUPTIBLE); + schedule_hrtimeout(&toggle_delay, HRTIMER_MODE_REL); + if (stutter_wait("rcu_nocb_toggle")) + sched_set_normal(current, oldnice); + } while (!torture_must_stop()); + torture_kthread_stopping("rcu_nocb_toggle"); + return 0; +} + +/* * Print torture statistics. Caller must ensure that there is only * one call to this function at a given time!!! This is normally * accomplished by relying on the module system to only have one copy @@ -1539,8 +1765,9 @@ rcu_torture_stats_print(void) atomic_read(&n_rcu_torture_alloc), atomic_read(&n_rcu_torture_alloc_fail), atomic_read(&n_rcu_torture_free)); - pr_cont("rtmbe: %d rtbe: %ld rtbke: %ld rtbre: %ld ", + pr_cont("rtmbe: %d rtmbkf: %d/%d rtbe: %ld rtbke: %ld rtbre: %ld ", atomic_read(&n_rcu_torture_mberror), + atomic_read(&n_rcu_torture_mbchk_fail), atomic_read(&n_rcu_torture_mbchk_tries), n_rcu_torture_barrier_error, n_rcu_torture_boost_ktrerror, n_rcu_torture_boost_rterror); @@ -1553,16 +1780,20 @@ rcu_torture_stats_print(void) data_race(n_barrier_successes), data_race(n_barrier_attempts), data_race(n_rcu_torture_barrier_error)); - pr_cont("read-exits: %ld\n", data_race(n_read_exits)); + pr_cont("read-exits: %ld ", data_race(n_read_exits)); // Statistic. + pr_cont("nocb-toggles: %ld:%ld\n", + atomic_long_read(&n_nocb_offload), atomic_long_read(&n_nocb_deoffload)); pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) || + atomic_read(&n_rcu_torture_mbchk_fail) || n_rcu_torture_barrier_error || n_rcu_torture_boost_ktrerror || n_rcu_torture_boost_rterror || n_rcu_torture_boost_failure || i > 1) { pr_cont("%s", "!!! "); atomic_inc(&n_rcu_torture_error); WARN_ON_ONCE(atomic_read(&n_rcu_torture_mberror)); + WARN_ON_ONCE(atomic_read(&n_rcu_torture_mbchk_fail)); WARN_ON_ONCE(n_rcu_torture_barrier_error); // rcu_barrier() WARN_ON_ONCE(n_rcu_torture_boost_ktrerror); // no boost kthread WARN_ON_ONCE(n_rcu_torture_boost_rterror); // can't set RT prio @@ -1647,7 +1878,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "stall_cpu_block=%d " "n_barrier_cbs=%d " "onoff_interval=%d onoff_holdoff=%d " - "read_exit_delay=%d read_exit_burst=%d\n", + "read_exit_delay=%d read_exit_burst=%d " + "nocbs_nthreads=%d nocbs_toggle=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, @@ -1657,7 +1889,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) stall_cpu_block, n_barrier_cbs, onoff_interval, onoff_holdoff, - read_exit_delay, read_exit_burst); + read_exit_delay, read_exit_burst, + nocbs_nthreads, nocbs_toggle); } static int rcutorture_booster_cleanup(unsigned int cpu) @@ -2392,7 +2625,7 @@ static int rcu_torture_read_exit_child(void *trsp_in) // Minimize time between reading and exiting. while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); - (void)rcu_torture_one_read(trsp); + (void)rcu_torture_one_read(trsp, -1); return 0; } @@ -2500,6 +2733,13 @@ rcu_torture_cleanup(void) torture_stop_kthread(rcu_torture_stall, stall_task); torture_stop_kthread(rcu_torture_writer, writer_task); + if (nocb_tasks) { + for (i = 0; i < nrealnocbers; i++) + torture_stop_kthread(rcu_nocb_toggle, nocb_tasks[i]); + kfree(nocb_tasks); + nocb_tasks = NULL; + } + if (reader_tasks) { for (i = 0; i < nrealreaders; i++) torture_stop_kthread(rcu_torture_reader, @@ -2507,6 +2747,8 @@ rcu_torture_cleanup(void) kfree(reader_tasks); reader_tasks = NULL; } + kfree(rcu_torture_reader_mbchk); + rcu_torture_reader_mbchk = NULL; if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) @@ -2604,6 +2846,7 @@ static void rcu_test_debug_objects(void) #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD struct rcu_head rh1; struct rcu_head rh2; + struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); init_rcu_head_on_stack(&rh1); init_rcu_head_on_stack(&rh2); @@ -2616,6 +2859,10 @@ static void rcu_test_debug_objects(void) local_irq_disable(); /* Make it harder to start a new grace period. */ call_rcu(&rh2, rcu_torture_leak_cb); call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ + if (rhp) { + call_rcu(rhp, rcu_torture_leak_cb); + call_rcu(rhp, rcu_torture_err_cb); /* Another duplicate callback. */ + } local_irq_enable(); rcu_read_unlock(); preempt_enable(); @@ -2710,6 +2957,8 @@ rcu_torture_init(void) atomic_set(&n_rcu_torture_alloc_fail, 0); atomic_set(&n_rcu_torture_free, 0); atomic_set(&n_rcu_torture_mberror, 0); + atomic_set(&n_rcu_torture_mbchk_fail, 0); + atomic_set(&n_rcu_torture_mbchk_tries, 0); atomic_set(&n_rcu_torture_error, 0); n_rcu_torture_barrier_error = 0; n_rcu_torture_boost_ktrerror = 0; @@ -2729,6 +2978,7 @@ rcu_torture_init(void) /* Start up the kthreads. */ + rcu_torture_write_types(); firsterr = torture_create_kthread(rcu_torture_writer, NULL, writer_task); if (firsterr) @@ -2751,17 +3001,40 @@ rcu_torture_init(void) } reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), GFP_KERNEL); - if (reader_tasks == NULL) { + rcu_torture_reader_mbchk = kcalloc(nrealreaders, sizeof(*rcu_torture_reader_mbchk), + GFP_KERNEL); + if (!reader_tasks || !rcu_torture_reader_mbchk) { VERBOSE_TOROUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } for (i = 0; i < nrealreaders; i++) { + rcu_torture_reader_mbchk[i].rtc_chkrdr = -1; firsterr = torture_create_kthread(rcu_torture_reader, (void *)i, reader_tasks[i]); if (firsterr) goto unwind; } + nrealnocbers = nocbs_nthreads; + if (WARN_ON(nrealnocbers < 0)) + nrealnocbers = 1; + if (WARN_ON(nocbs_toggle < 0)) + nocbs_toggle = HZ; + if (nrealnocbers > 0) { + nocb_tasks = kcalloc(nrealnocbers, sizeof(nocb_tasks[0]), GFP_KERNEL); + if (nocb_tasks == NULL) { + VERBOSE_TOROUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + } else { + nocb_tasks = NULL; + } + for (i = 0; i < nrealnocbers; i++) { + firsterr = torture_create_kthread(rcu_nocb_toggle, NULL, nocb_tasks[i]); + if (firsterr) + goto unwind; + } if (stat_interval > 0) { firsterr = torture_create_kthread(rcu_torture_stats, NULL, stats_task); diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 23ff36a66f97..02dd9767b559 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -46,6 +46,18 @@ #define VERBOSE_SCALEOUT(s, x...) \ do { if (verbose) pr_alert("%s" SCALE_FLAG s, scale_type, ## x); } while (0) +static atomic_t verbose_batch_ctr; + +#define VERBOSE_SCALEOUT_BATCH(s, x...) \ +do { \ + if (verbose && \ + (verbose_batched <= 0 || \ + !(atomic_inc_return(&verbose_batch_ctr) % verbose_batched))) { \ + schedule_timeout_uninterruptible(1); \ + pr_alert("%s" SCALE_FLAG s, scale_type, ## x); \ + } \ +} while (0) + #define VERBOSE_SCALEOUT_ERRSTRING(s, x...) \ do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! " s, scale_type, ## x); } while (0) @@ -57,6 +69,7 @@ module_param(scale_type, charp, 0444); MODULE_PARM_DESC(scale_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock."); torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); +torture_param(int, verbose_batched, 0, "Batch verbose debugging printk()s"); // Wait until there are multiple CPUs before starting test. torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0, @@ -368,14 +381,14 @@ ref_scale_reader(void *arg) u64 start; s64 duration; - VERBOSE_SCALEOUT("ref_scale_reader %ld: task started", me); + VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: task started", me); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); set_user_nice(current, MAX_NICE); atomic_inc(&n_init); if (holdoff) schedule_timeout_interruptible(holdoff * HZ); repeat: - VERBOSE_SCALEOUT("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id()); + VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id()); // Wait for signal that this reader can start. wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) || @@ -392,7 +405,7 @@ repeat: while (atomic_read_acquire(&n_started)) cpu_relax(); - VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d started", me, exp_idx); + VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: experiment %d started", me, exp_idx); // To reduce noise, do an initial cache-warming invocation, check @@ -421,8 +434,8 @@ repeat: if (atomic_dec_and_test(&nreaders_exp)) wake_up(&main_wq); - VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d ended, (readers remaining=%d)", - me, exp_idx, atomic_read(&nreaders_exp)); + VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: experiment %d ended, (readers remaining=%d)", + me, exp_idx, atomic_read(&nreaders_exp)); if (!torture_must_stop()) goto repeat; diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 6208c1dae5c9..26344dc6483b 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -34,6 +34,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp) ssp->srcu_gp_running = false; ssp->srcu_gp_waiting = false; ssp->srcu_idx = 0; + ssp->srcu_idx_max = 0; INIT_WORK(&ssp->srcu_work, srcu_drive_gp); INIT_LIST_HEAD(&ssp->srcu_work.entry); return 0; @@ -84,6 +85,8 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) WARN_ON(ssp->srcu_gp_waiting); WARN_ON(ssp->srcu_cb_head); WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail); + WARN_ON(ssp->srcu_idx != ssp->srcu_idx_max); + WARN_ON(ssp->srcu_idx & 0x1); } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); @@ -114,7 +117,7 @@ void srcu_drive_gp(struct work_struct *wp) struct srcu_struct *ssp; ssp = container_of(wp, struct srcu_struct, srcu_work); - if (ssp->srcu_gp_running || !READ_ONCE(ssp->srcu_cb_head)) + if (ssp->srcu_gp_running || USHORT_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) return; /* Already running or nothing to do. */ /* Remove recently arrived callbacks and wait for readers. */ @@ -124,11 +127,12 @@ void srcu_drive_gp(struct work_struct *wp) ssp->srcu_cb_head = NULL; ssp->srcu_cb_tail = &ssp->srcu_cb_head; local_irq_enable(); - idx = ssp->srcu_idx; - WRITE_ONCE(ssp->srcu_idx, !ssp->srcu_idx); + idx = (ssp->srcu_idx & 0x2) / 2; + WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx])); WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ + WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); /* Invoke the callbacks we removed above. */ while (lh) { @@ -146,11 +150,27 @@ void srcu_drive_gp(struct work_struct *wp) * straighten that out. */ WRITE_ONCE(ssp->srcu_gp_running, false); - if (READ_ONCE(ssp->srcu_cb_head)) + if (USHORT_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) schedule_work(&ssp->srcu_work); } EXPORT_SYMBOL_GPL(srcu_drive_gp); +static void srcu_gp_start_if_needed(struct srcu_struct *ssp) +{ + unsigned short cookie; + + cookie = get_state_synchronize_srcu(ssp); + if (USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) + return; + WRITE_ONCE(ssp->srcu_idx_max, cookie); + if (!READ_ONCE(ssp->srcu_gp_running)) { + if (likely(srcu_init_done)) + schedule_work(&ssp->srcu_work); + else if (list_empty(&ssp->srcu_work.entry)) + list_add(&ssp->srcu_work.entry, &srcu_boot_list); + } +} + /* * Enqueue an SRCU callback on the specified srcu_struct structure, * initiating grace-period processing if it is not already running. @@ -166,12 +186,7 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, *ssp->srcu_cb_tail = rhp; ssp->srcu_cb_tail = &rhp->next; local_irq_restore(flags); - if (!READ_ONCE(ssp->srcu_gp_running)) { - if (likely(srcu_init_done)) - schedule_work(&ssp->srcu_work); - else if (list_empty(&ssp->srcu_work.entry)) - list_add(&ssp->srcu_work.entry, &srcu_boot_list); - } + srcu_gp_start_if_needed(ssp); } EXPORT_SYMBOL_GPL(call_srcu); @@ -190,6 +205,48 @@ void synchronize_srcu(struct srcu_struct *ssp) } EXPORT_SYMBOL_GPL(synchronize_srcu); +/* + * get_state_synchronize_srcu - Provide an end-of-grace-period cookie + */ +unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp) +{ + unsigned long ret; + + barrier(); + ret = (READ_ONCE(ssp->srcu_idx) + 3) & ~0x1; + barrier(); + return ret & USHRT_MAX; +} +EXPORT_SYMBOL_GPL(get_state_synchronize_srcu); + +/* + * start_poll_synchronize_srcu - Provide cookie and start grace period + * + * The difference between this and get_state_synchronize_srcu() is that + * this function ensures that the poll_state_synchronize_srcu() will + * eventually return the value true. + */ +unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp) +{ + unsigned long ret = get_state_synchronize_srcu(ssp); + + srcu_gp_start_if_needed(ssp); + return ret; +} +EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); + +/* + * poll_state_synchronize_srcu - Has cookie's grace period ended? + */ +bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) +{ + bool ret = USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx), cookie); + + barrier(); + return ret; +} +EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu); + /* Lockdep diagnostics. */ void __init rcu_scheduler_starting(void) { diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 0f23d20d485a..e26547b34ad3 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -808,6 +808,46 @@ static void srcu_leak_callback(struct rcu_head *rhp) } /* + * Start an SRCU grace period, and also queue the callback if non-NULL. + */ +static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, + struct rcu_head *rhp, bool do_norm) +{ + unsigned long flags; + int idx; + bool needexp = false; + bool needgp = false; + unsigned long s; + struct srcu_data *sdp; + + check_init_srcu_struct(ssp); + idx = srcu_read_lock(ssp); + sdp = raw_cpu_ptr(ssp->sda); + spin_lock_irqsave_rcu_node(sdp, flags); + if (rhp) + rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); + rcu_segcblist_advance(&sdp->srcu_cblist, + rcu_seq_current(&ssp->srcu_gp_seq)); + s = rcu_seq_snap(&ssp->srcu_gp_seq); + (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); + if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { + sdp->srcu_gp_seq_needed = s; + needgp = true; + } + if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) { + sdp->srcu_gp_seq_needed_exp = s; + needexp = true; + } + spin_unlock_irqrestore_rcu_node(sdp, flags); + if (needgp) + srcu_funnel_gp_start(ssp, sdp, s, do_norm); + else if (needexp) + srcu_funnel_exp_start(ssp, sdp->mynode, s); + srcu_read_unlock(ssp, idx); + return s; +} + +/* * Enqueue an SRCU callback on the srcu_data structure associated with * the current CPU and the specified srcu_struct structure, initiating * grace-period processing if it is not already running. @@ -838,14 +878,6 @@ static void srcu_leak_callback(struct rcu_head *rhp) static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rcu_callback_t func, bool do_norm) { - unsigned long flags; - int idx; - bool needexp = false; - bool needgp = false; - unsigned long s; - struct srcu_data *sdp; - - check_init_srcu_struct(ssp); if (debug_rcu_head_queue(rhp)) { /* Probable double call_srcu(), so leak the callback. */ WRITE_ONCE(rhp->func, srcu_leak_callback); @@ -853,28 +885,7 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, return; } rhp->func = func; - idx = srcu_read_lock(ssp); - sdp = raw_cpu_ptr(ssp->sda); - spin_lock_irqsave_rcu_node(sdp, flags); - rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); - rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&ssp->srcu_gp_seq)); - s = rcu_seq_snap(&ssp->srcu_gp_seq); - (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); - if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { - sdp->srcu_gp_seq_needed = s; - needgp = true; - } - if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) { - sdp->srcu_gp_seq_needed_exp = s; - needexp = true; - } - spin_unlock_irqrestore_rcu_node(sdp, flags); - if (needgp) - srcu_funnel_gp_start(ssp, sdp, s, do_norm); - else if (needexp) - srcu_funnel_exp_start(ssp, sdp->mynode, s); - srcu_read_unlock(ssp, idx); + (void)srcu_gp_start_if_needed(ssp, rhp, do_norm); } /** @@ -1003,6 +1014,77 @@ void synchronize_srcu(struct srcu_struct *ssp) } EXPORT_SYMBOL_GPL(synchronize_srcu); +/** + * get_state_synchronize_srcu - Provide an end-of-grace-period cookie + * @ssp: srcu_struct to provide cookie for. + * + * This function returns a cookie that can be passed to + * poll_state_synchronize_srcu(), which will return true if a full grace + * period has elapsed in the meantime. It is the caller's responsibility + * to make sure that grace period happens, for example, by invoking + * call_srcu() after return from get_state_synchronize_srcu(). + */ +unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp) +{ + // Any prior manipulation of SRCU-protected data must happen + // before the load from ->srcu_gp_seq. + smp_mb(); + return rcu_seq_snap(&ssp->srcu_gp_seq); +} +EXPORT_SYMBOL_GPL(get_state_synchronize_srcu); + +/** + * start_poll_synchronize_srcu - Provide cookie and start grace period + * @ssp: srcu_struct to provide cookie for. + * + * This function returns a cookie that can be passed to + * poll_state_synchronize_srcu(), which will return true if a full grace + * period has elapsed in the meantime. Unlike get_state_synchronize_srcu(), + * this function also ensures that any needed SRCU grace period will be + * started. This convenience does come at a cost in terms of CPU overhead. + */ +unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp) +{ + return srcu_gp_start_if_needed(ssp, NULL, true); +} +EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); + +/** + * poll_state_synchronize_srcu - Has cookie's grace period ended? + * @ssp: srcu_struct to provide cookie for. + * @cookie: Return value from get_state_synchronize_srcu() or start_poll_synchronize_srcu(). + * + * This function takes the cookie that was returned from either + * get_state_synchronize_srcu() or start_poll_synchronize_srcu(), and + * returns @true if an SRCU grace period elapsed since the time that the + * cookie was created. + * + * Because cookies are finite in size, wrapping/overflow is possible. + * This is more pronounced on 32-bit systems where cookies are 32 bits, + * where in theory wrapping could happen in about 14 hours assuming + * 25-microsecond expedited SRCU grace periods. However, a more likely + * overflow lower bound is on the order of 24 days in the case of + * one-millisecond SRCU grace periods. Of course, wrapping in a 64-bit + * system requires geologic timespans, as in more than seven million years + * even for expedited SRCU grace periods. + * + * Wrapping/overflow is much more of an issue for CONFIG_SMP=n systems + * that also have CONFIG_PREEMPTION=n, which selects Tiny SRCU. This uses + * a 16-bit cookie, which rcutorture routinely wraps in a matter of a + * few minutes. If this proves to be a problem, this counter will be + * expanded to the same size as for Tree SRCU. + */ +bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) +{ + if (!rcu_seq_done(&ssp->srcu_gp_seq, cookie)) + return false; + // Ensure that the end of the SRCU grace period happens before + // any subsequent code that the caller might execute. + smp_mb(); // ^^^ + return true; +} +EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu); + /* * Callback function for srcu_barrier() use. */ @@ -1160,6 +1242,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) */ static void srcu_invoke_callbacks(struct work_struct *work) { + long len; bool more; struct rcu_cblist ready_cbs; struct rcu_head *rhp; @@ -1182,6 +1265,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) /* We are on the job! Extract and invoke ready callbacks. */ sdp->srcu_cblist_invoking = true; rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); + len = ready_cbs.len; spin_unlock_irq_rcu_node(sdp); rhp = rcu_cblist_dequeue(&ready_cbs); for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { @@ -1190,13 +1274,14 @@ static void srcu_invoke_callbacks(struct work_struct *work) rhp->func(rhp); local_bh_enable(); } + WARN_ON_ONCE(ready_cbs.len); /* * Update counts, accelerate new callbacks, and if needed, * schedule another round of callback invocation. */ spin_lock_irq_rcu_node(sdp); - rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); + rcu_segcblist_add_len(&sdp->srcu_cblist, -len); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, rcu_seq_snap(&ssp->srcu_gp_seq)); sdp->srcu_cblist_invoking = false; diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 36607551f966..af7c19439f4e 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1224,6 +1224,82 @@ void show_rcu_tasks_gp_kthreads(void) } #endif /* #ifndef CONFIG_TINY_RCU */ +#ifdef CONFIG_PROVE_RCU +struct rcu_tasks_test_desc { + struct rcu_head rh; + const char *name; + bool notrun; +}; + +static struct rcu_tasks_test_desc tests[] = { + { + .name = "call_rcu_tasks()", + /* If not defined, the test is skipped. */ + .notrun = !IS_ENABLED(CONFIG_TASKS_RCU), + }, + { + .name = "call_rcu_tasks_rude()", + /* If not defined, the test is skipped. */ + .notrun = !IS_ENABLED(CONFIG_TASKS_RUDE_RCU), + }, + { + .name = "call_rcu_tasks_trace()", + /* If not defined, the test is skipped. */ + .notrun = !IS_ENABLED(CONFIG_TASKS_TRACE_RCU) + } +}; + +static void test_rcu_tasks_callback(struct rcu_head *rhp) +{ + struct rcu_tasks_test_desc *rttd = + container_of(rhp, struct rcu_tasks_test_desc, rh); + + pr_info("Callback from %s invoked.\n", rttd->name); + + rttd->notrun = true; +} + +static void rcu_tasks_initiate_self_tests(void) +{ + pr_info("Running RCU-tasks wait API self tests\n"); +#ifdef CONFIG_TASKS_RCU + synchronize_rcu_tasks(); + call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback); +#endif + +#ifdef CONFIG_TASKS_RUDE_RCU + synchronize_rcu_tasks_rude(); + call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback); +#endif + +#ifdef CONFIG_TASKS_TRACE_RCU + synchronize_rcu_tasks_trace(); + call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback); +#endif +} + +static int rcu_tasks_verify_self_tests(void) +{ + int ret = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(tests); i++) { + if (!tests[i].notrun) { // still hanging. + pr_err("%s has been failed.\n", tests[i].name); + ret = -1; + } + } + + if (ret) + WARN_ON(1); + + return ret; +} +late_initcall(rcu_tasks_verify_self_tests); +#else /* #ifdef CONFIG_PROVE_RCU */ +static void rcu_tasks_initiate_self_tests(void) { } +#endif /* #else #ifdef CONFIG_PROVE_RCU */ + void __init rcu_init_tasks_generic(void) { #ifdef CONFIG_TASKS_RCU @@ -1237,6 +1313,9 @@ void __init rcu_init_tasks_generic(void) #ifdef CONFIG_TASKS_TRACE_RCU rcu_spawn_tasks_trace_kthread(); #endif + + // Run the self-tests. + rcu_tasks_initiate_self_tests(); } #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 40e5e3dd253e..da6f5213fb74 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -83,6 +83,9 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .dynticks_nesting = 1, .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), +#ifdef CONFIG_RCU_NOCB_CPU + .cblist.flags = SEGCBLIST_SOFTIRQ_ONLY, +#endif }; static struct rcu_state rcu_state = { .level = { &rcu_state.node[0] }, @@ -100,8 +103,10 @@ static struct rcu_state rcu_state = { static bool dump_tree; module_param(dump_tree, bool, 0444); /* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */ -static bool use_softirq = true; +static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT); +#ifndef CONFIG_PREEMPT_RT module_param(use_softirq, bool, 0444); +#endif /* Control rcu_node-tree auto-balancing at boot time. */ static bool rcu_fanout_exact; module_param(rcu_fanout_exact, bool, 0444); @@ -644,7 +649,6 @@ static noinstr void rcu_eqs_enter(bool user) trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); rdp = this_cpu_ptr(&rcu_data); - do_nocb_deferred_wakeup(rdp); rcu_prepare_for_idle(); rcu_preempt_deferred_qs(current); @@ -678,6 +682,50 @@ void rcu_idle_enter(void) EXPORT_SYMBOL_GPL(rcu_idle_enter); #ifdef CONFIG_NO_HZ_FULL + +#if !defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK) +/* + * An empty function that will trigger a reschedule on + * IRQ tail once IRQs get re-enabled on userspace/guest resume. + */ +static void late_wakeup_func(struct irq_work *work) +{ +} + +static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) = + IRQ_WORK_INIT(late_wakeup_func); + +/* + * If either: + * + * 1) the task is about to enter in guest mode and $ARCH doesn't support KVM generic work + * 2) the task is about to enter in user mode and $ARCH doesn't support generic entry. + * + * In these cases the late RCU wake ups aren't supported in the resched loops and our + * last resort is to fire a local irq_work that will trigger a reschedule once IRQs + * get re-enabled again. + */ +noinstr static void rcu_irq_work_resched(void) +{ + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + + if (IS_ENABLED(CONFIG_GENERIC_ENTRY) && !(current->flags & PF_VCPU)) + return; + + if (IS_ENABLED(CONFIG_KVM_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU)) + return; + + instrumentation_begin(); + if (do_nocb_deferred_wakeup(rdp) && need_resched()) { + irq_work_queue(this_cpu_ptr(&late_wakeup_work)); + } + instrumentation_end(); +} + +#else +static inline void rcu_irq_work_resched(void) { } +#endif + /** * rcu_user_enter - inform RCU that we are resuming userspace. * @@ -692,8 +740,16 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); noinstr void rcu_user_enter(void) { lockdep_assert_irqs_disabled(); + + /* + * Other than generic entry implementation, we may be past the last + * rescheduling opportunity in the entry code. Trigger a self IPI + * that will fire and reschedule once we resume in user/guest mode. + */ + rcu_irq_work_resched(); rcu_eqs_enter(true); } + #endif /* CONFIG_NO_HZ_FULL */ /** @@ -1495,6 +1551,8 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp) if (!rcu_segcblist_pend_cbs(&rdp->cblist)) return false; + trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPreAcc")); + /* * Callbacks are often registered with incomplete grace-period * information. Something about the fact that getting exact @@ -1515,6 +1573,8 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp) else trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB")); + trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPostAcc")); + return ret; } @@ -1765,7 +1825,7 @@ static bool rcu_gp_init(void) * go offline later. Please also refer to "Hotplug CPU" section * of RCU's Requirements documentation. */ - rcu_state.gp_state = RCU_GP_ONOFF; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF); rcu_for_each_leaf_node(rnp) { smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values. firstseq = READ_ONCE(rnp->ofl_seq); @@ -1831,7 +1891,7 @@ static bool rcu_gp_init(void) * The grace period cannot complete until the initialization * process finishes, because this kthread handles both. */ - rcu_state.gp_state = RCU_GP_INIT; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_INIT); rcu_for_each_node_breadth_first(rnp) { rcu_gp_slow(gp_init_delay); raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -1930,17 +1990,22 @@ static void rcu_gp_fqs_loop(void) ret = 0; for (;;) { if (!ret) { - rcu_state.jiffies_force_qs = jiffies + j; + WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j); + /* + * jiffies_force_qs before RCU_GP_WAIT_FQS state + * update; required for stall checks. + */ + smp_wmb(); WRITE_ONCE(rcu_state.jiffies_kick_kthreads, jiffies + (j ? 3 * j : 2)); } trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqswait")); - rcu_state.gp_state = RCU_GP_WAIT_FQS; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_FQS); ret = swait_event_idle_timeout_exclusive( rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j); rcu_gp_torture_wait(); - rcu_state.gp_state = RCU_GP_DOING_FQS; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS); /* Locking provides needed memory barriers. */ /* If grace period done, leave loop. */ if (!READ_ONCE(rnp->qsmask) && @@ -2054,7 +2119,7 @@ static void rcu_gp_cleanup(void) trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end")); rcu_seq_end(&rcu_state.gp_seq); ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); - rcu_state.gp_state = RCU_GP_IDLE; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_IDLE); /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(&rcu_data); if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) { @@ -2093,12 +2158,12 @@ static int __noreturn rcu_gp_kthread(void *unused) for (;;) { trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("reqwait")); - rcu_state.gp_state = RCU_GP_WAIT_GPS; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_GPS); swait_event_idle_exclusive(rcu_state.gp_wq, READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_INIT); rcu_gp_torture_wait(); - rcu_state.gp_state = RCU_GP_DONE_GPS; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_DONE_GPS); /* Locking provides needed memory barrier. */ if (rcu_gp_init()) break; @@ -2113,9 +2178,9 @@ static int __noreturn rcu_gp_kthread(void *unused) rcu_gp_fqs_loop(); /* Handle grace-period end. */ - rcu_state.gp_state = RCU_GP_CLEANUP; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANUP); rcu_gp_cleanup(); - rcu_state.gp_state = RCU_GP_CLEANED; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANED); } } @@ -2430,11 +2495,12 @@ int rcutree_dead_cpu(unsigned int cpu) static void rcu_do_batch(struct rcu_data *rdp) { int div; + bool __maybe_unused empty; unsigned long flags; const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); struct rcu_head *rhp; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); - long bl, count; + long bl, count = 0; long pending, tlimit = 0; /* If no callbacks are ready, just return. */ @@ -2471,14 +2537,18 @@ static void rcu_do_batch(struct rcu_data *rdp) rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); if (offloaded) rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); + + trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbDequeued")); rcu_nocb_unlock_irqrestore(rdp, flags); /* Invoke callbacks. */ tick_dep_set_task(current, TICK_DEP_BIT_RCU); rhp = rcu_cblist_dequeue(&rcl); + for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) { rcu_callback_t f; + count++; debug_rcu_head_unqueue(rhp); rcu_lock_acquire(&rcu_callback_map); @@ -2492,21 +2562,19 @@ static void rcu_do_batch(struct rcu_data *rdp) /* * Stop only if limit reached and CPU has something to do. - * Note: The rcl structure counts down from zero. */ - if (-rcl.len >= bl && !offloaded && + if (count >= bl && !offloaded && (need_resched() || (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) break; if (unlikely(tlimit)) { /* only call local_clock() every 32 callbacks */ - if (likely((-rcl.len & 31) || local_clock() < tlimit)) + if (likely((count & 31) || local_clock() < tlimit)) continue; /* Exceeded the time limit, so leave. */ break; } - if (offloaded) { - WARN_ON_ONCE(in_serving_softirq()); + if (!in_serving_softirq()) { local_bh_enable(); lockdep_assert_irqs_enabled(); cond_resched_tasks_rcu_qs(); @@ -2517,15 +2585,13 @@ static void rcu_do_batch(struct rcu_data *rdp) local_irq_save(flags); rcu_nocb_lock(rdp); - count = -rcl.len; rdp->n_cbs_invoked += count; trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); /* Update counts and requeue any remaining callbacks. */ rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); - smp_mb(); /* List handling before counting for rcu_barrier(). */ - rcu_segcblist_insert_count(&rdp->cblist, &rcl); + rcu_segcblist_add_len(&rdp->cblist, -count); /* Reinstate batch limit if we have worked down the excess. */ count = rcu_segcblist_n_cbs(&rdp->cblist); @@ -2543,9 +2609,12 @@ static void rcu_do_batch(struct rcu_data *rdp) * The following usually indicates a double call_rcu(). To track * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. */ - WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist)); + empty = rcu_segcblist_empty(&rdp->cblist); + WARN_ON_ONCE(count == 0 && !empty); WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - count != 0 && rcu_segcblist_empty(&rdp->cblist)); + count != 0 && empty); + WARN_ON_ONCE(count == 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) != 0); + WARN_ON_ONCE(!empty && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0); rcu_nocb_unlock_irqrestore(rdp, flags); @@ -2566,6 +2635,7 @@ static void rcu_do_batch(struct rcu_data *rdp) void rcu_sched_clock_irq(int user) { trace_rcu_utilization(TPS("Start scheduler-tick")); + lockdep_assert_irqs_disabled(); raw_cpu_inc(rcu_data.ticks_this_gp); /* The load-acquire pairs with the store-release setting to true. */ if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) { @@ -2579,6 +2649,7 @@ void rcu_sched_clock_irq(int user) rcu_flavor_sched_clock_irq(user); if (rcu_pending(user)) invoke_rcu_core(); + lockdep_assert_irqs_disabled(); trace_rcu_utilization(TPS("End scheduler-tick")); } @@ -2688,7 +2759,7 @@ static __latent_entropy void rcu_core(void) unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); + const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist); if (cpu_is_offline(smp_processor_id())) return; @@ -2708,17 +2779,17 @@ static __latent_entropy void rcu_core(void) /* No grace period and unregistered callbacks? */ if (!rcu_gp_in_progress() && - rcu_segcblist_is_enabled(&rdp->cblist) && !offloaded) { - local_irq_save(flags); + rcu_segcblist_is_enabled(&rdp->cblist) && do_batch) { + rcu_nocb_lock_irqsave(rdp, flags); if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) rcu_accelerate_cbs_unlocked(rnp, rdp); - local_irq_restore(flags); + rcu_nocb_unlock_irqrestore(rdp, flags); } rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); /* If there are callbacks ready, invoke them. */ - if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist) && + if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) && likely(READ_ONCE(rcu_scheduler_fully_active))) rcu_do_batch(rdp); @@ -2941,6 +3012,7 @@ static void check_cb_ovld(struct rcu_data *rdp) static void __call_rcu(struct rcu_head *head, rcu_callback_t func) { + static atomic_t doublefrees; unsigned long flags; struct rcu_data *rdp; bool was_alldone; @@ -2954,8 +3026,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) * Use rcu:rcu_callback trace event to find the previous * time callback was passed to __call_rcu(). */ - WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n", - head, head->func); + if (atomic_inc_return(&doublefrees) < 4) { + pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func); + mem_dump_obj(head); + } WRITE_ONCE(head->func, rcu_leak_callback); return; } @@ -2989,6 +3063,8 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) trace_rcu_callback(rcu_state.name, head, rcu_segcblist_n_cbs(&rdp->cblist)); + trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued")); + /* Go handle any RCU core processing required. */ if (unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) { __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */ @@ -3498,6 +3574,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) goto unlock_return; } + kasan_record_aux_stack(ptr); success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr); if (!success) { run_page_cache_worker(krcp); @@ -3747,6 +3824,8 @@ static int rcu_pending(int user) struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; + lockdep_assert_irqs_disabled(); + /* Check for CPU stalls, if enabled. */ check_cpu_stall(rdp); @@ -4001,12 +4080,18 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rcu_state.n_force_qs; rdp->blimit = blimit; - if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ - !rcu_segcblist_is_offloaded(&rdp->cblist)) - rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */ rcu_dynticks_eqs_online(); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ + /* + * Lock in case the CB/GP kthreads are still around handling + * old callbacks (longer term we should flush all callbacks + * before completing CPU offline) + */ + rcu_nocb_lock(rdp); + if (rcu_segcblist_empty(&rdp->cblist)) /* No early-boot CBs? */ + rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ + rcu_nocb_unlock(rdp); /* * Add CPU to leaf rcu_node pending-online bitmask. Any needed @@ -4159,6 +4244,9 @@ void rcu_report_dead(unsigned int cpu) struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + // Do any dangling deferred wakeups. + do_nocb_deferred_wakeup(rdp); + /* QS for any half-done expedited grace period. */ preempt_disable(); rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7708ed161f4a..71821d59d95c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -201,6 +201,7 @@ struct rcu_data { /* 5) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU struct swait_queue_head nocb_cb_wq; /* For nocb kthreads to sleep on. */ + struct swait_queue_head nocb_state_wq; /* For offloading state changes */ struct task_struct *nocb_gp_kthread; raw_spinlock_t nocb_lock; /* Guard following pair of fields. */ atomic_t nocb_lock_contended; /* Contention experienced. */ @@ -256,6 +257,7 @@ struct rcu_data { }; /* Values for nocb_defer_wakeup field in struct rcu_data. */ +#define RCU_NOCB_WAKE_OFF -1 #define RCU_NOCB_WAKE_NOT 0 #define RCU_NOCB_WAKE 1 #define RCU_NOCB_WAKE_FORCE 2 @@ -433,7 +435,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, unsigned long flags); static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); -static void do_nocb_deferred_wakeup(struct rcu_data *rdp); +static bool do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_cpu_nocb_kthread(int cpu); static void __init rcu_spawn_nocb_kthreads(void); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 8760b6ead770..6c6ff06d4ae6 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -545,7 +545,7 @@ static void synchronize_rcu_expedited_wait(void) data_race(rnp_root->expmask), ".T"[!!data_race(rnp_root->exp_tasks)]); if (ndetected) { - pr_err("blocking rcu_node structures:"); + pr_err("blocking rcu_node structures (internal RCU debug):"); rcu_for_each_node_breadth_first(rnp) { if (rnp == rnp_root) continue; /* printed unconditionally */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7e291ce0a1d6..2d603771c7dc 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -682,6 +682,7 @@ static void rcu_flavor_sched_clock_irq(int user) { struct task_struct *t = current; + lockdep_assert_irqs_disabled(); if (user || rcu_is_cpu_rrupt_from_idle()) { rcu_note_voluntary_context_switch(current); } @@ -1631,8 +1632,8 @@ bool rcu_is_nocb_cpu(int cpu) * Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock * and this function releases it. */ -static void wake_nocb_gp(struct rcu_data *rdp, bool force, - unsigned long flags) +static bool wake_nocb_gp(struct rcu_data *rdp, bool force, + unsigned long flags) __releases(rdp->nocb_lock) { bool needwake = false; @@ -1643,7 +1644,7 @@ static void wake_nocb_gp(struct rcu_data *rdp, bool force, trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("AlreadyAwake")); rcu_nocb_unlock_irqrestore(rdp, flags); - return; + return false; } del_timer(&rdp->nocb_timer); rcu_nocb_unlock_irqrestore(rdp, flags); @@ -1656,6 +1657,8 @@ static void wake_nocb_gp(struct rcu_data *rdp, bool force, raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); if (needwake) wake_up_process(rdp_gp->nocb_gp_kthread); + + return needwake; } /* @@ -1665,6 +1668,8 @@ static void wake_nocb_gp(struct rcu_data *rdp, bool force, static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, const char *reason) { + if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_OFF) + return; if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) mod_timer(&rdp->nocb_timer, jiffies + 1); if (rdp->nocb_defer_wakeup < waketype) @@ -1929,6 +1934,52 @@ static void do_nocb_bypass_wakeup_timer(struct timer_list *t) } /* + * Check if we ignore this rdp. + * + * We check that without holding the nocb lock but + * we make sure not to miss a freshly offloaded rdp + * with the current ordering: + * + * rdp_offload_toggle() nocb_gp_enabled_cb() + * ------------------------- ---------------------------- + * WRITE flags LOCK nocb_gp_lock + * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep + * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock + * UNLOCK nocb_gp_lock READ flags + */ +static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp) +{ + u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP; + + return rcu_segcblist_test_flags(&rdp->cblist, flags); +} + +static inline bool nocb_gp_update_state(struct rcu_data *rdp, bool *needwake_state) +{ + struct rcu_segcblist *cblist = &rdp->cblist; + + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { + rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) + *needwake_state = true; + } + return true; + } + + /* + * De-offloading. Clear our flag and notify the de-offload worker. + * We will ignore this rdp until it ever gets re-offloaded. + */ + WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); + rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) + *needwake_state = true; + return false; +} + + +/* * No-CBs GP kthreads come here to wait for additional callbacks to show up * or for grace periods to end. */ @@ -1956,8 +2007,18 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) */ WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp); for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { + bool needwake_state = false; + + if (!nocb_gp_enabled_cb(rdp)) + continue; trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); rcu_nocb_lock_irqsave(rdp, flags); + if (!nocb_gp_update_state(rdp, &needwake_state)) { + rcu_nocb_unlock_irqrestore(rdp, flags); + if (needwake_state) + swake_up_one(&rdp->nocb_state_wq); + continue; + } bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); if (bypass_ncbs && (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) || @@ -1967,6 +2028,8 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) { rcu_nocb_unlock_irqrestore(rdp, flags); + if (needwake_state) + swake_up_one(&rdp->nocb_state_wq); continue; /* No callbacks here, try next. */ } if (bypass_ncbs) { @@ -2018,6 +2081,8 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) } if (needwake_gp) rcu_gp_kthread_wake(); + if (needwake_state) + swake_up_one(&rdp->nocb_state_wq); } my_rdp->nocb_gp_bypass = bypass; @@ -2081,14 +2146,27 @@ static int rcu_nocb_gp_kthread(void *arg) return 0; } +static inline bool nocb_cb_can_run(struct rcu_data *rdp) +{ + u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB; + return rcu_segcblist_test_flags(&rdp->cblist, flags); +} + +static inline bool nocb_cb_wait_cond(struct rcu_data *rdp) +{ + return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep); +} + /* * Invoke any ready callbacks from the corresponding no-CBs CPU, * then, if there are no more, wait for more to appear. */ static void nocb_cb_wait(struct rcu_data *rdp) { + struct rcu_segcblist *cblist = &rdp->cblist; unsigned long cur_gp_seq; unsigned long flags; + bool needwake_state = false; bool needwake_gp = false; struct rcu_node *rnp = rdp->mynode; @@ -2100,32 +2178,55 @@ static void nocb_cb_wait(struct rcu_data *rdp) local_bh_enable(); lockdep_assert_irqs_enabled(); rcu_nocb_lock_irqsave(rdp, flags); - if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && + if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) && rcu_seq_done(&rnp->gp_seq, cur_gp_seq) && raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */ needwake_gp = rcu_advance_cbs(rdp->mynode, rdp); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ } - if (rcu_segcblist_ready_cbs(&rdp->cblist)) { - rcu_nocb_unlock_irqrestore(rdp, flags); - if (needwake_gp) - rcu_gp_kthread_wake(); - return; - } - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); WRITE_ONCE(rdp->nocb_cb_sleep, true); + + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) { + rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB); + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) + needwake_state = true; + } + if (rcu_segcblist_ready_cbs(cblist)) + WRITE_ONCE(rdp->nocb_cb_sleep, false); + } else { + /* + * De-offloading. Clear our flag and notify the de-offload worker. + * We won't touch the callbacks and keep sleeping until we ever + * get re-offloaded. + */ + WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)); + rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB); + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) + needwake_state = true; + } + + if (rdp->nocb_cb_sleep) + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); + rcu_nocb_unlock_irqrestore(rdp, flags); if (needwake_gp) rcu_gp_kthread_wake(); - swait_event_interruptible_exclusive(rdp->nocb_cb_wq, - !READ_ONCE(rdp->nocb_cb_sleep)); - if (!smp_load_acquire(&rdp->nocb_cb_sleep)) { /* VVV */ - /* ^^^ Ensure CB invocation follows _sleep test. */ - return; - } - WARN_ON(signal_pending(current)); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); + + if (needwake_state) + swake_up_one(&rdp->nocb_state_wq); + + do { + swait_event_interruptible_exclusive(rdp->nocb_cb_wq, + nocb_cb_wait_cond(rdp)); + + // VVV Ensure CB invocation follows _sleep test. + if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^ + WARN_ON(signal_pending(current)); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); + } + } while (!nocb_cb_can_run(rdp)); } /* @@ -2148,24 +2249,27 @@ static int rcu_nocb_cb_kthread(void *arg) /* Is a deferred wakeup of rcu_nocb_kthread() required? */ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) { - return READ_ONCE(rdp->nocb_defer_wakeup); + return READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT; } /* Do a deferred wakeup of rcu_nocb_kthread(). */ -static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp) +static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp) { unsigned long flags; int ndw; + int ret; rcu_nocb_lock_irqsave(rdp, flags); if (!rcu_nocb_need_deferred_wakeup(rdp)) { rcu_nocb_unlock_irqrestore(rdp, flags); - return; + return false; } ndw = READ_ONCE(rdp->nocb_defer_wakeup); WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); + ret = wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake")); + + return ret; } /* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */ @@ -2181,12 +2285,208 @@ static void do_nocb_deferred_wakeup_timer(struct timer_list *t) * This means we do an inexact common-case check. Note that if * we miss, ->nocb_timer will eventually clean things up. */ -static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) { if (rcu_nocb_need_deferred_wakeup(rdp)) - do_nocb_deferred_wakeup_common(rdp); + return do_nocb_deferred_wakeup_common(rdp); + return false; } +void rcu_nocb_flush_deferred_wakeup(void) +{ + do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data)); +} +EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup); + +static int rdp_offload_toggle(struct rcu_data *rdp, + bool offload, unsigned long flags) + __releases(rdp->nocb_lock) +{ + struct rcu_segcblist *cblist = &rdp->cblist; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; + bool wake_gp = false; + + rcu_segcblist_offload(cblist, offload); + + if (rdp->nocb_cb_sleep) + rdp->nocb_cb_sleep = false; + rcu_nocb_unlock_irqrestore(rdp, flags); + + /* + * Ignore former value of nocb_cb_sleep and force wake up as it could + * have been spuriously set to false already. + */ + swake_up_one(&rdp->nocb_cb_wq); + + raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); + if (rdp_gp->nocb_gp_sleep) { + rdp_gp->nocb_gp_sleep = false; + wake_gp = true; + } + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); + + if (wake_gp) + wake_up_process(rdp_gp->nocb_gp_kthread); + + return 0; +} + +static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) +{ + struct rcu_segcblist *cblist = &rdp->cblist; + unsigned long flags; + int ret; + + pr_info("De-offloading %d\n", rdp->cpu); + + rcu_nocb_lock_irqsave(rdp, flags); + /* + * If there are still pending work offloaded, the offline + * CPU won't help much handling them. + */ + if (cpu_is_offline(rdp->cpu) && !rcu_segcblist_empty(&rdp->cblist)) { + rcu_nocb_unlock_irqrestore(rdp, flags); + return -EBUSY; + } + + ret = rdp_offload_toggle(rdp, false, flags); + swait_event_exclusive(rdp->nocb_state_wq, + !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | + SEGCBLIST_KTHREAD_GP)); + rcu_nocb_lock_irqsave(rdp, flags); + /* Make sure nocb timer won't stay around */ + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_OFF); + rcu_nocb_unlock_irqrestore(rdp, flags); + del_timer_sync(&rdp->nocb_timer); + + /* + * Flush bypass. While IRQs are disabled and once we set + * SEGCBLIST_SOFTIRQ_ONLY, no callback is supposed to be + * enqueued on bypass. + */ + rcu_nocb_lock_irqsave(rdp, flags); + rcu_nocb_flush_bypass(rdp, NULL, jiffies); + rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY); + /* + * With SEGCBLIST_SOFTIRQ_ONLY, we can't use + * rcu_nocb_unlock_irqrestore() anymore. Theoretically we + * could set SEGCBLIST_SOFTIRQ_ONLY with cb unlocked and IRQs + * disabled now, but let's be paranoid. + */ + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + + return ret; +} + +static long rcu_nocb_rdp_deoffload(void *arg) +{ + struct rcu_data *rdp = arg; + + WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); + return __rcu_nocb_rdp_deoffload(rdp); +} + +int rcu_nocb_cpu_deoffload(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + int ret = 0; + + if (rdp == rdp->nocb_gp_rdp) { + pr_info("Can't deoffload an rdp GP leader (yet)\n"); + return -EINVAL; + } + mutex_lock(&rcu_state.barrier_mutex); + cpus_read_lock(); + if (rcu_segcblist_is_offloaded(&rdp->cblist)) { + if (cpu_online(cpu)) + ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp); + else + ret = __rcu_nocb_rdp_deoffload(rdp); + if (!ret) + cpumask_clear_cpu(cpu, rcu_nocb_mask); + } + cpus_read_unlock(); + mutex_unlock(&rcu_state.barrier_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload); + +static int __rcu_nocb_rdp_offload(struct rcu_data *rdp) +{ + struct rcu_segcblist *cblist = &rdp->cblist; + unsigned long flags; + int ret; + + /* + * For now we only support re-offload, ie: the rdp must have been + * offloaded on boot first. + */ + if (!rdp->nocb_gp_rdp) + return -EINVAL; + + pr_info("Offloading %d\n", rdp->cpu); + /* + * Can't use rcu_nocb_lock_irqsave() while we are in + * SEGCBLIST_SOFTIRQ_ONLY mode. + */ + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + /* Re-enable nocb timer */ + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + /* + * We didn't take the nocb lock while working on the + * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode. + * Every modifications that have been done previously on + * rdp->cblist must be visible remotely by the nocb kthreads + * upon wake up after reading the cblist flags. + * + * The layout against nocb_lock enforces that ordering: + * + * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait() + * ------------------------- ---------------------------- + * WRITE callbacks rcu_nocb_lock() + * rcu_nocb_lock() READ flags + * WRITE flags READ callbacks + * rcu_nocb_unlock() rcu_nocb_unlock() + */ + ret = rdp_offload_toggle(rdp, true, flags); + swait_event_exclusive(rdp->nocb_state_wq, + rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) && + rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); + + return ret; +} + +static long rcu_nocb_rdp_offload(void *arg) +{ + struct rcu_data *rdp = arg; + + WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); + return __rcu_nocb_rdp_offload(rdp); +} + +int rcu_nocb_cpu_offload(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + int ret = 0; + + mutex_lock(&rcu_state.barrier_mutex); + cpus_read_lock(); + if (!rcu_segcblist_is_offloaded(&rdp->cblist)) { + if (cpu_online(cpu)) + ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp); + else + ret = __rcu_nocb_rdp_offload(rdp); + if (!ret) + cpumask_set_cpu(cpu, rcu_nocb_mask); + } + cpus_read_unlock(); + mutex_unlock(&rcu_state.barrier_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload); + void __init rcu_init_nohz(void) { int cpu; @@ -2229,7 +2529,9 @@ void __init rcu_init_nohz(void) rdp = per_cpu_ptr(&rcu_data, cpu); if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); - rcu_segcblist_offload(&rdp->cblist); + rcu_segcblist_offload(&rdp->cblist, true); + rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB); + rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); } rcu_organize_nocb_kthreads(); } @@ -2239,6 +2541,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { init_swait_queue_head(&rdp->nocb_cb_wq); init_swait_queue_head(&rdp->nocb_gp_wq); + init_swait_queue_head(&rdp->nocb_state_wq); raw_spin_lock_init(&rdp->nocb_lock); raw_spin_lock_init(&rdp->nocb_bypass_lock); raw_spin_lock_init(&rdp->nocb_gp_lock); @@ -2381,6 +2684,19 @@ void rcu_bind_current_to_nocb(void) } EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); +// The ->on_cpu field is available only in CONFIG_SMP=y, so... +#ifdef CONFIG_SMP +static char *show_rcu_should_be_on_cpu(struct task_struct *tsp) +{ + return tsp && tsp->state == TASK_RUNNING && !tsp->on_cpu ? "!" : ""; +} +#else // #ifdef CONFIG_SMP +static char *show_rcu_should_be_on_cpu(struct task_struct *tsp) +{ + return ""; +} +#endif // #else #ifdef CONFIG_SMP + /* * Dump out nocb grace-period kthread state for the specified rcu_data * structure. @@ -2389,7 +2705,7 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp) { struct rcu_node *rnp = rdp->mynode; - pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu\n", + pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n", rdp->cpu, "kK"[!!rdp->nocb_gp_kthread], "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)], @@ -2403,12 +2719,17 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp) ".B"[!!rdp->nocb_gp_bypass], ".G"[!!rdp->nocb_gp_gp], (long)rdp->nocb_gp_seq, - rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops)); + rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops), + rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.', + rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, + show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); } /* Dump out nocb kthread state for the specified rcu_data structure. */ static void show_rcu_nocb_state(struct rcu_data *rdp) { + char bufw[20]; + char bufr[20]; struct rcu_segcblist *rsclp = &rdp->cblist; bool waslocked; bool wastimer; @@ -2417,8 +2738,11 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) if (rdp->nocb_gp_rdp == rdp) show_rcu_nocb_gp_state(rdp); - pr_info(" CB %d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%c%c%c q%ld\n", + sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]); + sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]); + pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", rdp->cpu, rdp->nocb_gp_rdp->cpu, + rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1, "kK"[!!rdp->nocb_cb_kthread], "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)], "cC"[!!atomic_read(&rdp->nocb_lock_contended)], @@ -2429,11 +2753,16 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) jiffies - rdp->nocb_nobypass_last, rdp->nocb_nobypass_count, ".D"[rcu_segcblist_ready_cbs(rsclp)], - ".W"[!rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)], - ".R"[!rcu_segcblist_restempty(rsclp, RCU_WAIT_TAIL)], - ".N"[!rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL)], + ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)], + rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw, + ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)], + rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr, + ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)], ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)], - rcu_segcblist_n_cbs(&rdp->cblist)); + rcu_segcblist_n_cbs(&rdp->cblist), + rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.', + rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, + show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); /* It is OK for GP kthreads to have GP state. */ if (rdp->nocb_gp_rdp == rdp) @@ -2518,8 +2847,9 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) return false; } -static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) { + return false; } static void rcu_spawn_cpu_nocb_kthread(int cpu) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 70d48c52fabc..475b26171b20 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -266,6 +266,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) struct task_struct *t; struct task_struct *ts[8]; + lockdep_assert_irqs_disabled(); if (!rcu_preempt_blocked_readers_cgp(rnp)) return 0; pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", @@ -290,6 +291,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) ".q"[rscr.rs.b.need_qs], ".e"[rscr.rs.b.exp_hint], ".l"[rscr.on_blkd_list]); + lockdep_assert_irqs_disabled(); put_task_struct(t); ndetected++; } @@ -333,9 +335,12 @@ static void rcu_dump_cpu_stacks(void) rcu_for_each_leaf_node(rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); for_each_leaf_node_possible_cpu(rnp, cpu) - if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) - if (!trigger_single_cpu_backtrace(cpu)) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { + if (cpu_is_offline(cpu)) + pr_err("Offline CPU %d blocking current GP.\n", cpu); + else if (!trigger_single_cpu_backtrace(cpu)) dump_cpu_task(cpu); + } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -449,25 +454,66 @@ static void print_cpu_stall_info(int cpu) /* Complain about starvation of grace-period kthread. */ static void rcu_check_gp_kthread_starvation(void) { + int cpu; struct task_struct *gpk = rcu_state.gp_kthread; unsigned long j; if (rcu_is_gp_kthread_starving(&j)) { + cpu = gpk ? task_cpu(gpk) : -1; pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", rcu_state.name, j, (long)rcu_seq_current(&rcu_state.gp_seq), data_race(rcu_state.gp_flags), gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, - gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1); + gpk ? gpk->state : ~0, cpu); if (gpk) { pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name); pr_err("RCU grace-period kthread stack dump:\n"); sched_show_task(gpk); + if (cpu >= 0) { + if (cpu_is_offline(cpu)) { + pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu); + } else { + pr_err("Stack dump where RCU GP kthread last ran:\n"); + if (!trigger_single_cpu_backtrace(cpu)) + dump_cpu_task(cpu); + } + } wake_up_process(gpk); } } } +/* Complain about missing wakeups from expired fqs wait timer */ +static void rcu_check_gp_kthread_expired_fqs_timer(void) +{ + struct task_struct *gpk = rcu_state.gp_kthread; + short gp_state; + unsigned long jiffies_fqs; + int cpu; + + /* + * Order reads of .gp_state and .jiffies_force_qs. + * Matching smp_wmb() is present in rcu_gp_fqs_loop(). + */ + gp_state = smp_load_acquire(&rcu_state.gp_state); + jiffies_fqs = READ_ONCE(rcu_state.jiffies_force_qs); + + if (gp_state == RCU_GP_WAIT_FQS && + time_after(jiffies, jiffies_fqs + RCU_STALL_MIGHT_MIN) && + gpk && !READ_ONCE(gpk->on_rq)) { + cpu = task_cpu(gpk); + pr_err("%s kthread timer wakeup didn't happen for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx\n", + rcu_state.name, (jiffies - jiffies_fqs), + (long)rcu_seq_current(&rcu_state.gp_seq), + data_race(rcu_state.gp_flags), + gp_state_getname(RCU_GP_WAIT_FQS), RCU_GP_WAIT_FQS, + gpk->state); + pr_err("\tPossible timer handling issue on cpu=%d timer-softirq=%u\n", + cpu, kstat_softirqs_cpu(TIMER_SOFTIRQ, cpu)); + } +} + static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) { int cpu; @@ -478,6 +524,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) struct rcu_node *rnp; long totqlen = 0; + lockdep_assert_irqs_disabled(); + /* Kick and suppress, if so configured. */ rcu_stall_kick_kthreads(); if (rcu_stall_is_suppressed()) @@ -499,6 +547,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) } } ndetected += rcu_print_task_stall(rnp, flags); // Releases rnp->lock. + lockdep_assert_irqs_disabled(); } for_each_possible_cpu(cpu) @@ -529,6 +578,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) WRITE_ONCE(rcu_state.jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); + rcu_check_gp_kthread_expired_fqs_timer(); rcu_check_gp_kthread_starvation(); panic_on_rcu_stall(); @@ -544,6 +594,8 @@ static void print_cpu_stall(unsigned long gps) struct rcu_node *rnp = rcu_get_root(); long totqlen = 0; + lockdep_assert_irqs_disabled(); + /* Kick and suppress, if so configured. */ rcu_stall_kick_kthreads(); if (rcu_stall_is_suppressed()) @@ -564,6 +616,7 @@ static void print_cpu_stall(unsigned long gps) jiffies - gps, (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); + rcu_check_gp_kthread_expired_fqs_timer(); rcu_check_gp_kthread_starvation(); rcu_dump_cpu_stacks(); @@ -598,6 +651,7 @@ static void check_cpu_stall(struct rcu_data *rdp) unsigned long js; struct rcu_node *rnp; + lockdep_assert_irqs_disabled(); if ((rcu_stall_is_suppressed() && !READ_ONCE(rcu_kick_kthreads)) || !rcu_gp_in_progress()) return; diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 39334d2d2b37..b95ae86c40a7 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -56,8 +56,10 @@ #ifndef CONFIG_TINY_RCU module_param(rcu_expedited, int, 0); module_param(rcu_normal, int, 0); -static int rcu_normal_after_boot; +static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT); +#ifndef CONFIG_PREEMPT_RT module_param(rcu_normal_after_boot, int, 0); +#endif #endif /* #ifndef CONFIG_TINY_RCU */ #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/kernel/resource.c b/kernel/resource.c index 833394f9c608..627e61b0c124 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -18,12 +18,15 @@ #include <linux/spinlock.h> #include <linux/fs.h> #include <linux/proc_fs.h> +#include <linux/pseudo_fs.h> #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/device.h> #include <linux/pfn.h> #include <linux/mm.h> +#include <linux/mount.h> #include <linux/resource_ext.h> +#include <uapi/linux/magic.h> #include <asm/io.h> @@ -1119,6 +1122,55 @@ resource_size_t resource_alignment(struct resource *res) static DECLARE_WAIT_QUEUE_HEAD(muxed_resource_wait); +static struct inode *iomem_inode; + +#ifdef CONFIG_IO_STRICT_DEVMEM +static void revoke_iomem(struct resource *res) +{ + /* pairs with smp_store_release() in iomem_init_inode() */ + struct inode *inode = smp_load_acquire(&iomem_inode); + + /* + * Check that the initialization has completed. Losing the race + * is ok because it means drivers are claiming resources before + * the fs_initcall level of init and prevent iomem_get_mapping users + * from establishing mappings. + */ + if (!inode) + return; + + /* + * The expectation is that the driver has successfully marked + * the resource busy by this point, so devmem_is_allowed() + * should start returning false, however for performance this + * does not iterate the entire resource range. + */ + if (devmem_is_allowed(PHYS_PFN(res->start)) && + devmem_is_allowed(PHYS_PFN(res->end))) { + /* + * *cringe* iomem=relaxed says "go ahead, what's the + * worst that can happen?" + */ + return; + } + + unmap_mapping_range(inode->i_mapping, res->start, resource_size(res), 1); +} +#else +static void revoke_iomem(struct resource *res) {} +#endif + +struct address_space *iomem_get_mapping(void) +{ + /* + * This function is only called from file open paths, hence guaranteed + * that fs_initcalls have completed and no need to check for NULL. But + * since revoke_iomem can be called before the initcall we still need + * the barrier to appease checkers. + */ + return smp_load_acquire(&iomem_inode)->i_mapping; +} + /** * __request_region - create a new busy resource region * @parent: parent resource descriptor @@ -1186,7 +1238,7 @@ struct resource * __request_region(struct resource *parent, write_unlock(&resource_lock); if (res && orig_parent == &iomem_resource) - revoke_devmem(res); + revoke_iomem(res); return res; } @@ -1786,4 +1838,48 @@ static int __init strict_iomem(char *str) return 1; } +static int iomem_fs_init_fs_context(struct fs_context *fc) +{ + return init_pseudo(fc, DEVMEM_MAGIC) ? 0 : -ENOMEM; +} + +static struct file_system_type iomem_fs_type = { + .name = "iomem", + .owner = THIS_MODULE, + .init_fs_context = iomem_fs_init_fs_context, + .kill_sb = kill_anon_super, +}; + +static int __init iomem_init_inode(void) +{ + static struct vfsmount *iomem_vfs_mount; + static int iomem_fs_cnt; + struct inode *inode; + int rc; + + rc = simple_pin_fs(&iomem_fs_type, &iomem_vfs_mount, &iomem_fs_cnt); + if (rc < 0) { + pr_err("Cannot mount iomem pseudo filesystem: %d\n", rc); + return rc; + } + + inode = alloc_anon_inode(iomem_vfs_mount->mnt_sb); + if (IS_ERR(inode)) { + rc = PTR_ERR(inode); + pr_err("Cannot allocate inode for iomem: %d\n", rc); + simple_release_fs(&iomem_vfs_mount, &iomem_fs_cnt); + return rc; + } + + /* + * Publish iomem revocation inode initialized. + * Pairs with smp_load_acquire() in revoke_iomem(). + */ + smp_store_release(&iomem_inode, inode); + + return 0; +} + +fs_initcall(iomem_init_inode); + __setup("iomem=", strict_iomem); diff --git a/kernel/scftorture.c b/kernel/scftorture.c index d55a9f8cda3d..2377cbb32474 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -398,6 +398,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra static int scftorture_invoker(void *arg) { int cpu; + int curcpu; DEFINE_TORTURE_RANDOM(rand); struct scf_statistics *scfp = (struct scf_statistics *)arg; bool was_offline = false; @@ -412,7 +413,10 @@ static int scftorture_invoker(void *arg) VERBOSE_SCFTORTOUT("scftorture_invoker %d: Waiting for all SCF torturers from cpu %d", scfp->cpu, smp_processor_id()); // Make sure that the CPU is affinitized appropriately during testing. - WARN_ON_ONCE(smp_processor_id() != scfp->cpu); + curcpu = smp_processor_id(); + WARN_ONCE(curcpu != scfp->cpu % nr_cpu_ids, + "%s: Wanted CPU %d, running on %d, nr_cpu_ids = %d\n", + __func__, scfp->cpu, curcpu, nr_cpu_ids); if (!atomic_dec_return(&n_started)) while (atomic_read_acquire(&n_started)) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ff74fca39ed2..ca2bb629595f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -355,8 +355,9 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) static void __hrtick_restart(struct rq *rq) { struct hrtimer *timer = &rq->hrtick_timer; + ktime_t time = rq->hrtick_time; - hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); + hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD); } /* @@ -380,7 +381,6 @@ static void __hrtick_start(void *arg) void hrtick_start(struct rq *rq, u64 delay) { struct hrtimer *timer = &rq->hrtick_timer; - ktime_t time; s64 delta; /* @@ -388,9 +388,7 @@ void hrtick_start(struct rq *rq, u64 delay) * doesn't make sense and can cause timer DoS. */ delta = max_t(s64, delay, 10000LL); - time = ktime_add_ns(timer->base->get_time(), delta); - - hrtimer_set_expires(timer, time); + rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta); if (rq == this_rq()) __hrtick_restart(rq); @@ -3478,7 +3476,7 @@ out: /** * try_invoke_on_locked_down_task - Invoke a function on task in fixed state - * @p: Process for which the function is to be invoked. + * @p: Process for which the function is to be invoked, can be @current. * @func: Function to invoke. * @arg: Argument to function. * @@ -3496,12 +3494,11 @@ out: */ bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) { - bool ret = false; struct rq_flags rf; + bool ret = false; struct rq *rq; - lockdep_assert_irqs_enabled(); - raw_spin_lock_irq(&p->pi_lock); + raw_spin_lock_irqsave(&p->pi_lock, rf.flags); if (p->on_rq) { rq = __task_rq_lock(p, &rf); if (task_rq(p) == rq) @@ -3518,7 +3515,7 @@ bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct t ret = func(p, arg); } } - raw_spin_unlock_irq(&p->pi_lock); + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); return ret; } @@ -4971,7 +4968,7 @@ static void __sched notrace __schedule(bool preempt) schedule_debug(prev, preempt); - if (sched_feat(HRTICK)) + if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) hrtick_clear(rq); local_irq_disable(); @@ -5265,6 +5262,12 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); +#ifdef CONFIG_PREEMPT_DYNAMIC +DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func); +EXPORT_STATIC_CALL_TRAMP(preempt_schedule); +#endif + + /** * preempt_schedule_notrace - preempt_schedule called by tracing * @@ -5317,8 +5320,197 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) } EXPORT_SYMBOL_GPL(preempt_schedule_notrace); +#ifdef CONFIG_PREEMPT_DYNAMIC +DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func); +EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); +#endif + #endif /* CONFIG_PREEMPTION */ +#ifdef CONFIG_PREEMPT_DYNAMIC + +#include <linux/entry-common.h> + +/* + * SC:cond_resched + * SC:might_resched + * SC:preempt_schedule + * SC:preempt_schedule_notrace + * SC:irqentry_exit_cond_resched + * + * + * NONE: + * cond_resched <- __cond_resched + * might_resched <- RET0 + * preempt_schedule <- NOP + * preempt_schedule_notrace <- NOP + * irqentry_exit_cond_resched <- NOP + * + * VOLUNTARY: + * cond_resched <- __cond_resched + * might_resched <- __cond_resched + * preempt_schedule <- NOP + * preempt_schedule_notrace <- NOP + * irqentry_exit_cond_resched <- NOP + * + * FULL: + * cond_resched <- RET0 + * might_resched <- RET0 + * preempt_schedule <- preempt_schedule + * preempt_schedule_notrace <- preempt_schedule_notrace + * irqentry_exit_cond_resched <- irqentry_exit_cond_resched + */ + +enum { + preempt_dynamic_none = 0, + preempt_dynamic_voluntary, + preempt_dynamic_full, +}; + +static int preempt_dynamic_mode = preempt_dynamic_full; + +static int sched_dynamic_mode(const char *str) +{ + if (!strcmp(str, "none")) + return 0; + + if (!strcmp(str, "voluntary")) + return 1; + + if (!strcmp(str, "full")) + return 2; + + return -1; +} + +static void sched_dynamic_update(int mode) +{ + /* + * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in + * the ZERO state, which is invalid. + */ + static_call_update(cond_resched, __cond_resched); + static_call_update(might_resched, __cond_resched); + static_call_update(preempt_schedule, __preempt_schedule_func); + static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func); + static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched); + + switch (mode) { + case preempt_dynamic_none: + static_call_update(cond_resched, __cond_resched); + static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0); + static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL); + static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL); + static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL); + pr_info("Dynamic Preempt: none\n"); + break; + + case preempt_dynamic_voluntary: + static_call_update(cond_resched, __cond_resched); + static_call_update(might_resched, __cond_resched); + static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL); + static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL); + static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL); + pr_info("Dynamic Preempt: voluntary\n"); + break; + + case preempt_dynamic_full: + static_call_update(cond_resched, (typeof(&__cond_resched)) __static_call_return0); + static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0); + static_call_update(preempt_schedule, __preempt_schedule_func); + static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func); + static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched); + pr_info("Dynamic Preempt: full\n"); + break; + } + + preempt_dynamic_mode = mode; +} + +static int __init setup_preempt_mode(char *str) +{ + int mode = sched_dynamic_mode(str); + if (mode < 0) { + pr_warn("Dynamic Preempt: unsupported mode: %s\n", str); + return 1; + } + + sched_dynamic_update(mode); + return 0; +} +__setup("preempt=", setup_preempt_mode); + +#ifdef CONFIG_SCHED_DEBUG + +static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[16]; + int mode; + + if (cnt > 15) + cnt = 15; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + mode = sched_dynamic_mode(strstrip(buf)); + if (mode < 0) + return mode; + + sched_dynamic_update(mode); + + *ppos += cnt; + + return cnt; +} + +static int sched_dynamic_show(struct seq_file *m, void *v) +{ + static const char * preempt_modes[] = { + "none", "voluntary", "full" + }; + int i; + + for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) { + if (preempt_dynamic_mode == i) + seq_puts(m, "("); + seq_puts(m, preempt_modes[i]); + if (preempt_dynamic_mode == i) + seq_puts(m, ")"); + + seq_puts(m, " "); + } + + seq_puts(m, "\n"); + return 0; +} + +static int sched_dynamic_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_dynamic_show, NULL); +} + +static const struct file_operations sched_dynamic_fops = { + .open = sched_dynamic_open, + .write = sched_dynamic_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static __init int sched_init_debug_dynamic(void) +{ + debugfs_create_file("sched_preempt", 0644, NULL, NULL, &sched_dynamic_fops); + return 0; +} +late_initcall(sched_init_debug_dynamic); + +#endif /* CONFIG_SCHED_DEBUG */ +#endif /* CONFIG_PREEMPT_DYNAMIC */ + + /* * This is the entry point to schedule() from kernel preemption * off of irq context. @@ -5616,8 +5808,12 @@ SYSCALL_DEFINE1(nice, int, increment) * @p: the task in question. * * Return: The priority value as seen by users in /proc. - * RT tasks are offset by -200. Normal tasks are centered - * around 0, value goes from -16 to +15. + * + * sched policy return value kernel prio user prio/nice + * + * normal, batch, idle [0 ... 39] [100 ... 139] 0/[-20 ... 19] + * fifo, rr [-2 ... -100] [98 ... 0] [1 ... 99] + * deadline -101 -1 0 */ int task_prio(const struct task_struct *p) { @@ -5676,6 +5872,120 @@ struct task_struct *idle_task(int cpu) return cpu_rq(cpu)->idle; } +#ifdef CONFIG_SMP +/* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. + * + * The scheduler tracks the following metrics: + * + * cpu_util_{cfs,rt,dl,irq}() + * cpu_bw_dl() + * + * Where the cfs,rt and dl util numbers are tracked with the same metric and + * synchronized windows and are thus directly comparable. + * + * The cfs,rt,dl utilization are the running times measured with rq->clock_task + * which excludes things like IRQ and steal-time. These latter are then accrued + * in the irq utilization. + * + * The DL bandwidth number otoh is not a measured metric but a value computed + * based on the task model parameters and gives the minimal utilization + * required to meet deadlines. + */ +unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, + unsigned long max, enum cpu_util_type type, + struct task_struct *p) +{ + unsigned long dl_util, util, irq; + struct rq *rq = cpu_rq(cpu); + + if (!uclamp_is_used() && + type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { + return max; + } + + /* + * Early check to see if IRQ/steal time saturates the CPU, can be + * because of inaccuracies in how we track these -- see + * update_irq_load_avg(). + */ + irq = cpu_util_irq(rq); + if (unlikely(irq >= max)) + return max; + + /* + * Because the time spend on RT/DL tasks is visible as 'lost' time to + * CFS tasks and we use the same metric to track the effective + * utilization (PELT windows are synchronized) we can directly add them + * to obtain the CPU's actual utilization. + * + * CFS and RT utilization can be boosted or capped, depending on + * utilization clamp constraints requested by currently RUNNABLE + * tasks. + * When there are no CFS RUNNABLE tasks, clamps are released and + * frequency will be gracefully reduced with the utilization decay. + */ + util = util_cfs + cpu_util_rt(rq); + if (type == FREQUENCY_UTIL) + util = uclamp_rq_util_with(rq, util, p); + + dl_util = cpu_util_dl(rq); + + /* + * For frequency selection we do not make cpu_util_dl() a permanent part + * of this sum because we want to use cpu_bw_dl() later on, but we need + * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such + * that we select f_max when there is no idle time. + * + * NOTE: numerical errors or stop class might cause us to not quite hit + * saturation when we should -- something for later. + */ + if (util + dl_util >= max) + return max; + + /* + * OTOH, for energy computation we need the estimated running time, so + * include util_dl and ignore dl_bw. + */ + if (type == ENERGY_UTIL) + util += dl_util; + + /* + * There is still idle time; further improve the number by using the + * irq metric. Because IRQ/steal time is hidden from the task clock we + * need to scale the task numbers: + * + * max - irq + * U' = irq + --------- * U + * max + */ + util = scale_irq_capacity(util, irq, max); + util += irq; + + /* + * Bandwidth required by DEADLINE must always be granted while, for + * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism + * to gracefully reduce the frequency when no tasks show up for longer + * periods of time. + * + * Ideally we would like to set bw_dl as min/guaranteed freq and util + + * bw_dl as requested freq. However, cpufreq is not yet ready for such + * an interface. So, we only do the latter for now. + */ + if (type == FREQUENCY_UTIL) + util += cpu_bw_dl(rq); + + return min(max, util); +} + +unsigned long sched_cpu_util(int cpu, unsigned long max) +{ + return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max, + ENERGY_UTIL, NULL); +} +#endif /* CONFIG_SMP */ + /** * find_process_by_pid - find a process with a matching PID value. * @pid: the pid in question. @@ -5797,11 +6107,10 @@ recheck: /* * Valid priorities for SCHED_FIFO and SCHED_RR are - * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, + * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL, * SCHED_BATCH and SCHED_IDLE is 0. */ - if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || - (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) + if (attr->sched_priority > MAX_RT_PRIO-1) return -EINVAL; if ((dl_policy(policy) && !__checkparam_dl(attr)) || (rt_policy(policy) != (attr->sched_priority != 0))) @@ -6668,17 +6977,27 @@ SYSCALL_DEFINE0(sched_yield) return 0; } -#ifndef CONFIG_PREEMPTION -int __sched _cond_resched(void) +#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) +int __sched __cond_resched(void) { if (should_resched(0)) { preempt_schedule_common(); return 1; } +#ifndef CONFIG_PREEMPT_RCU rcu_all_qs(); +#endif return 0; } -EXPORT_SYMBOL(_cond_resched); +EXPORT_SYMBOL(__cond_resched); +#endif + +#ifdef CONFIG_PREEMPT_DYNAMIC +DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched); +EXPORT_STATIC_CALL_TRAMP(cond_resched); + +DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched); +EXPORT_STATIC_CALL_TRAMP(might_resched); #endif /* @@ -6709,6 +7028,46 @@ int __cond_resched_lock(spinlock_t *lock) } EXPORT_SYMBOL(__cond_resched_lock); +int __cond_resched_rwlock_read(rwlock_t *lock) +{ + int resched = should_resched(PREEMPT_LOCK_OFFSET); + int ret = 0; + + lockdep_assert_held_read(lock); + + if (rwlock_needbreak(lock) || resched) { + read_unlock(lock); + if (resched) + preempt_schedule_common(); + else + cpu_relax(); + ret = 1; + read_lock(lock); + } + return ret; +} +EXPORT_SYMBOL(__cond_resched_rwlock_read); + +int __cond_resched_rwlock_write(rwlock_t *lock) +{ + int resched = should_resched(PREEMPT_LOCK_OFFSET); + int ret = 0; + + lockdep_assert_held_write(lock); + + if (rwlock_needbreak(lock) || resched) { + write_unlock(lock); + if (resched) + preempt_schedule_common(); + else + cpu_relax(); + ret = 1; + write_lock(lock); + } + return ret; +} +EXPORT_SYMBOL(__cond_resched_rwlock_write); + /** * yield - yield the current processor to other threads. * @@ -6869,7 +7228,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) switch (policy) { case SCHED_FIFO: case SCHED_RR: - ret = MAX_USER_RT_PRIO-1; + ret = MAX_RT_PRIO-1; break; case SCHED_DEADLINE: case SCHED_NORMAL: @@ -7509,6 +7868,12 @@ int sched_cpu_deactivate(unsigned int cpu) struct rq_flags rf; int ret; + /* + * Remove CPU from nohz.idle_cpus_mask to prevent participating in + * load balancing when not active + */ + nohz_balance_exit_idle(rq); + set_cpu_active(cpu, false); /* @@ -7653,7 +8018,6 @@ int sched_cpu_dying(unsigned int cpu) calc_load_migrate(rq); update_max_interval(); - nohz_balance_exit_idle(rq); hrtick_clear(rq); return 0; } diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index fe7df039ce9e..50cbad89f7fa 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -171,112 +171,6 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, return cpufreq_driver_resolve_freq(policy, freq); } -/* - * This function computes an effective utilization for the given CPU, to be - * used for frequency selection given the linear relation: f = u * f_max. - * - * The scheduler tracks the following metrics: - * - * cpu_util_{cfs,rt,dl,irq}() - * cpu_bw_dl() - * - * Where the cfs,rt and dl util numbers are tracked with the same metric and - * synchronized windows and are thus directly comparable. - * - * The cfs,rt,dl utilization are the running times measured with rq->clock_task - * which excludes things like IRQ and steal-time. These latter are then accrued - * in the irq utilization. - * - * The DL bandwidth number otoh is not a measured metric but a value computed - * based on the task model parameters and gives the minimal utilization - * required to meet deadlines. - */ -unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, - unsigned long max, enum schedutil_type type, - struct task_struct *p) -{ - unsigned long dl_util, util, irq; - struct rq *rq = cpu_rq(cpu); - - if (!uclamp_is_used() && - type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { - return max; - } - - /* - * Early check to see if IRQ/steal time saturates the CPU, can be - * because of inaccuracies in how we track these -- see - * update_irq_load_avg(). - */ - irq = cpu_util_irq(rq); - if (unlikely(irq >= max)) - return max; - - /* - * Because the time spend on RT/DL tasks is visible as 'lost' time to - * CFS tasks and we use the same metric to track the effective - * utilization (PELT windows are synchronized) we can directly add them - * to obtain the CPU's actual utilization. - * - * CFS and RT utilization can be boosted or capped, depending on - * utilization clamp constraints requested by currently RUNNABLE - * tasks. - * When there are no CFS RUNNABLE tasks, clamps are released and - * frequency will be gracefully reduced with the utilization decay. - */ - util = util_cfs + cpu_util_rt(rq); - if (type == FREQUENCY_UTIL) - util = uclamp_rq_util_with(rq, util, p); - - dl_util = cpu_util_dl(rq); - - /* - * For frequency selection we do not make cpu_util_dl() a permanent part - * of this sum because we want to use cpu_bw_dl() later on, but we need - * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such - * that we select f_max when there is no idle time. - * - * NOTE: numerical errors or stop class might cause us to not quite hit - * saturation when we should -- something for later. - */ - if (util + dl_util >= max) - return max; - - /* - * OTOH, for energy computation we need the estimated running time, so - * include util_dl and ignore dl_bw. - */ - if (type == ENERGY_UTIL) - util += dl_util; - - /* - * There is still idle time; further improve the number by using the - * irq metric. Because IRQ/steal time is hidden from the task clock we - * need to scale the task numbers: - * - * max - irq - * U' = irq + --------- * U - * max - */ - util = scale_irq_capacity(util, irq, max); - util += irq; - - /* - * Bandwidth required by DEADLINE must always be granted while, for - * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism - * to gracefully reduce the frequency when no tasks show up for longer - * periods of time. - * - * Ideally we would like to set bw_dl as min/guaranteed freq and util + - * bw_dl as requested freq. However, cpufreq is not yet ready for such - * an interface. So, we only do the latter for now. - */ - if (type == FREQUENCY_UTIL) - util += cpu_bw_dl(rq); - - return min(max, util); -} - static void sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); @@ -284,7 +178,7 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) sg_cpu->max = max; sg_cpu->bw_dl = cpu_bw_dl(rq); - sg_cpu->util = schedutil_cpu_util(sg_cpu->cpu, cpu_util_cfs(rq), max, + sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(rq), max, FREQUENCY_UTIL, NULL); } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 75686c6d4436..aac3539aa0fe 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -517,58 +517,44 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) update_dl_migration(dl_rq); } +#define __node_2_pdl(node) \ + rb_entry((node), struct task_struct, pushable_dl_tasks) + +static inline bool __pushable_less(struct rb_node *a, const struct rb_node *b) +{ + return dl_entity_preempt(&__node_2_pdl(a)->dl, &__node_2_pdl(b)->dl); +} + /* * The list of pushable -deadline task is not a plist, like in * sched_rt.c, it is an rb-tree with tasks ordered by deadline. */ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) { - struct dl_rq *dl_rq = &rq->dl; - struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_root.rb_node; - struct rb_node *parent = NULL; - struct task_struct *entry; - bool leftmost = true; + struct rb_node *leftmost; BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); - while (*link) { - parent = *link; - entry = rb_entry(parent, struct task_struct, - pushable_dl_tasks); - if (dl_entity_preempt(&p->dl, &entry->dl)) - link = &parent->rb_left; - else { - link = &parent->rb_right; - leftmost = false; - } - } - + leftmost = rb_add_cached(&p->pushable_dl_tasks, + &rq->dl.pushable_dl_tasks_root, + __pushable_less); if (leftmost) - dl_rq->earliest_dl.next = p->dl.deadline; - - rb_link_node(&p->pushable_dl_tasks, parent, link); - rb_insert_color_cached(&p->pushable_dl_tasks, - &dl_rq->pushable_dl_tasks_root, leftmost); + rq->dl.earliest_dl.next = p->dl.deadline; } static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) { struct dl_rq *dl_rq = &rq->dl; + struct rb_root_cached *root = &dl_rq->pushable_dl_tasks_root; + struct rb_node *leftmost; if (RB_EMPTY_NODE(&p->pushable_dl_tasks)) return; - if (dl_rq->pushable_dl_tasks_root.rb_leftmost == &p->pushable_dl_tasks) { - struct rb_node *next_node; - - next_node = rb_next(&p->pushable_dl_tasks); - if (next_node) { - dl_rq->earliest_dl.next = rb_entry(next_node, - struct task_struct, pushable_dl_tasks)->dl.deadline; - } - } + leftmost = rb_erase_cached(&p->pushable_dl_tasks, root); + if (leftmost) + dl_rq->earliest_dl.next = __node_2_pdl(leftmost)->dl.deadline; - rb_erase_cached(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); RB_CLEAR_NODE(&p->pushable_dl_tasks); } @@ -1478,29 +1464,21 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) dec_dl_migration(dl_se, dl_rq); } +#define __node_2_dle(node) \ + rb_entry((node), struct sched_dl_entity, rb_node) + +static inline bool __dl_less(struct rb_node *a, const struct rb_node *b) +{ + return dl_time_before(__node_2_dle(a)->deadline, __node_2_dle(b)->deadline); +} + static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - struct rb_node **link = &dl_rq->root.rb_root.rb_node; - struct rb_node *parent = NULL; - struct sched_dl_entity *entry; - int leftmost = 1; BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node)); - while (*link) { - parent = *link; - entry = rb_entry(parent, struct sched_dl_entity, rb_node); - if (dl_time_before(dl_se->deadline, entry->deadline)) - link = &parent->rb_left; - else { - link = &parent->rb_right; - leftmost = 0; - } - } - - rb_link_node(&dl_se->rb_node, parent, link); - rb_insert_color_cached(&dl_se->rb_node, &dl_rq->root, leftmost); + rb_add_cached(&dl_se->rb_node, &dl_rq->root, __dl_less); inc_dl_tasks(dl_se, dl_rq); } @@ -1513,6 +1491,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) return; rb_erase_cached(&dl_se->rb_node, &dl_rq->root); + RB_CLEAR_NODE(&dl_se->rb_node); dec_dl_tasks(dl_se, dl_rq); @@ -1853,7 +1832,7 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) if (!first) return; - if (hrtick_enabled(rq)) + if (hrtick_enabled_dl(rq)) start_hrtick_dl(rq, p); if (rq->curr->sched_class != &dl_sched_class) @@ -1916,7 +1895,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) * not being the leftmost task anymore. In that case NEED_RESCHED will * be set and schedule() will start a new hrtick for the next task. */ - if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 && + if (hrtick_enabled_dl(rq) && queued && p->dl.runtime > 0 && is_leftmost(p, &rq->dl)) start_hrtick_dl(rq, p); } @@ -2409,9 +2388,13 @@ void dl_add_task_root_domain(struct task_struct *p) struct rq *rq; struct dl_bw *dl_b; - rq = task_rq_lock(p, &rf); - if (!dl_task(p)) - goto unlock; + raw_spin_lock_irqsave(&p->pi_lock, rf.flags); + if (!dl_task(p)) { + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); + return; + } + + rq = __task_rq_lock(p, &rf); dl_b = &rq->rd->dl_bw; raw_spin_lock(&dl_b->lock); @@ -2420,7 +2403,6 @@ void dl_add_task_root_domain(struct task_struct *p) raw_spin_unlock(&dl_b->lock); -unlock: task_rq_unlock(rq, p, &rf); } @@ -2514,7 +2496,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) static void prio_changed_dl(struct rq *rq, struct task_struct *p, int oldprio) { - if (task_on_rq_queued(p) || rq->curr == p) { + if (task_on_rq_queued(p) || task_current(rq, p)) { #ifdef CONFIG_SMP /* * This might be too much, but unfortunately diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2357921580f9..486f403a778b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -486,7 +486,7 @@ static char *task_group_path(struct task_group *tg) static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { - if (rq->curr == p) + if (task_current(rq, p)) SEQ_printf(m, ">R"); else SEQ_printf(m, " %c", task_state_to_char(p)); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 04a3ce20da67..8a8bd7b13634 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -531,12 +531,15 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) return min_vruntime; } -static inline int entity_before(struct sched_entity *a, +static inline bool entity_before(struct sched_entity *a, struct sched_entity *b) { return (s64)(a->vruntime - b->vruntime) < 0; } +#define __node_2_se(node) \ + rb_entry((node), struct sched_entity, run_node) + static void update_min_vruntime(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; @@ -552,8 +555,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) } if (leftmost) { /* non-empty tree */ - struct sched_entity *se; - se = rb_entry(leftmost, struct sched_entity, run_node); + struct sched_entity *se = __node_2_se(leftmost); if (!curr) vruntime = se->vruntime; @@ -569,37 +571,17 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) #endif } +static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) +{ + return entity_before(__node_2_se(a), __node_2_se(b)); +} + /* * Enqueue an entity into the rb-tree: */ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node; - struct rb_node *parent = NULL; - struct sched_entity *entry; - bool leftmost = true; - - /* - * Find the right place in the rbtree: - */ - while (*link) { - parent = *link; - entry = rb_entry(parent, struct sched_entity, run_node); - /* - * We dont care about collisions. Nodes with - * the same key stay together. - */ - if (entity_before(se, entry)) { - link = &parent->rb_left; - } else { - link = &parent->rb_right; - leftmost = false; - } - } - - rb_link_node(&se->run_node, parent, link); - rb_insert_color_cached(&se->run_node, - &cfs_rq->tasks_timeline, leftmost); + rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less); } static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -614,7 +596,7 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) if (!left) return NULL; - return rb_entry(left, struct sched_entity, run_node); + return __node_2_se(left); } static struct sched_entity *__pick_next_entity(struct sched_entity *se) @@ -624,7 +606,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) if (!next) return NULL; - return rb_entry(next, struct sched_entity, run_node); + return __node_2_se(next); } #ifdef CONFIG_SCHED_DEBUG @@ -635,7 +617,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) if (!last) return NULL; - return rb_entry(last, struct sched_entity, run_node); + return __node_2_se(last); } /************************************************************** @@ -3943,6 +3925,22 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, trace_sched_util_est_cfs_tp(cfs_rq); } +static inline void util_est_dequeue(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + unsigned int enqueued; + + if (!sched_feat(UTIL_EST)) + return; + + /* Update root cfs_rq's estimated utilization */ + enqueued = cfs_rq->avg.util_est.enqueued; + enqueued -= min_t(unsigned int, enqueued, _task_util_est(p)); + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); + + trace_sched_util_est_cfs_tp(cfs_rq); +} + /* * Check if a (signed) value is within a specified (unsigned) margin, * based on the observation that: @@ -3956,23 +3954,16 @@ static inline bool within_margin(int value, int margin) return ((unsigned int)(value + margin - 1) < (2 * margin - 1)); } -static void -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) +static inline void util_est_update(struct cfs_rq *cfs_rq, + struct task_struct *p, + bool task_sleep) { long last_ewma_diff; struct util_est ue; - int cpu; if (!sched_feat(UTIL_EST)) return; - /* Update root cfs_rq's estimated utilization */ - ue.enqueued = cfs_rq->avg.util_est.enqueued; - ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p)); - WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); - - trace_sched_util_est_cfs_tp(cfs_rq); - /* * Skip update of task's estimated utilization when the task has not * yet completed an activation, e.g. being migrated. @@ -4012,8 +4003,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) * To avoid overestimation of actual task utilization, skip updates if * we cannot grant there is idle time in this CPU. */ - cpu = cpu_of(rq_of(cfs_rq)); - if (task_util(p) > capacity_orig_of(cpu)) + if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq)))) return; /* @@ -4052,7 +4042,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) if (!static_branch_unlikely(&sched_asym_cpucapacity)) return; - if (!p) { + if (!p || p->nr_cpus_allowed == 1) { rq->misfit_task_load = 0; return; } @@ -4096,8 +4086,11 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} static inline void -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, - bool task_sleep) {} +util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {} + +static inline void +util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, + bool task_sleep) {} static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} #endif /* CONFIG_SMP */ @@ -5419,7 +5412,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) s64 delta = slice - ran; if (delta < 0) { - if (rq->curr == p) + if (task_current(rq, p)) resched_curr(rq); return; } @@ -5436,7 +5429,7 @@ static void hrtick_update(struct rq *rq) { struct task_struct *curr = rq->curr; - if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class) + if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class) return; if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) @@ -5609,6 +5602,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) int idle_h_nr_running = task_has_idle_policy(p); bool was_sched_idle = sched_idle_rq(rq); + util_est_dequeue(&rq->cfs, p); + for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); @@ -5659,7 +5654,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) rq->next_balance = jiffies; dequeue_throttle: - util_est_dequeue(&rq->cfs, p, task_sleep); + util_est_update(&rq->cfs, p, task_sleep); hrtick_update(rq); } @@ -6006,6 +6001,14 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p return new_cpu; } +static inline int __select_idle_cpu(int cpu) +{ + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) + return cpu; + + return -1; +} + #ifdef CONFIG_SCHED_SMT DEFINE_STATIC_KEY_FALSE(sched_smt_present); EXPORT_SYMBOL_GPL(sched_smt_present); @@ -6064,74 +6067,51 @@ unlock: * there are no idle cores left in the system; tracked through * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above. */ -static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) { - struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); - int core, cpu; + bool idle = true; + int cpu; if (!static_branch_likely(&sched_smt_present)) - return -1; + return __select_idle_cpu(core); - if (!test_idle_cores(target, false)) - return -1; - - cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - - for_each_cpu_wrap(core, cpus, target) { - bool idle = true; - - for_each_cpu(cpu, cpu_smt_mask(core)) { - if (!available_idle_cpu(cpu)) { - idle = false; - break; + for_each_cpu(cpu, cpu_smt_mask(core)) { + if (!available_idle_cpu(cpu)) { + idle = false; + if (*idle_cpu == -1) { + if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) { + *idle_cpu = cpu; + break; + } + continue; } + break; } - - if (idle) - return core; - - cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); + if (*idle_cpu == -1 && cpumask_test_cpu(cpu, p->cpus_ptr)) + *idle_cpu = cpu; } - /* - * Failed to find an idle core; stop looking for one. - */ - set_idle_cores(target, 0); + if (idle) + return core; + cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); return -1; } -/* - * Scan the local SMT mask for idle CPUs. - */ -static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) -{ - int cpu; - - if (!static_branch_likely(&sched_smt_present)) - return -1; - - for_each_cpu(cpu, cpu_smt_mask(target)) { - if (!cpumask_test_cpu(cpu, p->cpus_ptr) || - !cpumask_test_cpu(cpu, sched_domain_span(sd))) - continue; - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) - return cpu; - } +#else /* CONFIG_SCHED_SMT */ - return -1; +static inline void set_idle_cores(int cpu, int val) +{ } -#else /* CONFIG_SCHED_SMT */ - -static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +static inline bool test_idle_cores(int cpu, bool def) { - return -1; + return def; } -static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) { - return -1; + return __select_idle_cpu(core); } #endif /* CONFIG_SCHED_SMT */ @@ -6144,49 +6124,61 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); + int i, cpu, idle_cpu = -1, nr = INT_MAX; + bool smt = test_idle_cores(target, false); + int this = smp_processor_id(); struct sched_domain *this_sd; - u64 avg_cost, avg_idle; u64 time; - int this = smp_processor_id(); - int cpu, nr = INT_MAX; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) return -1; - /* - * Due to large variance we need a large fuzz factor; hackbench in - * particularly is sensitive here. - */ - avg_idle = this_rq()->avg_idle / 512; - avg_cost = this_sd->avg_scan_cost + 1; + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost) - return -1; + if (sched_feat(SIS_PROP) && !smt) { + u64 avg_cost, avg_idle, span_avg; + + /* + * Due to large variance we need a large fuzz factor; + * hackbench in particularly is sensitive here. + */ + avg_idle = this_rq()->avg_idle / 512; + avg_cost = this_sd->avg_scan_cost + 1; - if (sched_feat(SIS_PROP)) { - u64 span_avg = sd->span_weight * avg_idle; + span_avg = sd->span_weight * avg_idle; if (span_avg > 4*avg_cost) nr = div_u64(span_avg, avg_cost); else nr = 4; - } - - time = cpu_clock(this); - cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + time = cpu_clock(this); + } for_each_cpu_wrap(cpu, cpus, target) { - if (!--nr) - return -1; - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) - break; + if (smt) { + i = select_idle_core(p, cpu, cpus, &idle_cpu); + if ((unsigned int)i < nr_cpumask_bits) + return i; + + } else { + if (!--nr) + return -1; + idle_cpu = __select_idle_cpu(cpu); + if ((unsigned int)idle_cpu < nr_cpumask_bits) + break; + } } - time = cpu_clock(this) - time; - update_avg(&this_sd->avg_scan_cost, time); + if (smt) + set_idle_cores(this, false); - return cpu; + if (sched_feat(SIS_PROP) && !smt) { + time = cpu_clock(this) - time; + update_avg(&this_sd->avg_scan_cost, time); + } + + return idle_cpu; } /* @@ -6315,18 +6307,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (!sd) return target; - i = select_idle_core(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) - return i; - i = select_idle_cpu(p, sd, target); if ((unsigned)i < nr_cpumask_bits) return i; - i = select_idle_smt(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) - return i; - return target; } @@ -6543,7 +6527,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * is already enough to scale the EM reported power * consumption at the (eventually clamped) cpu_capacity. */ - sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, + sum_util += effective_cpu_util(cpu, util_cfs, cpu_cap, ENERGY_UTIL, NULL); /* @@ -6553,7 +6537,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * NOTE: in case RT tasks are running, by default the * FREQUENCY_UTIL's utilization can be max OPP. */ - cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, + cpu_util = effective_cpu_util(cpu, util_cfs, cpu_cap, FREQUENCY_UTIL, tsk); max_util = max(max_util, cpu_util); } @@ -6651,7 +6635,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) * IOW, placing the task there would make the CPU * overutilized. Take uclamp into account to see how * much capacity we can get out of the CPU; this is - * aligned with schedutil_cpu_util(). + * aligned with sched_cpu_util(). */ util = uclamp_rq_util_with(cpu_rq(cpu), util, p); if (!fits_capacity(util, cpu_cap)) @@ -7132,7 +7116,7 @@ done: __maybe_unused; list_move(&p->se.group_node, &rq->cfs_tasks); #endif - if (hrtick_enabled(rq)) + if (hrtick_enabled_fair(rq)) hrtick_start_fair(rq, p); update_misfit_status(p, rq); @@ -9389,8 +9373,11 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; - capacity = capacity_of(i); nr_running = rq->cfs.h_nr_running; + if (!nr_running) + continue; + + capacity = capacity_of(i); /* * For ASYM_CPUCAPACITY domains, don't pick a CPU that could @@ -9496,13 +9483,32 @@ asym_active_balance(struct lb_env *env) } static inline bool -voluntary_active_balance(struct lb_env *env) +imbalanced_active_balance(struct lb_env *env) +{ + struct sched_domain *sd = env->sd; + + /* + * The imbalanced case includes the case of pinned tasks preventing a fair + * distribution of the load on the system but also the even distribution of the + * threads on a system with spare capacity + */ + if ((env->migration_type == migrate_task) && + (sd->nr_balance_failed > sd->cache_nice_tries+2)) + return 1; + + return 0; +} + +static int need_active_balance(struct lb_env *env) { struct sched_domain *sd = env->sd; if (asym_active_balance(env)) return 1; + if (imbalanced_active_balance(env)) + return 1; + /* * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. * It's worth migrating the task if the src_cpu's capacity is reduced @@ -9522,16 +9528,6 @@ voluntary_active_balance(struct lb_env *env) return 0; } -static int need_active_balance(struct lb_env *env) -{ - struct sched_domain *sd = env->sd; - - if (voluntary_active_balance(env)) - return 1; - - return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); -} - static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) @@ -9623,6 +9619,8 @@ redo: env.src_rq = busiest; ld_moved = 0; + /* Clear this flag as soon as we find a pullable task */ + env.flags |= LBF_ALL_PINNED; if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found @@ -9630,7 +9628,6 @@ redo: * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ - env.flags |= LBF_ALL_PINNED; env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: @@ -9756,10 +9753,12 @@ more_balance: if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) { raw_spin_unlock_irqrestore(&busiest->lock, flags); - env.flags |= LBF_ALL_PINNED; goto out_one_pinned; } + /* Record that we found at least one task that could run on this_cpu */ + env.flags &= ~LBF_ALL_PINNED; + /* * ->active_balance synchronizes accesses to * ->active_balance_work. Once set, it's cleared @@ -9781,21 +9780,13 @@ more_balance: /* We've kicked active balancing, force task migration. */ sd->nr_balance_failed = sd->cache_nice_tries+1; } - } else + } else { sd->nr_balance_failed = 0; + } - if (likely(!active_balance) || voluntary_active_balance(&env)) { + if (likely(!active_balance) || need_active_balance(&env)) { /* We were unbalanced, so reset the balancing interval */ sd->balance_interval = sd->min_interval; - } else { - /* - * If we've begun active balancing, start to back off. This - * case may not be covered by the all_pinned logic if there - * is only 1 task on the busy runqueue (because we don't call - * detach_tasks). - */ - if (sd->balance_interval < sd->max_interval) - sd->balance_interval *= 2; } goto out; @@ -10700,8 +10691,11 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) */ void trigger_load_balance(struct rq *rq) { - /* Don't need to rebalance while attached to NULL domain */ - if (unlikely(on_null_domain(rq))) + /* + * Don't need to rebalance while attached to NULL domain or + * runqueue CPU is not active + */ + if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq)))) return; if (time_after_eq(jiffies, rq->next_balance)) @@ -10806,7 +10800,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (rq->curr == p) { + if (task_current(rq, p)) { if (p->prio > oldprio) resched_curr(rq); } else @@ -10939,7 +10933,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) * kick off the schedule if running, otherwise just see * if we can still preempt the current task. */ - if (rq->curr == p) + if (task_current(rq, p)) resched_curr(rq); else check_preempt_curr(rq, p, 0); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 68d369cba9e4..1bc2b158fc51 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -38,6 +38,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) SCHED_FEAT(WAKEUP_PREEMPTION, true) SCHED_FEAT(HRTICK, false) +SCHED_FEAT(HRTICK_DL, false) SCHED_FEAT(DOUBLE_TICK, false) /* @@ -54,7 +55,6 @@ SCHED_FEAT(TTWU_QUEUE, true) /* * When doing wakeups, attempt to limit superfluous scans of the LLC domain. */ -SCHED_FEAT(SIS_AVG_CPU, false) SCHED_FEAT(SIS_PROP, true) /* diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 305727ea0677..7199e6f23789 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -285,6 +285,7 @@ static void do_idle(void) } arch_cpu_idle_enter(); + rcu_nocb_flush_deferred_wakeup(); /* * In poll mode we reenable interrupts and spin. Also if we diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index dbe4629cf7ba..8f720b71d13d 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2357,7 +2357,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) if (!task_on_rq_queued(p)) return; - if (rq->curr == p) { + if (task_current(rq, p)) { #ifdef CONFIG_SMP /* * If our priority decreases while running, we diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bb09988451a0..10a1522b1e30 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -140,7 +140,7 @@ extern void call_trace_sched_update_nr_running(struct rq *rq, int count); * scale_load() and scale_load_down(w) to convert between them. The * following must be true: * - * scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD + * scale_load(sched_prio_to_weight[NICE_TO_PRIO(0)-MAX_RT_PRIO]) == NICE_0_LOAD * */ #define NICE_0_LOAD (1L << NICE_0_LOAD_SHIFT) @@ -1031,6 +1031,7 @@ struct rq { call_single_data_t hrtick_csd; #endif struct hrtimer hrtick_timer; + ktime_t hrtick_time; #endif #ifdef CONFIG_SCHEDSTATS @@ -2104,17 +2105,39 @@ extern const_debug unsigned int sysctl_sched_migration_cost; */ static inline int hrtick_enabled(struct rq *rq) { - if (!sched_feat(HRTICK)) - return 0; if (!cpu_active(cpu_of(rq))) return 0; return hrtimer_is_hres_active(&rq->hrtick_timer); } +static inline int hrtick_enabled_fair(struct rq *rq) +{ + if (!sched_feat(HRTICK)) + return 0; + return hrtick_enabled(rq); +} + +static inline int hrtick_enabled_dl(struct rq *rq) +{ + if (!sched_feat(HRTICK_DL)) + return 0; + return hrtick_enabled(rq); +} + void hrtick_start(struct rq *rq, u64 delay); #else +static inline int hrtick_enabled_fair(struct rq *rq) +{ + return 0; +} + +static inline int hrtick_enabled_dl(struct rq *rq) +{ + return 0; +} + static inline int hrtick_enabled(struct rq *rq) { return 0; @@ -2558,27 +2581,24 @@ static inline unsigned long capacity_orig_of(int cpu) { return cpu_rq(cpu)->cpu_capacity_orig; } -#endif /** - * enum schedutil_type - CPU utilization type + * enum cpu_util_type - CPU utilization type * @FREQUENCY_UTIL: Utilization used to select frequency * @ENERGY_UTIL: Utilization used during energy calculation * * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time * need to be aggregated differently depending on the usage made of them. This - * enum is used within schedutil_freq_util() to differentiate the types of + * enum is used within effective_cpu_util() to differentiate the types of * utilization expected by the callers, and adjust the aggregation accordingly. */ -enum schedutil_type { +enum cpu_util_type { FREQUENCY_UTIL, ENERGY_UTIL, }; -#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL - -unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, - unsigned long max, enum schedutil_type type, +unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, + unsigned long max, enum cpu_util_type type, struct task_struct *p); static inline unsigned long cpu_bw_dl(struct rq *rq) @@ -2607,14 +2627,7 @@ static inline unsigned long cpu_util_rt(struct rq *rq) { return READ_ONCE(rq->avg_rt.util_avg); } -#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ -static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, - unsigned long max, enum schedutil_type type, - struct task_struct *p) -{ - return 0; -} -#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ +#endif #ifdef CONFIG_HAVE_SCHED_AVG_IRQ static inline unsigned long cpu_util_irq(struct rq *rq) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 5d3675c7a76b..09d35044bd88 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1596,66 +1596,58 @@ static void init_numa_topology_type(void) } } + +#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS) + void sched_init_numa(void) { - int next_distance, curr_distance = node_distance(0, 0); struct sched_domain_topology_level *tl; - int level = 0; - int i, j, k; - - sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL); - if (!sched_domains_numa_distance) - return; - - /* Includes NUMA identity node at level 0. */ - sched_domains_numa_distance[level++] = curr_distance; - sched_domains_numa_levels = level; + unsigned long *distance_map; + int nr_levels = 0; + int i, j; /* * O(nr_nodes^2) deduplicating selection sort -- in order to find the * unique distances in the node_distance() table. - * - * Assumes node_distance(0,j) includes all distances in - * node_distance(i,j) in order to avoid cubic time. */ - next_distance = curr_distance; + distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL); + if (!distance_map) + return; + + bitmap_zero(distance_map, NR_DISTANCE_VALUES); for (i = 0; i < nr_node_ids; i++) { for (j = 0; j < nr_node_ids; j++) { - for (k = 0; k < nr_node_ids; k++) { - int distance = node_distance(i, k); - - if (distance > curr_distance && - (distance < next_distance || - next_distance == curr_distance)) - next_distance = distance; - - /* - * While not a strong assumption it would be nice to know - * about cases where if node A is connected to B, B is not - * equally connected to A. - */ - if (sched_debug() && node_distance(k, i) != distance) - sched_numa_warn("Node-distance not symmetric"); + int distance = node_distance(i, j); - if (sched_debug() && i && !find_numa_distance(distance)) - sched_numa_warn("Node-0 not representative"); + if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) { + sched_numa_warn("Invalid distance value range"); + return; } - if (next_distance != curr_distance) { - sched_domains_numa_distance[level++] = next_distance; - sched_domains_numa_levels = level; - curr_distance = next_distance; - } else break; + + bitmap_set(distance_map, distance, 1); } + } + /* + * We can now figure out how many unique distance values there are and + * allocate memory accordingly. + */ + nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES); - /* - * In case of sched_debug() we verify the above assumption. - */ - if (!sched_debug()) - break; + sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL); + if (!sched_domains_numa_distance) { + bitmap_free(distance_map); + return; + } + + for (i = 0, j = 0; i < nr_levels; i++, j++) { + j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j); + sched_domains_numa_distance[i] = j; } + bitmap_free(distance_map); + /* - * 'level' contains the number of unique distances + * 'nr_levels' contains the number of unique distances * * The sched_domains_numa_distance[] array includes the actual distance * numbers. @@ -1664,15 +1656,15 @@ void sched_init_numa(void) /* * Here, we should temporarily reset sched_domains_numa_levels to 0. * If it fails to allocate memory for array sched_domains_numa_masks[][], - * the array will contain less then 'level' members. This could be + * the array will contain less then 'nr_levels' members. This could be * dangerous when we use it to iterate array sched_domains_numa_masks[][] * in other functions. * - * We reset it to 'level' at the end of this function. + * We reset it to 'nr_levels' at the end of this function. */ sched_domains_numa_levels = 0; - sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); + sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL); if (!sched_domains_numa_masks) return; @@ -1680,7 +1672,7 @@ void sched_init_numa(void) * Now for each level, construct a mask per node which contains all * CPUs of nodes that are that many hops away from us. */ - for (i = 0; i < level; i++) { + for (i = 0; i < nr_levels; i++) { sched_domains_numa_masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); if (!sched_domains_numa_masks[i]) @@ -1688,12 +1680,17 @@ void sched_init_numa(void) for (j = 0; j < nr_node_ids; j++) { struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); + int k; + if (!mask) return; sched_domains_numa_masks[i][j] = mask; for_each_node(k) { + if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) + sched_numa_warn("Node-distance not symmetric"); + if (node_distance(j, k) > sched_domains_numa_distance[i]) continue; @@ -1705,7 +1702,7 @@ void sched_init_numa(void) /* Compute default topology size */ for (i = 0; sched_domain_topology[i].mask; i++); - tl = kzalloc((i + level + 1) * + tl = kzalloc((i + nr_levels + 1) * sizeof(struct sched_domain_topology_level), GFP_KERNEL); if (!tl) return; @@ -1728,7 +1725,7 @@ void sched_init_numa(void) /* * .. and append 'j' levels of NUMA goodness. */ - for (j = 1; j < level; i++, j++) { + for (j = 1; j < nr_levels; i++, j++) { tl[i] = (struct sched_domain_topology_level){ .mask = sd_numa_mask, .sd_flags = cpu_numa_flags, @@ -1740,8 +1737,8 @@ void sched_init_numa(void) sched_domain_topology = tl; - sched_domains_numa_levels = level; - sched_max_numa_distance = sched_domains_numa_distance[level - 1]; + sched_domains_numa_levels = nr_levels; + sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1]; init_numa_topology_type(); } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 952dc1c90229..1d60fc2c9987 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1164,7 +1164,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, * Make sure that any changes to mode from another thread have * been seen after SYSCALL_WORK_SECCOMP was seen. */ - rmb(); + smp_rmb(); if (!sd) { populate_seccomp_data(&sd_local); @@ -1284,6 +1284,8 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, const bool recheck_after_trace) { BUG(); + + return -1; } #endif diff --git a/kernel/smp.c b/kernel/smp.c index 1b6070bf97bb..aeb0adfa0606 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -14,6 +14,7 @@ #include <linux/export.h> #include <linux/percpu.h> #include <linux/init.h> +#include <linux/interrupt.h> #include <linux/gfp.h> #include <linux/smp.h> #include <linux/cpu.h> @@ -449,6 +450,9 @@ void flush_smp_call_function_from_idle(void) local_irq_save(flags); flush_smp_call_function_queue(true); + if (local_softirq_pending()) + do_softirq(); + local_irq_restore(flags); } diff --git a/kernel/static_call.c b/kernel/static_call.c index 84565c2a41b8..6906c6ec4c97 100644 --- a/kernel/static_call.c +++ b/kernel/static_call.c @@ -12,6 +12,8 @@ extern struct static_call_site __start_static_call_sites[], __stop_static_call_sites[]; +extern struct static_call_tramp_key __start_static_call_tramp_key[], + __stop_static_call_tramp_key[]; static bool static_call_initialized; @@ -323,10 +325,59 @@ static int __static_call_mod_text_reserved(void *start, void *end) return ret; } +static unsigned long tramp_key_lookup(unsigned long addr) +{ + struct static_call_tramp_key *start = __start_static_call_tramp_key; + struct static_call_tramp_key *stop = __stop_static_call_tramp_key; + struct static_call_tramp_key *tramp_key; + + for (tramp_key = start; tramp_key != stop; tramp_key++) { + unsigned long tramp; + + tramp = (long)tramp_key->tramp + (long)&tramp_key->tramp; + if (tramp == addr) + return (long)tramp_key->key + (long)&tramp_key->key; + } + + return 0; +} + static int static_call_add_module(struct module *mod) { - return __static_call_init(mod, mod->static_call_sites, - mod->static_call_sites + mod->num_static_call_sites); + struct static_call_site *start = mod->static_call_sites; + struct static_call_site *stop = start + mod->num_static_call_sites; + struct static_call_site *site; + + for (site = start; site != stop; site++) { + unsigned long addr = (unsigned long)static_call_key(site); + unsigned long key; + + /* + * Is the key is exported, 'addr' points to the key, which + * means modules are allowed to call static_call_update() on + * it. + * + * Otherwise, the key isn't exported, and 'addr' points to the + * trampoline so we need to lookup the key. + * + * We go through this dance to prevent crazy modules from + * abusing sensitive static calls. + */ + if (!kernel_text_address(addr)) + continue; + + key = tramp_key_lookup(addr); + if (!key) { + pr_warn("Failed to fixup __raw_static_call() usage at: %ps\n", + static_call_addr(site)); + return -EINVAL; + } + + site->key = (key - (long)&site->key) | + (site->key & STATIC_CALL_SITE_FLAGS); + } + + return __static_call_init(mod, start, stop); } static void static_call_del_module(struct module *mod) @@ -438,6 +489,11 @@ int __init static_call_init(void) } early_initcall(static_call_init); +long __static_call_return0(void) +{ + return 0; +} + #ifdef CONFIG_STATIC_CALL_SELFTEST static int func_a(int x) diff --git a/kernel/sys.c b/kernel/sys.c index 51f00fe20e4d..8bb46e50f02d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -24,7 +24,6 @@ #include <linux/times.h> #include <linux/posix-timers.h> #include <linux/security.h> -#include <linux/dcookies.h> #include <linux/suspend.h> #include <linux/tty.h> #include <linux/signal.h> @@ -1848,7 +1847,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path)) goto exit; - err = inode_permission(inode, MAY_EXEC); + err = file_permission(exe.file, MAY_EXEC); if (err) goto exit; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index f4ace1bf8382..98d7a15e8cf6 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -527,8 +527,11 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) /** * alarm_handle_timer - Callback for posix timers * @alarm: alarm that fired + * @now: time at the timer expiration * * Posix timer callback for expired alarm timers. + * + * Return: whether the timer is to be restarted */ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, ktime_t now) @@ -715,8 +718,11 @@ static int alarm_timer_create(struct k_itimer *new_timer) /** * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep * @alarm: ptr to alarm that fired + * @now: time at the timer expiration * * Wakes up the task that set the alarmtimer + * + * Return: ALARMTIMER_NORESTART */ static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, ktime_t now) @@ -733,6 +739,7 @@ static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, * alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation * @alarm: ptr to alarmtimer * @absexp: absolute expiration time + * @type: alarm type (BOOTTIME/REALTIME). * * Sets the alarm timer and sleeps until it is fired or interrupted. */ @@ -806,7 +813,6 @@ static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) * @which_clock: clockid * @flags: determins abstime or relative * @tsreq: requested sleep time (abs or rel) - * @rmtp: remaining sleep time saved * * Handles clock_nanosleep calls against _ALARM clockids */ diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 6ca625f5e554..12eab0d2ae28 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -465,9 +465,3 @@ struct time_namespace init_time_ns = { .ns.ops = &timens_operations, .frozen_offsets = true, }; - -static int __init time_ns_init(void) -{ - return 0; -} -subsys_initcall(time_ns_init); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 8dbc008f8942..f475f1a027c8 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1237,6 +1237,20 @@ int try_to_del_timer_sync(struct timer_list *timer) } EXPORT_SYMBOL(try_to_del_timer_sync); +bool timer_curr_running(struct timer_list *timer) +{ + int i; + + for (i = 0; i < NR_BASES; i++) { + struct timer_base *base = this_cpu_ptr(&timer_bases[i]); + + if (base->running_timer == timer) + return true; + } + + return false; +} + #ifdef CONFIG_PREEMPT_RT static __init void timer_base_init_expiry_lock(struct timer_base *base) { diff --git a/kernel/torture.c b/kernel/torture.c index 8562ac18d2eb..01e336f1e5b2 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -48,6 +48,12 @@ module_param(disable_onoff_at_boot, bool, 0444); static bool ftrace_dump_at_shutdown; module_param(ftrace_dump_at_shutdown, bool, 0444); +static int verbose_sleep_frequency; +module_param(verbose_sleep_frequency, int, 0444); + +static int verbose_sleep_duration = 1; +module_param(verbose_sleep_duration, int, 0444); + static char *torture_type; static int verbose; @@ -58,6 +64,95 @@ static int verbose; static int fullstop = FULLSTOP_RMMOD; static DEFINE_MUTEX(fullstop_mutex); +static atomic_t verbose_sleep_counter; + +/* + * Sleep if needed from VERBOSE_TOROUT*(). + */ +void verbose_torout_sleep(void) +{ + if (verbose_sleep_frequency > 0 && + verbose_sleep_duration > 0 && + !(atomic_inc_return(&verbose_sleep_counter) % verbose_sleep_frequency)) + schedule_timeout_uninterruptible(verbose_sleep_duration); +} +EXPORT_SYMBOL_GPL(verbose_torout_sleep); + +/* + * Schedule a high-resolution-timer sleep in nanoseconds, with a 32-bit + * nanosecond random fuzz. This function and its friends desynchronize + * testing from the timer wheel. + */ +int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, struct torture_random_state *trsp) +{ + ktime_t hto = baset_ns; + + if (trsp) + hto += (torture_random(trsp) >> 3) % fuzzt_ns; + set_current_state(TASK_UNINTERRUPTIBLE); + return schedule_hrtimeout(&hto, HRTIMER_MODE_REL); +} +EXPORT_SYMBOL_GPL(torture_hrtimeout_ns); + +/* + * Schedule a high-resolution-timer sleep in microseconds, with a 32-bit + * nanosecond (not microsecond!) random fuzz. + */ +int torture_hrtimeout_us(u32 baset_us, u32 fuzzt_ns, struct torture_random_state *trsp) +{ + ktime_t baset_ns = baset_us * NSEC_PER_USEC; + + return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp); +} +EXPORT_SYMBOL_GPL(torture_hrtimeout_us); + +/* + * Schedule a high-resolution-timer sleep in milliseconds, with a 32-bit + * microsecond (not millisecond!) random fuzz. + */ +int torture_hrtimeout_ms(u32 baset_ms, u32 fuzzt_us, struct torture_random_state *trsp) +{ + ktime_t baset_ns = baset_ms * NSEC_PER_MSEC; + u32 fuzzt_ns; + + if ((u32)~0U / NSEC_PER_USEC < fuzzt_us) + fuzzt_ns = (u32)~0U; + else + fuzzt_ns = fuzzt_us * NSEC_PER_USEC; + return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp); +} +EXPORT_SYMBOL_GPL(torture_hrtimeout_ms); + +/* + * Schedule a high-resolution-timer sleep in jiffies, with an + * implied one-jiffy random fuzz. This is intended to replace calls to + * schedule_timeout_interruptible() and friends. + */ +int torture_hrtimeout_jiffies(u32 baset_j, struct torture_random_state *trsp) +{ + ktime_t baset_ns = jiffies_to_nsecs(baset_j); + + return torture_hrtimeout_ns(baset_ns, jiffies_to_nsecs(1), trsp); +} +EXPORT_SYMBOL_GPL(torture_hrtimeout_jiffies); + +/* + * Schedule a high-resolution-timer sleep in milliseconds, with a 32-bit + * millisecond (not second!) random fuzz. + */ +int torture_hrtimeout_s(u32 baset_s, u32 fuzzt_ms, struct torture_random_state *trsp) +{ + ktime_t baset_ns = baset_s * NSEC_PER_SEC; + u32 fuzzt_ns; + + if ((u32)~0U / NSEC_PER_MSEC < fuzzt_ms) + fuzzt_ns = (u32)~0U; + else + fuzzt_ns = fuzzt_ms * NSEC_PER_MSEC; + return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp); +} +EXPORT_SYMBOL_GPL(torture_hrtimeout_s); + #ifdef CONFIG_HOTPLUG_CPU /* @@ -80,6 +175,19 @@ static unsigned long sum_online; static int min_online = -1; static int max_online; +static int torture_online_cpus = NR_CPUS; + +/* + * Some torture testing leverages confusion as to the number of online + * CPUs. This function returns the torture-testing view of this number, + * which allows torture tests to load-balance appropriately. + */ +int torture_num_online_cpus(void) +{ + return READ_ONCE(torture_online_cpus); +} +EXPORT_SYMBOL_GPL(torture_num_online_cpus); + /* * Attempt to take a CPU offline. Return false if the CPU is already * offline or if it is not subject to CPU-hotplug operations. The @@ -134,6 +242,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, *min_offl = delta; if (*max_offl < delta) *max_offl = delta; + WRITE_ONCE(torture_online_cpus, torture_online_cpus - 1); + WARN_ON_ONCE(torture_online_cpus <= 0); } return true; @@ -190,6 +300,7 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, *min_onl = delta; if (*max_onl < delta) *max_onl = delta; + WRITE_ONCE(torture_online_cpus, torture_online_cpus + 1); } return true; @@ -197,6 +308,26 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, EXPORT_SYMBOL_GPL(torture_online); /* + * Get everything online at the beginning and ends of tests. + */ +static void torture_online_all(char *phase) +{ + int cpu; + int ret; + + for_each_possible_cpu(cpu) { + if (cpu_online(cpu)) + continue; + ret = add_cpu(cpu); + if (ret && verbose) { + pr_alert("%s" TORTURE_FLAG + "%s: %s online %d: errno %d\n", + __func__, phase, torture_type, cpu, ret); + } + } +} + +/* * Execute random CPU-hotplug operations at the interval specified * by the onoff_interval. */ @@ -206,25 +337,12 @@ torture_onoff(void *arg) int cpu; int maxcpu = -1; DEFINE_TORTURE_RANDOM(rand); - int ret; VERBOSE_TOROUT_STRING("torture_onoff task started"); for_each_online_cpu(cpu) maxcpu = cpu; WARN_ON(maxcpu < 0); - if (!IS_MODULE(CONFIG_TORTURE_TEST)) { - for_each_possible_cpu(cpu) { - if (cpu_online(cpu)) - continue; - ret = add_cpu(cpu); - if (ret && verbose) { - pr_alert("%s" TORTURE_FLAG - "%s: Initial online %d: errno %d\n", - __func__, torture_type, cpu, ret); - } - } - } - + torture_online_all("Initial"); if (maxcpu == 0) { VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled"); goto stop; @@ -252,6 +370,7 @@ torture_onoff(void *arg) stop: torture_kthread_stopping("torture_onoff"); + torture_online_all("Final"); return 0; } @@ -602,7 +721,6 @@ static int stutter_gap; */ bool stutter_wait(const char *title) { - ktime_t delay; unsigned int i = 0; bool ret = false; int spt; @@ -618,11 +736,8 @@ bool stutter_wait(const char *title) schedule_timeout_interruptible(1); } else if (spt == 2) { while (READ_ONCE(stutter_pause_test)) { - if (!(i++ & 0xffff)) { - set_current_state(TASK_INTERRUPTIBLE); - delay = 10 * NSEC_PER_USEC; - schedule_hrtimeout(&delay, HRTIMER_MODE_REL); - } + if (!(i++ & 0xffff)) + torture_hrtimeout_us(10, 0, NULL); cond_resched(); } } else { @@ -640,7 +755,6 @@ EXPORT_SYMBOL_GPL(stutter_wait); */ static int torture_stutter(void *arg) { - ktime_t delay; DEFINE_TORTURE_RANDOM(rand); int wtime; @@ -651,20 +765,15 @@ static int torture_stutter(void *arg) if (stutter > 2) { WRITE_ONCE(stutter_pause_test, 1); wtime = stutter - 3; - delay = ktime_divns(NSEC_PER_SEC * wtime, HZ); - delay += (torture_random(&rand) >> 3) % NSEC_PER_MSEC; - set_current_state(TASK_INTERRUPTIBLE); - schedule_hrtimeout(&delay, HRTIMER_MODE_REL); + torture_hrtimeout_jiffies(wtime, &rand); wtime = 2; } WRITE_ONCE(stutter_pause_test, 2); - delay = ktime_divns(NSEC_PER_SEC * wtime, HZ); - set_current_state(TASK_INTERRUPTIBLE); - schedule_hrtimeout(&delay, HRTIMER_MODE_REL); + torture_hrtimeout_jiffies(wtime, NULL); } WRITE_ONCE(stutter_pause_test, 0); if (!torture_must_stop()) - schedule_timeout_interruptible(stutter_gap); + torture_hrtimeout_jiffies(stutter_gap, NULL); torture_shutdown_absorb("torture_stutter"); } while (!torture_must_stop()); torture_kthread_stopping("torture_stutter"); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index c1a62ae7e812..61cefb157b59 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -545,7 +545,7 @@ config KPROBE_EVENTS_ON_NOTRACE using kprobe events. If kprobes can use ftrace instead of breakpoint, ftrace related - functions are protected from kprobe-events to prevent an infinit + functions are protected from kprobe-events to prevent an infinite recursion or any unexpected execution path which leads to a kernel crash. @@ -602,6 +602,22 @@ config FTRACE_MCOUNT_RECORD depends on DYNAMIC_FTRACE depends on HAVE_FTRACE_MCOUNT_RECORD +config FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY + bool + depends on FTRACE_MCOUNT_RECORD + +config FTRACE_MCOUNT_USE_CC + def_bool y + depends on $(cc-option,-mrecord-mcount) + depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY + depends on FTRACE_MCOUNT_RECORD + +config FTRACE_MCOUNT_USE_RECORDMCOUNT + def_bool y + depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY + depends on !FTRACE_MCOUNT_USE_CC + depends on FTRACE_MCOUNT_RECORD + config TRACING_MAP bool depends on ARCH_HAVE_NMI_SAFE_CMPXCHG @@ -886,6 +902,10 @@ config PREEMPTIRQ_DELAY_TEST irq-disabled critical sections for 500us: modprobe preemptirq_delay_test test_mode=irq delay=500 burst_size=3 + What's more, if you want to attach the test on the cpu which the latency + tracer is running on, specify cpu_affinity=cpu_num at the end of the + command. + If unsure, say N config SYNTH_EVENT_GEN_TEST diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index fb0fe4c66b84..c286c13bd31a 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -72,17 +72,17 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, struct blk_io_trace *t; struct ring_buffer_event *event = NULL; struct trace_buffer *buffer = NULL; - int pc = 0; + unsigned int trace_ctx = 0; int cpu = smp_processor_id(); bool blk_tracer = blk_tracer_enabled; ssize_t cgid_len = cgid ? sizeof(cgid) : 0; if (blk_tracer) { buffer = blk_tr->array_buffer.buffer; - pc = preempt_count(); + trace_ctx = tracing_gen_ctx_flags(0); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + len + cgid_len, - 0, pc); + trace_ctx); if (!event) return; t = ring_buffer_event_data(event); @@ -107,7 +107,7 @@ record_it: memcpy((void *) t + sizeof(*t) + cgid_len, data, len); if (blk_tracer) - trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); + trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); } } @@ -222,8 +222,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, struct blk_io_trace *t; unsigned long flags = 0; unsigned long *sequence; + unsigned int trace_ctx = 0; pid_t pid; - int cpu, pc = 0; + int cpu; bool blk_tracer = blk_tracer_enabled; ssize_t cgid_len = cgid ? sizeof(cgid) : 0; @@ -252,10 +253,10 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, tracing_record_cmdline(current); buffer = blk_tr->array_buffer.buffer; - pc = preempt_count(); + trace_ctx = tracing_gen_ctx_flags(0); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + pdu_len + cgid_len, - 0, pc); + trace_ctx); if (!event) return; t = ring_buffer_event_data(event); @@ -301,7 +302,7 @@ record_it: memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); if (blk_tracer) { - trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); + trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); return; } } @@ -903,7 +904,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio) { - blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BOUNCE, 0); + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BOUNCE, 0); } static void blk_add_trace_bio_complete(void *ignore, @@ -915,22 +916,24 @@ static void blk_add_trace_bio_complete(void *ignore, static void blk_add_trace_bio_backmerge(void *ignore, struct bio *bio) { - blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BACKMERGE, 0); + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BACKMERGE, + 0); } static void blk_add_trace_bio_frontmerge(void *ignore, struct bio *bio) { - blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_FRONTMERGE, 0); + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_FRONTMERGE, + 0); } static void blk_add_trace_bio_queue(void *ignore, struct bio *bio) { - blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_QUEUE, 0); + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_QUEUE, 0); } static void blk_add_trace_getrq(void *ignore, struct bio *bio) { - blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_GETRQ, 0); + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_GETRQ, 0); } static void blk_add_trace_plug(void *ignore, struct request_queue *q) @@ -967,7 +970,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) { - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct blk_trace *bt; rcu_read_lock(); @@ -997,7 +1000,7 @@ static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev, sector_t from) { - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct blk_trace *bt; struct blk_io_trace_remap r; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6c0018abe68a..b0c45d923f0f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -96,9 +96,6 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { unsigned int ret; - if (in_nmi()) /* not supported yet */ - return 1; - cant_sleep(); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { @@ -1191,6 +1188,10 @@ BTF_SET_END(btf_allowlist_d_path) static bool bpf_d_path_allowed(const struct bpf_prog *prog) { + if (prog->type == BPF_PROG_TYPE_TRACING && + prog->expected_attach_type == BPF_TRACE_ITER) + return true; + if (prog->type == BPF_PROG_TYPE_LSM) return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id); @@ -1760,6 +1761,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_delete_tracing_proto; case BPF_FUNC_sock_from_file: return &bpf_sock_from_file_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_ptr_cookie_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c index 312d1a0ca3b6..8c4ffd076162 100644 --- a/kernel/trace/preemptirq_delay_test.c +++ b/kernel/trace/preemptirq_delay_test.c @@ -21,13 +21,16 @@ static ulong delay = 100; static char test_mode[12] = "irq"; static uint burst_size = 1; +static int cpu_affinity = -1; module_param_named(delay, delay, ulong, 0444); module_param_string(test_mode, test_mode, 12, 0444); module_param_named(burst_size, burst_size, uint, 0444); +module_param_named(cpu_affinity, cpu_affinity, int, 0444); MODULE_PARM_DESC(delay, "Period in microseconds (100 us default)"); MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt, irq, or alternate (default irq)"); MODULE_PARM_DESC(burst_size, "The size of a burst (default 1)"); +MODULE_PARM_DESC(cpu_affinity, "Cpu num test is running on"); static struct completion done; @@ -36,7 +39,9 @@ static struct completion done; static void busy_wait(ulong time) { u64 start, end; + start = trace_clock_local(); + do { end = trace_clock_local(); if (kthread_should_stop()) @@ -47,6 +52,7 @@ static void busy_wait(ulong time) static __always_inline void irqoff_test(void) { unsigned long flags; + local_irq_save(flags); busy_wait(delay); local_irq_restore(flags); @@ -113,6 +119,14 @@ static int preemptirq_delay_run(void *data) { int i; int s = MIN(burst_size, NR_TEST_FUNCS); + struct cpumask cpu_mask; + + if (cpu_affinity > -1) { + cpumask_clear(&cpu_mask); + cpumask_set_cpu(cpu_affinity, &cpu_mask); + if (set_cpus_allowed_ptr(current, &cpu_mask)) + pr_err("cpu_affinity:%d, failed\n", cpu_affinity); + } for (i = 0; i < s; i++) (testfuncs[i])(i); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ec08f948dd80..b9dad3500041 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1112,8 +1112,7 @@ static struct list_head *rb_list_head(struct list_head *list) * its flags will be non zero. */ static inline int -rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page *page, struct list_head *list) +rb_is_head_page(struct buffer_page *page, struct list_head *list) { unsigned long val; @@ -1142,8 +1141,7 @@ static bool rb_is_reader_page(struct buffer_page *page) /* * rb_set_list_to_head - set a list_head to be pointing to head. */ -static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer, - struct list_head *list) +static void rb_set_list_to_head(struct list_head *list) { unsigned long *ptr; @@ -1166,7 +1164,7 @@ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer) /* * Set the previous list pointer to have the HEAD flag. */ - rb_set_list_to_head(cpu_buffer, head->list.prev); + rb_set_list_to_head(head->list.prev); } static void rb_list_head_clear(struct list_head *list) @@ -1241,8 +1239,7 @@ static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer, old_flag, RB_PAGE_NORMAL); } -static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page **bpage) +static inline void rb_inc_page(struct buffer_page **bpage) { struct list_head *p = rb_list_head((*bpage)->list.next); @@ -1274,11 +1271,11 @@ rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer) */ for (i = 0; i < 3; i++) { do { - if (rb_is_head_page(cpu_buffer, page, page->list.prev)) { + if (rb_is_head_page(page, page->list.prev)) { cpu_buffer->head_page = page; return page; } - rb_inc_page(cpu_buffer, &page); + rb_inc_page(&page); } while (page != head); } @@ -1824,7 +1821,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) cond_resched(); to_remove_page = tmp_iter_page; - rb_inc_page(cpu_buffer, &tmp_iter_page); + rb_inc_page(&tmp_iter_page); /* update the counters */ page_entries = rb_page_entries(to_remove_page); @@ -2062,10 +2059,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, put_online_cpus(); } else { - /* Make sure this CPU has been initialized */ - if (!cpumask_test_cpu(cpu_id, buffer->cpumask)) - goto out; - cpu_buffer = buffer->buffers[cpu_id]; if (nr_pages == cpu_buffer->nr_pages) @@ -2271,7 +2264,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) if (iter->head_page == cpu_buffer->reader_page) iter->head_page = rb_set_head_page(cpu_buffer); else - rb_inc_page(cpu_buffer, &iter->head_page); + rb_inc_page(&iter->head_page); iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp; iter->head = 0; @@ -2374,7 +2367,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, * want the outer most commit to reset it. */ new_head = next_page; - rb_inc_page(cpu_buffer, &new_head); + rb_inc_page(&new_head); ret = rb_head_page_set_head(cpu_buffer, new_head, next_page, RB_PAGE_NORMAL); @@ -2526,7 +2519,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, next_page = tail_page; - rb_inc_page(cpu_buffer, &next_page); + rb_inc_page(&next_page); /* * If for some reason, we had an interrupt storm that made @@ -2552,7 +2545,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, * the buffer, unless the commit page is still on the * reader page. */ - if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) { + if (rb_is_head_page(next_page, &tail_page->list)) { /* * If the commit is not on the reader page, then @@ -2583,7 +2576,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, * have filled up the buffer with events * from interrupts and such, and wrapped. * - * Note, if the tail page is also the on the + * Note, if the tail page is also on the * reader_page, we let it move out. */ if (unlikely((cpu_buffer->commit_page != @@ -2879,7 +2872,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) return; local_set(&cpu_buffer->commit_page->page->commit, rb_page_write(cpu_buffer->commit_page)); - rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); + rb_inc_page(&cpu_buffer->commit_page); /* add barrier to keep gcc from optimizing too much */ barrier(); } @@ -3638,14 +3631,14 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer, * Because the commit page may be on the reader page we * start with the next page and check the end loop there. */ - rb_inc_page(cpu_buffer, &bpage); + rb_inc_page(&bpage); start = bpage; do { if (bpage->page == (void *)addr) { local_dec(&bpage->entries); return; } - rb_inc_page(cpu_buffer, &bpage); + rb_inc_page(&bpage); } while (bpage != start); /* commit not part of this buffer?? */ @@ -4367,7 +4360,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->pages = reader->list.prev; /* The reader page will be pointing to the new head */ - rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list); + rb_set_list_to_head(&cpu_buffer->reader_page->list); /* * We want to make sure we read the overruns after we set up our @@ -4406,7 +4399,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) * Now make the new head point back to the reader page. */ rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list; - rb_inc_page(cpu_buffer, &cpu_buffer->head_page); + rb_inc_page(&cpu_buffer->head_page); local_inc(&cpu_buffer->pages_read); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b8a2d786b503..e295c413580e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -176,7 +176,7 @@ static union trace_eval_map_item *trace_eval_maps; int tracing_set_tracer(struct trace_array *tr, const char *buf); static void ftrace_trace_userstack(struct trace_array *tr, struct trace_buffer *buffer, - unsigned long flags, int pc); + unsigned int trace_ctx); #define MAX_TRACER_SIZE 100 static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; @@ -408,7 +408,8 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export); TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | \ TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \ TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \ - TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS) + TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | \ + TRACE_ITER_HASH_PTR) /* trace_options that are only supported by global_trace */ #define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \ @@ -454,6 +455,7 @@ static void __trace_array_put(struct trace_array *this_tr) /** * trace_array_put - Decrement the reference counter for this trace array. + * @this_tr : pointer to the trace array * * NOTE: Use this when we no longer need the trace array returned by * trace_array_get_by_name(). This ensures the trace array can be later @@ -530,6 +532,7 @@ trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) /** * trace_ignore_this_task - should a task be ignored for tracing * @filtered_pids: The list of pids to check + * @filtered_no_pids: The list of pids not to be traced * @task: The task that should be ignored if not filtered * * Checks if @task should be traced or not from @filtered_pids. @@ -780,7 +783,7 @@ u64 ftrace_now(int cpu) } /** - * tracing_is_enabled - Show if global_trace has been disabled + * tracing_is_enabled - Show if global_trace has been enabled * * Shows if the global trace has been enabled or not. It uses the * mirror flag "buffer_disabled" to be used in fast paths such as for @@ -905,23 +908,23 @@ static inline void trace_access_lock_init(void) #ifdef CONFIG_STACKTRACE static void __ftrace_trace_stack(struct trace_buffer *buffer, - unsigned long flags, - int skip, int pc, struct pt_regs *regs); + unsigned int trace_ctx, + int skip, struct pt_regs *regs); static inline void ftrace_trace_stack(struct trace_array *tr, struct trace_buffer *buffer, - unsigned long flags, - int skip, int pc, struct pt_regs *regs); + unsigned int trace_ctx, + int skip, struct pt_regs *regs); #else static inline void __ftrace_trace_stack(struct trace_buffer *buffer, - unsigned long flags, - int skip, int pc, struct pt_regs *regs) + unsigned int trace_ctx, + int skip, struct pt_regs *regs) { } static inline void ftrace_trace_stack(struct trace_array *tr, struct trace_buffer *buffer, - unsigned long flags, - int skip, int pc, struct pt_regs *regs) + unsigned long trace_ctx, + int skip, struct pt_regs *regs) { } @@ -929,24 +932,24 @@ static inline void ftrace_trace_stack(struct trace_array *tr, static __always_inline void trace_event_setup(struct ring_buffer_event *event, - int type, unsigned long flags, int pc) + int type, unsigned int trace_ctx) { struct trace_entry *ent = ring_buffer_event_data(event); - tracing_generic_entry_update(ent, type, flags, pc); + tracing_generic_entry_update(ent, type, trace_ctx); } static __always_inline struct ring_buffer_event * __trace_buffer_lock_reserve(struct trace_buffer *buffer, int type, unsigned long len, - unsigned long flags, int pc) + unsigned int trace_ctx) { struct ring_buffer_event *event; event = ring_buffer_lock_reserve(buffer, len); if (event != NULL) - trace_event_setup(event, type, flags, pc); + trace_event_setup(event, type, trace_ctx); return event; } @@ -1007,25 +1010,22 @@ int __trace_puts(unsigned long ip, const char *str, int size) struct ring_buffer_event *event; struct trace_buffer *buffer; struct print_entry *entry; - unsigned long irq_flags; + unsigned int trace_ctx; int alloc; - int pc; if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) return 0; - pc = preempt_count(); - if (unlikely(tracing_selftest_running || tracing_disabled)) return 0; alloc = sizeof(*entry) + size + 2; /* possible \n added */ - local_save_flags(irq_flags); + trace_ctx = tracing_gen_ctx(); buffer = global_trace.array_buffer.buffer; ring_buffer_nest_start(buffer); - event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, - irq_flags, pc); + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, + trace_ctx); if (!event) { size = 0; goto out; @@ -1044,7 +1044,7 @@ int __trace_puts(unsigned long ip, const char *str, int size) entry->buf[size] = '\0'; __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL); + ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL); out: ring_buffer_nest_end(buffer); return size; @@ -1061,25 +1061,22 @@ int __trace_bputs(unsigned long ip, const char *str) struct ring_buffer_event *event; struct trace_buffer *buffer; struct bputs_entry *entry; - unsigned long irq_flags; + unsigned int trace_ctx; int size = sizeof(struct bputs_entry); int ret = 0; - int pc; if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) return 0; - pc = preempt_count(); - if (unlikely(tracing_selftest_running || tracing_disabled)) return 0; - local_save_flags(irq_flags); + trace_ctx = tracing_gen_ctx(); buffer = global_trace.array_buffer.buffer; ring_buffer_nest_start(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, - irq_flags, pc); + trace_ctx); if (!event) goto out; @@ -1088,7 +1085,7 @@ int __trace_bputs(unsigned long ip, const char *str) entry->str = str; __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL); + ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL); ret = 1; out: @@ -2584,36 +2581,34 @@ enum print_line_t trace_handle_return(struct trace_seq *s) } EXPORT_SYMBOL_GPL(trace_handle_return); -void -tracing_generic_entry_update(struct trace_entry *entry, unsigned short type, - unsigned long flags, int pc) +unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) { - struct task_struct *tsk = current; + unsigned int trace_flags = irqs_status; + unsigned int pc; - entry->preempt_count = pc & 0xff; - entry->pid = (tsk) ? tsk->pid : 0; - entry->type = type; - entry->flags = -#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT - (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | -#else - TRACE_FLAG_IRQS_NOSUPPORT | -#endif - ((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) | - ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | - ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) | - (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | - (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0); + pc = preempt_count(); + + if (pc & NMI_MASK) + trace_flags |= TRACE_FLAG_NMI; + if (pc & HARDIRQ_MASK) + trace_flags |= TRACE_FLAG_HARDIRQ; + if (in_serving_softirq()) + trace_flags |= TRACE_FLAG_SOFTIRQ; + + if (tif_need_resched()) + trace_flags |= TRACE_FLAG_NEED_RESCHED; + if (test_preempt_need_resched()) + trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; + return (trace_flags << 16) | (pc & 0xff); } -EXPORT_SYMBOL_GPL(tracing_generic_entry_update); struct ring_buffer_event * trace_buffer_lock_reserve(struct trace_buffer *buffer, int type, unsigned long len, - unsigned long flags, int pc) + unsigned int trace_ctx) { - return __trace_buffer_lock_reserve(buffer, type, len, flags, pc); + return __trace_buffer_lock_reserve(buffer, type, len, trace_ctx); } DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); @@ -2733,7 +2728,7 @@ struct ring_buffer_event * trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, struct trace_event_file *trace_file, int type, unsigned long len, - unsigned long flags, int pc) + unsigned int trace_ctx) { struct ring_buffer_event *entry; int val; @@ -2745,16 +2740,16 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, (entry = this_cpu_read(trace_buffered_event))) { /* Try to use the per cpu buffer first */ val = this_cpu_inc_return(trace_buffered_event_cnt); - if (val == 1) { - trace_event_setup(entry, type, flags, pc); + if ((len < (PAGE_SIZE - sizeof(*entry))) && val == 1) { + trace_event_setup(entry, type, trace_ctx); entry->array[0] = len; return entry; } this_cpu_dec(trace_buffered_event_cnt); } - entry = __trace_buffer_lock_reserve(*current_rb, - type, len, flags, pc); + entry = __trace_buffer_lock_reserve(*current_rb, type, len, + trace_ctx); /* * If tracing is off, but we have triggers enabled * we still need to look at the event data. Use the temp_buffer @@ -2763,8 +2758,8 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, */ if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) { *current_rb = temp_buffer; - entry = __trace_buffer_lock_reserve(*current_rb, - type, len, flags, pc); + entry = __trace_buffer_lock_reserve(*current_rb, type, len, + trace_ctx); } return entry; } @@ -2850,7 +2845,7 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) ftrace_exports(fbuffer->event, TRACE_EXPORT_EVENT); event_trigger_unlock_commit_regs(fbuffer->trace_file, fbuffer->buffer, fbuffer->event, fbuffer->entry, - fbuffer->flags, fbuffer->pc, fbuffer->regs); + fbuffer->trace_ctx, fbuffer->regs); } EXPORT_SYMBOL_GPL(trace_event_buffer_commit); @@ -2866,7 +2861,7 @@ EXPORT_SYMBOL_GPL(trace_event_buffer_commit); void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct trace_buffer *buffer, struct ring_buffer_event *event, - unsigned long flags, int pc, + unsigned int trace_ctx, struct pt_regs *regs) { __buffer_unlock_commit(buffer, event); @@ -2877,8 +2872,8 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, * and mmiotrace, but that's ok if they lose a function or * two. They are not that meaningful. */ - ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs); - ftrace_trace_userstack(tr, buffer, flags, pc); + ftrace_trace_stack(tr, buffer, trace_ctx, regs ? 0 : STACK_SKIP, regs); + ftrace_trace_userstack(tr, buffer, trace_ctx); } /* @@ -2892,9 +2887,8 @@ trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, } void -trace_function(struct trace_array *tr, - unsigned long ip, unsigned long parent_ip, unsigned long flags, - int pc) +trace_function(struct trace_array *tr, unsigned long ip, unsigned long + parent_ip, unsigned int trace_ctx) { struct trace_event_call *call = &event_function; struct trace_buffer *buffer = tr->array_buffer.buffer; @@ -2902,7 +2896,7 @@ trace_function(struct trace_array *tr, struct ftrace_entry *entry; event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), - flags, pc); + trace_ctx); if (!event) return; entry = ring_buffer_event_data(event); @@ -2936,8 +2930,8 @@ static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks); static DEFINE_PER_CPU(int, ftrace_stack_reserve); static void __ftrace_trace_stack(struct trace_buffer *buffer, - unsigned long flags, - int skip, int pc, struct pt_regs *regs) + unsigned int trace_ctx, + int skip, struct pt_regs *regs) { struct trace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; @@ -2984,7 +2978,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, size = nr_entries * sizeof(unsigned long); event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, - sizeof(*entry) + size, flags, pc); + sizeof(*entry) + size, trace_ctx); if (!event) goto out; entry = ring_buffer_event_data(event); @@ -3005,22 +2999,22 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, static inline void ftrace_trace_stack(struct trace_array *tr, struct trace_buffer *buffer, - unsigned long flags, - int skip, int pc, struct pt_regs *regs) + unsigned int trace_ctx, + int skip, struct pt_regs *regs) { if (!(tr->trace_flags & TRACE_ITER_STACKTRACE)) return; - __ftrace_trace_stack(buffer, flags, skip, pc, regs); + __ftrace_trace_stack(buffer, trace_ctx, skip, regs); } -void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, - int pc) +void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, + int skip) { struct trace_buffer *buffer = tr->array_buffer.buffer; if (rcu_is_watching()) { - __ftrace_trace_stack(buffer, flags, skip, pc, NULL); + __ftrace_trace_stack(buffer, trace_ctx, skip, NULL); return; } @@ -3034,7 +3028,7 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, return; rcu_irq_enter_irqson(); - __ftrace_trace_stack(buffer, flags, skip, pc, NULL); + __ftrace_trace_stack(buffer, trace_ctx, skip, NULL); rcu_irq_exit_irqson(); } @@ -3044,19 +3038,15 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, */ void trace_dump_stack(int skip) { - unsigned long flags; - if (tracing_disabled || tracing_selftest_running) return; - local_save_flags(flags); - #ifndef CONFIG_UNWINDER_ORC /* Skip 1 to skip this function. */ skip++; #endif __ftrace_trace_stack(global_trace.array_buffer.buffer, - flags, skip, preempt_count(), NULL); + tracing_gen_ctx(), skip, NULL); } EXPORT_SYMBOL_GPL(trace_dump_stack); @@ -3065,7 +3055,7 @@ static DEFINE_PER_CPU(int, user_stack_count); static void ftrace_trace_userstack(struct trace_array *tr, - struct trace_buffer *buffer, unsigned long flags, int pc) + struct trace_buffer *buffer, unsigned int trace_ctx) { struct trace_event_call *call = &event_user_stack; struct ring_buffer_event *event; @@ -3092,7 +3082,7 @@ ftrace_trace_userstack(struct trace_array *tr, __this_cpu_inc(user_stack_count); event = __trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, - sizeof(*entry), flags, pc); + sizeof(*entry), trace_ctx); if (!event) goto out_drop_count; entry = ring_buffer_event_data(event); @@ -3112,7 +3102,7 @@ ftrace_trace_userstack(struct trace_array *tr, #else /* CONFIG_USER_STACKTRACE_SUPPORT */ static void ftrace_trace_userstack(struct trace_array *tr, struct trace_buffer *buffer, - unsigned long flags, int pc) + unsigned int trace_ctx) { } #endif /* !CONFIG_USER_STACKTRACE_SUPPORT */ @@ -3242,9 +3232,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) struct trace_buffer *buffer; struct trace_array *tr = &global_trace; struct bprint_entry *entry; - unsigned long flags; + unsigned int trace_ctx; char *tbuffer; - int len = 0, size, pc; + int len = 0, size; if (unlikely(tracing_selftest_running || tracing_disabled)) return 0; @@ -3252,7 +3242,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) /* Don't pollute graph traces with trace_vprintk internals */ pause_graph_tracing(); - pc = preempt_count(); + trace_ctx = tracing_gen_ctx(); preempt_disable_notrace(); tbuffer = get_trace_buf(); @@ -3266,12 +3256,11 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) goto out_put; - local_save_flags(flags); size = sizeof(*entry) + sizeof(u32) * len; buffer = tr->array_buffer.buffer; ring_buffer_nest_start(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, - flags, pc); + trace_ctx); if (!event) goto out; entry = ring_buffer_event_data(event); @@ -3281,7 +3270,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) memcpy(entry->buf, tbuffer, sizeof(u32) * len); if (!call_filter_check_discard(call, entry, buffer, event)) { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL); + ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); } out: @@ -3304,9 +3293,9 @@ __trace_array_vprintk(struct trace_buffer *buffer, { struct trace_event_call *call = &event_print; struct ring_buffer_event *event; - int len = 0, size, pc; + int len = 0, size; struct print_entry *entry; - unsigned long flags; + unsigned int trace_ctx; char *tbuffer; if (tracing_disabled || tracing_selftest_running) @@ -3315,7 +3304,7 @@ __trace_array_vprintk(struct trace_buffer *buffer, /* Don't pollute graph traces with trace_vprintk internals */ pause_graph_tracing(); - pc = preempt_count(); + trace_ctx = tracing_gen_ctx(); preempt_disable_notrace(); @@ -3327,11 +3316,10 @@ __trace_array_vprintk(struct trace_buffer *buffer, len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); - local_save_flags(flags); size = sizeof(*entry) + len + 1; ring_buffer_nest_start(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, - flags, pc); + trace_ctx); if (!event) goto out; entry = ring_buffer_event_data(event); @@ -3340,7 +3328,7 @@ __trace_array_vprintk(struct trace_buffer *buffer, memcpy(&entry->buf, tbuffer, len + 1); if (!call_filter_check_discard(call, entry, buffer, event)) { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL); + ftrace_trace_stack(&global_trace, buffer, trace_ctx, 6, NULL); } out: @@ -3543,6 +3531,65 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, return next; } +#define STATIC_FMT_BUF_SIZE 128 +static char static_fmt_buf[STATIC_FMT_BUF_SIZE]; + +static char *trace_iter_expand_format(struct trace_iterator *iter) +{ + char *tmp; + + if (iter->fmt == static_fmt_buf) + return NULL; + + tmp = krealloc(iter->fmt, iter->fmt_size + STATIC_FMT_BUF_SIZE, + GFP_KERNEL); + if (tmp) { + iter->fmt_size += STATIC_FMT_BUF_SIZE; + iter->fmt = tmp; + } + + return tmp; +} + +const char *trace_event_format(struct trace_iterator *iter, const char *fmt) +{ + const char *p, *new_fmt; + char *q; + + if (WARN_ON_ONCE(!fmt)) + return fmt; + + if (iter->tr->trace_flags & TRACE_ITER_HASH_PTR) + return fmt; + + p = fmt; + new_fmt = q = iter->fmt; + while (*p) { + if (unlikely(q - new_fmt + 3 > iter->fmt_size)) { + if (!trace_iter_expand_format(iter)) + return fmt; + + q += iter->fmt - new_fmt; + new_fmt = iter->fmt; + } + + *q++ = *p++; + + /* Replace %p with %px */ + if (p[-1] == '%') { + if (p[0] == '%') { + *q++ = *p++; + } else if (p[0] == 'p' && !isalnum(p[1])) { + *q++ = *p++; + *q++ = 'x'; + } + } + } + *q = '\0'; + + return new_fmt; +} + #define STATIC_TEMP_BUF_SIZE 128 static char static_temp_buf[STATIC_TEMP_BUF_SIZE] __aligned(4); @@ -4336,6 +4383,16 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) iter->temp_size = 128; /* + * trace_event_printf() may need to modify given format + * string to replace %p with %px so that it shows real address + * instead of hash value. However, that is only for the event + * tracing, other tracer may not need. Defer the allocation + * until it is needed. + */ + iter->fmt = NULL; + iter->fmt_size = 0; + + /* * We make a copy of the current tracer to avoid concurrent * changes on it while we are reading. */ @@ -4486,6 +4543,7 @@ static int tracing_release(struct inode *inode, struct file *file) mutex_destroy(&iter->mutex); free_cpumask_var(iter->started); + kfree(iter->fmt); kfree(iter->temp); kfree(iter->trace); kfree(iter->buffer_iter); @@ -6653,7 +6711,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, enum event_trigger_type tt = ETT_NONE; struct trace_buffer *buffer; struct print_entry *entry; - unsigned long irq_flags; ssize_t written; int size; int len; @@ -6673,7 +6730,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); - local_save_flags(irq_flags); size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */ /* If less than "<faulted>", then make sure we can still add that */ @@ -6682,7 +6738,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, buffer = tr->array_buffer.buffer; event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, - irq_flags, preempt_count()); + tracing_gen_ctx()); if (unlikely(!event)) /* Ring buffer disabled, return as if not open for write */ return -EBADF; @@ -6734,7 +6790,6 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, struct ring_buffer_event *event; struct trace_buffer *buffer; struct raw_data_entry *entry; - unsigned long irq_flags; ssize_t written; int size; int len; @@ -6756,14 +6811,13 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); - local_save_flags(irq_flags); size = sizeof(*entry) + cnt; if (cnt < FAULT_SIZE_ID) size += FAULT_SIZE_ID - cnt; buffer = tr->array_buffer.buffer; event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size, - irq_flags, preempt_count()); + tracing_gen_ctx()); if (!event) /* Ring buffer disabled, return as if not open for write */ return -EBADF; @@ -9348,9 +9402,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) /* Simulate the iterator */ trace_init_global_iter(&iter); - /* Can not use kmalloc for iter.temp */ + /* Can not use kmalloc for iter.temp and iter.fmt */ iter.temp = static_temp_buf; iter.temp_size = STATIC_TEMP_BUF_SIZE; + iter.fmt = static_fmt_buf; + iter.fmt_size = STATIC_FMT_BUF_SIZE; for_each_tracing_cpu(cpu) { atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); @@ -9429,30 +9485,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) } EXPORT_SYMBOL_GPL(ftrace_dump); -int trace_run_command(const char *buf, int (*createfn)(int, char **)) -{ - char **argv; - int argc, ret; - - argc = 0; - ret = 0; - argv = argv_split(GFP_KERNEL, buf, &argc); - if (!argv) - return -ENOMEM; - - if (argc) - ret = createfn(argc, argv); - - argv_free(argv); - - return ret; -} - #define WRITE_BUFSIZE 4096 ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, size_t count, loff_t *ppos, - int (*createfn)(int, char **)) + int (*createfn)(const char *)) { char *kbuf, *buf, *tmp; int ret = 0; @@ -9500,7 +9537,7 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, if (tmp) *tmp = '\0'; - ret = trace_run_command(buf, createfn); + ret = createfn(buf); if (ret) goto out; buf += size; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e448d2da0b99..dec13ff66077 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -136,25 +136,6 @@ struct kretprobe_trace_entry_head { unsigned long ret_ip; }; -/* - * trace_flag_type is an enumeration that holds different - * states when a trace occurs. These are: - * IRQS_OFF - interrupts were disabled - * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags - * NEED_RESCHED - reschedule is requested - * HARDIRQ - inside an interrupt handler - * SOFTIRQ - inside a softirq handler - */ -enum trace_flag_type { - TRACE_FLAG_IRQS_OFF = 0x01, - TRACE_FLAG_IRQS_NOSUPPORT = 0x02, - TRACE_FLAG_NEED_RESCHED = 0x04, - TRACE_FLAG_HARDIRQ = 0x08, - TRACE_FLAG_SOFTIRQ = 0x10, - TRACE_FLAG_PREEMPT_RESCHED = 0x20, - TRACE_FLAG_NMI = 0x40, -}; - #define TRACE_BUF_SIZE 1024 struct trace_array; @@ -589,8 +570,7 @@ struct ring_buffer_event * trace_buffer_lock_reserve(struct trace_buffer *buffer, int type, unsigned long len, - unsigned long flags, - int pc); + unsigned int trace_ctx); struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); @@ -601,6 +581,8 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, struct ring_buffer_event *event); +const char *trace_event_format(struct trace_iterator *iter, const char *fmt); + int trace_empty(struct trace_iterator *iter); void *trace_find_next_entry_inc(struct trace_iterator *iter); @@ -615,11 +597,11 @@ unsigned long trace_total_entries(struct trace_array *tr); void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, - unsigned long flags, int pc); + unsigned int trace_ctx); void trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, - unsigned long flags, int pc); + unsigned int trace_ctx); void trace_latency_header(struct seq_file *m); void trace_default_header(struct seq_file *m); void print_trace_header(struct seq_file *m, struct trace_iterator *iter); @@ -687,11 +669,10 @@ static inline void latency_fsnotify(struct trace_array *tr) { } #endif #ifdef CONFIG_STACKTRACE -void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, - int pc); +void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, int skip); #else -static inline void __trace_stack(struct trace_array *tr, unsigned long flags, - int skip, int pc) +static inline void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, + int skip) { } #endif /* CONFIG_STACKTRACE */ @@ -831,10 +812,10 @@ extern void graph_trace_open(struct trace_iterator *iter); extern void graph_trace_close(struct trace_iterator *iter); extern int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, - unsigned long flags, int pc); + unsigned int trace_ctx); extern void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, - unsigned long flags, int pc); + unsigned int trace_ctx); #ifdef CONFIG_DYNAMIC_FTRACE extern struct ftrace_hash __rcu *ftrace_graph_hash; @@ -1194,6 +1175,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(MARKERS, "markers"), \ C(EVENT_FORK, "event-fork"), \ C(PAUSE_ON_TRACE, "pause-on-trace"), \ + C(HASH_PTR, "hash-ptr"), /* Print hashed pointer */ \ FUNCTION_FLAGS \ FGRAPH_FLAGS \ STACK_FLAGS \ @@ -1297,15 +1279,15 @@ extern int call_filter_check_discard(struct trace_event_call *call, void *rec, void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct trace_buffer *buffer, struct ring_buffer_event *event, - unsigned long flags, int pc, + unsigned int trcace_ctx, struct pt_regs *regs); static inline void trace_buffer_unlock_commit(struct trace_array *tr, struct trace_buffer *buffer, struct ring_buffer_event *event, - unsigned long flags, int pc) + unsigned int trace_ctx) { - trace_buffer_unlock_commit_regs(tr, buffer, event, flags, pc, NULL); + trace_buffer_unlock_commit_regs(tr, buffer, event, trace_ctx, NULL); } DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); @@ -1366,8 +1348,7 @@ __event_trigger_test_discard(struct trace_event_file *file, * @buffer: The ring buffer that the event is being written to * @event: The event meta data in the ring buffer * @entry: The event itself - * @irq_flags: The state of the interrupts at the start of the event - * @pc: The state of the preempt count at the start of the event. + * @trace_ctx: The tracing context flags. * * This is a helper function to handle triggers that require data * from the event itself. It also tests the event against filters and @@ -1377,12 +1358,12 @@ static inline void event_trigger_unlock_commit(struct trace_event_file *file, struct trace_buffer *buffer, struct ring_buffer_event *event, - void *entry, unsigned long irq_flags, int pc) + void *entry, unsigned int trace_ctx) { enum event_trigger_type tt = ETT_NONE; if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) - trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc); + trace_buffer_unlock_commit(file->tr, buffer, event, trace_ctx); if (tt) event_triggers_post_call(file, tt); @@ -1394,8 +1375,7 @@ event_trigger_unlock_commit(struct trace_event_file *file, * @buffer: The ring buffer that the event is being written to * @event: The event meta data in the ring buffer * @entry: The event itself - * @irq_flags: The state of the interrupts at the start of the event - * @pc: The state of the preempt count at the start of the event. + * @trace_ctx: The tracing context flags. * * This is a helper function to handle triggers that require data * from the event itself. It also tests the event against filters and @@ -1408,14 +1388,14 @@ static inline void event_trigger_unlock_commit_regs(struct trace_event_file *file, struct trace_buffer *buffer, struct ring_buffer_event *event, - void *entry, unsigned long irq_flags, int pc, + void *entry, unsigned int trace_ctx, struct pt_regs *regs) { enum event_trigger_type tt = ETT_NONE; if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) trace_buffer_unlock_commit_regs(file->tr, buffer, event, - irq_flags, pc, regs); + trace_ctx, regs); if (tt) event_triggers_post_call(file, tt); @@ -1830,10 +1810,9 @@ extern int tracing_set_cpumask(struct trace_array *tr, #define MAX_EVENT_NAME_LEN 64 -extern int trace_run_command(const char *buf, int (*createfn)(int, char**)); extern ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, size_t count, loff_t *ppos, - int (*createfn)(int, char**)); + int (*createfn)(const char *)); extern unsigned int err_pos(char *cmd, const char *str); extern void tracing_log_err(struct trace_array *tr, diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index eff099123aa2..e47fdb4c92fb 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -37,7 +37,7 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) struct ring_buffer_event *event; struct trace_branch *entry; unsigned long flags; - int pc; + unsigned int trace_ctx; const char *p; if (current->trace_recursion & TRACE_BRANCH_BIT) @@ -59,10 +59,10 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) if (atomic_read(&data->disabled)) goto out; - pc = preempt_count(); + trace_ctx = tracing_gen_ctx_flags(flags); buffer = tr->array_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH, - sizeof(*entry), flags, pc); + sizeof(*entry), trace_ctx); if (!event) goto out; diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 4f967d5cd917..dc971a68dda4 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -31,23 +31,31 @@ int dyn_event_register(struct dyn_event_operations *ops) return 0; } -int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type) +int dyn_event_release(const char *raw_command, struct dyn_event_operations *type) { struct dyn_event *pos, *n; char *system = NULL, *event, *p; - int ret = -ENOENT; + int argc, ret = -ENOENT; + char **argv; + + argv = argv_split(GFP_KERNEL, raw_command, &argc); + if (!argv) + return -ENOMEM; if (argv[0][0] == '-') { - if (argv[0][1] != ':') - return -EINVAL; + if (argv[0][1] != ':') { + ret = -EINVAL; + goto out; + } event = &argv[0][2]; } else { event = strchr(argv[0], ':'); - if (!event) - return -EINVAL; + if (!event) { + ret = -EINVAL; + goto out; + } event++; } - argc--; argv++; p = strchr(event, '/'); if (p) { @@ -63,7 +71,7 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type) if (type && type != pos->ops) continue; if (!pos->ops->match(system, event, - argc, (const char **)argv, pos)) + argc - 1, (const char **)argv + 1, pos)) continue; ret = pos->ops->free(pos); @@ -71,21 +79,22 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type) break; } mutex_unlock(&event_mutex); - +out: + argv_free(argv); return ret; } -static int create_dyn_event(int argc, char **argv) +static int create_dyn_event(const char *raw_command) { struct dyn_event_operations *ops; int ret = -ENODEV; - if (argv[0][0] == '-' || argv[0][0] == '!') - return dyn_event_release(argc, argv, NULL); + if (raw_command[0] == '-' || raw_command[0] == '!') + return dyn_event_release(raw_command, NULL); mutex_lock(&dyn_event_ops_mutex); list_for_each_entry(ops, &dyn_event_ops_list, list) { - ret = ops->create(argc, (const char **)argv); + ret = ops->create(raw_command); if (!ret || ret != -ECANCELED) break; } diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h index d6f72dcb7269..7754936b57ee 100644 --- a/kernel/trace/trace_dynevent.h +++ b/kernel/trace/trace_dynevent.h @@ -39,7 +39,7 @@ struct dyn_event; */ struct dyn_event_operations { struct list_head list; - int (*create)(int argc, const char *argv[]); + int (*create)(const char *raw_command); int (*show)(struct seq_file *m, struct dyn_event *ev); bool (*is_busy)(struct dyn_event *ev); int (*free)(struct dyn_event *ev); @@ -97,7 +97,7 @@ void *dyn_event_seq_start(struct seq_file *m, loff_t *pos); void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos); void dyn_event_seq_stop(struct seq_file *m, void *v); int dyn_events_release_all(struct dyn_event_operations *type); -int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type); +int dyn_event_release(const char *raw_command, struct dyn_event_operations *type); /* * for_each_dyn_event - iterate over the dyn_event list diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index a71181655958..288ad2c274fb 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -421,11 +421,8 @@ NOKPROBE_SYMBOL(perf_trace_buf_alloc); void perf_trace_buf_update(void *record, u16 type) { struct trace_entry *entry = record; - int pc = preempt_count(); - unsigned long flags; - local_save_flags(flags); - tracing_generic_entry_update(entry, type, flags, pc); + tracing_generic_entry_update(entry, type, tracing_gen_ctx()); } NOKPROBE_SYMBOL(perf_trace_buf_update); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e9d28eeccb7e..a3563afd412d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -258,22 +258,19 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, trace_event_ignore_this_pid(trace_file)) return NULL; - local_save_flags(fbuffer->flags); - fbuffer->pc = preempt_count(); /* * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables * preemption (adding one to the preempt_count). Since we are * interested in the preempt_count at the time the tracepoint was * hit, we need to subtract one to offset the increment. */ - if (IS_ENABLED(CONFIG_PREEMPTION)) - fbuffer->pc--; + fbuffer->trace_ctx = tracing_gen_ctx_dec(); fbuffer->trace_file = trace_file; fbuffer->event = trace_event_buffer_lock_reserve(&fbuffer->buffer, trace_file, event_call->event.type, len, - fbuffer->flags, fbuffer->pc); + fbuffer->trace_ctx); if (!fbuffer->event) return NULL; @@ -1212,7 +1209,8 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); list_for_each_entry(file, &tr->events, list) { call = file->event_call; - if (!trace_event_name(call) || !call->class || !call->class->reg) + if ((call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) || + !trace_event_name(call) || !call->class || !call->class->reg) continue; if (system && strcmp(call->class->system, system->name) != 0) @@ -2100,16 +2098,20 @@ event_subsystem_dir(struct trace_array *tr, const char *name, dir->subsystem = system; file->system = dir; - entry = tracefs_create_file("filter", 0644, dir->entry, dir, - &ftrace_subsystem_filter_fops); - if (!entry) { - kfree(system->filter); - system->filter = NULL; - pr_warn("Could not create tracefs '%s/filter' entry\n", name); - } + /* the ftrace system is special, do not create enable or filter files */ + if (strcmp(name, "ftrace") != 0) { - trace_create_file("enable", 0644, dir->entry, dir, - &ftrace_system_enable_fops); + entry = tracefs_create_file("filter", 0644, dir->entry, dir, + &ftrace_subsystem_filter_fops); + if (!entry) { + kfree(system->filter); + system->filter = NULL; + pr_warn("Could not create tracefs '%s/filter' entry\n", name); + } + + trace_create_file("enable", 0644, dir->entry, dir, + &ftrace_system_enable_fops); + } list_add(&dir->list, &tr->systems); @@ -3678,12 +3680,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, struct trace_buffer *buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; - unsigned long flags; + unsigned int trace_ctx; long disabled; int cpu; - int pc; - pc = preempt_count(); + trace_ctx = tracing_gen_ctx(); preempt_disable_notrace(); cpu = raw_smp_processor_id(); disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); @@ -3691,11 +3692,9 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, if (disabled != 1) goto out; - local_save_flags(flags); - event = trace_event_buffer_lock_reserve(&buffer, &event_trace_file, TRACE_FN, sizeof(*entry), - flags, pc); + trace_ctx); if (!event) goto out; entry = ring_buffer_event_data(event); @@ -3703,7 +3702,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, entry->parent_ip = parent_ip; event_trigger_unlock_commit(&event_trace_file, buffer, event, - entry, flags, pc); + entry, trace_ctx); out: atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); preempt_enable_notrace(); diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c index 22bcf7c51d1e..c188045c5f97 100644 --- a/kernel/trace/trace_events_inject.c +++ b/kernel/trace/trace_events_inject.c @@ -192,7 +192,6 @@ static void *trace_alloc_entry(struct trace_event_call *call, int *size) static int parse_entry(char *str, struct trace_event_call *call, void **pentry) { struct ftrace_event_field *field; - unsigned long irq_flags; void *entry = NULL; int entry_size; u64 val = 0; @@ -203,9 +202,8 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry) if (!entry) return -ENOMEM; - local_save_flags(irq_flags); - tracing_generic_entry_update(entry, call->event.type, irq_flags, - preempt_count()); + tracing_generic_entry_update(entry, call->event.type, + tracing_gen_ctx()); while ((len = parse_field(str, call, &field, &val)) > 0) { if (is_function_field(field)) diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 5a8bc0b421f1..2979a96595b4 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -23,13 +23,14 @@ #undef ERRORS #define ERRORS \ C(BAD_NAME, "Illegal name"), \ - C(CMD_INCOMPLETE, "Incomplete command"), \ + C(INVALID_CMD, "Command must be of the form: <name> field[;field] ..."),\ + C(INVALID_DYN_CMD, "Command must be of the form: s or -:[synthetic/]<name> field[;field] ..."),\ C(EVENT_EXISTS, "Event already exists"), \ C(TOO_MANY_FIELDS, "Too many fields"), \ C(INCOMPLETE_TYPE, "Incomplete type"), \ C(INVALID_TYPE, "Invalid type"), \ - C(INVALID_FIELD, "Invalid field"), \ - C(CMD_TOO_LONG, "Command too long"), + C(INVALID_FIELD, "Invalid field"), \ + C(INVALID_ARRAY_SPEC, "Invalid array specification"), #undef C #define C(a, b) SYNTH_ERR_##a @@ -48,7 +49,7 @@ static int errpos(const char *str) return err_pos(last_cmd, str); } -static void last_cmd_set(char *str) +static void last_cmd_set(const char *str) { if (!str) return; @@ -62,7 +63,7 @@ static void synth_err(u8 err_type, u8 err_pos) err_type, err_pos); } -static int create_synth_event(int argc, const char **argv); +static int create_synth_event(const char *raw_command); static int synth_event_show(struct seq_file *m, struct dyn_event *ev); static int synth_event_release(struct dyn_event *ev); static bool synth_event_is_busy(struct dyn_event *ev); @@ -579,18 +580,32 @@ static void free_synth_field(struct synth_field *field) kfree(field); } -static struct synth_field *parse_synth_field(int argc, const char **argv, - int *consumed) +static int check_field_version(const char *prefix, const char *field_type, + const char *field_name) +{ + /* + * For backward compatibility, the old synthetic event command + * format did not require semicolons, and in order to not + * break user space, that old format must still work. If a new + * feature is added, then the format that uses the new feature + * will be required to have semicolons, as nothing that uses + * the old format would be using the new, yet to be created, + * feature. When a new feature is added, this will detect it, + * and return a number greater than 1, and require the format + * to use semicolons. + */ + return 1; +} + +static struct synth_field *parse_synth_field(int argc, char **argv, + int *consumed, int *field_version) { - struct synth_field *field; const char *prefix = NULL, *field_type = argv[0], *field_name, *array; + struct synth_field *field; int len, ret = -ENOMEM; struct seq_buf s; ssize_t size; - if (field_type[0] == ';') - field_type++; - if (!strcmp(field_type, "unsigned")) { if (argc < 3) { synth_err(SYNTH_ERR_INCOMPLETE_TYPE, errpos(field_type)); @@ -599,12 +614,19 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, prefix = "unsigned "; field_type = argv[1]; field_name = argv[2]; - *consumed = 3; + *consumed += 3; } else { field_name = argv[1]; - *consumed = 2; + *consumed += 2; } + if (!field_name) { + synth_err(SYNTH_ERR_INVALID_FIELD, errpos(field_type)); + return ERR_PTR(-EINVAL); + } + + *field_version = check_field_version(prefix, field_type, field_name); + field = kzalloc(sizeof(*field), GFP_KERNEL); if (!field) return ERR_PTR(-ENOMEM); @@ -613,8 +635,6 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, array = strchr(field_name, '['); if (array) len -= strlen(array); - else if (field_name[len - 1] == ';') - len--; field->name = kmemdup_nul(field_name, len, GFP_KERNEL); if (!field->name) @@ -626,8 +646,6 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, goto free; } - if (field_type[0] == ';') - field_type++; len = strlen(field_type) + 1; if (array) @@ -644,11 +662,8 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, if (prefix) seq_buf_puts(&s, prefix); seq_buf_puts(&s, field_type); - if (array) { + if (array) seq_buf_puts(&s, array); - if (s.buffer[s.len - 1] == ';') - s.len--; - } if (WARN_ON_ONCE(!seq_buf_buffer_left(&s))) goto free; @@ -656,7 +671,10 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, size = synth_field_size(field->type); if (size < 0) { - synth_err(SYNTH_ERR_INVALID_TYPE, errpos(field_type)); + if (array) + synth_err(SYNTH_ERR_INVALID_ARRAY_SPEC, errpos(field_name)); + else + synth_err(SYNTH_ERR_INVALID_TYPE, errpos(field_type)); ret = -EINVAL; goto free; } else if (size == 0) { @@ -1160,46 +1178,13 @@ int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name, } EXPORT_SYMBOL_GPL(synth_event_gen_cmd_array_start); -static int save_cmdstr(int argc, const char *name, const char **argv) -{ - struct seq_buf s; - char *buf; - int i; - - buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - seq_buf_init(&s, buf, MAX_DYNEVENT_CMD_LEN); - - seq_buf_puts(&s, name); - - for (i = 0; i < argc; i++) { - seq_buf_putc(&s, ' '); - seq_buf_puts(&s, argv[i]); - } - - if (!seq_buf_buffer_left(&s)) { - synth_err(SYNTH_ERR_CMD_TOO_LONG, 0); - kfree(buf); - return -EINVAL; - } - buf[s.len] = 0; - last_cmd_set(buf); - - kfree(buf); - return 0; -} - -static int __create_synth_event(int argc, const char *name, const char **argv) +static int __create_synth_event(const char *name, const char *raw_fields) { + char **argv, *field_str, *tmp_fields, *saved_fields = NULL; struct synth_field *field, *fields[SYNTH_FIELDS_MAX]; + int consumed, cmd_version = 1, n_fields_this_loop; + int i, argc, n_fields = 0, ret = 0; struct synth_event *event = NULL; - int i, consumed = 0, n_fields = 0, ret = 0; - - ret = save_cmdstr(argc, name, argv); - if (ret) - return ret; /* * Argument syntax: @@ -1208,46 +1193,99 @@ static int __create_synth_event(int argc, const char *name, const char **argv) * where 'field' = type field_name */ - if (name[0] == '\0' || argc < 1) { - synth_err(SYNTH_ERR_CMD_INCOMPLETE, 0); + if (name[0] == '\0') { + synth_err(SYNTH_ERR_INVALID_CMD, 0); return -EINVAL; } - mutex_lock(&event_mutex); - if (!is_good_name(name)) { synth_err(SYNTH_ERR_BAD_NAME, errpos(name)); - ret = -EINVAL; - goto out; + return -EINVAL; } + mutex_lock(&event_mutex); + event = find_synth_event(name); if (event) { synth_err(SYNTH_ERR_EVENT_EXISTS, errpos(name)); ret = -EEXIST; - goto out; + goto err; } - for (i = 0; i < argc - 1; i++) { - if (strcmp(argv[i], ";") == 0) - continue; - if (n_fields == SYNTH_FIELDS_MAX) { - synth_err(SYNTH_ERR_TOO_MANY_FIELDS, 0); - ret = -EINVAL; + tmp_fields = saved_fields = kstrdup(raw_fields, GFP_KERNEL); + if (!tmp_fields) { + ret = -ENOMEM; + goto err; + } + + while ((field_str = strsep(&tmp_fields, ";")) != NULL) { + argv = argv_split(GFP_KERNEL, field_str, &argc); + if (!argv) { + ret = -ENOMEM; goto err; } - field = parse_synth_field(argc - i, &argv[i], &consumed); - if (IS_ERR(field)) { - ret = PTR_ERR(field); + if (!argc) + continue; + + n_fields_this_loop = 0; + consumed = 0; + while (argc > consumed) { + int field_version; + + field = parse_synth_field(argc - consumed, + argv + consumed, &consumed, + &field_version); + if (IS_ERR(field)) { + argv_free(argv); + ret = PTR_ERR(field); + goto err; + } + + /* + * Track the highest version of any field we + * found in the command. + */ + if (field_version > cmd_version) + cmd_version = field_version; + + /* + * Now sort out what is and isn't valid for + * each supported version. + * + * If we see more than 1 field per loop, it + * means we have multiple fields between + * semicolons, and that's something we no + * longer support in a version 2 or greater + * command. + */ + if (cmd_version > 1 && n_fields_this_loop >= 1) { + synth_err(SYNTH_ERR_INVALID_CMD, errpos(field_str)); + ret = -EINVAL; + goto err; + } + + fields[n_fields++] = field; + if (n_fields == SYNTH_FIELDS_MAX) { + synth_err(SYNTH_ERR_TOO_MANY_FIELDS, 0); + ret = -EINVAL; + goto err; + } + + n_fields_this_loop++; + } + + if (consumed < argc) { + synth_err(SYNTH_ERR_INVALID_CMD, 0); + ret = -EINVAL; goto err; } - fields[n_fields++] = field; - i += consumed - 1; + + argv_free(argv); } - if (i < argc && strcmp(argv[i], ";") != 0) { - synth_err(SYNTH_ERR_INVALID_FIELD, errpos(argv[i])); + if (n_fields == 0) { + synth_err(SYNTH_ERR_INVALID_CMD, 0); ret = -EINVAL; goto err; } @@ -1266,6 +1304,8 @@ static int __create_synth_event(int argc, const char *name, const char **argv) out: mutex_unlock(&event_mutex); + kfree(saved_fields); + return ret; err: for (i = 0; i < n_fields; i++) @@ -1383,19 +1423,79 @@ int synth_event_delete(const char *event_name) } EXPORT_SYMBOL_GPL(synth_event_delete); -static int create_or_delete_synth_event(int argc, char **argv) +static int check_command(const char *raw_command) { - const char *name = argv[0]; - int ret; + char **argv = NULL, *cmd, *saved_cmd, *name_and_field; + int argc, ret = 0; + + cmd = saved_cmd = kstrdup(raw_command, GFP_KERNEL); + if (!cmd) + return -ENOMEM; + + name_and_field = strsep(&cmd, ";"); + if (!name_and_field) { + ret = -EINVAL; + goto free; + } + + if (name_and_field[0] == '!') + goto free; + + argv = argv_split(GFP_KERNEL, name_and_field, &argc); + if (!argv) { + ret = -ENOMEM; + goto free; + } + argv_free(argv); + + if (argc < 3) + ret = -EINVAL; +free: + kfree(saved_cmd); + + return ret; +} + +static int create_or_delete_synth_event(const char *raw_command) +{ + char *name = NULL, *fields, *p; + int ret = 0; + + raw_command = skip_spaces(raw_command); + if (raw_command[0] == '\0') + return ret; + + last_cmd_set(raw_command); + + ret = check_command(raw_command); + if (ret) { + synth_err(SYNTH_ERR_INVALID_CMD, 0); + return ret; + } + + p = strpbrk(raw_command, " \t"); + if (!p && raw_command[0] != '!') { + synth_err(SYNTH_ERR_INVALID_CMD, 0); + ret = -EINVAL; + goto free; + } + + name = kmemdup_nul(raw_command, p ? p - raw_command : strlen(raw_command), GFP_KERNEL); + if (!name) + return -ENOMEM; - /* trace_run_command() ensures argc != 0 */ if (name[0] == '!') { ret = synth_event_delete(name + 1); - return ret; + goto free; } - ret = __create_synth_event(argc - 1, name, (const char **)argv + 1); - return ret == -ECANCELED ? -EINVAL : ret; + fields = skip_spaces(p); + + ret = __create_synth_event(name, fields); +free: + kfree(name); + + return ret; } static int synth_event_run_command(struct dynevent_cmd *cmd) @@ -1403,7 +1503,7 @@ static int synth_event_run_command(struct dynevent_cmd *cmd) struct synth_event *se; int ret; - ret = trace_run_command(cmd->seq.buffer, create_or_delete_synth_event); + ret = create_or_delete_synth_event(cmd->seq.buffer); if (ret) return ret; @@ -1939,10 +2039,27 @@ int synth_event_trace_end(struct synth_event_trace_state *trace_state) } EXPORT_SYMBOL_GPL(synth_event_trace_end); -static int create_synth_event(int argc, const char **argv) +static int create_synth_event(const char *raw_command) { - const char *name = argv[0]; - int len; + char *fields, *p; + const char *name; + int len, ret = 0; + + raw_command = skip_spaces(raw_command); + if (raw_command[0] == '\0') + return ret; + + last_cmd_set(raw_command); + + p = strpbrk(raw_command, " \t"); + if (!p) { + synth_err(SYNTH_ERR_INVALID_CMD, 0); + return -EINVAL; + } + + fields = skip_spaces(p); + + name = raw_command; if (name[0] != 's' || name[1] != ':') return -ECANCELED; @@ -1951,11 +2068,30 @@ static int create_synth_event(int argc, const char **argv) /* This interface accepts group name prefix */ if (strchr(name, '/')) { len = str_has_prefix(name, SYNTH_SYSTEM "/"); - if (len == 0) + if (len == 0) { + synth_err(SYNTH_ERR_INVALID_DYN_CMD, 0); return -EINVAL; + } name += len; } - return __create_synth_event(argc - 1, name, argv + 1); + + len = name - raw_command; + + ret = check_command(raw_command + len); + if (ret) { + synth_err(SYNTH_ERR_INVALID_CMD, 0); + return ret; + } + + name = kmemdup_nul(raw_command + len, p - raw_command - len, GFP_KERNEL); + if (!name) + return -ENOMEM; + + ret = __create_synth_event(name, fields); + + kfree(name); + + return ret; } static int synth_event_release(struct dyn_event *ev) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index c5095dd28e20..f93723ca66bc 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -106,8 +106,7 @@ static int function_trace_init(struct trace_array *tr) ftrace_init_array_ops(tr, func); - tr->array_buffer.cpu = get_cpu(); - put_cpu(); + tr->array_buffer.cpu = raw_smp_processor_id(); tracing_start_cmdline_record(); tracing_start_function_trace(tr); @@ -132,10 +131,9 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, { struct trace_array *tr = op->private; struct trace_array_cpu *data; - unsigned long flags; + unsigned int trace_ctx; int bit; int cpu; - int pc; if (unlikely(!tr->function_enabled)) return; @@ -144,15 +142,14 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; - pc = preempt_count(); + trace_ctx = tracing_gen_ctx(); preempt_disable_notrace(); cpu = smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); - if (!atomic_read(&data->disabled)) { - local_save_flags(flags); - trace_function(tr, ip, parent_ip, flags, pc); - } + if (!atomic_read(&data->disabled)) + trace_function(tr, ip, parent_ip, trace_ctx); + ftrace_test_recursion_unlock(bit); preempt_enable_notrace(); } @@ -184,7 +181,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, unsigned long flags; long disabled; int cpu; - int pc; + unsigned int trace_ctx; if (unlikely(!tr->function_enabled)) return; @@ -199,9 +196,9 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { - pc = preempt_count(); - trace_function(tr, ip, parent_ip, flags, pc); - __trace_stack(tr, flags, STACK_SKIP, pc); + trace_ctx = tracing_gen_ctx_flags(flags); + trace_function(tr, ip, parent_ip, trace_ctx); + __trace_stack(tr, trace_ctx, STACK_SKIP); } atomic_dec(&data->disabled); @@ -404,13 +401,11 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, static __always_inline void trace_stack(struct trace_array *tr) { - unsigned long flags; - int pc; + unsigned int trace_ctx; - local_save_flags(flags); - pc = preempt_count(); + trace_ctx = tracing_gen_ctx(); - __trace_stack(tr, flags, FTRACE_STACK_SKIP, pc); + __trace_stack(tr, trace_ctx, FTRACE_STACK_SKIP); } static void diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d874dec87131..0aa6e6faa943 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -96,8 +96,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, - unsigned long flags, - int pc) + unsigned int trace_ctx) { struct trace_event_call *call = &event_funcgraph_entry; struct ring_buffer_event *event; @@ -105,7 +104,7 @@ int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent_entry *entry; event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, - sizeof(*entry), flags, pc); + sizeof(*entry), trace_ctx); if (!event) return 0; entry = ring_buffer_event_data(event); @@ -129,10 +128,10 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) struct trace_array *tr = graph_array; struct trace_array_cpu *data; unsigned long flags; + unsigned int trace_ctx; long disabled; int ret; int cpu; - int pc; if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) return 0; @@ -174,8 +173,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) data = per_cpu_ptr(tr->array_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { - pc = preempt_count(); - ret = __trace_graph_entry(tr, trace, flags, pc); + trace_ctx = tracing_gen_ctx_flags(flags); + ret = __trace_graph_entry(tr, trace, trace_ctx); } else { ret = 0; } @@ -188,7 +187,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) static void __trace_graph_function(struct trace_array *tr, - unsigned long ip, unsigned long flags, int pc) + unsigned long ip, unsigned int trace_ctx) { u64 time = trace_clock_local(); struct ftrace_graph_ent ent = { @@ -202,22 +201,21 @@ __trace_graph_function(struct trace_array *tr, .rettime = time, }; - __trace_graph_entry(tr, &ent, flags, pc); - __trace_graph_return(tr, &ret, flags, pc); + __trace_graph_entry(tr, &ent, trace_ctx); + __trace_graph_return(tr, &ret, trace_ctx); } void trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, - unsigned long flags, int pc) + unsigned int trace_ctx) { - __trace_graph_function(tr, ip, flags, pc); + __trace_graph_function(tr, ip, trace_ctx); } void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, - unsigned long flags, - int pc) + unsigned int trace_ctx) { struct trace_event_call *call = &event_funcgraph_exit; struct ring_buffer_event *event; @@ -225,7 +223,7 @@ void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret_entry *entry; event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, - sizeof(*entry), flags, pc); + sizeof(*entry), trace_ctx); if (!event) return; entry = ring_buffer_event_data(event); @@ -239,9 +237,9 @@ void trace_graph_return(struct ftrace_graph_ret *trace) struct trace_array *tr = graph_array; struct trace_array_cpu *data; unsigned long flags; + unsigned int trace_ctx; long disabled; int cpu; - int pc; ftrace_graph_addr_finish(trace); @@ -255,8 +253,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace) data = per_cpu_ptr(tr->array_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { - pc = preempt_count(); - __trace_graph_return(tr, trace, flags, pc); + trace_ctx = tracing_gen_ctx_flags(flags); + __trace_graph_return(tr, trace, trace_ctx); } atomic_dec(&data->disabled); local_irq_restore(flags); diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index c0df9b97f147..34dc1a712dcb 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -108,14 +108,9 @@ static void trace_hwlat_sample(struct hwlat_sample *sample) struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct hwlat_entry *entry; - unsigned long flags; - int pc; - - pc = preempt_count(); - local_save_flags(flags); event = trace_buffer_lock_reserve(buffer, TRACE_HWLAT, sizeof(*entry), - flags, pc); + tracing_gen_ctx()); if (!event) return; entry = ring_buffer_event_data(event); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 6756379b661f..590b3d51afae 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -143,11 +143,14 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; + unsigned int trace_ctx; if (!func_prolog_dec(tr, &data, &flags)) return; - trace_function(tr, ip, parent_ip, flags, preempt_count()); + trace_ctx = tracing_gen_ctx_flags(flags); + + trace_function(tr, ip, parent_ip, trace_ctx); atomic_dec(&data->disabled); } @@ -177,8 +180,8 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; + unsigned int trace_ctx; int ret; - int pc; if (ftrace_graph_ignore_func(trace)) return 0; @@ -195,8 +198,8 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) if (!func_prolog_dec(tr, &data, &flags)) return 0; - pc = preempt_count(); - ret = __trace_graph_entry(tr, trace, flags, pc); + trace_ctx = tracing_gen_ctx_flags(flags); + ret = __trace_graph_entry(tr, trace, trace_ctx); atomic_dec(&data->disabled); return ret; @@ -207,15 +210,15 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace) struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; - int pc; + unsigned int trace_ctx; ftrace_graph_addr_finish(trace); if (!func_prolog_dec(tr, &data, &flags)) return; - pc = preempt_count(); - __trace_graph_return(tr, trace, flags, pc); + trace_ctx = tracing_gen_ctx_flags(flags); + __trace_graph_return(tr, trace, trace_ctx); atomic_dec(&data->disabled); } @@ -267,12 +270,12 @@ static void irqsoff_print_header(struct seq_file *s) static void __trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, - unsigned long flags, int pc) + unsigned int trace_ctx) { if (is_graph(tr)) - trace_graph_function(tr, ip, parent_ip, flags, pc); + trace_graph_function(tr, ip, parent_ip, trace_ctx); else - trace_function(tr, ip, parent_ip, flags, pc); + trace_function(tr, ip, parent_ip, trace_ctx); } #else @@ -322,15 +325,13 @@ check_critical_timing(struct trace_array *tr, { u64 T0, T1, delta; unsigned long flags; - int pc; + unsigned int trace_ctx; T0 = data->preempt_timestamp; T1 = ftrace_now(cpu); delta = T1-T0; - local_save_flags(flags); - - pc = preempt_count(); + trace_ctx = tracing_gen_ctx(); if (!report_latency(tr, delta)) goto out; @@ -341,9 +342,9 @@ check_critical_timing(struct trace_array *tr, if (!report_latency(tr, delta)) goto out_unlock; - __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); + __trace_function(tr, CALLER_ADDR0, parent_ip, trace_ctx); /* Skip 5 functions to get to the irq/preempt enable function */ - __trace_stack(tr, flags, 5, pc); + __trace_stack(tr, trace_ctx, 5); if (data->critical_sequence != max_sequence) goto out_unlock; @@ -363,16 +364,15 @@ out_unlock: out: data->critical_sequence = max_sequence; data->preempt_timestamp = ftrace_now(cpu); - __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); + __trace_function(tr, CALLER_ADDR0, parent_ip, trace_ctx); } static nokprobe_inline void -start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) +start_critical_timing(unsigned long ip, unsigned long parent_ip) { int cpu; struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; - unsigned long flags; if (!tracer_enabled || !tracing_is_enabled()) return; @@ -393,9 +393,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) data->preempt_timestamp = ftrace_now(cpu); data->critical_start = parent_ip ? : ip; - local_save_flags(flags); - - __trace_function(tr, ip, parent_ip, flags, pc); + __trace_function(tr, ip, parent_ip, tracing_gen_ctx()); per_cpu(tracing_cpu, cpu) = 1; @@ -403,12 +401,12 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) } static nokprobe_inline void -stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) +stop_critical_timing(unsigned long ip, unsigned long parent_ip) { int cpu; struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; - unsigned long flags; + unsigned int trace_ctx; cpu = raw_smp_processor_id(); /* Always clear the tracing cpu on stopping the trace */ @@ -428,8 +426,8 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) atomic_inc(&data->disabled); - local_save_flags(flags); - __trace_function(tr, ip, parent_ip, flags, pc); + trace_ctx = tracing_gen_ctx(); + __trace_function(tr, ip, parent_ip, trace_ctx); check_critical_timing(tr, data, parent_ip ? : ip, cpu); data->critical_start = 0; atomic_dec(&data->disabled); @@ -438,20 +436,16 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) /* start and stop critical timings used to for stoppage (in idle) */ void start_critical_timings(void) { - int pc = preempt_count(); - - if (preempt_trace(pc) || irq_trace()) - start_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc); + if (preempt_trace(preempt_count()) || irq_trace()) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } EXPORT_SYMBOL_GPL(start_critical_timings); NOKPROBE_SYMBOL(start_critical_timings); void stop_critical_timings(void) { - int pc = preempt_count(); - - if (preempt_trace(pc) || irq_trace()) - stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc); + if (preempt_trace(preempt_count()) || irq_trace()) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } EXPORT_SYMBOL_GPL(stop_critical_timings); NOKPROBE_SYMBOL(stop_critical_timings); @@ -613,19 +607,15 @@ static void irqsoff_tracer_stop(struct trace_array *tr) */ void tracer_hardirqs_on(unsigned long a0, unsigned long a1) { - unsigned int pc = preempt_count(); - - if (!preempt_trace(pc) && irq_trace()) - stop_critical_timing(a0, a1, pc); + if (!preempt_trace(preempt_count()) && irq_trace()) + stop_critical_timing(a0, a1); } NOKPROBE_SYMBOL(tracer_hardirqs_on); void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { - unsigned int pc = preempt_count(); - - if (!preempt_trace(pc) && irq_trace()) - start_critical_timing(a0, a1, pc); + if (!preempt_trace(preempt_count()) && irq_trace()) + start_critical_timing(a0, a1); } NOKPROBE_SYMBOL(tracer_hardirqs_off); @@ -665,18 +655,14 @@ static struct tracer irqsoff_tracer __read_mostly = #ifdef CONFIG_PREEMPT_TRACER void tracer_preempt_on(unsigned long a0, unsigned long a1) { - int pc = preempt_count(); - - if (preempt_trace(pc) && !irq_trace()) - stop_critical_timing(a0, a1, pc); + if (preempt_trace(preempt_count()) && !irq_trace()) + stop_critical_timing(a0, a1); } void tracer_preempt_off(unsigned long a0, unsigned long a1) { - int pc = preempt_count(); - - if (preempt_trace(pc) && !irq_trace()) - start_critical_timing(a0, a1, pc); + if (preempt_trace(preempt_count()) && !irq_trace()) + start_critical_timing(a0, a1); } static int preemptoff_tracer_init(struct trace_array *tr) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 56c7fbff7bd7..6fe770d86dc3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -35,7 +35,7 @@ static int __init set_kprobe_boot_events(char *str) } __setup("kprobe_event=", set_kprobe_boot_events); -static int trace_kprobe_create(int argc, const char **argv); +static int trace_kprobe_create(const char *raw_command); static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev); static int trace_kprobe_release(struct dyn_event *ev); static bool trace_kprobe_is_busy(struct dyn_event *ev); @@ -124,9 +124,9 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) if (!p) return true; *p = '\0'; - mutex_lock(&module_mutex); + rcu_read_lock_sched(); ret = !!find_module(tk->symbol); - mutex_unlock(&module_mutex); + rcu_read_unlock_sched(); *p = ':'; return ret; @@ -711,7 +711,7 @@ static inline void sanitize_event_name(char *name) *name = '_'; } -static int trace_kprobe_create(int argc, const char *argv[]) +static int __trace_kprobe_create(int argc, const char *argv[]) { /* * Argument syntax: @@ -910,20 +910,25 @@ error: goto out; } -static int create_or_delete_trace_kprobe(int argc, char **argv) +static int trace_kprobe_create(const char *raw_command) +{ + return trace_probe_create(raw_command, __trace_kprobe_create); +} + +static int create_or_delete_trace_kprobe(const char *raw_command) { int ret; - if (argv[0][0] == '-') - return dyn_event_release(argc, argv, &trace_kprobe_ops); + if (raw_command[0] == '-') + return dyn_event_release(raw_command, &trace_kprobe_ops); - ret = trace_kprobe_create(argc, (const char **)argv); + ret = trace_kprobe_create(raw_command); return ret == -ECANCELED ? -EINVAL : ret; } static int trace_kprobe_run_command(struct dynevent_cmd *cmd) { - return trace_run_command(cmd->seq.buffer, create_or_delete_trace_kprobe); + return create_or_delete_trace_kprobe(cmd->seq.buffer); } /** @@ -1084,7 +1089,7 @@ int kprobe_event_delete(const char *name) snprintf(buf, MAX_EVENT_NAME_LEN, "-:%s", name); - return trace_run_command(buf, create_or_delete_trace_kprobe); + return create_or_delete_trace_kprobe(buf); } EXPORT_SYMBOL_GPL(kprobe_event_delete); @@ -1386,8 +1391,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, if (trace_trigger_soft_disabled(trace_file)) return; - local_save_flags(fbuffer.flags); - fbuffer.pc = preempt_count(); + fbuffer.trace_ctx = tracing_gen_ctx(); fbuffer.trace_file = trace_file; dsize = __get_data_size(&tk->tp, regs); @@ -1396,7 +1400,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file, call->event.type, sizeof(*entry) + tk->tp.size + dsize, - fbuffer.flags, fbuffer.pc); + fbuffer.trace_ctx); if (!fbuffer.event) return; @@ -1434,8 +1438,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, if (trace_trigger_soft_disabled(trace_file)) return; - local_save_flags(fbuffer.flags); - fbuffer.pc = preempt_count(); + fbuffer.trace_ctx = tracing_gen_ctx(); fbuffer.trace_file = trace_file; dsize = __get_data_size(&tk->tp, regs); @@ -1443,7 +1446,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file, call->event.type, sizeof(*entry) + tk->tp.size + dsize, - fbuffer.flags, fbuffer.pc); + fbuffer.trace_ctx); if (!fbuffer.event) return; @@ -1888,7 +1891,7 @@ static __init void setup_boot_kprobe_events(void) if (p) *p++ = '\0'; - ret = trace_run_command(cmd, create_or_delete_trace_kprobe); + ret = create_or_delete_trace_kprobe(cmd); if (ret) pr_warn("Failed to add event(%d): %s\n", ret, cmd); @@ -1982,8 +1985,7 @@ static __init int kprobe_trace_self_tests_init(void) pr_info("Testing kprobe tracing: "); - ret = trace_run_command("p:testprobe kprobe_trace_selftest_target $stack $stack0 +0($stack)", - create_or_delete_trace_kprobe); + ret = create_or_delete_trace_kprobe("p:testprobe kprobe_trace_selftest_target $stack $stack0 +0($stack)"); if (WARN_ON_ONCE(ret)) { pr_warn("error on probing function entry.\n"); warn++; @@ -2004,8 +2006,7 @@ static __init int kprobe_trace_self_tests_init(void) } } - ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target $retval", - create_or_delete_trace_kprobe); + ret = create_or_delete_trace_kprobe("r:testprobe2 kprobe_trace_selftest_target $retval"); if (WARN_ON_ONCE(ret)) { pr_warn("error on probing function return.\n"); warn++; @@ -2078,13 +2079,13 @@ static __init int kprobe_trace_self_tests_init(void) trace_probe_event_call(&tk->tp), file); } - ret = trace_run_command("-:testprobe", create_or_delete_trace_kprobe); + ret = create_or_delete_trace_kprobe("-:testprobe"); if (WARN_ON_ONCE(ret)) { pr_warn("error on deleting a probe.\n"); warn++; } - ret = trace_run_command("-:testprobe2", create_or_delete_trace_kprobe); + ret = create_or_delete_trace_kprobe("-:testprobe2"); if (WARN_ON_ONCE(ret)) { pr_warn("error on deleting a probe.\n"); warn++; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 84582bf1ed5f..64e77b513697 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -5,8 +5,6 @@ * Copyright (C) 2008 Pekka Paalanen <pq@iki.fi> */ -#define DEBUG 1 - #include <linux/kernel.h> #include <linux/mmiotrace.h> #include <linux/pci.h> @@ -300,10 +298,11 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; - int pc = preempt_count(); + unsigned int trace_ctx; + trace_ctx = tracing_gen_ctx_flags(0); event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_RW, - sizeof(*entry), 0, pc); + sizeof(*entry), trace_ctx); if (!event) { atomic_inc(&dropped_count); return; @@ -312,7 +311,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, entry->rw = *rw; if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(tr, buffer, event, 0, pc); + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); } void mmio_trace_rw(struct mmiotrace_rw *rw) @@ -330,10 +329,11 @@ static void __trace_mmiotrace_map(struct trace_array *tr, struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; - int pc = preempt_count(); + unsigned int trace_ctx; + trace_ctx = tracing_gen_ctx_flags(0); event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_MAP, - sizeof(*entry), 0, pc); + sizeof(*entry), trace_ctx); if (!event) { atomic_inc(&dropped_count); return; @@ -342,7 +342,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, entry->map = *map; if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(tr, buffer, event, 0, pc); + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); } void mmio_trace_mapping(struct mmiotrace_map *map) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 92b1575ae0ca..61255bad7e01 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -312,13 +312,23 @@ int trace_raw_output_prep(struct trace_iterator *iter, } EXPORT_SYMBOL(trace_raw_output_prep); +void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + trace_seq_vprintf(&iter->seq, trace_event_format(iter, fmt), ap); + va_end(ap); +} +EXPORT_SYMBOL(trace_event_printf); + static int trace_output_raw(struct trace_iterator *iter, char *name, char *fmt, va_list ap) { struct trace_seq *s = &iter->seq; trace_seq_printf(s, "%s: ", name); - trace_seq_vprintf(s, fmt, ap); + trace_seq_vprintf(s, trace_event_format(iter, fmt), ap); return trace_handle_return(s); } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index d2867ccc6aca..ec589a4612df 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1134,3 +1134,20 @@ bool trace_probe_match_command_args(struct trace_probe *tp, } return true; } + +int trace_probe_create(const char *raw_command, int (*createfn)(int, const char **)) +{ + int argc = 0, ret = 0; + char **argv; + + argv = argv_split(GFP_KERNEL, raw_command, &argc); + if (!argv) + return -ENOMEM; + + if (argc) + ret = createfn(argc, (const char **)argv); + + argv_free(argv); + + return ret; +} diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 2f703a20c724..7ce4027089ee 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -341,6 +341,7 @@ struct event_file_link *trace_probe_get_file_link(struct trace_probe *tp, int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b); bool trace_probe_match_command_args(struct trace_probe *tp, int argc, const char **argv); +int trace_probe_create(const char *raw_command, int (*createfn)(int, const char **)); #define trace_probe_for_each_link(pos, tp) \ list_for_each_entry(pos, &(tp)->event->files, list) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index c0181066dbe9..e5778d1d7a5b 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -67,7 +67,7 @@ static bool function_enabled; static int func_prolog_preempt_disable(struct trace_array *tr, struct trace_array_cpu **data, - int *pc) + unsigned int *trace_ctx) { long disabled; int cpu; @@ -75,7 +75,7 @@ func_prolog_preempt_disable(struct trace_array *tr, if (likely(!wakeup_task)) return 0; - *pc = preempt_count(); + *trace_ctx = tracing_gen_ctx(); preempt_disable_notrace(); cpu = raw_smp_processor_id(); @@ -116,8 +116,8 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace) { struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; - unsigned long flags; - int pc, ret = 0; + unsigned int trace_ctx; + int ret = 0; if (ftrace_graph_ignore_func(trace)) return 0; @@ -131,11 +131,10 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace) if (ftrace_graph_notrace_addr(trace->func)) return 1; - if (!func_prolog_preempt_disable(tr, &data, &pc)) + if (!func_prolog_preempt_disable(tr, &data, &trace_ctx)) return 0; - local_save_flags(flags); - ret = __trace_graph_entry(tr, trace, flags, pc); + ret = __trace_graph_entry(tr, trace, trace_ctx); atomic_dec(&data->disabled); preempt_enable_notrace(); @@ -146,16 +145,14 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace) { struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; - unsigned long flags; - int pc; + unsigned int trace_ctx; ftrace_graph_addr_finish(trace); - if (!func_prolog_preempt_disable(tr, &data, &pc)) + if (!func_prolog_preempt_disable(tr, &data, &trace_ctx)) return; - local_save_flags(flags); - __trace_graph_return(tr, trace, flags, pc); + __trace_graph_return(tr, trace, trace_ctx); atomic_dec(&data->disabled); preempt_enable_notrace(); @@ -217,13 +214,13 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; unsigned long flags; - int pc; + unsigned int trace_ctx; - if (!func_prolog_preempt_disable(tr, &data, &pc)) + if (!func_prolog_preempt_disable(tr, &data, &trace_ctx)) return; local_irq_save(flags); - trace_function(tr, ip, parent_ip, flags, pc); + trace_function(tr, ip, parent_ip, trace_ctx); local_irq_restore(flags); atomic_dec(&data->disabled); @@ -303,12 +300,12 @@ static void wakeup_print_header(struct seq_file *s) static void __trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, - unsigned long flags, int pc) + unsigned int trace_ctx) { if (is_graph(tr)) - trace_graph_function(tr, ip, parent_ip, flags, pc); + trace_graph_function(tr, ip, parent_ip, trace_ctx); else - trace_function(tr, ip, parent_ip, flags, pc); + trace_function(tr, ip, parent_ip, trace_ctx); } static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) @@ -375,7 +372,7 @@ static void tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *prev, struct task_struct *next, - unsigned long flags, int pc) + unsigned int trace_ctx) { struct trace_event_call *call = &event_context_switch; struct trace_buffer *buffer = tr->array_buffer.buffer; @@ -383,7 +380,7 @@ tracing_sched_switch_trace(struct trace_array *tr, struct ctx_switch_entry *entry; event = trace_buffer_lock_reserve(buffer, TRACE_CTX, - sizeof(*entry), flags, pc); + sizeof(*entry), trace_ctx); if (!event) return; entry = ring_buffer_event_data(event); @@ -396,14 +393,14 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_cpu = task_cpu(next); if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(tr, buffer, event, flags, pc); + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); } static void tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *wakee, struct task_struct *curr, - unsigned long flags, int pc) + unsigned int trace_ctx) { struct trace_event_call *call = &event_wakeup; struct ring_buffer_event *event; @@ -411,7 +408,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct trace_buffer *buffer = tr->array_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, - sizeof(*entry), flags, pc); + sizeof(*entry), trace_ctx); if (!event) return; entry = ring_buffer_event_data(event); @@ -424,7 +421,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_cpu = task_cpu(wakee); if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(tr, buffer, event, flags, pc); + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); } static void notrace @@ -436,7 +433,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, unsigned long flags; long disabled; int cpu; - int pc; + unsigned int trace_ctx; tracing_record_cmdline(prev); @@ -455,8 +452,6 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, if (next != wakeup_task) return; - pc = preempt_count(); - /* disable local data, not wakeup_cpu data */ cpu = raw_smp_processor_id(); disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); @@ -464,6 +459,8 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, goto out; local_irq_save(flags); + trace_ctx = tracing_gen_ctx_flags(flags); + arch_spin_lock(&wakeup_lock); /* We could race with grabbing wakeup_lock */ @@ -473,9 +470,9 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, /* The task we are waiting for is waking up */ data = per_cpu_ptr(wakeup_trace->array_buffer.data, wakeup_cpu); - __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); - tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); - __trace_stack(wakeup_trace, flags, 0, pc); + __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, trace_ctx); + tracing_sched_switch_trace(wakeup_trace, prev, next, trace_ctx); + __trace_stack(wakeup_trace, trace_ctx, 0); T0 = data->preempt_timestamp; T1 = ftrace_now(cpu); @@ -527,9 +524,8 @@ probe_wakeup(void *ignore, struct task_struct *p) { struct trace_array_cpu *data; int cpu = smp_processor_id(); - unsigned long flags; long disabled; - int pc; + unsigned int trace_ctx; if (likely(!tracer_enabled)) return; @@ -550,11 +546,12 @@ probe_wakeup(void *ignore, struct task_struct *p) (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) return; - pc = preempt_count(); disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); if (unlikely(disabled != 1)) goto out; + trace_ctx = tracing_gen_ctx(); + /* interrupts should be off from try_to_wake_up */ arch_spin_lock(&wakeup_lock); @@ -581,19 +578,17 @@ probe_wakeup(void *ignore, struct task_struct *p) wakeup_task = get_task_struct(p); - local_save_flags(flags); - data = per_cpu_ptr(wakeup_trace->array_buffer.data, wakeup_cpu); data->preempt_timestamp = ftrace_now(cpu); - tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); - __trace_stack(wakeup_trace, flags, 0, pc); + tracing_sched_wakeup_trace(wakeup_trace, p, current, trace_ctx); + __trace_stack(wakeup_trace, trace_ctx, 0); /* * We must be careful in using CALLER_ADDR2. But since wake_up * is not called by an assembly function (where as schedule is) * it should be safe to use it here. */ - __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); + __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, trace_ctx); out_locked: arch_spin_unlock(&wakeup_lock); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index d85a2f0f316b..8bfcd3b09422 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -298,9 +298,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct ring_buffer_event *event; struct trace_buffer *buffer; - unsigned long irq_flags; + unsigned int trace_ctx; unsigned long args[6]; - int pc; int syscall_nr; int size; @@ -322,12 +321,11 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; - local_save_flags(irq_flags); - pc = preempt_count(); + trace_ctx = tracing_gen_ctx(); buffer = tr->array_buffer.buffer; event = trace_buffer_lock_reserve(buffer, - sys_data->enter_event->event.type, size, irq_flags, pc); + sys_data->enter_event->event.type, size, trace_ctx); if (!event) return; @@ -337,7 +335,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args); event_trigger_unlock_commit(trace_file, buffer, event, entry, - irq_flags, pc); + trace_ctx); } static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) @@ -348,8 +346,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) struct syscall_metadata *sys_data; struct ring_buffer_event *event; struct trace_buffer *buffer; - unsigned long irq_flags; - int pc; + unsigned int trace_ctx; int syscall_nr; syscall_nr = trace_get_syscall_nr(current, regs); @@ -368,13 +365,12 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) if (!sys_data) return; - local_save_flags(irq_flags); - pc = preempt_count(); + trace_ctx = tracing_gen_ctx(); buffer = tr->array_buffer.buffer; event = trace_buffer_lock_reserve(buffer, sys_data->exit_event->event.type, sizeof(*entry), - irq_flags, pc); + trace_ctx); if (!event) return; @@ -383,7 +379,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) entry->ret = syscall_get_return_value(current, regs); event_trigger_unlock_commit(trace_file, buffer, event, entry, - irq_flags, pc); + trace_ctx); } static int reg_event_syscall_enter(struct trace_event_file *file, diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 3cf7128e1ad3..9b50869a5ddb 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -34,7 +34,7 @@ struct uprobe_trace_entry_head { #define DATAOF_TRACE_ENTRY(entry, is_return) \ ((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return)) -static int trace_uprobe_create(int argc, const char **argv); +static int trace_uprobe_create(const char *raw_command); static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev); static int trace_uprobe_release(struct dyn_event *ev); static bool trace_uprobe_is_busy(struct dyn_event *ev); @@ -530,7 +530,7 @@ end: * Argument syntax: * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET[%return][(REF)] [FETCHARGS] */ -static int trace_uprobe_create(int argc, const char **argv) +static int __trace_uprobe_create(int argc, const char **argv) { struct trace_uprobe *tu; const char *event = NULL, *group = UPROBE_EVENT_SYSTEM; @@ -716,14 +716,19 @@ fail_address_parse: return ret; } -static int create_or_delete_trace_uprobe(int argc, char **argv) +int trace_uprobe_create(const char *raw_command) +{ + return trace_probe_create(raw_command, __trace_uprobe_create); +} + +static int create_or_delete_trace_uprobe(const char *raw_command) { int ret; - if (argv[0][0] == '-') - return dyn_event_release(argc, argv, &trace_uprobe_ops); + if (raw_command[0] == '-') + return dyn_event_release(raw_command, &trace_uprobe_ops); - ret = trace_uprobe_create(argc, (const char **)argv); + ret = trace_uprobe_create(raw_command); return ret == -ECANCELED ? -EINVAL : ret; } @@ -961,7 +966,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = esize + tu->tp.size + dsize; event = trace_event_buffer_lock_reserve(&buffer, trace_file, - call->event.type, size, 0, 0); + call->event.type, size, 0); if (!event) return; @@ -977,7 +982,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, memcpy(data, ucb->buf, tu->tp.size + dsize); - event_trigger_unlock_commit(trace_file, buffer, event, entry, 0, 0); + event_trigger_unlock_commit(trace_file, buffer, event, entry, 0); } /* uprobe handler */ @@ -1635,7 +1640,7 @@ void destroy_local_trace_uprobe(struct trace_event_call *event_call) } #endif /* CONFIG_PERF_EVENTS */ -/* Make a trace interface for controling probe points */ +/* Make a trace interface for controlling probe points */ static __init int init_uprobe_trace(void) { int ret; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 7261fa0f5e3c..9f478d29b926 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -53,6 +53,12 @@ struct tp_probes { struct tracepoint_func probes[]; }; +/* Called in removal of a func but failed to allocate a new tp_funcs */ +static void tp_stub_func(void) +{ + return; +} + static inline void *allocate_probes(int count) { struct tp_probes *p = kmalloc(struct_size(p, probes, count), @@ -130,8 +136,9 @@ func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func, int prio) { struct tracepoint_func *old, *new; - int nr_probes = 0; - int pos = -1; + int iter_probes; /* Iterate over old probe array. */ + int nr_probes = 0; /* Counter for probes */ + int pos = -1; /* Insertion position into new array */ if (WARN_ON(!tp_func->func)) return ERR_PTR(-EINVAL); @@ -140,13 +147,13 @@ func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func, old = *funcs; if (old) { /* (N -> N+1), (N != 0, 1) probes */ - for (nr_probes = 0; old[nr_probes].func; nr_probes++) { - /* Insert before probes of lower priority */ - if (pos < 0 && old[nr_probes].prio < prio) - pos = nr_probes; - if (old[nr_probes].func == tp_func->func && - old[nr_probes].data == tp_func->data) + for (iter_probes = 0; old[iter_probes].func; iter_probes++) { + if (old[iter_probes].func == tp_stub_func) + continue; /* Skip stub functions. */ + if (old[iter_probes].func == tp_func->func && + old[iter_probes].data == tp_func->data) return ERR_PTR(-EEXIST); + nr_probes++; } } /* + 2 : one for new probe, one for NULL func */ @@ -154,20 +161,24 @@ func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func, if (new == NULL) return ERR_PTR(-ENOMEM); if (old) { - if (pos < 0) { - pos = nr_probes; - memcpy(new, old, nr_probes * sizeof(struct tracepoint_func)); - } else { - /* Copy higher priority probes ahead of the new probe */ - memcpy(new, old, pos * sizeof(struct tracepoint_func)); - /* Copy the rest after it. */ - memcpy(new + pos + 1, old + pos, - (nr_probes - pos) * sizeof(struct tracepoint_func)); + nr_probes = 0; + for (iter_probes = 0; old[iter_probes].func; iter_probes++) { + if (old[iter_probes].func == tp_stub_func) + continue; + /* Insert before probes of lower priority */ + if (pos < 0 && old[iter_probes].prio < prio) + pos = nr_probes++; + new[nr_probes++] = old[iter_probes]; } - } else + if (pos < 0) + pos = nr_probes++; + /* nr_probes now points to the end of the new array */ + } else { pos = 0; + nr_probes = 1; /* must point at end of array */ + } new[pos] = *tp_func; - new[nr_probes + 1].func = NULL; + new[nr_probes].func = NULL; *funcs = new; debug_print_probes(*funcs); return old; @@ -188,8 +199,9 @@ static void *func_remove(struct tracepoint_func **funcs, /* (N -> M), (N > 1, M >= 0) probes */ if (tp_func->func) { for (nr_probes = 0; old[nr_probes].func; nr_probes++) { - if (old[nr_probes].func == tp_func->func && - old[nr_probes].data == tp_func->data) + if ((old[nr_probes].func == tp_func->func && + old[nr_probes].data == tp_func->data) || + old[nr_probes].func == tp_stub_func) nr_del++; } } @@ -208,14 +220,27 @@ static void *func_remove(struct tracepoint_func **funcs, /* N -> M, (N > 1, M > 0) */ /* + 1 for NULL */ new = allocate_probes(nr_probes - nr_del + 1); - if (new == NULL) - return ERR_PTR(-ENOMEM); - for (i = 0; old[i].func; i++) - if (old[i].func != tp_func->func - || old[i].data != tp_func->data) - new[j++] = old[i]; - new[nr_probes - nr_del].func = NULL; - *funcs = new; + if (new) { + for (i = 0; old[i].func; i++) { + if ((old[i].func != tp_func->func || + old[i].data != tp_func->data) && + old[i].func != tp_stub_func) + new[j++] = old[i]; + } + new[nr_probes - nr_del].func = NULL; + *funcs = new; + } else { + /* + * Failed to allocate, replace the old function + * with calls to tp_stub_func. + */ + for (i = 0; old[i].func; i++) { + if (old[i].func == tp_func->func && + old[i].data == tp_func->data) + WRITE_ONCE(old[i].func, tp_stub_func); + } + *funcs = old; + } } debug_print_probes(*funcs); return old; @@ -295,10 +320,12 @@ static int tracepoint_remove_func(struct tracepoint *tp, tp_funcs = rcu_dereference_protected(tp->funcs, lockdep_is_held(&tracepoints_mutex)); old = func_remove(&tp_funcs, func); - if (IS_ERR(old)) { - WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM); + if (WARN_ON_ONCE(IS_ERR(old))) return PTR_ERR(old); - } + + if (tp_funcs == old) + /* Failed allocating new tp_funcs, replaced func with stub */ + return 0; if (!tp_funcs) { /* Removed last function */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 894bb885b40b..0d150da252e8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2964,8 +2964,8 @@ reflush: if (++flush_cnt == 10 || (flush_cnt % 100 == 0 && flush_cnt <= 1000)) - pr_warn("workqueue %s: drain_workqueue() isn't complete after %u tries\n", - wq->name, flush_cnt); + pr_warn("workqueue %s: %s() isn't complete after %u tries\n", + wq->name, __func__, flush_cnt); mutex_unlock(&wq->mutex); goto reflush; |