diff options
author | Peter Zijlstra <peterz@infradead.org> | 2020-12-09 17:08:45 +0100 |
---|---|---|
committer | Peter Zijlstra <peterz@infradead.org> | 2020-12-09 17:08:45 +0100 |
commit | 2b3c99ee6389d33aff91d9e7a55465d7d1332bbd (patch) | |
tree | 9e1e5d839d80c95854007c958f2c367290bf1090 /kernel | |
parent | 97d62caa32d6d79dadae3f8d19af5c92ea9a589a (diff) | |
parent | 31784cff7ee073b34d6eddabb95e3be2880a425c (diff) |
Merge branch 'locking/rwsem'
Diffstat (limited to 'kernel')
36 files changed, 412 insertions, 234 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index bdc8cd1b6767..c1b9f71ee6aa 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,6 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := core.o -CFLAGS_core.o += $(call cc-disable-warning, override-init) +ifneq ($(CONFIG_BPF_JIT_ALWAYS_ON),y) +# ___bpf_prog_run() needs GCSE disabled on x86; see 3193c0836f203 for details +cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse +endif +CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 78ea8a7bd27f..56cc5a915f67 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -13,6 +13,7 @@ #include <linux/bpf_verifier.h> #include <net/bpf_sk_storage.h> #include <linux/bpf_local_storage.h> +#include <linux/btf_ids.h> /* For every LSM hook that allows attachment of BPF programs, declare a nop * function where a BPF program can be attached. @@ -26,7 +27,11 @@ noinline RET bpf_lsm_##NAME(__VA_ARGS__) \ #include <linux/lsm_hook_defs.h> #undef LSM_HOOK -#define BPF_LSM_SYM_PREFX "bpf_lsm_" +#define LSM_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME) +BTF_SET_START(bpf_lsm_hooks) +#include <linux/lsm_hook_defs.h> +#undef LSM_HOOK +BTF_SET_END(bpf_lsm_hooks) int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) @@ -37,8 +42,7 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, return -EINVAL; } - if (strncmp(BPF_LSM_SYM_PREFX, prog->aux->attach_func_name, - sizeof(BPF_LSM_SYM_PREFX) - 1)) { + if (!btf_id_set_contains(&bpf_lsm_hooks, prog->aux->attach_btf_id)) { bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n", prog->aux->attach_btf_id, prog->aux->attach_func_name); return -EINVAL; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9268d77898b7..55454d2278b1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1369,7 +1369,7 @@ u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) * * Decode and execute eBPF instructions. */ -static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) +static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 1815e97d4c9c..1fccba6e88c4 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -821,6 +821,32 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, } } +static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, + void *value, bool onallcpus) +{ + /* When using prealloc and not setting the initial value on all cpus, + * zero-fill element values for other cpus (just as what happens when + * not using prealloc). Otherwise, bpf program has no way to ensure + * known initial values for cpus other than current one + * (onallcpus=false always when coming from bpf prog). + */ + if (htab_is_prealloc(htab) && !onallcpus) { + u32 size = round_up(htab->map.value_size, 8); + int current_cpu = raw_smp_processor_id(); + int cpu; + + for_each_possible_cpu(cpu) { + if (cpu == current_cpu) + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value, + size); + else + memset(per_cpu_ptr(pptr, cpu), 0, size); + } + } else { + pcpu_copy_value(htab, pptr, value, onallcpus); + } +} + static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) { return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && @@ -891,7 +917,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, } } - pcpu_copy_value(htab, pptr, value, onallcpus); + pcpu_init_value(htab, pptr, value, onallcpus); if (!prealloc) htab_elem_set_ptr(l_new, key_size, pptr); @@ -1183,7 +1209,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), value, onallcpus); } else { - pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size), + pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size), value, onallcpus); hlist_nulls_add_head_rcu(&l_new->hash_node, head); l_new = NULL; diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig index ace49111d3a3..26bced262473 100644 --- a/kernel/bpf/preload/Kconfig +++ b/kernel/bpf/preload/Kconfig @@ -6,6 +6,7 @@ config USERMODE_DRIVER menuconfig BPF_PRELOAD bool "Preload BPF file system with kernel specific program and map iterators" depends on BPF + depends on BPF_SYSCALL # The dependency on !COMPILE_TEST prevents it from being enabled # in allmodconfig or allyesconfig configurations depends on !COMPILE_TEST diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6200519582a6..1388bf733071 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7786,9 +7786,11 @@ static int check_return_code(struct bpf_verifier_env *env) struct tnum range = tnum_range(0, 1); enum bpf_prog_type prog_type = resolve_prog_type(env->prog); int err; + const bool is_subprog = env->cur_state->frame[0]->subprogno; /* LSM and struct_ops func-ptr's return type could be "void" */ - if ((prog_type == BPF_PROG_TYPE_STRUCT_OPS || + if (!is_subprog && + (prog_type == BPF_PROG_TYPE_STRUCT_OPS || prog_type == BPF_PROG_TYPE_LSM) && !prog->aux->attach_func_proto->type) return 0; @@ -7808,6 +7810,16 @@ static int check_return_code(struct bpf_verifier_env *env) return -EACCES; } + reg = cur_regs(env) + BPF_REG_0; + if (is_subprog) { + if (reg->type != SCALAR_VALUE) { + verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n", + reg_type_str[reg->type]); + return -EINVAL; + } + return 0; + } + switch (prog_type) { case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || @@ -7861,7 +7873,6 @@ static int check_return_code(struct bpf_verifier_env *env) return 0; } - reg = cur_regs(env) + BPF_REG_0; if (reg->type != SCALAR_VALUE) { verbose(env, "At program exit the register R0 is not a known value (%s)\n", reg_type_str[reg->type]); @@ -9572,12 +9583,13 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn_aux_data *aux) { - u32 datasec_id, type, id = insn->imm; const struct btf_var_secinfo *vsi; const struct btf_type *datasec; const struct btf_type *t; const char *sym_name; bool percpu = false; + u32 type, id = insn->imm; + s32 datasec_id; u64 addr; int i; diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index b4eea0abc3f0..781b9dca197c 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -229,6 +229,7 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; } io_tlb_index = 0; + no_iotlb_memory = false; if (verbose) swiotlb_print_info(); @@ -260,9 +261,11 @@ swiotlb_init(int verbose) if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) return; - if (io_tlb_start) + if (io_tlb_start) { memblock_free_early(io_tlb_start, PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); + io_tlb_start = 0; + } pr_warn("Cannot allocate buffer"); no_iotlb_memory = true; } @@ -360,6 +363,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; } io_tlb_index = 0; + no_iotlb_memory = false; swiotlb_print_info(); @@ -441,14 +445,11 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, } } -phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, - dma_addr_t tbl_dma_addr, - phys_addr_t orig_addr, - size_t mapping_size, - size_t alloc_size, - enum dma_data_direction dir, - unsigned long attrs) +phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, + size_t mapping_size, size_t alloc_size, + enum dma_data_direction dir, unsigned long attrs) { + dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(hwdev, io_tlb_start); unsigned long flags; phys_addr_t tlb_addr; unsigned int nslots, stride, index, wrap; @@ -667,9 +668,8 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size, swiotlb_force); - swiotlb_addr = swiotlb_tbl_map_single(dev, - phys_to_dma_unencrypted(dev, io_tlb_start), - paddr, size, size, dir, attrs); + swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, dir, + attrs); if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; diff --git a/kernel/events/core.c b/kernel/events/core.c index 5a29ab09e72d..dc568ca295bd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2312,9 +2312,6 @@ group_sched_out(struct perf_event *group_event, event_sched_out(event, cpuctx, ctx); perf_pmu_enable(ctx->pmu); - - if (group_event->attr.exclusive) - cpuctx->exclusive = 0; } #define DETACH_GROUP 0x01UL @@ -2583,11 +2580,8 @@ group_sched_in(struct perf_event *group_event, pmu->start_txn(pmu, PERF_PMU_TXN_ADD); - if (event_sched_in(group_event, cpuctx, ctx)) { - pmu->cancel_txn(pmu); - perf_mux_hrtimer_restart(cpuctx); - return -EAGAIN; - } + if (event_sched_in(group_event, cpuctx, ctx)) + goto error; /* * Schedule in siblings as one group (if any): @@ -2616,10 +2610,8 @@ group_error: } event_sched_out(group_event, cpuctx, ctx); +error: pmu->cancel_txn(pmu); - - perf_mux_hrtimer_restart(cpuctx); - return -EAGAIN; } @@ -2645,7 +2637,7 @@ static int group_can_go_on(struct perf_event *event, * If this group is exclusive and there are already * events on the CPU, it can't go on. */ - if (event->attr.exclusive && cpuctx->active_oncpu) + if (event->attr.exclusive && !list_empty(get_event_list(event))) return 0; /* * Otherwise, try to add it if all previous groups were able @@ -3679,6 +3671,7 @@ static int merge_sched_in(struct perf_event *event, void *data) *can_add_hw = 0; ctx->rotate_necessary = 1; + perf_mux_hrtimer_restart(cpuctx); } return 0; @@ -6374,14 +6367,13 @@ perf_output_sample_regs(struct perf_output_handle *handle, } static void perf_sample_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { if (user_mode(regs)) { regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; } else if (!(current->flags & PF_KTHREAD)) { - perf_get_regs_user(regs_user, regs, regs_user_copy); + perf_get_regs_user(regs_user, regs); } else { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; regs_user->regs = NULL; @@ -7083,8 +7075,7 @@ void perf_prepare_sample(struct perf_event_header *header, } if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) - perf_sample_regs_user(&data->regs_user, regs, - &data->regs_user_copy); + perf_sample_regs_user(&data->regs_user, regs); if (sample_type & PERF_SAMPLE_REGS_USER) { /* regs dump ABI info */ @@ -7186,6 +7177,7 @@ __perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs, int (*output_begin)(struct perf_output_handle *, + struct perf_sample_data *, struct perf_event *, unsigned int)) { @@ -7198,7 +7190,7 @@ __perf_event_output(struct perf_event *event, perf_prepare_sample(&header, data, event, regs); - err = output_begin(&handle, event, header.size); + err = output_begin(&handle, data, event, header.size); if (err) goto exit; @@ -7264,7 +7256,7 @@ perf_event_read_event(struct perf_event *event, int ret; perf_event_header__init_id(&read_event.header, &sample, event); - ret = perf_output_begin(&handle, event, read_event.header.size); + ret = perf_output_begin(&handle, &sample, event, read_event.header.size); if (ret) return; @@ -7533,7 +7525,7 @@ static void perf_event_task_output(struct perf_event *event, perf_event_header__init_id(&task_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, task_event->event_id.header.size); if (ret) goto out; @@ -7636,7 +7628,7 @@ static void perf_event_comm_output(struct perf_event *event, return; perf_event_header__init_id(&comm_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, comm_event->event_id.header.size); if (ret) @@ -7736,7 +7728,7 @@ static void perf_event_namespaces_output(struct perf_event *event, perf_event_header__init_id(&namespaces_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, namespaces_event->event_id.header.size); if (ret) goto out; @@ -7863,7 +7855,7 @@ static void perf_event_cgroup_output(struct perf_event *event, void *data) perf_event_header__init_id(&cgroup_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, cgroup_event->event_id.header.size); if (ret) goto out; @@ -7989,7 +7981,7 @@ static void perf_event_mmap_output(struct perf_event *event, } perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, mmap_event->event_id.header.size); if (ret) goto out; @@ -8299,7 +8291,7 @@ void perf_event_aux_event(struct perf_event *event, unsigned long head, int ret; perf_event_header__init_id(&rec.header, &sample, event); - ret = perf_output_begin(&handle, event, rec.header.size); + ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; @@ -8333,7 +8325,7 @@ void perf_log_lost_samples(struct perf_event *event, u64 lost) perf_event_header__init_id(&lost_samples_event.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, lost_samples_event.header.size); if (ret) return; @@ -8388,7 +8380,7 @@ static void perf_event_switch_output(struct perf_event *event, void *data) perf_event_header__init_id(&se->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, se->event_id.header.size); + ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size); if (ret) return; @@ -8463,7 +8455,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_event_header__init_id(&throttle_event.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, throttle_event.header.size); if (ret) return; @@ -8506,7 +8498,7 @@ static void perf_event_ksymbol_output(struct perf_event *event, void *data) perf_event_header__init_id(&ksymbol_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, ksymbol_event->event_id.header.size); if (ret) return; @@ -8596,7 +8588,7 @@ static void perf_event_bpf_output(struct perf_event *event, void *data) perf_event_header__init_id(&bpf_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, data, event, bpf_event->event_id.header.size); if (ret) return; @@ -8705,7 +8697,8 @@ static void perf_event_text_poke_output(struct perf_event *event, void *data) perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, text_poke_event->event_id.header.size); + ret = perf_output_begin(&handle, &sample, event, + text_poke_event->event_id.header.size); if (ret) return; @@ -8786,7 +8779,7 @@ static void perf_log_itrace_start(struct perf_event *event) rec.tid = perf_event_tid(event, current); perf_event_header__init_id(&rec.header, &sample, event); - ret = perf_output_begin(&handle, event, rec.header.size); + ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; diff --git a/kernel/events/internal.h b/kernel/events/internal.h index fcbf5616a441..228801e20788 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -205,16 +205,12 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) static inline int get_recursion_context(int *recursion) { - int rctx; - - if (unlikely(in_nmi())) - rctx = 3; - else if (in_irq()) - rctx = 2; - else if (in_softirq()) - rctx = 1; - else - rctx = 0; + unsigned int pc = preempt_count(); + unsigned char rctx = 0; + + rctx += !!(pc & (NMI_MASK)); + rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK)); + rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); if (recursion[rctx]) return -1; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 192b8abc6330..ef91ae75ca56 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -147,6 +147,7 @@ ring_buffer_has_space(unsigned long head, unsigned long tail, static __always_inline int __perf_output_begin(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size, bool backward) { @@ -237,18 +238,16 @@ __perf_output_begin(struct perf_output_handle *handle, handle->size = (1UL << page_shift) - offset; if (unlikely(have_lost)) { - struct perf_sample_data sample_data; - lost_event.header.size = sizeof(lost_event); lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; lost_event.id = event->id; lost_event.lost = local_xchg(&rb->lost, 0); - perf_event_header__init_id(&lost_event.header, - &sample_data, event); + /* XXX mostly redundant; @data is already fully initializes */ + perf_event_header__init_id(&lost_event.header, data, event); perf_output_put(handle, lost_event); - perf_event__output_id_sample(event, handle, &sample_data); + perf_event__output_id_sample(event, handle, data); } return 0; @@ -263,22 +262,25 @@ out: } int perf_output_begin_forward(struct perf_output_handle *handle, - struct perf_event *event, unsigned int size) + struct perf_sample_data *data, + struct perf_event *event, unsigned int size) { - return __perf_output_begin(handle, event, size, false); + return __perf_output_begin(handle, data, event, size, false); } int perf_output_begin_backward(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size) { - return __perf_output_begin(handle, event, size, true); + return __perf_output_begin(handle, data, event, size, true); } int perf_output_begin(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size) { - return __perf_output_begin(handle, event, size, + return __perf_output_begin(handle, data, event, size, unlikely(is_write_backward(event))); } diff --git a/kernel/exit.c b/kernel/exit.c index 87a2d515de0d..1f236ed375f8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -454,7 +454,10 @@ static void exit_mm(void) mmap_read_unlock(mm); self.task = current; - self.next = xchg(&core_state->dumper.next, &self); + if (self.task->flags & PF_SIGNALED) + self.next = xchg(&core_state->dumper.next, &self); + else + self.task = NULL; /* * Implies mb(), the result of xchg() must be visible * to core_state->dumper. diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 63b349168da7..b0b1ad93fa95 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -253,7 +253,7 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, if (copy_from_user(buf, buffer, count)) { ret = -EFAULT; - goto out; + goto out_free; } buf[count] = '\0'; sym = strstrip(buf); @@ -307,8 +307,9 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, ret = count; } out: - kfree(buf); mutex_unlock(&fei_lock); +out_free: + kfree(buf); return ret; } diff --git a/kernel/futex.c b/kernel/futex.c index aee6ce294d84..c47d1015d759 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -786,8 +786,9 @@ static void put_pi_state(struct futex_pi_state *pi_state) */ if (pi_state->owner) { struct task_struct *owner; + unsigned long flags; - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags); owner = pi_state->owner; if (owner) { raw_spin_lock(&owner->pi_lock); @@ -795,7 +796,7 @@ static void put_pi_state(struct futex_pi_state *pi_state) raw_spin_unlock(&owner->pi_lock); } rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner); - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags); } if (current->pi_state_cache) { diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index b71ad8d9f1c9..c1418b47f625 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -108,19 +108,21 @@ static inline void lockdep_lock(void) { DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + __this_cpu_inc(lockdep_recursion); arch_spin_lock(&__lock); __owner = current; - __this_cpu_inc(lockdep_recursion); } static inline void lockdep_unlock(void) { + DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + if (debug_locks && DEBUG_LOCKS_WARN_ON(__owner != current)) return; - __this_cpu_dec(lockdep_recursion); __owner = NULL; arch_spin_unlock(&__lock); + __this_cpu_dec(lockdep_recursion); } static inline bool lockdep_assert_locked(void) @@ -2765,7 +2767,9 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, * (Note that this has to be done separately, because the graph cannot * detect such classes of deadlocks.) * - * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read + * Returns: 0 on deadlock detected, 1 on OK, 2 if another lock with the same + * lock class is held but nest_lock is also held, i.e. we rely on the + * nest_lock to avoid the deadlock. */ static int check_deadlock(struct task_struct *curr, struct held_lock *next) @@ -2788,7 +2792,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next) * lock class (i.e. read_lock(lock)+read_lock(lock)): */ if ((next->read == 2) && prev->read) - return 2; + continue; /* * We're holding the nest_lock, which serializes this lock's @@ -3593,15 +3597,12 @@ static int validate_chain(struct task_struct *curr, if (!ret) return 0; /* - * Mark recursive read, as we jump over it when - * building dependencies (just like we jump over - * trylock entries): - */ - if (ret == 2) - hlock->read = 2; - /* * Add dependency only if this lock is not the head - * of the chain, and if it's not a secondary read-lock: + * of the chain, and if the new lock introduces no more + * lock dependency (because we already hold a lock with the + * same lock class) nor deadlock (because the nest_lock + * serializes nesting locks), see the comments for + * check_deadlock(). */ if (!chain_head && ret != 2) { if (!check_prevs_add(curr, hlock)) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index f11b9bd3431d..a163542d178e 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1345,6 +1345,18 @@ static inline void __down_read(struct rw_semaphore *sem) } } +static inline int __down_read_interruptible(struct rw_semaphore *sem) +{ + if (!rwsem_read_trylock(sem)) { + if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_INTERRUPTIBLE))) + return -EINTR; + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); + } else { + rwsem_set_reader_owned(sem); + } + return 0; +} + static inline int __down_read_killable(struct rw_semaphore *sem) { if (!rwsem_read_trylock(sem)) { @@ -1495,6 +1507,20 @@ void __sched down_read(struct rw_semaphore *sem) } EXPORT_SYMBOL(down_read); +int __sched down_read_interruptible(struct rw_semaphore *sem) +{ + might_sleep(); + rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_interruptible)) { + rwsem_release(&sem->dep_map, _RET_IP_); + return -EINTR; + } + + return 0; +} +EXPORT_SYMBOL(down_read_interruptible); + int __sched down_read_killable(struct rw_semaphore *sem) { might_sleep(); @@ -1605,6 +1631,20 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) } EXPORT_SYMBOL(down_read_nested); +int down_read_killable_nested(struct rw_semaphore *sem, int subclass) +{ + might_sleep(); + rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) { + rwsem_release(&sem->dep_map, _RET_IP_); + return -EINTR; + } + + return 0; +} +EXPORT_SYMBOL(down_read_killable_nested); + void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) { might_sleep(); diff --git a/kernel/panic.c b/kernel/panic.c index 396142ee43fd..332736a72a58 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -605,7 +605,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, panic("panic_on_warn set ...\n"); } - dump_stack(); + if (!regs) + dump_stack(); print_irqtrace_events(current); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fe64a49344bf..bc1e3b5a97bd 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -528,8 +528,8 @@ static int log_store(u32 caller_id, int facility, int level, if (dev_info) memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); - /* insert message */ - if ((flags & LOG_CONT) || !(flags & LOG_NEWLINE)) + /* A message without a trailing newline can be continued. */ + if (!(flags & LOG_NEWLINE)) prb_commit(&e); else prb_final_commit(&e); diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 6b1525685277..74e25a1704f2 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -882,8 +882,6 @@ static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out) head_id = atomic_long_read(&desc_ring->head_id); /* LMM(desc_reserve:A) */ do { - desc = to_desc(desc_ring, head_id); - id = DESC_ID(head_id + 1); id_prev_wrap = DESC_ID_PREV_WRAP(desc_ring, id); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 43d6179508d6..79de1294f8eb 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -264,17 +264,11 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) return ret; } -static bool ptrace_has_cap(const struct cred *cred, struct user_namespace *ns, - unsigned int mode) +static bool ptrace_has_cap(struct user_namespace *ns, unsigned int mode) { - int ret; - if (mode & PTRACE_MODE_NOAUDIT) - ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT); - else - ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NONE); - - return ret == 0; + return ns_capable_noaudit(ns, CAP_SYS_PTRACE); + return ns_capable(ns, CAP_SYS_PTRACE); } /* Returns 0 on success, -errno on denial. */ @@ -326,7 +320,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) gid_eq(caller_gid, tcred->sgid) && gid_eq(caller_gid, tcred->gid)) goto ok; - if (ptrace_has_cap(cred, tcred->user_ns, mode)) + if (ptrace_has_cap(tcred->user_ns, mode)) goto ok; rcu_read_unlock(); return -EPERM; @@ -345,7 +339,7 @@ ok: mm = task->mm; if (mm && ((get_dumpable(mm) != SUID_DUMP_USER) && - !ptrace_has_cap(cred, mm->user_ns, mode))) + !ptrace_has_cap(mm->user_ns, mode))) return -EPERM; return security_ptrace_access_check(task, mode); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2a52f42f64b6..bd04b09b84b3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4077,7 +4077,6 @@ void rcu_cpu_starting(unsigned int cpu) smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } -#ifdef CONFIG_HOTPLUG_CPU /* * The outgoing function has no further need of RCU, so remove it from * the rcu_node tree's ->qsmaskinitnext bit masks. @@ -4117,6 +4116,7 @@ void rcu_report_dead(unsigned int cpu) rdp->cpu_started = false; } +#ifdef CONFIG_HOTPLUG_CPU /* * The outgoing CPU has just passed through the dying-idle state, and we * are being invoked from the CPU that was IPIed to continue the offline diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 0fde39b8daab..ca21d28a0f98 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -249,13 +249,16 @@ static bool check_slow_task(struct task_struct *t, void *arg) /* * Scan the current list of tasks blocked within RCU read-side critical - * sections, printing out the tid of each. + * sections, printing out the tid of each of the first few of them. */ -static int rcu_print_task_stall(struct rcu_node *rnp) +static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) { + int i = 0; int ndetected = 0; struct rcu_stall_chk_rdr rscr; struct task_struct *t; + struct task_struct *ts[8]; if (!rcu_preempt_blocked_readers_cgp(rnp)) return 0; @@ -264,6 +267,14 @@ static int rcu_print_task_stall(struct rcu_node *rnp) t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + get_task_struct(t); + ts[i++] = t; + if (i >= ARRAY_SIZE(ts)) + break; + } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + for (i--; i; i--) { + t = ts[i]; if (!try_invoke_on_locked_down_task(t, check_slow_task, &rscr)) pr_cont(" P%d", t->pid); else @@ -273,6 +284,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) ".q"[rscr.rs.b.need_qs], ".e"[rscr.rs.b.exp_hint], ".l"[rscr.on_blkd_list]); + put_task_struct(t); ndetected++; } pr_cont("\n"); @@ -293,8 +305,9 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections. */ -static int rcu_print_task_stall(struct rcu_node *rnp) +static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return 0; } #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ @@ -472,7 +485,6 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name); rcu_for_each_leaf_node(rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); - ndetected += rcu_print_task_stall(rnp); if (rnp->qsmask != 0) { for_each_leaf_node_possible_cpu(rnp, cpu) if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { @@ -480,7 +492,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) ndetected++; } } - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + ndetected += rcu_print_task_stall(rnp, flags); // Releases rnp->lock. } for_each_possible_cpu(cpu) diff --git a/kernel/reboot.c b/kernel/reboot.c index e7b78d5ae1ab..af6f23d8bea1 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -551,22 +551,22 @@ static int __init reboot_setup(char *str) break; case 's': - { - int rc; - - if (isdigit(*(str+1))) { - rc = kstrtoint(str+1, 0, &reboot_cpu); - if (rc) - return rc; - } else if (str[1] == 'm' && str[2] == 'p' && - isdigit(*(str+3))) { - rc = kstrtoint(str+3, 0, &reboot_cpu); - if (rc) - return rc; - } else + if (isdigit(*(str+1))) + reboot_cpu = simple_strtoul(str+1, NULL, 0); + else if (str[1] == 'm' && str[2] == 'p' && + isdigit(*(str+3))) + reboot_cpu = simple_strtoul(str+3, NULL, 0); + else *mode = REBOOT_SOFT; + if (reboot_cpu >= num_possible_cpus()) { + pr_err("Ignoring the CPU number in reboot= option. " + "CPU %d exceeds possible cpu number %d\n", + reboot_cpu, num_possible_cpus()); + reboot_cpu = 0; + break; + } break; - } + case 'g': *mode = REBOOT_GPIO; break; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d2003a7d5ab5..e7e453492cff 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2501,7 +2501,12 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, #ifdef CONFIG_SMP if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; + else #endif + if (p->in_iowait) { + delayacct_blkio_end(p); + atomic_dec(&task_rq(p)->nr_iowait); + } activate_task(rq, p, en_flags); ttwu_do_wakeup(rq, p, wake_flags, rf); @@ -2888,11 +2893,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) goto unlock; - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&task_rq(p)->nr_iowait); - } - #ifdef CONFIG_SMP /* * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be @@ -2963,6 +2963,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { + if (p->in_iowait) { + delayacct_blkio_end(p); + atomic_dec(&task_rq(p)->nr_iowait); + } + wake_flags |= WF_MIGRATED; psi_ttwu_dequeue(p); set_task_cpu(p, cpu); @@ -4907,20 +4912,21 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) if (!dl_prio(p->normal_prio) || (pi_task && dl_prio(pi_task->prio) && dl_entity_preempt(&pi_task->dl, &p->dl))) { - p->dl.dl_boosted = 1; + p->dl.pi_se = pi_task->dl.pi_se; queue_flag |= ENQUEUE_REPLENISH; - } else - p->dl.dl_boosted = 0; + } else { + p->dl.pi_se = &p->dl; + } p->sched_class = &dl_sched_class; } else if (rt_prio(prio)) { if (dl_prio(oldprio)) - p->dl.dl_boosted = 0; + p->dl.pi_se = &p->dl; if (oldprio < prio) queue_flag |= ENQUEUE_HEAD; p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) - p->dl.dl_boosted = 0; + p->dl.pi_se = &p->dl; if (rt_prio(oldprio)) p->rt.timeout = 0; p->sched_class = &fair_sched_class; diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index d73bccde2720..97d318b0cd0c 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -881,7 +881,7 @@ static void sugov_limits(struct cpufreq_policy *policy) struct cpufreq_governor schedutil_gov = { .name = "schedutil", .owner = THIS_MODULE, - .dynamic_switching = true, + .flags = CPUFREQ_GOV_DYNAMIC_SWITCHING, .init = sugov_init, .exit = sugov_exit, .start = sugov_start, diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f232305dcefe..1d3c97268ec0 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -43,6 +43,28 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se) return !RB_EMPTY_NODE(&dl_se->rb_node); } +#ifdef CONFIG_RT_MUTEXES +static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) +{ + return dl_se->pi_se; +} + +static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) +{ + return pi_of(dl_se) != dl_se; +} +#else +static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) +{ + return dl_se; +} + +static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) +{ + return false; +} +#endif + #ifdef CONFIG_SMP static inline struct dl_bw *dl_bw_of(int i) { @@ -698,7 +720,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - WARN_ON(dl_se->dl_boosted); + WARN_ON(is_dl_boosted(dl_se)); WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); /* @@ -736,21 +758,20 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) * could happen are, typically, a entity voluntarily trying to overcome its * runtime, or it just underestimated it during sched_setattr(). */ -static void replenish_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se) +static void replenish_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - BUG_ON(pi_se->dl_runtime <= 0); + BUG_ON(pi_of(dl_se)->dl_runtime <= 0); /* * This could be the case for a !-dl task that is boosted. * Just go with full inherited parameters. */ if (dl_se->dl_deadline == 0) { - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; } if (dl_se->dl_yielded && dl_se->runtime > 0) @@ -763,8 +784,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * arbitrary large. */ while (dl_se->runtime <= 0) { - dl_se->deadline += pi_se->dl_period; - dl_se->runtime += pi_se->dl_runtime; + dl_se->deadline += pi_of(dl_se)->dl_period; + dl_se->runtime += pi_of(dl_se)->dl_runtime; } /* @@ -778,8 +799,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, */ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { printk_deferred_once("sched: DL replenish lagged too much\n"); - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; } if (dl_se->dl_yielded) @@ -812,8 +833,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * task with deadline equal to period this is the same of using * dl_period instead of dl_deadline in the equation above. */ -static bool dl_entity_overflow(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se, u64 t) +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t) { u64 left, right; @@ -835,9 +855,9 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, * of anything below microseconds resolution is actually fiction * (but still we want to give the user that illusion >;). */ - left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); + left = (pi_of(dl_se)->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); right = ((dl_se->deadline - t) >> DL_SCALE) * - (pi_se->dl_runtime >> DL_SCALE); + (pi_of(dl_se)->dl_runtime >> DL_SCALE); return dl_time_before(right, left); } @@ -922,24 +942,23 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) * Please refer to the comments update_dl_revised_wakeup() function to find * more about the Revised CBS rule. */ -static void update_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se) +static void update_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); if (dl_time_before(dl_se->deadline, rq_clock(rq)) || - dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { + dl_entity_overflow(dl_se, rq_clock(rq))) { if (unlikely(!dl_is_implicit(dl_se) && !dl_time_before(dl_se->deadline, rq_clock(rq)) && - !dl_se->dl_boosted)){ + !is_dl_boosted(dl_se))) { update_dl_revised_wakeup(dl_se, rq); return; } - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; } } @@ -1038,7 +1057,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * The task might have been boosted by someone else and might be in the * boosting/deboosting path, its not throttled. */ - if (dl_se->dl_boosted) + if (is_dl_boosted(dl_se)) goto unlock; /* @@ -1066,7 +1085,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * but do not enqueue -- wait for our wakeup to do that. */ if (!task_on_rq_queued(p)) { - replenish_dl_entity(dl_se, dl_se); + replenish_dl_entity(dl_se); goto unlock; } @@ -1156,7 +1175,7 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) if (dl_time_before(dl_se->deadline, rq_clock(rq)) && dl_time_before(rq_clock(rq), dl_next_period(dl_se))) { - if (unlikely(dl_se->dl_boosted || !start_dl_timer(p))) + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p))) return; dl_se->dl_throttled = 1; if (dl_se->runtime > 0) @@ -1287,7 +1306,7 @@ throttle: dl_se->dl_overrun = 1; __dequeue_task_dl(rq, curr, 0); - if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr))) enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); if (!is_leftmost(curr, &rq->dl)) @@ -1481,8 +1500,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) } static void -enqueue_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se, int flags) +enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) { BUG_ON(on_dl_rq(dl_se)); @@ -1493,9 +1511,9 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, */ if (flags & ENQUEUE_WAKEUP) { task_contending(dl_se, flags); - update_dl_entity(dl_se, pi_se); + update_dl_entity(dl_se); } else if (flags & ENQUEUE_REPLENISH) { - replenish_dl_entity(dl_se, pi_se); + replenish_dl_entity(dl_se); } else if ((flags & ENQUEUE_RESTORE) && dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) { @@ -1512,19 +1530,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se) static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) { - struct task_struct *pi_task = rt_mutex_get_top_task(p); - struct sched_dl_entity *pi_se = &p->dl; - - /* - * Use the scheduling parameters of the top pi-waiter task if: - * - we have a top pi-waiter which is a SCHED_DEADLINE task AND - * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is - * smaller than our deadline OR we are a !SCHED_DEADLINE task getting - * boosted due to a SCHED_DEADLINE pi-waiter). - * Otherwise we keep our runtime and deadline. - */ - if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) { - pi_se = &pi_task->dl; + if (is_dl_boosted(&p->dl)) { /* * Because of delays in the detection of the overrun of a * thread's runtime, it might be the case that a thread @@ -1557,7 +1563,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * the throttle. */ p->dl.dl_throttled = 0; - BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); + BUG_ON(!is_dl_boosted(&p->dl) || flags != ENQUEUE_REPLENISH); return; } @@ -1594,7 +1600,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) return; } - enqueue_dl_entity(&p->dl, pi_se, flags); + enqueue_dl_entity(&p->dl, flags); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); @@ -2787,11 +2793,14 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_bw = 0; dl_se->dl_density = 0; - dl_se->dl_boosted = 0; dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; dl_se->dl_non_contending = 0; dl_se->dl_overrun = 0; + +#ifdef CONFIG_RT_MUTEXES + dl_se->pi_se = dl_se; +#endif } bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0655524700d2..2357921580f9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -251,7 +251,7 @@ static int sd_ctl_doflags(struct ctl_table *table, int write, unsigned long flags = *(unsigned long *)table->data; size_t data_size = 0; size_t len = 0; - char *tmp; + char *tmp, *buf; int idx; if (write) @@ -269,17 +269,17 @@ static int sd_ctl_doflags(struct ctl_table *table, int write, return 0; } - tmp = kcalloc(data_size + 1, sizeof(*tmp), GFP_KERNEL); - if (!tmp) + buf = kcalloc(data_size + 1, sizeof(*buf), GFP_KERNEL); + if (!buf) return -ENOMEM; for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { char *name = sd_flag_debug[idx].name; - len += snprintf(tmp + len, strlen(name) + 2, "%s ", name); + len += snprintf(buf + len, strlen(name) + 2, "%s ", name); } - tmp += *ppos; + tmp = buf + *ppos; len -= *ppos; if (len > *lenp) @@ -294,7 +294,7 @@ static int sd_ctl_doflags(struct ctl_table *table, int write, *lenp = len; *ppos += len; - kfree(tmp); + kfree(buf); return 0; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 290f9e38378c..ae7ceba8fd4f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5477,6 +5477,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; int idle_h_nr_running = task_has_idle_policy(p); + int task_new = !(flags & ENQUEUE_WAKEUP); /* * The code below (indirectly) updates schedutil which looks at @@ -5549,7 +5550,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * into account, but that is not straightforward to implement, * and the following generally works well enough in practice. */ - if (flags & ENQUEUE_WAKEUP) + if (!task_new) update_overutilized_status(rq); enqueue_throttle: @@ -6172,21 +6173,21 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t static int select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) { - unsigned long best_cap = 0; + unsigned long task_util, best_cap = 0; int cpu, best_cpu = -1; struct cpumask *cpus; - sync_entity_load_avg(&p->se); - cpus = this_cpu_cpumask_var_ptr(select_idle_mask); cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + task_util = uclamp_task_util(p); + for_each_cpu_wrap(cpu, cpus, target) { unsigned long cpu_cap = capacity_of(cpu); if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) continue; - if (task_fits_capacity(p, cpu_cap)) + if (fits_capacity(task_util, cpu_cap)) return cpu; if (cpu_cap > best_cap) { @@ -6198,44 +6199,42 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) return best_cpu; } +static inline bool asym_fits_capacity(int task_util, int cpu) +{ + if (static_branch_unlikely(&sched_asym_cpucapacity)) + return fits_capacity(task_util, capacity_of(cpu)); + + return true; +} + /* * Try and locate an idle core/thread in the LLC cache domain. */ static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; + unsigned long task_util; int i, recent_used_cpu; /* - * For asymmetric CPU capacity systems, our domain of interest is - * sd_asym_cpucapacity rather than sd_llc. + * On asymmetric system, update task utilization because we will check + * that the task fits with cpu's capacity. */ if (static_branch_unlikely(&sched_asym_cpucapacity)) { - sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); - /* - * On an asymmetric CPU capacity system where an exclusive - * cpuset defines a symmetric island (i.e. one unique - * capacity_orig value through the cpuset), the key will be set - * but the CPUs within that cpuset will not have a domain with - * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric - * capacity path. - */ - if (!sd) - goto symmetric; - - i = select_idle_capacity(p, sd, target); - return ((unsigned)i < nr_cpumask_bits) ? i : target; + sync_entity_load_avg(&p->se); + task_util = uclamp_task_util(p); } -symmetric: - if (available_idle_cpu(target) || sched_idle_cpu(target)) + if ((available_idle_cpu(target) || sched_idle_cpu(target)) && + asym_fits_capacity(task_util, target)) return target; /* * If the previous CPU is cache affine and idle, don't be stupid: */ if (prev != target && cpus_share_cache(prev, target) && - (available_idle_cpu(prev) || sched_idle_cpu(prev))) + (available_idle_cpu(prev) || sched_idle_cpu(prev)) && + asym_fits_capacity(task_util, prev)) return prev; /* @@ -6258,7 +6257,8 @@ symmetric: recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && - cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) { + cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) && + asym_fits_capacity(task_util, recent_used_cpu)) { /* * Replace recent_used_cpu with prev as it is a potential * candidate for the next wake: @@ -6267,6 +6267,26 @@ symmetric: return recent_used_cpu; } + /* + * For asymmetric CPU capacity systems, our domain of interest is + * sd_asym_cpucapacity rather than sd_llc. + */ + if (static_branch_unlikely(&sched_asym_cpucapacity)) { + sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); + /* + * On an asymmetric CPU capacity system where an exclusive + * cpuset defines a symmetric island (i.e. one unique + * capacity_orig value through the cpuset), the key will be set + * but the CPUs within that cpuset will not have a domain with + * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric + * capacity path. + */ + if (sd) { + i = select_idle_capacity(p, sd, target); + return ((unsigned)i < nr_cpumask_bits) ? i : target; + } + } + sd = rcu_dereference(per_cpu(sd_llc, target)); if (!sd) return target; @@ -9031,7 +9051,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * emptying busiest. */ if (local->group_type == group_has_spare) { - if (busiest->group_type > group_fully_busy) { + if ((busiest->group_type > group_fully_busy) && + !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) { /* * If busiest is overloaded, try to fill spare * capacity. This might end up creating spare capacity diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 24d0ee26377d..c6932b8f4467 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -78,7 +78,7 @@ void __weak arch_cpu_idle_dead(void) { } void __weak arch_cpu_idle(void) { cpu_idle_force_poll = 1; - local_irq_enable(); + raw_local_irq_enable(); } /** @@ -94,9 +94,35 @@ void __cpuidle default_idle_call(void) trace_cpu_idle(1, smp_processor_id()); stop_critical_timings(); + + /* + * arch_cpu_idle() is supposed to enable IRQs, however + * we can't do that because of RCU and tracing. + * + * Trace IRQs enable here, then switch off RCU, and have + * arch_cpu_idle() use raw_local_irq_enable(). Note that + * rcu_idle_enter() relies on lockdep IRQ state, so switch that + * last -- this is very similar to the entry code. + */ + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(_THIS_IP_); rcu_idle_enter(); + lockdep_hardirqs_on(_THIS_IP_); + arch_cpu_idle(); + + /* + * OK, so IRQs are enabled here, but RCU needs them disabled to + * turn itself back on.. funny thing is that disabling IRQs + * will cause tracing, which needs RCU. Jump through hoops to + * make it 'work'. + */ + raw_local_irq_disable(); + lockdep_hardirqs_off(_THIS_IP_); rcu_idle_exit(); + lockdep_hardirqs_on(_THIS_IP_); + raw_local_irq_enable(); + start_critical_timings(); trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 8ad7a293255a..53a7d1512dd7 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -38,7 +38,7 @@ #include <linux/filter.h> #include <linux/pid.h> #include <linux/ptrace.h> -#include <linux/security.h> +#include <linux/capability.h> #include <linux/tracehook.h> #include <linux/uaccess.h> #include <linux/anon_inodes.h> @@ -558,8 +558,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) * behavior of privileged children. */ if (!task_no_new_privs(current) && - security_capable(current_cred(), current_user_ns(), - CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) != 0) + !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); /* Allocate a new seccomp_filter */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a4020c0b4508..e1bf5228fb69 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -202,7 +202,7 @@ config DYNAMIC_FTRACE_WITH_REGS config DYNAMIC_FTRACE_WITH_DIRECT_CALLS def_bool y - depends on DYNAMIC_FTRACE + depends on DYNAMIC_FTRACE_WITH_REGS depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS config FUNCTION_PROFILER diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4517c8b66518..048c655315f1 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -181,6 +181,16 @@ bpf_probe_read_user_str_common(void *dst, u32 size, { int ret; + /* + * NB: We rely on strncpy_from_user() not copying junk past the NUL + * terminator into `dst`. + * + * strncpy_from_user() does long-sized strides in the fast path. If the + * strncpy does not mask out the bytes after the NUL in `unsafe_ptr`, + * then there could be junk after the NUL in `dst`. If user takes `dst` + * and keys a hash map with it, then semantically identical strings can + * occupy multiple entries in the map. + */ ret = strncpy_from_user_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); @@ -1198,7 +1208,7 @@ static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, *btf = bpf_get_btf_vmlinux(); if (IS_ERR_OR_NULL(*btf)) - return PTR_ERR(*btf); + return IS_ERR(*btf) ? PTR_ERR(*btf) : -EINVAL; if (ptr->type_id > 0) *btf_id = ptr->type_id; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8185f7240095..9c1bba8cc51b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1629,6 +1629,8 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) static struct ftrace_ops * ftrace_find_tramp_ops_any(struct dyn_ftrace *rec); static struct ftrace_ops * +ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_exclude); +static struct ftrace_ops * ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops); static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, @@ -1778,7 +1780,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, * to it. */ if (ftrace_rec_count(rec) == 1 && - ftrace_find_tramp_ops_any(rec)) + ftrace_find_tramp_ops_any_other(rec, ops)) rec->flags |= FTRACE_FL_TRAMP; else rec->flags &= ~FTRACE_FL_TRAMP; @@ -2245,6 +2247,24 @@ ftrace_find_tramp_ops_any(struct dyn_ftrace *rec) } static struct ftrace_ops * +ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_exclude) +{ + struct ftrace_ops *op; + unsigned long ip = rec->ip; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + + if (op == op_exclude || !op->trampoline) + continue; + + if (hash_contains_ip(ip, op->func_hash)) + return op; + } while_for_each_ftrace_op(op); + + return NULL; +} + +static struct ftrace_ops * ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *op) { diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index dc83b3fa9fe7..a6268e09160a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3234,14 +3234,12 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* See if we shot pass the end of this buffer page */ if (unlikely(write > BUF_PAGE_SIZE)) { - if (tail != w) { - /* before and after may now different, fix it up*/ - b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); - a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); - if (a_ok && b_ok && info->before != info->after) - (void)rb_time_cmpxchg(&cpu_buffer->before_stamp, - info->before, info->after); - } + /* before and after may now different, fix it up*/ + b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); + a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); + if (a_ok && b_ok && info->before != info->after) + (void)rb_time_cmpxchg(&cpu_buffer->before_stamp, + info->before, info->after); return rb_move_tail(cpu_buffer, tail, info); } @@ -3287,11 +3285,11 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, ts = rb_time_stamp(cpu_buffer->buffer); barrier(); /*E*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && - info->after < ts) { + info->after < ts && + rb_time_cmpxchg(&cpu_buffer->write_stamp, + info->after, ts)) { /* Nothing came after this event between C and E */ info->delta = ts - info->after; - (void)rb_time_cmpxchg(&cpu_buffer->write_stamp, - info->after, info->ts); info->ts = ts; } else { /* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 410cfeb16db5..7d53c5bdea3e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3534,7 +3534,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, } #define STATIC_TEMP_BUF_SIZE 128 -static char static_temp_buf[STATIC_TEMP_BUF_SIZE]; +static char static_temp_buf[STATIC_TEMP_BUF_SIZE] __aligned(4); /* Find the next real entry, without updating the iterator itself */ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index c9ad5c6fbaad..d071fc271eef 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -368,7 +368,7 @@ static int start_kthread(struct trace_array *tr) struct task_struct *kthread; int next_cpu; - if (WARN_ON(hwlat_kthread)) + if (hwlat_kthread) return 0; /* Just pick the first CPU on first iteration */ diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5abb5b22ad13..71109065bd8e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -44,8 +44,6 @@ int __read_mostly soft_watchdog_user_enabled = 1; int __read_mostly watchdog_thresh = 10; static int __read_mostly nmi_watchdog_available; -static struct cpumask watchdog_allowed_mask __read_mostly; - struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); @@ -162,6 +160,8 @@ static void lockup_detector_update_enable(void) int __read_mostly sysctl_softlockup_all_cpu_backtrace; #endif +static struct cpumask watchdog_allowed_mask __read_mostly; + /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; |