author		Linus Torvalds <torvalds@linux-foundation.org>	2023-08-29 14:53:51 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2023-08-29 14:53:51 -0700
commit		d68b4b6f307d155475cce541f2aee938032ed22e (patch)
tree		c2a6487ac8b1bce963b5b352b42e461a6fa8da15 /kernel
parent		b96a3e9142fdf346b05b20e867b4f0dfca119e96 (diff)
parent		dce8f8ed1de1d9d6d27c5ccd202ce4ec163b100c (diff)
Merge tag 'mm-nonmm-stable-2023-08-28-22-48' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull non-MM updates from Andrew Morton:
- An extensive rework of kexec and crash Kconfig from Eric DeVolder
("refactor Kconfig to consolidate KEXEC and CRASH options")
- kernel.h slimming work from Andy Shevchenko ("kernel.h: Split out a
couple of macros to args.h")
- gdb feature work from Kuan-Ying Lee ("Add GDB memory helper
commands")
- vsprintf inclusion rationalization from Andy Shevchenko
("lib/vsprintf: Rework header inclusions")
- Switch kdump's handling of CPU and memory hot un/plug from a
  udev-based scheme to in-kernel handling, by Eric DeVolder ("crash:
  Kernel handling of CPU and memory hot un/plug"); see the sketch
  after this list
- Many singleton patches to various parts of the tree
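With the in-kernel handling, userspace no longer needs a udev rule that
reloads the crash kernel on every CPU or memory hotplug event; it can
instead ask the kernel whether it keeps the elfcorehdr current itself.
A minimal sketch of such a check follows (the attribute paths are taken
from this series' sysfs additions and should be treated as assumptions;
error handling is elided):

    /* Query the crash hotplug support added by this series.
     * Paths per the series' sysfs ABI additions (assumed here):
     *   /sys/devices/system/cpu/crash_hotplug
     *   /sys/kernel/crash_elfcorehdr_size
     */
    #include <stdio.h>

    static int read_attr(const char *path, char *buf, int len)
    {
        FILE *f = fopen(path, "r");

        if (!f)
            return -1;  /* attribute absent: kernel lacks CRASH_HOTPLUG */
        if (!fgets(buf, len, f)) {
            fclose(f);
            return -1;
        }
        fclose(f);
        return 0;
    }

    int main(void)
    {
        char buf[64];

        if (!read_attr("/sys/devices/system/cpu/crash_hotplug", buf, sizeof(buf)))
            printf("kernel updates elfcorehdr on CPU hotplug: %s", buf);
        if (!read_attr("/sys/kernel/crash_elfcorehdr_size", buf, sizeof(buf)))
            printf("elfcorehdr segment size: %s", buf);
        return 0;
    }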
* tag 'mm-nonmm-stable-2023-08-28-22-48' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (81 commits)
document while_each_thread(), change first_tid() to use for_each_thread()
drivers/char/mem.c: shrink character device's devlist[] array
x86/crash: optimize CPU changes
crash: change crash_prepare_elf64_headers() to for_each_possible_cpu()
crash: hotplug support for kexec_load()
x86/crash: add x86 crash hotplug support
crash: memory and CPU hotplug sysfs attributes
kexec: exclude elfcorehdr from the segment digest
crash: add generic infrastructure for crash hotplug support
crash: move a few code bits to setup support of crash hotplug
kstrtox: consistently use _tolower()
kill do_each_thread()
nilfs2: fix WARNING in mark_buffer_dirty due to discarded buffer reuse
scripts/bloat-o-meter: count weak symbol sizes
treewide: drop CONFIG_EMBEDDED
lockdep: fix static memory detection even more
lib/vsprintf: declare no_hash_pointers in sprintf.h
lib/vsprintf: split out sprintf() and friends
kernel/fork: stop playing lockless games for exe_file replacement
adfs: delete unused "union adfs_dirtail" definition
...
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/Kconfig.kexec		150
-rw-r--r--	kernel/acct.c			  2
-rw-r--r--	kernel/configs/tiny-base.config	  2
-rw-r--r--	kernel/crash_core.c		391
-rw-r--r--	kernel/cred.c			 27
-rw-r--r--	kernel/fork.c			 22
-rw-r--r--	kernel/gcov/Makefile		  2
-rw-r--r--	kernel/kexec.c			  5
-rw-r--r--	kernel/kexec_core.c		 43
-rw-r--r--	kernel/kexec_file.c		193
-rw-r--r--	kernel/ksysfs.c			 15
-rw-r--r--	kernel/kthread.c		  3
-rw-r--r--	kernel/locking/lockdep.c	 36
-rw-r--r--	kernel/relay.c			  2
-rw-r--r--	kernel/signal.c			 13
-rw-r--r--	kernel/watchdog.c		 11
16 files changed, 635 insertions(+), 282 deletions(-)
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
new file mode 100644
index 000000000000..9bfe68fe9676
--- /dev/null
+++ b/kernel/Kconfig.kexec
@@ -0,0 +1,150 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+menu "Kexec and crash features"
+
+config CRASH_CORE
+	bool
+
+config KEXEC_CORE
+	select CRASH_CORE
+	bool
+
+config KEXEC_ELF
+	bool
+
+config HAVE_IMA_KEXEC
+	bool
+
+config KEXEC
+	bool "Enable kexec system call"
+	depends on ARCH_SUPPORTS_KEXEC
+	select KEXEC_CORE
+	help
+	  kexec is a system call that implements the ability to shutdown your
+	  current kernel, and to start another kernel. It is like a reboot
+	  but it is independent of the system firmware. And like a reboot
+	  you can start any kernel with it, not just Linux.
+
+	  The name comes from the similarity to the exec system call.
+
+	  It is an ongoing process to be certain the hardware in a machine
+	  is properly shutdown, so do not be surprised if this code does not
+	  initially work for you. As of this writing the exact hardware
+	  interface is strongly in flux, so no good recommendation can be
+	  made.
+
+config KEXEC_FILE
+	bool "Enable kexec file based system call"
+	depends on ARCH_SUPPORTS_KEXEC_FILE
+	select KEXEC_CORE
+	help
+	  This is new version of kexec system call. This system call is
+	  file based and takes file descriptors as system call argument
+	  for kernel and initramfs as opposed to list of segments as
+	  accepted by kexec system call.
+
+config KEXEC_SIG
+	bool "Verify kernel signature during kexec_file_load() syscall"
+	depends on ARCH_SUPPORTS_KEXEC_SIG
+	depends on KEXEC_FILE
+	help
+	  This option makes the kexec_file_load() syscall check for a valid
+	  signature of the kernel image. The image can still be loaded without
+	  a valid signature unless you also enable KEXEC_SIG_FORCE, though if
+	  there's a signature that we can check, then it must be valid.
+
+	  In addition to this option, you need to enable signature
+	  verification for the corresponding kernel image type being
+	  loaded in order for this to work.
+
+config KEXEC_SIG_FORCE
+	bool "Require a valid signature in kexec_file_load() syscall"
+	depends on ARCH_SUPPORTS_KEXEC_SIG_FORCE
+	depends on KEXEC_SIG
+	help
+	  This option makes kernel signature verification mandatory for
+	  the kexec_file_load() syscall.
+
+config KEXEC_IMAGE_VERIFY_SIG
+	bool "Enable Image signature verification support (ARM)"
+	default ARCH_DEFAULT_KEXEC_IMAGE_VERIFY_SIG
+	depends on ARCH_SUPPORTS_KEXEC_IMAGE_VERIFY_SIG
+	depends on KEXEC_SIG
+	depends on EFI && SIGNED_PE_FILE_VERIFICATION
+	help
+	  Enable Image signature verification support.
+
+config KEXEC_BZIMAGE_VERIFY_SIG
+	bool "Enable bzImage signature verification support"
+	depends on ARCH_SUPPORTS_KEXEC_BZIMAGE_VERIFY_SIG
+	depends on KEXEC_SIG
+	depends on SIGNED_PE_FILE_VERIFICATION
+	select SYSTEM_TRUSTED_KEYRING
+	help
+	  Enable bzImage signature verification support.
+
+config KEXEC_JUMP
+	bool "kexec jump"
+	depends on ARCH_SUPPORTS_KEXEC_JUMP
+	depends on KEXEC && HIBERNATION
+	help
+	  Jump between original kernel and kexeced kernel and invoke
+	  code in physical address mode via KEXEC
+
+config CRASH_DUMP
+	bool "kernel crash dumps"
+	depends on ARCH_SUPPORTS_CRASH_DUMP
+	depends on ARCH_SUPPORTS_KEXEC
+	select CRASH_CORE
+	select KEXEC_CORE
+	select KEXEC
+	help
+	  Generate crash dump after being started by kexec.
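The CRASH_MAX_MEMORY_RANGES help text above describes how the
elfcorehdr buffer is sized. A quick back-of-the-envelope check of that
arithmetic (ordinary userspace C, not kernel code; the 512-CPU figure
stands in for NR_CPUS_DEFAULT and is an assumption for illustration):

    /* Sanity-check the sizing formula from the CRASH_MAX_MEMORY_RANGES
     * help text: one Elf64_Phdr per possible CPU plus one per memory
     * range. The CPU count is an assumed NR_CPUS_DEFAULT.
     */
    #include <elf.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned long nr_cpus = 512;    /* assumption */
        unsigned long nr_ranges = 8192; /* CRASH_MAX_MEMORY_RANGES default */
        unsigned long sz = sizeof(Elf64_Ehdr) +
                           (nr_cpus + nr_ranges) * sizeof(Elf64_Phdr);

        printf("elfcorehdr buffer: %lu bytes (~%lu KiB)\n", sz, sz >> 10);
        /* prints ~476 KiB: comfortably under the 1MiB cited above */
        return 0;
    }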
+	  This should be normally only set in special crash dump kernels
+	  which are loaded in the main kernel with kexec-tools into
+	  a specially reserved region and then later executed after
+	  a crash by kdump/kexec. The crash dump kernel must be compiled
+	  to a memory address not used by the main kernel or BIOS using
+	  PHYSICAL_START, or it must be built as a relocatable image
+	  (CONFIG_RELOCATABLE=y).
+	  For more details see Documentation/admin-guide/kdump/kdump.rst
+
+	  For s390, this option also enables zfcpdump.
+	  See also <file:Documentation/s390/zfcpdump.rst>
+
+config CRASH_HOTPLUG
+	bool "Update the crash elfcorehdr on system configuration changes"
+	default y
+	depends on CRASH_DUMP && (HOTPLUG_CPU || MEMORY_HOTPLUG)
+	depends on ARCH_SUPPORTS_CRASH_HOTPLUG
+	help
+	  Enable direct update to the crash elfcorehdr (which contains
+	  the list of CPUs and memory regions to be dumped upon a crash)
+	  in response to hot plug/unplug or online/offline of CPUs or
+	  memory. This is a much more advanced approach than userspace
+	  attempting that.
+
+	  If unsure, say Y.
+
+config CRASH_MAX_MEMORY_RANGES
+	int "Specify the maximum number of memory regions for the elfcorehdr"
+	default 8192
+	depends on CRASH_HOTPLUG
+	help
+	  For the kexec_file_load() syscall path, specify the maximum number of
+	  memory regions that the elfcorehdr buffer/segment can accommodate.
+	  These regions are obtained via walk_system_ram_res(); eg. the
+	  'System RAM' entries in /proc/iomem.
+	  This value is combined with NR_CPUS_DEFAULT and multiplied by
+	  sizeof(Elf64_Phdr) to determine the final elfcorehdr memory buffer/
+	  segment size.
+	  The value 8192, for example, covers a (sparsely populated) 1TiB system
+	  consisting of 128MiB memblocks, while resulting in an elfcorehdr
+	  memory buffer/segment size under 1MiB. This represents a sane choice
+	  to accommodate both baremetal and virtual machine configurations.
+
+	  For the kexec_load() syscall path, CRASH_MAX_MEMORY_RANGES is part of
+	  the computation behind the value provided through the
+	  /sys/kernel/crash_elfcorehdr_size attribute.
+
+endmenu
diff --git a/kernel/acct.c b/kernel/acct.c
index 010667ce6080..10f769e13f72 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -445,7 +445,7 @@ static void fill_ac(acct_t *ac)
 	memset(ac, 0, sizeof(acct_t));
 
 	ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER;
-	strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
+	strscpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
 
 	/* calculate run_time in nsec*/
 	run_time = ktime_get_ns();
diff --git a/kernel/configs/tiny-base.config b/kernel/configs/tiny-base.config
index 2f0e6bf6db2c..ffb9dcafca26 100644
--- a/kernel/configs/tiny-base.config
+++ b/kernel/configs/tiny-base.config
@@ -1 +1 @@
-CONFIG_EMBEDDED=y
+CONFIG_EXPERT=y
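The acct.c hunk is part of the treewide strlcpy() removal. The two
calls differ mainly in their return contract: strlcpy() returns
strlen(src) and so must always walk the whole source, while strscpy()
never reads past the buffer size and reports truncation. A userspace
sketch of the strscpy() contract (an illustration, not the kernel
implementation):

    /* Illustration of the strscpy() contract: copy at most size bytes,
     * always NUL-terminate, return the number of bytes copied
     * (excluding the NUL) or -E2BIG on truncation. Unlike strlcpy(),
     * the source is never read past 'size' bytes.
     */
    #include <errno.h>
    #include <stddef.h>
    #include <sys/types.h>

    ssize_t strscpy_like(char *dst, const char *src, size_t size)
    {
        size_t i;

        if (!size)
            return -E2BIG;          /* nothing fits */

        for (i = 0; i < size; i++) {
            dst[i] = src[i];
            if (!src[i])
                return i;           /* bytes copied, NUL excluded */
        }
        dst[size - 1] = '\0';       /* force termination */
        return -E2BIG;              /* source was truncated */
    }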
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 693445e1f7f6..03a7932cde0a 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -10,6 +10,9 @@
 #include <linux/utsname.h>
 #include <linux/vmalloc.h>
 #include <linux/sizes.h>
+#include <linux/kexec.h>
+#include <linux/memory.h>
+#include <linux/cpuhotplug.h>
 
 #include <asm/page.h>
 #include <asm/sections.h>
@@ -17,6 +20,10 @@
 #include <crypto/sha1.h>
 
 #include "kallsyms_internal.h"
+#include "kexec_internal.h"
+
+/* Per cpu memory for storing cpu states in case of system crash. */
+note_buf_t __percpu *crash_notes;
 
 /* vmcoreinfo stuff */
 unsigned char *vmcoreinfo_data;
@@ -314,6 +321,187 @@ static int __init parse_crashkernel_dummy(char *arg)
 }
 early_param("crashkernel", parse_crashkernel_dummy);
 
+int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
+			void **addr, unsigned long *sz)
+{
+	Elf64_Ehdr *ehdr;
+	Elf64_Phdr *phdr;
+	unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz;
+	unsigned char *buf;
+	unsigned int cpu, i;
+	unsigned long long notes_addr;
+	unsigned long mstart, mend;
+
+	/* extra phdr for vmcoreinfo ELF note */
+	nr_phdr = nr_cpus + 1;
+	nr_phdr += mem->nr_ranges;
+
+	/*
+	 * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
+	 * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64).
+	 * I think this is required by tools like gdb. So same physical
+	 * memory will be mapped in two ELF headers. One will contain kernel
+	 * text virtual addresses and other will have __va(physical) addresses.
+	 */
+
+	nr_phdr++;
+	elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr);
+	elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN);
+
+	buf = vzalloc(elf_sz);
+	if (!buf)
+		return -ENOMEM;
+
+	ehdr = (Elf64_Ehdr *)buf;
+	phdr = (Elf64_Phdr *)(ehdr + 1);
+	memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
+	ehdr->e_ident[EI_CLASS] = ELFCLASS64;
+	ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
+	ehdr->e_ident[EI_VERSION] = EV_CURRENT;
+	ehdr->e_ident[EI_OSABI] = ELF_OSABI;
+	memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD);
+	ehdr->e_type = ET_CORE;
+	ehdr->e_machine = ELF_ARCH;
+	ehdr->e_version = EV_CURRENT;
+	ehdr->e_phoff = sizeof(Elf64_Ehdr);
+	ehdr->e_ehsize = sizeof(Elf64_Ehdr);
+	ehdr->e_phentsize = sizeof(Elf64_Phdr);
+
+	/* Prepare one phdr of type PT_NOTE for each possible CPU */
+	for_each_possible_cpu(cpu) {
+		phdr->p_type = PT_NOTE;
+		notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
+		phdr->p_offset = phdr->p_paddr = notes_addr;
+		phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t);
+		(ehdr->e_phnum)++;
+		phdr++;
+	}
+
+	/* Prepare one PT_NOTE header for vmcoreinfo */
+	phdr->p_type = PT_NOTE;
+	phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
+	phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
+	(ehdr->e_phnum)++;
+	phdr++;
+
+	/* Prepare PT_LOAD type program header for kernel text region */
+	if (need_kernel_map) {
+		phdr->p_type = PT_LOAD;
+		phdr->p_flags = PF_R|PF_W|PF_X;
+		phdr->p_vaddr = (unsigned long) _text;
+		phdr->p_filesz = phdr->p_memsz = _end - _text;
+		phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
+		ehdr->e_phnum++;
+		phdr++;
+	}
+
+	/* Go through all the ranges in mem->ranges[] and prepare phdr */
+	for (i = 0; i < mem->nr_ranges; i++) {
+		mstart = mem->ranges[i].start;
+		mend = mem->ranges[i].end;
+
+		phdr->p_type = PT_LOAD;
+		phdr->p_flags = PF_R|PF_W|PF_X;
+		phdr->p_offset = mstart;
+
+		phdr->p_paddr = mstart;
+		phdr->p_vaddr = (unsigned long) __va(mstart);
+		phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
+		phdr->p_align = 0;
+		ehdr->e_phnum++;
+		pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
+			phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
+			ehdr->e_phnum, phdr->p_offset);
+		phdr++;
+	}
+
+	*addr = buf;
+	*sz = elf_sz;
+	return 0;
+}
+
+int crash_exclude_mem_range(struct crash_mem *mem,
+			    unsigned long long mstart, unsigned long long mend)
+{
+	int i, j;
+	unsigned long long start, end, p_start, p_end;
+	struct range temp_range = {0, 0};
+
+	for (i = 0; i < mem->nr_ranges; i++) {
+		start = mem->ranges[i].start;
+		end = mem->ranges[i].end;
+		p_start = mstart;
+		p_end = mend;
+
+		if (mstart > end || mend < start)
+			continue;
+
+		/* Truncate any area outside of range */
+		if (mstart < start)
+			p_start = start;
+		if (mend > end)
+			p_end = end;
+
+		/* Found completely overlapping range */
+		if (p_start == start && p_end == end) {
+			mem->ranges[i].start = 0;
+			mem->ranges[i].end = 0;
+			if (i < mem->nr_ranges - 1) {
+				/* Shift rest of the ranges to left */
+				for (j = i; j < mem->nr_ranges - 1; j++) {
+					mem->ranges[j].start =
+						mem->ranges[j+1].start;
+					mem->ranges[j].end =
+						mem->ranges[j+1].end;
+				}
+
+				/*
+				 * Continue to check if there are another overlapping ranges
+				 * from the current position because of shifting the above
+				 * mem ranges.
+				 */
+				i--;
+				mem->nr_ranges--;
+				continue;
+			}
+			mem->nr_ranges--;
+			return 0;
+		}
+
+		if (p_start > start && p_end < end) {
+			/* Split original range */
+			mem->ranges[i].end = p_start - 1;
+			temp_range.start = p_end + 1;
+			temp_range.end = end;
+		} else if (p_start != start)
+			mem->ranges[i].end = p_start - 1;
+		else
+			mem->ranges[i].start = p_end + 1;
+		break;
+	}
+
+	/* If a split happened, add the split to array */
+	if (!temp_range.end)
+		return 0;
+
+	/* Split happened */
+	if (i == mem->max_nr_ranges - 1)
+		return -ENOMEM;
+
+	/* Location where new range should go */
+	j = i + 1;
+	if (j < mem->nr_ranges) {
+		/* Move over all ranges one slot towards the end */
+		for (i = mem->nr_ranges - 1; i >= j; i--)
+			mem->ranges[i + 1] = mem->ranges[i];
+	}
+
+	mem->ranges[j].start = temp_range.start;
+	mem->ranges[j].end = temp_range.end;
+	mem->nr_ranges++;
+	return 0;
+}
+
 Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
 			  void *data, size_t data_len)
 {
@@ -513,3 +701,206 @@ static int __init crash_save_vmcoreinfo_init(void)
 }
 
 subsys_initcall(crash_save_vmcoreinfo_init);
+
+static int __init crash_notes_memory_init(void)
+{
+	/* Allocate memory for saving cpu registers. */
+	size_t size, align;
+
+	/*
+	 * crash_notes could be allocated across 2 vmalloc pages when percpu
+	 * is vmalloc based . vmalloc doesn't guarantee 2 continuous vmalloc
+	 * pages are also on 2 continuous physical pages. In this case the
+	 * 2nd part of crash_notes in 2nd page could be lost since only the
+	 * starting address and size of crash_notes are exported through sysfs.
+	 * Here round up the size of crash_notes to the nearest power of two
+	 * and pass it to __alloc_percpu as align value. This can make sure
+	 * crash_notes is allocated inside one physical page.
+	 */
+	size = sizeof(note_buf_t);
+	align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE);
+
+	/*
+	 * Break compile if size is bigger than PAGE_SIZE since crash_notes
+	 * definitely will be in 2 pages with that.
+	 */
+	BUILD_BUG_ON(size > PAGE_SIZE);
+
+	crash_notes = __alloc_percpu(size, align);
+	if (!crash_notes) {
+		pr_warn("Memory allocation for saving cpu register states failed\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+subsys_initcall(crash_notes_memory_init);
+
+#ifdef CONFIG_CRASH_HOTPLUG
+#undef pr_fmt
+#define pr_fmt(fmt) "crash hp: " fmt
+
+/*
+ * This routine utilized when the crash_hotplug sysfs node is read.
+ * It reflects the kernel's ability/permission to update the crash
+ * elfcorehdr directly.
+ */
+int crash_check_update_elfcorehdr(void)
+{
+	int rc = 0;
+
+	/* Obtain lock while reading crash information */
+	if (!kexec_trylock()) {
+		pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n");
+		return 0;
+	}
+	if (kexec_crash_image) {
+		if (kexec_crash_image->file_mode)
+			rc = 1;
+		else
+			rc = kexec_crash_image->update_elfcorehdr;
+	}
+	/* Release lock now that update complete */
+	kexec_unlock();
+
+	return rc;
+}
+
+/*
+ * To accurately reflect hot un/plug changes of cpu and memory resources
+ * (including onling and offlining of those resources), the elfcorehdr
+ * (which is passed to the crash kernel via the elfcorehdr= parameter)
+ * must be updated with the new list of CPUs and memories.
+ *
+ * In order to make changes to elfcorehdr, two conditions are needed:
+ * First, the segment containing the elfcorehdr must be large enough
+ * to permit a growing number of resources; the elfcorehdr memory size
+ * is based on NR_CPUS_DEFAULT and CRASH_MAX_MEMORY_RANGES.
+ * Second, purgatory must explicitly exclude the elfcorehdr from the
+ * list of segments it checks (since the elfcorehdr changes and thus
+ * would require an update to purgatory itself to update the digest).
+ */
+static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu)
+{
+	struct kimage *image;
+
+	/* Obtain lock while changing crash information */
+	if (!kexec_trylock()) {
+		pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n");
+		return;
+	}
+
+	/* Check kdump is not loaded */
+	if (!kexec_crash_image)
+		goto out;
+
+	image = kexec_crash_image;
+
+	/* Check that updating elfcorehdr is permitted */
+	if (!(image->file_mode || image->update_elfcorehdr))
+		goto out;
+
+	if (hp_action == KEXEC_CRASH_HP_ADD_CPU ||
+		hp_action == KEXEC_CRASH_HP_REMOVE_CPU)
+		pr_debug("hp_action %u, cpu %u\n", hp_action, cpu);
+	else
+		pr_debug("hp_action %u\n", hp_action);
+
+	/*
+	 * The elfcorehdr_index is set to -1 when the struct kimage
+	 * is allocated. Find the segment containing the elfcorehdr,
+	 * if not already found.
+	 */
+	if (image->elfcorehdr_index < 0) {
+		unsigned long mem;
+		unsigned char *ptr;
+		unsigned int n;
+
+		for (n = 0; n < image->nr_segments; n++) {
+			mem = image->segment[n].mem;
+			ptr = kmap_local_page(pfn_to_page(mem >> PAGE_SHIFT));
+			if (ptr) {
+				/* The segment containing elfcorehdr */
+				if (memcmp(ptr, ELFMAG, SELFMAG) == 0)
+					image->elfcorehdr_index = (int)n;
+				kunmap_local(ptr);
+			}
+		}
+	}
+
+	if (image->elfcorehdr_index < 0) {
+		pr_err("unable to locate elfcorehdr segment");
+		goto out;
+	}
+
+	/* Needed in order for the segments to be updated */
+	arch_kexec_unprotect_crashkres();
+
+	/* Differentiate between normal load and hotplug update */
+	image->hp_action = hp_action;
+
+	/* Now invoke arch-specific update handler */
+	arch_crash_handle_hotplug_event(image);
+
+	/* No longer handling a hotplug event */
+	image->hp_action = KEXEC_CRASH_HP_NONE;
+	image->elfcorehdr_updated = true;
+
+	/* Change back to read-only */
+	arch_kexec_protect_crashkres();
+
+	/* Errors in the callback is not a reason to rollback state */
+out:
+	/* Release lock now that update complete */
+	kexec_unlock();
+}
+
+static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v)
+{
+	switch (val) {
+	case MEM_ONLINE:
+		crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_MEMORY,
+			KEXEC_CRASH_HP_INVALID_CPU);
+		break;
+
+	case MEM_OFFLINE:
+		crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_MEMORY,
+			KEXEC_CRASH_HP_INVALID_CPU);
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block crash_memhp_nb = {
+	.notifier_call = crash_memhp_notifier,
+	.priority = 0
+};
+
+static int crash_cpuhp_online(unsigned int cpu)
+{
+	crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_CPU, cpu);
+	return 0;
+}
+
+static int crash_cpuhp_offline(unsigned int cpu)
+{
+	crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_CPU, cpu);
+	return 0;
+}
+
+static int __init crash_hotplug_init(void)
+{
+	int result = 0;
+
+	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG))
+		register_memory_notifier(&crash_memhp_nb);
+
+	if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) {
+		result = cpuhp_setup_state_nocalls(CPUHP_BP_PREPARE_DYN,
+			"crash/cpuhp", crash_cpuhp_online, crash_cpuhp_offline);
+	}
+
+	return result;
+}
+
+subsys_initcall(crash_hotplug_init);
+#endif
diff --git a/kernel/cred.c b/kernel/cred.c
index 811ad654abd1..98cb4eca23fb 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -4,6 +4,9 @@
  * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  */
+
+#define pr_fmt(fmt) "CRED: " fmt
+
 #include <linux/export.h>
 #include <linux/cred.h>
 #include <linux/slab.h>
@@ -835,32 +838,32 @@ EXPORT_SYMBOL(creds_are_invalid);
 static void dump_invalid_creds(const struct cred *cred, const char *label,
 			       const struct task_struct *tsk)
 {
-	printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n",
+	pr_err("%s credentials: %p %s%s%s\n",
 	       label, cred,
 	       cred == &init_cred ? "[init]" : "",
 	       cred == tsk->real_cred ? "[real]" : "",
 	       cred == tsk->cred ? "[eff]" : "");
-	printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n",
+	pr_err("->magic=%x, put_addr=%p\n",
 	       cred->magic, cred->put_addr);
-	printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n",
+	pr_err("->usage=%d, subscr=%d\n",
 	       atomic_read(&cred->usage),
 	       read_cred_subscribers(cred));
-	printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
+	pr_err("->*uid = { %d,%d,%d,%d }\n",
 	       from_kuid_munged(&init_user_ns, cred->uid),
 	       from_kuid_munged(&init_user_ns, cred->euid),
 	       from_kuid_munged(&init_user_ns, cred->suid),
 	       from_kuid_munged(&init_user_ns, cred->fsuid));
-	printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
+	pr_err("->*gid = { %d,%d,%d,%d }\n",
 	       from_kgid_munged(&init_user_ns, cred->gid),
 	       from_kgid_munged(&init_user_ns, cred->egid),
 	       from_kgid_munged(&init_user_ns, cred->sgid),
 	       from_kgid_munged(&init_user_ns, cred->fsgid));
 #ifdef CONFIG_SECURITY
-	printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
+	pr_err("->security is %p\n", cred->security);
 	if ((unsigned long) cred->security >= PAGE_SIZE &&
 	    (((unsigned long) cred->security & 0xffffff00) !=
 	     (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)))
-		printk(KERN_ERR "CRED: ->security {%x, %x}\n",
+		pr_err("->security {%x, %x}\n",
 		       ((u32*)cred->security)[0],
 		       ((u32*)cred->security)[1]);
 #endif
@@ -871,8 +874,8 @@ static void dump_invalid_creds(const struct cred *cred, const char *label,
  */
 void __noreturn __invalid_creds(const struct cred *cred, const char *file, unsigned line)
 {
-	printk(KERN_ERR "CRED: Invalid credentials\n");
-	printk(KERN_ERR "CRED: At %s:%u\n", file, line);
+	pr_err("Invalid credentials\n");
+	pr_err("At %s:%u\n", file, line);
 	dump_invalid_creds(cred, "Specified", current);
 	BUG();
 }
@@ -898,14 +901,14 @@ void __validate_process_creds(struct task_struct *tsk,
 	return;
 
 invalid_creds:
-	printk(KERN_ERR "CRED: Invalid process credentials\n");
-	printk(KERN_ERR "CRED: At %s:%u\n", file, line);
+	pr_err("Invalid process credentials\n");
+	pr_err("At %s:%u\n", file, line);
 	dump_invalid_creds(tsk->real_cred, "Real", tsk);
 	if (tsk->cred != tsk->real_cred)
 		dump_invalid_creds(tsk->cred, "Effective", tsk);
 	else
-		printk(KERN_ERR "CRED: Effective creds == Real creds\n");
+		pr_err("Effective creds == Real creds\n");
 	BUG();
 }
 EXPORT_SYMBOL(__validate_process_creds);
diff --git a/kernel/fork.c b/kernel/fork.c
index f81149739eb9..a9c18d480dc5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1404,8 +1404,8 @@ EXPORT_SYMBOL_GPL(mmput_async);
  * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
  *
  * Main users are mmput() and sys_execve(). Callers prevent concurrent
- * invocations: in mmput() nobody alive left, in execve task is single
- * threaded.
+ * invocations: in mmput() nobody alive left, in execve it happens before
+ * the new mm is made visible to anyone.
  *
  * Can only fail if new_exe_file != NULL.
  */
@@ -1440,9 +1440,7 @@ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
 /**
  * replace_mm_exe_file - replace a reference to the mm's executable file
  *
- * This changes mm's executable file (shown as symlink /proc/[pid]/exe),
- * dealing with concurrent invocation and without grabbing the mmap lock in
- * write mode.
+ * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
  *
  * Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE).
  */
@@ -1472,22 +1470,20 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
 			return ret;
 	}
 
-	/* set the new file, lockless */
 	ret = deny_write_access(new_exe_file);
 	if (ret)
 		return -EACCES;
 	get_file(new_exe_file);
 
-	old_exe_file = xchg(&mm->exe_file, new_exe_file);
+	/* set the new file */
+	mmap_write_lock(mm);
+	old_exe_file = rcu_dereference_raw(mm->exe_file);
+	rcu_assign_pointer(mm->exe_file, new_exe_file);
+	mmap_write_unlock(mm);
+
 	if (old_exe_file) {
-		/*
-		 * Don't race with dup_mmap() getting the file and disallowing
-		 * write access while someone might open the file writable.
-		 */
-		mmap_read_lock(mm);
 		allow_write_access(old_exe_file);
 		fput(old_exe_file);
-		mmap_read_unlock(mm);
 	}
 	return 0;
 }
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 16f8ecc7d882..ccd02afaeffb 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -3,4 +3,6 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
 
 obj-y := base.o fs.o
 obj-$(CONFIG_CC_IS_GCC) += gcc_base.o gcc_4_7.o
+CFLAGS_gcc_base.o += -Wno-missing-prototypes -Wno-missing-declarations
 obj-$(CONFIG_CC_IS_CLANG) += clang.o
+CFLAGS_clang.o += -Wno-missing-prototypes -Wno-missing-declarations
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 92d301f98776..107f355eac10 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -129,6 +129,11 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
 	if (flags & KEXEC_PRESERVE_CONTEXT)
 		image->preserve_context = 1;
 
+#ifdef CONFIG_CRASH_HOTPLUG
+	if (flags & KEXEC_UPDATE_ELFCOREHDR)
+		image->update_elfcorehdr = 1;
+#endif
+
 	ret = machine_kexec_prepare(image);
 	if (ret)
 		goto out;
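For the kexec_load() path above, userspace opts in to in-kernel
elfcorehdr updates with the new KEXEC_UPDATE_ELFCOREHDR flag. A sketch
of the call follows; segment preparation is elided, and the flag
requires uapi headers new enough to define it (an assumption about your
build environment):

    /* Load a crash kernel while asking the kernel to keep the
     * elfcorehdr current across CPU/memory hotplug. Segment setup is
     * elided; this only shows how the new flag combines with the
     * existing ones.
     */
    #include <linux/kexec.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static long load_crash_kernel(unsigned long entry,
                                  unsigned long nr_segments,
                                  struct kexec_segment *segments)
    {
        unsigned long flags = KEXEC_ON_CRASH | KEXEC_UPDATE_ELFCOREHDR |
                              KEXEC_ARCH_DEFAULT;

        return syscall(SYS_kexec_load, entry, nr_segments, segments, flags);
    }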
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index e2f2574d8b74..9dc728982d79 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -49,9 +49,6 @@
 
 atomic_t __kexec_lock = ATOMIC_INIT(0);
 
-/* Per cpu memory for storing cpu states in case of system crash. */
-note_buf_t __percpu *crash_notes;
-
 /* Flag to indicate we are going to kexec a new kernel */
 bool kexec_in_progress = false;
 
@@ -277,6 +274,12 @@ struct kimage *do_kimage_alloc_init(void)
 	/* Initialize the list of unusable pages */
 	INIT_LIST_HEAD(&image->unusable_pages);
 
+#ifdef CONFIG_CRASH_HOTPLUG
+	image->hp_action = KEXEC_CRASH_HP_NONE;
+	image->elfcorehdr_index = -1;
+	image->elfcorehdr_updated = false;
+#endif
+
 	return image;
 }
 
@@ -1218,40 +1221,6 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
 	final_note(buf);
 }
 
-static int __init crash_notes_memory_init(void)
-{
-	/* Allocate memory for saving cpu registers. */
-	size_t size, align;
-
-	/*
-	 * crash_notes could be allocated across 2 vmalloc pages when percpu
-	 * is vmalloc based . vmalloc doesn't guarantee 2 continuous vmalloc
-	 * pages are also on 2 continuous physical pages. In this case the
-	 * 2nd part of crash_notes in 2nd page could be lost since only the
-	 * starting address and size of crash_notes are exported through sysfs.
-	 * Here round up the size of crash_notes to the nearest power of two
-	 * and pass it to __alloc_percpu as align value. This can make sure
-	 * crash_notes is allocated inside one physical page.
-	 */
-	size = sizeof(note_buf_t);
-	align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE);
-
-	/*
-	 * Break compile if size is bigger than PAGE_SIZE since crash_notes
-	 * definitely will be in 2 pages with that.
-	 */
-	BUILD_BUG_ON(size > PAGE_SIZE);
-
-	crash_notes = __alloc_percpu(size, align);
-	if (!crash_notes) {
-		pr_warn("Memory allocation for saving cpu register states failed\n");
-		return -ENOMEM;
-	}
-	return 0;
-}
-subsys_initcall(crash_notes_memory_init);
-
-
 /*
  * Move into place and start executing a preloaded standalone
  * executable. If nothing was preloaded return an error.
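The crash_notes_memory_init() comment (now living in crash_core.c
above) relies on a simple invariant: an object of size at most align,
placed at a multiple of align, cannot straddle a page boundary as long
as align is a power of two no larger than PAGE_SIZE. A standalone check
of that invariant (the note size is an arbitrary example value, not the
real sizeof(note_buf_t)):

    /* Why align = roundup_pow_of_two(size) keeps each crash_notes
     * instance inside one physical page: power-of-two blocks that
     * divide PAGE_SIZE tile pages exactly. Standalone check, not
     * kernel code.
     */
    #include <assert.h>

    #define PAGE_SIZE 4096UL

    static unsigned long roundup_pow_of_two(unsigned long x)
    {
        unsigned long r = 1;

        while (r < x)
            r <<= 1;
        return r;
    }

    int main(void)
    {
        unsigned long size = 424;   /* example note_buf_t size */
        unsigned long align = roundup_pow_of_two(size); /* 512 */

        for (unsigned long addr = 0; addr < 64 * PAGE_SIZE; addr += align)
            assert(addr / PAGE_SIZE == (addr + size - 1) / PAGE_SIZE);
        return 0;
    }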
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 881ba0d1714c..e2ec9d7b9a1f 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -685,7 +685,7 @@ static int kexec_calculate_store_digests(struct kimage *image)
 	struct kexec_sha_region *sha_regions;
 	struct purgatory_info *pi = &image->purgatory_info;
 
-	if (!IS_ENABLED(CONFIG_ARCH_HAS_KEXEC_PURGATORY))
+	if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY))
 		return 0;
 
 	zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
@@ -726,6 +726,12 @@ static int kexec_calculate_store_digests(struct kimage *image)
 	for (j = i = 0; i < image->nr_segments; i++) {
 		struct kexec_segment *ksegment;
 
+#ifdef CONFIG_CRASH_HOTPLUG
+		/* Exclude elfcorehdr segment to allow future changes via hotplug */
+		if (j == image->elfcorehdr_index)
+			continue;
+#endif
+
 		ksegment = &image->segment[i];
 		/*
 		 * Skip purgatory as it will be modified once we put digest
@@ -790,7 +796,7 @@ out:
 	return ret;
 }
 
-#ifdef CONFIG_ARCH_HAS_KEXEC_PURGATORY
+#ifdef CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY
 /*
  * kexec_purgatory_setup_kbuf - prepare buffer to load purgatory.
  * @pi:		Purgatory to be loaded.
@@ -1150,185 +1156,4 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
 
 	return 0;
 }
-#endif /* CONFIG_ARCH_HAS_KEXEC_PURGATORY */
-
-int crash_exclude_mem_range(struct crash_mem *mem,
-			    unsigned long long mstart, unsigned long long mend)
-{
-	int i, j;
-	unsigned long long start, end, p_start, p_end;
-	struct range temp_range = {0, 0};
-
-	for (i = 0; i < mem->nr_ranges; i++) {
-		start = mem->ranges[i].start;
-		end = mem->ranges[i].end;
-		p_start = mstart;
-		p_end = mend;
-
-		if (mstart > end || mend < start)
-			continue;
-
-		/* Truncate any area outside of range */
-		if (mstart < start)
-			p_start = start;
-		if (mend > end)
-			p_end = end;
-
-		/* Found completely overlapping range */
-		if (p_start == start && p_end == end) {
-			mem->ranges[i].start = 0;
-			mem->ranges[i].end = 0;
-			if (i < mem->nr_ranges - 1) {
-				/* Shift rest of the ranges to left */
-				for (j = i; j < mem->nr_ranges - 1; j++) {
-					mem->ranges[j].start =
-						mem->ranges[j+1].start;
-					mem->ranges[j].end =
-						mem->ranges[j+1].end;
-				}
-
-				/*
-				 * Continue to check if there are another overlapping ranges
-				 * from the current position because of shifting the above
-				 * mem ranges.
-				 */
-				i--;
-				mem->nr_ranges--;
-				continue;
-			}
-			mem->nr_ranges--;
-			return 0;
-		}
-
-		if (p_start > start && p_end < end) {
-			/* Split original range */
-			mem->ranges[i].end = p_start - 1;
-			temp_range.start = p_end + 1;
-			temp_range.end = end;
-		} else if (p_start != start)
-			mem->ranges[i].end = p_start - 1;
-		else
-			mem->ranges[i].start = p_end + 1;
-		break;
-	}
-
-	/* If a split happened, add the split to array */
-	if (!temp_range.end)
-		return 0;
-
-	/* Split happened */
-	if (i == mem->max_nr_ranges - 1)
-		return -ENOMEM;
-
-	/* Location where new range should go */
-	j = i + 1;
-	if (j < mem->nr_ranges) {
-		/* Move over all ranges one slot towards the end */
-		for (i = mem->nr_ranges - 1; i >= j; i--)
-			mem->ranges[i + 1] = mem->ranges[i];
-	}
-
-	mem->ranges[j].start = temp_range.start;
-	mem->ranges[j].end = temp_range.end;
-	mem->nr_ranges++;
-	return 0;
-}
-
-int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
-			  void **addr, unsigned long *sz)
-{
-	Elf64_Ehdr *ehdr;
-	Elf64_Phdr *phdr;
-	unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz;
-	unsigned char *buf;
-	unsigned int cpu, i;
-	unsigned long long notes_addr;
-	unsigned long mstart, mend;
-
-	/* extra phdr for vmcoreinfo ELF note */
-	nr_phdr = nr_cpus + 1;
-	nr_phdr += mem->nr_ranges;
-
-	/*
-	 * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
-	 * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64).
-	 * I think this is required by tools like gdb. So same physical
-	 * memory will be mapped in two ELF headers. One will contain kernel
-	 * text virtual addresses and other will have __va(physical) addresses.
-	 */
-
-	nr_phdr++;
-	elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr);
-	elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN);
-
-	buf = vzalloc(elf_sz);
-	if (!buf)
-		return -ENOMEM;
-
-	ehdr = (Elf64_Ehdr *)buf;
-	phdr = (Elf64_Phdr *)(ehdr + 1);
-	memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
-	ehdr->e_ident[EI_CLASS] = ELFCLASS64;
-	ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
-	ehdr->e_ident[EI_VERSION] = EV_CURRENT;
-	ehdr->e_ident[EI_OSABI] = ELF_OSABI;
-	memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD);
-	ehdr->e_type = ET_CORE;
-	ehdr->e_machine = ELF_ARCH;
-	ehdr->e_version = EV_CURRENT;
-	ehdr->e_phoff = sizeof(Elf64_Ehdr);
-	ehdr->e_ehsize = sizeof(Elf64_Ehdr);
-	ehdr->e_phentsize = sizeof(Elf64_Phdr);
-
-	/* Prepare one phdr of type PT_NOTE for each present CPU */
-	for_each_present_cpu(cpu) {
-		phdr->p_type = PT_NOTE;
-		notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
-		phdr->p_offset = phdr->p_paddr = notes_addr;
-		phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t);
-		(ehdr->e_phnum)++;
-		phdr++;
-	}
-
-	/* Prepare one PT_NOTE header for vmcoreinfo */
-	phdr->p_type = PT_NOTE;
-	phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
-	phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
-	(ehdr->e_phnum)++;
-	phdr++;
-
-	/* Prepare PT_LOAD type program header for kernel text region */
-	if (need_kernel_map) {
-		phdr->p_type = PT_LOAD;
-		phdr->p_flags = PF_R|PF_W|PF_X;
-		phdr->p_vaddr = (unsigned long) _text;
-		phdr->p_filesz = phdr->p_memsz = _end - _text;
-		phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
-		ehdr->e_phnum++;
-		phdr++;
-	}
-
-	/* Go through all the ranges in mem->ranges[] and prepare phdr */
-	for (i = 0; i < mem->nr_ranges; i++) {
-		mstart = mem->ranges[i].start;
-		mend = mem->ranges[i].end;
-
-		phdr->p_type = PT_LOAD;
-		phdr->p_flags = PF_R|PF_W|PF_X;
-		phdr->p_offset = mstart;
-
-		phdr->p_paddr = mstart;
-		phdr->p_vaddr = (unsigned long) __va(mstart);
-		phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
-		phdr->p_align = 0;
-		ehdr->e_phnum++;
-		pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
-			phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
-			ehdr->e_phnum, phdr->p_offset);
-		phdr++;
-	}
-
-	*addr = buf;
-	*sz = elf_sz;
-	return 0;
-}
+#endif /* CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY */
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index aad7a3bfd846..1d4bc493b2f4 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -165,6 +165,18 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
 }
 KERNEL_ATTR_RO(vmcoreinfo);
 
+#ifdef CONFIG_CRASH_HOTPLUG
+static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj,
+			       struct kobj_attribute *attr, char *buf)
+{
+	unsigned int sz = crash_get_elfcorehdr_size();
+
+	return sysfs_emit(buf, "%u\n", sz);
+}
+KERNEL_ATTR_RO(crash_elfcorehdr_size);
+
+#endif
+
 #endif /* CONFIG_CRASH_CORE */
 
 /* whether file capabilities are enabled */
@@ -255,6 +267,9 @@ static struct attribute * kernel_attrs[] = {
 #endif
 #ifdef CONFIG_CRASH_CORE
 	&vmcoreinfo_attr.attr,
+#ifdef CONFIG_CRASH_HOTPLUG
+	&crash_elfcorehdr_size_attr.attr,
+#endif
 #endif
 #ifndef CONFIG_TINY_RCU
 	&rcu_expedited_attr.attr,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4fff7df17a68..1eea53050bab 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -159,11 +159,10 @@ bool kthread_should_stop(void)
 }
 EXPORT_SYMBOL(kthread_should_stop);
 
-bool __kthread_should_park(struct task_struct *k)
+static bool __kthread_should_park(struct task_struct *k)
 {
 	return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags);
 }
-EXPORT_SYMBOL_GPL(__kthread_should_park);
 
 /**
  * kthread_should_park - should this kthread park now?
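crash_exclude_mem_range(), moved from here into crash_core.c above,
handles three overlap cases: truncate one end of a range, drop a range
that is fully covered, or split a range that strictly contains the
excluded region. A self-contained re-implementation of those semantics
for experimentation (simplified userspace code, not the kernel
function):

    /* Standalone illustration of the interval-exclusion semantics of
     * crash_exclude_mem_range(): truncate, drop, or split. A
     * hypothetical re-implementation for clarity only.
     */
    #include <stdio.h>

    struct range { unsigned long long start, end; };

    static int exclude_range(struct range *r, int *nr, int max,
                             unsigned long long mstart, unsigned long long mend)
    {
        for (int i = 0; i < *nr; i++) {
            unsigned long long start = r[i].start, end = r[i].end;

            if (mstart > end || mend < start)
                continue;                   /* no overlap */

            if (mstart <= start && mend >= end) {
                /* Fully covered: drop the range, shift the rest left */
                for (int j = i; j < *nr - 1; j++)
                    r[j] = r[j + 1];
                (*nr)--;
                i--;                        /* re-check the shifted slot */
            } else if (mstart > start && mend < end) {
                /* Strictly inside: split into two ranges */
                if (*nr == max)
                    return -1;              /* no room for the split */
                for (int j = *nr - 1; j > i; j--)
                    r[j + 1] = r[j];
                r[i].end = mstart - 1;
                r[i + 1].start = mend + 1;
                r[i + 1].end = end;
                (*nr)++;
                return 0;
            } else if (mstart <= start) {
                r[i].start = mend + 1;      /* truncate the front */
            } else {
                r[i].end = mstart - 1;      /* truncate the tail */
            }
        }
        return 0;
    }

    int main(void)
    {
        struct range r[4] = { { 0x1000, 0x8fff } };
        int nr = 1;

        exclude_range(r, &nr, 4, 0x3000, 0x3fff);  /* split case */
        for (int i = 0; i < nr; i++)
            printf("range %d: %#llx-%#llx\n", i, r[i].start, r[i].end);
        /* prints 0x1000-0x2fff and 0x4000-0x8fff */
        return 0;
    }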
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 111607d91489..e85b5ad3e206 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -819,34 +819,26 @@ static int very_verbose(struct lock_class *class)
  * Is this the address of a static object:
  */
 #ifdef __KERNEL__
-/*
- * Check if an address is part of freed initmem. After initmem is freed,
- * memory can be allocated from it, and such allocations would then have
- * addresses within the range [_stext, _end].
- */
-#ifndef arch_is_kernel_initmem_freed
-static int arch_is_kernel_initmem_freed(unsigned long addr)
-{
-	if (system_state < SYSTEM_FREEING_INITMEM)
-		return 0;
-
-	return init_section_contains((void *)addr, 1);
-}
-#endif
-
 static int static_obj(const void *obj)
 {
-	unsigned long start = (unsigned long) &_stext,
-		      end   = (unsigned long) &_end,
-		      addr  = (unsigned long) obj;
+	unsigned long addr = (unsigned long) obj;
 
-	if (arch_is_kernel_initmem_freed(addr))
-		return 0;
+	if (is_kernel_core_data(addr))
+		return 1;
+
+	/*
+	 * keys are allowed in the __ro_after_init section.
+	 */
+	if (is_kernel_rodata(addr))
+		return 1;
 
 	/*
-	 * static variable?
+	 * in initdata section and used during bootup only?
+	 * NOTE: On some platforms the initdata section is
+	 * outside of the _stext ... _end range.
 	 */
-	if ((addr >= start) && (addr < end))
+	if (system_state < SYSTEM_FREEING_INITMEM &&
+			init_section_contains((void *)addr, 1))
 		return 1;
 
 	/*
diff --git a/kernel/relay.c b/kernel/relay.c
index a80fa01042e9..83fe0325cde1 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -375,7 +375,7 @@ static struct dentry *relay_create_buf_file(struct rchan *chan,
  */
 static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
 {
-	struct rchan_buf *buf = NULL;
+	struct rchan_buf *buf;
 	struct dentry *dentry;
 
 	if (chan->is_global)
diff --git a/kernel/signal.c b/kernel/signal.c
index 128e9bb3d1a2..09019017d669 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,6 +22,7 @@
 #include <linux/sched/cputime.h>
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/mm.h>
 #include <linux/proc_fs.h>
 #include <linux/tty.h>
 #include <linux/binfmts.h>
@@ -1260,7 +1261,17 @@ int send_signal_locked(int sig, struct kernel_siginfo *info,
 static void print_fatal_signal(int signr)
 {
 	struct pt_regs *regs = task_pt_regs(current);
-	pr_info("potentially unexpected fatal signal %d.\n", signr);
+	struct file *exe_file;
+
+	exe_file = get_task_exe_file(current);
+	if (exe_file) {
+		pr_info("%pD: %s: potentially unexpected fatal signal %d.\n",
+			exe_file, current->comm, signr);
+		fput(exe_file);
+	} else {
+		pr_info("%s: potentially unexpected fatal signal %d.\n",
+			current->comm, signr);
+	}
 
 #if defined(__i386__) && !defined(__arch_um__)
 	pr_info("code at %08lx: ", regs->ip);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index be38276a365f..d145305d95fe 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -151,9 +151,6 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
 	 */
 	if (is_hardlockup(cpu)) {
 		unsigned int this_cpu = smp_processor_id();
-		struct cpumask backtrace_mask;
-
-		cpumask_copy(&backtrace_mask, cpu_online_mask);
 
 		/* Only print hardlockups once. */
 		if (per_cpu(watchdog_hardlockup_warned, cpu))
@@ -167,10 +164,8 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
 				show_regs(regs);
 			else
 				dump_stack();
-			cpumask_clear_cpu(cpu, &backtrace_mask);
 		} else {
-			if (trigger_single_cpu_backtrace(cpu))
-				cpumask_clear_cpu(cpu, &backtrace_mask);
+			trigger_single_cpu_backtrace(cpu);
 		}
 
 		/*
@@ -179,7 +174,7 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
 		 */
 		if (sysctl_hardlockup_all_cpu_backtrace &&
 		    !test_and_set_bit(0, &watchdog_hardlockup_all_cpu_dumped))
-			trigger_cpumask_backtrace(&backtrace_mask);
+			trigger_allbutcpu_cpu_backtrace(cpu);
 
 		if (hardlockup_panic)
 			nmi_panic(regs, "Hard LOCKUP");
@@ -523,7 +518,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 			dump_stack();
 
 		if (softlockup_all_cpu_backtrace) {
-			trigger_allbutself_cpu_backtrace();
+			trigger_allbutcpu_cpu_backtrace(smp_processor_id());
 			clear_bit_unlock(0, &soft_lockup_nmi_warn);
 		}