diff options
Diffstat (limited to 'arch/x86')
80 files changed, 582 insertions, 369 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6a0cc524882d..883da0abf779 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -67,7 +67,7 @@ config X86 select ARCH_HAS_FILTER_PGPROT select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_KCOV if X86_64 + select ARCH_HAS_KCOV if X86_64 && STACK_VALIDATION select ARCH_HAS_MEM_ENCRYPT select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 7619742f91c9..5a828fde7a42 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -90,8 +90,8 @@ endif vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o -vmlinux-objs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o +efi-obj-$(CONFIG_EFI_STUB) = $(objtree)/drivers/firmware/efi/libstub/lib.a # The compressed kernel is built with -fPIC/-fPIE so that a boot loader # can place it anywhere in memory and it will still run. However, since @@ -115,7 +115,7 @@ endef quiet_cmd_check-and-link-vmlinux = LD $@ cmd_check-and-link-vmlinux = $(cmd_check_data_rel); $(cmd_ld) -$(obj)/vmlinux: $(vmlinux-objs-y) FORCE +$(obj)/vmlinux: $(vmlinux-objs-y) $(efi-obj-y) FORCE $(call if_changed,check-and-link-vmlinux) OBJCOPYFLAGS_vmlinux.bin := -R .comment -S diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index e821a7d7d5c4..97d37f0a34f5 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -213,7 +213,6 @@ SYM_FUNC_START(startup_32) * We place all of the values on our mini stack so lret can * used to perform that far jump. */ - pushl $__KERNEL_CS leal startup_64(%ebp), %eax #ifdef CONFIG_EFI_MIXED movl efi32_boot_args(%ebp), %edi @@ -224,11 +223,20 @@ SYM_FUNC_START(startup_32) movl efi32_boot_args+8(%ebp), %edx // saved bootparams pointer cmpl $0, %edx jnz 1f + /* + * efi_pe_entry uses MS calling convention, which requires 32 bytes of + * shadow space on the stack even if all arguments are passed in + * registers. We also need an additional 8 bytes for the space that + * would be occupied by the return address, and this also results in + * the correct stack alignment for entry. + */ + subl $40, %esp leal efi_pe_entry(%ebp), %eax movl %edi, %ecx // MS calling convention movl %esi, %edx 1: #endif + pushl $__KERNEL_CS pushl %eax /* Enter paged protected Mode, activating Long Mode */ @@ -784,6 +792,7 @@ SYM_DATA_LOCAL(boot_heap, .fill BOOT_HEAP_SIZE, 1, 0) SYM_DATA_START_LOCAL(boot_stack) .fill BOOT_STACK_SIZE, 1, 0 + .balign 16 SYM_DATA_END_LABEL(boot_stack, SYM_L_LOCAL, boot_stack_end) /* diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index b7a5790d8d63..08bf95dbc911 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -7,12 +7,20 @@ KASAN_SANITIZE := n UBSAN_SANITIZE := n KCOV_INSTRUMENT := n -CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE) -fstack-protector -fstack-protector-strong -CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE) -fstack-protector -fstack-protector-strong -CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE) -fstack-protector -fstack-protector-strong +CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_syscall_x32.o = $(CC_FLAGS_FTRACE) + +CFLAGS_common.o += -fno-stack-protector +CFLAGS_syscall_64.o += -fno-stack-protector +CFLAGS_syscall_32.o += -fno-stack-protector +CFLAGS_syscall_x32.o += -fno-stack-protector CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,) CFLAGS_syscall_32.o += $(call cc-option,-Wno-override-init,) +CFLAGS_syscall_x32.o += $(call cc-option,-Wno-override-init,) + obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o obj-y += common.o diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index bd3f14175193..f09288431f28 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -45,6 +45,32 @@ #define CREATE_TRACE_POINTS #include <trace/events/syscalls.h> +/* Check that the stack and regs on entry from user mode are sane. */ +static noinstr void check_user_regs(struct pt_regs *regs) +{ + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) { + /* + * Make sure that the entry code gave us a sensible EFLAGS + * register. Native because we want to check the actual CPU + * state, not the interrupt state as imagined by Xen. + */ + unsigned long flags = native_save_fl(); + WARN_ON_ONCE(flags & (X86_EFLAGS_AC | X86_EFLAGS_DF | + X86_EFLAGS_NT)); + + /* We think we came from user mode. Make sure pt_regs agrees. */ + WARN_ON_ONCE(!user_mode(regs)); + + /* + * All entries from user mode (except #DF) should be on the + * normal thread stack and should have user pt_regs in the + * correct location. + */ + WARN_ON_ONCE(!on_thread_stack()); + WARN_ON_ONCE(regs != task_pt_regs(current)); + } +} + #ifdef CONFIG_CONTEXT_TRACKING /** * enter_from_user_mode - Establish state when coming from user mode @@ -127,9 +153,6 @@ static long syscall_trace_enter(struct pt_regs *regs) unsigned long ret = 0; u32 work; - if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) - BUG_ON(regs != task_pt_regs(current)); - work = READ_ONCE(ti->flags); if (work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) { @@ -271,7 +294,7 @@ static void __prepare_exit_to_usermode(struct pt_regs *regs) #endif } -__visible noinstr void prepare_exit_to_usermode(struct pt_regs *regs) +static noinstr void prepare_exit_to_usermode(struct pt_regs *regs) { instrumentation_begin(); __prepare_exit_to_usermode(regs); @@ -346,6 +369,8 @@ __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) { struct thread_info *ti; + check_user_regs(regs); + enter_from_user_mode(); instrumentation_begin(); @@ -409,6 +434,8 @@ static void do_syscall_32_irqs_on(struct pt_regs *regs) /* Handles int $0x80 */ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) { + check_user_regs(regs); + enter_from_user_mode(); instrumentation_begin(); @@ -460,6 +487,8 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) vdso_image_32.sym_int80_landing_pad; bool success; + check_user_regs(regs); + /* * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward * so that 'regs->ip -= 2' lands back on an int $0x80 instruction. @@ -510,6 +539,18 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0; #endif } + +/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ +__visible noinstr long do_SYSENTER_32(struct pt_regs *regs) +{ + /* SYSENTER loses RSP, but the vDSO saved it in RBP. */ + regs->sp = regs->bp; + + /* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */ + regs->flags |= X86_EFLAGS_IF; + + return do_fast_syscall_32(regs); +} #endif SYSCALL_DEFINE0(ni_syscall) @@ -553,6 +594,7 @@ SYSCALL_DEFINE0(ni_syscall) bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs) { if (user_mode(regs)) { + check_user_regs(regs); enter_from_user_mode(); return false; } @@ -686,6 +728,7 @@ void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit) */ void noinstr idtentry_enter_user(struct pt_regs *regs) { + check_user_regs(regs); enter_from_user_mode(); } diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 024d7d276cd4..2d0bd5d5f032 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -933,9 +933,8 @@ SYM_FUNC_START(entry_SYSENTER_32) .Lsysenter_past_esp: pushl $__USER_DS /* pt_regs->ss */ - pushl %ebp /* pt_regs->sp (stashed in bp) */ + pushl $0 /* pt_regs->sp (placeholder) */ pushfl /* pt_regs->flags (except IF = 0) */ - orl $X86_EFLAGS_IF, (%esp) /* Fix IF */ pushl $__USER_CS /* pt_regs->cs */ pushl $0 /* pt_regs->ip = 0 (placeholder) */ pushl %eax /* pt_regs->orig_ax */ @@ -965,7 +964,7 @@ SYM_FUNC_START(entry_SYSENTER_32) .Lsysenter_flags_fixed: movl %esp, %eax - call do_fast_syscall_32 + call do_SYSENTER_32 /* XEN PV guests always use IRET path */ ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ "jmp .Lsyscall_32_done", X86_FEATURE_XENPV diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 0f974ae01e62..541fdaf64045 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -57,29 +57,30 @@ SYM_CODE_START(entry_SYSENTER_compat) movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - /* - * User tracing code (ptrace or signal handlers) might assume that - * the saved RAX contains a 32-bit number when we're invoking a 32-bit - * syscall. Just in case the high bits are nonzero, zero-extend - * the syscall number. (This could almost certainly be deleted - * with no ill effects.) - */ - movl %eax, %eax - /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ - pushq %rbp /* pt_regs->sp (stashed in bp) */ + pushq $0 /* pt_regs->sp = 0 (placeholder) */ /* * Push flags. This is nasty. First, interrupts are currently - * off, but we need pt_regs->flags to have IF set. Second, even - * if TF was set when SYSENTER started, it's clear by now. We fix - * that later using TIF_SINGLESTEP. + * off, but we need pt_regs->flags to have IF set. Second, if TS + * was set in usermode, it's still set, and we're singlestepping + * through this code. do_SYSENTER_32() will fix up IF. */ pushfq /* pt_regs->flags (except IF = 0) */ - orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */ pushq $__USER32_CS /* pt_regs->cs */ pushq $0 /* pt_regs->ip = 0 (placeholder) */ +SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) + + /* + * User tracing code (ptrace or signal handlers) might assume that + * the saved RAX contains a 32-bit number when we're invoking a 32-bit + * syscall. Just in case the high bits are nonzero, zero-extend + * the syscall number. (This could almost certainly be deleted + * with no ill effects.) + */ + movl %eax, %eax + pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ @@ -135,7 +136,7 @@ SYM_CODE_START(entry_SYSENTER_compat) .Lsysenter_flags_fixed: movq %rsp, %rdi - call do_fast_syscall_32 + call do_SYSENTER_32 /* XEN PV guests always use IRET path */ ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \ "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 12c42eba77ec..9933c0e8e97a 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y += core.o probe.o -obj-$(PERF_EVENTS_INTEL_RAPL) += rapl.o +obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += rapl.o obj-y += amd/ obj-$(CONFIG_X86_LOCAL_APIC) += msr.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/ diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index a54c6a401581..6035df1b49e1 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -375,7 +375,10 @@ void __init hyperv_init(void) guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0); wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); - hv_hypercall_pg = vmalloc_exec(PAGE_SIZE); + hv_hypercall_pg = __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, + VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_ROX, + VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, + __builtin_return_address(0)); if (hv_hypercall_pg == NULL) { wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); goto remove_cpuhp_state; diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 35460fef39b8..0367efdc5b7a 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -201,12 +201,8 @@ arch_test_and_change_bit(long nr, volatile unsigned long *addr) return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc), *addr, c, "Ir", nr); } -static __no_kcsan_or_inline bool constant_test_bit(long nr, const volatile unsigned long *addr) +static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr) { - /* - * Because this is a plain access, we need to disable KCSAN here to - * avoid double instrumentation via instrumented bitops. - */ return ((1UL << (nr & (BITS_PER_LONG-1))) & (addr[nr >> _BITOPS_LONG_SHIFT])) != 0; } diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index fb34ff641e0a..028189575560 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -75,6 +75,12 @@ do { \ unreachable(); \ } while (0) +/* + * This instrumentation_begin() is strictly speaking incorrect; but it + * suppresses the complaints from WARN()s in noinstr code. If such a WARN() + * were to trigger, we'd rather wreck the machine in an attempt to get the + * message out than not know about it. + */ #define __WARN_FLAGS(flags) \ do { \ instrumentation_begin(); \ diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index dd17c2da1af5..da78ccbd493b 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -58,4 +58,9 @@ static inline bool handle_guest_split_lock(unsigned long ip) return false; } #endif +#ifdef CONFIG_IA32_FEAT_CTL +void init_ia32_feat_ctl(struct cpuinfo_x86 *c); +#else +static inline void init_ia32_feat_ctl(struct cpuinfo_x86 *c) {} +#endif #endif /* _ASM_X86_CPU_H */ diff --git a/arch/x86/include/asm/cpumask.h b/arch/x86/include/asm/cpumask.h index 6722ffcef2e6..3afa990d756b 100644 --- a/arch/x86/include/asm/cpumask.h +++ b/arch/x86/include/asm/cpumask.h @@ -11,5 +11,23 @@ extern cpumask_var_t cpu_sibling_setup_mask; extern void setup_cpu_local_masks(void); +/* + * NMI and MCE exceptions need cpu_is_offline() _really_ early, + * provide an arch_ special for them to avoid instrumentation. + */ +#if NR_CPUS > 1 +static __always_inline bool arch_cpu_online(int cpu) +{ + return arch_test_bit(cpu, cpumask_bits(cpu_online_mask)); +} +#else +static __always_inline bool arch_cpu_online(int cpu) +{ + return cpu == 0; +} +#endif + +#define arch_cpu_is_offline(cpu) unlikely(!arch_cpu_online(cpu)) + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_CPUMASK_H */ diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 42159f45bf9c..845e7481ab77 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -623,6 +623,11 @@ static inline void switch_fpu_finish(struct fpu *new_fpu) * MXCSR and XCR definitions: */ +static inline void ldmxcsr(u32 mxcsr) +{ + asm volatile("ldmxcsr %0" :: "m" (mxcsr)); +} + extern unsigned int mxcsr_feature_mask; #define XCR_XFEATURE_ENABLED_MASK 0x00000000 diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index cf51c50eb356..80d3b30d3ee3 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -353,10 +353,6 @@ static __always_inline void __##func(struct pt_regs *regs) #else /* CONFIG_X86_64 */ -/* Maps to a regular IDTENTRY on 32bit for now */ -# define DECLARE_IDTENTRY_IST DECLARE_IDTENTRY -# define DEFINE_IDTENTRY_IST DEFINE_IDTENTRY - /** * DECLARE_IDTENTRY_DF - Declare functions for double fault 32bit variant * @vector: Vector number (ignored for C) @@ -387,28 +383,18 @@ __visible noinstr void func(struct pt_regs *regs, \ #endif /* !CONFIG_X86_64 */ /* C-Code mapping */ +#define DECLARE_IDTENTRY_NMI DECLARE_IDTENTRY_RAW +#define DEFINE_IDTENTRY_NMI DEFINE_IDTENTRY_RAW + +#ifdef CONFIG_X86_64 #define DECLARE_IDTENTRY_MCE DECLARE_IDTENTRY_IST #define DEFINE_IDTENTRY_MCE DEFINE_IDTENTRY_IST #define DEFINE_IDTENTRY_MCE_USER DEFINE_IDTENTRY_NOIST -#define DECLARE_IDTENTRY_NMI DECLARE_IDTENTRY_RAW -#define DEFINE_IDTENTRY_NMI DEFINE_IDTENTRY_RAW - #define DECLARE_IDTENTRY_DEBUG DECLARE_IDTENTRY_IST #define DEFINE_IDTENTRY_DEBUG DEFINE_IDTENTRY_IST #define DEFINE_IDTENTRY_DEBUG_USER DEFINE_IDTENTRY_NOIST - -/** - * DECLARE_IDTENTRY_XEN - Declare functions for XEN redirect IDT entry points - * @vector: Vector number (ignored for C) - * @func: Function name of the entry point - * - * Used for xennmi and xendebug redirections. No DEFINE as this is all ASM - * indirection magic. - */ -#define DECLARE_IDTENTRY_XEN(vector, func) \ - asmlinkage void xen_asm_exc_xen##func(void); \ - asmlinkage void asm_exc_xen##func(void) +#endif #else /* !__ASSEMBLY__ */ @@ -455,9 +441,6 @@ __visible noinstr void func(struct pt_regs *regs, \ # define DECLARE_IDTENTRY_MCE(vector, func) \ DECLARE_IDTENTRY(vector, func) -# define DECLARE_IDTENTRY_DEBUG(vector, func) \ - DECLARE_IDTENTRY(vector, func) - /* No ASM emitted for DF as this goes through a C shim */ # define DECLARE_IDTENTRY_DF(vector, func) @@ -469,10 +452,6 @@ __visible noinstr void func(struct pt_regs *regs, \ /* No ASM code emitted for NMI */ #define DECLARE_IDTENTRY_NMI(vector, func) -/* XEN NMI and DB wrapper */ -#define DECLARE_IDTENTRY_XEN(vector, func) \ - idtentry vector asm_exc_xen##func exc_##func has_error_code=0 - /* * ASM code to emit the common vector entry stubs where each stub is * packed into 8 bytes. @@ -490,16 +469,15 @@ __visible noinstr void func(struct pt_regs *regs, \ .align 8 SYM_CODE_START(irq_entries_start) vector=FIRST_EXTERNAL_VECTOR - pos = . .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) UNWIND_HINT_IRET_REGS +0 : .byte 0x6a, vector jmp asm_common_interrupt nop /* Ensure that the above is 8 bytes max */ - . = pos + 8 - pos=pos+8 - vector=vector+1 + . = 0b + 8 + vector = vector+1 .endr SYM_CODE_END(irq_entries_start) @@ -507,16 +485,15 @@ SYM_CODE_END(irq_entries_start) .align 8 SYM_CODE_START(spurious_entries_start) vector=FIRST_SYSTEM_VECTOR - pos = . .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR) UNWIND_HINT_IRET_REGS +0 : .byte 0x6a, vector jmp asm_spurious_interrupt nop /* Ensure that the above is 8 bytes max */ - . = pos + 8 - pos=pos+8 - vector=vector+1 + . = 0b + 8 + vector = vector+1 .endr SYM_CODE_END(spurious_entries_start) #endif @@ -565,16 +542,28 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_BP, exc_int3); DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_PF, exc_page_fault); #ifdef CONFIG_X86_MCE +#ifdef CONFIG_X86_64 DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check); +#else +DECLARE_IDTENTRY_RAW(X86_TRAP_MC, exc_machine_check); +#endif #endif /* NMI */ DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi); -DECLARE_IDTENTRY_XEN(X86_TRAP_NMI, nmi); +#if defined(CONFIG_XEN_PV) && defined(CONFIG_X86_64) +DECLARE_IDTENTRY_RAW(X86_TRAP_NMI, xenpv_exc_nmi); +#endif /* #DB */ +#ifdef CONFIG_X86_64 DECLARE_IDTENTRY_DEBUG(X86_TRAP_DB, exc_debug); -DECLARE_IDTENTRY_XEN(X86_TRAP_DB, debug); +#else +DECLARE_IDTENTRY_RAW(X86_TRAP_DB, exc_debug); +#endif +#if defined(CONFIG_XEN_PV) && defined(CONFIG_X86_64) +DECLARE_IDTENTRY_RAW(X86_TRAP_DB, xenpv_exc_debug); +#endif /* #DF */ DECLARE_IDTENTRY_DF(X86_TRAP_DF, exc_double_fault); @@ -635,8 +624,8 @@ DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested #if IS_ENABLED(CONFIG_HYPERV) DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback); -DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment); -DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_STIMER0_VECTOR, sysvec_hyperv_stimer0); +DECLARE_IDTENTRY_SYSVEC(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment); +DECLARE_IDTENTRY_SYSVEC(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0); #endif #if IS_ENABLED(CONFIG_ACRN_GUEST) diff --git a/arch/x86/include/asm/io_bitmap.h b/arch/x86/include/asm/io_bitmap.h index ac1a99ffbd8d..7f080f5c7def 100644 --- a/arch/x86/include/asm/io_bitmap.h +++ b/arch/x86/include/asm/io_bitmap.h @@ -19,12 +19,28 @@ struct task_struct; void io_bitmap_share(struct task_struct *tsk); void io_bitmap_exit(struct task_struct *tsk); +static inline void native_tss_invalidate_io_bitmap(void) +{ + /* + * Invalidate the I/O bitmap by moving io_bitmap_base outside the + * TSS limit so any subsequent I/O access from user space will + * trigger a #GP. + * + * This is correct even when VMEXIT rewrites the TSS limit + * to 0x67 as the only requirement is that the base points + * outside the limit. + */ + this_cpu_write(cpu_tss_rw.x86_tss.io_bitmap_base, + IO_BITMAP_OFFSET_INVALID); +} + void native_tss_update_io_bitmap(void); #ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else #define tss_update_io_bitmap native_tss_update_io_bitmap +#define tss_invalidate_io_bitmap native_tss_invalidate_io_bitmap #endif #else diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h index 5270ff39b9af..a1911fea8739 100644 --- a/arch/x86/include/asm/iosf_mbi.h +++ b/arch/x86/include/asm/iosf_mbi.h @@ -39,6 +39,7 @@ #define BT_MBI_UNIT_PMC 0x04 #define BT_MBI_UNIT_GFX 0x06 #define BT_MBI_UNIT_SMI 0x0C +#define BT_MBI_UNIT_CCK 0x14 #define BT_MBI_UNIT_USB 0x43 #define BT_MBI_UNIT_SATA 0xA3 #define BT_MBI_UNIT_PCIE 0xA6 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f8998e97457f..be5363b21540 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -943,7 +943,7 @@ struct kvm_arch { atomic_t vapics_in_nmi_mode; struct mutex apic_map_lock; struct kvm_apic_map *apic_map; - bool apic_map_dirty; + atomic_t apic_map_dirty; bool apic_access_page_done; unsigned long apicv_inhibit_reasons; @@ -1220,7 +1220,7 @@ struct kvm_x86_ops { void (*enable_log_dirty_pt_masked)(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t offset, unsigned long mask); - int (*write_log_dirty)(struct kvm_vcpu *vcpu); + int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa); /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 73d997aa2966..e039a933aca3 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -25,8 +25,6 @@ #define TPAUSE_C01_STATE 1 #define TPAUSE_C02_STATE 0 -u32 get_umwait_control_msr(void); - static inline void __monitor(const void *eax, unsigned long ecx, unsigned long edx) { diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 5ca5d297df75..3d2afecde50c 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -302,6 +302,11 @@ static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g) } #ifdef CONFIG_X86_IOPL_IOPERM +static inline void tss_invalidate_io_bitmap(void) +{ + PVOP_VCALL0(cpu.invalidate_io_bitmap); +} + static inline void tss_update_io_bitmap(void) { PVOP_VCALL0(cpu.update_io_bitmap); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 732f62e04ddb..8dfcb2508e6d 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -141,6 +141,7 @@ struct pv_cpu_ops { void (*load_sp0)(unsigned long sp0); #ifdef CONFIG_X86_IOPL_IOPERM + void (*invalidate_io_bitmap)(void); void (*update_io_bitmap)(void); #endif diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 2da1f95b88d7..816b31c68550 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -194,6 +194,7 @@ enum page_cache_mode { #define _PAGE_TABLE_NOENC (__PP|__RW|_USR|___A| 0|___D| 0| 0) #define _PAGE_TABLE (__PP|__RW|_USR|___A| 0|___D| 0| 0| _ENC) #define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX|___D| 0|___G) +#define __PAGE_KERNEL_ROX (__PP| 0| 0|___A| 0|___D| 0|___G) #define __PAGE_KERNEL_NOCACHE (__PP|__RW| 0|___A|__NX|___D| 0|___G| __NC) #define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX|___D| 0|___G) #define __PAGE_KERNEL_LARGE (__PP|__RW| 0|___A|__NX|___D|_PSE|___G) @@ -219,6 +220,7 @@ enum page_cache_mode { #define PAGE_KERNEL_RO __pgprot_mask(__PAGE_KERNEL_RO | _ENC) #define PAGE_KERNEL_EXEC __pgprot_mask(__PAGE_KERNEL_EXEC | _ENC) #define PAGE_KERNEL_EXEC_NOENC __pgprot_mask(__PAGE_KERNEL_EXEC | 0) +#define PAGE_KERNEL_ROX __pgprot_mask(__PAGE_KERNEL_ROX | _ENC) #define PAGE_KERNEL_NOCACHE __pgprot_mask(__PAGE_KERNEL_NOCACHE | _ENC) #define PAGE_KERNEL_LARGE __pgprot_mask(__PAGE_KERNEL_LARGE | _ENC) #define PAGE_KERNEL_LARGE_EXEC __pgprot_mask(__PAGE_KERNEL_LARGE_EXEC | _ENC) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 42cd333616c4..03b7c4ca425a 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -370,7 +370,7 @@ struct x86_hw_tss { #define IO_BITMAP_OFFSET_INVALID (__KERNEL_TSS_LIMIT + 1) struct entry_stack { - unsigned long words[64]; + char stack[PAGE_SIZE]; }; struct entry_stack_page { diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index ebedeab48704..255b2dde2c1b 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -278,7 +278,7 @@ static inline unsigned long *regs_get_kernel_stack_nth_addr(struct pt_regs *regs } /* To avoid include hell, we can't include uaccess.h */ -extern long probe_kernel_read(void *dst, const void *src, size_t size); +extern long copy_from_kernel_nofault(void *dst, const void *src, size_t size); /** * regs_get_kernel_stack_nth() - get Nth entry of the stack @@ -298,7 +298,7 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, addr = regs_get_kernel_stack_nth_addr(regs, n); if (addr) { - ret = probe_kernel_read(&val, addr, sizeof(val)); + ret = copy_from_kernel_nofault(&val, addr, sizeof(val)); if (!ret) return val; } diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 17c5a038f42d..0780f97c1850 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -408,14 +408,15 @@ struct kvm_vmx_nested_state_data { }; struct kvm_vmx_nested_state_hdr { - __u32 flags; __u64 vmxon_pa; __u64 vmcs12_pa; - __u64 preemption_timer_deadline; struct { __u16 flags; } smm; + + __u32 flags; + __u64 preemption_timer_deadline; }; struct kvm_svm_nested_state_data { diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ce61e3e7d399..81ffcfbfaef2 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2316,12 +2316,12 @@ static int mp_irqdomain_create(int ioapic) ip->irqdomain = irq_domain_create_linear(fn, hwirqs, cfg->ops, (void *)(long)ioapic); - /* Release fw handle if it was allocated above */ - if (!cfg->dev) - irq_domain_free_fwnode(fn); - - if (!ip->irqdomain) + if (!ip->irqdomain) { + /* Release fw handle if it was allocated above */ + if (!cfg->dev) + irq_domain_free_fwnode(fn); return -ENOMEM; + } ip->irqdomain->parent = parent; diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 5cbaca58af95..c2b2911feeef 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -263,12 +263,13 @@ void __init arch_init_msi_domain(struct irq_domain *parent) msi_default_domain = pci_msi_create_irq_domain(fn, &pci_msi_domain_info, parent); - irq_domain_free_fwnode(fn); } - if (!msi_default_domain) + if (!msi_default_domain) { + irq_domain_free_fwnode(fn); pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n"); - else + } else { msi_default_domain->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK; + } } #ifdef CONFIG_IRQ_REMAP @@ -301,7 +302,8 @@ struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent, if (!fn) return NULL; d = pci_msi_create_irq_domain(fn, &pci_msi_ir_domain_info, parent); - irq_domain_free_fwnode(fn); + if (!d) + irq_domain_free_fwnode(fn); return d; } #endif @@ -364,7 +366,8 @@ static struct irq_domain *dmar_get_irq_domain(void) if (fn) { dmar_domain = msi_create_irq_domain(fn, &dmar_msi_domain_info, x86_vector_domain); - irq_domain_free_fwnode(fn); + if (!dmar_domain) + irq_domain_free_fwnode(fn); } out: mutex_unlock(&dmar_lock); @@ -489,7 +492,10 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id) } d = msi_create_irq_domain(fn, domain_info, parent); - irq_domain_free_fwnode(fn); + if (!d) { + irq_domain_free_fwnode(fn); + kfree(domain_info); + } return d; } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index c48be6e1f676..7649da2478d8 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -446,12 +446,10 @@ static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd, trace_vector_activate(irqd->irq, apicd->is_managed, apicd->can_reserve, reserve); - /* Nothing to do for fixed assigned vectors */ - if (!apicd->can_reserve && !apicd->is_managed) - return 0; - raw_spin_lock_irqsave(&vector_lock, flags); - if (reserve || irqd_is_managed_and_shutdown(irqd)) + if (!apicd->can_reserve && !apicd->is_managed) + assign_irq_vector_any_locked(irqd); + else if (reserve || irqd_is_managed_and_shutdown(irqd)) vector_assign_managed_shutdown(irqd); else if (apicd->is_managed) ret = activate_managed(irqd); @@ -709,7 +707,6 @@ int __init arch_early_irq_init(void) x86_vector_domain = irq_domain_create_tree(fn, &x86_vector_domain_ops, NULL); BUG_ON(x86_vector_domain == NULL); - irq_domain_free_fwnode(fn); irq_set_default_host(x86_vector_domain); arch_init_msi_domain(x86_vector_domain); @@ -775,20 +772,10 @@ void lapic_offline(void) static int apic_set_affinity(struct irq_data *irqd, const struct cpumask *dest, bool force) { - struct apic_chip_data *apicd = apic_chip_data(irqd); int err; - /* - * Core code can call here for inactive interrupts. For inactive - * interrupts which use managed or reservation mode there is no - * point in going through the vector assignment right now as the - * activation will assign a vector which fits the destination - * cpumask. Let the core code store the destination mask and be - * done with it. - */ - if (!irqd_is_activated(irqd) && - (apicd->is_managed || apicd->can_reserve)) - return IRQ_SET_MASK_OK; + if (WARN_ON_ONCE(!irqd_is_activated(irqd))) + return -EIO; raw_spin_lock(&vector_lock); cpumask_and(vector_searchmask, dest, cpu_online_mask); diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 426792565d86..c5cf336e5077 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -3,6 +3,7 @@ #include <linux/sched.h> #include <linux/sched/clock.h> +#include <asm/cpu.h> #include <asm/cpufeature.h> #include <asm/e820/api.h> #include <asm/mtrr.h> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 043d93cdcaad..95c090a45b4b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -347,6 +347,9 @@ out: cr4_clear_bits(X86_CR4_UMIP); } +/* These bits should not change their value after CPU init is finished. */ +static const unsigned long cr4_pinned_mask = + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | X86_CR4_FSGSBASE; static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning); static unsigned long cr4_pinned_bits __ro_after_init; @@ -371,20 +374,20 @@ EXPORT_SYMBOL(native_write_cr0); void native_write_cr4(unsigned long val) { - unsigned long bits_missing = 0; + unsigned long bits_changed = 0; set_register: asm volatile("mov %0,%%cr4": "+r" (val), "+m" (cr4_pinned_bits)); if (static_branch_likely(&cr_pinning)) { - if (unlikely((val & cr4_pinned_bits) != cr4_pinned_bits)) { - bits_missing = ~val & cr4_pinned_bits; - val |= bits_missing; + if (unlikely((val & cr4_pinned_mask) != cr4_pinned_bits)) { + bits_changed = (val & cr4_pinned_mask) ^ cr4_pinned_bits; + val = (val & ~cr4_pinned_mask) | cr4_pinned_bits; goto set_register; } - /* Warn after we've set the missing bits. */ - WARN_ONCE(bits_missing, "CR4 bits went missing: %lx!?\n", - bits_missing); + /* Warn after we've corrected the changed bits. */ + WARN_ONCE(bits_changed, "pinned CR4 bits changed: 0x%lx!?\n", + bits_changed); } } #if IS_MODULE(CONFIG_LKDTM) @@ -419,7 +422,7 @@ void cr4_init(void) if (boot_cpu_has(X86_FEATURE_PCID)) cr4 |= X86_CR4_PCIDE; if (static_branch_likely(&cr_pinning)) - cr4 |= cr4_pinned_bits; + cr4 = (cr4 & ~cr4_pinned_mask) | cr4_pinned_bits; __write_cr4(cr4); @@ -434,10 +437,7 @@ void cr4_init(void) */ static void __init setup_cr_pinning(void) { - unsigned long mask; - - mask = (X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP); - cr4_pinned_bits = this_cpu_read(cpu_tlbstate.cr4) & mask; + cr4_pinned_bits = this_cpu_read(cpu_tlbstate.cr4) & cr4_pinned_mask; static_key_enable(&cr_pinning.key); } diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index fb538fccd24c..9d033693519a 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -81,8 +81,4 @@ extern void update_srbds_msr(void); extern u64 x86_read_arch_cap_msr(void); -#ifdef CONFIG_IA32_FEAT_CTL -void init_ia32_feat_ctl(struct cpuinfo_x86 *c); -#endif - #endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index c25a67a34bd3..0ab48f1cdf84 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -50,6 +50,13 @@ static enum split_lock_detect_state sld_state __ro_after_init = sld_off; static u64 msr_test_ctrl_cache __ro_after_init; /* + * With a name like MSR_TEST_CTL it should go without saying, but don't touch + * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it + * on CPUs that do not support SLD can cause fireworks, even when writing '0'. + */ +static bool cpu_model_supports_sld __ro_after_init; + +/* * Processors which have self-snooping capability can handle conflicting * memory type across CPUs by snooping its own cache. However, there exists * CPU models in which having conflicting memory types still leads to @@ -1071,7 +1078,8 @@ static void sld_update_msr(bool on) static void split_lock_init(void) { - split_lock_verify_msr(sld_state != sld_off); + if (cpu_model_supports_sld) + split_lock_verify_msr(sld_state != sld_off); } static void split_lock_warn(unsigned long ip) @@ -1177,5 +1185,6 @@ void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) return; } + cpu_model_supports_sld = true; split_lock_setup(); } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index ce9120c4f740..14e4b4d17ee5 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1083,7 +1083,7 @@ static noinstr bool mce_check_crashing_cpu(void) { unsigned int cpu = smp_processor_id(); - if (cpu_is_offline(cpu) || + if (arch_cpu_is_offline(cpu) || (crashing_cpu != -1 && crashing_cpu != cpu)) { u64 mcgstatus; @@ -1901,6 +1901,8 @@ void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check; static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) { + WARN_ON_ONCE(user_mode(regs)); + /* * Only required when from kernel mode. See * mce_check_crashing_cpu() for details. @@ -1954,7 +1956,7 @@ DEFINE_IDTENTRY_MCE_USER(exc_machine_check) } #else /* 32bit unified entry point */ -DEFINE_IDTENTRY_MCE(exc_machine_check) +DEFINE_IDTENTRY_RAW(exc_machine_check) { unsigned long dr7; diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 12f967c6b603..6a9df71c1b9e 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -981,10 +981,10 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c) c->x86_cache_max_rmid = ecx; c->x86_cache_occ_scale = ebx; - if (c->x86_vendor == X86_VENDOR_INTEL) - c->x86_cache_mbm_width_offset = eax & 0xff; - else - c->x86_cache_mbm_width_offset = -1; + c->x86_cache_mbm_width_offset = eax & 0xff; + + if (c->x86_vendor == X86_VENDOR_AMD && !c->x86_cache_mbm_width_offset) + c->x86_cache_mbm_width_offset = MBM_CNTR_WIDTH_OFFSET_AMD; } } diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index f20a47d120b1..5ffa32256b3b 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -37,6 +37,7 @@ #define MBA_IS_LINEAR 0x4 #define MBA_MAX_MBPS U32_MAX #define MAX_MBA_BW_AMD 0x800 +#define MBM_CNTR_WIDTH_OFFSET_AMD 20 #define RMID_VAL_ERROR BIT_ULL(63) #define RMID_VAL_UNAVAIL BIT_ULL(62) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 23b4b61319d3..3f844f14fc0a 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1117,6 +1117,7 @@ static int rdt_cdp_peer_get(struct rdt_resource *r, struct rdt_domain *d, _d_cdp = rdt_find_domain(_r_cdp, d->id, NULL); if (WARN_ON(IS_ERR_OR_NULL(_d_cdp))) { _r_cdp = NULL; + _d_cdp = NULL; ret = -EINVAL; } diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c index 300e3fd5ade3..ec8064c0ae03 100644 --- a/arch/x86/kernel/cpu/umwait.c +++ b/arch/x86/kernel/cpu/umwait.c @@ -18,12 +18,6 @@ */ static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE); -u32 get_umwait_control_msr(void) -{ - return umwait_control_cached; -} -EXPORT_SYMBOL_GPL(get_umwait_control_msr); - /* * Cache the original IA32_UMWAIT_CONTROL MSR value which is configured by * hardware or BIOS before kernel boot. diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c index df1358ba622b..05fa4ef63490 100644 --- a/arch/x86/kernel/cpu/zhaoxin.c +++ b/arch/x86/kernel/cpu/zhaoxin.c @@ -2,6 +2,7 @@ #include <linux/sched.h> #include <linux/sched/clock.h> +#include <asm/cpu.h> #include <asm/cpufeature.h> #include "cpu.h" diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 456511b2284e..7401cc12c3cc 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -71,6 +71,22 @@ static void printk_stack_address(unsigned long address, int reliable, printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); } +static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src, + unsigned int nbytes) +{ + if (!user_mode(regs)) + return copy_from_kernel_nofault(buf, (u8 *)src, nbytes); + + /* + * Make sure userspace isn't trying to trick us into dumping kernel + * memory by pointing the userspace instruction pointer at it. + */ + if (__chk_range_not_ok(src, nbytes, TASK_SIZE_MAX)) + return -EINVAL; + + return copy_from_user_nmi(buf, (void __user *)src, nbytes); +} + /* * There are a couple of reasons for the 2/3rd prologue, courtesy of Linus: * @@ -97,17 +113,8 @@ void show_opcodes(struct pt_regs *regs, const char *loglvl) #define OPCODE_BUFSIZE (PROLOGUE_SIZE + 1 + EPILOGUE_SIZE) u8 opcodes[OPCODE_BUFSIZE]; unsigned long prologue = regs->ip - PROLOGUE_SIZE; - bool bad_ip; - - /* - * Make sure userspace isn't trying to trick us into dumping kernel - * memory by pointing the userspace instruction pointer at it. - */ - bad_ip = user_mode(regs) && - __chk_range_not_ok(prologue, OPCODE_BUFSIZE, TASK_SIZE_MAX); - if (bad_ip || probe_kernel_read(opcodes, (u8 *)prologue, - OPCODE_BUFSIZE)) { + if (copy_code(regs, opcodes, prologue, sizeof(opcodes))) { printk("%sCode: Bad RIP value.\n", loglvl); } else { printk("%sCode: %" __stringify(PROLOGUE_SIZE) "ph <%02x> %" diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 06c818967bb6..15247b96c6ea 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -101,6 +101,12 @@ void kernel_fpu_begin(void) copy_fpregs_to_fpstate(¤t->thread.fpu); } __cpu_invalidate_fpregs_state(); + + if (boot_cpu_has(X86_FEATURE_XMM)) + ldmxcsr(MXCSR_DEFAULT); + + if (boot_cpu_has(X86_FEATURE_FPU)) + asm volatile ("fninit"); } EXPORT_SYMBOL_GPL(kernel_fpu_begin); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index bda2e5eaca0e..ad3a2b37927d 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1074,7 +1074,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of copy_part(offsetof(struct fxregs_state, st_space), 128, &xsave->i387.st_space, &kbuf, &offset_start, &count); if (header.xfeatures & XFEATURE_MASK_SSE) - copy_part(xstate_offsets[XFEATURE_MASK_SSE], 256, + copy_part(xstate_offsets[XFEATURE_SSE], 256, &xsave->i387.xmm_space, &kbuf, &offset_start, &count); /* * Fill xsave->i387.sw_reserved value for ptrace frame: diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index c84d28e90a58..51504566b3a6 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -86,7 +86,7 @@ static int ftrace_verify_code(unsigned long ip, const char *old_code) * sure what we read is what we expected it to be before modifying it. */ /* read the text we want to modify */ - if (probe_kernel_read(cur_code, (void *)ip, MCOUNT_INSN_SIZE)) { + if (copy_from_kernel_nofault(cur_code, (void *)ip, MCOUNT_INSN_SIZE)) { WARN_ON(1); return -EFAULT; } @@ -355,7 +355,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE); /* Copy ftrace_caller onto the trampoline memory */ - ret = probe_kernel_read(trampoline, (void *)start_offset, size); + ret = copy_from_kernel_nofault(trampoline, (void *)start_offset, size); if (WARN_ON(ret < 0)) goto fail; @@ -363,13 +363,13 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* The trampoline ends with ret(q) */ retq = (unsigned long)ftrace_stub; - ret = probe_kernel_read(ip, (void *)retq, RET_SIZE); + ret = copy_from_kernel_nofault(ip, (void *)retq, RET_SIZE); if (WARN_ON(ret < 0)) goto fail; if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { ip = trampoline + (ftrace_regs_caller_ret - ftrace_regs_caller); - ret = probe_kernel_read(ip, (void *)retq, RET_SIZE); + ret = copy_from_kernel_nofault(ip, (void *)retq, RET_SIZE); if (WARN_ON(ret < 0)) goto fail; } @@ -506,7 +506,7 @@ static void *addr_from_call(void *ptr) union text_poke_insn call; int ret; - ret = probe_kernel_read(&call, ptr, CALL_INSN_SIZE); + ret = copy_from_kernel_nofault(&call, ptr, CALL_INSN_SIZE); if (WARN_ON_ONCE(ret < 0)) return NULL; diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index f3c76252247d..282b4ee1339f 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -207,7 +207,7 @@ spurious_8259A_irq: * lets ACK and report it. [once per IRQ] */ if (!(spurious_irq_mask & irqmask)) { - printk(KERN_DEBUG + printk_deferred(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); spurious_irq_mask |= irqmask; } diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index c44fe7d8d9a4..68acd30c6b87 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -732,11 +732,11 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) int err; bpt->type = BP_BREAKPOINT; - err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, + err = copy_from_kernel_nofault(bpt->saved_instr, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE); if (err) return err; - err = probe_kernel_write((char *)bpt->bpt_addr, + err = copy_to_kernel_nofault((char *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE); if (!err) return err; @@ -768,7 +768,7 @@ int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) return 0; knl_write: - return probe_kernel_write((char *)bpt->bpt_addr, + return copy_to_kernel_nofault((char *)bpt->bpt_addr, (char *)bpt->saved_instr, BREAK_INSTR_SIZE); } diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 3bafe1bd4dc7..ada39ddbc922 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -243,7 +243,7 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) * Fortunately, we know that the original code is the ideal 5-byte * long NOP. */ - if (probe_kernel_read(buf, (void *)addr, + if (copy_from_kernel_nofault(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) return 0UL; @@ -346,7 +346,8 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn) return 0; /* This can access kernel text if given address is not recovered */ - if (probe_kernel_read(dest, (void *)recovered_insn, MAX_INSN_SIZE)) + if (copy_from_kernel_nofault(dest, (void *)recovered_insn, + MAX_INSN_SIZE)) return 0; kernel_insn_init(insn, dest, MAX_INSN_SIZE); @@ -753,16 +754,11 @@ asm( NOKPROBE_SYMBOL(kretprobe_trampoline); STACK_FRAME_NON_STANDARD(kretprobe_trampoline); -static struct kprobe kretprobe_kprobe = { - .addr = (void *)kretprobe_trampoline, -}; - /* * Called from kretprobe_trampoline */ __used __visible void *trampoline_handler(struct pt_regs *regs) { - struct kprobe_ctlblk *kcb; struct kretprobe_instance *ri = NULL; struct hlist_head *head, empty_rp; struct hlist_node *tmp; @@ -772,16 +768,12 @@ __used __visible void *trampoline_handler(struct pt_regs *regs) void *frame_pointer; bool skipped = false; - preempt_disable(); - /* * Set a dummy kprobe for avoiding kretprobe recursion. * Since kretprobe never run in kprobe handler, kprobe must not * be running at this point. */ - kcb = get_kprobe_ctlblk(); - __this_cpu_write(current_kprobe, &kretprobe_kprobe); - kcb->kprobe_status = KPROBE_HIT_ACTIVE; + kprobe_busy_begin(); INIT_HLIST_HEAD(&empty_rp); kretprobe_hash_lock(current, &head, &flags); @@ -857,7 +849,7 @@ __used __visible void *trampoline_handler(struct pt_regs *regs) __this_cpu_write(current_kprobe, &ri->rp->kp); ri->ret_addr = correct_ret_addr; ri->rp->handler(ri, regs); - __this_cpu_write(current_kprobe, &kretprobe_kprobe); + __this_cpu_write(current_kprobe, &kprobe_busy); } recycle_rp_inst(ri, &empty_rp); @@ -873,8 +865,7 @@ __used __visible void *trampoline_handler(struct pt_regs *regs) kretprobe_hash_unlock(current, &flags); - __this_cpu_write(current_kprobe, NULL); - preempt_enable(); + kprobe_busy_end(); hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 321c19950285..7af4c61dde52 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -56,7 +56,7 @@ found: * overwritten by jump destination address. In this case, original * bytes must be recovered from op->optinsn.copied_insn buffer. */ - if (probe_kernel_read(buf, (void *)addr, + if (copy_from_kernel_nofault(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) return 0UL; diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 8748321c4486..b8aee71840ae 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -29,6 +29,8 @@ #include <asm/mmu_context.h> #include <asm/pgtable_areas.h> +#include <xen/xen.h> + /* This is a multiple of PAGE_SIZE. */ #define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE) @@ -543,6 +545,28 @@ static int read_default_ldt(void __user *ptr, unsigned long bytecount) return bytecount; } +static bool allow_16bit_segments(void) +{ + if (!IS_ENABLED(CONFIG_X86_16BIT)) + return false; + +#ifdef CONFIG_XEN_PV + /* + * Xen PV does not implement ESPFIX64, which means that 16-bit + * segments will not work correctly. Until either Xen PV implements + * ESPFIX64 and can signal this fact to the guest or unless someone + * provides compelling evidence that allowing broken 16-bit segments + * is worthwhile, disallow 16-bit segments under Xen PV. + */ + if (xen_pv_domain()) { + pr_info_once("Warning: 16-bit segments do not work correctly in a Xen PV guest\n"); + return false; + } +#endif + + return true; +} + static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) { struct mm_struct *mm = current->mm; @@ -574,7 +598,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) /* The user wants to clear the entry. */ memset(&ldt, 0, sizeof(ldt)); } else { - if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) { + if (!ldt_info.seg_32bit && !allow_16bit_segments()) { error = -EINVAL; goto out; } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 2de365f15684..d7c5e44b26f7 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -478,7 +478,7 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7); DEFINE_IDTENTRY_RAW(exc_nmi) { - if (IS_ENABLED(CONFIG_SMP) && cpu_is_offline(smp_processor_id())) + if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id())) return; if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 674a7d66d960..de2138ba38e5 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -324,7 +324,8 @@ struct paravirt_patch_template pv_ops = { .cpu.swapgs = native_swapgs, #ifdef CONFIG_X86_IOPL_IOPERM - .cpu.update_io_bitmap = native_tss_update_io_bitmap, + .cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap, + .cpu.update_io_bitmap = native_tss_update_io_bitmap, #endif .cpu.start_context_switch = paravirt_nop, diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index ee0286390a4c..9e1def3744f2 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -94,12 +94,12 @@ static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short } static bool probe_list(struct pci_dev *pdev, unsigned short vendor, - const unsigned char *rom_list) + const void *rom_list) { unsigned short device; do { - if (probe_kernel_address(rom_list, device) != 0) + if (get_kernel_nofault(device, rom_list) != 0) device = 0; if (device && match_id(pdev, vendor, device)) @@ -119,19 +119,19 @@ static struct resource *find_oprom(struct pci_dev *pdev) for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) { struct resource *res = &adapter_rom_resources[i]; unsigned short offset, vendor, device, list, rev; - const unsigned char *rom; + const void *rom; if (res->end == 0) break; rom = isa_bus_to_virt(res->start); - if (probe_kernel_address(rom + 0x18, offset) != 0) + if (get_kernel_nofault(offset, rom + 0x18) != 0) continue; - if (probe_kernel_address(rom + offset + 0x4, vendor) != 0) + if (get_kernel_nofault(vendor, rom + offset + 0x4) != 0) continue; - if (probe_kernel_address(rom + offset + 0x6, device) != 0) + if (get_kernel_nofault(device, rom + offset + 0x6) != 0) continue; if (match_id(pdev, vendor, device)) { @@ -139,8 +139,8 @@ static struct resource *find_oprom(struct pci_dev *pdev) break; } - if (probe_kernel_address(rom + offset + 0x8, list) == 0 && - probe_kernel_address(rom + offset + 0xc, rev) == 0 && + if (get_kernel_nofault(list, rom + offset + 0x8) == 0 && + get_kernel_nofault(rev, rom + offset + 0xc) == 0 && rev >= 3 && list && probe_list(pdev, vendor, rom + offset + list)) { oprom = res; @@ -183,14 +183,14 @@ static int __init romsignature(const unsigned char *rom) const unsigned short * const ptr = (const unsigned short *)rom; unsigned short sig; - return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE; + return get_kernel_nofault(sig, ptr) == 0 && sig == ROMSIGNATURE; } static int __init romchecksum(const unsigned char *rom, unsigned long length) { unsigned char sum, c; - for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--) + for (sum = 0; length && get_kernel_nofault(c, rom++) == 0; length--) sum += c; return !length && !sum; } @@ -211,7 +211,7 @@ void __init probe_roms(void) video_rom_resource.start = start; - if (probe_kernel_address(rom + 2, c) != 0) + if (get_kernel_nofault(c, rom + 2) != 0) continue; /* 0 < length <= 0x7f * 512, historically */ @@ -249,7 +249,7 @@ void __init probe_roms(void) if (!romsignature(rom)) continue; - if (probe_kernel_address(rom + 2, c) != 0) + if (get_kernel_nofault(c, rom + 2) != 0) continue; /* 0 < length <= 0x7f * 512, historically */ diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index f362ce0d5ac0..fe67dbd76e51 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -322,20 +322,6 @@ void arch_setup_new_exec(void) } #ifdef CONFIG_X86_IOPL_IOPERM -static inline void tss_invalidate_io_bitmap(struct tss_struct *tss) -{ - /* - * Invalidate the I/O bitmap by moving io_bitmap_base outside the - * TSS limit so any subsequent I/O access from user space will - * trigger a #GP. - * - * This is correct even when VMEXIT rewrites the TSS limit - * to 0x67 as the only requirement is that the base points - * outside the limit. - */ - tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID; -} - static inline void switch_to_bitmap(unsigned long tifp) { /* @@ -346,7 +332,7 @@ static inline void switch_to_bitmap(unsigned long tifp) * user mode. */ if (tifp & _TIF_IO_BITMAP) - tss_invalidate_io_bitmap(this_cpu_ptr(&cpu_tss_rw)); + tss_invalidate_io_bitmap(); } static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm) @@ -380,7 +366,7 @@ void native_tss_update_io_bitmap(void) u16 *base = &tss->x86_tss.io_bitmap_base; if (!test_thread_flag(TIF_IO_BITMAP)) { - tss_invalidate_io_bitmap(tss); + native_tss_invalidate_io_bitmap(); return; } diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 6ad43fc44556..2fd698e28e4d 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -58,7 +58,6 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, * or a page fault), which can make frame pointers * unreliable. */ - if (IS_ENABLED(CONFIG_FRAME_POINTER)) return -EINVAL; } @@ -81,10 +80,6 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, if (unwind_error(&state)) return -EINVAL; - /* Success path for non-user tasks, i.e. kthreads and idle tasks */ - if (!(task->flags & (PF_KTHREAD | PF_IDLE))) - return -EINVAL; - return 0; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index af75109485c2..b7cb3e0716f7 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -84,17 +84,16 @@ static inline void cond_local_irq_disable(struct pt_regs *regs) local_irq_disable(); } -int is_valid_bugaddr(unsigned long addr) +__always_inline int is_valid_bugaddr(unsigned long addr) { - unsigned short ud; - if (addr < TASK_SIZE_MAX) return 0; - if (probe_kernel_address((unsigned short *)addr, ud)) - return 0; - - return ud == INSN_UD0 || ud == INSN_UD2; + /* + * We got #UD, if the text isn't readable we'd have gotten + * a different exception. + */ + return *(unsigned short *)addr == INSN_UD2; } static nokprobe_inline int @@ -216,40 +215,45 @@ static inline void handle_invalid_op(struct pt_regs *regs) ILL_ILLOPN, error_get_trap_addr(regs)); } -DEFINE_IDTENTRY_RAW(exc_invalid_op) +static noinstr bool handle_bug(struct pt_regs *regs) { - bool rcu_exit; + bool handled = false; + + if (!is_valid_bugaddr(regs->ip)) + return handled; /* - * Handle BUG/WARN like NMIs instead of like normal idtentries: - * if we bugged/warned in a bad RCU context, for example, the last - * thing we want is to BUG/WARN again in the idtentry code, ad - * infinitum. + * All lies, just get the WARN/BUG out. + */ + instrumentation_begin(); + /* + * Since we're emulating a CALL with exceptions, restore the interrupt + * state to what it was at the exception site. */ - if (!user_mode(regs) && is_valid_bugaddr(regs->ip)) { - enum bug_trap_type type; + if (regs->flags & X86_EFLAGS_IF) + raw_local_irq_enable(); + if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) { + regs->ip += LEN_UD2; + handled = true; + } + if (regs->flags & X86_EFLAGS_IF) + raw_local_irq_disable(); + instrumentation_end(); - nmi_enter(); - instrumentation_begin(); - trace_hardirqs_off_finish(); - type = report_bug(regs->ip, regs); - if (regs->flags & X86_EFLAGS_IF) - trace_hardirqs_on_prepare(); - instrumentation_end(); - nmi_exit(); + return handled; +} - if (type == BUG_TRAP_TYPE_WARN) { - /* Skip the ud2. */ - regs->ip += LEN_UD2; - return; - } +DEFINE_IDTENTRY_RAW(exc_invalid_op) +{ + bool rcu_exit; - /* - * Else, if this was a BUG and report_bug returns or if this - * was just a normal #UD, we want to continue onward and - * crash. - */ - } + /* + * We use UD2 as a short encoding for 'CALL __WARN', as such + * handle it before exception entry to avoid recursive WARN + * in case exception entry is the one triggering WARNs. + */ + if (!user_mode(regs) && handle_bug(regs)) + return; rcu_exit = idtentry_enter_cond_rcu(regs); instrumentation_begin(); @@ -299,6 +303,8 @@ DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check) do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs, error_code, BUS_ADRALN, NULL); + + local_irq_disable(); } #ifdef CONFIG_VMAP_STACK @@ -488,7 +494,8 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs, u8 insn_buf[MAX_INSN_SIZE]; struct insn insn; - if (probe_kernel_read(insn_buf, (void *)regs->ip, MAX_INSN_SIZE)) + if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, + MAX_INSN_SIZE)) return GP_NO_HINT; kernel_insn_init(&insn, insn_buf, MAX_INSN_SIZE); @@ -690,13 +697,13 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) (struct bad_iret_stack *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; /* Copy the IRET target to the temporary storage. */ - memcpy(&tmp.regs.ip, (void *)s->regs.sp, 5*8); + __memcpy(&tmp.regs.ip, (void *)s->regs.sp, 5*8); /* Copy the remainder of the stack from the current stack. */ - memcpy(&tmp, s, offsetof(struct bad_iret_stack, regs.ip)); + __memcpy(&tmp, s, offsetof(struct bad_iret_stack, regs.ip)); /* Update the entry stack */ - memcpy(new_stack, &tmp, sizeof(tmp)); + __memcpy(new_stack, &tmp, sizeof(tmp)); BUG_ON(!user_mode(&new_stack->regs)); return new_stack; @@ -865,6 +872,12 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, trace_hardirqs_off_finish(); /* + * If something gets miswired and we end up here for a user mode + * #DB, we will malfunction. + */ + WARN_ON_ONCE(user_mode(regs)); + + /* * Catch SYSENTER with TF set and clear DR_STEP. If this hit a * watchpoint at the same time then that will still be handled. */ @@ -882,6 +895,12 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, static __always_inline void exc_debug_user(struct pt_regs *regs, unsigned long dr6) { + /* + * If something gets miswired and we end up here for a kernel mode + * #DB, we will malfunction. + */ + WARN_ON_ONCE(!user_mode(regs)); + idtentry_enter_user(regs); instrumentation_begin(); @@ -912,7 +931,7 @@ DEFINE_IDTENTRY_DEBUG_USER(exc_debug) } #else /* 32 bit does not have separate entry points. */ -DEFINE_IDTENTRY_DEBUG(exc_debug) +DEFINE_IDTENTRY_RAW(exc_debug) { unsigned long dr6, dr7; diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 7f969b2d240f..ec88bbe08a32 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -440,8 +440,11 @@ bool unwind_next_frame(struct unwind_state *state) /* * Find the orc_entry associated with the text address. * - * Decrement call return addresses by one so they work for sibling - * calls and calls to noreturn functions. + * For a call frame (as opposed to a signal frame), state->ip points to + * the instruction after the call. That instruction's stack layout + * could be different from the call instruction's layout, for example + * if the call was to a noreturn function. So get the ORC data for the + * call instruction itself. */ orc = orc_find(state->signal ? state->ip : state->ip - 1); if (!orc) { @@ -662,6 +665,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, state->sp = task->thread.sp; state->bp = READ_ONCE_NOCHECK(frame->bp); state->ip = READ_ONCE_NOCHECK(frame->ret_addr); + state->signal = (void *)state->ip == ret_from_fork; } if (get_stack_info((unsigned long *)state->sp, state->task, diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 3bfc8dd8a43d..9a03e5b23135 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -358,6 +358,7 @@ SECTIONS .bss : AT(ADDR(.bss) - LOAD_OFFSET) { __bss_start = .; *(.bss..page_aligned) + . = ALIGN(PAGE_SIZE); *(BSS_MAIN) BSS_DECRYPTED . = ALIGN(PAGE_SIZE); diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index ff2d0e9ca3bc..cfe83d4ae625 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -7,7 +7,7 @@ #define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS #define KVM_POSSIBLE_CR4_GUEST_BITS \ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE) + | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE | X86_CR4_TSD) #define BUILD_KVM_GPR_ACCESSORS(lname, uname) \ static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 34a7e0533dad..4ce2ddd26c0b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -169,6 +169,18 @@ static void kvm_apic_map_free(struct rcu_head *rcu) kvfree(map); } +/* + * CLEAN -> DIRTY and UPDATE_IN_PROGRESS -> DIRTY changes happen without a lock. + * + * DIRTY -> UPDATE_IN_PROGRESS and UPDATE_IN_PROGRESS -> CLEAN happen with + * apic_map_lock_held. + */ +enum { + CLEAN, + UPDATE_IN_PROGRESS, + DIRTY +}; + void kvm_recalculate_apic_map(struct kvm *kvm) { struct kvm_apic_map *new, *old = NULL; @@ -176,17 +188,17 @@ void kvm_recalculate_apic_map(struct kvm *kvm) int i; u32 max_id = 255; /* enough space for any xAPIC ID */ - if (!kvm->arch.apic_map_dirty) { - /* - * Read kvm->arch.apic_map_dirty before - * kvm->arch.apic_map - */ - smp_rmb(); + /* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. */ + if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN) return; - } mutex_lock(&kvm->arch.apic_map_lock); - if (!kvm->arch.apic_map_dirty) { + /* + * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map + * (if clean) or the APIC registers (if dirty). + */ + if (atomic_cmpxchg_acquire(&kvm->arch.apic_map_dirty, + DIRTY, UPDATE_IN_PROGRESS) == CLEAN) { /* Someone else has updated the map. */ mutex_unlock(&kvm->arch.apic_map_lock); return; @@ -256,11 +268,11 @@ out: lockdep_is_held(&kvm->arch.apic_map_lock)); rcu_assign_pointer(kvm->arch.apic_map, new); /* - * Write kvm->arch.apic_map before - * clearing apic->apic_map_dirty + * Write kvm->arch.apic_map before clearing apic->apic_map_dirty. + * If another update has come in, leave it DIRTY. */ - smp_wmb(); - kvm->arch.apic_map_dirty = false; + atomic_cmpxchg_release(&kvm->arch.apic_map_dirty, + UPDATE_IN_PROGRESS, CLEAN); mutex_unlock(&kvm->arch.apic_map_lock); if (old) @@ -282,20 +294,20 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) else static_key_slow_inc(&apic_sw_disabled.key); - apic->vcpu->kvm->arch.apic_map_dirty = true; + atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } } static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id) { kvm_lapic_set_reg(apic, APIC_ID, id << 24); - apic->vcpu->kvm->arch.apic_map_dirty = true; + atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) { kvm_lapic_set_reg(apic, APIC_LDR, id); - apic->vcpu->kvm->arch.apic_map_dirty = true; + atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } static inline u32 kvm_apic_calc_x2apic_ldr(u32 id) @@ -311,7 +323,7 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id) kvm_lapic_set_reg(apic, APIC_ID, id); kvm_lapic_set_reg(apic, APIC_LDR, ldr); - apic->vcpu->kvm->arch.apic_map_dirty = true; + atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) @@ -1976,7 +1988,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_DFR: if (!apic_x2apic_mode(apic)) { kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); - apic->vcpu->kvm->arch.apic_map_dirty = true; + atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } else ret = 1; break; @@ -2183,7 +2195,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) || + if (!kvm_apic_present(vcpu) || apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) return; @@ -2232,7 +2244,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) static_key_slow_dec_deferred(&apic_hw_disabled); } else { static_key_slow_inc(&apic_hw_disabled.key); - vcpu->kvm->arch.apic_map_dirty = true; + atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } } @@ -2273,7 +2285,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) if (!apic) return; - vcpu->kvm->arch.apic_map_dirty = false; /* Stop the timer in case it's a reset to an active apic */ hrtimer_cancel(&apic->lapic_timer.timer); @@ -2567,6 +2578,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) } memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s)); + atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); kvm_recalculate_apic_map(vcpu->kvm); kvm_apic_set_version(vcpu); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 0ad06bfe2c2c..444bb9c54548 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -222,7 +222,7 @@ void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn); -int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); +int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa); int kvm_mmu_post_init_vm(struct kvm *kvm); void kvm_mmu_pre_destroy_vm(struct kvm *kvm); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index fdd05c233308..6d6a0ae7800c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1745,10 +1745,10 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, * Emulate arch specific page modification logging for the * nested hypervisor */ -int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu) +int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa) { if (kvm_x86_ops.write_log_dirty) - return kvm_x86_ops.write_log_dirty(vcpu); + return kvm_x86_ops.write_log_dirty(vcpu, l2_gpa); return 0; } @@ -4449,7 +4449,7 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, nonleaf_bit8_rsvd | rsvd_bits(7, 7) | rsvd_bits(maxphyaddr, 51); rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd | - nonleaf_bit8_rsvd | gbpages_bit_rsvd | + gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51); rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index a6d484ea110b..bd70ece1ef8b 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -235,7 +235,7 @@ static inline unsigned FNAME(gpte_access)(u64 gpte) static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, struct guest_walker *walker, - int write_fault) + gpa_t addr, int write_fault) { unsigned level, index; pt_element_t pte, orig_pte; @@ -260,7 +260,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, !(pte & PT_GUEST_DIRTY_MASK)) { trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); #if PTTYPE == PTTYPE_EPT - if (kvm_arch_write_log_dirty(vcpu)) + if (kvm_arch_write_log_dirty(vcpu, addr)) return -EINVAL; #endif pte |= PT_GUEST_DIRTY_MASK; @@ -360,7 +360,6 @@ retry_walk: ++walker->level; do { - gfn_t real_gfn; unsigned long host_addr; pt_access = pte_access; @@ -375,7 +374,7 @@ retry_walk: walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), + real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), nested_access, &walker->fault); @@ -389,12 +388,10 @@ retry_walk: * information to fix the exit_qualification or exit_info_1 * fields. */ - if (unlikely(real_gfn == UNMAPPED_GVA)) + if (unlikely(real_gpa == UNMAPPED_GVA)) return 0; - real_gfn = gpa_to_gfn(real_gfn); - - host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, real_gfn, + host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gpa_to_gfn(real_gpa), &walker->pte_writable[walker->level - 1]); if (unlikely(kvm_is_error_hva(host_addr))) goto error; @@ -457,7 +454,8 @@ retry_walk: (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT); if (unlikely(!accessed_dirty)) { - ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); + ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, + addr, write_fault); if (unlikely(ret < 0)) goto error; else if (ret) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 8ccfa4197d9c..5bbf76189afa 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1090,7 +1090,7 @@ static void init_vmcb(struct vcpu_svm *svm) svm->nested.vmcb = 0; svm->vcpu.arch.hflags = 0; - if (pause_filter_count) { + if (!kvm_pause_in_guest(svm->vcpu.kvm)) { control->pause_filter_count = pause_filter_count; if (pause_filter_thresh) control->pause_filter_thresh = pause_filter_thresh; @@ -2693,7 +2693,7 @@ static int pause_interception(struct vcpu_svm *svm) struct kvm_vcpu *vcpu = &svm->vcpu; bool in_kernel = (svm_get_cpl(vcpu) == 0); - if (pause_filter_thresh) + if (!kvm_pause_in_guest(vcpu->kvm)) grow_ple_window(vcpu); kvm_vcpu_on_spin(vcpu, in_kernel); @@ -3344,7 +3344,7 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs); -static fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) +static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) { fastpath_t exit_fastpath; struct vcpu_svm *svm = to_svm(vcpu); @@ -3780,7 +3780,7 @@ static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) { - if (pause_filter_thresh) + if (!kvm_pause_in_guest(vcpu->kvm)) shrink_ple_window(vcpu); } @@ -3958,6 +3958,9 @@ static void svm_vm_destroy(struct kvm *kvm) static int svm_vm_init(struct kvm *kvm) { + if (!pause_filter_count || !pause_filter_thresh) + kvm->arch.pause_in_guest = true; + if (avic) { int ret = avic_vm_init(kvm); if (ret) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d1af20b050a8..11e4df560018 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4109,7 +4109,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * CR0_GUEST_HOST_MASK is already set in the original vmcs01 * (KVM doesn't change it); */ - vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; + vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; vmx_set_cr0(vcpu, vmcs12->host_cr0); /* Same as above - no reason to call set_cr4_guest_host_mask(). */ @@ -4259,7 +4259,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) */ vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); - vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; + vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); @@ -6079,6 +6079,9 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) return -EINVAL; + if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) + return -EINVAL; + /* * SMM temporarily disables VMX, so we cannot be in guest mode, * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags @@ -6108,9 +6111,16 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (ret) return ret; - /* Empty 'VMXON' state is permitted */ - if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) - return 0; + /* Empty 'VMXON' state is permitted if no VMCS loaded */ + if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) { + /* See vmx_has_valid_vmcs12. */ + if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || + (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || + (kvm_state->hdr.vmx.vmcs12_pa != -1ull)) + return -EINVAL; + else + return 0; + } if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) { if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || @@ -6176,6 +6186,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, goto error_guest_mode; } + vmx->nested.has_preemption_timer_deadline = false; if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) { vmx->nested.has_preemption_timer_deadline = true; vmx->nested.preemption_timer_deadline = diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 758bccc26cf9..197148d76b8f 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -47,6 +47,11 @@ static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->nested.cached_shadow_vmcs12; } +/* + * Note: the same condition is checked against the state provided by userspace + * in vmx_set_nested_state; if it is satisfied, the nested state must include + * the VMCS12. + */ static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 5c0ff80b85c0..7a3675fddec2 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -72,11 +72,24 @@ struct loaded_vmcs { struct vmcs_controls_shadow controls_shadow; }; +static inline bool is_intr_type(u32 intr_info, u32 type) +{ + const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK; + + return (intr_info & mask) == (INTR_INFO_VALID_MASK | type); +} + +static inline bool is_intr_type_n(u32 intr_info, u32 type, u8 vector) +{ + const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK | + INTR_INFO_VECTOR_MASK; + + return (intr_info & mask) == (INTR_INFO_VALID_MASK | type | vector); +} + static inline bool is_exception_n(u32 intr_info, u8 vector) { - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | - INTR_INFO_VALID_MASK)) == - (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK); + return is_intr_type_n(intr_info, INTR_TYPE_HARD_EXCEPTION, vector); } static inline bool is_debug(u32 intr_info) @@ -106,28 +119,23 @@ static inline bool is_gp_fault(u32 intr_info) static inline bool is_machine_check(u32 intr_info) { - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | - INTR_INFO_VALID_MASK)) == - (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); + return is_exception_n(intr_info, MC_VECTOR); } /* Undocumented: icebp/int1 */ static inline bool is_icebp(u32 intr_info) { - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) - == (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK); + return is_intr_type(intr_info, INTR_TYPE_PRIV_SW_EXCEPTION); } static inline bool is_nmi(u32 intr_info) { - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) - == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK); + return is_intr_type(intr_info, INTR_TYPE_NMI_INTR); } static inline bool is_external_intr(u32 intr_info) { - return (intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) - == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR); + return is_intr_type(intr_info, INTR_TYPE_EXT_INTR); } enum vmcs_field_width { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 36c771728c8c..13745f2a5ecd 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -133,9 +133,6 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); #define KVM_VM_CR0_ALWAYS_ON \ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \ X86_CR0_WP | X86_CR0_PG | X86_CR0_PE) -#define KVM_CR4_GUEST_OWNED_BITS \ - (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD) #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) @@ -4034,9 +4031,9 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) { - vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; - if (enable_ept) - vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; + vmx->vcpu.arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS; + if (!enable_ept) + vmx->vcpu.arch.cr4_guest_owned_bits &= ~X86_CR4_PGE; if (is_guest_mode(&vmx->vcpu)) vmx->vcpu.arch.cr4_guest_owned_bits &= ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask; @@ -4333,8 +4330,8 @@ static void init_vmcs(struct vcpu_vmx *vmx) /* 22.2.1, 20.8.1 */ vm_entry_controls_set(vmx, vmx_vmentry_ctrl()); - vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS; - vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS); + vmx->vcpu.arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; + vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits); set_cr4_guest_host_mask(vmx); @@ -6606,23 +6603,6 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host, false); } -static void atomic_switch_umwait_control_msr(struct vcpu_vmx *vmx) -{ - u32 host_umwait_control; - - if (!vmx_has_waitpkg(vmx)) - return; - - host_umwait_control = get_umwait_control_msr(); - - if (vmx->msr_ia32_umwait_control != host_umwait_control) - add_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL, - vmx->msr_ia32_umwait_control, - host_umwait_control, false); - else - clear_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL); -} - static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6728,9 +6708,7 @@ reenter_guest: pt_guest_enter(vmx); - if (vcpu_to_pmu(vcpu)->version) - atomic_switch_perf_msrs(vmx); - atomic_switch_umwait_control_msr(vmx); + atomic_switch_perf_msrs(vmx); if (enable_preemption_timer) vmx_update_hv_timer(vcpu); @@ -7501,11 +7479,11 @@ static void vmx_flush_log_dirty(struct kvm *kvm) kvm_flush_pml_buffers(kvm); } -static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu) +static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) { struct vmcs12 *vmcs12; struct vcpu_vmx *vmx = to_vmx(vcpu); - gpa_t gpa, dst; + gpa_t dst; if (is_guest_mode(vcpu)) { WARN_ON_ONCE(vmx->nested.pml_full); @@ -7524,7 +7502,7 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu) return 1; } - gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull; + gpa &= ~0xFFFull; dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 8a83b5edc820..639798e4a6ca 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -288,8 +288,6 @@ struct vcpu_vmx { u64 current_tsc_ratio; - u32 host_pkru; - unsigned long host_debugctlmsr; /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 00c88c2f34e4..88c593f83b28 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -975,6 +975,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) return 1; + if ((cr4 ^ old_cr4) & X86_CR4_LA57) + return 1; } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && ((cr4 ^ old_cr4) & pdptr_bits) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, @@ -2693,6 +2695,9 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) if (data & 0x30) return 1; + if (!lapic_in_kernel(vcpu)) + return 1; + vcpu->arch.apf.msr_en_val = data; if (!kvm_pv_async_pf_enabled(vcpu)) { @@ -2856,7 +2861,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: return kvm_set_apic_base(vcpu, msr_info); - case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_write(vcpu, msr, data); case MSR_IA32_TSCDEADLINE: kvm_set_lapic_tscdeadline_msr(vcpu, data); @@ -3196,7 +3201,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_APICBASE: msr_info->data = kvm_get_apic_base(vcpu); break; - case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); case MSR_IA32_TSCDEADLINE: msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu); @@ -4603,7 +4608,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; user_tsc_khz = (u32)arg; - if (user_tsc_khz >= kvm_max_guest_tsc_khz) + if (kvm_has_tsc_control && + user_tsc_khz >= kvm_max_guest_tsc_khz) goto out; if (user_tsc_khz == 0) diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 56b243b14c3a..bbcc05bcefad 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -8,6 +8,8 @@ #include <asm/alternative-asm.h> #include <asm/export.h> +.pushsection .noinstr.text, "ax" + /* * We build a jump to memcpy_orig by default which gets NOPped out on * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which @@ -184,6 +186,8 @@ SYM_FUNC_START_LOCAL(memcpy_orig) retq SYM_FUNC_END(memcpy_orig) +.popsection + #ifndef CONFIG_UML MCSAFE_TEST_CTL diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index fff28c6f73a2..b0dfac3d3df7 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -24,6 +24,7 @@ unsigned long __clear_user(void __user *addr, unsigned long size) asm volatile( " testq %[size8],%[size8]\n" " jz 4f\n" + " .align 16\n" "0: movq $0,(%[dst])\n" " addq $8,%[dst]\n" " decl %%ecx ; jnz 0b\n" diff --git a/arch/x86/math-emu/wm_sqrt.S b/arch/x86/math-emu/wm_sqrt.S index 3b2b58164ec1..40526dd85137 100644 --- a/arch/x86/math-emu/wm_sqrt.S +++ b/arch/x86/math-emu/wm_sqrt.S @@ -209,7 +209,7 @@ sqrt_stage_2_finish: #ifdef PARANOID /* It should be possible to get here only if the arg is ffff....ffff */ - cmp $0xffffffff,FPU_fsqrt_arg_1 + cmpl $0xffffffff,FPU_fsqrt_arg_1 jnz sqrt_stage_2_error #endif /* PARANOID */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 66be9bd60307..1ead568c0101 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -99,7 +99,7 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, return !instr_lo || (instr_lo>>1) == 1; case 0x00: /* Prefetch instruction is 0x0F0D or 0x0F18 */ - if (probe_kernel_address(instr, opcode)) + if (get_kernel_nofault(opcode, instr)) return 0; *prefetch = (instr_lo == 0xF) && @@ -133,7 +133,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) while (instr < max_instr) { unsigned char opcode; - if (probe_kernel_address(instr, opcode)) + if (get_kernel_nofault(opcode, instr)) break; instr++; @@ -301,7 +301,7 @@ static int bad_address(void *p) { unsigned long dummy; - return probe_kernel_address((unsigned long *)p, dummy); + return get_kernel_nofault(dummy, (unsigned long *)p); } static void dump_pagetable(unsigned long address) @@ -442,7 +442,7 @@ static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index) return; } - if (probe_kernel_read(&desc, (void *)(gdt->address + offset), + if (copy_from_kernel_nofault(&desc, (void *)(gdt->address + offset), sizeof(struct ldttss_desc))) { pr_alert("%s: 0x%hx -- GDT entry is not readable\n", name, index); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index bda909e3e37e..8b4afad84f4a 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -737,7 +737,7 @@ static void __init test_wp_bit(void) __set_fixmap(FIX_WP_TEST, __pa_symbol(empty_zero_page), PAGE_KERNEL_RO); - if (probe_kernel_write((char *)fix_to_virt(FIX_WP_TEST), &z, 1)) { + if (copy_to_kernel_nofault((char *)fix_to_virt(FIX_WP_TEST), &z, 1)) { clear_fixmap(FIX_WP_TEST); printk(KERN_CONT "Ok.\n"); return; diff --git a/arch/x86/mm/maccess.c b/arch/x86/mm/maccess.c index e1d7d7477c22..92ec176a7293 100644 --- a/arch/x86/mm/maccess.c +++ b/arch/x86/mm/maccess.c @@ -9,7 +9,7 @@ static __always_inline u64 canonical_address(u64 vaddr, u8 vaddr_bits) return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits); } -bool probe_kernel_read_allowed(const void *unsafe_src, size_t size) +bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) { unsigned long vaddr = (unsigned long)unsafe_src; @@ -22,7 +22,7 @@ bool probe_kernel_read_allowed(const void *unsafe_src, size_t size) canonical_address(vaddr, boot_cpu_data.x86_virt_bits) == vaddr; } #else -bool probe_kernel_read_allowed(const void *unsafe_src, size_t size) +bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) { return (unsigned long)unsafe_src >= TASK_SIZE_MAX; } diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index 9c97d814125e..4f15280732ed 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -302,7 +302,7 @@ static const struct pci_raw_ops *__init pci_find_bios(void) check <= (union bios32 *) __va(0xffff0); ++check) { long sig; - if (probe_kernel_address(&check->fields.signature, sig)) + if (get_kernel_nofault(sig, &check->fields.signature)) continue; if (check->fields.signature != BIOS32_SIGNATURE) diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index b8f7f193f383..30bd5714a3d4 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -287,8 +287,8 @@ void intel_scu_devices_create(void) adapter = i2c_get_adapter(i2c_bus[i]); if (adapter) { - client = i2c_new_device(adapter, i2c_devs[i]); - if (!client) + client = i2c_new_client_device(adapter, i2c_devs[i]); + if (IS_ERR(client)) pr_err("can't create i2c device %s\n", i2c_devs[i]->type); } else diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index fc13cbbb2dce..abb6075397f0 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -167,9 +167,10 @@ static struct irq_domain *uv_get_irq_domain(void) goto out; uv_domain = irq_domain_create_tree(fn, &uv_domain_ops, NULL); - irq_domain_free_fwnode(fn); if (uv_domain) uv_domain->parent = x86_vector_domain; + else + irq_domain_free_fwnode(fn); out: mutex_unlock(&uv_lock); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 7c65102debaf..db1378c6ff26 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -193,6 +193,8 @@ static void fix_processor_context(void) */ static void notrace __restore_processor_state(struct saved_context *ctxt) { + struct cpuinfo_x86 *c; + if (ctxt->misc_enable_saved) wrmsrl(MSR_IA32_MISC_ENABLE, ctxt->misc_enable); /* @@ -263,6 +265,10 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) mtrr_bp_restore(); perf_restore_debug_store(); msr_restore_context(ctxt); + + c = &cpu_data(smp_processor_id()); + if (cpu_has(c, X86_FEATURE_MSR_IA32_FEAT_CTL)) + init_ia32_feat_ctl(c); } /* Needed by apm.c */ diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index b04e6e72a592..088bd764e0b7 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -34,6 +34,7 @@ KCOV_INSTRUMENT := n PURGATORY_CFLAGS_REMOVE := -mcmodel=kernel PURGATORY_CFLAGS := -mcmodel=large -ffreestanding -fno-zero-initialized-in-bss PURGATORY_CFLAGS += $(DISABLE_STACKLEAK_PLUGIN) -DDISABLE_BRANCH_PROFILING +PURGATORY_CFLAGS += $(call cc-option,-fno-stack-protector) # Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That # in turn leaves some undefined symbols like __fentry__ in purgatory and not diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 33b309d65955..c46b9f2e732f 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -386,7 +386,7 @@ static void set_aliased_prot(void *v, pgprot_t prot) preempt_disable(); - probe_kernel_read(&dummy, v, 1); + copy_from_kernel_nofault(&dummy, v, 1); if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0)) BUG(); @@ -598,6 +598,26 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, } #ifdef CONFIG_X86_64 +void noist_exc_debug(struct pt_regs *regs); + +DEFINE_IDTENTRY_RAW(xenpv_exc_nmi) +{ + /* On Xen PV, NMI doesn't use IST. The C part is the sane as native. */ + exc_nmi(regs); +} + +DEFINE_IDTENTRY_RAW(xenpv_exc_debug) +{ + /* + * There's no IST on Xen PV, but we still need to dispatch + * to the correct handler. + */ + if (user_mode(regs)) + noist_exc_debug(regs); + else + exc_debug(regs); +} + struct trap_array_entry { void (*orig)(void); void (*xen)(void); @@ -609,18 +629,18 @@ struct trap_array_entry { .xen = xen_asm_##func, \ .ist_okay = ist_ok } -#define TRAP_ENTRY_REDIR(func, xenfunc, ist_ok) { \ +#define TRAP_ENTRY_REDIR(func, ist_ok) { \ .orig = asm_##func, \ - .xen = xen_asm_##xenfunc, \ + .xen = xen_asm_xenpv_##func, \ .ist_okay = ist_ok } static struct trap_array_entry trap_array[] = { - TRAP_ENTRY_REDIR(exc_debug, exc_xendebug, true ), + TRAP_ENTRY_REDIR(exc_debug, true ), TRAP_ENTRY(exc_double_fault, true ), #ifdef CONFIG_X86_MCE TRAP_ENTRY(exc_machine_check, true ), #endif - TRAP_ENTRY_REDIR(exc_nmi, exc_xennmi, true ), + TRAP_ENTRY_REDIR(exc_nmi, true ), TRAP_ENTRY(exc_int3, false ), TRAP_ENTRY(exc_overflow, false ), #ifdef CONFIG_IA32_EMULATION @@ -850,6 +870,17 @@ static void xen_load_sp0(unsigned long sp0) } #ifdef CONFIG_X86_IOPL_IOPERM +static void xen_invalidate_io_bitmap(void) +{ + struct physdev_set_iobitmap iobitmap = { + .bitmap = 0, + .nr_ports = 0, + }; + + native_tss_invalidate_io_bitmap(); + HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &iobitmap); +} + static void xen_update_io_bitmap(void) { struct physdev_set_iobitmap iobitmap; @@ -1079,6 +1110,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .load_sp0 = xen_load_sp0, #ifdef CONFIG_X86_IOPL_IOPERM + .invalidate_io_bitmap = xen_invalidate_io_bitmap, .update_io_bitmap = xen_update_io_bitmap, #endif .io_delay = xen_io_delay, diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 5d252aaeade8..aab1d99b2b48 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -29,10 +29,9 @@ _ASM_NOKPROBE(xen_\name) .endm xen_pv_trap asm_exc_divide_error -xen_pv_trap asm_exc_debug -xen_pv_trap asm_exc_xendebug +xen_pv_trap asm_xenpv_exc_debug xen_pv_trap asm_exc_int3 -xen_pv_trap asm_exc_xennmi +xen_pv_trap asm_xenpv_exc_nmi xen_pv_trap asm_exc_overflow xen_pv_trap asm_exc_bounds xen_pv_trap asm_exc_invalid_op @@ -161,10 +160,22 @@ SYM_FUNC_END(xen_syscall32_target) /* 32-bit compat sysenter target */ SYM_FUNC_START(xen_sysenter_target) - mov 0*8(%rsp), %rcx - mov 1*8(%rsp), %r11 - mov 5*8(%rsp), %rsp - jmp entry_SYSENTER_compat + /* + * NB: Xen is polite and clears TF from EFLAGS for us. This means + * that we don't need to guard against single step exceptions here. + */ + popq %rcx + popq %r11 + + /* + * Neither Xen nor the kernel really knows what the old SS and + * CS were. The kernel expects __USER32_DS and __USER32_CS, so + * report those values even though Xen will guess its own values. + */ + movq $__USER32_DS, 4*8(%rsp) + movq $__USER32_CS, 1*8(%rsp) + + jmp entry_SYSENTER_compat_after_hwframe SYM_FUNC_END(xen_sysenter_target) #else /* !CONFIG_IA32_EMULATION */ |