diff options
Diffstat (limited to 'arch')
86 files changed, 3666 insertions, 1018 deletions
diff --git a/arch/powerpc/kernel/machine_kexec_file_64.c b/arch/powerpc/kernel/machine_kexec_file_64.c index 992c0d258e5d..e4395f937d63 100644 --- a/arch/powerpc/kernel/machine_kexec_file_64.c +++ b/arch/powerpc/kernel/machine_kexec_file_64.c @@ -91,11 +91,13 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) * and that value will be returned. If all free regions are visited without * func returning non-zero, then zero will be returned. */ -int arch_kexec_walk_mem(struct kexec_buf *kbuf, int (*func)(u64, u64, void *)) +int arch_kexec_walk_mem(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) { int ret = 0; u64 i; phys_addr_t mstart, mend; + struct resource res = { }; if (kbuf->top_down) { for_each_free_mem_range_reverse(i, NUMA_NO_NODE, 0, @@ -105,7 +107,9 @@ int arch_kexec_walk_mem(struct kexec_buf *kbuf, int (*func)(u64, u64, void *)) * range while in kexec, end points to the last byte * in the range. */ - ret = func(mstart, mend - 1, kbuf); + res.start = mstart; + res.end = mend - 1; + ret = func(&res, kbuf); if (ret) break; } @@ -117,7 +121,9 @@ int arch_kexec_walk_mem(struct kexec_buf *kbuf, int (*func)(u64, u64, void *)) * range while in kexec, end points to the last byte * in the range. */ - ret = func(mstart, mend - 1, kbuf); + res.start = mstart; + res.end = mend - 1; + ret = func(&res, kbuf); if (ret) break; } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9bceea6a5852..28fe465c7b7b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -171,7 +171,7 @@ config X86 select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION + select HAVE_RELIABLE_STACKTRACE if X86_64 && UNWINDER_FRAME_POINTER && STACK_VALIDATION select HAVE_STACK_VALIDATION if X86_64 select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNSTABLE_SCHED_CLOCK @@ -303,7 +303,6 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC config KASAN_SHADOW_OFFSET hex depends on KASAN - default 0xdff8000000000000 if X86_5LEVEL default 0xdffffc0000000000 config HAVE_INTEL_TXT @@ -1803,6 +1802,16 @@ config X86_SMAP If unsure, say Y. +config X86_INTEL_UMIP + def_bool n + depends on CPU_SUP_INTEL + prompt "Intel User Mode Instruction Prevention" if EXPERT + ---help--- + The User Mode Instruction Prevention (UMIP) is a security + feature in newer Intel processors. If enabled, a general + protection fault is issued if the instructions SGDT, SLDT, + SIDT, SMSW and STR are executed in user mode. + config X86_INTEL_MPX prompt "Intel MPX (Memory Protection Extensions)" def_bool n diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 90b123056f4b..6293a8768a91 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -359,28 +359,14 @@ config PUNIT_ATOM_DEBUG choice prompt "Choose kernel unwinder" - default FRAME_POINTER_UNWINDER + default UNWINDER_ORC if X86_64 + default UNWINDER_FRAME_POINTER if X86_32 ---help--- This determines which method will be used for unwinding kernel stack traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack, livepatch, lockdep, and more. -config FRAME_POINTER_UNWINDER - bool "Frame pointer unwinder" - select FRAME_POINTER - ---help--- - This option enables the frame pointer unwinder for unwinding kernel - stack traces. - - The unwinder itself is fast and it uses less RAM than the ORC - unwinder, but the kernel text size will grow by ~3% and the kernel's - overall performance will degrade by roughly 5-10%. - - This option is recommended if you want to use the livepatch - consistency model, as this is currently the only way to get a - reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). - -config ORC_UNWINDER +config UNWINDER_ORC bool "ORC unwinder" depends on X86_64 select STACK_VALIDATION @@ -396,7 +382,22 @@ config ORC_UNWINDER Enabling this option will increase the kernel's runtime memory usage by roughly 2-4MB, depending on your kernel config. -config GUESS_UNWINDER +config UNWINDER_FRAME_POINTER + bool "Frame pointer unwinder" + select FRAME_POINTER + ---help--- + This option enables the frame pointer unwinder for unwinding kernel + stack traces. + + The unwinder itself is fast and it uses less RAM than the ORC + unwinder, but the kernel text size will grow by ~3% and the kernel's + overall performance will degrade by roughly 5-10%. + + This option is recommended if you want to use the livepatch + consistency model, as this is currently the only way to get a + reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). + +config UNWINDER_GUESS bool "Guess unwinder" depends on EXPERT ---help--- @@ -411,7 +412,7 @@ config GUESS_UNWINDER endchoice config FRAME_POINTER - depends on !ORC_UNWINDER && !GUESS_UNWINDER + depends on !UNWINDER_ORC && !UNWINDER_GUESS bool endmenu diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore index e3cf9f682be5..09d25dd09307 100644 --- a/arch/x86/boot/.gitignore +++ b/arch/x86/boot/.gitignore @@ -7,3 +7,6 @@ zoffset.h setup setup.bin setup.elf +fdimage +mtools.conf +image.iso diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index d88a2fddba8c..9b5adae9cc40 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -123,63 +123,26 @@ image_cmdline = default linux $(FDARGS) $(if $(FDINITRD),initrd=initrd.img,) $(obj)/mtools.conf: $(src)/mtools.conf.in sed -e 's|@OBJ@|$(obj)|g' < $< > $@ +quiet_cmd_genimage = GENIMAGE $3 +cmd_genimage = sh $(srctree)/$(src)/genimage.sh $2 $3 $(obj)/bzImage \ + $(obj)/mtools.conf '$(image_cmdline)' $(FDINITRD) + # This requires write access to /dev/fd0 bzdisk: $(obj)/bzImage $(obj)/mtools.conf - MTOOLSRC=$(obj)/mtools.conf mformat a: ; sync - syslinux /dev/fd0 ; sync - echo '$(image_cmdline)' | \ - MTOOLSRC=$(src)/mtools.conf mcopy - a:syslinux.cfg - if [ -f '$(FDINITRD)' ] ; then \ - MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' a:initrd.img ; \ - fi - MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage a:linux ; sync + $(call cmd,genimage,bzdisk,/dev/fd0) # These require being root or having syslinux 2.02 or higher installed fdimage fdimage144: $(obj)/bzImage $(obj)/mtools.conf - dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440 - MTOOLSRC=$(obj)/mtools.conf mformat v: ; sync - syslinux $(obj)/fdimage ; sync - echo '$(image_cmdline)' | \ - MTOOLSRC=$(obj)/mtools.conf mcopy - v:syslinux.cfg - if [ -f '$(FDINITRD)' ] ; then \ - MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' v:initrd.img ; \ - fi - MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage v:linux ; sync + $(call cmd,genimage,fdimage144,$(obj)/fdimage) + @$(kecho) 'Kernel: $(obj)/fdimage is ready' fdimage288: $(obj)/bzImage $(obj)/mtools.conf - dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880 - MTOOLSRC=$(obj)/mtools.conf mformat w: ; sync - syslinux $(obj)/fdimage ; sync - echo '$(image_cmdline)' | \ - MTOOLSRC=$(obj)/mtools.conf mcopy - w:syslinux.cfg - if [ -f '$(FDINITRD)' ] ; then \ - MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' w:initrd.img ; \ - fi - MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage w:linux ; sync + $(call cmd,genimage,fdimage288,$(obj)/fdimage) + @$(kecho) 'Kernel: $(obj)/fdimage is ready' isoimage: $(obj)/bzImage - -rm -rf $(obj)/isoimage - mkdir $(obj)/isoimage - for i in lib lib64 share end ; do \ - if [ -f /usr/$$i/syslinux/isolinux.bin ] ; then \ - cp /usr/$$i/syslinux/isolinux.bin $(obj)/isoimage ; \ - if [ -f /usr/$$i/syslinux/ldlinux.c32 ]; then \ - cp /usr/$$i/syslinux/ldlinux.c32 $(obj)/isoimage ; \ - fi ; \ - break ; \ - fi ; \ - if [ $$i = end ] ; then exit 1 ; fi ; \ - done - cp $(obj)/bzImage $(obj)/isoimage/linux - echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg - if [ -f '$(FDINITRD)' ] ; then \ - cp '$(FDINITRD)' $(obj)/isoimage/initrd.img ; \ - fi - mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \ - -no-emul-boot -boot-load-size 4 -boot-info-table \ - $(obj)/isoimage - isohybrid $(obj)/image.iso 2>/dev/null || true - rm -rf $(obj)/isoimage + $(call cmd,genimage,isoimage,$(obj)/image.iso) + @$(kecho) 'Kernel: $(obj)/image.iso is ready' bzlilo: $(obj)/bzImage if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 4b7575b00563..61827c2282bf 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -78,6 +78,7 @@ vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o ifdef CONFIG_X86_64 vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o + vmlinux-objs-y += $(obj)/mem_encrypt.o endif $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index beb255b66447..20919b4f3133 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -131,6 +131,19 @@ ENTRY(startup_32) /* * Build early 4G boot pagetable */ + /* + * If SEV is active then set the encryption mask in the page tables. + * This will insure that when the kernel is copied and decompressed + * it will be done so encrypted. + */ + call get_sev_encryption_bit + xorl %edx, %edx + testl %eax, %eax + jz 1f + subl $32, %eax /* Encryption bit is always above bit 31 */ + bts %eax, %edx /* Set encryption mask for page tables */ +1: + /* Initialize Page tables to 0 */ leal pgtable(%ebx), %edi xorl %eax, %eax @@ -141,12 +154,14 @@ ENTRY(startup_32) leal pgtable + 0(%ebx), %edi leal 0x1007 (%edi), %eax movl %eax, 0(%edi) + addl %edx, 4(%edi) /* Build Level 3 */ leal pgtable + 0x1000(%ebx), %edi leal 0x1007(%edi), %eax movl $4, %ecx 1: movl %eax, 0x00(%edi) + addl %edx, 0x04(%edi) addl $0x00001000, %eax addl $8, %edi decl %ecx @@ -157,6 +172,7 @@ ENTRY(startup_32) movl $0x00000183, %eax movl $2048, %ecx 1: movl %eax, 0(%edi) + addl %edx, 4(%edi) addl $0x00200000, %eax addl $8, %edi decl %ecx diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S new file mode 100644 index 000000000000..54f5f6625a73 --- /dev/null +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -0,0 +1,120 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2017 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> + +#include <asm/processor-flags.h> +#include <asm/msr.h> +#include <asm/asm-offsets.h> + + .text + .code32 +ENTRY(get_sev_encryption_bit) + xor %eax, %eax + +#ifdef CONFIG_AMD_MEM_ENCRYPT + push %ebx + push %ecx + push %edx + push %edi + + /* + * RIP-relative addressing is needed to access the encryption bit + * variable. Since we are running in 32-bit mode we need this call/pop + * sequence to get the proper relative addressing. + */ + call 1f +1: popl %edi + subl $1b, %edi + + movl enc_bit(%edi), %eax + cmpl $0, %eax + jge .Lsev_exit + + /* Check if running under a hypervisor */ + movl $1, %eax + cpuid + bt $31, %ecx /* Check the hypervisor bit */ + jnc .Lno_sev + + movl $0x80000000, %eax /* CPUID to check the highest leaf */ + cpuid + cmpl $0x8000001f, %eax /* See if 0x8000001f is available */ + jb .Lno_sev + + /* + * Check for the SEV feature: + * CPUID Fn8000_001F[EAX] - Bit 1 + * CPUID Fn8000_001F[EBX] - Bits 5:0 + * Pagetable bit position used to indicate encryption + */ + movl $0x8000001f, %eax + cpuid + bt $1, %eax /* Check if SEV is available */ + jnc .Lno_sev + + movl $MSR_AMD64_SEV, %ecx /* Read the SEV MSR */ + rdmsr + bt $MSR_AMD64_SEV_ENABLED_BIT, %eax /* Check if SEV is active */ + jnc .Lno_sev + + movl %ebx, %eax + andl $0x3f, %eax /* Return the encryption bit location */ + movl %eax, enc_bit(%edi) + jmp .Lsev_exit + +.Lno_sev: + xor %eax, %eax + movl %eax, enc_bit(%edi) + +.Lsev_exit: + pop %edi + pop %edx + pop %ecx + pop %ebx + +#endif /* CONFIG_AMD_MEM_ENCRYPT */ + + ret +ENDPROC(get_sev_encryption_bit) + + .code64 +ENTRY(get_sev_encryption_mask) + xor %rax, %rax + +#ifdef CONFIG_AMD_MEM_ENCRYPT + push %rbp + push %rdx + + movq %rsp, %rbp /* Save current stack pointer */ + + call get_sev_encryption_bit /* Get the encryption bit position */ + testl %eax, %eax + jz .Lno_sev_mask + + xor %rdx, %rdx + bts %rax, %rdx /* Create the encryption mask */ + mov %rdx, %rax /* ... and return it */ + +.Lno_sev_mask: + movq %rbp, %rsp /* Restore original stack pointer */ + + pop %rdx + pop %rbp +#endif + + ret +ENDPROC(get_sev_encryption_mask) + + .data +enc_bit: + .int 0xffffffff diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 32d4ec2e0243..9d323dc6b159 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -109,4 +109,6 @@ static inline void console_init(void) { } #endif +unsigned long get_sev_encryption_mask(void); + #endif diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c index 972319ff5b01..d5364ca2e3f9 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/pagetable.c @@ -77,16 +77,18 @@ static unsigned long top_level_pgt; * Mapping information structure passed to kernel_ident_mapping_init(). * Due to relocation, pointers must be assigned at run time not build time. */ -static struct x86_mapping_info mapping_info = { - .page_flag = __PAGE_KERNEL_LARGE_EXEC, -}; +static struct x86_mapping_info mapping_info; /* Locates and clears a region for a new top level page table. */ void initialize_identity_maps(void) { + unsigned long sev_me_mask = get_sev_encryption_mask(); + /* Init mapping_info with run-time function/buffer pointers. */ mapping_info.alloc_pgt_page = alloc_pgt_page; mapping_info.context = &pgt_data; + mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sev_me_mask; + mapping_info.kernpg_flag = _KERNPG_TABLE | sev_me_mask; /* * It should be impossible for this not to already be true, diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh new file mode 100644 index 000000000000..49f4970f693b --- /dev/null +++ b/arch/x86/boot/genimage.sh @@ -0,0 +1,124 @@ +#!/bin/sh +# +# This file is subject to the terms and conditions of the GNU General Public +# License. See the file "COPYING" in the main directory of this archive +# for more details. +# +# Copyright (C) 2017 by Changbin Du <changbin.du@intel.com> +# +# Adapted from code in arch/x86/boot/Makefile by H. Peter Anvin and others +# +# "make fdimage/fdimage144/fdimage288/isoimage" script for x86 architecture +# +# Arguments: +# $1 - fdimage format +# $2 - target image file +# $3 - kernel bzImage file +# $4 - mtool configuration file +# $5 - kernel cmdline +# $6 - inird image file +# + +# Use "make V=1" to debug this script +case "${KBUILD_VERBOSE}" in +*1*) + set -x + ;; +esac + +verify () { + if [ ! -f "$1" ]; then + echo "" 1>&2 + echo " *** Missing file: $1" 1>&2 + echo "" 1>&2 + exit 1 + fi +} + + +export MTOOLSRC=$4 +FIMAGE=$2 +FBZIMAGE=$3 +KCMDLINE=$5 +FDINITRD=$6 + +# Make sure the files actually exist +verify "$FBZIMAGE" +verify "$MTOOLSRC" + +genbzdisk() { + mformat a: + syslinux $FIMAGE + echo "$KCMDLINE" | mcopy - a:syslinux.cfg + if [ -f "$FDINITRD" ] ; then + mcopy "$FDINITRD" a:initrd.img + fi + mcopy $FBZIMAGE a:linux +} + +genfdimage144() { + dd if=/dev/zero of=$FIMAGE bs=1024 count=1440 2> /dev/null + mformat v: + syslinux $FIMAGE + echo "$KCMDLINE" | mcopy - v:syslinux.cfg + if [ -f "$FDINITRD" ] ; then + mcopy "$FDINITRD" v:initrd.img + fi + mcopy $FBZIMAGE v:linux +} + +genfdimage288() { + dd if=/dev/zero of=$FIMAGE bs=1024 count=2880 2> /dev/null + mformat w: + syslinux $FIMAGE + echo "$KCMDLINE" | mcopy - W:syslinux.cfg + if [ -f "$FDINITRD" ] ; then + mcopy "$FDINITRD" w:initrd.img + fi + mcopy $FBZIMAGE w:linux +} + +genisoimage() { + tmp_dir=`dirname $FIMAGE`/isoimage + rm -rf $tmp_dir + mkdir $tmp_dir + for i in lib lib64 share end ; do + for j in syslinux ISOLINUX ; do + if [ -f /usr/$i/$j/isolinux.bin ] ; then + isolinux=/usr/$i/$j/isolinux.bin + cp $isolinux $tmp_dir + fi + done + for j in syslinux syslinux/modules/bios ; do + if [ -f /usr/$i/$j/ldlinux.c32 ]; then + ldlinux=/usr/$i/$j/ldlinux.c32 + cp $ldlinux $tmp_dir + fi + done + if [ -n "$isolinux" -a -n "$ldlinux" ] ; then + break + fi + if [ $i = end -a -z "$isolinux" ] ; then + echo 'Need an isolinux.bin file, please install syslinux/isolinux.' + exit 1 + fi + done + cp $FBZIMAGE $tmp_dir/linux + echo "$KCMDLINE" > $tmp_dir/isolinux.cfg + if [ -f "$FDINITRD" ] ; then + cp "$FDINITRD" $tmp_dir/initrd.img + fi + mkisofs -J -r -input-charset=utf-8 -quiet -o $FIMAGE -b isolinux.bin \ + -c boot.cat -no-emul-boot -boot-load-size 4 -boot-info-table \ + $tmp_dir + isohybrid $FIMAGE 2>/dev/null || true + rm -rf $tmp_dir +} + +case $1 in + bzdisk) genbzdisk;; + fdimage144) genfdimage144;; + fdimage288) genfdimage288;; + isoimage) genisoimage;; + *) echo 'Unknown image format'; exit 1; +esac diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config index 550cd5012b73..66c9e2aab16c 100644 --- a/arch/x86/configs/tiny.config +++ b/arch/x86/configs/tiny.config @@ -1,5 +1,5 @@ CONFIG_NOHIGHMEM=y # CONFIG_HIGHMEM4G is not set # CONFIG_HIGHMEM64G is not set -CONFIG_GUESS_UNWINDER=y -# CONFIG_FRAME_POINTER_UNWINDER is not set +CONFIG_UNWINDER_GUESS=y +# CONFIG_UNWINDER_FRAME_POINTER is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 4a4b16e56d35..e32fc1f274d8 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -299,6 +299,7 @@ CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_BOOT_PARAMS=y CONFIG_OPTIMIZE_INLINING=y +CONFIG_UNWINDER_ORC=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 6e160031cfea..3fd8bc560fae 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -142,56 +142,25 @@ For 32-bit we have the following conventions - kernel is built with UNWIND_HINT_REGS offset=\offset .endm - .macro RESTORE_EXTRA_REGS offset=0 - movq 0*8+\offset(%rsp), %r15 - movq 1*8+\offset(%rsp), %r14 - movq 2*8+\offset(%rsp), %r13 - movq 3*8+\offset(%rsp), %r12 - movq 4*8+\offset(%rsp), %rbp - movq 5*8+\offset(%rsp), %rbx - UNWIND_HINT_REGS offset=\offset extra=0 - .endm - - .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 - .if \rstor_r11 - movq 6*8(%rsp), %r11 - .endif - .if \rstor_r8910 - movq 7*8(%rsp), %r10 - movq 8*8(%rsp), %r9 - movq 9*8(%rsp), %r8 - .endif - .if \rstor_rax - movq 10*8(%rsp), %rax - .endif - .if \rstor_rcx - movq 11*8(%rsp), %rcx - .endif - .if \rstor_rdx - movq 12*8(%rsp), %rdx - .endif - movq 13*8(%rsp), %rsi - movq 14*8(%rsp), %rdi - UNWIND_HINT_IRET_REGS offset=16*8 - .endm - .macro RESTORE_C_REGS - RESTORE_C_REGS_HELPER 1,1,1,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_RAX - RESTORE_C_REGS_HELPER 0,1,1,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_RCX - RESTORE_C_REGS_HELPER 1,0,1,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_R11 - RESTORE_C_REGS_HELPER 1,1,0,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_RCX_R11 - RESTORE_C_REGS_HELPER 1,0,0,1,1 - .endm - - .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 - subq $-(15*8+\addskip), %rsp + .macro POP_EXTRA_REGS + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + .endm + + .macro POP_C_REGS + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rcx + popq %rdx + popq %rsi + popq %rdi .endm .macro icebp diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index bcfc5668dcb2..a2b30ec69497 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -221,10 +221,9 @@ entry_SYSCALL_64_fastpath: TRACE_IRQS_ON /* user mode is traced as IRQs on */ movq RIP(%rsp), %rcx movq EFLAGS(%rsp), %r11 - RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RSP(%rsp), %rsp + addq $6*8, %rsp /* skip extra regs -- they were preserved */ UNWIND_HINT_EMPTY - USERGS_SYSRET64 + jmp .Lpop_c_regs_except_rcx_r11_and_sysret 1: /* @@ -246,17 +245,18 @@ entry_SYSCALL64_slow_path: call do_syscall_64 /* returns with IRQs disabled */ return_from_SYSCALL_64: - RESTORE_EXTRA_REGS TRACE_IRQS_IRETQ /* we're about to change IF */ /* * Try to use SYSRET instead of IRET if we're returning to - * a completely clean 64-bit userspace context. + * a completely clean 64-bit userspace context. If we're not, + * go to the slow exit path. */ movq RCX(%rsp), %rcx movq RIP(%rsp), %r11 - cmpq %rcx, %r11 /* RCX == RIP */ - jne opportunistic_sysret_failed + + cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */ + jne swapgs_restore_regs_and_return_to_usermode /* * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP @@ -274,14 +274,14 @@ return_from_SYSCALL_64: /* If this changed %rcx, it was not canonical */ cmpq %rcx, %r11 - jne opportunistic_sysret_failed + jne swapgs_restore_regs_and_return_to_usermode cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */ - jne opportunistic_sysret_failed + jne swapgs_restore_regs_and_return_to_usermode movq R11(%rsp), %r11 cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */ - jne opportunistic_sysret_failed + jne swapgs_restore_regs_and_return_to_usermode /* * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot @@ -302,12 +302,12 @@ return_from_SYSCALL_64: * would never get past 'stuck_here'. */ testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 - jnz opportunistic_sysret_failed + jnz swapgs_restore_regs_and_return_to_usermode /* nothing to check for RSP */ cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */ - jne opportunistic_sysret_failed + jne swapgs_restore_regs_and_return_to_usermode /* * We win! This label is here just for ease of understanding @@ -315,14 +315,20 @@ return_from_SYSCALL_64: */ syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ - RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RSP(%rsp), %rsp UNWIND_HINT_EMPTY + POP_EXTRA_REGS +.Lpop_c_regs_except_rcx_r11_and_sysret: + popq %rsi /* skip r11 */ + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rsi /* skip rcx */ + popq %rdx + popq %rsi + popq %rdi + movq RSP-ORIG_RAX(%rsp), %rsp USERGS_SYSRET64 - -opportunistic_sysret_failed: - SWAPGS - jmp restore_c_regs_and_iret END(entry_SYSCALL_64) ENTRY(stub_ptregs_64) @@ -423,8 +429,7 @@ ENTRY(ret_from_fork) movq %rsp, %rdi call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ - SWAPGS - jmp restore_regs_and_iret + jmp swapgs_restore_regs_and_return_to_usermode 1: /* kernel thread */ @@ -612,8 +617,21 @@ GLOBAL(retint_user) mov %rsp,%rdi call prepare_exit_to_usermode TRACE_IRQS_IRETQ + +GLOBAL(swapgs_restore_regs_and_return_to_usermode) +#ifdef CONFIG_DEBUG_ENTRY + /* Assert that pt_regs indicates user mode. */ + testb $3, CS(%rsp) + jnz 1f + ud2 +1: +#endif SWAPGS - jmp restore_regs_and_iret + POP_EXTRA_REGS + POP_C_REGS + addq $8, %rsp /* skip regs->orig_ax */ + INTERRUPT_RETURN + /* Returning to kernel space */ retint_kernel: @@ -633,15 +651,17 @@ retint_kernel: */ TRACE_IRQS_IRETQ -/* - * At this label, code paths which return to kernel and to user, - * which come from interrupts/exception and from syscalls, merge. - */ -GLOBAL(restore_regs_and_iret) - RESTORE_EXTRA_REGS -restore_c_regs_and_iret: - RESTORE_C_REGS - REMOVE_PT_GPREGS_FROM_STACK 8 +GLOBAL(restore_regs_and_return_to_kernel) +#ifdef CONFIG_DEBUG_ENTRY + /* Assert that pt_regs indicates kernel mode. */ + testb $3, CS(%rsp) + jz 1f + ud2 +1: +#endif + POP_EXTRA_REGS + POP_C_REGS + addq $8, %rsp /* skip regs->orig_ax */ INTERRUPT_RETURN ENTRY(native_iret) @@ -818,7 +838,7 @@ ENTRY(\sym) ASM_CLAC - .ifeq \has_error_code + .if \has_error_code == 0 pushq $-1 /* ORIG_RAX: no syscall to restart */ .endif @@ -1059,6 +1079,7 @@ idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK idtentry stack_segment do_stack_segment has_error_code=1 #ifdef CONFIG_XEN +idtentry xennmi do_nmi has_error_code=0 idtentry xendebug do_debug has_error_code=0 idtentry xenint3 do_int3 has_error_code=0 #endif @@ -1112,17 +1133,14 @@ ENTRY(paranoid_exit) DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF_DEBUG testl %ebx, %ebx /* swapgs needed? */ - jnz paranoid_exit_no_swapgs + jnz .Lparanoid_exit_no_swapgs TRACE_IRQS_IRETQ SWAPGS_UNSAFE_STACK - jmp paranoid_exit_restore -paranoid_exit_no_swapgs: + jmp .Lparanoid_exit_restore +.Lparanoid_exit_no_swapgs: TRACE_IRQS_IRETQ_DEBUG -paranoid_exit_restore: - RESTORE_EXTRA_REGS - RESTORE_C_REGS - REMOVE_PT_GPREGS_FROM_STACK 8 - INTERRUPT_RETURN +.Lparanoid_exit_restore: + jmp restore_regs_and_return_to_kernel END(paranoid_exit) /* @@ -1223,10 +1241,13 @@ ENTRY(error_exit) jmp retint_user END(error_exit) -/* Runs on exception stack */ -/* XXX: broken on Xen PV */ +/* + * Runs on exception stack. Xen PV does not go through this path at all, + * so we can use real assembly here. + */ ENTRY(nmi) UNWIND_HINT_IRET_REGS + /* * We allow breakpoints in NMIs. If a breakpoint occurs, then * the iretq it performs will take us out of NMI context. @@ -1284,7 +1305,7 @@ ENTRY(nmi) * stacks lest we corrupt the "NMI executing" variable. */ - SWAPGS_UNSAFE_STACK + swapgs cld movq %rsp, %rdx movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp @@ -1328,8 +1349,7 @@ ENTRY(nmi) * Return back to user mode. We must *not* do the normal exit * work, because we don't want to enable interrupts. */ - SWAPGS - jmp restore_regs_and_iret + jmp swapgs_restore_regs_and_return_to_usermode .Lnmi_from_kernel: /* @@ -1450,7 +1470,7 @@ nested_nmi_out: popq %rdx /* We are returning to kernel mode, so this cannot result in a fault. */ - INTERRUPT_RETURN + iretq first_nmi: /* Restore rdx. */ @@ -1481,7 +1501,7 @@ first_nmi: pushfq /* RFLAGS */ pushq $__KERNEL_CS /* CS */ pushq $1f /* RIP */ - INTERRUPT_RETURN /* continues at repeat_nmi below */ + iretq /* continues at repeat_nmi below */ UNWIND_HINT_IRET_REGS 1: #endif @@ -1544,29 +1564,34 @@ end_repeat_nmi: nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: - RESTORE_EXTRA_REGS - RESTORE_C_REGS + POP_EXTRA_REGS + POP_C_REGS - /* Point RSP at the "iret" frame. */ - REMOVE_PT_GPREGS_FROM_STACK 6*8 + /* + * Skip orig_ax and the "outermost" frame to point RSP at the "iret" + * at the "iret" frame. + */ + addq $6*8, %rsp /* * Clear "NMI executing". Set DF first so that we can easily * distinguish the remaining code between here and IRET from - * the SYSCALL entry and exit paths. On a native kernel, we - * could just inspect RIP, but, on paravirt kernels, - * INTERRUPT_RETURN can translate into a jump into a - * hypercall page. + * the SYSCALL entry and exit paths. + * + * We arguably should just inspect RIP instead, but I (Andy) wrote + * this code when I had the misapprehension that Xen PV supported + * NMIs, and Xen PV would break that approach. */ std movq $0, 5*8(%rsp) /* clear "NMI executing" */ /* - * INTERRUPT_RETURN reads the "iret" frame and exits the NMI - * stack in a single instruction. We are returning to kernel - * mode, so this cannot result in a fault. + * iretq reads the "iret" frame and exits the NMI stack in a + * single instruction. We are returning to kernel mode, so this + * cannot result in a fault. Similarly, we don't need to worry + * about espfix64 on the way back to kernel mode. */ - INTERRUPT_RETURN + iretq END(nmi) ENTRY(ignore_sysret) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index b5c7a56ed256..568e130d932c 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -337,8 +337,7 @@ ENTRY(entry_INT80_compat) /* Go back to user mode. */ TRACE_IRQS_ON - SWAPGS - jmp restore_regs_and_iret + jmp swapgs_restore_regs_and_return_to_usermode END(entry_INT80_compat) ENTRY(stub32_clone) diff --git a/arch/x86/entry/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile index 331f1dca5085..6fb9b57ed5ba 100644 --- a/arch/x86/entry/syscalls/Makefile +++ b/arch/x86/entry/syscalls/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -out := $(obj)/../../include/generated/asm -uapi := $(obj)/../../include/generated/uapi/asm +out := arch/$(SRCARCH)/include/generated/asm +uapi := arch/$(SRCARCH)/include/generated/uapi/asm # Create output directory if not already present _dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \ diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 1911310959f8..d63053142b16 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -114,10 +114,11 @@ static int vvar_fault(const struct vm_special_mapping *sm, struct pvclock_vsyscall_time_info *pvti = pvclock_pvti_cpu0_va(); if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { - ret = vm_insert_pfn( + ret = vm_insert_pfn_prot( vma, vmf->address, - __pa(pvti) >> PAGE_SHIFT); + __pa(pvti) >> PAGE_SHIFT, + pgprot_decrypted(vma->vm_page_prot)); } } else if (sym_offset == image->sym_hvclock_page) { struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page(); diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h index 5b0579abb398..3ac991d81e74 100644 --- a/arch/x86/include/asm/archrandom.h +++ b/arch/x86/include/asm/archrandom.h @@ -45,7 +45,7 @@ static inline bool rdrand_long(unsigned long *v) bool ok; unsigned int retry = RDRAND_RETRY_LOOPS; do { - asm volatile(RDRAND_LONG "\n\t" + asm volatile(RDRAND_LONG CC_SET(c) : CC_OUT(c) (ok), "=a" (*v)); if (ok) @@ -59,7 +59,7 @@ static inline bool rdrand_int(unsigned int *v) bool ok; unsigned int retry = RDRAND_RETRY_LOOPS; do { - asm volatile(RDRAND_INT "\n\t" + asm volatile(RDRAND_INT CC_SET(c) : CC_OUT(c) (ok), "=a" (*v)); if (ok) @@ -71,7 +71,7 @@ static inline bool rdrand_int(unsigned int *v) static inline bool rdseed_long(unsigned long *v) { bool ok; - asm volatile(RDSEED_LONG "\n\t" + asm volatile(RDSEED_LONG CC_SET(c) : CC_OUT(c) (ok), "=a" (*v)); return ok; @@ -80,7 +80,7 @@ static inline bool rdseed_long(unsigned long *v) static inline bool rdseed_int(unsigned int *v) { bool ok; - asm volatile(RDSEED_INT "\n\t" + asm volatile(RDSEED_INT CC_SET(c) : CC_OUT(c) (ok), "=a" (*v)); return ok; diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 2bcf47314959..3fa039855b8f 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -143,7 +143,7 @@ static __always_inline void __clear_bit(long nr, volatile unsigned long *addr) static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr) { bool negative; - asm volatile(LOCK_PREFIX "andb %2,%1\n\t" + asm volatile(LOCK_PREFIX "andb %2,%1" CC_SET(s) : CC_OUT(s) (negative), ADDR : "ir" ((char) ~(1 << nr)) : "memory"); @@ -246,7 +246,7 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long * { bool oldbit; - asm("bts %2,%1\n\t" + asm("bts %2,%1" CC_SET(c) : CC_OUT(c) (oldbit), ADDR : "Ir" (nr)); @@ -286,7 +286,7 @@ static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long { bool oldbit; - asm volatile("btr %2,%1\n\t" + asm volatile("btr %2,%1" CC_SET(c) : CC_OUT(c) (oldbit), ADDR : "Ir" (nr)); @@ -298,7 +298,7 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon { bool oldbit; - asm volatile("btc %2,%1\n\t" + asm volatile("btc %2,%1" CC_SET(c) : CC_OUT(c) (oldbit), ADDR : "Ir" (nr) : "memory"); @@ -329,7 +329,7 @@ static __always_inline bool variable_test_bit(long nr, volatile const unsigned l { bool oldbit; - asm volatile("bt %2,%1\n\t" + asm volatile("bt %2,%1" CC_SET(c) : CC_OUT(c) (oldbit) : "m" (*(unsigned long *)addr), "Ir" (nr)); diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 9eef9cc64c68..a600a6cda9ec 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -7,6 +7,7 @@ */ #include <linux/types.h> #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <asm/processor.h> #include <asm/user32.h> #include <asm/unistd.h> diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 0dfa68438e80..bf6a76202a77 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -126,11 +126,10 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) #define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) -#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability)) -#define setup_clear_cpu_cap(bit) do { \ - clear_cpu_cap(&boot_cpu_data, bit); \ - set_bit(bit, (unsigned long *)cpu_caps_cleared); \ -} while (0) + +extern void setup_clear_cpu_cap(unsigned int bit); +extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); + #define setup_force_cpu_cap(bit) do { \ set_cpu_cap(&boot_cpu_data, bit); \ set_bit(bit, (unsigned long *)cpu_caps_set); \ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 793690fbda36..c0b0e9e8aa66 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -13,173 +13,176 @@ /* * Defines x86 CPU feature bits */ -#define NCAPINTS 18 /* N 32-bit words worth of info */ -#define NBUGINTS 1 /* N 32-bit bug flags */ +#define NCAPINTS 18 /* N 32-bit words worth of info */ +#define NBUGINTS 1 /* N 32-bit bug flags */ /* * Note: If the comment begins with a quoted string, that string is used * in /proc/cpuinfo instead of the macro name. If the string is "", * this feature bit is not displayed in /proc/cpuinfo at all. + * + * When adding new features here that depend on other features, + * please update the table in kernel/cpu/cpuid-deps.c as well. */ -/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ -#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ -#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ -#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ -#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ -#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ -#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ -#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ -#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ -#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ -#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ -#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ -#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ -#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ -#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ -#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ - /* (plus FCMOVcc, FCOMI with FPU) */ -#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ -#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ -#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ -#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ -#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ -#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ -#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ -#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ -#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ -#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ -#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ -#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ -#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ -#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ -#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ +/* Intel-defined CPU features, CPUID level 0x00000001 (EDX), word 0 */ +#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ +#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ +#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ +#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ +#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ +#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ +#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ +#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ +#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ +#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ +#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ +#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ +#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ +#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ +#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */ +#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ +#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ +#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ +#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ +#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ +#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ +#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ +#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ +#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ +#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ +#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ +#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ +#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ +#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ +#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ /* Don't duplicate feature flags which are redundant with Intel! */ -#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ -#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */ -#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ -#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ -#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ -#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ -#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ -#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ -#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ -#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! */ +#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ +#define X86_FEATURE_MP ( 1*32+19) /* MP Capable */ +#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ +#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ +#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ +#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ +#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ +#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64, 64-bit support) */ +#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow extensions */ +#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow */ /* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ -#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ -#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ -#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ +#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ +#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ +#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ /* Other features, Linux-defined mapping, word 3 */ /* This range is used for feature bits which conflict or are synthesized */ -#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ -#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ -#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ -#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ -/* cpu types for specific tunings: */ -#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ -#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ -#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ -#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ -#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ -#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ -#define X86_FEATURE_ART ( 3*32+10) /* Platform has always running timer (ART) */ -#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ -#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ -#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ -#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ -#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */ -#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ -#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ -#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ -#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ -#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ -#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ -#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ -#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ -#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ -#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */ -#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ -#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ -#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ -#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ -#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ +#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ +#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ +#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ +#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ + +/* CPU types for specific tunings: */ +#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ +#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ +#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ +#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ +#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ +#define X86_FEATURE_UP ( 3*32+ 9) /* SMP kernel running on UP */ +#define X86_FEATURE_ART ( 3*32+10) /* Always running timer (ART) */ +#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ +#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ +#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ +#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ +#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ +#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ +#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" MFENCE synchronizes RDTSC */ +#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */ +#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ +#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ +#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ +#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* CPU topology enum extensions */ +#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ +#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ +#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */ +#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* Extended APICID (8 bits) */ +#define X86_FEATURE_AMD_DCM ( 3*32+27) /* AMD multi-node processor */ +#define X86_FEATURE_APERFMPERF ( 3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ +#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ +#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ -/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ -#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ -#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ -#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ -#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ -#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */ -#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ -#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ -#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ -#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ -#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ -#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ -#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ -#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ -#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ -#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ -#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ -#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ -#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ -#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ -#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ -#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ -#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ -#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ -#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */ -#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ -#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ -#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ -#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ -#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */ -#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ -#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ +/* Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 */ +#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ +#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ +#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ +#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" MONITOR/MWAIT support */ +#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL-qualified (filtered) Debug Store */ +#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ +#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer Mode eXtensions */ +#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ +#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ +#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ +#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ +#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ +#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ +#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B instruction */ +#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ +#define X86_FEATURE_PDCM ( 4*32+15) /* Perf/Debug Capabilities MSR */ +#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ +#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ +#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ +#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ +#define X86_FEATURE_X2APIC ( 4*32+21) /* X2APIC */ +#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ +#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ +#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* TSC deadline timer */ +#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ +#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV instructions */ +#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE instruction enabled in the OS */ +#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ +#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit FP conversions */ +#define X86_FEATURE_RDRAND ( 4*32+30) /* RDRAND instruction */ +#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ -#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ -#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ -#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ -#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ -#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ -#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ -#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ -#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ -#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ -#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ +#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ +#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ +#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ +#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ +#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ +#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ +#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ +#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ +#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ +#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ -/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ -#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ -#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ -#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ -#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ -#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ -#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ -#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ -#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ -#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ -#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ -#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ -#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ -#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ -#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ -#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ -#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ -#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */ -#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ -#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ -#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ -#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ -#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ -#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ -#define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */ -#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */ -#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ +/* More extended AMD flags: CPUID level 0x80000001, ECX, word 6 */ +#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ +#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ +#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure Virtual Machine */ +#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ +#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ +#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ +#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ +#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ +#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ +#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ +#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ +#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ +#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ +#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ +#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ +#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ +#define X86_FEATURE_TCE ( 6*32+17) /* Translation Cache Extension */ +#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ +#define X86_FEATURE_TBM ( 6*32+21) /* Trailing Bit Manipulations */ +#define X86_FEATURE_TOPOEXT ( 6*32+22) /* Topology extensions CPUID leafs */ +#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* Core performance counter extensions */ +#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ +#define X86_FEATURE_BPEXT ( 6*32+26) /* Data breakpoint extension */ +#define X86_FEATURE_PTSC ( 6*32+27) /* Performance time-stamp counter */ +#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */ +#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX instructions) */ /* * Auxiliary flags: Linux defined - For features scattered in various @@ -187,146 +190,154 @@ * * Reuse free bits when adding new feature flags! */ -#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */ -#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */ -#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ -#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ -#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ -#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ -#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ +#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT instructions */ +#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */ +#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ +#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ +#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ +#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ +#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ -#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ -#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ -#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ +#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ +#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ +#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ -#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ -#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ -#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ -#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ +#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ +#define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */ +#define X86_FEATURE_AVX512_4FMAPS ( 7*32+17) /* AVX-512 Multiply Accumulation Single precision */ -#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ +#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ /* Virtualization flags: Linux defined, word 8 */ -#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ -#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ -#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ -#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ -#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ +#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ +#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ +#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ +#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ +#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ -#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ -#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ +#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */ +#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ -/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ -#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ -#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ -#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ -#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ -#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ -#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ -#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ -#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ -#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ -#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ -#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ -#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ -#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ -#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ -#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ -#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ -#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ -#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ -#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ -#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ -#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ -#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ -#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ -#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ -#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ -#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ -#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ +/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ +#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ +#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3B */ +#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ +#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ +#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ +#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ +#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ +#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB instructions */ +#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ +#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ +#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ +#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ +#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ +#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ +#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ +#define X86_FEATURE_RDSEED ( 9*32+18) /* RDSEED instruction */ +#define X86_FEATURE_ADX ( 9*32+19) /* ADCX and ADOX instructions */ +#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ +#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ +#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ +#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ +#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ +#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ +#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ +#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ +#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ +#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ -/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ -#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ -#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ -#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ -#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ +/* Extended state features, CPUID level 0x0000000d:1 (EAX), word 10 */ +#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT instruction */ +#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */ +#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */ +#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */ -/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */ -#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (EDX), word 11 */ +#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ -/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ -#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ -#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */ -#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */ +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (EDX), word 12 */ +#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring */ +#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */ +#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */ -/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ -#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ -#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */ +/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ +#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ +#define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ -/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ -#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ -#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ -#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ -#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ -#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ -#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ -#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ -#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ -#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */ -#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ +/* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ +#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ +#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ +#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ +#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ +#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ +#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ +#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ +#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ +#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */ +#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ -/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */ -#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ -#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ -#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ -#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ -#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ -#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ -#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ -#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ -#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ -#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ -#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ -#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ -#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ +/* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */ +#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ +#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ +#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ +#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ +#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ +#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ +#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ +#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ +#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ +#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ +#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ +#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ +#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ -/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ -#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ -#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ -#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ -#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ -#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ -#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ +#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ +#define X86_FEATURE_UMIP (16*32+ 2) /* User Mode Instruction Protection */ +#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ +#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ +#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ +#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ +#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ +#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */ +#define X86_FEATURE_AVX512_VNNI (16*32+11) /* Vector Neural Network Instructions */ +#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */ +#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ +#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ +#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ -/* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ -#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */ -#define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */ -#define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */ +/* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */ +#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */ +#define X86_FEATURE_SUCCOR (17*32+ 1) /* Uncorrectable error containment and recovery */ +#define X86_FEATURE_SMCA (17*32+ 3) /* Scalable MCA */ /* * BUG word(s) */ -#define X86_BUG(x) (NCAPINTS*32 + (x)) +#define X86_BUG(x) (NCAPINTS*32 + (x)) -#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ -#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ -#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ -#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ -#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ -#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ -#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ -#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ -#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ +#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ +#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ +#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ +#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ +#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ +#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ +#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ +#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ +#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ #ifdef CONFIG_X86_32 /* * 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional * to avoid confusion. */ -#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */ +#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */ #endif -#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */ -#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ -#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ -#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ +#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */ +#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ +#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ +#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ + #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index c10c9128f54e..14d6d5007314 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -16,6 +16,12 @@ # define DISABLE_MPX (1<<(X86_FEATURE_MPX & 31)) #endif +#ifdef CONFIG_X86_INTEL_UMIP +# define DISABLE_UMIP 0 +#else +# define DISABLE_UMIP (1<<(X86_FEATURE_UMIP & 31)) +#endif + #ifdef CONFIG_X86_64 # define DISABLE_VME (1<<(X86_FEATURE_VME & 31)) # define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) @@ -63,7 +69,7 @@ #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 #define DISABLED_MASK15 0 -#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57) +#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP) #define DISABLED_MASK17 0 #define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18) diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h index 02aff0867211..1c78580e58be 100644 --- a/arch/x86/include/asm/inat.h +++ b/arch/x86/include/asm/inat.h @@ -97,6 +97,16 @@ #define INAT_MAKE_GROUP(grp) ((grp << INAT_GRP_OFFS) | INAT_MODRM) #define INAT_MAKE_IMM(imm) (imm << INAT_IMM_OFFS) +/* Identifiers for segment registers */ +#define INAT_SEG_REG_IGNORE 0 +#define INAT_SEG_REG_DEFAULT 1 +#define INAT_SEG_REG_CS 2 +#define INAT_SEG_REG_SS 3 +#define INAT_SEG_REG_DS 4 +#define INAT_SEG_REG_ES 5 +#define INAT_SEG_REG_FS 6 +#define INAT_SEG_REG_GS 7 + /* Attribute search APIs */ extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode); extern int inat_get_last_prefix_id(insn_byte_t last_pfx); diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h new file mode 100644 index 000000000000..e1d3b4ce8a92 --- /dev/null +++ b/arch/x86/include/asm/insn-eval.h @@ -0,0 +1,23 @@ +#ifndef _ASM_X86_INSN_EVAL_H +#define _ASM_X86_INSN_EVAL_H +/* + * A collection of utility functions for x86 instruction analysis to be + * used in a kernel context. Useful when, for instance, making sense + * of the registers indicated by operands. + */ + +#include <linux/compiler.h> +#include <linux/bug.h> +#include <linux/err.h> +#include <asm/ptrace.h> + +#define INSN_CODE_SEG_ADDR_SZ(params) ((params >> 4) & 0xf) +#define INSN_CODE_SEG_OPND_SZ(params) (params & 0xf) +#define INSN_CODE_SEG_PARAMS(oper_sz, addr_sz) (oper_sz | (addr_sz << 4)) + +void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs); +int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs); +unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx); +char insn_get_code_seg_params(struct pt_regs *regs); + +#endif /* _ASM_X86_INSN_EVAL_H */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 11398d55aefa..93ae8aee1780 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -266,6 +266,21 @@ static inline void slow_down_io(void) #endif +#ifdef CONFIG_AMD_MEM_ENCRYPT +#include <linux/jump_label.h> + +extern struct static_key_false sev_enable_key; +static inline bool sev_key_active(void) +{ + return static_branch_unlikely(&sev_enable_key); +} + +#else /* !CONFIG_AMD_MEM_ENCRYPT */ + +static inline bool sev_key_active(void) { return false; } + +#endif /* CONFIG_AMD_MEM_ENCRYPT */ + #define BUILDIO(bwl, bw, type) \ static inline void out##bwl(unsigned type value, int port) \ { \ @@ -296,14 +311,34 @@ static inline unsigned type in##bwl##_p(int port) \ \ static inline void outs##bwl(int port, const void *addr, unsigned long count) \ { \ - asm volatile("rep; outs" #bwl \ - : "+S"(addr), "+c"(count) : "d"(port) : "memory"); \ + if (sev_key_active()) { \ + unsigned type *value = (unsigned type *)addr; \ + while (count) { \ + out##bwl(*value, port); \ + value++; \ + count--; \ + } \ + } else { \ + asm volatile("rep; outs" #bwl \ + : "+S"(addr), "+c"(count) \ + : "d"(port) : "memory"); \ + } \ } \ \ static inline void ins##bwl(int port, void *addr, unsigned long count) \ { \ - asm volatile("rep; ins" #bwl \ - : "+D"(addr), "+c"(count) : "d"(port) : "memory"); \ + if (sev_key_active()) { \ + unsigned type *value = (unsigned type *)addr; \ + while (count) { \ + *value = in##bwl(port); \ + value++; \ + count--; \ + } \ + } else { \ + asm volatile("rep; ins" #bwl \ + : "+D"(addr), "+c"(count) \ + : "d"(port) : "memory"); \ + } \ } BUILDIO(b, b, char) diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 6a77c63540f7..c9459a4c3c68 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -42,11 +42,17 @@ void __init sme_early_init(void); void __init sme_encrypt_kernel(void); void __init sme_enable(struct boot_params *bp); +int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size); +int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size); + /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void); void swiotlb_set_mem_attributes(void *vaddr, unsigned long size); +bool sme_active(void); +bool sev_active(void); + #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define sme_me_mask 0ULL @@ -64,6 +70,14 @@ static inline void __init sme_early_init(void) { } static inline void __init sme_encrypt_kernel(void) { } static inline void __init sme_enable(struct boot_params *bp) { } +static inline bool sme_active(void) { return false; } +static inline bool sev_active(void) { return false; } + +static inline int __init +early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; } +static inline int __init +early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; } + #endif /* CONFIG_AMD_MEM_ENCRYPT */ /* diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index 8546fafa21a9..7948a17febb4 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -6,7 +6,7 @@ #include <asm/orc_types.h> struct mod_arch_specific { -#ifdef CONFIG_ORC_UNWINDER +#ifdef CONFIG_UNWINDER_ORC unsigned int num_orcs; int *orc_unwind_ip; struct orc_entry *orc_unwind; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ab022618a50a..34c4922bbc3f 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -324,6 +324,9 @@ #define MSR_AMD64_IBSBRTARGET 0xc001103b #define MSR_AMD64_IBSOPDATA4 0xc001103d #define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ +#define MSR_AMD64_SEV 0xc0010131 +#define MSR_AMD64_SEV_ENABLED_BIT 0 +#define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT) /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index fd81228e8037..283efcaac8af 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -16,10 +16,9 @@ #include <linux/cpumask.h> #include <asm/frame.h> -static inline void load_sp0(struct tss_struct *tss, - struct thread_struct *thread) +static inline void load_sp0(unsigned long sp0) { - PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread); + PVOP_VCALL1(pv_cpu_ops.load_sp0, sp0); } /* The paravirtualized CPUID instruction. */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 10cc3b9709fe..6ec54d01972d 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -134,7 +134,7 @@ struct pv_cpu_ops { void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries); void (*free_ldt)(struct desc_struct *ldt, unsigned entries); - void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t); + void (*load_sp0)(unsigned long sp0); void (*set_iopl_mask)(unsigned mask); diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 377f1ffd18be..ba3c523aaf16 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -526,7 +526,7 @@ static inline bool x86_this_cpu_variable_test_bit(int nr, { bool oldbit; - asm volatile("bt "__percpu_arg(2)",%1\n\t" + asm volatile("bt "__percpu_arg(2)",%1" CC_SET(c) : CC_OUT(c) (oldbit) : "m" (*(unsigned long __percpu *)addr), "Ir" (nr)); diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 59df7b47a434..9e9b05fc4860 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -200,10 +200,9 @@ enum page_cache_mode { #define _PAGE_ENC (_AT(pteval_t, sme_me_mask)) -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ - _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_ENC) #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ _PAGE_DIRTY | _PAGE_ENC) +#define _PAGE_TABLE (_KERNPG_TABLE | _PAGE_USER) #define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _PAGE_ENC) #define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _PAGE_ENC) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index bdac19ab2488..2db7cf720b04 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -431,7 +431,9 @@ typedef struct { struct thread_struct { /* Cached TLS descriptors: */ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; +#ifdef CONFIG_X86_32 unsigned long sp0; +#endif unsigned long sp; #ifdef CONFIG_X86_32 unsigned long sysenter_cs; @@ -518,16 +520,9 @@ static inline void native_set_iopl_mask(unsigned mask) } static inline void -native_load_sp0(struct tss_struct *tss, struct thread_struct *thread) +native_load_sp0(unsigned long sp0) { - tss->x86_tss.sp0 = thread->sp0; -#ifdef CONFIG_X86_32 - /* Only happens when SEP is enabled, no need to test "SEP"arately: */ - if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { - tss->x86_tss.ss1 = thread->sysenter_cs; - wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); - } -#endif + this_cpu_write(cpu_tss.x86_tss.sp0, sp0); } static inline void native_swapgs(void) @@ -547,15 +542,20 @@ static inline unsigned long current_top_of_stack(void) #endif } +static inline bool on_thread_stack(void) +{ + return (unsigned long)(current_top_of_stack() - + current_stack_pointer) < THREAD_SIZE; +} + #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #else #define __cpuid native_cpuid -static inline void load_sp0(struct tss_struct *tss, - struct thread_struct *thread) +static inline void load_sp0(unsigned long sp0) { - native_load_sp0(tss, thread); + native_load_sp0(sp0); } #define set_iopl_mask native_set_iopl_mask @@ -804,6 +804,15 @@ static inline void spin_lock_prefetch(const void *x) #define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \ TOP_OF_KERNEL_STACK_PADDING) +#define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1)) + +#define task_pt_regs(task) \ +({ \ + unsigned long __ptr = (unsigned long)task_stack_page(task); \ + __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ + ((struct pt_regs *)__ptr) - 1; \ +}) + #ifdef CONFIG_X86_32 /* * User space process size: 3GB (default). @@ -823,23 +832,6 @@ static inline void spin_lock_prefetch(const void *x) .addr_limit = KERNEL_DS, \ } -/* - * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack. - * This is necessary to guarantee that the entire "struct pt_regs" - * is accessible even if the CPU haven't stored the SS/ESP registers - * on the stack (interrupt gate does not save these registers - * when switching to the same priv ring). - * Therefore beware: accessing the ss/esp fields of the - * "struct pt_regs" is possible, but they may contain the - * completely wrong values. - */ -#define task_pt_regs(task) \ -({ \ - unsigned long __ptr = (unsigned long)task_stack_page(task); \ - __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ - ((struct pt_regs *)__ptr) - 1; \ -}) - #define KSTK_ESP(task) (task_pt_regs(task)->sp) #else @@ -873,11 +865,9 @@ static inline void spin_lock_prefetch(const void *x) #define STACK_TOP_MAX TASK_SIZE_MAX #define INIT_THREAD { \ - .sp0 = TOP_OF_INIT_STACK, \ .addr_limit = KERNEL_DS, \ } -#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) extern unsigned long KSTK_ESP(struct task_struct *task); #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index c0e3c45cf6ab..14131dd06b29 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -136,9 +136,9 @@ static inline int v8086_mode(struct pt_regs *regs) #endif } -#ifdef CONFIG_X86_64 static inline bool user_64bit_mode(struct pt_regs *regs) { +#ifdef CONFIG_X86_64 #ifndef CONFIG_PARAVIRT /* * On non-paravirt systems, this is the only long mode CPL 3 @@ -149,8 +149,12 @@ static inline bool user_64bit_mode(struct pt_regs *regs) /* Headers are too twisted for this to go in paravirt.h. */ return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs; #endif +#else /* !CONFIG_X86_64 */ + return false; +#endif } +#ifdef CONFIG_X86_64 #define current_user_stack_pointer() current_pt_regs()->sp #define compat_user_stack_pointer() current_pt_regs()->sp #endif diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index d8f3a6ae9f6c..f91c365e57c3 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -29,7 +29,7 @@ cc_label: \ #define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \ do { \ bool c; \ - asm volatile (fullop ";" CC_SET(cc) \ + asm volatile (fullop CC_SET(cc) \ : [counter] "+m" (var), CC_OUT(cc) (c) \ : __VA_ARGS__ : clobbers); \ return c; \ diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 899084b70412..8c6bd6863db9 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_SWITCH_TO_H #define _ASM_X86_SWITCH_TO_H +#include <linux/sched/task_stack.h> + struct task_struct; /* one of the stranger aspects of C forward declarations */ struct task_struct *__switch_to_asm(struct task_struct *prev, @@ -73,4 +75,26 @@ do { \ ((last) = __switch_to_asm((prev), (next))); \ } while (0) +#ifdef CONFIG_X86_32 +static inline void refresh_sysenter_cs(struct thread_struct *thread) +{ + /* Only happens when SEP is enabled, no need to test "SEP"arately: */ + if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs)) + return; + + this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs); + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); +} +#endif + +/* This is used when switching tasks or entering/exiting vm86 mode. */ +static inline void update_sp0(struct task_struct *task) +{ +#ifdef CONFIG_X86_32 + load_sp0(task->thread.sp0); +#else + load_sp0(task_top_of_stack(task)); +#endif +} + #endif /* _ASM_X86_SWITCH_TO_H */ diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 91dfcafe27a6..bad25bb80679 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -21,7 +21,7 @@ asmlinkage long sys_ioperm(unsigned long, unsigned long, int); asmlinkage long sys_iopl(unsigned int); /* kernel/ldt.c */ -asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); +asmlinkage long sys_modify_ldt(int, void __user *, unsigned long); /* kernel/signal.c */ asmlinkage long sys_rt_sigreturn(void); diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index fa60398bbc3a..069c04be1507 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -34,11 +34,6 @@ DECLARE_EVENT_CLASS(x86_fpu, ) ); -DEFINE_EVENT(x86_fpu, x86_fpu_state, - TP_PROTO(struct fpu *fpu), - TP_ARGS(fpu) -); - DEFINE_EVENT(x86_fpu, x86_fpu_before_save, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) @@ -74,11 +69,6 @@ DEFINE_EVENT(x86_fpu, x86_fpu_activate_state, TP_ARGS(fpu) ); -DEFINE_EVENT(x86_fpu, x86_fpu_deactivate_state, - TP_PROTO(struct fpu *fpu), - TP_ARGS(fpu) -); - DEFINE_EVENT(x86_fpu, x86_fpu_init_state, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index b0cced97a6ce..1fadd310ff68 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -38,9 +38,9 @@ asmlinkage void simd_coprocessor_error(void); #if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) asmlinkage void xen_divide_error(void); +asmlinkage void xen_xennmi(void); asmlinkage void xen_xendebug(void); asmlinkage void xen_xenint3(void); -asmlinkage void xen_nmi(void); asmlinkage void xen_overflow(void); asmlinkage void xen_bounds(void); asmlinkage void xen_invalid_op(void); @@ -145,4 +145,22 @@ enum { X86_TRAP_IRET = 32, /* 32, IRET Exception */ }; +/* + * Page fault error code bits: + * + * bit 0 == 0: no page found 1: protection fault + * bit 1 == 0: read access 1: write access + * bit 2 == 0: kernel-mode access 1: user-mode access + * bit 3 == 1: use of reserved bit detected + * bit 4 == 1: fault was an instruction fetch + * bit 5 == 1: protection keys block access + */ +enum x86_pf_error_code { + X86_PF_PROT = 1 << 0, + X86_PF_WRITE = 1 << 1, + X86_PF_USER = 1 << 2, + X86_PF_RSVD = 1 << 3, + X86_PF_INSTR = 1 << 4, + X86_PF_PK = 1 << 5, +}; #endif /* _ASM_X86_TRAPS_H */ diff --git a/arch/x86/include/asm/umip.h b/arch/x86/include/asm/umip.h new file mode 100644 index 000000000000..db43f2a0d92c --- /dev/null +++ b/arch/x86/include/asm/umip.h @@ -0,0 +1,12 @@ +#ifndef _ASM_X86_UMIP_H +#define _ASM_X86_UMIP_H + +#include <linux/types.h> +#include <asm/ptrace.h> + +#ifdef CONFIG_X86_INTEL_UMIP +bool fixup_umip_exception(struct pt_regs *regs); +#else +static inline bool fixup_umip_exception(struct pt_regs *regs) { return false; } +#endif /* CONFIG_X86_INTEL_UMIP */ +#endif /* _ASM_X86_UMIP_H */ diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index 87adc0d38c4a..e9cc6fe1fc6f 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -13,11 +13,11 @@ struct unwind_state { struct task_struct *task; int graph_idx; bool error; -#if defined(CONFIG_ORC_UNWINDER) +#if defined(CONFIG_UNWINDER_ORC) bool signal, full_regs; unsigned long sp, bp, ip; struct pt_regs *regs; -#elif defined(CONFIG_FRAME_POINTER_UNWINDER) +#elif defined(CONFIG_UNWINDER_FRAME_POINTER) bool got_irq; unsigned long *bp, *orig_sp, ip; struct pt_regs *regs; @@ -51,7 +51,7 @@ void unwind_start(struct unwind_state *state, struct task_struct *task, __unwind_start(state, task, regs, first_frame); } -#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER) +#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) { if (unwind_done(state)) @@ -66,7 +66,7 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) } #endif -#ifdef CONFIG_ORC_UNWINDER +#ifdef CONFIG_UNWINDER_ORC void unwind_init(void); void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, void *orc, size_t orc_size); diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 554aa8f24f91..09cc06483bed 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -110,5 +110,4 @@ struct kvm_vcpu_pv_apf_data { #define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK #define KVM_PV_EOI_DISABLED 0x0 - #endif /* _UAPI_ASM_X86_KVM_PARA_H */ diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index 6f3355399665..7e1e730396ae 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -105,6 +105,8 @@ #define X86_CR4_OSFXSR _BITUL(X86_CR4_OSFXSR_BIT) #define X86_CR4_OSXMMEXCPT_BIT 10 /* enable unmasked SSE exceptions */ #define X86_CR4_OSXMMEXCPT _BITUL(X86_CR4_OSXMMEXCPT_BIT) +#define X86_CR4_UMIP_BIT 11 /* enable UMIP support */ +#define X86_CR4_UMIP _BITUL(X86_CR4_UMIP_BIT) #define X86_CR4_LA57_BIT 12 /* enable 5-level page tables */ #define X86_CR4_LA57 _BITUL(X86_CR4_LA57_BIT) #define X86_CR4_VMXE_BIT 13 /* enable VMX virtualization */ @@ -152,5 +154,8 @@ #define CX86_ARR_BASE 0xc4 #define CX86_RCR_BASE 0xdc +#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ + X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ + X86_CR0_PG) #endif /* _UAPI_ASM_X86_PROCESSOR_FLAGS_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5f70044340ff..81bb565f4497 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -25,9 +25,9 @@ endif KASAN_SANITIZE_head$(BITS).o := n KASAN_SANITIZE_dumpstack.o := n KASAN_SANITIZE_dumpstack_$(BITS).o := n -KASAN_SANITIZE_stacktrace.o := n +KASAN_SANITIZE_stacktrace.o := n +KASAN_SANITIZE_paravirt.o := n -OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y OBJECT_FILES_NON_STANDARD_test_nx.o := y @@ -127,10 +127,11 @@ obj-$(CONFIG_EFI) += sysfb_efi.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o +obj-$(CONFIG_X86_INTEL_UMIP) += umip.o -obj-$(CONFIG_ORC_UNWINDER) += unwind_orc.o -obj-$(CONFIG_FRAME_POINTER_UNWINDER) += unwind_frame.o -obj-$(CONFIG_GUESS_UNWINDER) += unwind_guess.o +obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o +obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o +obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o ### # 64 bit specific files diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 3344d3382e91..dbaf14d69ebd 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -442,7 +442,6 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end, { const s32 *poff; - mutex_lock(&text_mutex); for (poff = start; poff < end; poff++) { u8 *ptr = (u8 *)poff + *poff; @@ -452,7 +451,6 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end, if (*ptr == 0x3e) text_poke(ptr, ((unsigned char []){0xf0}), 1); } - mutex_unlock(&text_mutex); } static void alternatives_smp_unlock(const s32 *start, const s32 *end, @@ -460,7 +458,6 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end, { const s32 *poff; - mutex_lock(&text_mutex); for (poff = start; poff < end; poff++) { u8 *ptr = (u8 *)poff + *poff; @@ -470,7 +467,6 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end, if (*ptr == 0xf0) text_poke(ptr, ((unsigned char []){0x3E}), 1); } - mutex_unlock(&text_mutex); } struct smp_alt_module { @@ -489,8 +485,7 @@ struct smp_alt_module { struct list_head next; }; static LIST_HEAD(smp_alt_modules); -static DEFINE_MUTEX(smp_alt); -static bool uniproc_patched = false; /* protected by smp_alt */ +static bool uniproc_patched = false; /* protected by text_mutex */ void __init_or_module alternatives_smp_module_add(struct module *mod, char *name, @@ -499,7 +494,7 @@ void __init_or_module alternatives_smp_module_add(struct module *mod, { struct smp_alt_module *smp; - mutex_lock(&smp_alt); + mutex_lock(&text_mutex); if (!uniproc_patched) goto unlock; @@ -526,14 +521,14 @@ void __init_or_module alternatives_smp_module_add(struct module *mod, smp_unlock: alternatives_smp_unlock(locks, locks_end, text, text_end); unlock: - mutex_unlock(&smp_alt); + mutex_unlock(&text_mutex); } void __init_or_module alternatives_smp_module_del(struct module *mod) { struct smp_alt_module *item; - mutex_lock(&smp_alt); + mutex_lock(&text_mutex); list_for_each_entry(item, &smp_alt_modules, next) { if (mod != item->mod) continue; @@ -541,7 +536,7 @@ void __init_or_module alternatives_smp_module_del(struct module *mod) kfree(item); break; } - mutex_unlock(&smp_alt); + mutex_unlock(&text_mutex); } void alternatives_enable_smp(void) @@ -551,7 +546,7 @@ void alternatives_enable_smp(void) /* Why bother if there are no other CPUs? */ BUG_ON(num_possible_cpus() == 1); - mutex_lock(&smp_alt); + mutex_lock(&text_mutex); if (uniproc_patched) { pr_info("switching to SMP code\n"); @@ -563,10 +558,13 @@ void alternatives_enable_smp(void) mod->text, mod->text_end); uniproc_patched = false; } - mutex_unlock(&smp_alt); + mutex_unlock(&text_mutex); } -/* Return 1 if the address range is reserved for smp-alternatives */ +/* + * Return 1 if the address range is reserved for SMP-alternatives. + * Must hold text_mutex. + */ int alternatives_text_reserved(void *start, void *end) { struct smp_alt_module *mod; @@ -574,6 +572,8 @@ int alternatives_text_reserved(void *start, void *end) u8 *text_start = start; u8 *text_end = end; + lockdep_assert_held(&text_mutex); + list_for_each_entry(mod, &smp_alt_modules, next) { if (mod->text > text_end || mod->text_end < text_start) continue; diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index c60922a66385..90cb82dbba57 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -23,6 +23,7 @@ obj-y += rdrand.o obj-y += match.o obj-y += bugs.o obj-$(CONFIG_CPU_FREQ) += aperfmperf.o +obj-y += cpuid-deps.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c9176bae7fd8..47f8a85be11c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -329,6 +329,28 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) } } +static __always_inline void setup_umip(struct cpuinfo_x86 *c) +{ + /* Check the boot processor, plus build option for UMIP. */ + if (!cpu_feature_enabled(X86_FEATURE_UMIP)) + goto out; + + /* Check the current processor's cpuid bits. */ + if (!cpu_has(c, X86_FEATURE_UMIP)) + goto out; + + cr4_set_bits(X86_CR4_UMIP); + + return; + +out: + /* + * Make sure UMIP is disabled in case it was enabled in a + * previous boot (e.g., via kexec). + */ + cr4_clear_bits(X86_CR4_UMIP); +} + /* * Protection Keys are not available in 32-bit mode. */ @@ -1147,9 +1169,10 @@ static void identify_cpu(struct cpuinfo_x86 *c) /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); - /* Set up SMEP/SMAP */ + /* Set up SMEP/SMAP/UMIP */ setup_smep(c); setup_smap(c); + setup_umip(c); /* * The vendor-specific functions might have changed features. @@ -1301,18 +1324,16 @@ void print_cpu_info(struct cpuinfo_x86 *c) pr_cont(")\n"); } -static __init int setup_disablecpuid(char *arg) +/* + * clearcpuid= was already parsed in fpu__init_parse_early_param. + * But we need to keep a dummy __setup around otherwise it would + * show up as an environment variable for init. + */ +static __init int setup_clearcpuid(char *arg) { - int bit; - - if (get_option(&arg, &bit) && bit >= 0 && bit < NCAPINTS * 32) - setup_clear_cpu_cap(bit); - else - return 0; - return 1; } -__setup("clearcpuid=", setup_disablecpuid); +__setup("clearcpuid=", setup_clearcpuid); #ifdef CONFIG_X86_64 DEFINE_PER_CPU_FIRST(union irq_stack_union, @@ -1572,9 +1593,13 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, me); - load_sp0(t, ¤t->thread); + /* + * Initialize the TSS. Don't bother initializing sp0, as the initial + * task never enters user mode. + */ set_tss_desc(cpu, t); load_TR_desc(); + load_mm_ldt(&init_mm); clear_all_debug_regs(); @@ -1596,7 +1621,6 @@ void cpu_init(void) int cpu = smp_processor_id(); struct task_struct *curr = current; struct tss_struct *t = &per_cpu(cpu_tss, cpu); - struct thread_struct *thread = &curr->thread; wait_for_master_cpu(cpu); @@ -1627,9 +1651,13 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, curr); - load_sp0(t, thread); + /* + * Initialize the TSS. Don't bother initializing sp0, as the initial + * task never enters user mode. + */ set_tss_desc(cpu, t); load_TR_desc(); + load_mm_ldt(&init_mm); t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c new file mode 100644 index 000000000000..904b0a3c4e53 --- /dev/null +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -0,0 +1,121 @@ +/* Declare dependencies between CPUIDs */ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <asm/cpufeature.h> + +struct cpuid_dep { + unsigned int feature; + unsigned int depends; +}; + +/* + * Table of CPUID features that depend on others. + * + * This only includes dependencies that can be usefully disabled, not + * features part of the base set (like FPU). + * + * Note this all is not __init / __initdata because it can be + * called from cpu hotplug. It shouldn't do anything in this case, + * but it's difficult to tell that to the init reference checker. + */ +const static struct cpuid_dep cpuid_deps[] = { + { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE }, + { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE }, + { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE }, + { X86_FEATURE_AVX, X86_FEATURE_XSAVE }, + { X86_FEATURE_PKU, X86_FEATURE_XSAVE }, + { X86_FEATURE_MPX, X86_FEATURE_XSAVE }, + { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE }, + { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR }, + { X86_FEATURE_XMM, X86_FEATURE_FXSR }, + { X86_FEATURE_XMM2, X86_FEATURE_XMM }, + { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, + { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 }, + { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, }, + { X86_FEATURE_F16C, X86_FEATURE_XMM2, }, + { X86_FEATURE_AES, X86_FEATURE_XMM2 }, + { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, + { X86_FEATURE_FMA, X86_FEATURE_AVX }, + { X86_FEATURE_AVX2, X86_FEATURE_AVX, }, + { X86_FEATURE_AVX512F, X86_FEATURE_AVX, }, + { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL }, + { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL }, + { X86_FEATURE_VAES, X86_FEATURE_AVX512VL }, + { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F }, + {} +}; + +static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature) +{ + /* + * Note: This could use the non atomic __*_bit() variants, but the + * rest of the cpufeature code uses atomics as well, so keep it for + * consistency. Cleanup all of it separately. + */ + if (!c) { + clear_cpu_cap(&boot_cpu_data, feature); + set_bit(feature, (unsigned long *)cpu_caps_cleared); + } else { + clear_bit(feature, (unsigned long *)c->x86_capability); + } +} + +/* Take the capabilities and the BUG bits into account */ +#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8) + +static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) +{ + DECLARE_BITMAP(disable, MAX_FEATURE_BITS); + const struct cpuid_dep *d; + bool changed; + + if (WARN_ON(feature >= MAX_FEATURE_BITS)) + return; + + clear_feature(c, feature); + + /* Collect all features to disable, handling dependencies */ + memset(disable, 0, sizeof(disable)); + __set_bit(feature, disable); + + /* Loop until we get a stable state. */ + do { + changed = false; + for (d = cpuid_deps; d->feature; d++) { + if (!test_bit(d->depends, disable)) + continue; + if (__test_and_set_bit(d->feature, disable)) + continue; + + changed = true; + clear_feature(c, d->feature); + } + } while (changed); +} + +void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) +{ + do_clear_cpu_cap(c, feature); +} + +void setup_clear_cpu_cap(unsigned int feature) +{ + do_clear_cpu_cap(NULL, feature); +} diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 44404e2307bb..10e74d4778a1 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -209,7 +209,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) } #ifdef CONFIG_KEXEC_FILE -static int get_nr_ram_ranges_callback(u64 start, u64 end, void *arg) +static int get_nr_ram_ranges_callback(struct resource *res, void *arg) { unsigned int *nr_ranges = arg; @@ -342,7 +342,7 @@ static int elf_header_exclude_ranges(struct crash_elf_data *ced, return ret; } -static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg) +static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg) { struct crash_elf_data *ced = arg; Elf64_Ehdr *ehdr; @@ -355,7 +355,7 @@ static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg) ehdr = ced->ehdr; /* Exclude unwanted mem ranges */ - ret = elf_header_exclude_ranges(ced, start, end); + ret = elf_header_exclude_ranges(ced, res->start, res->end); if (ret) return ret; @@ -518,14 +518,14 @@ static int add_e820_entry(struct boot_params *params, struct e820_entry *entry) return 0; } -static int memmap_entry_callback(u64 start, u64 end, void *arg) +static int memmap_entry_callback(struct resource *res, void *arg) { struct crash_memmap_data *cmd = arg; struct boot_params *params = cmd->params; struct e820_entry ei; - ei.addr = start; - ei.size = end - start + 1; + ei.addr = res->start; + ei.size = resource_size(res); ei.type = cmd->type; add_e820_entry(params, &ei); @@ -619,12 +619,12 @@ out: return ret; } -static int determine_backup_region(u64 start, u64 end, void *arg) +static int determine_backup_region(struct resource *res, void *arg) { struct kimage *image = arg; - image->arch.backup_src_start = start; - image->arch.backup_src_sz = end - start + 1; + image->arch.backup_src_start = res->start; + image->arch.backup_src_sz = resource_size(res); /* Expecting only one range for backup region */ return 1; diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 7affb7e3d9a5..6abd83572b01 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -249,6 +249,10 @@ static void __init fpu__init_system_ctx_switch(void) */ static void __init fpu__init_parse_early_param(void) { + char arg[32]; + char *argptr = arg; + int bit; + if (cmdline_find_option_bool(boot_command_line, "no387")) setup_clear_cpu_cap(X86_FEATURE_FPU); @@ -266,6 +270,13 @@ static void __init fpu__init_parse_early_param(void) if (cmdline_find_option_bool(boot_command_line, "noxsaves")) setup_clear_cpu_cap(X86_FEATURE_XSAVES); + + if (cmdline_find_option(boot_command_line, "clearcpuid", arg, + sizeof(arg)) && + get_option(&argptr, &bit) && + bit >= 0 && + bit < NCAPINTS * 32) + setup_clear_cpu_cap(bit); } /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index f1d5476c9022..87a57b7642d3 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -15,6 +15,7 @@ #include <asm/fpu/xstate.h> #include <asm/tlbflush.h> +#include <asm/cpufeature.h> /* * Although we spell it out in here, the Processor Trace @@ -36,6 +37,19 @@ static const char *xfeature_names[] = "unknown xstate feature" , }; +static short xsave_cpuid_features[] __initdata = { + X86_FEATURE_FPU, + X86_FEATURE_XMM, + X86_FEATURE_AVX, + X86_FEATURE_MPX, + X86_FEATURE_MPX, + X86_FEATURE_AVX512F, + X86_FEATURE_AVX512F, + X86_FEATURE_AVX512F, + X86_FEATURE_INTEL_PT, + X86_FEATURE_PKU, +}; + /* * Mask of xstate features supported by the CPU and the kernel: */ @@ -59,26 +73,6 @@ unsigned int fpu_user_xstate_size; void fpu__xstate_clear_all_cpu_caps(void) { setup_clear_cpu_cap(X86_FEATURE_XSAVE); - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - setup_clear_cpu_cap(X86_FEATURE_XSAVEC); - setup_clear_cpu_cap(X86_FEATURE_XSAVES); - setup_clear_cpu_cap(X86_FEATURE_AVX); - setup_clear_cpu_cap(X86_FEATURE_AVX2); - setup_clear_cpu_cap(X86_FEATURE_AVX512F); - setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA); - setup_clear_cpu_cap(X86_FEATURE_AVX512PF); - setup_clear_cpu_cap(X86_FEATURE_AVX512ER); - setup_clear_cpu_cap(X86_FEATURE_AVX512CD); - setup_clear_cpu_cap(X86_FEATURE_AVX512DQ); - setup_clear_cpu_cap(X86_FEATURE_AVX512BW); - setup_clear_cpu_cap(X86_FEATURE_AVX512VL); - setup_clear_cpu_cap(X86_FEATURE_MPX); - setup_clear_cpu_cap(X86_FEATURE_XGETBV1); - setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI); - setup_clear_cpu_cap(X86_FEATURE_PKU); - setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW); - setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS); - setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ); } /* @@ -726,6 +720,7 @@ void __init fpu__init_system_xstate(void) unsigned int eax, ebx, ecx, edx; static int on_boot_cpu __initdata = 1; int err; + int i; WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; @@ -759,6 +754,14 @@ void __init fpu__init_system_xstate(void) goto out_disable; } + /* + * Clear XSAVE features that are disabled in the normal CPUID. + */ + for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { + if (!boot_cpu_has(xsave_cpuid_features[i])) + xfeatures_mask &= ~BIT(i); + } + xfeatures_mask &= fpu__get_supported_xfeatures_mask(); /* Enable xstate instructions to be able to continue with initialization: */ diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index f1d528bb66a6..c29020907886 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -212,9 +212,6 @@ ENTRY(startup_32_smp) #endif .Ldefault_entry: -#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ - X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ - X86_CR0_PG) movl $(CR0_STATE & ~X86_CR0_PG),%eax movl %eax,%cr0 @@ -402,7 +399,7 @@ ENTRY(early_idt_handler_array) # 24(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 + .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 pushl $0 # Dummy error code, to make stack frame uniform .endif pushl $i # 20(%esp) Vector number diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 6dde3f3fc1f8..7dca675fe78d 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -38,11 +38,12 @@ * */ -#define p4d_index(x) (((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1)) #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) +#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE) PGD_START_KERNEL = pgd_index(__START_KERNEL_map) +#endif L3_START_KERNEL = pud_index(__START_KERNEL_map) .text @@ -50,6 +51,7 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map) .code64 .globl startup_64 startup_64: + UNWIND_HINT_EMPTY /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, * and someone has loaded an identity mapped page table @@ -89,6 +91,7 @@ startup_64: addq $(early_top_pgt - __START_KERNEL_map), %rax jmp 1f ENTRY(secondary_startup_64) + UNWIND_HINT_EMPTY /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, * and someone has loaded a mapped page table. @@ -133,6 +136,7 @@ ENTRY(secondary_startup_64) movq $1f, %rax jmp *%rax 1: + UNWIND_HINT_EMPTY /* Check if nx is implemented */ movl $0x80000001, %eax @@ -150,9 +154,6 @@ ENTRY(secondary_startup_64) 1: wrmsr /* Make changes effective */ /* Setup cr0 */ -#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ - X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ - X86_CR0_PG) movl $CR0_STATE, %eax /* Make changes effective */ movq %rax, %cr0 @@ -235,7 +236,7 @@ ENTRY(secondary_startup_64) pushq %rax # target address in negative space lretq .Lafter_lret: -ENDPROC(secondary_startup_64) +END(secondary_startup_64) #include "verify_cpu.S" @@ -247,6 +248,7 @@ ENDPROC(secondary_startup_64) */ ENTRY(start_cpu0) movq initial_stack(%rip), %rsp + UNWIND_HINT_EMPTY jmp .Ljump_to_C_code ENDPROC(start_cpu0) #endif @@ -266,26 +268,24 @@ ENDPROC(start_cpu0) .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS __FINITDATA -bad_address: - jmp bad_address - __INIT ENTRY(early_idt_handler_array) - # 104(%rsp) %rflags - # 96(%rsp) %cs - # 88(%rsp) %rip - # 80(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 - pushq $0 # Dummy error code, to make stack frame uniform + .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 + UNWIND_HINT_IRET_REGS + pushq $0 # Dummy error code, to make stack frame uniform + .else + UNWIND_HINT_IRET_REGS offset=8 .endif pushq $i # 72(%rsp) Vector number jmp early_idt_handler_common + UNWIND_HINT_IRET_REGS i = i + 1 .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr -ENDPROC(early_idt_handler_array) + UNWIND_HINT_IRET_REGS offset=16 +END(early_idt_handler_array) early_idt_handler_common: /* @@ -313,6 +313,7 @@ early_idt_handler_common: pushq %r13 /* pt_regs->r13 */ pushq %r14 /* pt_regs->r14 */ pushq %r15 /* pt_regs->r15 */ + UNWIND_HINT_REGS cmpq $14,%rsi /* Page fault? */ jnz 10f @@ -327,8 +328,8 @@ early_idt_handler_common: 20: decl early_recursion_flag(%rip) - jmp restore_regs_and_iret -ENDPROC(early_idt_handler_common) + jmp restore_regs_and_return_to_kernel +END(early_idt_handler_common) __INITDATA @@ -362,10 +363,7 @@ NEXT_PAGE(early_dynamic_pgts) .data -#ifndef CONFIG_XEN -NEXT_PAGE(init_top_pgt) - .fill 512,8,0 -#else +#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) NEXT_PAGE(init_top_pgt) .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 @@ -382,6 +380,9 @@ NEXT_PAGE(level2_ident_pgt) * Don't set NX because code runs from these pages. */ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) +#else +NEXT_PAGE(init_top_pgt) + .fill 512,8,0 #endif #ifdef CONFIG_X86_5LEVEL @@ -435,7 +436,7 @@ ENTRY(phys_base) EXPORT_SYMBOL(phys_base) #include "../../x86/xen/xen-head.S" - + __PAGE_ALIGNED_BSS NEXT_PAGE(empty_zero_page) .skip PAGE_SIZE diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 8bb9594d0761..3df743b60c80 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -75,8 +75,8 @@ static int parse_no_kvmclock_vsyscall(char *arg) early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); -static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); -static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); +static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); +static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64); static int has_steal_clock = 0; /* @@ -312,7 +312,7 @@ static void kvm_register_steal_time(void) cpu, (unsigned long long) slow_virt_to_phys(st)); } -static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; +static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) { @@ -426,9 +426,42 @@ void kvm_disable_steal_time(void) wrmsr(MSR_KVM_STEAL_TIME, 0, 0); } +static inline void __set_percpu_decrypted(void *ptr, unsigned long size) +{ + early_set_memory_decrypted((unsigned long) ptr, size); +} + +/* + * Iterate through all possible CPUs and map the memory region pointed + * by apf_reason, steal_time and kvm_apic_eoi as decrypted at once. + * + * Note: we iterate through all possible CPUs to ensure that CPUs + * hotplugged will have their per-cpu variable already mapped as + * decrypted. + */ +static void __init sev_map_percpu_data(void) +{ + int cpu; + + if (!sev_active()) + return; + + for_each_possible_cpu(cpu) { + __set_percpu_decrypted(&per_cpu(apf_reason, cpu), sizeof(apf_reason)); + __set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time)); + __set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi)); + } +} + #ifdef CONFIG_SMP static void __init kvm_smp_prepare_boot_cpu(void) { + /* + * Map the per-cpu variables as decrypted before kvm_guest_cpu_init() + * shares the guest physical address with the hypervisor. + */ + sev_map_percpu_data(); + kvm_guest_cpu_init(); native_smp_prepare_boot_cpu(); kvm_spinlock_init(); @@ -496,6 +529,7 @@ void __init kvm_guest_init(void) kvm_cpu_online, kvm_cpu_down_prepare) < 0) pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n"); #else + sev_map_percpu_data(); kvm_guest_cpu_init(); #endif diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 5b609e28ce3f..77b492c2d658 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -27,6 +27,7 @@ #include <linux/sched.h> #include <linux/sched/clock.h> +#include <asm/mem_encrypt.h> #include <asm/x86_init.h> #include <asm/reboot.h> #include <asm/kvmclock.h> @@ -45,7 +46,7 @@ early_param("no-kvmclock", parse_no_kvmclock); /* The hypervisor will put information about time periodically here */ static struct pvclock_vsyscall_time_info *hv_clock; -static struct pvclock_wall_clock wall_clock; +static struct pvclock_wall_clock *wall_clock; struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) { @@ -64,15 +65,15 @@ static void kvm_get_wallclock(struct timespec *now) int low, high; int cpu; - low = (int)__pa_symbol(&wall_clock); - high = ((u64)__pa_symbol(&wall_clock) >> 32); + low = (int)slow_virt_to_phys(wall_clock); + high = ((u64)slow_virt_to_phys(wall_clock) >> 32); native_write_msr(msr_kvm_wall_clock, low, high); cpu = get_cpu(); vcpu_time = &hv_clock[cpu].pvti; - pvclock_read_wallclock(&wall_clock, vcpu_time, now); + pvclock_read_wallclock(wall_clock, vcpu_time, now); put_cpu(); } @@ -249,11 +250,39 @@ static void kvm_shutdown(void) native_machine_shutdown(); } +static phys_addr_t __init kvm_memblock_alloc(phys_addr_t size, + phys_addr_t align) +{ + phys_addr_t mem; + + mem = memblock_alloc(size, align); + if (!mem) + return 0; + + if (sev_active()) { + if (early_set_memory_decrypted((unsigned long)__va(mem), size)) + goto e_free; + } + + return mem; +e_free: + memblock_free(mem, size); + return 0; +} + +static void __init kvm_memblock_free(phys_addr_t addr, phys_addr_t size) +{ + if (sev_active()) + early_set_memory_encrypted((unsigned long)__va(addr), size); + + memblock_free(addr, size); +} + void __init kvmclock_init(void) { struct pvclock_vcpu_time_info *vcpu_time; - unsigned long mem; - int size, cpu; + unsigned long mem, mem_wall_clock; + int size, cpu, wall_clock_size; u8 flags; size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); @@ -267,21 +296,35 @@ void __init kvmclock_init(void) } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))) return; - printk(KERN_INFO "kvm-clock: Using msrs %x and %x", - msr_kvm_system_time, msr_kvm_wall_clock); + wall_clock_size = PAGE_ALIGN(sizeof(struct pvclock_wall_clock)); + mem_wall_clock = kvm_memblock_alloc(wall_clock_size, PAGE_SIZE); + if (!mem_wall_clock) + return; - mem = memblock_alloc(size, PAGE_SIZE); - if (!mem) + wall_clock = __va(mem_wall_clock); + memset(wall_clock, 0, wall_clock_size); + + mem = kvm_memblock_alloc(size, PAGE_SIZE); + if (!mem) { + kvm_memblock_free(mem_wall_clock, wall_clock_size); + wall_clock = NULL; return; + } + hv_clock = __va(mem); memset(hv_clock, 0, size); if (kvm_register_clock("primary cpu clock")) { hv_clock = NULL; - memblock_free(mem, size); + kvm_memblock_free(mem, size); + kvm_memblock_free(mem_wall_clock, wall_clock_size); + wall_clock = NULL; return; } + printk(KERN_INFO "kvm-clock: Using msrs %x and %x", + msr_kvm_system_time, msr_kvm_wall_clock); + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index a2fcf037bd80..1c1eae961340 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -13,6 +13,7 @@ #include <linux/string.h> #include <linux/mm.h> #include <linux/smp.h> +#include <linux/syscalls.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> @@ -295,8 +296,8 @@ out: return error; } -asmlinkage int sys_modify_ldt(int func, void __user *ptr, - unsigned long bytecount) +SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr , + unsigned long , bytecount) { int ret = -ENOSYS; @@ -314,5 +315,14 @@ asmlinkage int sys_modify_ldt(int func, void __user *ptr, ret = write_ldt(ptr, bytecount, 0); break; } - return ret; + /* + * The SYSCALL_DEFINE() macros give us an 'unsigned long' + * return type, but tht ABI for sys_modify_ldt() expects + * 'int'. This cast gives us an int-sized value in %rax + * for the return code. The 'unsigned' is necessary so + * the compiler does not try to sign-extend the negative + * return codes into the high half of the register when + * taking the value from int->long. + */ + return (unsigned int)ret; } diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 3fe690067802..6b07faaa1579 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -7,7 +7,7 @@ #include <linux/init.h> #include <linux/ioport.h> -static int found(u64 start, u64 end, void *data) +static int found(struct resource *res, void *data) { return 1; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c67685337c5a..97fb3e5737f5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -49,7 +49,13 @@ */ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { .x86_tss = { - .sp0 = TOP_OF_INIT_STACK, + /* + * .sp0 is only used when entering ring 0 from a lower + * privilege level. Since the init task never runs anything + * but ring 0 code, there is no need for a valid value here. + * Poison it. + */ + .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, #ifdef CONFIG_X86_32 .ss0 = __KERNEL_DS, .ss1 = __KERNEL_CS, diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 11966251cd42..45bf0c5f93e1 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -284,9 +284,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Reload esp0 and cpu_current_top_of_stack. This changes - * current_thread_info(). + * current_thread_info(). Refresh the SYSENTER configuration in + * case prev or next is vm86. */ - load_sp0(tss, next); + update_sp0(next_p); + refresh_sysenter_cs(next); this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 302e7b2572d1..eeeb34f85c25 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -274,7 +274,6 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, struct inactive_task_frame *frame; struct task_struct *me = current; - p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; childregs = task_pt_regs(p); fork_frame = container_of(childregs, struct fork_frame, regs); frame = &fork_frame->frame; @@ -464,8 +463,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ this_cpu_write(current_task, next_p); - /* Reload esp0 and ss1. This changes current_thread_info(). */ - load_sp0(tss, next); + /* Reload sp0. */ + update_sp0(next_p); /* * Now maybe reload the debug registers and handle I/O bitmaps diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0957dd73d127..507100a72eb3 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -380,9 +380,11 @@ static void __init reserve_initrd(void) * If SME is active, this memory will be marked encrypted by the * kernel when it is accessed (including relocation). However, the * ramdisk image was loaded decrypted by the bootloader, so make - * sure that it is encrypted before accessing it. + * sure that it is encrypted before accessing it. For SEV the + * ramdisk will already be encrypted, so only do this for SME. */ - sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image); + if (sme_active()) + sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image); initrd_start = 0; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7ab6db86b573..88fc3a51640c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -963,8 +963,7 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle) #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ irq_ctx_init(cpu); - per_cpu(cpu_current_top_of_stack, cpu) = - (unsigned long)task_stack_page(idle) + THREAD_SIZE; + per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); #else initial_gs = per_cpu_offset(cpu); #endif diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 5a6b8f809792..c3aca0b533a1 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -60,6 +60,7 @@ #include <asm/trace/mpx.h> #include <asm/mpx.h> #include <asm/vm86.h> +#include <asm/umip.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> @@ -141,8 +142,7 @@ void ist_begin_non_atomic(struct pt_regs *regs) * will catch asm bugs and any attempt to use ist_preempt_enable * from double_fault. */ - BUG_ON((unsigned long)(current_top_of_stack() - - current_stack_pointer) >= THREAD_SIZE); + BUG_ON(!on_thread_stack()); preempt_enable_no_resched(); } @@ -518,6 +518,11 @@ do_general_protection(struct pt_regs *regs, long error_code) RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); cond_local_irq_enable(regs); + if (static_cpu_has(X86_FEATURE_UMIP)) { + if (user_mode(regs) && fixup_umip_exception(regs)) + return; + } + if (v8086_mode(regs)) { local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c new file mode 100644 index 000000000000..6ba82be68cff --- /dev/null +++ b/arch/x86/kernel/umip.c @@ -0,0 +1,366 @@ +/* + * umip.c Emulation for instruction protected by the Intel User-Mode + * Instruction Prevention feature + * + * Copyright (c) 2017, Intel Corporation. + * Ricardo Neri <ricardo.neri-calderon@linux.intel.com> + */ + +#include <linux/uaccess.h> +#include <asm/umip.h> +#include <asm/traps.h> +#include <asm/insn.h> +#include <asm/insn-eval.h> +#include <linux/ratelimit.h> + +#undef pr_fmt +#define pr_fmt(fmt) "umip: " fmt + +/** DOC: Emulation for User-Mode Instruction Prevention (UMIP) + * + * The feature User-Mode Instruction Prevention present in recent Intel + * processor prevents a group of instructions (sgdt, sidt, sldt, smsw, and str) + * from being executed with CPL > 0. Otherwise, a general protection fault is + * issued. + * + * Rather than relaying to the user space the general protection fault caused by + * the UMIP-protected instructions (in the form of a SIGSEGV signal), it can be + * trapped and emulate the result of such instructions to provide dummy values. + * This allows to both conserve the current kernel behavior and not reveal the + * system resources that UMIP intends to protect (i.e., the locations of the + * global descriptor and interrupt descriptor tables, the segment selectors of + * the local descriptor table, the value of the task state register and the + * contents of the CR0 register). + * + * This emulation is needed because certain applications (e.g., WineHQ and + * DOSEMU2) rely on this subset of instructions to function. + * + * The instructions protected by UMIP can be split in two groups. Those which + * return a kernel memory address (sgdt and sidt) and those which return a + * value (sldt, str and smsw). + * + * For the instructions that return a kernel memory address, applications + * such as WineHQ rely on the result being located in the kernel memory space, + * not the actual location of the table. The result is emulated as a hard-coded + * value that, lies close to the top of the kernel memory. The limit for the GDT + * and the IDT are set to zero. + * + * Given that sldt and str are not commonly used in programs that run on WineHQ + * or DOSEMU2, they are not emulated. + * + * The instruction smsw is emulated to return the value that the register CR0 + * has at boot time as set in the head_32. + * + * Also, emulation is provided only for 32-bit processes; 64-bit processes + * that attempt to use the instructions that UMIP protects will receive the + * SIGSEGV signal issued as a consequence of the general protection fault. + * + * Care is taken to appropriately emulate the results when segmentation is + * used. That is, rather than relying on USER_DS and USER_CS, the function + * insn_get_addr_ref() inspects the segment descriptor pointed by the + * registers in pt_regs. This ensures that we correctly obtain the segment + * base address and the address and operand sizes even if the user space + * application uses a local descriptor table. + */ + +#define UMIP_DUMMY_GDT_BASE 0xfffe0000 +#define UMIP_DUMMY_IDT_BASE 0xffff0000 + +/* + * The SGDT and SIDT instructions store the contents of the global descriptor + * table and interrupt table registers, respectively. The destination is a + * memory operand of X+2 bytes. X bytes are used to store the base address of + * the table and 2 bytes are used to store the limit. In 32-bit processes, the + * only processes for which emulation is provided, X has a value of 4. + */ +#define UMIP_GDT_IDT_BASE_SIZE 4 +#define UMIP_GDT_IDT_LIMIT_SIZE 2 + +#define UMIP_INST_SGDT 0 /* 0F 01 /0 */ +#define UMIP_INST_SIDT 1 /* 0F 01 /1 */ +#define UMIP_INST_SMSW 3 /* 0F 01 /4 */ + +/** + * identify_insn() - Identify a UMIP-protected instruction + * @insn: Instruction structure with opcode and ModRM byte. + * + * From the opcode and ModRM.reg in @insn identify, if any, a UMIP-protected + * instruction that can be emulated. + * + * Returns: + * + * On success, a constant identifying a specific UMIP-protected instruction that + * can be emulated. + * + * -EINVAL on error or when not an UMIP-protected instruction that can be + * emulated. + */ +static int identify_insn(struct insn *insn) +{ + /* By getting modrm we also get the opcode. */ + insn_get_modrm(insn); + + if (!insn->modrm.nbytes) + return -EINVAL; + + /* All the instructions of interest start with 0x0f. */ + if (insn->opcode.bytes[0] != 0xf) + return -EINVAL; + + if (insn->opcode.bytes[1] == 0x1) { + switch (X86_MODRM_REG(insn->modrm.value)) { + case 0: + return UMIP_INST_SGDT; + case 1: + return UMIP_INST_SIDT; + case 4: + return UMIP_INST_SMSW; + default: + return -EINVAL; + } + } + + /* SLDT AND STR are not emulated */ + return -EINVAL; +} + +/** + * emulate_umip_insn() - Emulate UMIP instructions and return dummy values + * @insn: Instruction structure with operands + * @umip_inst: A constant indicating the instruction to emulate + * @data: Buffer into which the dummy result is stored + * @data_size: Size of the emulated result + * + * Emulate an instruction protected by UMIP and provide a dummy result. The + * result of the emulation is saved in @data. The size of the results depends + * on both the instruction and type of operand (register vs memory address). + * The size of the result is updated in @data_size. Caller is responsible + * of providing a @data buffer of at least UMIP_GDT_IDT_BASE_SIZE + + * UMIP_GDT_IDT_LIMIT_SIZE bytes. + * + * Returns: + * + * 0 on success, -EINVAL on error while emulating. + */ +static int emulate_umip_insn(struct insn *insn, int umip_inst, + unsigned char *data, int *data_size) +{ + unsigned long dummy_base_addr, dummy_value; + unsigned short dummy_limit = 0; + + if (!data || !data_size || !insn) + return -EINVAL; + /* + * These two instructions return the base address and limit of the + * global and interrupt descriptor table, respectively. According to the + * Intel Software Development manual, the base address can be 24-bit, + * 32-bit or 64-bit. Limit is always 16-bit. If the operand size is + * 16-bit, the returned value of the base address is supposed to be a + * zero-extended 24-byte number. However, it seems that a 32-byte number + * is always returned irrespective of the operand size. + */ + if (umip_inst == UMIP_INST_SGDT || umip_inst == UMIP_INST_SIDT) { + /* SGDT and SIDT do not use registers operands. */ + if (X86_MODRM_MOD(insn->modrm.value) == 3) + return -EINVAL; + + if (umip_inst == UMIP_INST_SGDT) + dummy_base_addr = UMIP_DUMMY_GDT_BASE; + else + dummy_base_addr = UMIP_DUMMY_IDT_BASE; + + *data_size = UMIP_GDT_IDT_LIMIT_SIZE + UMIP_GDT_IDT_BASE_SIZE; + + memcpy(data + 2, &dummy_base_addr, UMIP_GDT_IDT_BASE_SIZE); + memcpy(data, &dummy_limit, UMIP_GDT_IDT_LIMIT_SIZE); + + } else if (umip_inst == UMIP_INST_SMSW) { + dummy_value = CR0_STATE; + + /* + * Even though the CR0 register has 4 bytes, the number + * of bytes to be copied in the result buffer is determined + * by whether the operand is a register or a memory location. + * If operand is a register, return as many bytes as the operand + * size. If operand is memory, return only the two least + * siginificant bytes of CR0. + */ + if (X86_MODRM_MOD(insn->modrm.value) == 3) + *data_size = insn->opnd_bytes; + else + *data_size = 2; + + memcpy(data, &dummy_value, *data_size); + /* STR and SLDT are not emulated */ + } else { + return -EINVAL; + } + + return 0; +} + +/** + * force_sig_info_umip_fault() - Force a SIGSEGV with SEGV_MAPERR + * @addr: Address that caused the signal + * @regs: Register set containing the instruction pointer + * + * Force a SIGSEGV signal with SEGV_MAPERR as the error code. This function is + * intended to be used to provide a segmentation fault when the result of the + * UMIP emulation could not be copied to the user space memory. + * + * Returns: none + */ +static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs) +{ + siginfo_t info; + struct task_struct *tsk = current; + + tsk->thread.cr2 = (unsigned long)addr; + tsk->thread.error_code = X86_PF_USER | X86_PF_WRITE; + tsk->thread.trap_nr = X86_TRAP_PF; + + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = SEGV_MAPERR; + info.si_addr = addr; + force_sig_info(SIGSEGV, &info, tsk); + + if (!(show_unhandled_signals && unhandled_signal(tsk, SIGSEGV))) + return; + + pr_err_ratelimited("%s[%d] umip emulation segfault ip:%lx sp:%lx error:%x in %lx\n", + tsk->comm, task_pid_nr(tsk), regs->ip, + regs->sp, X86_PF_USER | X86_PF_WRITE, + regs->ip); +} + +/** + * fixup_umip_exception() - Fixup a general protection fault caused by UMIP + * @regs: Registers as saved when entering the #GP handler + * + * The instructions sgdt, sidt, str, smsw, sldt cause a general protection + * fault if executed with CPL > 0 (i.e., from user space). If the offending + * user-space process is not in long mode, this function fixes the exception + * up and provides dummy results for sgdt, sidt and smsw; str and sldt are not + * fixed up. Also long mode user-space processes are not fixed up. + * + * If operands are memory addresses, results are copied to user-space memory as + * indicated by the instruction pointed by eIP using the registers indicated in + * the instruction operands. If operands are registers, results are copied into + * the context that was saved when entering kernel mode. + * + * Returns: + * + * True if emulation was successful; false if not. + */ +bool fixup_umip_exception(struct pt_regs *regs) +{ + int not_copied, nr_copied, reg_offset, dummy_data_size, umip_inst; + unsigned long seg_base = 0, *reg_addr; + /* 10 bytes is the maximum size of the result of UMIP instructions */ + unsigned char dummy_data[10] = { 0 }; + unsigned char buf[MAX_INSN_SIZE]; + void __user *uaddr; + struct insn insn; + char seg_defs; + + if (!regs) + return false; + + /* Do not emulate 64-bit processes. */ + if (user_64bit_mode(regs)) + return false; + + /* + * If not in user-space long mode, a custom code segment could be in + * use. This is true in protected mode (if the process defined a local + * descriptor table), or virtual-8086 mode. In most of the cases + * seg_base will be zero as in USER_CS. + */ + if (!user_64bit_mode(regs)) + seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); + + if (seg_base == -1L) + return false; + + not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip), + sizeof(buf)); + nr_copied = sizeof(buf) - not_copied; + + /* + * The copy_from_user above could have failed if user code is protected + * by a memory protection key. Give up on emulation in such a case. + * Should we issue a page fault? + */ + if (!nr_copied) + return false; + + insn_init(&insn, buf, nr_copied, user_64bit_mode(regs)); + + /* + * Override the default operand and address sizes with what is specified + * in the code segment descriptor. The instruction decoder only sets + * the address size it to either 4 or 8 address bytes and does nothing + * for the operand bytes. This OK for most of the cases, but we could + * have special cases where, for instance, a 16-bit code segment + * descriptor is used. + * If there is an address override prefix, the instruction decoder + * correctly updates these values, even for 16-bit defaults. + */ + seg_defs = insn_get_code_seg_params(regs); + if (seg_defs == -EINVAL) + return false; + + insn.addr_bytes = INSN_CODE_SEG_ADDR_SZ(seg_defs); + insn.opnd_bytes = INSN_CODE_SEG_OPND_SZ(seg_defs); + + insn_get_length(&insn); + if (nr_copied < insn.length) + return false; + + umip_inst = identify_insn(&insn); + if (umip_inst < 0) + return false; + + if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size)) + return false; + + /* + * If operand is a register, write result to the copy of the register + * value that was pushed to the stack when entering into kernel mode. + * Upon exit, the value we write will be restored to the actual hardware + * register. + */ + if (X86_MODRM_MOD(insn.modrm.value) == 3) { + reg_offset = insn_get_modrm_rm_off(&insn, regs); + + /* + * Negative values are usually errors. In memory addressing, + * the exception is -EDOM. Since we expect a register operand, + * all negative values are errors. + */ + if (reg_offset < 0) + return false; + + reg_addr = (unsigned long *)((unsigned long)regs + reg_offset); + memcpy(reg_addr, dummy_data, dummy_data_size); + } else { + uaddr = insn_get_addr_ref(&insn, regs); + if ((unsigned long)uaddr == -1L) + return false; + + nr_copied = copy_to_user(uaddr, dummy_data, dummy_data_size); + if (nr_copied > 0) { + /* + * If copy fails, send a signal and tell caller that + * fault was fixed up. + */ + force_sig_info_umip_fault(uaddr, regs); + return true; + } + } + + /* increase IP to let the program keep going */ + regs->ip += insn.length; + return true; +} diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 495c776de4b4..a3755d293a48 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -271,12 +271,15 @@ static bool is_prefix_bad(struct insn *insn) int i; for (i = 0; i < insn->prefixes.nbytes; i++) { - switch (insn->prefixes.bytes[i]) { - case 0x26: /* INAT_PFX_ES */ - case 0x2E: /* INAT_PFX_CS */ - case 0x36: /* INAT_PFX_DS */ - case 0x3E: /* INAT_PFX_SS */ - case 0xF0: /* INAT_PFX_LOCK */ + insn_attr_t attr; + + attr = inat_get_opcode_attribute(insn->prefixes.bytes[i]); + switch (attr) { + case INAT_MAKE_PREFIX(INAT_PFX_ES): + case INAT_MAKE_PREFIX(INAT_PFX_CS): + case INAT_MAKE_PREFIX(INAT_PFX_DS): + case INAT_MAKE_PREFIX(INAT_PFX_SS): + case INAT_MAKE_PREFIX(INAT_PFX_LOCK): return true; } } diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index 014ea59aa153..3d3c2f71f617 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -33,7 +33,7 @@ #include <asm/cpufeatures.h> #include <asm/msr-index.h> -verify_cpu: +ENTRY(verify_cpu) pushf # Save caller passed flags push $0 # Kill any dangerous flags popf @@ -139,3 +139,4 @@ verify_cpu: popf # Restore caller passed flags xorl %eax, %eax ret +ENDPROC(verify_cpu) diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 68244742ecb0..5edb27f1a2c4 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -55,6 +55,7 @@ #include <asm/irq.h> #include <asm/traps.h> #include <asm/vm86.h> +#include <asm/switch_to.h> /* * Known problems: @@ -94,7 +95,6 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) { - struct tss_struct *tss; struct task_struct *tsk = current; struct vm86plus_struct __user *user; struct vm86 *vm86 = current->thread.vm86; @@ -146,12 +146,13 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) do_exit(SIGSEGV); } - tss = &per_cpu(cpu_tss, get_cpu()); + preempt_disable(); tsk->thread.sp0 = vm86->saved_sp0; tsk->thread.sysenter_cs = __KERNEL_CS; - load_sp0(tss, &tsk->thread); + update_sp0(tsk); + refresh_sysenter_cs(&tsk->thread); vm86->saved_sp0 = 0; - put_cpu(); + preempt_enable(); memcpy(®s->pt, &vm86->regs32, sizeof(struct pt_regs)); @@ -237,7 +238,6 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg) static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) { - struct tss_struct *tss; struct task_struct *tsk = current; struct vm86 *vm86 = tsk->thread.vm86; struct kernel_vm86_regs vm86regs; @@ -365,15 +365,17 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) vm86->saved_sp0 = tsk->thread.sp0; lazy_save_gs(vm86->regs32.gs); - tss = &per_cpu(cpu_tss, get_cpu()); /* make room for real-mode segments */ + preempt_disable(); tsk->thread.sp0 += 16; - if (static_cpu_has(X86_FEATURE_SEP)) + if (static_cpu_has(X86_FEATURE_SEP)) { tsk->thread.sysenter_cs = 0; + refresh_sysenter_cs(&tsk->thread); + } - load_sp0(tss, &tsk->thread); - put_cpu(); + update_sp0(tsk); + preempt_enable(); if (vm86->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk->mm); diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 457f681ef379..7b181b61170e 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -24,7 +24,7 @@ lib-y := delay.o misc.o cmdline.o cpu.o lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o -lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o +lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c new file mode 100644 index 000000000000..35625d279458 --- /dev/null +++ b/arch/x86/lib/insn-eval.c @@ -0,0 +1,1364 @@ +/* + * Utility functions for x86 operand and address decoding + * + * Copyright (C) Intel Corporation 2017 + */ +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/ratelimit.h> +#include <linux/mmu_context.h> +#include <asm/desc_defs.h> +#include <asm/desc.h> +#include <asm/inat.h> +#include <asm/insn.h> +#include <asm/insn-eval.h> +#include <asm/ldt.h> +#include <asm/vm86.h> + +#undef pr_fmt +#define pr_fmt(fmt) "insn: " fmt + +enum reg_type { + REG_TYPE_RM = 0, + REG_TYPE_INDEX, + REG_TYPE_BASE, +}; + +/** + * is_string_insn() - Determine if instruction is a string instruction + * @insn: Instruction containing the opcode to inspect + * + * Returns: + * + * true if the instruction, determined by the opcode, is any of the + * string instructions as defined in the Intel Software Development manual. + * False otherwise. + */ +static bool is_string_insn(struct insn *insn) +{ + insn_get_opcode(insn); + + /* All string instructions have a 1-byte opcode. */ + if (insn->opcode.nbytes != 1) + return false; + + switch (insn->opcode.bytes[0]) { + case 0x6c ... 0x6f: /* INS, OUTS */ + case 0xa4 ... 0xa7: /* MOVS, CMPS */ + case 0xaa ... 0xaf: /* STOS, LODS, SCAS */ + return true; + default: + return false; + } +} + +/** + * get_seg_reg_override_idx() - obtain segment register override index + * @insn: Valid instruction with segment override prefixes + * + * Inspect the instruction prefixes in @insn and find segment overrides, if any. + * + * Returns: + * + * A constant identifying the segment register to use, among CS, SS, DS, + * ES, FS, or GS. INAT_SEG_REG_DEFAULT is returned if no segment override + * prefixes were found. + * + * -EINVAL in case of error. + */ +static int get_seg_reg_override_idx(struct insn *insn) +{ + int idx = INAT_SEG_REG_DEFAULT; + int num_overrides = 0, i; + + insn_get_prefixes(insn); + + /* Look for any segment override prefixes. */ + for (i = 0; i < insn->prefixes.nbytes; i++) { + insn_attr_t attr; + + attr = inat_get_opcode_attribute(insn->prefixes.bytes[i]); + switch (attr) { + case INAT_MAKE_PREFIX(INAT_PFX_CS): + idx = INAT_SEG_REG_CS; + num_overrides++; + break; + case INAT_MAKE_PREFIX(INAT_PFX_SS): + idx = INAT_SEG_REG_SS; + num_overrides++; + break; + case INAT_MAKE_PREFIX(INAT_PFX_DS): + idx = INAT_SEG_REG_DS; + num_overrides++; + break; + case INAT_MAKE_PREFIX(INAT_PFX_ES): + idx = INAT_SEG_REG_ES; + num_overrides++; + break; + case INAT_MAKE_PREFIX(INAT_PFX_FS): + idx = INAT_SEG_REG_FS; + num_overrides++; + break; + case INAT_MAKE_PREFIX(INAT_PFX_GS): + idx = INAT_SEG_REG_GS; + num_overrides++; + break; + /* No default action needed. */ + } + } + + /* More than one segment override prefix leads to undefined behavior. */ + if (num_overrides > 1) + return -EINVAL; + + return idx; +} + +/** + * check_seg_overrides() - check if segment override prefixes are allowed + * @insn: Valid instruction with segment override prefixes + * @regoff: Operand offset, in pt_regs, for which the check is performed + * + * For a particular register used in register-indirect addressing, determine if + * segment override prefixes can be used. Specifically, no overrides are allowed + * for rDI if used with a string instruction. + * + * Returns: + * + * True if segment override prefixes can be used with the register indicated + * in @regoff. False if otherwise. + */ +static bool check_seg_overrides(struct insn *insn, int regoff) +{ + if (regoff == offsetof(struct pt_regs, di) && is_string_insn(insn)) + return false; + + return true; +} + +/** + * resolve_default_seg() - resolve default segment register index for an operand + * @insn: Instruction with opcode and address size. Must be valid. + * @regs: Register values as seen when entering kernel mode + * @off: Operand offset, in pt_regs, for which resolution is needed + * + * Resolve the default segment register index associated with the instruction + * operand register indicated by @off. Such index is resolved based on defaults + * described in the Intel Software Development Manual. + * + * Returns: + * + * If in protected mode, a constant identifying the segment register to use, + * among CS, SS, ES or DS. If in long mode, INAT_SEG_REG_IGNORE. + * + * -EINVAL in case of error. + */ +static int resolve_default_seg(struct insn *insn, struct pt_regs *regs, int off) +{ + if (user_64bit_mode(regs)) + return INAT_SEG_REG_IGNORE; + /* + * Resolve the default segment register as described in Section 3.7.4 + * of the Intel Software Development Manual Vol. 1: + * + * + DS for all references involving r[ABCD]X, and rSI. + * + If used in a string instruction, ES for rDI. Otherwise, DS. + * + AX, CX and DX are not valid register operands in 16-bit address + * encodings but are valid for 32-bit and 64-bit encodings. + * + -EDOM is reserved to identify for cases in which no register + * is used (i.e., displacement-only addressing). Use DS. + * + SS for rSP or rBP. + * + CS for rIP. + */ + + switch (off) { + case offsetof(struct pt_regs, ax): + case offsetof(struct pt_regs, cx): + case offsetof(struct pt_regs, dx): + /* Need insn to verify address size. */ + if (insn->addr_bytes == 2) + return -EINVAL; + + case -EDOM: + case offsetof(struct pt_regs, bx): + case offsetof(struct pt_regs, si): + return INAT_SEG_REG_DS; + + case offsetof(struct pt_regs, di): + if (is_string_insn(insn)) + return INAT_SEG_REG_ES; + return INAT_SEG_REG_DS; + + case offsetof(struct pt_regs, bp): + case offsetof(struct pt_regs, sp): + return INAT_SEG_REG_SS; + + case offsetof(struct pt_regs, ip): + return INAT_SEG_REG_CS; + + default: + return -EINVAL; + } +} + +/** + * resolve_seg_reg() - obtain segment register index + * @insn: Instruction with operands + * @regs: Register values as seen when entering kernel mode + * @regoff: Operand offset, in pt_regs, used to deterimine segment register + * + * Determine the segment register associated with the operands and, if + * applicable, prefixes and the instruction pointed by @insn. + * + * The segment register associated to an operand used in register-indirect + * addressing depends on: + * + * a) Whether running in long mode (in such a case segments are ignored, except + * if FS or GS are used). + * + * b) Whether segment override prefixes can be used. Certain instructions and + * registers do not allow override prefixes. + * + * c) Whether segment overrides prefixes are found in the instruction prefixes. + * + * d) If there are not segment override prefixes or they cannot be used, the + * default segment register associated with the operand register is used. + * + * The function checks first if segment override prefixes can be used with the + * operand indicated by @regoff. If allowed, obtain such overridden segment + * register index. Lastly, if not prefixes were found or cannot be used, resolve + * the segment register index to use based on the defaults described in the + * Intel documentation. In long mode, all segment register indexes will be + * ignored, except if overrides were found for FS or GS. All these operations + * are done using helper functions. + * + * The operand register, @regoff, is represented as the offset from the base of + * pt_regs. + * + * As stated, the main use of this function is to determine the segment register + * index based on the instruction, its operands and prefixes. Hence, @insn + * must be valid. However, if @regoff indicates rIP, we don't need to inspect + * @insn at all as in this case CS is used in all cases. This case is checked + * before proceeding further. + * + * Please note that this function does not return the value in the segment + * register (i.e., the segment selector) but our defined index. The segment + * selector needs to be obtained using get_segment_selector() and passing the + * segment register index resolved by this function. + * + * Returns: + * + * An index identifying the segment register to use, among CS, SS, DS, + * ES, FS, or GS. INAT_SEG_REG_IGNORE is returned if running in long mode. + * + * -EINVAL in case of error. + */ +static int resolve_seg_reg(struct insn *insn, struct pt_regs *regs, int regoff) +{ + int idx; + + /* + * In the unlikely event of having to resolve the segment register + * index for rIP, do it first. Segment override prefixes should not + * be used. Hence, it is not necessary to inspect the instruction, + * which may be invalid at this point. + */ + if (regoff == offsetof(struct pt_regs, ip)) { + if (user_64bit_mode(regs)) + return INAT_SEG_REG_IGNORE; + else + return INAT_SEG_REG_CS; + } + + if (!insn) + return -EINVAL; + + if (!check_seg_overrides(insn, regoff)) + return resolve_default_seg(insn, regs, regoff); + + idx = get_seg_reg_override_idx(insn); + if (idx < 0) + return idx; + + if (idx == INAT_SEG_REG_DEFAULT) + return resolve_default_seg(insn, regs, regoff); + + /* + * In long mode, segment override prefixes are ignored, except for + * overrides for FS and GS. + */ + if (user_64bit_mode(regs)) { + if (idx != INAT_SEG_REG_FS && + idx != INAT_SEG_REG_GS) + idx = INAT_SEG_REG_IGNORE; + } + + return idx; +} + +/** + * get_segment_selector() - obtain segment selector + * @regs: Register values as seen when entering kernel mode + * @seg_reg_idx: Segment register index to use + * + * Obtain the segment selector from any of the CS, SS, DS, ES, FS, GS segment + * registers. In CONFIG_X86_32, the segment is obtained from either pt_regs or + * kernel_vm86_regs as applicable. In CONFIG_X86_64, CS and SS are obtained + * from pt_regs. DS, ES, FS and GS are obtained by reading the actual CPU + * registers. This done for only for completeness as in CONFIG_X86_64 segment + * registers are ignored. + * + * Returns: + * + * Value of the segment selector, including null when running in + * long mode. + * + * -EINVAL on error. + */ +static short get_segment_selector(struct pt_regs *regs, int seg_reg_idx) +{ +#ifdef CONFIG_X86_64 + unsigned short sel; + + switch (seg_reg_idx) { + case INAT_SEG_REG_IGNORE: + return 0; + case INAT_SEG_REG_CS: + return (unsigned short)(regs->cs & 0xffff); + case INAT_SEG_REG_SS: + return (unsigned short)(regs->ss & 0xffff); + case INAT_SEG_REG_DS: + savesegment(ds, sel); + return sel; + case INAT_SEG_REG_ES: + savesegment(es, sel); + return sel; + case INAT_SEG_REG_FS: + savesegment(fs, sel); + return sel; + case INAT_SEG_REG_GS: + savesegment(gs, sel); + return sel; + default: + return -EINVAL; + } +#else /* CONFIG_X86_32 */ + struct kernel_vm86_regs *vm86regs = (struct kernel_vm86_regs *)regs; + + if (v8086_mode(regs)) { + switch (seg_reg_idx) { + case INAT_SEG_REG_CS: + return (unsigned short)(regs->cs & 0xffff); + case INAT_SEG_REG_SS: + return (unsigned short)(regs->ss & 0xffff); + case INAT_SEG_REG_DS: + return vm86regs->ds; + case INAT_SEG_REG_ES: + return vm86regs->es; + case INAT_SEG_REG_FS: + return vm86regs->fs; + case INAT_SEG_REG_GS: + return vm86regs->gs; + case INAT_SEG_REG_IGNORE: + /* fall through */ + default: + return -EINVAL; + } + } + + switch (seg_reg_idx) { + case INAT_SEG_REG_CS: + return (unsigned short)(regs->cs & 0xffff); + case INAT_SEG_REG_SS: + return (unsigned short)(regs->ss & 0xffff); + case INAT_SEG_REG_DS: + return (unsigned short)(regs->ds & 0xffff); + case INAT_SEG_REG_ES: + return (unsigned short)(regs->es & 0xffff); + case INAT_SEG_REG_FS: + return (unsigned short)(regs->fs & 0xffff); + case INAT_SEG_REG_GS: + /* + * GS may or may not be in regs as per CONFIG_X86_32_LAZY_GS. + * The macro below takes care of both cases. + */ + return get_user_gs(regs); + case INAT_SEG_REG_IGNORE: + /* fall through */ + default: + return -EINVAL; + } +#endif /* CONFIG_X86_64 */ +} + +static int get_reg_offset(struct insn *insn, struct pt_regs *regs, + enum reg_type type) +{ + int regno = 0; + + static const int regoff[] = { + offsetof(struct pt_regs, ax), + offsetof(struct pt_regs, cx), + offsetof(struct pt_regs, dx), + offsetof(struct pt_regs, bx), + offsetof(struct pt_regs, sp), + offsetof(struct pt_regs, bp), + offsetof(struct pt_regs, si), + offsetof(struct pt_regs, di), +#ifdef CONFIG_X86_64 + offsetof(struct pt_regs, r8), + offsetof(struct pt_regs, r9), + offsetof(struct pt_regs, r10), + offsetof(struct pt_regs, r11), + offsetof(struct pt_regs, r12), + offsetof(struct pt_regs, r13), + offsetof(struct pt_regs, r14), + offsetof(struct pt_regs, r15), +#endif + }; + int nr_registers = ARRAY_SIZE(regoff); + /* + * Don't possibly decode a 32-bit instructions as + * reading a 64-bit-only register. + */ + if (IS_ENABLED(CONFIG_X86_64) && !insn->x86_64) + nr_registers -= 8; + + switch (type) { + case REG_TYPE_RM: + regno = X86_MODRM_RM(insn->modrm.value); + + /* + * ModRM.mod == 0 and ModRM.rm == 5 means a 32-bit displacement + * follows the ModRM byte. + */ + if (!X86_MODRM_MOD(insn->modrm.value) && regno == 5) + return -EDOM; + + if (X86_REX_B(insn->rex_prefix.value)) + regno += 8; + break; + + case REG_TYPE_INDEX: + regno = X86_SIB_INDEX(insn->sib.value); + if (X86_REX_X(insn->rex_prefix.value)) + regno += 8; + + /* + * If ModRM.mod != 3 and SIB.index = 4 the scale*index + * portion of the address computation is null. This is + * true only if REX.X is 0. In such a case, the SIB index + * is used in the address computation. + */ + if (X86_MODRM_MOD(insn->modrm.value) != 3 && regno == 4) + return -EDOM; + break; + + case REG_TYPE_BASE: + regno = X86_SIB_BASE(insn->sib.value); + /* + * If ModRM.mod is 0 and SIB.base == 5, the base of the + * register-indirect addressing is 0. In this case, a + * 32-bit displacement follows the SIB byte. + */ + if (!X86_MODRM_MOD(insn->modrm.value) && regno == 5) + return -EDOM; + + if (X86_REX_B(insn->rex_prefix.value)) + regno += 8; + break; + + default: + pr_err_ratelimited("invalid register type: %d\n", type); + return -EINVAL; + } + + if (regno >= nr_registers) { + WARN_ONCE(1, "decoded an instruction with an invalid register"); + return -EINVAL; + } + return regoff[regno]; +} + +/** + * get_reg_offset_16() - Obtain offset of register indicated by instruction + * @insn: Instruction containing ModRM byte + * @regs: Register values as seen when entering kernel mode + * @offs1: Offset of the first operand register + * @offs2: Offset of the second opeand register, if applicable + * + * Obtain the offset, in pt_regs, of the registers indicated by the ModRM byte + * in @insn. This function is to be used with 16-bit address encodings. The + * @offs1 and @offs2 will be written with the offset of the two registers + * indicated by the instruction. In cases where any of the registers is not + * referenced by the instruction, the value will be set to -EDOM. + * + * Returns: + * + * 0 on success, -EINVAL on error. + */ +static int get_reg_offset_16(struct insn *insn, struct pt_regs *regs, + int *offs1, int *offs2) +{ + /* + * 16-bit addressing can use one or two registers. Specifics of + * encodings are given in Table 2-1. "16-Bit Addressing Forms with the + * ModR/M Byte" of the Intel Software Development Manual. + */ + static const int regoff1[] = { + offsetof(struct pt_regs, bx), + offsetof(struct pt_regs, bx), + offsetof(struct pt_regs, bp), + offsetof(struct pt_regs, bp), + offsetof(struct pt_regs, si), + offsetof(struct pt_regs, di), + offsetof(struct pt_regs, bp), + offsetof(struct pt_regs, bx), + }; + + static const int regoff2[] = { + offsetof(struct pt_regs, si), + offsetof(struct pt_regs, di), + offsetof(struct pt_regs, si), + offsetof(struct pt_regs, di), + -EDOM, + -EDOM, + -EDOM, + -EDOM, + }; + + if (!offs1 || !offs2) + return -EINVAL; + + /* Operand is a register, use the generic function. */ + if (X86_MODRM_MOD(insn->modrm.value) == 3) { + *offs1 = insn_get_modrm_rm_off(insn, regs); + *offs2 = -EDOM; + return 0; + } + + *offs1 = regoff1[X86_MODRM_RM(insn->modrm.value)]; + *offs2 = regoff2[X86_MODRM_RM(insn->modrm.value)]; + + /* + * If ModRM.mod is 0 and ModRM.rm is 110b, then we use displacement- + * only addressing. This means that no registers are involved in + * computing the effective address. Thus, ensure that the first + * register offset is invalild. The second register offset is already + * invalid under the aforementioned conditions. + */ + if ((X86_MODRM_MOD(insn->modrm.value) == 0) && + (X86_MODRM_RM(insn->modrm.value) == 6)) + *offs1 = -EDOM; + + return 0; +} + +/** + * get_desc() - Obtain pointer to a segment descriptor + * @sel: Segment selector + * + * Given a segment selector, obtain a pointer to the segment descriptor. + * Both global and local descriptor tables are supported. + * + * Returns: + * + * Pointer to segment descriptor on success. + * + * NULL on error. + */ +static struct desc_struct *get_desc(unsigned short sel) +{ + struct desc_ptr gdt_desc = {0, 0}; + unsigned long desc_base; + +#ifdef CONFIG_MODIFY_LDT_SYSCALL + if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) { + struct desc_struct *desc = NULL; + struct ldt_struct *ldt; + + /* Bits [15:3] contain the index of the desired entry. */ + sel >>= 3; + + mutex_lock(¤t->active_mm->context.lock); + ldt = current->active_mm->context.ldt; + if (ldt && sel < ldt->nr_entries) + desc = &ldt->entries[sel]; + + mutex_unlock(¤t->active_mm->context.lock); + + return desc; + } +#endif + native_store_gdt(&gdt_desc); + + /* + * Segment descriptors have a size of 8 bytes. Thus, the index is + * multiplied by 8 to obtain the memory offset of the desired descriptor + * from the base of the GDT. As bits [15:3] of the segment selector + * contain the index, it can be regarded as multiplied by 8 already. + * All that remains is to clear bits [2:0]. + */ + desc_base = sel & ~(SEGMENT_RPL_MASK | SEGMENT_TI_MASK); + + if (desc_base > gdt_desc.size) + return NULL; + + return (struct desc_struct *)(gdt_desc.address + desc_base); +} + +/** + * insn_get_seg_base() - Obtain base address of segment descriptor. + * @regs: Register values as seen when entering kernel mode + * @seg_reg_idx: Index of the segment register pointing to seg descriptor + * + * Obtain the base address of the segment as indicated by the segment descriptor + * pointed by the segment selector. The segment selector is obtained from the + * input segment register index @seg_reg_idx. + * + * Returns: + * + * In protected mode, base address of the segment. Zero in long mode, + * except when FS or GS are used. In virtual-8086 mode, the segment + * selector shifted 4 bits to the right. + * + * -1L in case of error. + */ +unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx) +{ + struct desc_struct *desc; + short sel; + + sel = get_segment_selector(regs, seg_reg_idx); + if (sel < 0) + return -1L; + + if (v8086_mode(regs)) + /* + * Base is simply the segment selector shifted 4 + * bits to the right. + */ + return (unsigned long)(sel << 4); + + if (user_64bit_mode(regs)) { + /* + * Only FS or GS will have a base address, the rest of + * the segments' bases are forced to 0. + */ + unsigned long base; + + if (seg_reg_idx == INAT_SEG_REG_FS) + rdmsrl(MSR_FS_BASE, base); + else if (seg_reg_idx == INAT_SEG_REG_GS) + /* + * swapgs was called at the kernel entry point. Thus, + * MSR_KERNEL_GS_BASE will have the user-space GS base. + */ + rdmsrl(MSR_KERNEL_GS_BASE, base); + else + base = 0; + return base; + } + + /* In protected mode the segment selector cannot be null. */ + if (!sel) + return -1L; + + desc = get_desc(sel); + if (!desc) + return -1L; + + return get_desc_base(desc); +} + +/** + * get_seg_limit() - Obtain the limit of a segment descriptor + * @regs: Register values as seen when entering kernel mode + * @seg_reg_idx: Index of the segment register pointing to seg descriptor + * + * Obtain the limit of the segment as indicated by the segment descriptor + * pointed by the segment selector. The segment selector is obtained from the + * input segment register index @seg_reg_idx. + * + * Returns: + * + * In protected mode, the limit of the segment descriptor in bytes. + * In long mode and virtual-8086 mode, segment limits are not enforced. Thus, + * limit is returned as -1L to imply a limit-less segment. + * + * Zero is returned on error. + */ +static unsigned long get_seg_limit(struct pt_regs *regs, int seg_reg_idx) +{ + struct desc_struct *desc; + unsigned long limit; + short sel; + + sel = get_segment_selector(regs, seg_reg_idx); + if (sel < 0) + return 0; + + if (user_64bit_mode(regs) || v8086_mode(regs)) + return -1L; + + if (!sel) + return 0; + + desc = get_desc(sel); + if (!desc) + return 0; + + /* + * If the granularity bit is set, the limit is given in multiples + * of 4096. This also means that the 12 least significant bits are + * not tested when checking the segment limits. In practice, + * this means that the segment ends in (limit << 12) + 0xfff. + */ + limit = get_desc_limit(desc); + if (desc->g) + limit = (limit << 12) + 0xfff; + + return limit; +} + +/** + * insn_get_code_seg_params() - Obtain code segment parameters + * @regs: Structure with register values as seen when entering kernel mode + * + * Obtain address and operand sizes of the code segment. It is obtained from the + * selector contained in the CS register in regs. In protected mode, the default + * address is determined by inspecting the L and D bits of the segment + * descriptor. In virtual-8086 mode, the default is always two bytes for both + * address and operand sizes. + * + * Returns: + * + * A signed 8-bit value containing the default parameters on success. + * + * -EINVAL on error. + */ +char insn_get_code_seg_params(struct pt_regs *regs) +{ + struct desc_struct *desc; + short sel; + + if (v8086_mode(regs)) + /* Address and operand size are both 16-bit. */ + return INSN_CODE_SEG_PARAMS(2, 2); + + sel = get_segment_selector(regs, INAT_SEG_REG_CS); + if (sel < 0) + return sel; + + desc = get_desc(sel); + if (!desc) + return -EINVAL; + + /* + * The most significant byte of the Type field of the segment descriptor + * determines whether a segment contains data or code. If this is a data + * segment, return error. + */ + if (!(desc->type & BIT(3))) + return -EINVAL; + + switch ((desc->l << 1) | desc->d) { + case 0: /* + * Legacy mode. CS.L=0, CS.D=0. Address and operand size are + * both 16-bit. + */ + return INSN_CODE_SEG_PARAMS(2, 2); + case 1: /* + * Legacy mode. CS.L=0, CS.D=1. Address and operand size are + * both 32-bit. + */ + return INSN_CODE_SEG_PARAMS(4, 4); + case 2: /* + * IA-32e 64-bit mode. CS.L=1, CS.D=0. Address size is 64-bit; + * operand size is 32-bit. + */ + return INSN_CODE_SEG_PARAMS(4, 8); + case 3: /* Invalid setting. CS.L=1, CS.D=1 */ + /* fall through */ + default: + return -EINVAL; + } +} + +/** + * insn_get_modrm_rm_off() - Obtain register in r/m part of the ModRM byte + * @insn: Instruction containing the ModRM byte + * @regs: Register values as seen when entering kernel mode + * + * Returns: + * + * The register indicated by the r/m part of the ModRM byte. The + * register is obtained as an offset from the base of pt_regs. In specific + * cases, the returned value can be -EDOM to indicate that the particular value + * of ModRM does not refer to a register and shall be ignored. + */ +int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs) +{ + return get_reg_offset(insn, regs, REG_TYPE_RM); +} + +/** + * get_seg_base_limit() - obtain base address and limit of a segment + * @insn: Instruction. Must be valid. + * @regs: Register values as seen when entering kernel mode + * @regoff: Operand offset, in pt_regs, used to resolve segment descriptor + * @base: Obtained segment base + * @limit: Obtained segment limit + * + * Obtain the base address and limit of the segment associated with the operand + * @regoff and, if any or allowed, override prefixes in @insn. This function is + * different from insn_get_seg_base() as the latter does not resolve the segment + * associated with the instruction operand. If a limit is not needed (e.g., + * when running in long mode), @limit can be NULL. + * + * Returns: + * + * 0 on success. @base and @limit will contain the base address and of the + * resolved segment, respectively. + * + * -EINVAL on error. + */ +static int get_seg_base_limit(struct insn *insn, struct pt_regs *regs, + int regoff, unsigned long *base, + unsigned long *limit) +{ + int seg_reg_idx; + + if (!base) + return -EINVAL; + + seg_reg_idx = resolve_seg_reg(insn, regs, regoff); + if (seg_reg_idx < 0) + return seg_reg_idx; + + *base = insn_get_seg_base(regs, seg_reg_idx); + if (*base == -1L) + return -EINVAL; + + if (!limit) + return 0; + + *limit = get_seg_limit(regs, seg_reg_idx); + if (!(*limit)) + return -EINVAL; + + return 0; +} + +/** + * get_eff_addr_reg() - Obtain effective address from register operand + * @insn: Instruction. Must be valid. + * @regs: Register values as seen when entering kernel mode + * @regoff: Obtained operand offset, in pt_regs, with the effective address + * @eff_addr: Obtained effective address + * + * Obtain the effective address stored in the register operand as indicated by + * the ModRM byte. This function is to be used only with register addressing + * (i.e., ModRM.mod is 3). The effective address is saved in @eff_addr. The + * register operand, as an offset from the base of pt_regs, is saved in @regoff; + * such offset can then be used to resolve the segment associated with the + * operand. This function can be used with any of the supported address sizes + * in x86. + * + * Returns: + * + * 0 on success. @eff_addr will have the effective address stored in the + * operand indicated by ModRM. @regoff will have such operand as an offset from + * the base of pt_regs. + * + * -EINVAL on error. + */ +static int get_eff_addr_reg(struct insn *insn, struct pt_regs *regs, + int *regoff, long *eff_addr) +{ + insn_get_modrm(insn); + + if (!insn->modrm.nbytes) + return -EINVAL; + + if (X86_MODRM_MOD(insn->modrm.value) != 3) + return -EINVAL; + + *regoff = get_reg_offset(insn, regs, REG_TYPE_RM); + if (*regoff < 0) + return -EINVAL; + + /* Ignore bytes that are outside the address size. */ + if (insn->addr_bytes == 2) + *eff_addr = regs_get_register(regs, *regoff) & 0xffff; + else if (insn->addr_bytes == 4) + *eff_addr = regs_get_register(regs, *regoff) & 0xffffffff; + else /* 64-bit address */ + *eff_addr = regs_get_register(regs, *regoff); + + return 0; +} + +/** + * get_eff_addr_modrm() - Obtain referenced effective address via ModRM + * @insn: Instruction. Must be valid. + * @regs: Register values as seen when entering kernel mode + * @regoff: Obtained operand offset, in pt_regs, associated with segment + * @eff_addr: Obtained effective address + * + * Obtain the effective address referenced by the ModRM byte of @insn. After + * identifying the registers involved in the register-indirect memory reference, + * its value is obtained from the operands in @regs. The computed address is + * stored @eff_addr. Also, the register operand that indicates the associated + * segment is stored in @regoff, this parameter can later be used to determine + * such segment. + * + * Returns: + * + * 0 on success. @eff_addr will have the referenced effective address. @regoff + * will have a register, as an offset from the base of pt_regs, that can be used + * to resolve the associated segment. + * + * -EINVAL on error. + */ +static int get_eff_addr_modrm(struct insn *insn, struct pt_regs *regs, + int *regoff, long *eff_addr) +{ + long tmp; + + if (insn->addr_bytes != 8 && insn->addr_bytes != 4) + return -EINVAL; + + insn_get_modrm(insn); + + if (!insn->modrm.nbytes) + return -EINVAL; + + if (X86_MODRM_MOD(insn->modrm.value) > 2) + return -EINVAL; + + *regoff = get_reg_offset(insn, regs, REG_TYPE_RM); + + /* + * -EDOM means that we must ignore the address_offset. In such a case, + * in 64-bit mode the effective address relative to the rIP of the + * following instruction. + */ + if (*regoff == -EDOM) { + if (user_64bit_mode(regs)) + tmp = regs->ip + insn->length; + else + tmp = 0; + } else if (*regoff < 0) { + return -EINVAL; + } else { + tmp = regs_get_register(regs, *regoff); + } + + if (insn->addr_bytes == 4) { + int addr32 = (int)(tmp & 0xffffffff) + insn->displacement.value; + + *eff_addr = addr32 & 0xffffffff; + } else { + *eff_addr = tmp + insn->displacement.value; + } + + return 0; +} + +/** + * get_eff_addr_modrm_16() - Obtain referenced effective address via ModRM + * @insn: Instruction. Must be valid. + * @regs: Register values as seen when entering kernel mode + * @regoff: Obtained operand offset, in pt_regs, associated with segment + * @eff_addr: Obtained effective address + * + * Obtain the 16-bit effective address referenced by the ModRM byte of @insn. + * After identifying the registers involved in the register-indirect memory + * reference, its value is obtained from the operands in @regs. The computed + * address is stored @eff_addr. Also, the register operand that indicates + * the associated segment is stored in @regoff, this parameter can later be used + * to determine such segment. + * + * Returns: + * + * 0 on success. @eff_addr will have the referenced effective address. @regoff + * will have a register, as an offset from the base of pt_regs, that can be used + * to resolve the associated segment. + * + * -EINVAL on error. + */ +static int get_eff_addr_modrm_16(struct insn *insn, struct pt_regs *regs, + int *regoff, short *eff_addr) +{ + int addr_offset1, addr_offset2, ret; + short addr1 = 0, addr2 = 0, displacement; + + if (insn->addr_bytes != 2) + return -EINVAL; + + insn_get_modrm(insn); + + if (!insn->modrm.nbytes) + return -EINVAL; + + if (X86_MODRM_MOD(insn->modrm.value) > 2) + return -EINVAL; + + ret = get_reg_offset_16(insn, regs, &addr_offset1, &addr_offset2); + if (ret < 0) + return -EINVAL; + + /* + * Don't fail on invalid offset values. They might be invalid because + * they cannot be used for this particular value of ModRM. Instead, use + * them in the computation only if they contain a valid value. + */ + if (addr_offset1 != -EDOM) + addr1 = regs_get_register(regs, addr_offset1) & 0xffff; + + if (addr_offset2 != -EDOM) + addr2 = regs_get_register(regs, addr_offset2) & 0xffff; + + displacement = insn->displacement.value & 0xffff; + *eff_addr = addr1 + addr2 + displacement; + + /* + * The first operand register could indicate to use of either SS or DS + * registers to obtain the segment selector. The second operand + * register can only indicate the use of DS. Thus, the first operand + * will be used to obtain the segment selector. + */ + *regoff = addr_offset1; + + return 0; +} + +/** + * get_eff_addr_sib() - Obtain referenced effective address via SIB + * @insn: Instruction. Must be valid. + * @regs: Register values as seen when entering kernel mode + * @regoff: Obtained operand offset, in pt_regs, associated with segment + * @eff_addr: Obtained effective address + * + * Obtain the effective address referenced by the SIB byte of @insn. After + * identifying the registers involved in the indexed, register-indirect memory + * reference, its value is obtained from the operands in @regs. The computed + * address is stored @eff_addr. Also, the register operand that indicates the + * associated segment is stored in @regoff, this parameter can later be used to + * determine such segment. + * + * Returns: + * + * 0 on success. @eff_addr will have the referenced effective address. + * @base_offset will have a register, as an offset from the base of pt_regs, + * that can be used to resolve the associated segment. + * + * -EINVAL on error. + */ +static int get_eff_addr_sib(struct insn *insn, struct pt_regs *regs, + int *base_offset, long *eff_addr) +{ + long base, indx; + int indx_offset; + + if (insn->addr_bytes != 8 && insn->addr_bytes != 4) + return -EINVAL; + + insn_get_modrm(insn); + + if (!insn->modrm.nbytes) + return -EINVAL; + + if (X86_MODRM_MOD(insn->modrm.value) > 2) + return -EINVAL; + + insn_get_sib(insn); + + if (!insn->sib.nbytes) + return -EINVAL; + + *base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE); + indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX); + + /* + * Negative values in the base and index offset means an error when + * decoding the SIB byte. Except -EDOM, which means that the registers + * should not be used in the address computation. + */ + if (*base_offset == -EDOM) + base = 0; + else if (*base_offset < 0) + return -EINVAL; + else + base = regs_get_register(regs, *base_offset); + + if (indx_offset == -EDOM) + indx = 0; + else if (indx_offset < 0) + return -EINVAL; + else + indx = regs_get_register(regs, indx_offset); + + if (insn->addr_bytes == 4) { + int addr32, base32, idx32; + + base32 = base & 0xffffffff; + idx32 = indx & 0xffffffff; + + addr32 = base32 + idx32 * (1 << X86_SIB_SCALE(insn->sib.value)); + addr32 += insn->displacement.value; + + *eff_addr = addr32 & 0xffffffff; + } else { + *eff_addr = base + indx * (1 << X86_SIB_SCALE(insn->sib.value)); + *eff_addr += insn->displacement.value; + } + + return 0; +} + +/** + * get_addr_ref_16() - Obtain the 16-bit address referred by instruction + * @insn: Instruction containing ModRM byte and displacement + * @regs: Register values as seen when entering kernel mode + * + * This function is to be used with 16-bit address encodings. Obtain the memory + * address referred by the instruction's ModRM and displacement bytes. Also, the + * segment used as base is determined by either any segment override prefixes in + * @insn or the default segment of the registers involved in the address + * computation. In protected mode, segment limits are enforced. + * + * Returns: + * + * Linear address referenced by the instruction operands on success. + * + * -1L on error. + */ +static void __user *get_addr_ref_16(struct insn *insn, struct pt_regs *regs) +{ + unsigned long linear_addr = -1L, seg_base, seg_limit; + int ret, regoff; + short eff_addr; + long tmp; + + insn_get_modrm(insn); + insn_get_displacement(insn); + + if (insn->addr_bytes != 2) + goto out; + + if (X86_MODRM_MOD(insn->modrm.value) == 3) { + ret = get_eff_addr_reg(insn, regs, ®off, &tmp); + if (ret) + goto out; + + eff_addr = tmp; + } else { + ret = get_eff_addr_modrm_16(insn, regs, ®off, &eff_addr); + if (ret) + goto out; + } + + ret = get_seg_base_limit(insn, regs, regoff, &seg_base, &seg_limit); + if (ret) + goto out; + + /* + * Before computing the linear address, make sure the effective address + * is within the limits of the segment. In virtual-8086 mode, segment + * limits are not enforced. In such a case, the segment limit is -1L to + * reflect this fact. + */ + if ((unsigned long)(eff_addr & 0xffff) > seg_limit) + goto out; + + linear_addr = (unsigned long)(eff_addr & 0xffff) + seg_base; + + /* Limit linear address to 20 bits */ + if (v8086_mode(regs)) + linear_addr &= 0xfffff; + +out: + return (void __user *)linear_addr; +} + +/** + * get_addr_ref_32() - Obtain a 32-bit linear address + * @insn: Instruction with ModRM, SIB bytes and displacement + * @regs: Register values as seen when entering kernel mode + * + * This function is to be used with 32-bit address encodings to obtain the + * linear memory address referred by the instruction's ModRM, SIB, + * displacement bytes and segment base address, as applicable. If in protected + * mode, segment limits are enforced. + * + * Returns: + * + * Linear address referenced by instruction and registers on success. + * + * -1L on error. + */ +static void __user *get_addr_ref_32(struct insn *insn, struct pt_regs *regs) +{ + unsigned long linear_addr = -1L, seg_base, seg_limit; + int eff_addr, regoff; + long tmp; + int ret; + + if (insn->addr_bytes != 4) + goto out; + + if (X86_MODRM_MOD(insn->modrm.value) == 3) { + ret = get_eff_addr_reg(insn, regs, ®off, &tmp); + if (ret) + goto out; + + eff_addr = tmp; + + } else { + if (insn->sib.nbytes) { + ret = get_eff_addr_sib(insn, regs, ®off, &tmp); + if (ret) + goto out; + + eff_addr = tmp; + } else { + ret = get_eff_addr_modrm(insn, regs, ®off, &tmp); + if (ret) + goto out; + + eff_addr = tmp; + } + } + + ret = get_seg_base_limit(insn, regs, regoff, &seg_base, &seg_limit); + if (ret) + goto out; + + /* + * In protected mode, before computing the linear address, make sure + * the effective address is within the limits of the segment. + * 32-bit addresses can be used in long and virtual-8086 modes if an + * address override prefix is used. In such cases, segment limits are + * not enforced. When in virtual-8086 mode, the segment limit is -1L + * to reflect this situation. + * + * After computed, the effective address is treated as an unsigned + * quantity. + */ + if (!user_64bit_mode(regs) && ((unsigned int)eff_addr > seg_limit)) + goto out; + + /* + * Even though 32-bit address encodings are allowed in virtual-8086 + * mode, the address range is still limited to [0x-0xffff]. + */ + if (v8086_mode(regs) && (eff_addr & ~0xffff)) + goto out; + + /* + * Data type long could be 64 bits in size. Ensure that our 32-bit + * effective address is not sign-extended when computing the linear + * address. + */ + linear_addr = (unsigned long)(eff_addr & 0xffffffff) + seg_base; + + /* Limit linear address to 20 bits */ + if (v8086_mode(regs)) + linear_addr &= 0xfffff; + +out: + return (void __user *)linear_addr; +} + +/** + * get_addr_ref_64() - Obtain a 64-bit linear address + * @insn: Instruction struct with ModRM and SIB bytes and displacement + * @regs: Structure with register values as seen when entering kernel mode + * + * This function is to be used with 64-bit address encodings to obtain the + * linear memory address referred by the instruction's ModRM, SIB, + * displacement bytes and segment base address, as applicable. + * + * Returns: + * + * Linear address referenced by instruction and registers on success. + * + * -1L on error. + */ +#ifndef CONFIG_X86_64 +static void __user *get_addr_ref_64(struct insn *insn, struct pt_regs *regs) +{ + return (void __user *)-1L; +} +#else +static void __user *get_addr_ref_64(struct insn *insn, struct pt_regs *regs) +{ + unsigned long linear_addr = -1L, seg_base; + int regoff, ret; + long eff_addr; + + if (insn->addr_bytes != 8) + goto out; + + if (X86_MODRM_MOD(insn->modrm.value) == 3) { + ret = get_eff_addr_reg(insn, regs, ®off, &eff_addr); + if (ret) + goto out; + + } else { + if (insn->sib.nbytes) { + ret = get_eff_addr_sib(insn, regs, ®off, &eff_addr); + if (ret) + goto out; + } else { + ret = get_eff_addr_modrm(insn, regs, ®off, &eff_addr); + if (ret) + goto out; + } + + } + + ret = get_seg_base_limit(insn, regs, regoff, &seg_base, NULL); + if (ret) + goto out; + + linear_addr = (unsigned long)eff_addr + seg_base; + +out: + return (void __user *)linear_addr; +} +#endif /* CONFIG_X86_64 */ + +/** + * insn_get_addr_ref() - Obtain the linear address referred by instruction + * @insn: Instruction structure containing ModRM byte and displacement + * @regs: Structure with register values as seen when entering kernel mode + * + * Obtain the linear address referred by the instruction's ModRM, SIB and + * displacement bytes, and segment base, as applicable. In protected mode, + * segment limits are enforced. + * + * Returns: + * + * Linear address referenced by instruction and registers on success. + * + * -1L on error. + */ +void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) +{ + if (!insn || !regs) + return (void __user *)-1L; + + switch (insn->addr_bytes) { + case 2: + return get_addr_ref_16(insn, regs); + case 4: + return get_addr_ref_32(insn, regs); + case 8: + return get_addr_ref_64(insn, regs); + default: + return (void __user *)-1L; + } +} diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index b0ff378650a9..3109ba6c6ede 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -30,26 +30,6 @@ #include <asm/trace/exceptions.h> /* - * Page fault error code bits: - * - * bit 0 == 0: no page found 1: protection fault - * bit 1 == 0: read access 1: write access - * bit 2 == 0: kernel-mode access 1: user-mode access - * bit 3 == 1: use of reserved bit detected - * bit 4 == 1: fault was an instruction fetch - * bit 5 == 1: protection keys block access - */ -enum x86_pf_error_code { - - PF_PROT = 1 << 0, - PF_WRITE = 1 << 1, - PF_USER = 1 << 2, - PF_RSVD = 1 << 3, - PF_INSTR = 1 << 4, - PF_PK = 1 << 5, -}; - -/* * Returns 0 if mmiotrace is disabled, or if the fault is not * handled by mmiotrace: */ @@ -150,7 +130,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) * If it was a exec (instruction fetch) fault on NX page, then * do not ignore the fault: */ - if (error_code & PF_INSTR) + if (error_code & X86_PF_INSTR) return 0; instr = (void *)convert_ip_to_linear(current, regs); @@ -180,7 +160,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) * siginfo so userspace can discover which protection key was set * on the PTE. * - * If we get here, we know that the hardware signaled a PF_PK + * If we get here, we know that the hardware signaled a X86_PF_PK * fault and that there was a VMA once we got in the fault * handler. It does *not* guarantee that the VMA we find here * was the one that we faulted on. @@ -205,7 +185,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey) /* * force_sig_info_fault() is called from a number of * contexts, some of which have a VMA and some of which - * do not. The PF_PK handing happens after we have a + * do not. The X86_PF_PK handing happens after we have a * valid VMA, so we should never reach this without a * valid VMA. */ @@ -698,7 +678,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, if (!oops_may_print()) return; - if (error_code & PF_INSTR) { + if (error_code & X86_PF_INSTR) { unsigned int level; pgd_t *pgd; pte_t *pte; @@ -780,7 +760,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, */ if (current->thread.sig_on_uaccess_err && signal) { tsk->thread.trap_nr = X86_TRAP_PF; - tsk->thread.error_code = error_code | PF_USER; + tsk->thread.error_code = error_code | X86_PF_USER; tsk->thread.cr2 = address; /* XXX: hwpoison faults will set the wrong code. */ @@ -898,7 +878,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, struct task_struct *tsk = current; /* User mode accesses just cause a SIGSEGV */ - if (error_code & PF_USER) { + if (error_code & X86_PF_USER) { /* * It's possible to have interrupts off here: */ @@ -919,7 +899,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, * Instruction fetch faults in the vsyscall page might need * emulation. */ - if (unlikely((error_code & PF_INSTR) && + if (unlikely((error_code & X86_PF_INSTR) && ((address & ~0xfff) == VSYSCALL_ADDR))) { if (emulate_vsyscall(regs, address)) return; @@ -932,7 +912,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, * are always protection faults. */ if (address >= TASK_SIZE_MAX) - error_code |= PF_PROT; + error_code |= X86_PF_PROT; if (likely(show_unhandled_signals)) show_signal_msg(regs, error_code, address, tsk); @@ -993,11 +973,11 @@ static inline bool bad_area_access_from_pkeys(unsigned long error_code, if (!boot_cpu_has(X86_FEATURE_OSPKE)) return false; - if (error_code & PF_PK) + if (error_code & X86_PF_PK) return true; /* this checks permission keys on the VMA: */ - if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), - (error_code & PF_INSTR), foreign)) + if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), + (error_code & X86_PF_INSTR), foreign)) return true; return false; } @@ -1025,7 +1005,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, int code = BUS_ADRERR; /* Kernel mode? Handle exceptions or die: */ - if (!(error_code & PF_USER)) { + if (!(error_code & X86_PF_USER)) { no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); return; } @@ -1053,14 +1033,14 @@ static noinline void mm_fault_error(struct pt_regs *regs, unsigned long error_code, unsigned long address, u32 *pkey, unsigned int fault) { - if (fatal_signal_pending(current) && !(error_code & PF_USER)) { + if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) { no_context(regs, error_code, address, 0, 0); return; } if (fault & VM_FAULT_OOM) { /* Kernel mode? Handle exceptions or die: */ - if (!(error_code & PF_USER)) { + if (!(error_code & X86_PF_USER)) { no_context(regs, error_code, address, SIGSEGV, SEGV_MAPERR); return; @@ -1085,16 +1065,16 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, static int spurious_fault_check(unsigned long error_code, pte_t *pte) { - if ((error_code & PF_WRITE) && !pte_write(*pte)) + if ((error_code & X86_PF_WRITE) && !pte_write(*pte)) return 0; - if ((error_code & PF_INSTR) && !pte_exec(*pte)) + if ((error_code & X86_PF_INSTR) && !pte_exec(*pte)) return 0; /* * Note: We do not do lazy flushing on protection key - * changes, so no spurious fault will ever set PF_PK. + * changes, so no spurious fault will ever set X86_PF_PK. */ - if ((error_code & PF_PK)) + if ((error_code & X86_PF_PK)) return 1; return 1; @@ -1140,8 +1120,8 @@ spurious_fault(unsigned long error_code, unsigned long address) * change, so user accesses are not expected to cause spurious * faults. */ - if (error_code != (PF_WRITE | PF_PROT) - && error_code != (PF_INSTR | PF_PROT)) + if (error_code != (X86_PF_WRITE | X86_PF_PROT) && + error_code != (X86_PF_INSTR | X86_PF_PROT)) return 0; pgd = init_mm.pgd + pgd_index(address); @@ -1201,19 +1181,19 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) * always an unconditional error and can never result in * a follow-up action to resolve the fault, like a COW. */ - if (error_code & PF_PK) + if (error_code & X86_PF_PK) return 1; /* * Make sure to check the VMA so that we do not perform - * faults just to hit a PF_PK as soon as we fill in a + * faults just to hit a X86_PF_PK as soon as we fill in a * page. */ - if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), - (error_code & PF_INSTR), foreign)) + if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), + (error_code & X86_PF_INSTR), foreign)) return 1; - if (error_code & PF_WRITE) { + if (error_code & X86_PF_WRITE) { /* write, present and write, not present: */ if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; @@ -1221,7 +1201,7 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) } /* read, present: */ - if (unlikely(error_code & PF_PROT)) + if (unlikely(error_code & X86_PF_PROT)) return 1; /* read, not present: */ @@ -1244,7 +1224,7 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) if (!static_cpu_has(X86_FEATURE_SMAP)) return false; - if (error_code & PF_USER) + if (error_code & X86_PF_USER) return false; if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC)) @@ -1297,7 +1277,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * protection error (error_code & 9) == 0. */ if (unlikely(fault_in_kernel_space(address))) { - if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) { + if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { if (vmalloc_fault(address) >= 0) return; @@ -1325,7 +1305,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, if (unlikely(kprobes_fault(regs))) return; - if (unlikely(error_code & PF_RSVD)) + if (unlikely(error_code & X86_PF_RSVD)) pgtable_bad(regs, error_code, address); if (unlikely(smap_violation(error_code, regs))) { @@ -1351,7 +1331,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, */ if (user_mode(regs)) { local_irq_enable(); - error_code |= PF_USER; + error_code |= X86_PF_USER; flags |= FAULT_FLAG_USER; } else { if (regs->flags & X86_EFLAGS_IF) @@ -1360,9 +1340,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - if (error_code & PF_WRITE) + if (error_code & X86_PF_WRITE) flags |= FAULT_FLAG_WRITE; - if (error_code & PF_INSTR) + if (error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; /* @@ -1382,7 +1362,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * space check, thus avoiding the deadlock: */ if (unlikely(!down_read_trylock(&mm->mmap_sem))) { - if ((error_code & PF_USER) == 0 && + if (!(error_code & X86_PF_USER) && !search_exception_tables(regs->ip)) { bad_area_nosemaphore(regs, error_code, address, NULL); return; @@ -1409,7 +1389,7 @@ retry: bad_area(regs, error_code, address); return; } - if (error_code & PF_USER) { + if (error_code & X86_PF_USER) { /* * Accessing the stack below %sp is always a bug. * The large cushion allows instructions like enter diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 048fbe8fc274..adcea90a2046 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1426,16 +1426,16 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE) void register_page_bootmem_memmap(unsigned long section_nr, - struct page *start_page, unsigned long size) + struct page *start_page, unsigned long nr_pages) { unsigned long addr = (unsigned long)start_page; - unsigned long end = (unsigned long)(start_page + size); + unsigned long end = (unsigned long)(start_page + nr_pages); unsigned long next; pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; - unsigned int nr_pages; + unsigned int nr_pmd_pages; struct page *page; for (; addr < end; addr = next) { @@ -1482,9 +1482,9 @@ void register_page_bootmem_memmap(unsigned long section_nr, if (pmd_none(*pmd)) continue; - nr_pages = 1 << (get_order(PMD_SIZE)); + nr_pmd_pages = 1 << get_order(PMD_SIZE); page = pmd_page(*pmd); - while (nr_pages--) + while (nr_pmd_pages--) get_page_bootmem(section_nr, page++, SECTION_INFO); } diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 34f0e1847dd6..6e4573b1da34 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -27,6 +27,11 @@ #include "physaddr.h" +struct ioremap_mem_flags { + bool system_ram; + bool desc_other; +}; + /* * Fix up the linear direct mapping of the kernel to avoid cache attribute * conflicts. @@ -56,17 +61,59 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size, return err; } -static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages, - void *arg) +static bool __ioremap_check_ram(struct resource *res) { + unsigned long start_pfn, stop_pfn; unsigned long i; - for (i = 0; i < nr_pages; ++i) - if (pfn_valid(start_pfn + i) && - !PageReserved(pfn_to_page(start_pfn + i))) - return 1; + if ((res->flags & IORESOURCE_SYSTEM_RAM) != IORESOURCE_SYSTEM_RAM) + return false; - return 0; + start_pfn = (res->start + PAGE_SIZE - 1) >> PAGE_SHIFT; + stop_pfn = (res->end + 1) >> PAGE_SHIFT; + if (stop_pfn > start_pfn) { + for (i = 0; i < (stop_pfn - start_pfn); ++i) + if (pfn_valid(start_pfn + i) && + !PageReserved(pfn_to_page(start_pfn + i))) + return true; + } + + return false; +} + +static int __ioremap_check_desc_other(struct resource *res) +{ + return (res->desc != IORES_DESC_NONE); +} + +static int __ioremap_res_check(struct resource *res, void *arg) +{ + struct ioremap_mem_flags *flags = arg; + + if (!flags->system_ram) + flags->system_ram = __ioremap_check_ram(res); + + if (!flags->desc_other) + flags->desc_other = __ioremap_check_desc_other(res); + + return flags->system_ram && flags->desc_other; +} + +/* + * To avoid multiple resource walks, this function walks resources marked as + * IORESOURCE_MEM and IORESOURCE_BUSY and looking for system RAM and/or a + * resource described not as IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES). + */ +static void __ioremap_check_mem(resource_size_t addr, unsigned long size, + struct ioremap_mem_flags *flags) +{ + u64 start, end; + + start = (u64)addr; + end = start + size - 1; + memset(flags, 0, sizeof(*flags)); + + walk_mem_res(start, end, flags, __ioremap_res_check); } /* @@ -87,9 +134,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, unsigned long size, enum page_cache_mode pcm, void *caller) { unsigned long offset, vaddr; - resource_size_t pfn, last_pfn, last_addr; + resource_size_t last_addr; const resource_size_t unaligned_phys_addr = phys_addr; const unsigned long unaligned_size = size; + struct ioremap_mem_flags mem_flags; struct vm_struct *area; enum page_cache_mode new_pcm; pgprot_t prot; @@ -108,13 +156,12 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, return NULL; } + __ioremap_check_mem(phys_addr, size, &mem_flags); + /* * Don't allow anybody to remap normal RAM that we're using.. */ - pfn = phys_addr >> PAGE_SHIFT; - last_pfn = last_addr >> PAGE_SHIFT; - if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL, - __ioremap_check_ram) == 1) { + if (mem_flags.system_ram) { WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n", &phys_addr, &last_addr); return NULL; @@ -146,7 +193,15 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, pcm = new_pcm; } + /* + * If the page being mapped is in memory and SEV is active then + * make sure the memory encryption attribute is enabled in the + * resulting mapping. + */ prot = PAGE_KERNEL_IO; + if (sev_active() && mem_flags.desc_other) + prot = pgprot_encrypted(prot); + switch (pcm) { case _PAGE_CACHE_MODE_UC: default: @@ -422,6 +477,9 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) * areas should be mapped decrypted. And since the encryption key can * change across reboots, persistent memory should also be mapped * decrypted. + * + * If SEV is active, that implies that BIOS/UEFI also ran encrypted so + * only persistent memory should be mapped decrypted. */ static bool memremap_should_map_decrypted(resource_size_t phys_addr, unsigned long size) @@ -458,6 +516,11 @@ static bool memremap_should_map_decrypted(resource_size_t phys_addr, case E820_TYPE_ACPI: case E820_TYPE_NVS: case E820_TYPE_UNUSABLE: + /* For SEV, these areas are encrypted */ + if (sev_active()) + break; + /* Fallthrough */ + case E820_TYPE_PRAM: return true; default: @@ -581,7 +644,7 @@ static bool __init early_memremap_is_setup_data(resource_size_t phys_addr, bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size, unsigned long flags) { - if (!sme_active()) + if (!mem_encrypt_active()) return true; if (flags & MEMREMAP_ENC) @@ -590,12 +653,13 @@ bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size, if (flags & MEMREMAP_DEC) return false; - if (memremap_is_setup_data(phys_addr, size) || - memremap_is_efi_data(phys_addr, size) || - memremap_should_map_decrypted(phys_addr, size)) - return false; + if (sme_active()) { + if (memremap_is_setup_data(phys_addr, size) || + memremap_is_efi_data(phys_addr, size)) + return false; + } - return true; + return !memremap_should_map_decrypted(phys_addr, size); } /* @@ -608,17 +672,24 @@ pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, unsigned long size, pgprot_t prot) { - if (!sme_active()) + bool encrypted_prot; + + if (!mem_encrypt_active()) return prot; - if (early_memremap_is_setup_data(phys_addr, size) || - memremap_is_efi_data(phys_addr, size) || - memremap_should_map_decrypted(phys_addr, size)) - prot = pgprot_decrypted(prot); - else - prot = pgprot_encrypted(prot); + encrypted_prot = true; + + if (sme_active()) { + if (early_memremap_is_setup_data(phys_addr, size) || + memremap_is_efi_data(phys_addr, size)) + encrypted_prot = false; + } + + if (encrypted_prot && memremap_should_map_decrypted(phys_addr, size)) + encrypted_prot = false; - return prot; + return encrypted_prot ? pgprot_encrypted(prot) + : pgprot_decrypted(prot); } bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size) diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 8f5be3eb40dd..2b60dc6e64b1 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -16,6 +16,8 @@ extern struct range pfn_mapped[E820_MAX_ENTRIES]; +static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); + static int __init map_range(struct range *range) { unsigned long start; @@ -31,8 +33,10 @@ static void __init clear_pgds(unsigned long start, unsigned long end) { pgd_t *pgd; + /* See comment in kasan_init() */ + unsigned long pgd_end = end & PGDIR_MASK; - for (; start < end; start += PGDIR_SIZE) { + for (; start < pgd_end; start += PGDIR_SIZE) { pgd = pgd_offset_k(start); /* * With folded p4d, pgd_clear() is nop, use p4d_clear() @@ -43,29 +47,61 @@ static void __init clear_pgds(unsigned long start, else pgd_clear(pgd); } + + pgd = pgd_offset_k(start); + for (; start < end; start += P4D_SIZE) + p4d_clear(p4d_offset(pgd, start)); +} + +static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr) +{ + unsigned long p4d; + + if (!IS_ENABLED(CONFIG_X86_5LEVEL)) + return (p4d_t *)pgd; + + p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK; + p4d += __START_KERNEL_map - phys_base; + return (p4d_t *)p4d + p4d_index(addr); +} + +static void __init kasan_early_p4d_populate(pgd_t *pgd, + unsigned long addr, + unsigned long end) +{ + pgd_t pgd_entry; + p4d_t *p4d, p4d_entry; + unsigned long next; + + if (pgd_none(*pgd)) { + pgd_entry = __pgd(_KERNPG_TABLE | __pa_nodebug(kasan_zero_p4d)); + set_pgd(pgd, pgd_entry); + } + + p4d = early_p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + + if (!p4d_none(*p4d)) + continue; + + p4d_entry = __p4d(_KERNPG_TABLE | __pa_nodebug(kasan_zero_pud)); + set_p4d(p4d, p4d_entry); + } while (p4d++, addr = next, addr != end && p4d_none(*p4d)); } static void __init kasan_map_early_shadow(pgd_t *pgd) { - int i; - unsigned long start = KASAN_SHADOW_START; + /* See comment in kasan_init() */ + unsigned long addr = KASAN_SHADOW_START & PGDIR_MASK; unsigned long end = KASAN_SHADOW_END; + unsigned long next; - for (i = pgd_index(start); start < end; i++) { - switch (CONFIG_PGTABLE_LEVELS) { - case 4: - pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) | - _KERNPG_TABLE); - break; - case 5: - pgd[i] = __pgd(__pa_nodebug(kasan_zero_p4d) | - _KERNPG_TABLE); - break; - default: - BUILD_BUG(); - } - start += PGDIR_SIZE; - } + pgd += pgd_index(addr); + do { + next = pgd_addr_end(addr, end); + kasan_early_p4d_populate(pgd, addr, next); + } while (pgd++, addr = next, addr != end); } #ifdef CONFIG_KASAN_INLINE @@ -102,7 +138,7 @@ void __init kasan_early_init(void) for (i = 0; i < PTRS_PER_PUD; i++) kasan_zero_pud[i] = __pud(pud_val); - for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++) + for (i = 0; IS_ENABLED(CONFIG_X86_5LEVEL) && i < PTRS_PER_P4D; i++) kasan_zero_p4d[i] = __p4d(p4d_val); kasan_map_early_shadow(early_top_pgt); @@ -118,12 +154,35 @@ void __init kasan_init(void) #endif memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt)); + + /* + * We use the same shadow offset for 4- and 5-level paging to + * facilitate boot-time switching between paging modes. + * As result in 5-level paging mode KASAN_SHADOW_START and + * KASAN_SHADOW_END are not aligned to PGD boundary. + * + * KASAN_SHADOW_START doesn't share PGD with anything else. + * We claim whole PGD entry to make things easier. + * + * KASAN_SHADOW_END lands in the last PGD entry and it collides with + * bunch of things like kernel code, modules, EFI mapping, etc. + * We need to take extra steps to not overwrite them. + */ + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + void *ptr; + + ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END)); + memcpy(tmp_p4d_table, (void *)ptr, sizeof(tmp_p4d_table)); + set_pgd(&early_top_pgt[pgd_index(KASAN_SHADOW_END)], + __pgd(__pa(tmp_p4d_table) | _KERNPG_TABLE)); + } + load_cr3(early_top_pgt); __flush_tlb_all(); - clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); + clear_pgds(KASAN_SHADOW_START & PGDIR_MASK, KASAN_SHADOW_END); - kasan_populate_zero_shadow((void *)KASAN_SHADOW_START, + kasan_populate_zero_shadow((void *)(KASAN_SHADOW_START & PGDIR_MASK), kasan_mem_to_shadow((void *)PAGE_OFFSET)); for (i = 0; i < E820_MAX_ENTRIES; i++) { diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 0286327e65fa..d9a9e9fc75dd 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -30,6 +30,8 @@ #include <asm/msr.h> #include <asm/cmdline.h> +#include "mm_internal.h" + static char sme_cmdline_arg[] __initdata = "mem_encrypt"; static char sme_cmdline_on[] __initdata = "on"; static char sme_cmdline_off[] __initdata = "off"; @@ -41,6 +43,10 @@ static char sme_cmdline_off[] __initdata = "off"; */ u64 sme_me_mask __section(.data) = 0; EXPORT_SYMBOL(sme_me_mask); +DEFINE_STATIC_KEY_FALSE(sev_enable_key); +EXPORT_SYMBOL_GPL(sev_enable_key); + +static bool sev_enabled __section(.data); /* Buffer used for early in-place encryption by BSP, no locking needed */ static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE); @@ -63,7 +69,6 @@ static void __init __sme_early_enc_dec(resource_size_t paddr, if (!sme_me_mask) return; - local_flush_tlb(); wbinvd(); /* @@ -190,8 +195,238 @@ void __init sme_early_init(void) /* Update the protection map with memory encryption mask */ for (i = 0; i < ARRAY_SIZE(protection_map); i++) protection_map[i] = pgprot_encrypted(protection_map[i]); + + if (sev_active()) + swiotlb_force = SWIOTLB_FORCE; +} + +static void *sev_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t gfp, unsigned long attrs) +{ + unsigned long dma_mask; + unsigned int order; + struct page *page; + void *vaddr = NULL; + + dma_mask = dma_alloc_coherent_mask(dev, gfp); + order = get_order(size); + + /* + * Memory will be memset to zero after marking decrypted, so don't + * bother clearing it before. + */ + gfp &= ~__GFP_ZERO; + + page = alloc_pages_node(dev_to_node(dev), gfp, order); + if (page) { + dma_addr_t addr; + + /* + * Since we will be clearing the encryption bit, check the + * mask with it already cleared. + */ + addr = __sme_clr(phys_to_dma(dev, page_to_phys(page))); + if ((addr + size) > dma_mask) { + __free_pages(page, get_order(size)); + } else { + vaddr = page_address(page); + *dma_handle = addr; + } + } + + if (!vaddr) + vaddr = swiotlb_alloc_coherent(dev, size, dma_handle, gfp); + + if (!vaddr) + return NULL; + + /* Clear the SME encryption bit for DMA use if not swiotlb area */ + if (!is_swiotlb_buffer(dma_to_phys(dev, *dma_handle))) { + set_memory_decrypted((unsigned long)vaddr, 1 << order); + memset(vaddr, 0, PAGE_SIZE << order); + *dma_handle = __sme_clr(*dma_handle); + } + + return vaddr; } +static void sev_free(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle, unsigned long attrs) +{ + /* Set the SME encryption bit for re-use if not swiotlb area */ + if (!is_swiotlb_buffer(dma_to_phys(dev, dma_handle))) + set_memory_encrypted((unsigned long)vaddr, + 1 << get_order(size)); + + swiotlb_free_coherent(dev, size, vaddr, dma_handle); +} + +static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) +{ + pgprot_t old_prot, new_prot; + unsigned long pfn, pa, size; + pte_t new_pte; + + switch (level) { + case PG_LEVEL_4K: + pfn = pte_pfn(*kpte); + old_prot = pte_pgprot(*kpte); + break; + case PG_LEVEL_2M: + pfn = pmd_pfn(*(pmd_t *)kpte); + old_prot = pmd_pgprot(*(pmd_t *)kpte); + break; + case PG_LEVEL_1G: + pfn = pud_pfn(*(pud_t *)kpte); + old_prot = pud_pgprot(*(pud_t *)kpte); + break; + default: + return; + } + + new_prot = old_prot; + if (enc) + pgprot_val(new_prot) |= _PAGE_ENC; + else + pgprot_val(new_prot) &= ~_PAGE_ENC; + + /* If prot is same then do nothing. */ + if (pgprot_val(old_prot) == pgprot_val(new_prot)) + return; + + pa = pfn << page_level_shift(level); + size = page_level_size(level); + + /* + * We are going to perform in-place en-/decryption and change the + * physical page attribute from C=1 to C=0 or vice versa. Flush the + * caches to ensure that data gets accessed with the correct C-bit. + */ + clflush_cache_range(__va(pa), size); + + /* Encrypt/decrypt the contents in-place */ + if (enc) + sme_early_encrypt(pa, size); + else + sme_early_decrypt(pa, size); + + /* Change the page encryption mask. */ + new_pte = pfn_pte(pfn, new_prot); + set_pte_atomic(kpte, new_pte); +} + +static int __init early_set_memory_enc_dec(unsigned long vaddr, + unsigned long size, bool enc) +{ + unsigned long vaddr_end, vaddr_next; + unsigned long psize, pmask; + int split_page_size_mask; + int level, ret; + pte_t *kpte; + + vaddr_next = vaddr; + vaddr_end = vaddr + size; + + for (; vaddr < vaddr_end; vaddr = vaddr_next) { + kpte = lookup_address(vaddr, &level); + if (!kpte || pte_none(*kpte)) { + ret = 1; + goto out; + } + + if (level == PG_LEVEL_4K) { + __set_clr_pte_enc(kpte, level, enc); + vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE; + continue; + } + + psize = page_level_size(level); + pmask = page_level_mask(level); + + /* + * Check whether we can change the large page in one go. + * We request a split when the address is not aligned and + * the number of pages to set/clear encryption bit is smaller + * than the number of pages in the large page. + */ + if (vaddr == (vaddr & pmask) && + ((vaddr_end - vaddr) >= psize)) { + __set_clr_pte_enc(kpte, level, enc); + vaddr_next = (vaddr & pmask) + psize; + continue; + } + + /* + * The virtual address is part of a larger page, create the next + * level page table mapping (4K or 2M). If it is part of a 2M + * page then we request a split of the large page into 4K + * chunks. A 1GB large page is split into 2M pages, resp. + */ + if (level == PG_LEVEL_2M) + split_page_size_mask = 0; + else + split_page_size_mask = 1 << PG_LEVEL_2M; + + kernel_physical_mapping_init(__pa(vaddr & pmask), + __pa((vaddr_end & pmask) + psize), + split_page_size_mask); + } + + ret = 0; + +out: + __flush_tlb_all(); + return ret; +} + +int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size) +{ + return early_set_memory_enc_dec(vaddr, size, false); +} + +int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) +{ + return early_set_memory_enc_dec(vaddr, size, true); +} + +/* + * SME and SEV are very similar but they are not the same, so there are + * times that the kernel will need to distinguish between SME and SEV. The + * sme_active() and sev_active() functions are used for this. When a + * distinction isn't needed, the mem_encrypt_active() function can be used. + * + * The trampoline code is a good example for this requirement. Before + * paging is activated, SME will access all memory as decrypted, but SEV + * will access all memory as encrypted. So, when APs are being brought + * up under SME the trampoline area cannot be encrypted, whereas under SEV + * the trampoline area must be encrypted. + */ +bool sme_active(void) +{ + return sme_me_mask && !sev_enabled; +} +EXPORT_SYMBOL_GPL(sme_active); + +bool sev_active(void) +{ + return sme_me_mask && sev_enabled; +} +EXPORT_SYMBOL_GPL(sev_active); + +static const struct dma_map_ops sev_dma_ops = { + .alloc = sev_alloc, + .free = sev_free, + .map_page = swiotlb_map_page, + .unmap_page = swiotlb_unmap_page, + .map_sg = swiotlb_map_sg_attrs, + .unmap_sg = swiotlb_unmap_sg_attrs, + .sync_single_for_cpu = swiotlb_sync_single_for_cpu, + .sync_single_for_device = swiotlb_sync_single_for_device, + .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, + .sync_sg_for_device = swiotlb_sync_sg_for_device, + .mapping_error = swiotlb_dma_mapping_error, +}; + /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void) { @@ -201,7 +436,23 @@ void __init mem_encrypt_init(void) /* Call into SWIOTLB to update the SWIOTLB DMA buffers */ swiotlb_update_mem_attributes(); - pr_info("AMD Secure Memory Encryption (SME) active\n"); + /* + * With SEV, DMA operations cannot use encryption. New DMA ops + * are required in order to mark the DMA areas as decrypted or + * to use bounce buffers. + */ + if (sev_active()) + dma_ops = &sev_dma_ops; + + /* + * With SEV, we need to unroll the rep string I/O instructions. + */ + if (sev_active()) + static_branch_enable(&sev_enable_key); + + pr_info("AMD %s active\n", + sev_active() ? "Secure Encrypted Virtualization (SEV)" + : "Secure Memory Encryption (SME)"); } void swiotlb_set_mem_attributes(void *vaddr, unsigned long size) @@ -529,37 +780,63 @@ void __init __nostackprotector sme_enable(struct boot_params *bp) { const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off; unsigned int eax, ebx, ecx, edx; + unsigned long feature_mask; bool active_by_default; unsigned long me_mask; char buffer[16]; u64 msr; - /* Check for the SME support leaf */ + /* Check for the SME/SEV support leaf */ eax = 0x80000000; ecx = 0; native_cpuid(&eax, &ebx, &ecx, &edx); if (eax < 0x8000001f) return; +#define AMD_SME_BIT BIT(0) +#define AMD_SEV_BIT BIT(1) /* - * Check for the SME feature: - * CPUID Fn8000_001F[EAX] - Bit 0 - * Secure Memory Encryption support - * CPUID Fn8000_001F[EBX] - Bits 5:0 - * Pagetable bit position used to indicate encryption + * Set the feature mask (SME or SEV) based on whether we are + * running under a hypervisor. + */ + eax = 1; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT; + + /* + * Check for the SME/SEV feature: + * CPUID Fn8000_001F[EAX] + * - Bit 0 - Secure Memory Encryption support + * - Bit 1 - Secure Encrypted Virtualization support + * CPUID Fn8000_001F[EBX] + * - Bits 5:0 - Pagetable bit position used to indicate encryption */ eax = 0x8000001f; ecx = 0; native_cpuid(&eax, &ebx, &ecx, &edx); - if (!(eax & 1)) + if (!(eax & feature_mask)) return; me_mask = 1UL << (ebx & 0x3f); - /* Check if SME is enabled */ - msr = __rdmsr(MSR_K8_SYSCFG); - if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + /* Check if memory encryption is enabled */ + if (feature_mask == AMD_SME_BIT) { + /* For SME, check the SYSCFG MSR */ + msr = __rdmsr(MSR_K8_SYSCFG); + if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + return; + } else { + /* For SEV, check the SEV MSR */ + msr = __rdmsr(MSR_AMD64_SEV); + if (!(msr & MSR_AMD64_SEV_ENABLED)) + return; + + /* SEV state cannot be controlled by a command line option */ + sme_me_mask = me_mask; + sev_enabled = true; return; + } /* * Fixups have not been applied to phys_base yet and we're running diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 7eb06701a935..e500949bae24 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -13,6 +13,7 @@ #include <linux/sched/sysctl.h> #include <asm/insn.h> +#include <asm/insn-eval.h> #include <asm/mman.h> #include <asm/mmu_context.h> #include <asm/mpx.h> @@ -61,123 +62,6 @@ static unsigned long mpx_mmap(unsigned long len) return addr; } -enum reg_type { - REG_TYPE_RM = 0, - REG_TYPE_INDEX, - REG_TYPE_BASE, -}; - -static int get_reg_offset(struct insn *insn, struct pt_regs *regs, - enum reg_type type) -{ - int regno = 0; - - static const int regoff[] = { - offsetof(struct pt_regs, ax), - offsetof(struct pt_regs, cx), - offsetof(struct pt_regs, dx), - offsetof(struct pt_regs, bx), - offsetof(struct pt_regs, sp), - offsetof(struct pt_regs, bp), - offsetof(struct pt_regs, si), - offsetof(struct pt_regs, di), -#ifdef CONFIG_X86_64 - offsetof(struct pt_regs, r8), - offsetof(struct pt_regs, r9), - offsetof(struct pt_regs, r10), - offsetof(struct pt_regs, r11), - offsetof(struct pt_regs, r12), - offsetof(struct pt_regs, r13), - offsetof(struct pt_regs, r14), - offsetof(struct pt_regs, r15), -#endif - }; - int nr_registers = ARRAY_SIZE(regoff); - /* - * Don't possibly decode a 32-bit instructions as - * reading a 64-bit-only register. - */ - if (IS_ENABLED(CONFIG_X86_64) && !insn->x86_64) - nr_registers -= 8; - - switch (type) { - case REG_TYPE_RM: - regno = X86_MODRM_RM(insn->modrm.value); - if (X86_REX_B(insn->rex_prefix.value)) - regno += 8; - break; - - case REG_TYPE_INDEX: - regno = X86_SIB_INDEX(insn->sib.value); - if (X86_REX_X(insn->rex_prefix.value)) - regno += 8; - break; - - case REG_TYPE_BASE: - regno = X86_SIB_BASE(insn->sib.value); - if (X86_REX_B(insn->rex_prefix.value)) - regno += 8; - break; - - default: - pr_err("invalid register type"); - BUG(); - break; - } - - if (regno >= nr_registers) { - WARN_ONCE(1, "decoded an instruction with an invalid register"); - return -EINVAL; - } - return regoff[regno]; -} - -/* - * return the address being referenced be instruction - * for rm=3 returning the content of the rm reg - * for rm!=3 calculates the address using SIB and Disp - */ -static void __user *mpx_get_addr_ref(struct insn *insn, struct pt_regs *regs) -{ - unsigned long addr, base, indx; - int addr_offset, base_offset, indx_offset; - insn_byte_t sib; - - insn_get_modrm(insn); - insn_get_sib(insn); - sib = insn->sib.value; - - if (X86_MODRM_MOD(insn->modrm.value) == 3) { - addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM); - if (addr_offset < 0) - goto out_err; - addr = regs_get_register(regs, addr_offset); - } else { - if (insn->sib.nbytes) { - base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE); - if (base_offset < 0) - goto out_err; - - indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX); - if (indx_offset < 0) - goto out_err; - - base = regs_get_register(regs, base_offset); - indx = regs_get_register(regs, indx_offset); - addr = base + indx * (1 << X86_SIB_SCALE(sib)); - } else { - addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM); - if (addr_offset < 0) - goto out_err; - addr = regs_get_register(regs, addr_offset); - } - addr += insn->displacement.value; - } - return (void __user *)addr; -out_err: - return (void __user *)-1; -} - static int mpx_insn_decode(struct insn *insn, struct pt_regs *regs) { @@ -290,7 +174,7 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) info->si_signo = SIGSEGV; info->si_errno = 0; info->si_code = SEGV_BNDERR; - info->si_addr = mpx_get_addr_ref(&insn, regs); + info->si_addr = insn_get_addr_ref(&insn, regs); /* * We were not able to extract an address from the instruction, * probably because there was something invalid in it. diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index dfb7d657cf43..3fe68483463c 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1781,8 +1781,8 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) unsigned long start; int ret; - /* Nothing to do if the SME is not active */ - if (!sme_active()) + /* Nothing to do if memory encryption is not active */ + if (!mem_encrypt_active()) return 0; /* Should not be working on unaligned addresses */ diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 20fb31579b69..9e4ee5b04b2d 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -33,6 +33,7 @@ #include <linux/reboot.h> #include <linux/slab.h> #include <linux/ucs2_string.h> +#include <linux/mem_encrypt.h> #include <asm/setup.h> #include <asm/page.h> @@ -370,7 +371,11 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) * as trim_bios_range() will reserve the first page and isolate it away * from memory allocators anyway. */ - if (kernel_map_pages_in_pgd(pgd, 0x0, 0x0, 1, _PAGE_RW)) { + pf = _PAGE_RW; + if (sev_active()) + pf |= _PAGE_ENC; + + if (kernel_map_pages_in_pgd(pgd, 0x0, 0x0, 1, pf)) { pr_err("Failed to create 1:1 mapping for the first page!\n"); return 1; } @@ -413,6 +418,9 @@ static void __init __map_region(efi_memory_desc_t *md, u64 va) if (!(md->attribute & EFI_MEMORY_WB)) flags |= _PAGE_PCD; + if (sev_active()) + flags |= _PAGE_ENC; + pfn = md->phys_addr >> PAGE_SHIFT; if (kernel_map_pages_in_pgd(pgd, pfn, va, md->num_pages, flags)) pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n", @@ -539,6 +547,9 @@ static int __init efi_update_mem_attr(struct mm_struct *mm, efi_memory_desc_t *m if (!(md->attribute & EFI_MEMORY_RO)) pf |= _PAGE_RW; + if (sev_active()) + pf |= _PAGE_ENC; + return efi_update_mappings(md, pf); } @@ -590,6 +601,9 @@ void __init efi_runtime_update_mappings(void) (md->type != EFI_RUNTIME_SERVICES_CODE)) pf |= _PAGE_RW; + if (sev_active()) + pf |= _PAGE_ENC; + efi_update_mappings(md, pf); } } diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index ed84d3917a59..d10105825d57 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -64,9 +64,10 @@ static void __init setup_real_mode(void) /* * If SME is active, the trampoline area will need to be in * decrypted memory in order to bring up other processors - * successfully. + * successfully. This is not needed for SEV. */ - set_memory_decrypted((unsigned long)base, size >> PAGE_SHIFT); + if (sme_active()) + set_memory_decrypted((unsigned long)base, size >> PAGE_SHIFT); memcpy(base, real_mode_blob, size); diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c index 836a1eb5df43..3ee234b6234d 100644 --- a/arch/x86/um/ldt.c +++ b/arch/x86/um/ldt.c @@ -6,6 +6,7 @@ #include <linux/mm.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/syscalls.h> #include <linux/uaccess.h> #include <asm/unistd.h> #include <os.h> @@ -369,7 +370,9 @@ void free_ldt(struct mm_context *mm) mm->arch.ldt.entry_count = 0; } -int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) +SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr , + unsigned long , bytecount) { - return do_modify_ldt_skas(func, ptr, bytecount); + /* See non-um modify_ldt() for why we do this cast */ + return (unsigned int)do_modify_ldt_skas(func, ptr, bytecount); } diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index d4396e27b1fb..e55d276afc70 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -601,7 +601,7 @@ static struct trap_array_entry trap_array[] = { #ifdef CONFIG_X86_MCE { machine_check, xen_machine_check, true }, #endif - { nmi, xen_nmi, true }, + { nmi, xen_xennmi, true }, { overflow, xen_overflow, false }, #ifdef CONFIG_IA32_EMULATION { entry_INT80_compat, xen_entry_INT80_compat, false }, @@ -811,15 +811,14 @@ static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, } } -static void xen_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) +static void xen_load_sp0(unsigned long sp0) { struct multicall_space mcs; mcs = xen_mc_entry(0); - MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); + MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0); xen_mc_issue(PARAVIRT_LAZY_CPU); - tss->x86_tss.sp0 = thread->sp0; + this_cpu_write(cpu_tss.x86_tss.sp0, sp0); } void xen_set_iopl_mask(unsigned mask) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 71495f1a86d7..2ccdaba31a07 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -449,7 +449,7 @@ __visible pmd_t xen_make_pmd(pmdval_t pmd) } PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); -#if CONFIG_PGTABLE_LEVELS == 4 +#ifdef CONFIG_X86_64 __visible pudval_t xen_pud_val(pud_t pud) { return pte_mfn_to_pfn(pud.pud); @@ -538,7 +538,7 @@ static void xen_set_p4d(p4d_t *ptr, p4d_t val) xen_mc_issue(PARAVIRT_LAZY_MMU); } -#endif /* CONFIG_PGTABLE_LEVELS == 4 */ +#endif /* CONFIG_X86_64 */ static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd, int (*func)(struct mm_struct *mm, struct page *, enum pt_level), @@ -580,21 +580,17 @@ static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d, int (*func)(struct mm_struct *mm, struct page *, enum pt_level), bool last, unsigned long limit) { - int i, nr, flush = 0; + int flush = 0; + pud_t *pud; - nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D; - for (i = 0; i < nr; i++) { - pud_t *pud; - if (p4d_none(p4d[i])) - continue; + if (p4d_none(*p4d)) + return flush; - pud = pud_offset(&p4d[i], 0); - if (PTRS_PER_PUD > 1) - flush |= (*func)(mm, virt_to_page(pud), PT_PUD); - flush |= xen_pud_walk(mm, pud, func, - last && i == nr - 1, limit); - } + pud = pud_offset(p4d, 0); + if (PTRS_PER_PUD > 1) + flush |= (*func)(mm, virt_to_page(pud), PT_PUD); + flush |= xen_pud_walk(mm, pud, func, last, limit); return flush; } @@ -644,8 +640,6 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, continue; p4d = p4d_offset(&pgd[i], 0); - if (PTRS_PER_P4D > 1) - flush |= (*func)(mm, virt_to_page(p4d), PT_P4D); flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit); } @@ -1176,22 +1170,14 @@ static void __init xen_cleanmfnmap(unsigned long vaddr) { pgd_t *pgd; p4d_t *p4d; - unsigned int i; bool unpin; unpin = (vaddr == 2 * PGDIR_SIZE); vaddr &= PMD_MASK; pgd = pgd_offset_k(vaddr); p4d = p4d_offset(pgd, 0); - for (i = 0; i < PTRS_PER_P4D; i++) { - if (p4d_none(p4d[i])) - continue; - xen_cleanmfnmap_p4d(p4d + i, unpin); - } - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - set_pgd(pgd, __pgd(0)); - xen_cleanmfnmap_free_pgtbl(p4d, unpin); - } + if (!p4d_none(*p4d)) + xen_cleanmfnmap_p4d(p4d, unpin); } static void __init xen_pagetable_p2m_free(void) @@ -1692,7 +1678,7 @@ static void xen_release_pmd(unsigned long pfn) xen_release_ptpage(pfn, PT_PMD); } -#if CONFIG_PGTABLE_LEVELS >= 4 +#ifdef CONFIG_X86_64 static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) { xen_alloc_ptpage(mm, pfn, PT_PUD); @@ -2029,13 +2015,12 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) */ void __init xen_relocate_p2m(void) { - phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys; + phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys; unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end; - int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d; + int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud; pte_t *pt; pmd_t *pmd; pud_t *pud; - p4d_t *p4d = NULL; pgd_t *pgd; unsigned long *new_p2m; int save_pud; @@ -2045,11 +2030,7 @@ void __init xen_relocate_p2m(void) n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT; n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT; n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT; - if (PTRS_PER_P4D > 1) - n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT; - else - n_p4d = 0; - n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d; + n_frames = n_pte + n_pt + n_pmd + n_pud; new_area = xen_find_free_area(PFN_PHYS(n_frames)); if (!new_area) { @@ -2065,76 +2046,56 @@ void __init xen_relocate_p2m(void) * To avoid any possible virtual address collision, just use * 2 * PUD_SIZE for the new area. */ - p4d_phys = new_area; - pud_phys = p4d_phys + PFN_PHYS(n_p4d); + pud_phys = new_area; pmd_phys = pud_phys + PFN_PHYS(n_pud); pt_phys = pmd_phys + PFN_PHYS(n_pmd); p2m_pfn = PFN_DOWN(pt_phys) + n_pt; pgd = __va(read_cr3_pa()); new_p2m = (unsigned long *)(2 * PGDIR_SIZE); - idx_p4d = 0; save_pud = n_pud; - do { - if (n_p4d > 0) { - p4d = early_memremap(p4d_phys, PAGE_SIZE); - clear_page(p4d); - n_pud = min(save_pud, PTRS_PER_P4D); - } - for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { - pud = early_memremap(pud_phys, PAGE_SIZE); - clear_page(pud); - for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); - idx_pmd++) { - pmd = early_memremap(pmd_phys, PAGE_SIZE); - clear_page(pmd); - for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); - idx_pt++) { - pt = early_memremap(pt_phys, PAGE_SIZE); - clear_page(pt); - for (idx_pte = 0; - idx_pte < min(n_pte, PTRS_PER_PTE); - idx_pte++) { - set_pte(pt + idx_pte, - pfn_pte(p2m_pfn, PAGE_KERNEL)); - p2m_pfn++; - } - n_pte -= PTRS_PER_PTE; - early_memunmap(pt, PAGE_SIZE); - make_lowmem_page_readonly(__va(pt_phys)); - pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, - PFN_DOWN(pt_phys)); - set_pmd(pmd + idx_pt, - __pmd(_PAGE_TABLE | pt_phys)); - pt_phys += PAGE_SIZE; + for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { + pud = early_memremap(pud_phys, PAGE_SIZE); + clear_page(pud); + for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); + idx_pmd++) { + pmd = early_memremap(pmd_phys, PAGE_SIZE); + clear_page(pmd); + for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); + idx_pt++) { + pt = early_memremap(pt_phys, PAGE_SIZE); + clear_page(pt); + for (idx_pte = 0; + idx_pte < min(n_pte, PTRS_PER_PTE); + idx_pte++) { + set_pte(pt + idx_pte, + pfn_pte(p2m_pfn, PAGE_KERNEL)); + p2m_pfn++; } - n_pt -= PTRS_PER_PMD; - early_memunmap(pmd, PAGE_SIZE); - make_lowmem_page_readonly(__va(pmd_phys)); - pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, - PFN_DOWN(pmd_phys)); - set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); - pmd_phys += PAGE_SIZE; + n_pte -= PTRS_PER_PTE; + early_memunmap(pt, PAGE_SIZE); + make_lowmem_page_readonly(__va(pt_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, + PFN_DOWN(pt_phys)); + set_pmd(pmd + idx_pt, + __pmd(_PAGE_TABLE | pt_phys)); + pt_phys += PAGE_SIZE; } - n_pmd -= PTRS_PER_PUD; - early_memunmap(pud, PAGE_SIZE); - make_lowmem_page_readonly(__va(pud_phys)); - pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); - if (n_p4d > 0) - set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE | pud_phys)); - else - set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); - pud_phys += PAGE_SIZE; - } - if (n_p4d > 0) { - save_pud -= PTRS_PER_P4D; - early_memunmap(p4d, PAGE_SIZE); - make_lowmem_page_readonly(__va(p4d_phys)); - pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys)); - set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE | p4d_phys)); - p4d_phys += PAGE_SIZE; + n_pt -= PTRS_PER_PMD; + early_memunmap(pmd, PAGE_SIZE); + make_lowmem_page_readonly(__va(pmd_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, + PFN_DOWN(pmd_phys)); + set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); + pmd_phys += PAGE_SIZE; } - } while (++idx_p4d < n_p4d); + n_pmd -= PTRS_PER_PUD; + early_memunmap(pud, PAGE_SIZE); + make_lowmem_page_readonly(__va(pud_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); + set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); + pud_phys += PAGE_SIZE; + } /* Now copy the old p2m info to the new area. */ memcpy(new_p2m, xen_p2m_addr, size); @@ -2361,7 +2322,7 @@ static void __init xen_post_allocator_init(void) pv_mmu_ops.set_pte = xen_set_pte; pv_mmu_ops.set_pmd = xen_set_pmd; pv_mmu_ops.set_pud = xen_set_pud; -#if CONFIG_PGTABLE_LEVELS >= 4 +#ifdef CONFIG_X86_64 pv_mmu_ops.set_p4d = xen_set_p4d; #endif @@ -2371,7 +2332,7 @@ static void __init xen_post_allocator_init(void) pv_mmu_ops.alloc_pmd = xen_alloc_pmd; pv_mmu_ops.release_pte = xen_release_pte; pv_mmu_ops.release_pmd = xen_release_pmd; -#if CONFIG_PGTABLE_LEVELS >= 4 +#ifdef CONFIG_X86_64 pv_mmu_ops.alloc_pud = xen_alloc_pud; pv_mmu_ops.release_pud = xen_release_pud; #endif @@ -2435,14 +2396,14 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), -#if CONFIG_PGTABLE_LEVELS >= 4 +#ifdef CONFIG_X86_64 .pud_val = PV_CALLEE_SAVE(xen_pud_val), .make_pud = PV_CALLEE_SAVE(xen_make_pud), .set_p4d = xen_set_p4d_hyper, .alloc_pud = xen_alloc_pmd_init, .release_pud = xen_release_pmd_init, -#endif /* CONFIG_PGTABLE_LEVELS == 4 */ +#endif /* CONFIG_X86_64 */ .activate_mm = xen_activate_mm, .dup_mmap = xen_dup_mmap, diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 05f91ce9b55e..c0c756c76afe 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -14,6 +14,7 @@ * single-threaded. */ #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/smp.h> @@ -294,12 +295,19 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) #endif memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); + /* + * Bring up the CPU in cpu_bringup_and_idle() with the stack + * pointing just below where pt_regs would be if it were a normal + * kernel entry. + */ ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; ctxt->flags = VGCF_IN_KERNEL; ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ ctxt->user_regs.ds = __USER_DS; ctxt->user_regs.es = __USER_DS; ctxt->user_regs.ss = __KERNEL_DS; + ctxt->user_regs.cs = __KERNEL_CS; + ctxt->user_regs.esp = (unsigned long)task_pt_regs(idle); xen_copy_trap_info(ctxt->trap_ctxt); @@ -314,8 +322,13 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->gdt_frames[0] = gdt_mfn; ctxt->gdt_ents = GDT_ENTRIES; + /* + * Set SS:SP that Xen will use when entering guest kernel mode + * from guest user mode. Subsequent calls to load_sp0() can + * change this value. + */ ctxt->kernel_ss = __KERNEL_DS; - ctxt->kernel_sp = idle->thread.sp0; + ctxt->kernel_sp = task_top_of_stack(idle); #ifdef CONFIG_X86_32 ctxt->event_callback_cs = __KERNEL_CS; @@ -327,10 +340,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) (unsigned long)xen_hypervisor_callback; ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; - ctxt->user_regs.cs = __KERNEL_CS; per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); - ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir)); if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt)) BUG(); diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index c98a48c861fd..8a10c9a9e2b5 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -30,7 +30,7 @@ xen_pv_trap debug xen_pv_trap xendebug xen_pv_trap int3 xen_pv_trap xenint3 -xen_pv_trap nmi +xen_pv_trap xennmi xen_pv_trap overflow xen_pv_trap bounds xen_pv_trap invalid_op diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index b5b8d7f43557..497cc55a0c16 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -10,6 +10,7 @@ #include <asm/boot.h> #include <asm/asm.h> #include <asm/page_types.h> +#include <asm/unwind_hints.h> #include <xen/interface/elfnote.h> #include <xen/interface/features.h> @@ -20,6 +21,7 @@ #ifdef CONFIG_XEN_PV __INIT ENTRY(startup_xen) + UNWIND_HINT_EMPTY cld /* Clear .bss */ @@ -34,21 +36,24 @@ ENTRY(startup_xen) mov $init_thread_union+THREAD_SIZE, %_ASM_SP jmp xen_start_kernel - +END(startup_xen) __FINIT #endif .pushsection .text .balign PAGE_SIZE ENTRY(hypercall_page) - .skip PAGE_SIZE + .rept (PAGE_SIZE / 32) + UNWIND_HINT_EMPTY + .skip 32 + .endr #define HYPERCALL(n) \ .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \ .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32 #include <asm/xen-hypercalls.h> #undef HYPERCALL - +END(hypercall_page) .popsection ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") |