diff options
Diffstat (limited to 'arch/x86')
298 files changed, 9319 insertions, 6289 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 685692c94f05..ee2fb9d37745 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -16,6 +16,7 @@ config X86_64 def_bool y depends on 64BIT select X86_DEV_DMA_OPS + select ARCH_USE_CMPXCHG_LOCKREF ### Arch settings config X86 @@ -65,6 +66,7 @@ config X86 select HAVE_KERNEL_LZMA select HAVE_KERNEL_XZ select HAVE_KERNEL_LZO + select HAVE_KERNEL_LZ4 select HAVE_HW_BREAKPOINT select HAVE_MIXED_BREAKPOINTS_REGS select PERF_EVENTS @@ -80,8 +82,6 @@ config X86 select HAVE_USER_RETURN_NOTIFIER select ARCH_BINFMT_ELF_RANDOMIZE_PIE select HAVE_ARCH_JUMP_LABEL - select HAVE_TEXT_POKE_SMP - select HAVE_GENERIC_HARDIRQS select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select SPARSE_IRQ select GENERIC_FIND_FIRST_BIT @@ -102,6 +102,7 @@ config X86 select HAVE_ARCH_SECCOMP_FILTER select BUILDTIME_EXTABLE_SORT select GENERIC_CMOS_UPDATE + select HAVE_ARCH_SOFT_DIRTY select CLOCKSOURCE_WATCHDOG select GENERIC_CLOCKEVENTS select ARCH_CLOCKSOURCE_DATA if X86_64 @@ -121,6 +122,7 @@ config X86 select OLD_SIGACTION if X86_32 select COMPAT_OLD_SIGACTION if IA32_EMULATION select RTC_LIB + select HAVE_DEBUG_STACKOVERFLOW config INSTRUCTION_DECODER def_bool y @@ -207,6 +209,12 @@ config ARCH_HIBERNATION_POSSIBLE config ARCH_SUSPEND_POSSIBLE def_bool y +config ARCH_WANT_HUGE_PMD_SHARE + def_bool y + +config ARCH_WANT_GENERAL_HUGETLB + def_bool y + config ZONE_DMA32 bool default X86_64 @@ -336,6 +344,7 @@ config X86_EXTENDED_PLATFORM If you enable this option then you'll be able to select support for the following (non-PC) 32 bit x86 platforms: + Goldfish (Android emulator) AMD Elan NUMAQ (IBM/Sequent) RDC R-321x SoC @@ -410,6 +419,7 @@ config X86_UV config X86_GOLDFISH bool "Goldfish (Virtual Platform)" depends on X86_32 + depends on X86_EXTENDED_PLATFORM ---help--- Enable support for the Goldfish virtual platform used primarily for Android development. Unless you are building for the Android @@ -471,11 +481,12 @@ config X86_INTEL_LPSS bool "Intel Low Power Subsystem Support" depends on ACPI select COMMON_CLK + select PINCTRL ---help--- Select to build support for Intel Low Power Subsystem such as found on Intel Lynxpoint PCH. Selecting this option enables - things like clock tree (common clock framework) which are needed - by the LPSS peripheral drivers. + things like clock tree (common clock framework) and pincontrol + which are needed by the LPSS peripheral drivers. config X86_RDC321X bool "RDC R-321x SoC" @@ -621,6 +632,7 @@ config PARAVIRT_DEBUG config PARAVIRT_SPINLOCKS bool "Paravirtualization layer for spinlocks" depends on PARAVIRT && SMP + select UNINLINE_SPIN_UNLOCK ---help--- Paravirtualized spinlocks allow a pvops backend to replace the spinlock implementation with something virtualization-friendly @@ -645,6 +657,15 @@ config KVM_GUEST underlying device model, the host provides the guest with timing infrastructure such as time of day, and system time +config KVM_DEBUG_FS + bool "Enable debug information for KVM Guests in debugfs" + depends on KVM_GUEST && DEBUG_FS + default n + ---help--- + This option enables collection of various statistics for KVM guest. + Statistics are displayed in debugfs filesystem. Enabling this option + may incur significant overhead. + source "arch/x86/lguest/Kconfig" config PARAVIRT_TIME_ACCOUNTING @@ -1058,8 +1079,16 @@ config MICROCODE_INTEL_LIB depends on MICROCODE_INTEL config MICROCODE_INTEL_EARLY + def_bool n + +config MICROCODE_AMD_EARLY + def_bool n + +config MICROCODE_EARLY bool "Early load microcode" - depends on MICROCODE_INTEL && BLK_DEV_INITRD + depends on MICROCODE=y && BLK_DEV_INITRD + select MICROCODE_INTEL_EARLY if MICROCODE_INTEL + select MICROCODE_AMD_EARLY if MICROCODE_AMD default y help This option provides functionality to read additional microcode data @@ -1067,10 +1096,6 @@ config MICROCODE_INTEL_EARLY microcode to CPU's as early as possible. No functional change if no microcode data is glued to the initrd, therefore it's safe to say Y. -config MICROCODE_EARLY - def_bool y - depends on MICROCODE_INTEL_EARLY - config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" ---help--- @@ -1329,8 +1354,12 @@ config ARCH_SELECT_MEMORY_MODEL depends on ARCH_SPARSEMEM_ENABLE config ARCH_MEMORY_PROBE - def_bool y + bool "Enable sysfs memory/probe interface" depends on X86_64 && MEMORY_HOTPLUG + help + This option enables a sysfs memory/probe interface for testing. + See Documentation/memory-hotplug.txt for more information. + If you are unsure how to answer this question, answer N. config ARCH_PROC_KCORE_TEXT def_bool y @@ -1612,9 +1641,9 @@ config KEXEC It is an ongoing process to be certain the hardware in a machine is properly shutdown, so do not be surprised if this code does not - initially work for you. It may help to enable device hotplugging - support. As of this writing the exact hardware interface is - strongly in flux, so no good recommendation can be made. + initially work for you. As of this writing the exact hardware + interface is strongly in flux, so no good recommendation can be + made. config CRASH_DUMP bool "kernel crash dumps" @@ -1701,9 +1730,10 @@ config X86_NEED_RELOCS depends on X86_32 && RELOCATABLE config PHYSICAL_ALIGN - hex "Alignment value to which kernel should be aligned" if X86_32 + hex "Alignment value to which kernel should be aligned" default "0x1000000" - range 0x2000 0x1000000 + range 0x2000 0x1000000 if X86_32 + range 0x200000 0x1000000 if X86_64 ---help--- This value puts the alignment restrictions on physical address where kernel is loaded and run from. Kernel is compiled for an @@ -1721,11 +1751,14 @@ config PHYSICAL_ALIGN end result is that kernel runs from a physical address meeting above alignment restrictions. + On 32-bit this value must be a multiple of 0x2000. On 64-bit + this value must be a multiple of 0x200000. + Don't change this unless you know what you are doing. config HOTPLUG_CPU bool "Support for hot-pluggable CPUs" - depends on SMP && HOTPLUG + depends on SMP ---help--- Say Y here to allow turning CPUs off and on. CPUs can be controlled through /sys/devices/system/cpu. @@ -1999,7 +2032,6 @@ menu "Bus options (PCI etc.)" config PCI bool "PCI support" default y - select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC) ---help--- Find out whether you have a PCI motherboard. PCI is the name of a bus system, i.e. the way the CPU talks to the other stuff inside @@ -2246,15 +2278,41 @@ source "drivers/pcmcia/Kconfig" source "drivers/pci/hotplug/Kconfig" config RAPIDIO - bool "RapidIO support" + tristate "RapidIO support" depends on PCI default n help - If you say Y here, the kernel will include drivers and + If enabled this option will include drivers and the core infrastructure code to support RapidIO interconnect devices. source "drivers/rapidio/Kconfig" +config X86_SYSFB + bool "Mark VGA/VBE/EFI FB as generic system framebuffer" + help + Firmwares often provide initial graphics framebuffers so the BIOS, + bootloader or kernel can show basic video-output during boot for + user-guidance and debugging. Historically, x86 used the VESA BIOS + Extensions and EFI-framebuffers for this, which are mostly limited + to x86. + This option, if enabled, marks VGA/VBE/EFI framebuffers as generic + framebuffers so the new generic system-framebuffer drivers can be + used on x86. If the framebuffer is not compatible with the generic + modes, it is adverticed as fallback platform framebuffer so legacy + drivers like efifb, vesafb and uvesafb can pick it up. + If this option is not selected, all system framebuffers are always + marked as fallback platform framebuffers as usual. + + Note: Legacy fbdev drivers, including vesafb, efifb, uvesafb, will + not be able to pick up generic system framebuffers if this option + is selected. You are highly encouraged to enable simplefb as + replacement if you select this option. simplefb can correctly deal + with generic system framebuffers. But you should still keep vesafb + and others enabled as fallback if a system framebuffer is + incompatible with simplefb. + + If unsure, say Y. + endmenu @@ -2265,6 +2323,7 @@ source "fs/Kconfig.binfmt" config IA32_EMULATION bool "IA32 Emulation" depends on X86_64 + select BINFMT_ELF select COMPAT_BINFMT_ELF select HAVE_UID16 ---help--- @@ -2316,10 +2375,6 @@ config HAVE_ATOMIC_IOMAP def_bool y depends on X86_32 -config HAVE_TEXT_POKE_SMP - bool - select STOP_MACHINE if SMP - config X86_DEV_DMA_OPS bool depends on X86_64 || STA2X11 diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index c198b7e13e7b..78d91afb8e50 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -59,16 +59,6 @@ config EARLY_PRINTK_DBGP with klogd/syslogd or the X server. You should normally N here, unless you want to debug such a crash. You need usb debug device. -config DEBUG_STACKOVERFLOW - bool "Check for stack overflows" - depends on DEBUG_KERNEL - ---help--- - Say Y here if you want to check the overflows of kernel, IRQ - and exception stacks. This option will cause messages of the - stacks in detail when free stack space drops below a certain - limit. - If in doubt, say "N". - config X86_PTDUMP bool "Export kernel pagetable layout to userspace via debugfs" depends on DEBUG_KERNEL @@ -122,7 +112,6 @@ config DEBUG_NX_TEST config DOUBLEFAULT default y bool "Enable doublefault exception handler" if EXPERT - depends on X86_32 ---help--- This option allows trapping of rare doublefault exceptions that would otherwise cause a system to silently reboot. Disabling this @@ -304,4 +293,14 @@ config DEBUG_NMI_SELFTEST If unsure, say N. +config X86_DEBUG_STATIC_CPU_HAS + bool "Debug alternatives" + depends on DEBUG_KERNEL + ---help--- + This option causes additional code to be generated which + fails if static_cpu_has() is used before alternatives have + run. + + If unsure, say N. + endmenu diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 5c477260294f..41250fb33985 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -16,6 +16,10 @@ endif # e.g.: obj-y += foo_$(BITS).o export BITS +ifdef CONFIG_X86_NEED_RELOCS + LDFLAGS_vmlinux := --emit-relocs +endif + ifeq ($(CONFIG_X86_32),y) BITS := 32 UTS_MACHINE := i386 @@ -25,10 +29,6 @@ ifeq ($(CONFIG_X86_32),y) KBUILD_AFLAGS += $(biarch) KBUILD_CFLAGS += $(biarch) - ifdef CONFIG_RELOCATABLE - LDFLAGS_vmlinux := --emit-relocs - endif - KBUILD_CFLAGS += -msoft-float -mregparm=3 -freg-struct-return # Never want PIC in a 32-bit kernel, prevent breakage with GCC built @@ -220,6 +220,12 @@ archclean: $(Q)$(MAKE) $(clean)=$(boot) $(Q)$(MAKE) $(clean)=arch/x86/tools +PHONY += kvmconfig +kvmconfig: + $(if $(wildcard $(objtree)/.config),, $(error You need an existing .config for this target)) + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/kconfig/merge_config.sh -m -O $(objtree) $(objtree)/.config arch/x86/configs/kvm_guest.config + $(Q)yes "" | $(MAKE) oldconfig + define archhelp echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)' echo ' install - Install kernel using' @@ -233,4 +239,5 @@ define archhelp echo ' bzdisk/fdimage*/isoimage also accept:' echo ' FDARGS="..." arguments for the booted kernel' echo ' FDINITRD=file initrd for the booted kernel' + echo ' kvmconfig - Enable additional options for guest kernel support' endef diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 5b7531966b84..ef72baeff484 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -355,6 +355,7 @@ int strncmp(const char *cs, const char *ct, size_t count); size_t strnlen(const char *s, size_t maxlen); unsigned int atou(const char *s); unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base); +size_t strlen(const char *s); /* tty.c */ void puts(const char *); diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 5ef205c5f37b..dcd90df10ab4 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -4,7 +4,8 @@ # create a compressed vmlinux image from the original vmlinux # -targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.xz vmlinux.bin.lzo +targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ + vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 KBUILD_CFLAGS += -fno-strict-aliasing -fPIC @@ -63,12 +64,15 @@ $(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE $(call if_changed,xzkern) $(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE $(call if_changed,lzo) +$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE + $(call if_changed,lz4) suffix-$(CONFIG_KERNEL_GZIP) := gz suffix-$(CONFIG_KERNEL_BZIP2) := bz2 suffix-$(CONFIG_KERNEL_LZMA) := lzma suffix-$(CONFIG_KERNEL_XZ) := xz suffix-$(CONFIG_KERNEL_LZO) := lzo +suffix-$(CONFIG_KERNEL_LZ4) := lz4 quiet_cmd_mkpiggy = MKPIGGY $@ cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false ) diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 35ee62fccf98..b7388a425f09 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -225,7 +225,7 @@ static void low_free(unsigned long size, unsigned long addr) unsigned long nr_pages; nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE; - efi_call_phys2(sys_table->boottime->free_pages, addr, size); + efi_call_phys2(sys_table->boottime->free_pages, addr, nr_pages); } static void find_bits(unsigned long mask, u8 *pos, u8 *size) @@ -251,51 +251,6 @@ static void find_bits(unsigned long mask, u8 *pos, u8 *size) *size = len; } -static efi_status_t setup_efi_vars(struct boot_params *params) -{ - struct setup_data *data; - struct efi_var_bootdata *efidata; - u64 store_size, remaining_size, var_size; - efi_status_t status; - - if (sys_table->runtime->hdr.revision < EFI_2_00_SYSTEM_TABLE_REVISION) - return EFI_UNSUPPORTED; - - data = (struct setup_data *)(unsigned long)params->hdr.setup_data; - - while (data && data->next) - data = (struct setup_data *)(unsigned long)data->next; - - status = efi_call_phys4((void *)sys_table->runtime->query_variable_info, - EFI_VARIABLE_NON_VOLATILE | - EFI_VARIABLE_BOOTSERVICE_ACCESS | - EFI_VARIABLE_RUNTIME_ACCESS, &store_size, - &remaining_size, &var_size); - - if (status != EFI_SUCCESS) - return status; - - status = efi_call_phys3(sys_table->boottime->allocate_pool, - EFI_LOADER_DATA, sizeof(*efidata), &efidata); - - if (status != EFI_SUCCESS) - return status; - - efidata->data.type = SETUP_EFI_VARS; - efidata->data.len = sizeof(struct efi_var_bootdata) - - sizeof(struct setup_data); - efidata->data.next = 0; - efidata->store_size = store_size; - efidata->remaining_size = remaining_size; - efidata->max_var_size = var_size; - - if (data) - data->next = (unsigned long)efidata; - else - params->hdr.setup_data = (unsigned long)efidata; - -} - static efi_status_t setup_efi_pci(struct boot_params *params) { efi_pci_io_protocol *pci; @@ -1037,18 +992,20 @@ static efi_status_t exit_boot(struct boot_params *boot_params, efi_memory_desc_t *mem_map; efi_status_t status; __u32 desc_version; + bool called_exit = false; u8 nr_entries; int i; size = sizeof(*mem_map) * 32; again: - size += sizeof(*mem_map); + size += sizeof(*mem_map) * 2; _size = size; status = low_alloc(size, 1, (unsigned long *)&mem_map); if (status != EFI_SUCCESS) return status; +get_map: status = efi_call_phys5(sys_table->boottime->get_memory_map, &size, mem_map, &key, &desc_size, &desc_version); if (status == EFI_BUFFER_TOO_SMALL) { @@ -1074,8 +1031,20 @@ again: /* Might as well exit boot services now */ status = efi_call_phys2(sys_table->boottime->exit_boot_services, handle, key); - if (status != EFI_SUCCESS) - goto free_mem_map; + if (status != EFI_SUCCESS) { + /* + * ExitBootServices() will fail if any of the event + * handlers change the memory map. In which case, we + * must be prepared to retry, but only once so that + * we're guaranteed to exit on repeated failures instead + * of spinning forever. + */ + if (called_exit) + goto free_mem_map; + + called_exit = true; + goto get_map; + } /* Historic? */ boot_params->alt_mem_k = 32 * 1024; @@ -1202,8 +1171,6 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table, setup_graphics(boot_params); - setup_efi_vars(boot_params); - setup_efi_pci(boot_params); status = efi_call_phys3(sys_table->boottime->allocate_pool, diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 1e3184f6072f..5d6f6891b188 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -181,8 +181,9 @@ relocated: /* * Do the decompression, and jump to the new kernel.. */ - leal z_extract_offset_negative(%ebx), %ebp /* push arguments for decompress_kernel: */ + pushl $z_output_len /* decompressed length */ + leal z_extract_offset_negative(%ebx), %ebp pushl %ebp /* output address */ pushl $z_input_len /* input_len */ leal input_data(%ebx), %eax @@ -191,33 +192,7 @@ relocated: pushl %eax /* heap area */ pushl %esi /* real mode pointer */ call decompress_kernel - addl $20, %esp - -#if CONFIG_RELOCATABLE -/* - * Find the address of the relocations. - */ - leal z_output_len(%ebp), %edi - -/* - * Calculate the delta between where vmlinux was compiled to run - * and where it was actually loaded. - */ - movl %ebp, %ebx - subl $LOAD_PHYSICAL_ADDR, %ebx - jz 2f /* Nothing to be done if loaded at compiled addr. */ -/* - * Process relocations. - */ - -1: subl $4, %edi - movl (%edi), %ecx - testl %ecx, %ecx - jz 2f - addl %ebx, -__PAGE_OFFSET(%ebx, %ecx) - jmp 1b -2: -#endif + addl $24, %esp /* * Jump to the decompressed kernel. diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 16f24e6dad79..c337422b575d 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -27,8 +27,6 @@ #include <linux/init.h> #include <linux/linkage.h> #include <asm/segment.h> -#include <asm/pgtable_types.h> -#include <asm/page_types.h> #include <asm/boot.h> #include <asm/msr.h> #include <asm/processor-flags.h> @@ -340,6 +338,7 @@ relocated: leaq input_data(%rip), %rdx /* input_data */ movl $z_input_len, %ecx /* input_len */ movq %rbp, %r8 /* output target address */ + movq $z_output_len, %r9 /* decompressed length */ call decompress_kernel popq %rsi diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 7cb56c6ca351..434f077d2c4d 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -145,6 +145,10 @@ static int lines, cols; #include "../../../../lib/decompress_unlzo.c" #endif +#ifdef CONFIG_KERNEL_LZ4 +#include "../../../../lib/decompress_unlz4.c" +#endif + static void scroll(void) { int i; @@ -267,6 +271,79 @@ static void error(char *x) asm("hlt"); } +#if CONFIG_X86_NEED_RELOCS +static void handle_relocations(void *output, unsigned long output_len) +{ + int *reloc; + unsigned long delta, map, ptr; + unsigned long min_addr = (unsigned long)output; + unsigned long max_addr = min_addr + output_len; + + /* + * Calculate the delta between where vmlinux was linked to load + * and where it was actually loaded. + */ + delta = min_addr - LOAD_PHYSICAL_ADDR; + if (!delta) { + debug_putstr("No relocation needed... "); + return; + } + debug_putstr("Performing relocations... "); + + /* + * The kernel contains a table of relocation addresses. Those + * addresses have the final load address of the kernel in virtual + * memory. We are currently working in the self map. So we need to + * create an adjustment for kernel memory addresses to the self map. + * This will involve subtracting out the base address of the kernel. + */ + map = delta - __START_KERNEL_map; + + /* + * Process relocations: 32 bit relocations first then 64 bit after. + * Two sets of binary relocations are added to the end of the kernel + * before compression. Each relocation table entry is the kernel + * address of the location which needs to be updated stored as a + * 32-bit value which is sign extended to 64 bits. + * + * Format is: + * + * kernel bits... + * 0 - zero terminator for 64 bit relocations + * 64 bit relocation repeated + * 0 - zero terminator for 32 bit relocations + * 32 bit relocation repeated + * + * So we work backwards from the end of the decompressed image. + */ + for (reloc = output + output_len - sizeof(*reloc); *reloc; reloc--) { + int extended = *reloc; + extended += map; + + ptr = (unsigned long)extended; + if (ptr < min_addr || ptr > max_addr) + error("32-bit relocation outside of kernel!\n"); + + *(uint32_t *)ptr += delta; + } +#ifdef CONFIG_X86_64 + for (reloc--; *reloc; reloc--) { + long extended = *reloc; + extended += map; + + ptr = (unsigned long)extended; + if (ptr < min_addr || ptr > max_addr) + error("64-bit relocation outside of kernel!\n"); + + *(uint64_t *)ptr += delta; + } +#endif +} +#else +static inline void handle_relocations(void *output, unsigned long output_len) +{ } +#endif + static void parse_elf(void *output) { #ifdef CONFIG_X86_64 @@ -321,7 +398,8 @@ static void parse_elf(void *output) asmlinkage void decompress_kernel(void *rmode, memptr heap, unsigned char *input_data, unsigned long input_len, - unsigned char *output) + unsigned char *output, + unsigned long output_len) { real_mode = rmode; @@ -361,6 +439,7 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, debug_putstr("\nDecompressing Linux... "); decompress(input_data, input_len, NULL, NULL, output, NULL, error); parse_elf(output); + handle_relocations(output, output_len); debug_putstr("done.\nBooting the kernel.\n"); return; } diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c index cdac91ca55d3..565083c16e5c 100644 --- a/arch/x86/boot/printf.c +++ b/arch/x86/boot/printf.c @@ -55,7 +55,7 @@ static char *number(char *str, long num, int base, int size, int precision, locase = (type & SMALL); if (type & LEFT) type &= ~ZEROPAD; - if (base < 2 || base > 36) + if (base < 2 || base > 16) return NULL; c = (type & ZEROPAD) ? '0' : ' '; sign = 0; diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index 94c544650020..c941d6a8887f 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -243,6 +243,7 @@ static void parse_zoffset(char *fname) c = fread(buf, 1, sizeof(buf) - 1, file); if (ferror(file)) die("read-error on `zoffset.h'"); + fclose(file); buf[c] = 0; p = (char *)buf; diff --git a/arch/x86/configs/kvm_guest.config b/arch/x86/configs/kvm_guest.config new file mode 100644 index 000000000000..f9affcc3b9f1 --- /dev/null +++ b/arch/x86/configs/kvm_guest.config @@ -0,0 +1,28 @@ +CONFIG_NET=y +CONFIG_NET_CORE=y +CONFIG_NETDEVICES=y +CONFIG_BLOCK=y +CONFIG_BLK_DEV=y +CONFIG_NETWORK_FILESYSTEMS=y +CONFIG_INET=y +CONFIG_TTY=y +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_BINFMT_ELF=y +CONFIG_PCI=y +CONFIG_PCI_MSI=y +CONFIG_DEBUG_KERNEL=y +CONFIG_VIRTUALIZATION=y +CONFIG_HYPERVISOR_GUEST=y +CONFIG_PARAVIRT=y +CONFIG_KVM_GUEST=y +CONFIG_VIRTIO=y +CONFIG_VIRTIO_PCI=y +CONFIG_VIRTIO_BLK=y +CONFIG_VIRTIO_CONSOLE=y +CONFIG_VIRTIO_NET=y +CONFIG_9P_FS=y +CONFIG_NET_9P=y +CONFIG_NET_9P_VIRTIO=y diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index a3a0ed80f17c..7d6ba9db1be9 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -3,8 +3,6 @@ # avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no) -avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ - $(comma)4)$(comma)%ymm2,yes,no) obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o @@ -29,6 +27,7 @@ obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o +obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o # These modules require assembler to support AVX. ifeq ($(avx_supported),yes) @@ -42,10 +41,8 @@ endif # These modules require assembler to support AVX2. ifeq ($(avx2_supported),yes) - obj-$(CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64) += blowfish-avx2.o obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o - obj-$(CONFIG_CRYPTO_TWOFISH_AVX2_X86_64) += twofish-avx2.o endif aes-i586-y := aes-i586-asm_32.o aes_glue.o @@ -73,10 +70,8 @@ ifeq ($(avx_supported),yes) endif ifeq ($(avx2_supported),yes) - blowfish-avx2-y := blowfish-avx2-asm_64.o blowfish_avx2_glue.o camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o - twofish-avx2-y := twofish-avx2-asm_64.o twofish_avx2_glue.o endif aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o @@ -87,3 +82,4 @@ crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o +crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 62fe22cd4cba..477e9d75149b 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -2681,56 +2681,68 @@ ENTRY(aesni_xts_crypt8) addq %rcx, KEYP movdqa IV, STATE1 - pxor 0x00(INP), STATE1 + movdqu 0x00(INP), INC + pxor INC, STATE1 movdqu IV, 0x00(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE2 - pxor 0x10(INP), STATE2 + movdqu 0x10(INP), INC + pxor INC, STATE2 movdqu IV, 0x10(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE3 - pxor 0x20(INP), STATE3 + movdqu 0x20(INP), INC + pxor INC, STATE3 movdqu IV, 0x20(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE4 - pxor 0x30(INP), STATE4 + movdqu 0x30(INP), INC + pxor INC, STATE4 movdqu IV, 0x30(OUTP) call *%r11 - pxor 0x00(OUTP), STATE1 + movdqu 0x00(OUTP), INC + pxor INC, STATE1 movdqu STATE1, 0x00(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE1 - pxor 0x40(INP), STATE1 + movdqu 0x40(INP), INC + pxor INC, STATE1 movdqu IV, 0x40(OUTP) - pxor 0x10(OUTP), STATE2 + movdqu 0x10(OUTP), INC + pxor INC, STATE2 movdqu STATE2, 0x10(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE2 - pxor 0x50(INP), STATE2 + movdqu 0x50(INP), INC + pxor INC, STATE2 movdqu IV, 0x50(OUTP) - pxor 0x20(OUTP), STATE3 + movdqu 0x20(OUTP), INC + pxor INC, STATE3 movdqu STATE3, 0x20(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE3 - pxor 0x60(INP), STATE3 + movdqu 0x60(INP), INC + pxor INC, STATE3 movdqu IV, 0x60(OUTP) - pxor 0x30(OUTP), STATE4 + movdqu 0x30(OUTP), INC + pxor INC, STATE4 movdqu STATE4, 0x30(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE4 - pxor 0x70(INP), STATE4 + movdqu 0x70(INP), INC + pxor INC, STATE4 movdqu IV, 0x70(OUTP) _aesni_gf128mul_x_ble() @@ -2738,16 +2750,20 @@ ENTRY(aesni_xts_crypt8) call *%r11 - pxor 0x40(OUTP), STATE1 + movdqu 0x40(OUTP), INC + pxor INC, STATE1 movdqu STATE1, 0x40(OUTP) - pxor 0x50(OUTP), STATE2 + movdqu 0x50(OUTP), INC + pxor INC, STATE2 movdqu STATE2, 0x50(OUTP) - pxor 0x60(OUTP), STATE3 + movdqu 0x60(OUTP), INC + pxor INC, STATE3 movdqu STATE3, 0x60(OUTP) - pxor 0x70(OUTP), STATE4 + movdqu 0x70(OUTP), INC + pxor INC, STATE4 movdqu STATE4, 0x70(OUTP) ret diff --git a/arch/x86/crypto/blowfish-avx2-asm_64.S b/arch/x86/crypto/blowfish-avx2-asm_64.S deleted file mode 100644 index 784452e0d05d..000000000000 --- a/arch/x86/crypto/blowfish-avx2-asm_64.S +++ /dev/null @@ -1,449 +0,0 @@ -/* - * x86_64/AVX2 assembler optimized version of Blowfish - * - * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -#include <linux/linkage.h> - -.file "blowfish-avx2-asm_64.S" - -.data -.align 32 - -.Lprefetch_mask: -.long 0*64 -.long 1*64 -.long 2*64 -.long 3*64 -.long 4*64 -.long 5*64 -.long 6*64 -.long 7*64 - -.Lbswap32_mask: -.long 0x00010203 -.long 0x04050607 -.long 0x08090a0b -.long 0x0c0d0e0f - -.Lbswap128_mask: - .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -.Lbswap_iv_mask: - .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0 - -.text -/* structure of crypto context */ -#define p 0 -#define s0 ((16 + 2) * 4) -#define s1 ((16 + 2 + (1 * 256)) * 4) -#define s2 ((16 + 2 + (2 * 256)) * 4) -#define s3 ((16 + 2 + (3 * 256)) * 4) - -/* register macros */ -#define CTX %rdi -#define RIO %rdx - -#define RS0 %rax -#define RS1 %r8 -#define RS2 %r9 -#define RS3 %r10 - -#define RLOOP %r11 -#define RLOOPd %r11d - -#define RXr0 %ymm8 -#define RXr1 %ymm9 -#define RXr2 %ymm10 -#define RXr3 %ymm11 -#define RXl0 %ymm12 -#define RXl1 %ymm13 -#define RXl2 %ymm14 -#define RXl3 %ymm15 - -/* temp regs */ -#define RT0 %ymm0 -#define RT0x %xmm0 -#define RT1 %ymm1 -#define RT1x %xmm1 -#define RIDX0 %ymm2 -#define RIDX1 %ymm3 -#define RIDX1x %xmm3 -#define RIDX2 %ymm4 -#define RIDX3 %ymm5 - -/* vpgatherdd mask and '-1' */ -#define RNOT %ymm6 - -/* byte mask, (-1 >> 24) */ -#define RBYTE %ymm7 - -/*********************************************************************** - * 32-way AVX2 blowfish - ***********************************************************************/ -#define F(xl, xr) \ - vpsrld $24, xl, RIDX0; \ - vpsrld $16, xl, RIDX1; \ - vpsrld $8, xl, RIDX2; \ - vpand RBYTE, RIDX1, RIDX1; \ - vpand RBYTE, RIDX2, RIDX2; \ - vpand RBYTE, xl, RIDX3; \ - \ - vpgatherdd RNOT, (RS0, RIDX0, 4), RT0; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - vpcmpeqd RIDX0, RIDX0, RIDX0; \ - \ - vpgatherdd RNOT, (RS1, RIDX1, 4), RT1; \ - vpcmpeqd RIDX1, RIDX1, RIDX1; \ - vpaddd RT0, RT1, RT0; \ - \ - vpgatherdd RIDX0, (RS2, RIDX2, 4), RT1; \ - vpxor RT0, RT1, RT0; \ - \ - vpgatherdd RIDX1, (RS3, RIDX3, 4), RT1; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - vpaddd RT0, RT1, RT0; \ - \ - vpxor RT0, xr, xr; - -#define add_roundkey(xl, nmem) \ - vpbroadcastd nmem, RT0; \ - vpxor RT0, xl ## 0, xl ## 0; \ - vpxor RT0, xl ## 1, xl ## 1; \ - vpxor RT0, xl ## 2, xl ## 2; \ - vpxor RT0, xl ## 3, xl ## 3; - -#define round_enc() \ - add_roundkey(RXr, p(CTX,RLOOP,4)); \ - F(RXl0, RXr0); \ - F(RXl1, RXr1); \ - F(RXl2, RXr2); \ - F(RXl3, RXr3); \ - \ - add_roundkey(RXl, p+4(CTX,RLOOP,4)); \ - F(RXr0, RXl0); \ - F(RXr1, RXl1); \ - F(RXr2, RXl2); \ - F(RXr3, RXl3); - -#define round_dec() \ - add_roundkey(RXr, p+4*2(CTX,RLOOP,4)); \ - F(RXl0, RXr0); \ - F(RXl1, RXr1); \ - F(RXl2, RXr2); \ - F(RXl3, RXr3); \ - \ - add_roundkey(RXl, p+4(CTX,RLOOP,4)); \ - F(RXr0, RXl0); \ - F(RXr1, RXl1); \ - F(RXr2, RXl2); \ - F(RXr3, RXl3); - -#define init_round_constants() \ - vpcmpeqd RNOT, RNOT, RNOT; \ - leaq s0(CTX), RS0; \ - leaq s1(CTX), RS1; \ - leaq s2(CTX), RS2; \ - leaq s3(CTX), RS3; \ - vpsrld $24, RNOT, RBYTE; - -#define transpose_2x2(x0, x1, t0) \ - vpunpckldq x0, x1, t0; \ - vpunpckhdq x0, x1, x1; \ - \ - vpunpcklqdq t0, x1, x0; \ - vpunpckhqdq t0, x1, x1; - -#define read_block(xl, xr) \ - vbroadcasti128 .Lbswap32_mask, RT1; \ - \ - vpshufb RT1, xl ## 0, xl ## 0; \ - vpshufb RT1, xr ## 0, xr ## 0; \ - vpshufb RT1, xl ## 1, xl ## 1; \ - vpshufb RT1, xr ## 1, xr ## 1; \ - vpshufb RT1, xl ## 2, xl ## 2; \ - vpshufb RT1, xr ## 2, xr ## 2; \ - vpshufb RT1, xl ## 3, xl ## 3; \ - vpshufb RT1, xr ## 3, xr ## 3; \ - \ - transpose_2x2(xl ## 0, xr ## 0, RT0); \ - transpose_2x2(xl ## 1, xr ## 1, RT0); \ - transpose_2x2(xl ## 2, xr ## 2, RT0); \ - transpose_2x2(xl ## 3, xr ## 3, RT0); - -#define write_block(xl, xr) \ - vbroadcasti128 .Lbswap32_mask, RT1; \ - \ - transpose_2x2(xl ## 0, xr ## 0, RT0); \ - transpose_2x2(xl ## 1, xr ## 1, RT0); \ - transpose_2x2(xl ## 2, xr ## 2, RT0); \ - transpose_2x2(xl ## 3, xr ## 3, RT0); \ - \ - vpshufb RT1, xl ## 0, xl ## 0; \ - vpshufb RT1, xr ## 0, xr ## 0; \ - vpshufb RT1, xl ## 1, xl ## 1; \ - vpshufb RT1, xr ## 1, xr ## 1; \ - vpshufb RT1, xl ## 2, xl ## 2; \ - vpshufb RT1, xr ## 2, xr ## 2; \ - vpshufb RT1, xl ## 3, xl ## 3; \ - vpshufb RT1, xr ## 3, xr ## 3; - -.align 8 -__blowfish_enc_blk32: - /* input: - * %rdi: ctx, CTX - * RXl0..4, RXr0..4: plaintext - * output: - * RXl0..4, RXr0..4: ciphertext (RXl <=> RXr swapped) - */ - init_round_constants(); - - read_block(RXl, RXr); - - movl $1, RLOOPd; - add_roundkey(RXl, p+4*(0)(CTX)); - -.align 4 -.L__enc_loop: - round_enc(); - - leal 2(RLOOPd), RLOOPd; - cmpl $17, RLOOPd; - jne .L__enc_loop; - - add_roundkey(RXr, p+4*(17)(CTX)); - - write_block(RXl, RXr); - - ret; -ENDPROC(__blowfish_enc_blk32) - -.align 8 -__blowfish_dec_blk32: - /* input: - * %rdi: ctx, CTX - * RXl0..4, RXr0..4: ciphertext - * output: - * RXl0..4, RXr0..4: plaintext (RXl <=> RXr swapped) - */ - init_round_constants(); - - read_block(RXl, RXr); - - movl $14, RLOOPd; - add_roundkey(RXl, p+4*(17)(CTX)); - -.align 4 -.L__dec_loop: - round_dec(); - - addl $-2, RLOOPd; - jns .L__dec_loop; - - add_roundkey(RXr, p+4*(0)(CTX)); - - write_block(RXl, RXr); - - ret; -ENDPROC(__blowfish_dec_blk32) - -ENTRY(blowfish_ecb_enc_32way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - */ - - vzeroupper; - - vmovdqu 0*32(%rdx), RXl0; - vmovdqu 1*32(%rdx), RXr0; - vmovdqu 2*32(%rdx), RXl1; - vmovdqu 3*32(%rdx), RXr1; - vmovdqu 4*32(%rdx), RXl2; - vmovdqu 5*32(%rdx), RXr2; - vmovdqu 6*32(%rdx), RXl3; - vmovdqu 7*32(%rdx), RXr3; - - call __blowfish_enc_blk32; - - vmovdqu RXr0, 0*32(%rsi); - vmovdqu RXl0, 1*32(%rsi); - vmovdqu RXr1, 2*32(%rsi); - vmovdqu RXl1, 3*32(%rsi); - vmovdqu RXr2, 4*32(%rsi); - vmovdqu RXl2, 5*32(%rsi); - vmovdqu RXr3, 6*32(%rsi); - vmovdqu RXl3, 7*32(%rsi); - - vzeroupper; - - ret; -ENDPROC(blowfish_ecb_enc_32way) - -ENTRY(blowfish_ecb_dec_32way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - */ - - vzeroupper; - - vmovdqu 0*32(%rdx), RXl0; - vmovdqu 1*32(%rdx), RXr0; - vmovdqu 2*32(%rdx), RXl1; - vmovdqu 3*32(%rdx), RXr1; - vmovdqu 4*32(%rdx), RXl2; - vmovdqu 5*32(%rdx), RXr2; - vmovdqu 6*32(%rdx), RXl3; - vmovdqu 7*32(%rdx), RXr3; - - call __blowfish_dec_blk32; - - vmovdqu RXr0, 0*32(%rsi); - vmovdqu RXl0, 1*32(%rsi); - vmovdqu RXr1, 2*32(%rsi); - vmovdqu RXl1, 3*32(%rsi); - vmovdqu RXr2, 4*32(%rsi); - vmovdqu RXl2, 5*32(%rsi); - vmovdqu RXr3, 6*32(%rsi); - vmovdqu RXl3, 7*32(%rsi); - - vzeroupper; - - ret; -ENDPROC(blowfish_ecb_dec_32way) - -ENTRY(blowfish_cbc_dec_32way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - */ - - vzeroupper; - - vmovdqu 0*32(%rdx), RXl0; - vmovdqu 1*32(%rdx), RXr0; - vmovdqu 2*32(%rdx), RXl1; - vmovdqu 3*32(%rdx), RXr1; - vmovdqu 4*32(%rdx), RXl2; - vmovdqu 5*32(%rdx), RXr2; - vmovdqu 6*32(%rdx), RXl3; - vmovdqu 7*32(%rdx), RXr3; - - call __blowfish_dec_blk32; - - /* xor with src */ - vmovq (%rdx), RT0x; - vpshufd $0x4f, RT0x, RT0x; - vinserti128 $1, 8(%rdx), RT0, RT0; - vpxor RT0, RXr0, RXr0; - vpxor 0*32+24(%rdx), RXl0, RXl0; - vpxor 1*32+24(%rdx), RXr1, RXr1; - vpxor 2*32+24(%rdx), RXl1, RXl1; - vpxor 3*32+24(%rdx), RXr2, RXr2; - vpxor 4*32+24(%rdx), RXl2, RXl2; - vpxor 5*32+24(%rdx), RXr3, RXr3; - vpxor 6*32+24(%rdx), RXl3, RXl3; - - vmovdqu RXr0, (0*32)(%rsi); - vmovdqu RXl0, (1*32)(%rsi); - vmovdqu RXr1, (2*32)(%rsi); - vmovdqu RXl1, (3*32)(%rsi); - vmovdqu RXr2, (4*32)(%rsi); - vmovdqu RXl2, (5*32)(%rsi); - vmovdqu RXr3, (6*32)(%rsi); - vmovdqu RXl3, (7*32)(%rsi); - - vzeroupper; - - ret; -ENDPROC(blowfish_cbc_dec_32way) - -ENTRY(blowfish_ctr_32way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (big endian, 64bit) - */ - - vzeroupper; - - vpcmpeqd RT0, RT0, RT0; - vpsrldq $8, RT0, RT0; /* a: -1, b: 0, c: -1, d: 0 */ - - vpcmpeqd RT1x, RT1x, RT1x; - vpaddq RT1x, RT1x, RT1x; /* a: -2, b: -2 */ - vpxor RIDX0, RIDX0, RIDX0; - vinserti128 $1, RT1x, RIDX0, RIDX0; /* a: 0, b: 0, c: -2, d: -2 */ - - vpaddq RIDX0, RT0, RT0; /* a: -1, b: 0, c: -3, d: -2 */ - - vpcmpeqd RT1, RT1, RT1; - vpaddq RT1, RT1, RT1; /* a: -2, b: -2, c: -2, d: -2 */ - vpaddq RT1, RT1, RIDX2; /* a: -4, b: -4, c: -4, d: -4 */ - - vbroadcasti128 .Lbswap_iv_mask, RIDX0; - vbroadcasti128 .Lbswap128_mask, RIDX1; - - /* load IV and byteswap */ - vmovq (%rcx), RT1x; - vinserti128 $1, RT1x, RT1, RT1; /* a: BE, b: 0, c: BE, d: 0 */ - vpshufb RIDX0, RT1, RT1; /* a: LE, b: LE, c: LE, d: LE */ - - /* construct IVs */ - vpsubq RT0, RT1, RT1; /* a: le1, b: le0, c: le3, d: le2 */ - vpshufb RIDX1, RT1, RXl0; /* a: be0, b: be1, c: be2, d: be3 */ - vpsubq RIDX2, RT1, RT1; /* le5, le4, le7, le6 */ - vpshufb RIDX1, RT1, RXr0; /* be4, be5, be6, be7 */ - vpsubq RIDX2, RT1, RT1; - vpshufb RIDX1, RT1, RXl1; - vpsubq RIDX2, RT1, RT1; - vpshufb RIDX1, RT1, RXr1; - vpsubq RIDX2, RT1, RT1; - vpshufb RIDX1, RT1, RXl2; - vpsubq RIDX2, RT1, RT1; - vpshufb RIDX1, RT1, RXr2; - vpsubq RIDX2, RT1, RT1; - vpshufb RIDX1, RT1, RXl3; - vpsubq RIDX2, RT1, RT1; - vpshufb RIDX1, RT1, RXr3; - - /* store last IV */ - vpsubq RIDX2, RT1, RT1; /* a: le33, b: le32, ... */ - vpshufb RIDX1x, RT1x, RT1x; /* a: be32, ... */ - vmovq RT1x, (%rcx); - - call __blowfish_enc_blk32; - - /* dst = src ^ iv */ - vpxor 0*32(%rdx), RXr0, RXr0; - vpxor 1*32(%rdx), RXl0, RXl0; - vpxor 2*32(%rdx), RXr1, RXr1; - vpxor 3*32(%rdx), RXl1, RXl1; - vpxor 4*32(%rdx), RXr2, RXr2; - vpxor 5*32(%rdx), RXl2, RXl2; - vpxor 6*32(%rdx), RXr3, RXr3; - vpxor 7*32(%rdx), RXl3, RXl3; - vmovdqu RXr0, (0*32)(%rsi); - vmovdqu RXl0, (1*32)(%rsi); - vmovdqu RXr1, (2*32)(%rsi); - vmovdqu RXl1, (3*32)(%rsi); - vmovdqu RXr2, (4*32)(%rsi); - vmovdqu RXl2, (5*32)(%rsi); - vmovdqu RXr3, (6*32)(%rsi); - vmovdqu RXl3, (7*32)(%rsi); - - vzeroupper; - - ret; -ENDPROC(blowfish_ctr_32way) diff --git a/arch/x86/crypto/blowfish_avx2_glue.c b/arch/x86/crypto/blowfish_avx2_glue.c deleted file mode 100644 index 4417e9aea78d..000000000000 --- a/arch/x86/crypto/blowfish_avx2_glue.c +++ /dev/null @@ -1,585 +0,0 @@ -/* - * Glue Code for x86_64/AVX2 assembler optimized version of Blowfish - * - * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> - * - * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: - * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> - * CTR part based on code (crypto/ctr.c) by: - * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/crypto.h> -#include <linux/err.h> -#include <crypto/algapi.h> -#include <crypto/blowfish.h> -#include <crypto/cryptd.h> -#include <crypto/ctr.h> -#include <asm/i387.h> -#include <asm/xcr.h> -#include <asm/xsave.h> -#include <asm/crypto/blowfish.h> -#include <asm/crypto/ablk_helper.h> -#include <crypto/scatterwalk.h> - -#define BF_AVX2_PARALLEL_BLOCKS 32 - -/* 32-way AVX2 parallel cipher functions */ -asmlinkage void blowfish_ecb_enc_32way(struct bf_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void blowfish_ecb_dec_32way(struct bf_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void blowfish_cbc_dec_32way(struct bf_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void blowfish_ctr_32way(struct bf_ctx *ctx, u8 *dst, const u8 *src, - __be64 *iv); - -static inline bool bf_fpu_begin(bool fpu_enabled, unsigned int nbytes) -{ - if (fpu_enabled) - return true; - - /* FPU is only used when chunk to be processed is large enough, so - * do not enable FPU until it is necessary. - */ - if (nbytes < BF_BLOCK_SIZE * BF_AVX2_PARALLEL_BLOCKS) - return false; - - kernel_fpu_begin(); - return true; -} - -static inline void bf_fpu_end(bool fpu_enabled) -{ - if (fpu_enabled) - kernel_fpu_end(); -} - -static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, - bool enc) -{ - bool fpu_enabled = false; - struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - const unsigned int bsize = BF_BLOCK_SIZE; - unsigned int nbytes; - int err; - - err = blkcipher_walk_virt(desc, walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - while ((nbytes = walk->nbytes)) { - u8 *wsrc = walk->src.virt.addr; - u8 *wdst = walk->dst.virt.addr; - - fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes); - - /* Process multi-block AVX2 batch */ - if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) { - do { - if (enc) - blowfish_ecb_enc_32way(ctx, wdst, wsrc); - else - blowfish_ecb_dec_32way(ctx, wdst, wsrc); - - wsrc += bsize * BF_AVX2_PARALLEL_BLOCKS; - wdst += bsize * BF_AVX2_PARALLEL_BLOCKS; - nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS; - } while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Process multi-block batch */ - if (nbytes >= bsize * BF_PARALLEL_BLOCKS) { - do { - if (enc) - blowfish_enc_blk_4way(ctx, wdst, wsrc); - else - blowfish_dec_blk_4way(ctx, wdst, wsrc); - - wsrc += bsize * BF_PARALLEL_BLOCKS; - wdst += bsize * BF_PARALLEL_BLOCKS; - nbytes -= bsize * BF_PARALLEL_BLOCKS; - } while (nbytes >= bsize * BF_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - if (enc) - blowfish_enc_blk(ctx, wdst, wsrc); - else - blowfish_dec_blk(ctx, wdst, wsrc); - - wsrc += bsize; - wdst += bsize; - nbytes -= bsize; - } while (nbytes >= bsize); - -done: - err = blkcipher_walk_done(desc, walk, nbytes); - } - - bf_fpu_end(fpu_enabled); - return err; -} - -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, true); -} - -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, false); -} - -static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - unsigned int bsize = BF_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u64 *src = (u64 *)walk->src.virt.addr; - u64 *dst = (u64 *)walk->dst.virt.addr; - u64 *iv = (u64 *)walk->iv; - - do { - *dst = *src ^ *iv; - blowfish_enc_blk(ctx, (u8 *)dst, (u8 *)dst); - iv = dst; - - src += 1; - dst += 1; - nbytes -= bsize; - } while (nbytes >= bsize); - - *(u64 *)walk->iv = *iv; - return nbytes; -} - -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - - while ((nbytes = walk.nbytes)) { - nbytes = __cbc_encrypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } - - return err; -} - -static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - const unsigned int bsize = BF_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u64 *src = (u64 *)walk->src.virt.addr; - u64 *dst = (u64 *)walk->dst.virt.addr; - u64 last_iv; - int i; - - /* Start of the last block. */ - src += nbytes / bsize - 1; - dst += nbytes / bsize - 1; - - last_iv = *src; - - /* Process multi-block AVX2 batch */ - if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) { - do { - nbytes -= bsize * (BF_AVX2_PARALLEL_BLOCKS - 1); - src -= BF_AVX2_PARALLEL_BLOCKS - 1; - dst -= BF_AVX2_PARALLEL_BLOCKS - 1; - - blowfish_cbc_dec_32way(ctx, (u8 *)dst, (u8 *)src); - - nbytes -= bsize; - if (nbytes < bsize) - goto done; - - *dst ^= *(src - 1); - src -= 1; - dst -= 1; - } while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Process multi-block batch */ - if (nbytes >= bsize * BF_PARALLEL_BLOCKS) { - u64 ivs[BF_PARALLEL_BLOCKS - 1]; - - do { - nbytes -= bsize * (BF_PARALLEL_BLOCKS - 1); - src -= BF_PARALLEL_BLOCKS - 1; - dst -= BF_PARALLEL_BLOCKS - 1; - - for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++) - ivs[i] = src[i]; - - blowfish_dec_blk_4way(ctx, (u8 *)dst, (u8 *)src); - - for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++) - dst[i + 1] ^= ivs[i]; - - nbytes -= bsize; - if (nbytes < bsize) - goto done; - - *dst ^= *(src - 1); - src -= 1; - dst -= 1; - } while (nbytes >= bsize * BF_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - for (;;) { - blowfish_dec_blk(ctx, (u8 *)dst, (u8 *)src); - - nbytes -= bsize; - if (nbytes < bsize) - break; - - *dst ^= *(src - 1); - src -= 1; - dst -= 1; - } - -done: - *dst ^= *(u64 *)walk->iv; - *(u64 *)walk->iv = last_iv; - - return nbytes; -} - -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - bool fpu_enabled = false; - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - while ((nbytes = walk.nbytes)) { - fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes); - nbytes = __cbc_decrypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } - - bf_fpu_end(fpu_enabled); - return err; -} - -static void ctr_crypt_final(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - u8 *ctrblk = walk->iv; - u8 keystream[BF_BLOCK_SIZE]; - u8 *src = walk->src.virt.addr; - u8 *dst = walk->dst.virt.addr; - unsigned int nbytes = walk->nbytes; - - blowfish_enc_blk(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); - - crypto_inc(ctrblk, BF_BLOCK_SIZE); -} - -static unsigned int __ctr_crypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - unsigned int bsize = BF_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u64 *src = (u64 *)walk->src.virt.addr; - u64 *dst = (u64 *)walk->dst.virt.addr; - int i; - - /* Process multi-block AVX2 batch */ - if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) { - do { - blowfish_ctr_32way(ctx, (u8 *)dst, (u8 *)src, - (__be64 *)walk->iv); - - src += BF_AVX2_PARALLEL_BLOCKS; - dst += BF_AVX2_PARALLEL_BLOCKS; - nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS; - } while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Process four block batch */ - if (nbytes >= bsize * BF_PARALLEL_BLOCKS) { - __be64 ctrblocks[BF_PARALLEL_BLOCKS]; - u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv); - - do { - /* create ctrblks for parallel encrypt */ - for (i = 0; i < BF_PARALLEL_BLOCKS; i++) { - if (dst != src) - dst[i] = src[i]; - - ctrblocks[i] = cpu_to_be64(ctrblk++); - } - - blowfish_enc_blk_xor_4way(ctx, (u8 *)dst, - (u8 *)ctrblocks); - - src += BF_PARALLEL_BLOCKS; - dst += BF_PARALLEL_BLOCKS; - nbytes -= bsize * BF_PARALLEL_BLOCKS; - } while (nbytes >= bsize * BF_PARALLEL_BLOCKS); - - *(__be64 *)walk->iv = cpu_to_be64(ctrblk); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - u64 ctrblk; - - if (dst != src) - *dst = *src; - - ctrblk = *(u64 *)walk->iv; - be64_add_cpu((__be64 *)walk->iv, 1); - - blowfish_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk); - - src += 1; - dst += 1; - } while ((nbytes -= bsize) >= bsize); - -done: - return nbytes; -} - -static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - bool fpu_enabled = false; - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, BF_BLOCK_SIZE); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) { - fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes); - nbytes = __ctr_crypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } - - bf_fpu_end(fpu_enabled); - - if (walk.nbytes) { - ctr_crypt_final(desc, &walk); - err = blkcipher_walk_done(desc, &walk, 0); - } - - return err; -} - -static struct crypto_alg bf_algs[6] = { { - .cra_name = "__ecb-blowfish-avx2", - .cra_driver_name = "__driver-ecb-blowfish-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = BF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct bf_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = BF_MIN_KEY_SIZE, - .max_keysize = BF_MAX_KEY_SIZE, - .setkey = blowfish_setkey, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}, { - .cra_name = "__cbc-blowfish-avx2", - .cra_driver_name = "__driver-cbc-blowfish-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = BF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct bf_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = BF_MIN_KEY_SIZE, - .max_keysize = BF_MAX_KEY_SIZE, - .setkey = blowfish_setkey, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, - }, -}, { - .cra_name = "__ctr-blowfish-avx2", - .cra_driver_name = "__driver-ctr-blowfish-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct bf_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = BF_MIN_KEY_SIZE, - .max_keysize = BF_MAX_KEY_SIZE, - .ivsize = BF_BLOCK_SIZE, - .setkey = blowfish_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, - }, -}, { - .cra_name = "ecb(blowfish)", - .cra_driver_name = "ecb-blowfish-avx2", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = BF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = BF_MIN_KEY_SIZE, - .max_keysize = BF_MAX_KEY_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "cbc(blowfish)", - .cra_driver_name = "cbc-blowfish-avx2", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = BF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = BF_MIN_KEY_SIZE, - .max_keysize = BF_MAX_KEY_SIZE, - .ivsize = BF_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = __ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "ctr(blowfish)", - .cra_driver_name = "ctr-blowfish-avx2", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = BF_MIN_KEY_SIZE, - .max_keysize = BF_MAX_KEY_SIZE, - .ivsize = BF_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_encrypt, - .geniv = "chainiv", - }, - }, -} }; - - -static int __init init(void) -{ - u64 xcr0; - - if (!cpu_has_avx2 || !cpu_has_osxsave) { - pr_info("AVX2 instructions are not detected.\n"); - return -ENODEV; - } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); - return -ENODEV; - } - - return crypto_register_algs(bf_algs, ARRAY_SIZE(bf_algs)); -} - -static void __exit fini(void) -{ - crypto_unregister_algs(bf_algs, ARRAY_SIZE(bf_algs)); -} - -module_init(init); -module_exit(fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Blowfish Cipher Algorithm, AVX2 optimized"); -MODULE_ALIAS("blowfish"); -MODULE_ALIAS("blowfish-asm"); diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 3548d76dbaa9..50ec333b70e6 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -1,7 +1,7 @@ /* * Glue Code for assembler optimized version of Blowfish * - * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> * * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> @@ -32,24 +32,40 @@ #include <linux/module.h> #include <linux/types.h> #include <crypto/algapi.h> -#include <asm/crypto/blowfish.h> /* regular block cipher functions */ asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src, bool xor); -EXPORT_SYMBOL_GPL(__blowfish_enc_blk); - asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src); -EXPORT_SYMBOL_GPL(blowfish_dec_blk); /* 4-way parallel cipher functions */ asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, const u8 *src, bool xor); -EXPORT_SYMBOL_GPL(__blowfish_enc_blk_4way); - asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst, const u8 *src); -EXPORT_SYMBOL_GPL(blowfish_dec_blk_4way); + +static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src) +{ + __blowfish_enc_blk(ctx, dst, src, false); +} + +static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst, + const u8 *src) +{ + __blowfish_enc_blk(ctx, dst, src, true); +} + +static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, + const u8 *src) +{ + __blowfish_enc_blk_4way(ctx, dst, src, false); +} + +static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst, + const u8 *src) +{ + __blowfish_enc_blk_4way(ctx, dst, src, true); +} static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index 91a1878fcc3e..0e0b8863a34b 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -51,16 +51,6 @@ #define ymm14_x xmm14 #define ymm15_x xmm15 -/* - * AES-NI instructions do not support ymmX registers, so we need splitting and - * merging. - */ -#define vaesenclast256(zero, yreg, tmp) \ - vextracti128 $1, yreg, tmp##_x; \ - vaesenclast zero##_x, yreg##_x, yreg##_x; \ - vaesenclast zero##_x, tmp##_x, tmp##_x; \ - vinserti128 $1, tmp##_x, yreg, yreg; - /********************************************************************** 32-way camellia **********************************************************************/ @@ -79,46 +69,70 @@ * S-function with AES subbytes \ */ \ vbroadcasti128 .Linv_shift_row, t4; \ - vpbroadcastb .L0f0f0f0f, t7; \ - vbroadcasti128 .Lpre_tf_lo_s1, t0; \ - vbroadcasti128 .Lpre_tf_hi_s1, t1; \ + vpbroadcastd .L0f0f0f0f, t7; \ + vbroadcasti128 .Lpre_tf_lo_s1, t5; \ + vbroadcasti128 .Lpre_tf_hi_s1, t6; \ + vbroadcasti128 .Lpre_tf_lo_s4, t2; \ + vbroadcasti128 .Lpre_tf_hi_s4, t3; \ \ /* AES inverse shift rows */ \ vpshufb t4, x0, x0; \ vpshufb t4, x7, x7; \ - vpshufb t4, x1, x1; \ - vpshufb t4, x4, x4; \ - vpshufb t4, x2, x2; \ - vpshufb t4, x5, x5; \ vpshufb t4, x3, x3; \ vpshufb t4, x6, x6; \ + vpshufb t4, x2, x2; \ + vpshufb t4, x5, x5; \ + vpshufb t4, x1, x1; \ + vpshufb t4, x4, x4; \ \ /* prefilter sboxes 1, 2 and 3 */ \ - vbroadcasti128 .Lpre_tf_lo_s4, t2; \ - vbroadcasti128 .Lpre_tf_hi_s4, t3; \ - filter_8bit(x0, t0, t1, t7, t6); \ - filter_8bit(x7, t0, t1, t7, t6); \ - filter_8bit(x1, t0, t1, t7, t6); \ - filter_8bit(x4, t0, t1, t7, t6); \ - filter_8bit(x2, t0, t1, t7, t6); \ - filter_8bit(x5, t0, t1, t7, t6); \ - \ /* prefilter sbox 4 */ \ + filter_8bit(x0, t5, t6, t7, t4); \ + filter_8bit(x7, t5, t6, t7, t4); \ + vextracti128 $1, x0, t0##_x; \ + vextracti128 $1, x7, t1##_x; \ + filter_8bit(x3, t2, t3, t7, t4); \ + filter_8bit(x6, t2, t3, t7, t4); \ + vextracti128 $1, x3, t3##_x; \ + vextracti128 $1, x6, t2##_x; \ + filter_8bit(x2, t5, t6, t7, t4); \ + filter_8bit(x5, t5, t6, t7, t4); \ + filter_8bit(x1, t5, t6, t7, t4); \ + filter_8bit(x4, t5, t6, t7, t4); \ + \ vpxor t4##_x, t4##_x, t4##_x; \ - filter_8bit(x3, t2, t3, t7, t6); \ - filter_8bit(x6, t2, t3, t7, t6); \ \ /* AES subbytes + AES shift rows */ \ + vextracti128 $1, x2, t6##_x; \ + vextracti128 $1, x5, t5##_x; \ + vaesenclast t4##_x, x0##_x, x0##_x; \ + vaesenclast t4##_x, t0##_x, t0##_x; \ + vinserti128 $1, t0##_x, x0, x0; \ + vaesenclast t4##_x, x7##_x, x7##_x; \ + vaesenclast t4##_x, t1##_x, t1##_x; \ + vinserti128 $1, t1##_x, x7, x7; \ + vaesenclast t4##_x, x3##_x, x3##_x; \ + vaesenclast t4##_x, t3##_x, t3##_x; \ + vinserti128 $1, t3##_x, x3, x3; \ + vaesenclast t4##_x, x6##_x, x6##_x; \ + vaesenclast t4##_x, t2##_x, t2##_x; \ + vinserti128 $1, t2##_x, x6, x6; \ + vextracti128 $1, x1, t3##_x; \ + vextracti128 $1, x4, t2##_x; \ vbroadcasti128 .Lpost_tf_lo_s1, t0; \ vbroadcasti128 .Lpost_tf_hi_s1, t1; \ - vaesenclast256(t4, x0, t5); \ - vaesenclast256(t4, x7, t5); \ - vaesenclast256(t4, x1, t5); \ - vaesenclast256(t4, x4, t5); \ - vaesenclast256(t4, x2, t5); \ - vaesenclast256(t4, x5, t5); \ - vaesenclast256(t4, x3, t5); \ - vaesenclast256(t4, x6, t5); \ + vaesenclast t4##_x, x2##_x, x2##_x; \ + vaesenclast t4##_x, t6##_x, t6##_x; \ + vinserti128 $1, t6##_x, x2, x2; \ + vaesenclast t4##_x, x5##_x, x5##_x; \ + vaesenclast t4##_x, t5##_x, t5##_x; \ + vinserti128 $1, t5##_x, x5, x5; \ + vaesenclast t4##_x, x1##_x, x1##_x; \ + vaesenclast t4##_x, t3##_x, t3##_x; \ + vinserti128 $1, t3##_x, x1, x1; \ + vaesenclast t4##_x, x4##_x, x4##_x; \ + vaesenclast t4##_x, t2##_x, t2##_x; \ + vinserti128 $1, t2##_x, x4, x4; \ \ /* postfilter sboxes 1 and 4 */ \ vbroadcasti128 .Lpost_tf_lo_s3, t2; \ @@ -139,22 +153,12 @@ /* postfilter sbox 2 */ \ filter_8bit(x1, t4, t5, t7, t2); \ filter_8bit(x4, t4, t5, t7, t2); \ + vpxor t7, t7, t7; \ \ vpsrldq $1, t0, t1; \ vpsrldq $2, t0, t2; \ + vpshufb t7, t1, t1; \ vpsrldq $3, t0, t3; \ - vpsrldq $4, t0, t4; \ - vpsrldq $5, t0, t5; \ - vpsrldq $6, t0, t6; \ - vpsrldq $7, t0, t7; \ - vpbroadcastb t0##_x, t0; \ - vpbroadcastb t1##_x, t1; \ - vpbroadcastb t2##_x, t2; \ - vpbroadcastb t3##_x, t3; \ - vpbroadcastb t4##_x, t4; \ - vpbroadcastb t6##_x, t6; \ - vpbroadcastb t5##_x, t5; \ - vpbroadcastb t7##_x, t7; \ \ /* P-function */ \ vpxor x5, x0, x0; \ @@ -162,11 +166,21 @@ vpxor x7, x2, x2; \ vpxor x4, x3, x3; \ \ + vpshufb t7, t2, t2; \ + vpsrldq $4, t0, t4; \ + vpshufb t7, t3, t3; \ + vpsrldq $5, t0, t5; \ + vpshufb t7, t4, t4; \ + \ vpxor x2, x4, x4; \ vpxor x3, x5, x5; \ vpxor x0, x6, x6; \ vpxor x1, x7, x7; \ \ + vpsrldq $6, t0, t6; \ + vpshufb t7, t5, t5; \ + vpshufb t7, t6, t6; \ + \ vpxor x7, x0, x0; \ vpxor x4, x1, x1; \ vpxor x5, x2, x2; \ @@ -179,12 +193,16 @@ \ /* Add key material and result to CD (x becomes new CD) */ \ \ - vpxor t7, x0, x0; \ - vpxor 4 * 32(mem_cd), x0, x0; \ - \ vpxor t6, x1, x1; \ vpxor 5 * 32(mem_cd), x1, x1; \ \ + vpsrldq $7, t0, t6; \ + vpshufb t7, t0, t0; \ + vpshufb t7, t6, t7; \ + \ + vpxor t7, x0, x0; \ + vpxor 4 * 32(mem_cd), x0, x0; \ + \ vpxor t5, x2, x2; \ vpxor 6 * 32(mem_cd), x2, x2; \ \ @@ -204,7 +222,7 @@ vpxor 3 * 32(mem_cd), x7, x7; /* - * Size optimization... with inlined roundsm16 binary would be over 5 times + * Size optimization... with inlined roundsm32 binary would be over 5 times * larger and would only marginally faster. */ .align 8 @@ -324,13 +342,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) */ \ vpbroadcastd kll, t0; /* only lowest 32-bit used */ \ vpxor tt0, tt0, tt0; \ - vpbroadcastb t0##_x, t3; \ + vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t2; \ + vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t1; \ + vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t0; \ + vpshufb tt0, t0, t0; \ \ vpand l0, t0, t0; \ vpand l1, t1, t1; \ @@ -340,6 +358,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ \ vpxor l4, t0, l4; \ + vpbroadcastd krr, t0; /* only lowest 32-bit used */ \ vmovdqu l4, 4 * 32(l); \ vpxor l5, t1, l5; \ vmovdqu l5, 5 * 32(l); \ @@ -354,14 +373,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) * rl ^= t2; \ */ \ \ - vpbroadcastd krr, t0; /* only lowest 32-bit used */ \ - vpbroadcastb t0##_x, t3; \ + vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t2; \ + vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t1; \ + vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t0; \ + vpshufb tt0, t0, t0; \ \ vpor 4 * 32(r), t0, t0; \ vpor 5 * 32(r), t1, t1; \ @@ -373,6 +391,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) vpxor 2 * 32(r), t2, t2; \ vpxor 3 * 32(r), t3, t3; \ vmovdqu t0, 0 * 32(r); \ + vpbroadcastd krl, t0; /* only lowest 32-bit used */ \ vmovdqu t1, 1 * 32(r); \ vmovdqu t2, 2 * 32(r); \ vmovdqu t3, 3 * 32(r); \ @@ -382,14 +401,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) * t2 &= rl; \ * rr ^= rol32(t2, 1); \ */ \ - vpbroadcastd krl, t0; /* only lowest 32-bit used */ \ - vpbroadcastb t0##_x, t3; \ + vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t2; \ + vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t1; \ + vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t0; \ + vpshufb tt0, t0, t0; \ \ vpand 0 * 32(r), t0, t0; \ vpand 1 * 32(r), t1, t1; \ @@ -403,6 +421,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) vpxor 6 * 32(r), t2, t2; \ vpxor 7 * 32(r), t3, t3; \ vmovdqu t0, 4 * 32(r); \ + vpbroadcastd klr, t0; /* only lowest 32-bit used */ \ vmovdqu t1, 5 * 32(r); \ vmovdqu t2, 6 * 32(r); \ vmovdqu t3, 7 * 32(r); \ @@ -413,14 +432,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) * ll ^= t0; \ */ \ \ - vpbroadcastd klr, t0; /* only lowest 32-bit used */ \ - vpbroadcastb t0##_x, t3; \ + vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t2; \ + vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t1; \ + vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t0; \ + vpshufb tt0, t0, t0; \ \ vpor l4, t0, t0; \ vpor l5, t1, t1; \ diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index 5cb86ccd4acb..c171dcbf192d 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -62,7 +62,7 @@ static void camellia_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) } /* camellia sboxes */ -const u64 camellia_sp10011110[256] = { +__visible const u64 camellia_sp10011110[256] = { 0x7000007070707000ULL, 0x8200008282828200ULL, 0x2c00002c2c2c2c00ULL, 0xec0000ecececec00ULL, 0xb30000b3b3b3b300ULL, 0x2700002727272700ULL, 0xc00000c0c0c0c000ULL, 0xe50000e5e5e5e500ULL, 0xe40000e4e4e4e400ULL, @@ -151,7 +151,7 @@ const u64 camellia_sp10011110[256] = { 0x9e00009e9e9e9e00ULL, }; -const u64 camellia_sp22000222[256] = { +__visible const u64 camellia_sp22000222[256] = { 0xe0e0000000e0e0e0ULL, 0x0505000000050505ULL, 0x5858000000585858ULL, 0xd9d9000000d9d9d9ULL, 0x6767000000676767ULL, 0x4e4e0000004e4e4eULL, 0x8181000000818181ULL, 0xcbcb000000cbcbcbULL, 0xc9c9000000c9c9c9ULL, @@ -240,7 +240,7 @@ const u64 camellia_sp22000222[256] = { 0x3d3d0000003d3d3dULL, }; -const u64 camellia_sp03303033[256] = { +__visible const u64 camellia_sp03303033[256] = { 0x0038380038003838ULL, 0x0041410041004141ULL, 0x0016160016001616ULL, 0x0076760076007676ULL, 0x00d9d900d900d9d9ULL, 0x0093930093009393ULL, 0x0060600060006060ULL, 0x00f2f200f200f2f2ULL, 0x0072720072007272ULL, @@ -329,7 +329,7 @@ const u64 camellia_sp03303033[256] = { 0x004f4f004f004f4fULL, }; -const u64 camellia_sp00444404[256] = { +__visible const u64 camellia_sp00444404[256] = { 0x0000707070700070ULL, 0x00002c2c2c2c002cULL, 0x0000b3b3b3b300b3ULL, 0x0000c0c0c0c000c0ULL, 0x0000e4e4e4e400e4ULL, 0x0000575757570057ULL, 0x0000eaeaeaea00eaULL, 0x0000aeaeaeae00aeULL, 0x0000232323230023ULL, @@ -418,7 +418,7 @@ const u64 camellia_sp00444404[256] = { 0x00009e9e9e9e009eULL, }; -const u64 camellia_sp02220222[256] = { +__visible const u64 camellia_sp02220222[256] = { 0x00e0e0e000e0e0e0ULL, 0x0005050500050505ULL, 0x0058585800585858ULL, 0x00d9d9d900d9d9d9ULL, 0x0067676700676767ULL, 0x004e4e4e004e4e4eULL, 0x0081818100818181ULL, 0x00cbcbcb00cbcbcbULL, 0x00c9c9c900c9c9c9ULL, @@ -507,7 +507,7 @@ const u64 camellia_sp02220222[256] = { 0x003d3d3d003d3d3dULL, }; -const u64 camellia_sp30333033[256] = { +__visible const u64 camellia_sp30333033[256] = { 0x3800383838003838ULL, 0x4100414141004141ULL, 0x1600161616001616ULL, 0x7600767676007676ULL, 0xd900d9d9d900d9d9ULL, 0x9300939393009393ULL, 0x6000606060006060ULL, 0xf200f2f2f200f2f2ULL, 0x7200727272007272ULL, @@ -596,7 +596,7 @@ const u64 camellia_sp30333033[256] = { 0x4f004f4f4f004f4fULL, }; -const u64 camellia_sp44044404[256] = { +__visible const u64 camellia_sp44044404[256] = { 0x7070007070700070ULL, 0x2c2c002c2c2c002cULL, 0xb3b300b3b3b300b3ULL, 0xc0c000c0c0c000c0ULL, 0xe4e400e4e4e400e4ULL, 0x5757005757570057ULL, 0xeaea00eaeaea00eaULL, 0xaeae00aeaeae00aeULL, 0x2323002323230023ULL, @@ -685,7 +685,7 @@ const u64 camellia_sp44044404[256] = { 0x9e9e009e9e9e009eULL, }; -const u64 camellia_sp11101110[256] = { +__visible const u64 camellia_sp11101110[256] = { 0x7070700070707000ULL, 0x8282820082828200ULL, 0x2c2c2c002c2c2c00ULL, 0xececec00ececec00ULL, 0xb3b3b300b3b3b300ULL, 0x2727270027272700ULL, 0xc0c0c000c0c0c000ULL, 0xe5e5e500e5e5e500ULL, 0xe4e4e400e4e4e400ULL, @@ -828,8 +828,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) subRL[1] ^= (subRL[1] & ~subRL[9]) << 32; /* modified for FLinv(kl2) */ - dw = (subRL[1] & subRL[9]) >> 32, - subRL[1] ^= rol32(dw, 1); + dw = (subRL[1] & subRL[9]) >> 32; + subRL[1] ^= rol32(dw, 1); /* round 8 */ subRL[11] ^= subRL[1]; @@ -840,8 +840,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) subRL[1] ^= (subRL[1] & ~subRL[17]) << 32; /* modified for FLinv(kl4) */ - dw = (subRL[1] & subRL[17]) >> 32, - subRL[1] ^= rol32(dw, 1); + dw = (subRL[1] & subRL[17]) >> 32; + subRL[1] ^= rol32(dw, 1); /* round 14 */ subRL[19] ^= subRL[1]; @@ -859,8 +859,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) } else { subRL[1] ^= (subRL[1] & ~subRL[25]) << 32; /* modified for FLinv(kl6) */ - dw = (subRL[1] & subRL[25]) >> 32, - subRL[1] ^= rol32(dw, 1); + dw = (subRL[1] & subRL[25]) >> 32; + subRL[1] ^= rol32(dw, 1); /* round 20 */ subRL[27] ^= subRL[1]; @@ -882,8 +882,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) kw4 ^= (kw4 & ~subRL[24]) << 32; /* modified for FL(kl5) */ - dw = (kw4 & subRL[24]) >> 32, - kw4 ^= rol32(dw, 1); + dw = (kw4 & subRL[24]) >> 32; + kw4 ^= rol32(dw, 1); } /* round 17 */ @@ -895,8 +895,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) kw4 ^= (kw4 & ~subRL[16]) << 32; /* modified for FL(kl3) */ - dw = (kw4 & subRL[16]) >> 32, - kw4 ^= rol32(dw, 1); + dw = (kw4 & subRL[16]) >> 32; + kw4 ^= rol32(dw, 1); /* round 11 */ subRL[14] ^= kw4; @@ -907,8 +907,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) kw4 ^= (kw4 & ~subRL[8]) << 32; /* modified for FL(kl1) */ - dw = (kw4 & subRL[8]) >> 32, - kw4 ^= rol32(dw, 1); + dw = (kw4 & subRL[8]) >> 32; + kw4 ^= rol32(dw, 1); /* round 5 */ subRL[6] ^= kw4; @@ -928,8 +928,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) SET_SUBKEY_LR(6, subRL[5] ^ subRL[7]); /* round 5 */ tl = (subRL[10] >> 32) ^ (subRL[10] & ~subRL[8]); - dw = tl & (subRL[8] >> 32), /* FL(kl1) */ - tr = subRL[10] ^ rol32(dw, 1); + dw = tl & (subRL[8] >> 32); /* FL(kl1) */ + tr = subRL[10] ^ rol32(dw, 1); tt = (tr | ((u64)tl << 32)); SET_SUBKEY_LR(7, subRL[6] ^ tt); /* round 6 */ @@ -937,8 +937,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) SET_SUBKEY_LR(9, subRL[9]); /* FLinv(kl2) */ tl = (subRL[7] >> 32) ^ (subRL[7] & ~subRL[9]); - dw = tl & (subRL[9] >> 32), /* FLinv(kl2) */ - tr = subRL[7] ^ rol32(dw, 1); + dw = tl & (subRL[9] >> 32); /* FLinv(kl2) */ + tr = subRL[7] ^ rol32(dw, 1); tt = (tr | ((u64)tl << 32)); SET_SUBKEY_LR(10, subRL[11] ^ tt); /* round 7 */ @@ -948,8 +948,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) SET_SUBKEY_LR(14, subRL[13] ^ subRL[15]); /* round 11 */ tl = (subRL[18] >> 32) ^ (subRL[18] & ~subRL[16]); - dw = tl & (subRL[16] >> 32), /* FL(kl3) */ - tr = subRL[18] ^ rol32(dw, 1); + dw = tl & (subRL[16] >> 32); /* FL(kl3) */ + tr = subRL[18] ^ rol32(dw, 1); tt = (tr | ((u64)tl << 32)); SET_SUBKEY_LR(15, subRL[14] ^ tt); /* round 12 */ @@ -957,8 +957,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) SET_SUBKEY_LR(17, subRL[17]); /* FLinv(kl4) */ tl = (subRL[15] >> 32) ^ (subRL[15] & ~subRL[17]); - dw = tl & (subRL[17] >> 32), /* FLinv(kl4) */ - tr = subRL[15] ^ rol32(dw, 1); + dw = tl & (subRL[17] >> 32); /* FLinv(kl4) */ + tr = subRL[15] ^ rol32(dw, 1); tt = (tr | ((u64)tl << 32)); SET_SUBKEY_LR(18, subRL[19] ^ tt); /* round 13 */ @@ -972,8 +972,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) SET_SUBKEY_LR(24, subRL[24] ^ subRL[23]); /* kw3 */ } else { tl = (subRL[26] >> 32) ^ (subRL[26] & ~subRL[24]); - dw = tl & (subRL[24] >> 32), /* FL(kl5) */ - tr = subRL[26] ^ rol32(dw, 1); + dw = tl & (subRL[24] >> 32); /* FL(kl5) */ + tr = subRL[26] ^ rol32(dw, 1); tt = (tr | ((u64)tl << 32)); SET_SUBKEY_LR(23, subRL[22] ^ tt); /* round 18 */ @@ -981,8 +981,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) SET_SUBKEY_LR(25, subRL[25]); /* FLinv(kl6) */ tl = (subRL[23] >> 32) ^ (subRL[23] & ~subRL[25]); - dw = tl & (subRL[25] >> 32), /* FLinv(kl6) */ - tr = subRL[23] ^ rol32(dw, 1); + dw = tl & (subRL[25] >> 32); /* FLinv(kl6) */ + tr = subRL[23] ^ rol32(dw, 1); tt = (tr | ((u64)tl << 32)); SET_SUBKEY_LR(26, subRL[27] ^ tt); /* round 19 */ diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S new file mode 100644 index 000000000000..35e97569d05f --- /dev/null +++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S @@ -0,0 +1,643 @@ +######################################################################## +# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions +# +# Copyright (c) 2013, Intel Corporation +# +# Authors: +# Erdinc Ozturk <erdinc.ozturk@intel.com> +# Vinodh Gopal <vinodh.gopal@intel.com> +# James Guilford <james.guilford@intel.com> +# Tim Chen <tim.c.chen@linux.intel.com> +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the +# distribution. +# +# * Neither the name of the Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +######################################################################## +# Function API: +# UINT16 crc_t10dif_pcl( +# UINT16 init_crc, //initial CRC value, 16 bits +# const unsigned char *buf, //buffer pointer to calculate CRC on +# UINT64 len //buffer length in bytes (64-bit data) +# ); +# +# Reference paper titled "Fast CRC Computation for Generic +# Polynomials Using PCLMULQDQ Instruction" +# URL: http://www.intel.com/content/dam/www/public/us/en/documents +# /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf +# +# + +#include <linux/linkage.h> + +.text + +#define arg1 %rdi +#define arg2 %rsi +#define arg3 %rdx + +#define arg1_low32 %edi + +ENTRY(crc_t10dif_pcl) +.align 16 + + # adjust the 16-bit initial_crc value, scale it to 32 bits + shl $16, arg1_low32 + + # Allocate Stack Space + mov %rsp, %rcx + sub $16*2, %rsp + # align stack to 16 byte boundary + and $~(0x10 - 1), %rsp + + # check if smaller than 256 + cmp $256, arg3 + + # for sizes less than 128, we can't fold 64B at a time... + jl _less_than_128 + + + # load the initial crc value + movd arg1_low32, %xmm10 # initial crc + + # crc value does not need to be byte-reflected, but it needs + # to be moved to the high part of the register. + # because data will be byte-reflected and will align with + # initial crc at correct place. + pslldq $12, %xmm10 + + movdqa SHUF_MASK(%rip), %xmm11 + # receive the initial 64B data, xor the initial crc value + movdqu 16*0(arg2), %xmm0 + movdqu 16*1(arg2), %xmm1 + movdqu 16*2(arg2), %xmm2 + movdqu 16*3(arg2), %xmm3 + movdqu 16*4(arg2), %xmm4 + movdqu 16*5(arg2), %xmm5 + movdqu 16*6(arg2), %xmm6 + movdqu 16*7(arg2), %xmm7 + + pshufb %xmm11, %xmm0 + # XOR the initial_crc value + pxor %xmm10, %xmm0 + pshufb %xmm11, %xmm1 + pshufb %xmm11, %xmm2 + pshufb %xmm11, %xmm3 + pshufb %xmm11, %xmm4 + pshufb %xmm11, %xmm5 + pshufb %xmm11, %xmm6 + pshufb %xmm11, %xmm7 + + movdqa rk3(%rip), %xmm10 #xmm10 has rk3 and rk4 + #imm value of pclmulqdq instruction + #will determine which constant to use + + ################################################################# + # we subtract 256 instead of 128 to save one instruction from the loop + sub $256, arg3 + + # at this section of the code, there is 64*x+y (0<=y<64) bytes of + # buffer. The _fold_64_B_loop will fold 64B at a time + # until we have 64+y Bytes of buffer + + + # fold 64B at a time. This section of the code folds 4 xmm + # registers in parallel +_fold_64_B_loop: + + # update the buffer pointer + add $128, arg2 # buf += 64# + + movdqu 16*0(arg2), %xmm9 + movdqu 16*1(arg2), %xmm12 + pshufb %xmm11, %xmm9 + pshufb %xmm11, %xmm12 + movdqa %xmm0, %xmm8 + movdqa %xmm1, %xmm13 + pclmulqdq $0x0 , %xmm10, %xmm0 + pclmulqdq $0x11, %xmm10, %xmm8 + pclmulqdq $0x0 , %xmm10, %xmm1 + pclmulqdq $0x11, %xmm10, %xmm13 + pxor %xmm9 , %xmm0 + xorps %xmm8 , %xmm0 + pxor %xmm12, %xmm1 + xorps %xmm13, %xmm1 + + movdqu 16*2(arg2), %xmm9 + movdqu 16*3(arg2), %xmm12 + pshufb %xmm11, %xmm9 + pshufb %xmm11, %xmm12 + movdqa %xmm2, %xmm8 + movdqa %xmm3, %xmm13 + pclmulqdq $0x0, %xmm10, %xmm2 + pclmulqdq $0x11, %xmm10, %xmm8 + pclmulqdq $0x0, %xmm10, %xmm3 + pclmulqdq $0x11, %xmm10, %xmm13 + pxor %xmm9 , %xmm2 + xorps %xmm8 , %xmm2 + pxor %xmm12, %xmm3 + xorps %xmm13, %xmm3 + + movdqu 16*4(arg2), %xmm9 + movdqu 16*5(arg2), %xmm12 + pshufb %xmm11, %xmm9 + pshufb %xmm11, %xmm12 + movdqa %xmm4, %xmm8 + movdqa %xmm5, %xmm13 + pclmulqdq $0x0, %xmm10, %xmm4 + pclmulqdq $0x11, %xmm10, %xmm8 + pclmulqdq $0x0, %xmm10, %xmm5 + pclmulqdq $0x11, %xmm10, %xmm13 + pxor %xmm9 , %xmm4 + xorps %xmm8 , %xmm4 + pxor %xmm12, %xmm5 + xorps %xmm13, %xmm5 + + movdqu 16*6(arg2), %xmm9 + movdqu 16*7(arg2), %xmm12 + pshufb %xmm11, %xmm9 + pshufb %xmm11, %xmm12 + movdqa %xmm6 , %xmm8 + movdqa %xmm7 , %xmm13 + pclmulqdq $0x0 , %xmm10, %xmm6 + pclmulqdq $0x11, %xmm10, %xmm8 + pclmulqdq $0x0 , %xmm10, %xmm7 + pclmulqdq $0x11, %xmm10, %xmm13 + pxor %xmm9 , %xmm6 + xorps %xmm8 , %xmm6 + pxor %xmm12, %xmm7 + xorps %xmm13, %xmm7 + + sub $128, arg3 + + # check if there is another 64B in the buffer to be able to fold + jge _fold_64_B_loop + ################################################################## + + + add $128, arg2 + # at this point, the buffer pointer is pointing at the last y Bytes + # of the buffer the 64B of folded data is in 4 of the xmm + # registers: xmm0, xmm1, xmm2, xmm3 + + + # fold the 8 xmm registers to 1 xmm register with different constants + + movdqa rk9(%rip), %xmm10 + movdqa %xmm0, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm0 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + xorps %xmm0, %xmm7 + + movdqa rk11(%rip), %xmm10 + movdqa %xmm1, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm1 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + xorps %xmm1, %xmm7 + + movdqa rk13(%rip), %xmm10 + movdqa %xmm2, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm2 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + pxor %xmm2, %xmm7 + + movdqa rk15(%rip), %xmm10 + movdqa %xmm3, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm3 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + xorps %xmm3, %xmm7 + + movdqa rk17(%rip), %xmm10 + movdqa %xmm4, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm4 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + pxor %xmm4, %xmm7 + + movdqa rk19(%rip), %xmm10 + movdqa %xmm5, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm5 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + xorps %xmm5, %xmm7 + + movdqa rk1(%rip), %xmm10 #xmm10 has rk1 and rk2 + #imm value of pclmulqdq instruction + #will determine which constant to use + movdqa %xmm6, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm6 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + pxor %xmm6, %xmm7 + + + # instead of 64, we add 48 to the loop counter to save 1 instruction + # from the loop instead of a cmp instruction, we use the negative + # flag with the jl instruction + add $128-16, arg3 + jl _final_reduction_for_128 + + # now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 + # and the rest is in memory. We can fold 16 bytes at a time if y>=16 + # continue folding 16B at a time + +_16B_reduction_loop: + movdqa %xmm7, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm7 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + movdqu (arg2), %xmm0 + pshufb %xmm11, %xmm0 + pxor %xmm0 , %xmm7 + add $16, arg2 + sub $16, arg3 + # instead of a cmp instruction, we utilize the flags with the + # jge instruction equivalent of: cmp arg3, 16-16 + # check if there is any more 16B in the buffer to be able to fold + jge _16B_reduction_loop + + #now we have 16+z bytes left to reduce, where 0<= z < 16. + #first, we reduce the data in the xmm7 register + + +_final_reduction_for_128: + # check if any more data to fold. If not, compute the CRC of + # the final 128 bits + add $16, arg3 + je _128_done + + # here we are getting data that is less than 16 bytes. + # since we know that there was data before the pointer, we can + # offset the input pointer before the actual point, to receive + # exactly 16 bytes. after that the registers need to be adjusted. +_get_last_two_xmms: + movdqa %xmm7, %xmm2 + + movdqu -16(arg2, arg3), %xmm1 + pshufb %xmm11, %xmm1 + + # get rid of the extra data that was loaded before + # load the shift constant + lea pshufb_shf_table+16(%rip), %rax + sub arg3, %rax + movdqu (%rax), %xmm0 + + # shift xmm2 to the left by arg3 bytes + pshufb %xmm0, %xmm2 + + # shift xmm7 to the right by 16-arg3 bytes + pxor mask1(%rip), %xmm0 + pshufb %xmm0, %xmm7 + pblendvb %xmm2, %xmm1 #xmm0 is implicit + + # fold 16 Bytes + movdqa %xmm1, %xmm2 + movdqa %xmm7, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm7 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + pxor %xmm2, %xmm7 + +_128_done: + # compute crc of a 128-bit value + movdqa rk5(%rip), %xmm10 # rk5 and rk6 in xmm10 + movdqa %xmm7, %xmm0 + + #64b fold + pclmulqdq $0x1, %xmm10, %xmm7 + pslldq $8 , %xmm0 + pxor %xmm0, %xmm7 + + #32b fold + movdqa %xmm7, %xmm0 + + pand mask2(%rip), %xmm0 + + psrldq $12, %xmm7 + pclmulqdq $0x10, %xmm10, %xmm7 + pxor %xmm0, %xmm7 + + #barrett reduction +_barrett: + movdqa rk7(%rip), %xmm10 # rk7 and rk8 in xmm10 + movdqa %xmm7, %xmm0 + pclmulqdq $0x01, %xmm10, %xmm7 + pslldq $4, %xmm7 + pclmulqdq $0x11, %xmm10, %xmm7 + + pslldq $4, %xmm7 + pxor %xmm0, %xmm7 + pextrd $1, %xmm7, %eax + +_cleanup: + # scale the result back to 16 bits + shr $16, %eax + mov %rcx, %rsp + ret + +######################################################################## + +.align 16 +_less_than_128: + + # check if there is enough buffer to be able to fold 16B at a time + cmp $32, arg3 + jl _less_than_32 + movdqa SHUF_MASK(%rip), %xmm11 + + # now if there is, load the constants + movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10 + + movd arg1_low32, %xmm0 # get the initial crc value + pslldq $12, %xmm0 # align it to its correct place + movdqu (arg2), %xmm7 # load the plaintext + pshufb %xmm11, %xmm7 # byte-reflect the plaintext + pxor %xmm0, %xmm7 + + + # update the buffer pointer + add $16, arg2 + + # update the counter. subtract 32 instead of 16 to save one + # instruction from the loop + sub $32, arg3 + + jmp _16B_reduction_loop + + +.align 16 +_less_than_32: + # mov initial crc to the return value. this is necessary for + # zero-length buffers. + mov arg1_low32, %eax + test arg3, arg3 + je _cleanup + + movdqa SHUF_MASK(%rip), %xmm11 + + movd arg1_low32, %xmm0 # get the initial crc value + pslldq $12, %xmm0 # align it to its correct place + + cmp $16, arg3 + je _exact_16_left + jl _less_than_16_left + + movdqu (arg2), %xmm7 # load the plaintext + pshufb %xmm11, %xmm7 # byte-reflect the plaintext + pxor %xmm0 , %xmm7 # xor the initial crc value + add $16, arg2 + sub $16, arg3 + movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10 + jmp _get_last_two_xmms + + +.align 16 +_less_than_16_left: + # use stack space to load data less than 16 bytes, zero-out + # the 16B in memory first. + + pxor %xmm1, %xmm1 + mov %rsp, %r11 + movdqa %xmm1, (%r11) + + cmp $4, arg3 + jl _only_less_than_4 + + # backup the counter value + mov arg3, %r9 + cmp $8, arg3 + jl _less_than_8_left + + # load 8 Bytes + mov (arg2), %rax + mov %rax, (%r11) + add $8, %r11 + sub $8, arg3 + add $8, arg2 +_less_than_8_left: + + cmp $4, arg3 + jl _less_than_4_left + + # load 4 Bytes + mov (arg2), %eax + mov %eax, (%r11) + add $4, %r11 + sub $4, arg3 + add $4, arg2 +_less_than_4_left: + + cmp $2, arg3 + jl _less_than_2_left + + # load 2 Bytes + mov (arg2), %ax + mov %ax, (%r11) + add $2, %r11 + sub $2, arg3 + add $2, arg2 +_less_than_2_left: + cmp $1, arg3 + jl _zero_left + + # load 1 Byte + mov (arg2), %al + mov %al, (%r11) +_zero_left: + movdqa (%rsp), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + # shl r9, 4 + lea pshufb_shf_table+16(%rip), %rax + sub %r9, %rax + movdqu (%rax), %xmm0 + pxor mask1(%rip), %xmm0 + + pshufb %xmm0, %xmm7 + jmp _128_done + +.align 16 +_exact_16_left: + movdqu (arg2), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + jmp _128_done + +_only_less_than_4: + cmp $3, arg3 + jl _only_less_than_3 + + # load 3 Bytes + mov (arg2), %al + mov %al, (%r11) + + mov 1(arg2), %al + mov %al, 1(%r11) + + mov 2(arg2), %al + mov %al, 2(%r11) + + movdqa (%rsp), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + psrldq $5, %xmm7 + + jmp _barrett +_only_less_than_3: + cmp $2, arg3 + jl _only_less_than_2 + + # load 2 Bytes + mov (arg2), %al + mov %al, (%r11) + + mov 1(arg2), %al + mov %al, 1(%r11) + + movdqa (%rsp), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + psrldq $6, %xmm7 + + jmp _barrett +_only_less_than_2: + + # load 1 Byte + mov (arg2), %al + mov %al, (%r11) + + movdqa (%rsp), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + psrldq $7, %xmm7 + + jmp _barrett + +ENDPROC(crc_t10dif_pcl) + +.data + +# precomputed constants +# these constants are precomputed from the poly: +# 0x8bb70000 (0x8bb7 scaled to 32 bits) +.align 16 +# Q = 0x18BB70000 +# rk1 = 2^(32*3) mod Q << 32 +# rk2 = 2^(32*5) mod Q << 32 +# rk3 = 2^(32*15) mod Q << 32 +# rk4 = 2^(32*17) mod Q << 32 +# rk5 = 2^(32*3) mod Q << 32 +# rk6 = 2^(32*2) mod Q << 32 +# rk7 = floor(2^64/Q) +# rk8 = Q +rk1: +.quad 0x2d56000000000000 +rk2: +.quad 0x06df000000000000 +rk3: +.quad 0x9d9d000000000000 +rk4: +.quad 0x7cf5000000000000 +rk5: +.quad 0x2d56000000000000 +rk6: +.quad 0x1368000000000000 +rk7: +.quad 0x00000001f65a57f8 +rk8: +.quad 0x000000018bb70000 + +rk9: +.quad 0xceae000000000000 +rk10: +.quad 0xbfd6000000000000 +rk11: +.quad 0x1e16000000000000 +rk12: +.quad 0x713c000000000000 +rk13: +.quad 0xf7f9000000000000 +rk14: +.quad 0x80a6000000000000 +rk15: +.quad 0x044c000000000000 +rk16: +.quad 0xe658000000000000 +rk17: +.quad 0xad18000000000000 +rk18: +.quad 0xa497000000000000 +rk19: +.quad 0x6ee3000000000000 +rk20: +.quad 0xe7b5000000000000 + + + +mask1: +.octa 0x80808080808080808080808080808080 +mask2: +.octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF + +SHUF_MASK: +.octa 0x000102030405060708090A0B0C0D0E0F + +pshufb_shf_table: +# use these values for shift constants for the pshufb instruction +# different alignments result in values as shown: +# DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1 +# DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2 +# DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3 +# DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4 +# DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5 +# DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6 +# DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7 +# DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8 +# DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9 +# DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10 +# DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11 +# DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12 +# DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13 +# DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14 +# DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15 +.octa 0x8f8e8d8c8b8a89888786858483828100 +.octa 0x000e0d0c0b0a09080706050403020100 diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c new file mode 100644 index 000000000000..7845d7fd54c0 --- /dev/null +++ b/arch/x86/crypto/crct10dif-pclmul_glue.c @@ -0,0 +1,151 @@ +/* + * Cryptographic API. + * + * T10 Data Integrity Field CRC16 Crypto Transform using PCLMULQDQ Instructions + * + * Copyright (C) 2013 Intel Corporation + * Author: Tim Chen <tim.c.chen@linux.intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <linux/types.h> +#include <linux/module.h> +#include <linux/crc-t10dif.h> +#include <crypto/internal/hash.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <asm/i387.h> +#include <asm/cpufeature.h> +#include <asm/cpu_device_id.h> + +asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf, + size_t len); + +struct chksum_desc_ctx { + __u16 crc; +}; + +/* + * Steps through buffer one byte at at time, calculates reflected + * crc using table. + */ + +static int chksum_init(struct shash_desc *desc) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + ctx->crc = 0; + + return 0; +} + +static int chksum_update(struct shash_desc *desc, const u8 *data, + unsigned int length) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + if (irq_fpu_usable()) { + kernel_fpu_begin(); + ctx->crc = crc_t10dif_pcl(ctx->crc, data, length); + kernel_fpu_end(); + } else + ctx->crc = crc_t10dif_generic(ctx->crc, data, length); + return 0; +} + +static int chksum_final(struct shash_desc *desc, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + *(__u16 *)out = ctx->crc; + return 0; +} + +static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + if (irq_fpu_usable()) { + kernel_fpu_begin(); + *(__u16 *)out = crc_t10dif_pcl(*crcp, data, len); + kernel_fpu_end(); + } else + *(__u16 *)out = crc_t10dif_generic(*crcp, data, len); + return 0; +} + +static int chksum_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + return __chksum_finup(&ctx->crc, data, len, out); +} + +static int chksum_digest(struct shash_desc *desc, const u8 *data, + unsigned int length, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + return __chksum_finup(&ctx->crc, data, length, out); +} + +static struct shash_alg alg = { + .digestsize = CRC_T10DIF_DIGEST_SIZE, + .init = chksum_init, + .update = chksum_update, + .final = chksum_final, + .finup = chksum_finup, + .digest = chksum_digest, + .descsize = sizeof(struct chksum_desc_ctx), + .base = { + .cra_name = "crct10dif", + .cra_driver_name = "crct10dif-pclmul", + .cra_priority = 200, + .cra_blocksize = CRC_T10DIF_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static const struct x86_cpu_id crct10dif_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, crct10dif_cpu_id); + +static int __init crct10dif_intel_mod_init(void) +{ + if (!x86_match_cpu(crct10dif_cpu_id)) + return -ENODEV; + + return crypto_register_shash(&alg); +} + +static void __exit crct10dif_intel_mod_fini(void) +{ + crypto_unregister_shash(&alg); +} + +module_init(crct10dif_intel_mod_init); +module_exit(crct10dif_intel_mod_fini); + +MODULE_AUTHOR("Tim Chen <tim.c.chen@linux.intel.com>"); +MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with PCLMULQDQ."); +MODULE_LICENSE("GPL"); + +MODULE_ALIAS("crct10dif"); +MODULE_ALIAS("crct10dif-pclmul"); diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index 597d4da69656..50226c4b86ed 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -187,7 +187,36 @@ static int sha256_ssse3_import(struct shash_desc *desc, const void *in) return 0; } -static struct shash_alg alg = { +static int sha224_ssse3_init(struct shash_desc *desc) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + + sctx->state[0] = SHA224_H0; + sctx->state[1] = SHA224_H1; + sctx->state[2] = SHA224_H2; + sctx->state[3] = SHA224_H3; + sctx->state[4] = SHA224_H4; + sctx->state[5] = SHA224_H5; + sctx->state[6] = SHA224_H6; + sctx->state[7] = SHA224_H7; + sctx->count = 0; + + return 0; +} + +static int sha224_ssse3_final(struct shash_desc *desc, u8 *hash) +{ + u8 D[SHA256_DIGEST_SIZE]; + + sha256_ssse3_final(desc, D); + + memcpy(hash, D, SHA224_DIGEST_SIZE); + memset(D, 0, SHA256_DIGEST_SIZE); + + return 0; +} + +static struct shash_alg algs[] = { { .digestsize = SHA256_DIGEST_SIZE, .init = sha256_ssse3_init, .update = sha256_ssse3_update, @@ -204,7 +233,24 @@ static struct shash_alg alg = { .cra_blocksize = SHA256_BLOCK_SIZE, .cra_module = THIS_MODULE, } -}; +}, { + .digestsize = SHA224_DIGEST_SIZE, + .init = sha224_ssse3_init, + .update = sha256_ssse3_update, + .final = sha224_ssse3_final, + .export = sha256_ssse3_export, + .import = sha256_ssse3_import, + .descsize = sizeof(struct sha256_state), + .statesize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha224", + .cra_driver_name = "sha224-ssse3", + .cra_priority = 150, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA224_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; #ifdef CONFIG_AS_AVX static bool __init avx_usable(void) @@ -227,7 +273,7 @@ static bool __init avx_usable(void) static int __init sha256_ssse3_mod_init(void) { - /* test for SSE3 first */ + /* test for SSSE3 first */ if (cpu_has_ssse3) sha256_transform_asm = sha256_transform_ssse3; @@ -254,7 +300,7 @@ static int __init sha256_ssse3_mod_init(void) else #endif pr_info("Using SSSE3 optimized SHA-256 implementation\n"); - return crypto_register_shash(&alg); + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); } pr_info("Neither AVX nor SSSE3 is available/usable.\n"); @@ -263,7 +309,7 @@ static int __init sha256_ssse3_mod_init(void) static void __exit sha256_ssse3_mod_fini(void) { - crypto_unregister_shash(&alg); + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); } module_init(sha256_ssse3_mod_init); @@ -273,3 +319,4 @@ MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated"); MODULE_ALIAS("sha256"); +MODULE_ALIAS("sha384"); diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 6cbd8df348d2..f30cd10293f0 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -194,7 +194,37 @@ static int sha512_ssse3_import(struct shash_desc *desc, const void *in) return 0; } -static struct shash_alg alg = { +static int sha384_ssse3_init(struct shash_desc *desc) +{ + struct sha512_state *sctx = shash_desc_ctx(desc); + + sctx->state[0] = SHA384_H0; + sctx->state[1] = SHA384_H1; + sctx->state[2] = SHA384_H2; + sctx->state[3] = SHA384_H3; + sctx->state[4] = SHA384_H4; + sctx->state[5] = SHA384_H5; + sctx->state[6] = SHA384_H6; + sctx->state[7] = SHA384_H7; + + sctx->count[0] = sctx->count[1] = 0; + + return 0; +} + +static int sha384_ssse3_final(struct shash_desc *desc, u8 *hash) +{ + u8 D[SHA512_DIGEST_SIZE]; + + sha512_ssse3_final(desc, D); + + memcpy(hash, D, SHA384_DIGEST_SIZE); + memset(D, 0, SHA512_DIGEST_SIZE); + + return 0; +} + +static struct shash_alg algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_ssse3_init, .update = sha512_ssse3_update, @@ -211,7 +241,24 @@ static struct shash_alg alg = { .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } -}; +}, { + .digestsize = SHA384_DIGEST_SIZE, + .init = sha384_ssse3_init, + .update = sha512_ssse3_update, + .final = sha384_ssse3_final, + .export = sha512_ssse3_export, + .import = sha512_ssse3_import, + .descsize = sizeof(struct sha512_state), + .statesize = sizeof(struct sha512_state), + .base = { + .cra_name = "sha384", + .cra_driver_name = "sha384-ssse3", + .cra_priority = 150, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA384_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; #ifdef CONFIG_AS_AVX static bool __init avx_usable(void) @@ -234,7 +281,7 @@ static bool __init avx_usable(void) static int __init sha512_ssse3_mod_init(void) { - /* test for SSE3 first */ + /* test for SSSE3 first */ if (cpu_has_ssse3) sha512_transform_asm = sha512_transform_ssse3; @@ -261,7 +308,7 @@ static int __init sha512_ssse3_mod_init(void) else #endif pr_info("Using SSSE3 optimized SHA-512 implementation\n"); - return crypto_register_shash(&alg); + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); } pr_info("Neither AVX nor SSSE3 is available/usable.\n"); @@ -270,7 +317,7 @@ static int __init sha512_ssse3_mod_init(void) static void __exit sha512_ssse3_mod_fini(void) { - crypto_unregister_shash(&alg); + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); } module_init(sha512_ssse3_mod_init); @@ -280,3 +327,4 @@ MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated"); MODULE_ALIAS("sha512"); +MODULE_ALIAS("sha384"); diff --git a/arch/x86/crypto/twofish-avx2-asm_64.S b/arch/x86/crypto/twofish-avx2-asm_64.S deleted file mode 100644 index e1a83b9cd389..000000000000 --- a/arch/x86/crypto/twofish-avx2-asm_64.S +++ /dev/null @@ -1,600 +0,0 @@ -/* - * x86_64/AVX2 assembler optimized version of Twofish - * - * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -#include <linux/linkage.h> -#include "glue_helper-asm-avx2.S" - -.file "twofish-avx2-asm_64.S" - -.data -.align 16 - -.Lvpshufb_mask0: -.long 0x80808000 -.long 0x80808004 -.long 0x80808008 -.long 0x8080800c - -.Lbswap128_mask: - .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -.Lxts_gf128mul_and_shl1_mask_0: - .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 -.Lxts_gf128mul_and_shl1_mask_1: - .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 - -.text - -/* structure of crypto context */ -#define s0 0 -#define s1 1024 -#define s2 2048 -#define s3 3072 -#define w 4096 -#define k 4128 - -/* register macros */ -#define CTX %rdi - -#define RS0 CTX -#define RS1 %r8 -#define RS2 %r9 -#define RS3 %r10 -#define RK %r11 -#define RW %rax -#define RROUND %r12 -#define RROUNDd %r12d - -#define RA0 %ymm8 -#define RB0 %ymm9 -#define RC0 %ymm10 -#define RD0 %ymm11 -#define RA1 %ymm12 -#define RB1 %ymm13 -#define RC1 %ymm14 -#define RD1 %ymm15 - -/* temp regs */ -#define RX0 %ymm0 -#define RY0 %ymm1 -#define RX1 %ymm2 -#define RY1 %ymm3 -#define RT0 %ymm4 -#define RIDX %ymm5 - -#define RX0x %xmm0 -#define RY0x %xmm1 -#define RX1x %xmm2 -#define RY1x %xmm3 -#define RT0x %xmm4 - -/* vpgatherdd mask and '-1' */ -#define RNOT %ymm6 - -/* byte mask, (-1 >> 24) */ -#define RBYTE %ymm7 - -/********************************************************************** - 16-way AVX2 twofish - **********************************************************************/ -#define init_round_constants() \ - vpcmpeqd RNOT, RNOT, RNOT; \ - vpsrld $24, RNOT, RBYTE; \ - leaq k(CTX), RK; \ - leaq w(CTX), RW; \ - leaq s1(CTX), RS1; \ - leaq s2(CTX), RS2; \ - leaq s3(CTX), RS3; \ - -#define g16(ab, rs0, rs1, rs2, rs3, xy) \ - vpand RBYTE, ab ## 0, RIDX; \ - vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 0; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - \ - vpand RBYTE, ab ## 1, RIDX; \ - vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 1; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - \ - vpsrld $8, ab ## 0, RIDX; \ - vpand RBYTE, RIDX, RIDX; \ - vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - vpxor RT0, xy ## 0, xy ## 0; \ - \ - vpsrld $8, ab ## 1, RIDX; \ - vpand RBYTE, RIDX, RIDX; \ - vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - vpxor RT0, xy ## 1, xy ## 1; \ - \ - vpsrld $16, ab ## 0, RIDX; \ - vpand RBYTE, RIDX, RIDX; \ - vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - vpxor RT0, xy ## 0, xy ## 0; \ - \ - vpsrld $16, ab ## 1, RIDX; \ - vpand RBYTE, RIDX, RIDX; \ - vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - vpxor RT0, xy ## 1, xy ## 1; \ - \ - vpsrld $24, ab ## 0, RIDX; \ - vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - vpxor RT0, xy ## 0, xy ## 0; \ - \ - vpsrld $24, ab ## 1, RIDX; \ - vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - vpxor RT0, xy ## 1, xy ## 1; - -#define g1_16(a, x) \ - g16(a, RS0, RS1, RS2, RS3, x); - -#define g2_16(b, y) \ - g16(b, RS1, RS2, RS3, RS0, y); - -#define encrypt_round_end16(a, b, c, d, nk) \ - vpaddd RY0, RX0, RX0; \ - vpaddd RX0, RY0, RY0; \ - vpbroadcastd nk(RK,RROUND,8), RT0; \ - vpaddd RT0, RX0, RX0; \ - vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ - vpaddd RT0, RY0, RY0; \ - \ - vpxor RY0, d ## 0, d ## 0; \ - \ - vpxor RX0, c ## 0, c ## 0; \ - vpsrld $1, c ## 0, RT0; \ - vpslld $31, c ## 0, c ## 0; \ - vpor RT0, c ## 0, c ## 0; \ - \ - vpaddd RY1, RX1, RX1; \ - vpaddd RX1, RY1, RY1; \ - vpbroadcastd nk(RK,RROUND,8), RT0; \ - vpaddd RT0, RX1, RX1; \ - vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ - vpaddd RT0, RY1, RY1; \ - \ - vpxor RY1, d ## 1, d ## 1; \ - \ - vpxor RX1, c ## 1, c ## 1; \ - vpsrld $1, c ## 1, RT0; \ - vpslld $31, c ## 1, c ## 1; \ - vpor RT0, c ## 1, c ## 1; \ - -#define encrypt_round16(a, b, c, d, nk) \ - g2_16(b, RY); \ - \ - vpslld $1, b ## 0, RT0; \ - vpsrld $31, b ## 0, b ## 0; \ - vpor RT0, b ## 0, b ## 0; \ - \ - vpslld $1, b ## 1, RT0; \ - vpsrld $31, b ## 1, b ## 1; \ - vpor RT0, b ## 1, b ## 1; \ - \ - g1_16(a, RX); \ - \ - encrypt_round_end16(a, b, c, d, nk); - -#define encrypt_round_first16(a, b, c, d, nk) \ - vpslld $1, d ## 0, RT0; \ - vpsrld $31, d ## 0, d ## 0; \ - vpor RT0, d ## 0, d ## 0; \ - \ - vpslld $1, d ## 1, RT0; \ - vpsrld $31, d ## 1, d ## 1; \ - vpor RT0, d ## 1, d ## 1; \ - \ - encrypt_round16(a, b, c, d, nk); - -#define encrypt_round_last16(a, b, c, d, nk) \ - g2_16(b, RY); \ - \ - g1_16(a, RX); \ - \ - encrypt_round_end16(a, b, c, d, nk); - -#define decrypt_round_end16(a, b, c, d, nk) \ - vpaddd RY0, RX0, RX0; \ - vpaddd RX0, RY0, RY0; \ - vpbroadcastd nk(RK,RROUND,8), RT0; \ - vpaddd RT0, RX0, RX0; \ - vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ - vpaddd RT0, RY0, RY0; \ - \ - vpxor RX0, c ## 0, c ## 0; \ - \ - vpxor RY0, d ## 0, d ## 0; \ - vpsrld $1, d ## 0, RT0; \ - vpslld $31, d ## 0, d ## 0; \ - vpor RT0, d ## 0, d ## 0; \ - \ - vpaddd RY1, RX1, RX1; \ - vpaddd RX1, RY1, RY1; \ - vpbroadcastd nk(RK,RROUND,8), RT0; \ - vpaddd RT0, RX1, RX1; \ - vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ - vpaddd RT0, RY1, RY1; \ - \ - vpxor RX1, c ## 1, c ## 1; \ - \ - vpxor RY1, d ## 1, d ## 1; \ - vpsrld $1, d ## 1, RT0; \ - vpslld $31, d ## 1, d ## 1; \ - vpor RT0, d ## 1, d ## 1; - -#define decrypt_round16(a, b, c, d, nk) \ - g1_16(a, RX); \ - \ - vpslld $1, a ## 0, RT0; \ - vpsrld $31, a ## 0, a ## 0; \ - vpor RT0, a ## 0, a ## 0; \ - \ - vpslld $1, a ## 1, RT0; \ - vpsrld $31, a ## 1, a ## 1; \ - vpor RT0, a ## 1, a ## 1; \ - \ - g2_16(b, RY); \ - \ - decrypt_round_end16(a, b, c, d, nk); - -#define decrypt_round_first16(a, b, c, d, nk) \ - vpslld $1, c ## 0, RT0; \ - vpsrld $31, c ## 0, c ## 0; \ - vpor RT0, c ## 0, c ## 0; \ - \ - vpslld $1, c ## 1, RT0; \ - vpsrld $31, c ## 1, c ## 1; \ - vpor RT0, c ## 1, c ## 1; \ - \ - decrypt_round16(a, b, c, d, nk) - -#define decrypt_round_last16(a, b, c, d, nk) \ - g1_16(a, RX); \ - \ - g2_16(b, RY); \ - \ - decrypt_round_end16(a, b, c, d, nk); - -#define encrypt_cycle16() \ - encrypt_round16(RA, RB, RC, RD, 0); \ - encrypt_round16(RC, RD, RA, RB, 8); - -#define encrypt_cycle_first16() \ - encrypt_round_first16(RA, RB, RC, RD, 0); \ - encrypt_round16(RC, RD, RA, RB, 8); - -#define encrypt_cycle_last16() \ - encrypt_round16(RA, RB, RC, RD, 0); \ - encrypt_round_last16(RC, RD, RA, RB, 8); - -#define decrypt_cycle16(n) \ - decrypt_round16(RC, RD, RA, RB, 8); \ - decrypt_round16(RA, RB, RC, RD, 0); - -#define decrypt_cycle_first16(n) \ - decrypt_round_first16(RC, RD, RA, RB, 8); \ - decrypt_round16(RA, RB, RC, RD, 0); - -#define decrypt_cycle_last16(n) \ - decrypt_round16(RC, RD, RA, RB, 8); \ - decrypt_round_last16(RA, RB, RC, RD, 0); - -#define transpose_4x4(x0,x1,x2,x3,t1,t2) \ - vpunpckhdq x1, x0, t2; \ - vpunpckldq x1, x0, x0; \ - \ - vpunpckldq x3, x2, t1; \ - vpunpckhdq x3, x2, x2; \ - \ - vpunpckhqdq t1, x0, x1; \ - vpunpcklqdq t1, x0, x0; \ - \ - vpunpckhqdq x2, t2, x3; \ - vpunpcklqdq x2, t2, x2; - -#define read_blocks8(offs,a,b,c,d) \ - transpose_4x4(a, b, c, d, RX0, RY0); - -#define write_blocks8(offs,a,b,c,d) \ - transpose_4x4(a, b, c, d, RX0, RY0); - -#define inpack_enc8(a,b,c,d) \ - vpbroadcastd 4*0(RW), RT0; \ - vpxor RT0, a, a; \ - \ - vpbroadcastd 4*1(RW), RT0; \ - vpxor RT0, b, b; \ - \ - vpbroadcastd 4*2(RW), RT0; \ - vpxor RT0, c, c; \ - \ - vpbroadcastd 4*3(RW), RT0; \ - vpxor RT0, d, d; - -#define outunpack_enc8(a,b,c,d) \ - vpbroadcastd 4*4(RW), RX0; \ - vpbroadcastd 4*5(RW), RY0; \ - vpxor RX0, c, RX0; \ - vpxor RY0, d, RY0; \ - \ - vpbroadcastd 4*6(RW), RT0; \ - vpxor RT0, a, c; \ - vpbroadcastd 4*7(RW), RT0; \ - vpxor RT0, b, d; \ - \ - vmovdqa RX0, a; \ - vmovdqa RY0, b; - -#define inpack_dec8(a,b,c,d) \ - vpbroadcastd 4*4(RW), RX0; \ - vpbroadcastd 4*5(RW), RY0; \ - vpxor RX0, a, RX0; \ - vpxor RY0, b, RY0; \ - \ - vpbroadcastd 4*6(RW), RT0; \ - vpxor RT0, c, a; \ - vpbroadcastd 4*7(RW), RT0; \ - vpxor RT0, d, b; \ - \ - vmovdqa RX0, c; \ - vmovdqa RY0, d; - -#define outunpack_dec8(a,b,c,d) \ - vpbroadcastd 4*0(RW), RT0; \ - vpxor RT0, a, a; \ - \ - vpbroadcastd 4*1(RW), RT0; \ - vpxor RT0, b, b; \ - \ - vpbroadcastd 4*2(RW), RT0; \ - vpxor RT0, c, c; \ - \ - vpbroadcastd 4*3(RW), RT0; \ - vpxor RT0, d, d; - -#define read_blocks16(a,b,c,d) \ - read_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \ - read_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1); - -#define write_blocks16(a,b,c,d) \ - write_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \ - write_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1); - -#define xor_blocks16(a,b,c,d) \ - xor_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \ - xor_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1); - -#define inpack_enc16(a,b,c,d) \ - inpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \ - inpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1); - -#define outunpack_enc16(a,b,c,d) \ - outunpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \ - outunpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1); - -#define inpack_dec16(a,b,c,d) \ - inpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \ - inpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1); - -#define outunpack_dec16(a,b,c,d) \ - outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \ - outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1); - -.align 8 -__twofish_enc_blk16: - /* input: - * %rdi: ctx, CTX - * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext - * output: - * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext - */ - init_round_constants(); - - read_blocks16(RA, RB, RC, RD); - inpack_enc16(RA, RB, RC, RD); - - xorl RROUNDd, RROUNDd; - encrypt_cycle_first16(); - movl $2, RROUNDd; - -.align 4 -.L__enc_loop: - encrypt_cycle16(); - - addl $2, RROUNDd; - cmpl $14, RROUNDd; - jne .L__enc_loop; - - encrypt_cycle_last16(); - - outunpack_enc16(RA, RB, RC, RD); - write_blocks16(RA, RB, RC, RD); - - ret; -ENDPROC(__twofish_enc_blk16) - -.align 8 -__twofish_dec_blk16: - /* input: - * %rdi: ctx, CTX - * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext - * output: - * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext - */ - init_round_constants(); - - read_blocks16(RA, RB, RC, RD); - inpack_dec16(RA, RB, RC, RD); - - movl $14, RROUNDd; - decrypt_cycle_first16(); - movl $12, RROUNDd; - -.align 4 -.L__dec_loop: - decrypt_cycle16(); - - addl $-2, RROUNDd; - jnz .L__dec_loop; - - decrypt_cycle_last16(); - - outunpack_dec16(RA, RB, RC, RD); - write_blocks16(RA, RB, RC, RD); - - ret; -ENDPROC(__twofish_dec_blk16) - -ENTRY(twofish_ecb_enc_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - */ - - vzeroupper; - pushq %r12; - - load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); - - call __twofish_enc_blk16; - - store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); - - popq %r12; - vzeroupper; - - ret; -ENDPROC(twofish_ecb_enc_16way) - -ENTRY(twofish_ecb_dec_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - */ - - vzeroupper; - pushq %r12; - - load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); - - call __twofish_dec_blk16; - - store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); - - popq %r12; - vzeroupper; - - ret; -ENDPROC(twofish_ecb_dec_16way) - -ENTRY(twofish_cbc_dec_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - */ - - vzeroupper; - pushq %r12; - - load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); - - call __twofish_dec_blk16; - - store_cbc_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1, - RX0); - - popq %r12; - vzeroupper; - - ret; -ENDPROC(twofish_cbc_dec_16way) - -ENTRY(twofish_ctr_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (little endian, 128bit) - */ - - vzeroupper; - pushq %r12; - - load_ctr_16way(%rcx, .Lbswap128_mask, RA0, RB0, RC0, RD0, RA1, RB1, RC1, - RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT, - RBYTE); - - call __twofish_enc_blk16; - - store_ctr_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); - - popq %r12; - vzeroupper; - - ret; -ENDPROC(twofish_ctr_16way) - -.align 8 -twofish_xts_crypt_16way: - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - * %r8: pointer to __twofish_enc_blk16 or __twofish_dec_blk16 - */ - - vzeroupper; - pushq %r12; - - load_xts_16way(%rcx, %rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, - RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT, - .Lxts_gf128mul_and_shl1_mask_0, - .Lxts_gf128mul_and_shl1_mask_1); - - call *%r8; - - store_xts_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); - - popq %r12; - vzeroupper; - - ret; -ENDPROC(twofish_xts_crypt_16way) - -ENTRY(twofish_xts_enc_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - leaq __twofish_enc_blk16, %r8; - jmp twofish_xts_crypt_16way; -ENDPROC(twofish_xts_enc_16way) - -ENTRY(twofish_xts_dec_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - leaq __twofish_dec_blk16, %r8; - jmp twofish_xts_crypt_16way; -ENDPROC(twofish_xts_dec_16way) diff --git a/arch/x86/crypto/twofish_avx2_glue.c b/arch/x86/crypto/twofish_avx2_glue.c deleted file mode 100644 index ce33b5be64ee..000000000000 --- a/arch/x86/crypto/twofish_avx2_glue.c +++ /dev/null @@ -1,584 +0,0 @@ -/* - * Glue Code for x86_64/AVX2 assembler optimized version of Twofish - * - * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/crypto.h> -#include <linux/err.h> -#include <crypto/algapi.h> -#include <crypto/ctr.h> -#include <crypto/twofish.h> -#include <crypto/lrw.h> -#include <crypto/xts.h> -#include <asm/xcr.h> -#include <asm/xsave.h> -#include <asm/crypto/twofish.h> -#include <asm/crypto/ablk_helper.h> -#include <asm/crypto/glue_helper.h> -#include <crypto/scatterwalk.h> - -#define TF_AVX2_PARALLEL_BLOCKS 16 - -/* 16-way AVX2 parallel cipher functions */ -asmlinkage void twofish_ecb_enc_16way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void twofish_ecb_dec_16way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void twofish_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src); - -asmlinkage void twofish_ctr_16way(void *ctx, u128 *dst, const u128 *src, - le128 *iv); - -asmlinkage void twofish_xts_enc_16way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); -asmlinkage void twofish_xts_dec_16way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); - -static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src) -{ - __twofish_enc_blk_3way(ctx, dst, src, false); -} - -static const struct common_glue_ctx twofish_enc = { - .num_funcs = 4, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_16way) } - }, { - .num_blocks = 8, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) } - }, { - .num_blocks = 3, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) } - }, { - .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) } - } } -}; - -static const struct common_glue_ctx twofish_ctr = { - .num_funcs = 4, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_16way) } - }, { - .num_blocks = 8, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) } - }, { - .num_blocks = 3, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) } - }, { - .num_blocks = 1, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) } - } } -}; - -static const struct common_glue_ctx twofish_enc_xts = { - .num_funcs = 3, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_16way) } - }, { - .num_blocks = 8, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) } - }, { - .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) } - } } -}; - -static const struct common_glue_ctx twofish_dec = { - .num_funcs = 4, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_16way) } - }, { - .num_blocks = 8, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) } - }, { - .num_blocks = 3, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) } - }, { - .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) } - } } -}; - -static const struct common_glue_ctx twofish_dec_cbc = { - .num_funcs = 4, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_16way) } - }, { - .num_blocks = 8, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) } - }, { - .num_blocks = 3, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) } - }, { - .num_blocks = 1, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) } - } } -}; - -static const struct common_glue_ctx twofish_dec_xts = { - .num_funcs = 3, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_16way) } - }, { - .num_blocks = 8, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) } - }, { - .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) } - } } -}; - -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes); -} - -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes); -} - -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc, - dst, src, nbytes); -} - -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src, - nbytes); -} - -static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes); -} - -static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes) -{ - /* since reusing AVX functions, starts using FPU at 8 parallel blocks */ - return glue_fpu_begin(TF_BLOCK_SIZE, 8, NULL, fpu_enabled, nbytes); -} - -static inline void twofish_fpu_end(bool fpu_enabled) -{ - glue_fpu_end(fpu_enabled); -} - -struct crypt_priv { - struct twofish_ctx *ctx; - bool fpu_enabled; -}; - -static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) -{ - const unsigned int bsize = TF_BLOCK_SIZE; - struct crypt_priv *ctx = priv; - int i; - - ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); - - while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) { - twofish_ecb_enc_16way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS; - nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS; - } - - while (nbytes >= 8 * bsize) { - twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * 8; - nbytes -= bsize * 8; - } - - while (nbytes >= 3 * bsize) { - twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * 3; - nbytes -= bsize * 3; - } - - for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) - twofish_enc_blk(ctx->ctx, srcdst, srcdst); -} - -static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) -{ - const unsigned int bsize = TF_BLOCK_SIZE; - struct crypt_priv *ctx = priv; - int i; - - ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); - - while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) { - twofish_ecb_dec_16way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS; - nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS; - } - - while (nbytes >= 8 * bsize) { - twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * 8; - nbytes -= bsize * 8; - } - - while (nbytes >= 3 * bsize) { - twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * 3; - nbytes -= bsize * 3; - } - - for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) - twofish_dec_blk(ctx->ctx, srcdst, srcdst); -} - -static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[TF_AVX2_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->twofish_ctx, - .fpu_enabled = false, - }; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = &crypt_ctx, - .crypt_fn = encrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = lrw_crypt(desc, dst, src, nbytes, &req); - twofish_fpu_end(crypt_ctx.fpu_enabled); - - return ret; -} - -static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[TF_AVX2_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->twofish_ctx, - .fpu_enabled = false, - }; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = &crypt_ctx, - .crypt_fn = decrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = lrw_crypt(desc, dst, src, nbytes, &req); - twofish_fpu_end(crypt_ctx.fpu_enabled); - - return ret; -} - -static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - - return glue_xts_crypt_128bit(&twofish_enc_xts, desc, dst, src, nbytes, - XTS_TWEAK_CAST(twofish_enc_blk), - &ctx->tweak_ctx, &ctx->crypt_ctx); -} - -static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - - return glue_xts_crypt_128bit(&twofish_dec_xts, desc, dst, src, nbytes, - XTS_TWEAK_CAST(twofish_enc_blk), - &ctx->tweak_ctx, &ctx->crypt_ctx); -} - -static struct crypto_alg tf_algs[10] = { { - .cra_name = "__ecb-twofish-avx2", - .cra_driver_name = "__driver-ecb-twofish-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct twofish_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .setkey = twofish_setkey, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}, { - .cra_name = "__cbc-twofish-avx2", - .cra_driver_name = "__driver-cbc-twofish-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct twofish_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .setkey = twofish_setkey, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, - }, -}, { - .cra_name = "__ctr-twofish-avx2", - .cra_driver_name = "__driver-ctr-twofish-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct twofish_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = twofish_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, - }, -}, { - .cra_name = "__lrw-twofish-avx2", - .cra_driver_name = "__driver-lrw-twofish-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct twofish_lrw_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_exit = lrw_twofish_exit_tfm, - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE + - TF_BLOCK_SIZE, - .max_keysize = TF_MAX_KEY_SIZE + - TF_BLOCK_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = lrw_twofish_setkey, - .encrypt = lrw_encrypt, - .decrypt = lrw_decrypt, - }, - }, -}, { - .cra_name = "__xts-twofish-avx2", - .cra_driver_name = "__driver-xts-twofish-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct twofish_xts_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE * 2, - .max_keysize = TF_MAX_KEY_SIZE * 2, - .ivsize = TF_BLOCK_SIZE, - .setkey = xts_twofish_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, - }, - }, -}, { - .cra_name = "ecb(twofish)", - .cra_driver_name = "ecb-twofish-avx2", - .cra_priority = 500, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "cbc(twofish)", - .cra_driver_name = "cbc-twofish-avx2", - .cra_priority = 500, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = __ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "ctr(twofish)", - .cra_driver_name = "ctr-twofish-avx2", - .cra_priority = 500, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_encrypt, - .geniv = "chainiv", - }, - }, -}, { - .cra_name = "lrw(twofish)", - .cra_driver_name = "lrw-twofish-avx2", - .cra_priority = 500, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = TF_MIN_KEY_SIZE + - TF_BLOCK_SIZE, - .max_keysize = TF_MAX_KEY_SIZE + - TF_BLOCK_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "xts(twofish)", - .cra_driver_name = "xts-twofish-avx2", - .cra_priority = 500, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = TF_MIN_KEY_SIZE * 2, - .max_keysize = TF_MAX_KEY_SIZE * 2, - .ivsize = TF_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -} }; - -static int __init init(void) -{ - u64 xcr0; - - if (!cpu_has_avx2 || !cpu_has_osxsave) { - pr_info("AVX2 instructions are not detected.\n"); - return -ENODEV; - } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX2 detected but unusable.\n"); - return -ENODEV; - } - - return crypto_register_algs(tf_algs, ARRAY_SIZE(tf_algs)); -} - -static void __exit fini(void) -{ - crypto_unregister_algs(tf_algs, ARRAY_SIZE(tf_algs)); -} - -module_init(init); -module_exit(fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX2 optimized"); -MODULE_ALIAS("twofish"); -MODULE_ALIAS("twofish-asm"); diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 2047a562f6b3..a62ba541884e 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -50,26 +50,18 @@ /* 8-way parallel cipher functions */ asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst, const u8 *src); -EXPORT_SYMBOL_GPL(twofish_ecb_enc_8way); - asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst, const u8 *src); -EXPORT_SYMBOL_GPL(twofish_ecb_dec_8way); asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst, const u8 *src); -EXPORT_SYMBOL_GPL(twofish_cbc_dec_8way); - asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst, const u8 *src, le128 *iv); -EXPORT_SYMBOL_GPL(twofish_ctr_8way); asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst, const u8 *src, le128 *iv); -EXPORT_SYMBOL_GPL(twofish_xts_enc_8way); asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst, const u8 *src, le128 *iv); -EXPORT_SYMBOL_GPL(twofish_xts_dec_8way); static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, const u8 *src) @@ -77,19 +69,17 @@ static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, __twofish_enc_blk_3way(ctx, dst, src, false); } -void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) +static void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) { glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(twofish_enc_blk)); } -EXPORT_SYMBOL_GPL(twofish_xts_enc); -void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) +static void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) { glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(twofish_dec_blk)); } -EXPORT_SYMBOL_GPL(twofish_xts_dec); static const struct common_glue_ctx twofish_enc = { diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 805078e08013..bae3aba95b15 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -192,7 +192,7 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, /* struct user */ DUMP_WRITE(&dump, sizeof(dump)); /* Now dump all of the user data. Include malloced stuff as well */ - DUMP_SEEK(PAGE_SIZE); + DUMP_SEEK(PAGE_SIZE - sizeof(dump)); /* now we start writing out the user space info */ set_fs(USER_DS); /* Dump the data area */ @@ -308,8 +308,6 @@ static int load_aout_binary(struct linux_binprm *bprm) (current->mm->start_data = N_DATADDR(ex)); current->mm->brk = ex.a_bss + (current->mm->start_brk = N_BSSADDR(ex)); - current->mm->free_area_cache = TASK_UNMAPPED_BASE; - current->mm->cached_hole_size = 0; retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); if (retval < 0) { diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index cf1a471a18a2..665a730307f2 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -34,8 +34,6 @@ #include <asm/sys_ia32.h> #include <asm/smap.h> -#define FIX_EFLAGS __FIX_EFLAGS - int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) { int err = 0; @@ -459,7 +457,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, else put_user_ex(0, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); - err |= __compat_save_altstack(&frame->uc.uc_stack, regs->sp); + compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 474dc1b59f72..4299eb05023c 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -452,7 +452,7 @@ ia32_badsys: CFI_ENDPROC - .macro PTREGSCALL label, func, arg + .macro PTREGSCALL label, func ALIGN GLOBAL(\label) leaq \func(%rip),%rax diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index b31bf97775fc..b1977bad5435 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -86,6 +86,7 @@ extern int acpi_pci_disabled; extern int acpi_skip_timer_override; extern int acpi_use_timer_override; extern int acpi_fix_pin2_polarity; +extern int acpi_disable_cmcff; extern u8 acpi_sci_flags; extern int acpi_sci_override_gsi; @@ -111,7 +112,7 @@ static inline void acpi_disable_pci(void) } /* Low-level suspend routine. */ -extern int acpi_suspend_lowlevel(void); +extern int (*acpi_suspend_lowlevel)(void); /* Physical address to resume after wakeup */ #define acpi_wakeup_address ((unsigned long)(real_mode_header->wakeup_start)) @@ -168,6 +169,7 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf) #define acpi_lapic 0 #define acpi_ioapic 0 +#define acpi_disable_cmcff 0 static inline void acpi_noirq_set(void) { } static inline void acpi_disable_pci(void) { } static inline void disable_acpi(void) { } diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 58ed6d96a6ac..0a3f9c9f98d5 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -5,6 +5,7 @@ #include <linux/stddef.h> #include <linux/stringify.h> #include <asm/asm.h> +#include <asm/ptrace.h> /* * Alternative inline assembly for SMP. @@ -220,20 +221,11 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len); * no thread can be preempted in the instructions being modified (no iret to an * invalid instruction possible) or if the instructions are changed from a * consistent state to another consistent state atomically. - * More care must be taken when modifying code in the SMP case because of - * Intel's errata. text_poke_smp() takes care that errata, but still - * doesn't support NMI/MCE handler code modifying. * On the local CPU you need to be protected again NMI or MCE handlers seeing an * inconsistent instruction while you patch. */ -struct text_poke_param { - void *addr; - const void *opcode; - size_t len; -}; - extern void *text_poke(void *addr, const void *opcode, size_t len); -extern void *text_poke_smp(void *addr, const void *opcode, size_t len); -extern void text_poke_smp_batch(struct text_poke_param *params, int n); +extern int poke_int3_handler(struct pt_regs *regs); +extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); #endif /* _ASM_X86_ALTERNATIVE_H */ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 338803422239..1d2091a226bc 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -12,6 +12,7 @@ #include <asm/fixmap.h> #include <asm/mpspec.h> #include <asm/msr.h> +#include <asm/idle.h> #define ARCH_APICTIMER_STOPS_ON_C3 1 @@ -687,5 +688,33 @@ extern int default_check_phys_apicid_present(int phys_apicid); #endif #endif /* CONFIG_X86_LOCAL_APIC */ +extern void irq_enter(void); +extern void irq_exit(void); + +static inline void entering_irq(void) +{ + irq_enter(); + exit_idle(); +} + +static inline void entering_ack_irq(void) +{ + ack_APIC_irq(); + entering_irq(); +} + +static inline void exiting_irq(void) +{ + irq_exit(); +} + +static inline void exiting_ack_irq(void) +{ + irq_exit(); + /* Ack only at the end to avoid potential reentry */ + ack_APIC_irq(); +} + +extern void ioapic_zap_locks(void); #endif /* _ASM_X86_APIC_H */ diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 1c2d247f65ce..4582e8e1cd1a 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -3,21 +3,25 @@ #ifdef __ASSEMBLY__ # define __ASM_FORM(x) x +# define __ASM_FORM_RAW(x) x # define __ASM_FORM_COMMA(x) x, #else # define __ASM_FORM(x) " " #x " " +# define __ASM_FORM_RAW(x) #x # define __ASM_FORM_COMMA(x) " " #x "," #endif #ifdef CONFIG_X86_32 # define __ASM_SEL(a,b) __ASM_FORM(a) +# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a) #else # define __ASM_SEL(a,b) __ASM_FORM(b) +# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b) #endif #define __ASM_SIZE(inst, ...) __ASM_SEL(inst##l##__VA_ARGS__, \ inst##q##__VA_ARGS__) -#define __ASM_REG(reg) __ASM_SEL(e##reg, r##reg) +#define __ASM_REG(reg) __ASM_SEL_RAW(e##reg, r##reg) #define _ASM_PTR __ASM_SEL(.long, .quad) #define _ASM_ALIGN __ASM_SEL(.balign 4, .balign 8) diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 6dfd0195bb55..41639ce8fd63 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -15,6 +15,14 @@ #include <linux/compiler.h> #include <asm/alternative.h> +#if BITS_PER_LONG == 32 +# define _BITOPS_LONG_SHIFT 5 +#elif BITS_PER_LONG == 64 +# define _BITOPS_LONG_SHIFT 6 +#else +# error "Unexpected BITS_PER_LONG" +#endif + #define BIT_64(n) (U64_C(1) << (n)) /* @@ -59,7 +67,7 @@ * restricted to acting on a single-word quantity. */ static __always_inline void -set_bit(unsigned int nr, volatile unsigned long *addr) +set_bit(long nr, volatile unsigned long *addr) { if (IS_IMMEDIATE(nr)) { asm volatile(LOCK_PREFIX "orb %1,%0" @@ -81,7 +89,7 @@ set_bit(unsigned int nr, volatile unsigned long *addr) * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static inline void __set_bit(int nr, volatile unsigned long *addr) +static inline void __set_bit(long nr, volatile unsigned long *addr) { asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory"); } @@ -97,7 +105,7 @@ static inline void __set_bit(int nr, volatile unsigned long *addr) * in order to ensure changes are visible on other processors. */ static __always_inline void -clear_bit(int nr, volatile unsigned long *addr) +clear_bit(long nr, volatile unsigned long *addr) { if (IS_IMMEDIATE(nr)) { asm volatile(LOCK_PREFIX "andb %1,%0" @@ -118,13 +126,13 @@ clear_bit(int nr, volatile unsigned long *addr) * clear_bit() is atomic and implies release semantics before the memory * operation. It can be used for an unlock. */ -static inline void clear_bit_unlock(unsigned nr, volatile unsigned long *addr) +static inline void clear_bit_unlock(long nr, volatile unsigned long *addr) { barrier(); clear_bit(nr, addr); } -static inline void __clear_bit(int nr, volatile unsigned long *addr) +static inline void __clear_bit(long nr, volatile unsigned long *addr) { asm volatile("btr %1,%0" : ADDR : "Ir" (nr)); } @@ -141,7 +149,7 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr) * No memory barrier is required here, because x86 cannot reorder stores past * older loads. Same principle as spin_unlock. */ -static inline void __clear_bit_unlock(unsigned nr, volatile unsigned long *addr) +static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) { barrier(); __clear_bit(nr, addr); @@ -159,7 +167,7 @@ static inline void __clear_bit_unlock(unsigned nr, volatile unsigned long *addr) * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static inline void __change_bit(int nr, volatile unsigned long *addr) +static inline void __change_bit(long nr, volatile unsigned long *addr) { asm volatile("btc %1,%0" : ADDR : "Ir" (nr)); } @@ -173,7 +181,7 @@ static inline void __change_bit(int nr, volatile unsigned long *addr) * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ -static inline void change_bit(int nr, volatile unsigned long *addr) +static inline void change_bit(long nr, volatile unsigned long *addr) { if (IS_IMMEDIATE(nr)) { asm volatile(LOCK_PREFIX "xorb %1,%0" @@ -194,7 +202,7 @@ static inline void change_bit(int nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_set_bit(int nr, volatile unsigned long *addr) +static inline int test_and_set_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -212,7 +220,7 @@ static inline int test_and_set_bit(int nr, volatile unsigned long *addr) * This is the same as test_and_set_bit on x86. */ static __always_inline int -test_and_set_bit_lock(int nr, volatile unsigned long *addr) +test_and_set_bit_lock(long nr, volatile unsigned long *addr) { return test_and_set_bit(nr, addr); } @@ -226,7 +234,7 @@ test_and_set_bit_lock(int nr, volatile unsigned long *addr) * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. */ -static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) +static inline int __test_and_set_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -245,7 +253,7 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_clear_bit(int nr, volatile unsigned long *addr) +static inline int test_and_clear_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -272,7 +280,7 @@ static inline int test_and_clear_bit(int nr, volatile unsigned long *addr) * accessed from a hypervisor on the same CPU if running in a VM: don't change * this without also updating arch/x86/kernel/kvm.c */ -static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) +static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -284,7 +292,7 @@ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) } /* WARNING: non atomic and it can be reordered! */ -static inline int __test_and_change_bit(int nr, volatile unsigned long *addr) +static inline int __test_and_change_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -304,7 +312,7 @@ static inline int __test_and_change_bit(int nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_change_bit(int nr, volatile unsigned long *addr) +static inline int test_and_change_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -315,13 +323,13 @@ static inline int test_and_change_bit(int nr, volatile unsigned long *addr) return oldbit; } -static __always_inline int constant_test_bit(unsigned int nr, const volatile unsigned long *addr) +static __always_inline int constant_test_bit(long nr, const volatile unsigned long *addr) { - return ((1UL << (nr % BITS_PER_LONG)) & - (addr[nr / BITS_PER_LONG])) != 0; + return ((1UL << (nr & (BITS_PER_LONG-1))) & + (addr[nr >> _BITOPS_LONG_SHIFT])) != 0; } -static inline int variable_test_bit(int nr, volatile const unsigned long *addr) +static inline int variable_test_bit(long nr, volatile const unsigned long *addr) { int oldbit; diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h index 653668d140f9..4a8cb8d7cbd5 100644 --- a/arch/x86/include/asm/bootparam_utils.h +++ b/arch/x86/include/asm/bootparam_utils.h @@ -35,9 +35,9 @@ static void sanitize_boot_params(struct boot_params *boot_params) */ if (boot_params->sentinel) { /* fields in boot_params are left uninitialized, clear them */ - memset(&boot_params->olpc_ofw_header, 0, + memset(&boot_params->ext_ramdisk_image, 0, (char *)&boot_params->efi_info - - (char *)&boot_params->olpc_ofw_header); + (char *)&boot_params->ext_ramdisk_image); memset(&boot_params->kbd_status, 0, (char *)&boot_params->hdr - (char *)&boot_params->kbd_status); diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h index 46fc474fd819..f50de6951738 100644 --- a/arch/x86/include/asm/checksum_32.h +++ b/arch/x86/include/asm/checksum_32.h @@ -49,9 +49,15 @@ static inline __wsum csum_partial_copy_from_user(const void __user *src, int len, __wsum sum, int *err_ptr) { + __wsum ret; + might_sleep(); - return csum_partial_copy_generic((__force void *)src, dst, - len, sum, err_ptr, NULL); + stac(); + ret = csum_partial_copy_generic((__force void *)src, dst, + len, sum, err_ptr, NULL); + clac(); + + return ret; } /* @@ -176,10 +182,16 @@ static inline __wsum csum_and_copy_to_user(const void *src, int len, __wsum sum, int *err_ptr) { + __wsum ret; + might_sleep(); - if (access_ok(VERIFY_WRITE, dst, len)) - return csum_partial_copy_generic(src, (__force void *)dst, - len, sum, NULL, err_ptr); + if (access_ok(VERIFY_WRITE, dst, len)) { + stac(); + ret = csum_partial_copy_generic(src, (__force void *)dst, + len, sum, NULL, err_ptr); + clac(); + return ret; + } if (len) *err_ptr = -EFAULT; diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h index 9bfdc41629ec..e6fd8a026c7b 100644 --- a/arch/x86/include/asm/checksum_64.h +++ b/arch/x86/include/asm/checksum_64.h @@ -133,7 +133,7 @@ extern __wsum csum_partial(const void *buff, int len, __wsum sum); /* Do not call this directly. Use the wrappers below */ -extern __wsum csum_partial_copy_generic(const void *src, const void *dst, +extern __visible __wsum csum_partial_copy_generic(const void *src, const void *dst, int len, __wsum sum, int *src_err_ptr, int *dst_err_ptr); diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 5f9a1243190e..d2b12988d2ed 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -28,7 +28,7 @@ struct x86_cpu { #ifdef CONFIG_HOTPLUG_CPU extern int arch_register_cpu(int num); extern void arch_unregister_cpu(int); -extern void __cpuinit start_cpu0(void); +extern void start_cpu0(void); #ifdef CONFIG_DEBUG_HOTPLUG_CPU0 extern int _debug_hotplug_cpu(int cpu, int action); #endif diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index e99ac27f95b2..d3f5c63078d8 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -92,7 +92,7 @@ #define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */ #define X86_FEATURE_11AP (3*32+19) /* "" Bad local APIC aka 11AP */ #define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */ - /* 21 available, was AMD_C1E */ +#define X86_FEATURE_ALWAYS (3*32+21) /* "" Always-present feature */ #define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */ #define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ #define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ @@ -356,15 +356,38 @@ extern const char * const x86_power_flags[32]; #endif /* CONFIG_X86_64 */ #if __GNUC__ >= 4 +extern void warn_pre_alternatives(void); +extern bool __static_cpu_has_safe(u16 bit); + /* * Static testing of CPU features. Used the same as boot_cpu_has(). * These are only valid after alternatives have run, but will statically * patch the target code for additional performance. - * */ static __always_inline __pure bool __static_cpu_has(u16 bit) { -#if __GNUC__ > 4 || __GNUC_MINOR__ >= 5 +#ifdef CC_HAVE_ASM_GOTO + +#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS + + /* + * Catch too early usage of this before alternatives + * have run. + */ + asm goto("1: jmp %l[t_warn]\n" + "2:\n" + ".section .altinstructions,\"a\"\n" + " .long 1b - .\n" + " .long 0\n" /* no replacement */ + " .word %P0\n" /* 1: do replace */ + " .byte 2b - 1b\n" /* source len */ + " .byte 0\n" /* replacement len */ + ".previous\n" + /* skipping size check since replacement size = 0 */ + : : "i" (X86_FEATURE_ALWAYS) : : t_warn); + +#endif + asm goto("1: jmp %l[t_no]\n" "2:\n" ".section .altinstructions,\"a\"\n" @@ -379,7 +402,15 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) return true; t_no: return false; -#else + +#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS + t_warn: + warn_pre_alternatives(); + return false; +#endif + +#else /* CC_HAVE_ASM_GOTO */ + u8 flag; /* Open-coded due to __stringify() in ALTERNATIVE() */ asm volatile("1: movb $0,%0\n" @@ -400,7 +431,8 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) ".previous\n" : "=qm" (flag) : "i" (bit)); return flag; -#endif + +#endif /* CC_HAVE_ASM_GOTO */ } #define static_cpu_has(bit) \ @@ -411,11 +443,94 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) __static_cpu_has(bit) : \ boot_cpu_has(bit) \ ) + +static __always_inline __pure bool _static_cpu_has_safe(u16 bit) +{ +#ifdef CC_HAVE_ASM_GOTO +/* + * We need to spell the jumps to the compiler because, depending on the offset, + * the replacement jump can be bigger than the original jump, and this we cannot + * have. Thus, we force the jump to the widest, 4-byte, signed relative + * offset even though the last would often fit in less bytes. + */ + asm goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n" + "2:\n" + ".section .altinstructions,\"a\"\n" + " .long 1b - .\n" /* src offset */ + " .long 3f - .\n" /* repl offset */ + " .word %P1\n" /* always replace */ + " .byte 2b - 1b\n" /* src len */ + " .byte 4f - 3f\n" /* repl len */ + ".previous\n" + ".section .altinstr_replacement,\"ax\"\n" + "3: .byte 0xe9\n .long %l[t_no] - 2b\n" + "4:\n" + ".previous\n" + ".section .altinstructions,\"a\"\n" + " .long 1b - .\n" /* src offset */ + " .long 0\n" /* no replacement */ + " .word %P0\n" /* feature bit */ + " .byte 2b - 1b\n" /* src len */ + " .byte 0\n" /* repl len */ + ".previous\n" + : : "i" (bit), "i" (X86_FEATURE_ALWAYS) + : : t_dynamic, t_no); + return true; + t_no: + return false; + t_dynamic: + return __static_cpu_has_safe(bit); +#else + u8 flag; + /* Open-coded due to __stringify() in ALTERNATIVE() */ + asm volatile("1: movb $2,%0\n" + "2:\n" + ".section .altinstructions,\"a\"\n" + " .long 1b - .\n" /* src offset */ + " .long 3f - .\n" /* repl offset */ + " .word %P2\n" /* always replace */ + " .byte 2b - 1b\n" /* source len */ + " .byte 4f - 3f\n" /* replacement len */ + ".previous\n" + ".section .discard,\"aw\",@progbits\n" + " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ + ".previous\n" + ".section .altinstr_replacement,\"ax\"\n" + "3: movb $0,%0\n" + "4:\n" + ".previous\n" + ".section .altinstructions,\"a\"\n" + " .long 1b - .\n" /* src offset */ + " .long 5f - .\n" /* repl offset */ + " .word %P1\n" /* feature bit */ + " .byte 4b - 3b\n" /* src len */ + " .byte 6f - 5f\n" /* repl len */ + ".previous\n" + ".section .discard,\"aw\",@progbits\n" + " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */ + ".previous\n" + ".section .altinstr_replacement,\"ax\"\n" + "5: movb $1,%0\n" + "6:\n" + ".previous\n" + : "=qm" (flag) + : "i" (bit), "i" (X86_FEATURE_ALWAYS)); + return (flag == 2 ? __static_cpu_has_safe(bit) : flag); +#endif /* CC_HAVE_ASM_GOTO */ +} + +#define static_cpu_has_safe(bit) \ +( \ + __builtin_constant_p(boot_cpu_has(bit)) ? \ + boot_cpu_has(bit) : \ + _static_cpu_has_safe(bit) \ +) #else /* * gcc 3.x is too stupid to do the static test; fall back to dynamic. */ -#define static_cpu_has(bit) boot_cpu_has(bit) +#define static_cpu_has(bit) boot_cpu_has(bit) +#define static_cpu_has_safe(bit) boot_cpu_has(bit) #endif #define cpu_has_bug(c, bit) cpu_has(c, (bit)) diff --git a/arch/x86/include/asm/crypto/blowfish.h b/arch/x86/include/asm/crypto/blowfish.h deleted file mode 100644 index f097b2face10..000000000000 --- a/arch/x86/include/asm/crypto/blowfish.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef ASM_X86_BLOWFISH_H -#define ASM_X86_BLOWFISH_H - -#include <linux/crypto.h> -#include <crypto/blowfish.h> - -#define BF_PARALLEL_BLOCKS 4 - -/* regular block cipher functions */ -asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src, - bool xor); -asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src); - -/* 4-way parallel cipher functions */ -asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, - const u8 *src, bool xor); -asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst, - const u8 *src); - -static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src) -{ - __blowfish_enc_blk(ctx, dst, src, false); -} - -static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst, - const u8 *src) -{ - __blowfish_enc_blk(ctx, dst, src, true); -} - -static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, - const u8 *src) -{ - __blowfish_enc_blk_4way(ctx, dst, src, false); -} - -static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst, - const u8 *src) -{ - __blowfish_enc_blk_4way(ctx, dst, src, true); -} - -#endif diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h index e655c6029b45..878c51ceebb5 100644 --- a/arch/x86/include/asm/crypto/twofish.h +++ b/arch/x86/include/asm/crypto/twofish.h @@ -28,20 +28,6 @@ asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst, const u8 *src); -/* 8-way parallel cipher functions */ -asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); -asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); -asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); - /* helpers from twofish_x86_64-3way module */ extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src); extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, @@ -57,8 +43,4 @@ extern void lrw_twofish_exit_tfm(struct crypto_tfm *tfm); extern int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen); -/* helpers from twofish-avx module */ -extern void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv); -extern void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv); - #endif /* ASM_X86_TWOFISH_H */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 8bf1c06070d5..b90e5dfeee46 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -36,8 +36,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in extern struct desc_ptr idt_descr; extern gate_desc idt_table[]; -extern struct desc_ptr nmi_idt_descr; -extern gate_desc nmi_idt_table[]; +extern struct desc_ptr debug_idt_descr; +extern gate_desc debug_idt_table[]; struct gdt_page { struct desc_struct gdt[GDT_ENTRIES]; @@ -316,7 +316,20 @@ static inline void set_nmi_gate(int gate, void *addr) gate_desc s; pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS); - write_idt_entry(nmi_idt_table, gate, &s); + write_idt_entry(debug_idt_table, gate, &s); +} +#endif + +#ifdef CONFIG_TRACING +extern struct desc_ptr trace_idt_descr; +extern gate_desc trace_idt_table[]; +static inline void write_trace_idt_entry(int entry, const gate_desc *gate) +{ + write_idt_entry(trace_idt_table, entry, gate); +} +#else +static inline void write_trace_idt_entry(int entry, const gate_desc *gate) +{ } #endif @@ -331,6 +344,7 @@ static inline void _set_gate(int gate, unsigned type, void *addr, * setup time */ write_idt_entry(idt_table, gate, &s); + write_trace_idt_entry(gate, &s); } /* @@ -360,12 +374,39 @@ static inline void alloc_system_vector(int vector) } } -static inline void alloc_intr_gate(unsigned int n, void *addr) +#ifdef CONFIG_TRACING +static inline void trace_set_intr_gate(unsigned int gate, void *addr) +{ + gate_desc s; + + pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS); + write_idt_entry(trace_idt_table, gate, &s); +} + +static inline void __trace_alloc_intr_gate(unsigned int n, void *addr) +{ + trace_set_intr_gate(n, addr); +} +#else +static inline void trace_set_intr_gate(unsigned int gate, void *addr) +{ +} + +#define __trace_alloc_intr_gate(n, addr) +#endif + +static inline void __alloc_intr_gate(unsigned int n, void *addr) { - alloc_system_vector(n); set_intr_gate(n, addr); } +#define alloc_intr_gate(n, addr) \ + do { \ + alloc_system_vector(n); \ + __alloc_intr_gate(n, addr); \ + __trace_alloc_intr_gate(n, trace_##addr); \ + } while (0) + /* * This routine sets up an interrupt gate at directory privilege level 3. */ @@ -405,4 +446,70 @@ static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist) _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); } +#ifdef CONFIG_X86_64 +DECLARE_PER_CPU(u32, debug_idt_ctr); +static inline bool is_debug_idt_enabled(void) +{ + if (this_cpu_read(debug_idt_ctr)) + return true; + + return false; +} + +static inline void load_debug_idt(void) +{ + load_idt((const struct desc_ptr *)&debug_idt_descr); +} +#else +static inline bool is_debug_idt_enabled(void) +{ + return false; +} + +static inline void load_debug_idt(void) +{ +} +#endif + +#ifdef CONFIG_TRACING +extern atomic_t trace_idt_ctr; +static inline bool is_trace_idt_enabled(void) +{ + if (atomic_read(&trace_idt_ctr)) + return true; + + return false; +} + +static inline void load_trace_idt(void) +{ + load_idt((const struct desc_ptr *)&trace_idt_descr); +} +#else +static inline bool is_trace_idt_enabled(void) +{ + return false; +} + +static inline void load_trace_idt(void) +{ +} +#endif + +/* + * The load_current_idt() must be called with interrupts disabled + * to avoid races. That way the IDT will always be set back to the expected + * descriptor. It's also called when a CPU is being initialized, and + * that doesn't need to disable interrupts, as nothing should be + * bothering the CPU then. + */ +static inline void load_current_idt(void) +{ + if (is_debug_idt_enabled()) + load_debug_idt(); + else if (is_trace_idt_enabled()) + load_trace_idt(); + else + load_idt((const struct desc_ptr *)&idt_descr); +} #endif /* _ASM_X86_DESC_H */ diff --git a/arch/x86/include/asm/dma-contiguous.h b/arch/x86/include/asm/dma-contiguous.h index c09241659971..b4b38bacb404 100644 --- a/arch/x86/include/asm/dma-contiguous.h +++ b/arch/x86/include/asm/dma-contiguous.h @@ -4,7 +4,6 @@ #ifdef __KERNEL__ #include <linux/types.h> -#include <asm-generic/dma-contiguous.h> static inline void dma_contiguous_early_fixup(phys_addr_t base, unsigned long size) { } diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index cccd07fa5e3a..779c2efe2e97 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -29,7 +29,7 @@ extern void e820_setup_gap(void); extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, unsigned long start_addr, unsigned long long end_addr); struct setup_data; -extern void parse_e820_ext(struct setup_data *data); +extern void parse_e820_ext(u64 phys_addr, u32 data_len); #if defined(CONFIG_X86_64) || \ (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 2fb5d5884e23..0062a0125041 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -52,40 +52,40 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6); #define efi_call_phys0(f) \ - efi_call0((void *)(f)) + efi_call0((f)) #define efi_call_phys1(f, a1) \ - efi_call1((void *)(f), (u64)(a1)) + efi_call1((f), (u64)(a1)) #define efi_call_phys2(f, a1, a2) \ - efi_call2((void *)(f), (u64)(a1), (u64)(a2)) + efi_call2((f), (u64)(a1), (u64)(a2)) #define efi_call_phys3(f, a1, a2, a3) \ - efi_call3((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3)) + efi_call3((f), (u64)(a1), (u64)(a2), (u64)(a3)) #define efi_call_phys4(f, a1, a2, a3, a4) \ - efi_call4((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3), \ + efi_call4((f), (u64)(a1), (u64)(a2), (u64)(a3), \ (u64)(a4)) #define efi_call_phys5(f, a1, a2, a3, a4, a5) \ - efi_call5((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3), \ + efi_call5((f), (u64)(a1), (u64)(a2), (u64)(a3), \ (u64)(a4), (u64)(a5)) #define efi_call_phys6(f, a1, a2, a3, a4, a5, a6) \ - efi_call6((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3), \ + efi_call6((f), (u64)(a1), (u64)(a2), (u64)(a3), \ (u64)(a4), (u64)(a5), (u64)(a6)) #define efi_call_virt0(f) \ - efi_call0((void *)(efi.systab->runtime->f)) + efi_call0((efi.systab->runtime->f)) #define efi_call_virt1(f, a1) \ - efi_call1((void *)(efi.systab->runtime->f), (u64)(a1)) + efi_call1((efi.systab->runtime->f), (u64)(a1)) #define efi_call_virt2(f, a1, a2) \ - efi_call2((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2)) + efi_call2((efi.systab->runtime->f), (u64)(a1), (u64)(a2)) #define efi_call_virt3(f, a1, a2, a3) \ - efi_call3((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ + efi_call3((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ (u64)(a3)) #define efi_call_virt4(f, a1, a2, a3, a4) \ - efi_call4((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ + efi_call4((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ (u64)(a3), (u64)(a4)) #define efi_call_virt5(f, a1, a2, a3, a4, a5) \ - efi_call5((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ + efi_call5((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ (u64)(a3), (u64)(a4), (u64)(a5)) #define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ - efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ + efi_call6((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, @@ -102,13 +102,6 @@ extern void efi_call_phys_epilog(void); extern void efi_unmap_memmap(void); extern void efi_memory_uc(u64 addr, unsigned long size); -struct efi_var_bootdata { - struct setup_data data; - u64 store_size; - u64 remaining_size; - u64 max_var_size; -}; - #ifdef CONFIG_EFI static inline bool efi_is_native(void) diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h index 75ce3f47d204..77a99ac06d00 100644 --- a/arch/x86/include/asm/emergency-restart.h +++ b/arch/x86/include/asm/emergency-restart.h @@ -1,18 +1,6 @@ #ifndef _ASM_X86_EMERGENCY_RESTART_H #define _ASM_X86_EMERGENCY_RESTART_H -enum reboot_type { - BOOT_TRIPLE = 't', - BOOT_KBD = 'k', - BOOT_BIOS = 'b', - BOOT_ACPI = 'a', - BOOT_EFI = 'e', - BOOT_CF9 = 'p', - BOOT_CF9_COND = 'q', -}; - -extern enum reboot_type reboot_type; - extern void machine_emergency_restart(void); #endif /* _ASM_X86_EMERGENCY_RESTART_H */ diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 9bd4ecac72be..dc5fa661465f 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -13,14 +13,16 @@ BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) -BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) -BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) +BUILD_INTERRUPT3(irq_move_cleanup_interrupt, IRQ_MOVE_CLEANUP_VECTOR, + smp_irq_move_cleanup_interrupt) +BUILD_INTERRUPT3(reboot_interrupt, REBOOT_VECTOR, smp_reboot_interrupt) #endif BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) #ifdef CONFIG_HAVE_KVM -BUILD_INTERRUPT(kvm_posted_intr_ipi, POSTED_INTR_VECTOR) +BUILD_INTERRUPT3(kvm_posted_intr_ipi, POSTED_INTR_VECTOR, + smp_kvm_posted_intr_ipi) #endif /* diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 0dc7d9e21c34..e846225265ed 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -81,11 +81,11 @@ enum fixed_addresses { + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1, VVAR_PAGE, VSYSCALL_HPET, -#endif #ifdef CONFIG_PARAVIRT_CLOCK PVCLOCK_FIXMAP_BEGIN, PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1, #endif +#endif FIX_DBGP_BASE, FIX_EARLYCON_MEM_BASE, #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index e25cc33ec54d..4d0bda7b11e3 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -62,10 +62,8 @@ extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, #define xstateregs_active fpregs_active #ifdef CONFIG_MATH_EMULATION -# define HAVE_HWFP (boot_cpu_data.hard_math) extern void finit_soft_fpu(struct i387_soft_struct *soft); #else -# define HAVE_HWFP 1 static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} #endif @@ -345,7 +343,7 @@ static inline void __thread_fpu_end(struct task_struct *tsk) static inline void __thread_fpu_begin(struct task_struct *tsk) { - if (!use_eager_fpu()) + if (!static_cpu_has_safe(X86_FEATURE_EAGER_FPU)) clts(); __thread_set_has_fpu(tsk); } diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 1da97efad08a..92b3bae08b74 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -26,56 +26,73 @@ #include <asm/sections.h> /* Interrupt handlers registered during init_IRQ */ -extern void apic_timer_interrupt(void); -extern void x86_platform_ipi(void); -extern void kvm_posted_intr_ipi(void); -extern void error_interrupt(void); -extern void irq_work_interrupt(void); - -extern void spurious_interrupt(void); -extern void thermal_interrupt(void); -extern void reschedule_interrupt(void); - -extern void invalidate_interrupt(void); -extern void invalidate_interrupt0(void); -extern void invalidate_interrupt1(void); -extern void invalidate_interrupt2(void); -extern void invalidate_interrupt3(void); -extern void invalidate_interrupt4(void); -extern void invalidate_interrupt5(void); -extern void invalidate_interrupt6(void); -extern void invalidate_interrupt7(void); -extern void invalidate_interrupt8(void); -extern void invalidate_interrupt9(void); -extern void invalidate_interrupt10(void); -extern void invalidate_interrupt11(void); -extern void invalidate_interrupt12(void); -extern void invalidate_interrupt13(void); -extern void invalidate_interrupt14(void); -extern void invalidate_interrupt15(void); -extern void invalidate_interrupt16(void); -extern void invalidate_interrupt17(void); -extern void invalidate_interrupt18(void); -extern void invalidate_interrupt19(void); -extern void invalidate_interrupt20(void); -extern void invalidate_interrupt21(void); -extern void invalidate_interrupt22(void); -extern void invalidate_interrupt23(void); -extern void invalidate_interrupt24(void); -extern void invalidate_interrupt25(void); -extern void invalidate_interrupt26(void); -extern void invalidate_interrupt27(void); -extern void invalidate_interrupt28(void); -extern void invalidate_interrupt29(void); -extern void invalidate_interrupt30(void); -extern void invalidate_interrupt31(void); - -extern void irq_move_cleanup_interrupt(void); -extern void reboot_interrupt(void); -extern void threshold_interrupt(void); - -extern void call_function_interrupt(void); -extern void call_function_single_interrupt(void); +extern asmlinkage void apic_timer_interrupt(void); +extern asmlinkage void x86_platform_ipi(void); +extern asmlinkage void kvm_posted_intr_ipi(void); +extern asmlinkage void error_interrupt(void); +extern asmlinkage void irq_work_interrupt(void); + +extern asmlinkage void spurious_interrupt(void); +extern asmlinkage void thermal_interrupt(void); +extern asmlinkage void reschedule_interrupt(void); + +extern asmlinkage void invalidate_interrupt(void); +extern asmlinkage void invalidate_interrupt0(void); +extern asmlinkage void invalidate_interrupt1(void); +extern asmlinkage void invalidate_interrupt2(void); +extern asmlinkage void invalidate_interrupt3(void); +extern asmlinkage void invalidate_interrupt4(void); +extern asmlinkage void invalidate_interrupt5(void); +extern asmlinkage void invalidate_interrupt6(void); +extern asmlinkage void invalidate_interrupt7(void); +extern asmlinkage void invalidate_interrupt8(void); +extern asmlinkage void invalidate_interrupt9(void); +extern asmlinkage void invalidate_interrupt10(void); +extern asmlinkage void invalidate_interrupt11(void); +extern asmlinkage void invalidate_interrupt12(void); +extern asmlinkage void invalidate_interrupt13(void); +extern asmlinkage void invalidate_interrupt14(void); +extern asmlinkage void invalidate_interrupt15(void); +extern asmlinkage void invalidate_interrupt16(void); +extern asmlinkage void invalidate_interrupt17(void); +extern asmlinkage void invalidate_interrupt18(void); +extern asmlinkage void invalidate_interrupt19(void); +extern asmlinkage void invalidate_interrupt20(void); +extern asmlinkage void invalidate_interrupt21(void); +extern asmlinkage void invalidate_interrupt22(void); +extern asmlinkage void invalidate_interrupt23(void); +extern asmlinkage void invalidate_interrupt24(void); +extern asmlinkage void invalidate_interrupt25(void); +extern asmlinkage void invalidate_interrupt26(void); +extern asmlinkage void invalidate_interrupt27(void); +extern asmlinkage void invalidate_interrupt28(void); +extern asmlinkage void invalidate_interrupt29(void); +extern asmlinkage void invalidate_interrupt30(void); +extern asmlinkage void invalidate_interrupt31(void); + +extern asmlinkage void irq_move_cleanup_interrupt(void); +extern asmlinkage void reboot_interrupt(void); +extern asmlinkage void threshold_interrupt(void); + +extern asmlinkage void call_function_interrupt(void); +extern asmlinkage void call_function_single_interrupt(void); + +#ifdef CONFIG_TRACING +/* Interrupt handlers registered during init_IRQ */ +extern void trace_apic_timer_interrupt(void); +extern void trace_x86_platform_ipi(void); +extern void trace_error_interrupt(void); +extern void trace_irq_work_interrupt(void); +extern void trace_spurious_interrupt(void); +extern void trace_thermal_interrupt(void); +extern void trace_reschedule_interrupt(void); +extern void trace_threshold_interrupt(void); +extern void trace_call_function_interrupt(void); +extern void trace_call_function_single_interrupt(void); +#define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt +#define trace_reboot_interrupt reboot_interrupt +#define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi +#endif /* CONFIG_TRACING */ /* IOAPIC */ #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs)) @@ -155,22 +172,18 @@ extern atomic_t irq_mis_count; extern void eisa_set_level_irq(unsigned int irq); /* SMP */ -extern void smp_apic_timer_interrupt(struct pt_regs *); -extern void smp_spurious_interrupt(struct pt_regs *); -extern void smp_x86_platform_ipi(struct pt_regs *); -extern void smp_error_interrupt(struct pt_regs *); +extern __visible void smp_apic_timer_interrupt(struct pt_regs *); +extern __visible void smp_spurious_interrupt(struct pt_regs *); +extern __visible void smp_x86_platform_ipi(struct pt_regs *); +extern __visible void smp_error_interrupt(struct pt_regs *); #ifdef CONFIG_X86_IO_APIC extern asmlinkage void smp_irq_move_cleanup_interrupt(void); #endif #ifdef CONFIG_SMP -extern void smp_reschedule_interrupt(struct pt_regs *); -extern void smp_call_function_interrupt(struct pt_regs *); -extern void smp_call_function_single_interrupt(struct pt_regs *); -#ifdef CONFIG_X86_32 -extern void smp_invalidate_interrupt(struct pt_regs *); -#else -extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *); -#endif +extern __visible void smp_reschedule_interrupt(struct pt_regs *); +extern __visible void smp_call_function_interrupt(struct pt_regs *); +extern __visible void smp_call_function_single_interrupt(struct pt_regs *); +extern __visible void smp_invalidate_interrupt(struct pt_regs *); #endif extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 2d4b5e6107cd..e42f758a0fbd 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -33,7 +33,7 @@ struct hypervisor_x86 { const char *name; /* Detection routine */ - bool (*detect)(void); + uint32_t (*detect)(void); /* Adjust CPU feature bits (run once per CPU) */ void (*set_cpu_features)(struct cpuinfo_x86 *); diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index d8e8eefbe24c..34f69cb9350a 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -345,4 +345,11 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, #define IO_SPACE_LIMIT 0xffff +#ifdef CONFIG_MTRR +extern int __must_check arch_phys_wc_add(unsigned long base, + unsigned long size); +extern void arch_phys_wc_del(int handle); +#define arch_phys_wc_add arch_phys_wc_add +#endif + #endif /* _ASM_X86_IO_H */ diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index ba870bb6dd8e..0ea10f27d613 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -33,7 +33,7 @@ extern void (*x86_platform_ipi_callback)(void); extern void native_init_IRQ(void); extern bool handle_irq(unsigned irq, struct pt_regs *regs); -extern unsigned int do_IRQ(struct pt_regs *regs); +extern __visible unsigned int do_IRQ(struct pt_regs *regs); /* Interrupt vector management */ extern DECLARE_BITMAP(used_vectors, NR_VECTORS); @@ -41,4 +41,9 @@ extern int vector_used_by_percpu_irq(unsigned int vector); extern void init_ISA_irqs(void); +#ifdef CONFIG_X86_LOCAL_APIC +void arch_trigger_all_cpu_backtrace(void); +#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace +#endif + #endif /* _ASM_X86_IRQ_H */ diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 3a16c1483b45..64507f35800c 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -3,18 +3,23 @@ #ifdef __KERNEL__ +#include <linux/stringify.h> #include <linux/types.h> #include <asm/nops.h> #include <asm/asm.h> #define JUMP_LABEL_NOP_SIZE 5 -#define STATIC_KEY_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" +#ifdef CONFIG_X86_64 +# define STATIC_KEY_INIT_NOP P6_NOP5_ATOMIC +#else +# define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC +#endif static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("1:" - STATIC_KEY_INITIAL_NOP + ".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t" ".pushsection __jump_table, \"aw\" \n\t" _ASM_ALIGN "\n\t" _ASM_PTR "1b, %l[l_yes], %c0 \n\t" diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 5a6d2873f80e..9454c167629f 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -49,10 +49,10 @@ typedef u8 kprobe_opcode_t; #define flush_insn_slot(p) do { } while (0) /* optinsn template addresses */ -extern kprobe_opcode_t optprobe_template_entry; -extern kprobe_opcode_t optprobe_template_val; -extern kprobe_opcode_t optprobe_template_call; -extern kprobe_opcode_t optprobe_template_end; +extern __visible kprobe_opcode_t optprobe_template_entry; +extern __visible kprobe_opcode_t optprobe_template_val; +extern __visible kprobe_opcode_t optprobe_template_call; +extern __visible kprobe_opcode_t optprobe_template_end; #define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE) #define MAX_OPTINSN_SIZE \ (((unsigned long)&optprobe_template_end - \ @@ -62,7 +62,7 @@ extern kprobe_opcode_t optprobe_template_end; extern const int kretprobe_blacklist_size; void arch_remove_kprobe(struct kprobe *p); -void kretprobe_trampoline(void); +asmlinkage void kretprobe_trampoline(void); /* Architecture specific copy of original instruction*/ struct arch_specific_insn { diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3741c653767c..c76ff74a98f2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -59,7 +59,7 @@ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ - | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \ + | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) @@ -222,14 +222,22 @@ struct kvm_mmu_page { int root_count; /* Currently serving as active root */ unsigned int unsync_children; unsigned long parent_ptes; /* Reverse mapping for parent_pte */ + + /* The page is obsolete if mmu_valid_gen != kvm->arch.mmu_valid_gen. */ + unsigned long mmu_valid_gen; + DECLARE_BITMAP(unsync_child_bitmap, 512); #ifdef CONFIG_X86_32 + /* + * Used out of the mmu-lock to avoid reading spte values while an + * update is in progress; see the comments in __get_spte_lockless(). + */ int clear_spte_count; #endif + /* Number of writes since the last time traversal visited this page. */ int write_flooding_count; - bool mmio_cached; }; struct kvm_pio_request { @@ -278,6 +286,7 @@ struct kvm_mmu { u64 *pae_root; u64 *lm_root; u64 rsvd_bits_mask[2][4]; + u64 bad_mt_xwr; /* * Bitmap: bit set = last pte in walk @@ -315,6 +324,7 @@ struct kvm_pmu { u64 global_ovf_ctrl; u64 counter_bitmask[2]; u64 global_ctrl_mask; + u64 reserved_bits; u8 version; struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED]; @@ -503,6 +513,14 @@ struct kvm_vcpu_arch { * instruction. */ bool write_fault_to_shadow_pgtable; + + /* set at EPT violation at this point */ + unsigned long exit_qualification; + + /* pv related host specific info */ + struct { + bool pv_unhalted; + } pv; }; struct kvm_lpage_info { @@ -529,11 +547,14 @@ struct kvm_arch { unsigned int n_requested_mmu_pages; unsigned int n_max_mmu_pages; unsigned int indirect_shadow_pages; + unsigned long mmu_valid_gen; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; /* * Hash table of struct kvm_mmu_page. */ struct list_head active_mmu_pages; + struct list_head zapped_obsolete_pages; + struct list_head assigned_dev_head; struct iommu_domain *iommu_domain; int iommu_flags; @@ -769,7 +790,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); -void kvm_mmu_zap_mmio_sptes(struct kvm *kvm); +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); @@ -791,8 +812,8 @@ extern u32 kvm_min_guest_tsc_khz; extern u32 kvm_max_guest_tsc_khz; enum emulation_result { - EMULATE_DONE, /* no further processing */ - EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ + EMULATE_DONE, /* no further processing */ + EMULATE_USER_EXIT, /* kvm_run ready for userspace exit */ EMULATE_FAIL, /* can't emulate this instruction */ }; diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 695399f2d5eb..1df115909758 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -85,26 +85,20 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, return ret; } -static inline bool kvm_para_available(void) +static inline uint32_t kvm_cpuid_base(void) { - unsigned int eax, ebx, ecx, edx; - char signature[13]; - if (boot_cpu_data.cpuid_level < 0) - return false; /* So we don't blow up on old processors */ + return 0; /* So we don't blow up on old processors */ - if (cpu_has_hypervisor) { - cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx); - memcpy(signature + 0, &ebx, 4); - memcpy(signature + 4, &ecx, 4); - memcpy(signature + 8, &edx, 4); - signature[12] = 0; + if (cpu_has_hypervisor) + return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0); - if (strcmp(signature, "KVMKVMKVM") == 0) - return true; - } + return 0; +} - return false; +static inline bool kvm_para_available(void) +{ + return kvm_cpuid_base() != 0; } static inline unsigned int kvm_arch_para_features(void) @@ -118,10 +112,20 @@ void kvm_async_pf_task_wait(u32 token); void kvm_async_pf_task_wake(u32 token); u32 kvm_read_and_reset_pf_reason(void); extern void kvm_disable_steal_time(void); -#else -#define kvm_guest_init() do { } while (0) + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +void __init kvm_spinlock_init(void); +#else /* !CONFIG_PARAVIRT_SPINLOCKS */ +static inline void kvm_spinlock_init(void) +{ +} +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +#else /* CONFIG_KVM_GUEST */ +#define kvm_guest_init() do {} while (0) #define kvm_async_pf_task_wait(T) do {} while(0) #define kvm_async_pf_task_wake(T) do {} while(0) + static inline u32 kvm_read_and_reset_pf_reason(void) { return 0; diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h index d354fb781c57..a55c7efcc4ed 100644 --- a/arch/x86/include/asm/mc146818rtc.h +++ b/arch/x86/include/asm/mc146818rtc.h @@ -95,8 +95,8 @@ static inline unsigned char current_lock_cmos_reg(void) unsigned char rtc_cmos_read(unsigned char addr); void rtc_cmos_write(unsigned char val, unsigned char addr); -extern int mach_set_rtc_mmss(unsigned long nowtime); -extern unsigned long mach_get_cmos_time(void); +extern int mach_set_rtc_mmss(const struct timespec *now); +extern void mach_get_cmos_time(struct timespec *now); #define RTC_IRQ 8 diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index fa5f71e021d5..cbe6b9e404ce 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -32,11 +32,20 @@ #define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */ #define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */ #define MCI_STATUS_AR (1ULL<<55) /* Action required */ -#define MCACOD 0xffff /* MCA Error Code */ + +/* + * Note that the full MCACOD field of IA32_MCi_STATUS MSR is + * bits 15:0. But bit 12 is the 'F' bit, defined for corrected + * errors to indicate that errors are being filtered by hardware. + * We should mask out bit 12 when looking for specific signatures + * of uncorrected errors - so the F bit is deliberately skipped + * in this #define. + */ +#define MCACOD 0xefff /* MCA Error Code */ /* Architecturally defined codes from SDM Vol. 3B Chapter 15 */ #define MCACOD_SCRUB 0x00C0 /* 0xC0-0xCF Memory Scrubbing */ -#define MCACOD_SCRUBMSK 0xfff0 +#define MCACOD_SCRUBMSK 0xeff0 /* Skip bit 12 ('F' bit) */ #define MCACOD_L3WB 0x017A /* L3 Explicit Writeback */ #define MCACOD_DATA 0x0134 /* Data Load */ #define MCACOD_INSTR 0x0150 /* Instruction Fetch */ @@ -61,7 +70,7 @@ #define MCJ_CTX_IRQ 0x2 /* inject context: IRQ */ #define MCJ_NMI_BROADCAST 0x4 /* do NMI broadcasting */ #define MCJ_EXCEPTION 0x8 /* raise as exception */ -#define MCJ_IRQ_BRAODCAST 0x10 /* do IRQ broadcasting */ +#define MCJ_IRQ_BROADCAST 0x10 /* do IRQ broadcasting */ #define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */ @@ -188,6 +197,9 @@ extern void register_mce_write_callback(ssize_t (*)(struct file *filp, const char __user *ubuf, size_t usize, loff_t *off)); +/* Disable CMCI/polling for MCA bank claimed by firmware */ +extern void mce_disable_bank(int bank); + /* * Exception handler */ @@ -214,6 +226,13 @@ void mce_log_therm_throt_event(__u64 status); /* Interrupt Handler for core thermal thresholds */ extern int (*platform_thermal_notify)(__u64 msr_val); +/* Interrupt Handler for package thermal thresholds */ +extern int (*platform_thermal_package_notify)(__u64 msr_val); + +/* Callback support of rate control, return true, if + * callback has rate control */ +extern bool (*platform_thermal_package_rate_control)(void); + #ifdef CONFIG_X86_THERMAL_VECTOR extern void mcheck_intel_therm_init(void); #else diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 6825e2efd1b4..f98bd6625318 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -60,11 +60,11 @@ static inline void __exit exit_amd_microcode(void) {} #ifdef CONFIG_MICROCODE_EARLY #define MAX_UCODE_COUNT 128 extern void __init load_ucode_bsp(void); -extern __init void load_ucode_ap(void); +extern void load_ucode_ap(void); extern int __init save_microcode_in_initrd(void); #else static inline void __init load_ucode_bsp(void) {} -static inline __init void load_ucode_ap(void) {} +static inline void load_ucode_ap(void) {} static inline int __init save_microcode_in_initrd(void) { return 0; diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h new file mode 100644 index 000000000000..4c019179a57d --- /dev/null +++ b/arch/x86/include/asm/microcode_amd.h @@ -0,0 +1,78 @@ +#ifndef _ASM_X86_MICROCODE_AMD_H +#define _ASM_X86_MICROCODE_AMD_H + +#include <asm/microcode.h> + +#define UCODE_MAGIC 0x00414d44 +#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 +#define UCODE_UCODE_TYPE 0x00000001 + +#define SECTION_HDR_SIZE 8 +#define CONTAINER_HDR_SZ 12 + +struct equiv_cpu_entry { + u32 installed_cpu; + u32 fixed_errata_mask; + u32 fixed_errata_compare; + u16 equiv_cpu; + u16 res; +} __attribute__((packed)); + +struct microcode_header_amd { + u32 data_code; + u32 patch_id; + u16 mc_patch_data_id; + u8 mc_patch_data_len; + u8 init_flag; + u32 mc_patch_data_checksum; + u32 nb_dev_id; + u32 sb_dev_id; + u16 processor_rev_id; + u8 nb_rev_id; + u8 sb_rev_id; + u8 bios_api_rev; + u8 reserved1[3]; + u32 match_reg[8]; +} __attribute__((packed)); + +struct microcode_amd { + struct microcode_header_amd hdr; + unsigned int mpb[0]; +}; + +static inline u16 find_equiv_id(struct equiv_cpu_entry *equiv_cpu_table, + unsigned int sig) +{ + int i = 0; + + if (!equiv_cpu_table) + return 0; + + while (equiv_cpu_table[i].installed_cpu != 0) { + if (sig == equiv_cpu_table[i].installed_cpu) + return equiv_cpu_table[i].equiv_cpu; + + i++; + } + return 0; +} + +extern int __apply_microcode_amd(struct microcode_amd *mc_amd); +extern int apply_microcode_amd(int cpu); +extern enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size); + +#ifdef CONFIG_MICROCODE_AMD_EARLY +#ifdef CONFIG_X86_32 +#define MPB_MAX_SIZE PAGE_SIZE +extern u8 amd_bsp_mpb[MPB_MAX_SIZE]; +#endif +extern void __init load_ucode_amd_bsp(void); +extern void load_ucode_amd_ap(void); +extern int __init save_microcode_in_initrd_amd(void); +#else +static inline void __init load_ucode_amd_bsp(void) {} +static inline void load_ucode_amd_ap(void) {} +static inline int __init save_microcode_in_initrd_amd(void) { return -EINVAL; } +#endif + +#endif /* _ASM_X86_MICROCODE_AMD_H */ diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index 5356f927d411..9067166409bf 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -65,12 +65,14 @@ update_match_revision(struct microcode_header_intel *mc_header, int rev); #ifdef CONFIG_MICROCODE_INTEL_EARLY extern void __init load_ucode_intel_bsp(void); -extern void __cpuinit load_ucode_intel_ap(void); +extern void load_ucode_intel_ap(void); extern void show_ucode_info_early(void); +extern int __init save_microcode_in_initrd_intel(void); #else static inline __init void load_ucode_intel_bsp(void) {} -static inline __cpuinit void load_ucode_intel_ap(void) {} +static inline void load_ucode_intel_ap(void) {} static inline void show_ucode_info_early(void) {} +static inline int __init save_microcode_in_initrd_intel(void) { return -EINVAL; } #endif #if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU) diff --git a/arch/x86/include/asm/mmconfig.h b/arch/x86/include/asm/mmconfig.h index 9b119da1d105..04a3fed22cfe 100644 --- a/arch/x86/include/asm/mmconfig.h +++ b/arch/x86/include/asm/mmconfig.h @@ -2,8 +2,8 @@ #define _ASM_X86_MMCONFIG_H #ifdef CONFIG_PCI_MMCONFIG -extern void __cpuinit fam10h_check_enable_mmcfg(void); -extern void __cpuinit check_enable_amd_mmconf_dmi(void); +extern void fam10h_check_enable_mmcfg(void); +extern void check_enable_amd_mmconf_dmi(void); #else static inline void fam10h_check_enable_mmcfg(void) { } static inline void check_enable_amd_mmconf_dmi(void) { } diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index cdbf36776106..be12c534fd59 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -45,22 +45,28 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, /* Re-load page tables */ load_cr3(next->pgd); - /* stop flush ipis for the previous mm */ + /* Stop flush ipis for the previous mm */ cpumask_clear_cpu(cpu, mm_cpumask(prev)); - /* - * load the LDT, if the LDT is different: - */ + /* Load the LDT, if the LDT is different: */ if (unlikely(prev->context.ldt != next->context.ldt)) load_LDT_nolock(&next->context); } #ifdef CONFIG_SMP - else { + else { this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); - if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) { - /* We were in lazy tlb mode and leave_mm disabled + if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { + /* + * On established mms, the mm_cpumask is only changed + * from irq context, from ptep_clear_flush() while in + * lazy tlb mode, and here. Irqs are blocked during + * schedule, protecting us from simultaneous changes. + */ + cpumask_set_cpu(cpu, mm_cpumask(next)); + /* + * We were in lazy tlb mode and leave_mm disabled * tlb flush IPI delivery. We must reload CR3 * to make sure to use no freed page tables. */ diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 3e2f42a4b872..626cf70082d7 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -94,7 +94,7 @@ static inline void early_reserve_e820_mpc_new(void) { } #define default_get_smp_config x86_init_uint_noop #endif -void __cpuinit generic_processor_info(int apicid, int version); +void generic_processor_info(int apicid, int version); #ifdef CONFIG_ACPI extern void mp_register_ioapic(int id, u32 address, u32 gsi_base); extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, diff --git a/arch/x86/include/asm/mrst-vrtc.h b/arch/x86/include/asm/mrst-vrtc.h index 73668abdbedf..1e69a75412a4 100644 --- a/arch/x86/include/asm/mrst-vrtc.h +++ b/arch/x86/include/asm/mrst-vrtc.h @@ -3,7 +3,7 @@ extern unsigned char vrtc_cmos_read(unsigned char reg); extern void vrtc_cmos_write(unsigned char val, unsigned char reg); -extern unsigned long vrtc_get_time(void); -extern int vrtc_set_mmss(unsigned long nowtime); +extern void vrtc_get_time(struct timespec *now); +extern int vrtc_set_mmss(const struct timespec *now); #endif diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index c2934be2446a..cd9c41938b8a 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -12,6 +12,9 @@ struct ms_hyperv_info { extern struct ms_hyperv_info ms_hyperv; void hyperv_callback_vector(void); +#ifdef CONFIG_TRACING +#define trace_hyperv_callback_vector hyperv_callback_vector +#endif void hyperv_vector_handler(struct pt_regs *regs); void hv_register_vmbus_handler(int irq, irq_handler_t handler); diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index e235582f9930..f768f6298419 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -26,7 +26,10 @@ #include <uapi/asm/mtrr.h> -/* The following functions are for use by other drivers */ +/* + * The following functions are for use by other drivers that cannot use + * arch_phys_wc_add and arch_phys_wc_del. + */ # ifdef CONFIG_MTRR extern u8 mtrr_type_lookup(u64 addr, u64 end); extern void mtrr_save_fixed_ranges(void *); @@ -45,6 +48,7 @@ extern void mtrr_aps_init(void); extern void mtrr_bp_restore(void); extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); +extern int phys_wc_to_mtrr_index(int handle); # else static inline u8 mtrr_type_lookup(u64 addr, u64 end) { @@ -80,6 +84,10 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn) static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) { } +static inline int phys_wc_to_mtrr_index(int handle) +{ + return -1; +} #define mtrr_ap_init() do {} while (0) #define mtrr_bp_init() do {} while (0) diff --git a/arch/x86/include/asm/mutex_32.h b/arch/x86/include/asm/mutex_32.h index 03f90c8a5a7c..0208c3c2cbc6 100644 --- a/arch/x86/include/asm/mutex_32.h +++ b/arch/x86/include/asm/mutex_32.h @@ -42,17 +42,14 @@ do { \ * __mutex_fastpath_lock_retval - try to take the lock by moving the count * from 1 to a 0 value * @count: pointer of type atomic_t - * @fail_fn: function to call if the original value was not 1 * - * Change the count from 1 to a value lower than 1, and call <fail_fn> if it - * wasn't 1 originally. This function returns 0 if the fastpath succeeds, - * or anything the slow path function returns + * Change the count from 1 to a value lower than 1. This function returns 0 + * if the fastpath succeeds, or -1 otherwise. */ -static inline int __mutex_fastpath_lock_retval(atomic_t *count, - int (*fail_fn)(atomic_t *)) +static inline int __mutex_fastpath_lock_retval(atomic_t *count) { if (unlikely(atomic_dec_return(count) < 0)) - return fail_fn(count); + return -1; else return 0; } diff --git a/arch/x86/include/asm/mutex_64.h b/arch/x86/include/asm/mutex_64.h index 68a87b0f8e29..e7e6751648ed 100644 --- a/arch/x86/include/asm/mutex_64.h +++ b/arch/x86/include/asm/mutex_64.h @@ -16,6 +16,20 @@ * * Atomically decrements @v and calls <fail_fn> if the result is negative. */ +#ifdef CC_HAVE_ASM_GOTO +static inline void __mutex_fastpath_lock(atomic_t *v, + void (*fail_fn)(atomic_t *)) +{ + asm volatile goto(LOCK_PREFIX " decl %0\n" + " jns %l[exit]\n" + : : "m" (v->counter) + : "memory", "cc" + : exit); + fail_fn(v); +exit: + return; +} +#else #define __mutex_fastpath_lock(v, fail_fn) \ do { \ unsigned long dummy; \ @@ -32,22 +46,20 @@ do { \ : "rax", "rsi", "rdx", "rcx", \ "r8", "r9", "r10", "r11", "memory"); \ } while (0) +#endif /** * __mutex_fastpath_lock_retval - try to take the lock by moving the count * from 1 to a 0 value * @count: pointer of type atomic_t - * @fail_fn: function to call if the original value was not 1 * - * Change the count from 1 to a value lower than 1, and call <fail_fn> if - * it wasn't 1 originally. This function returns 0 if the fastpath succeeds, - * or anything the slow path function returns + * Change the count from 1 to a value lower than 1. This function returns 0 + * if the fastpath succeeds, or -1 otherwise. */ -static inline int __mutex_fastpath_lock_retval(atomic_t *count, - int (*fail_fn)(atomic_t *)) +static inline int __mutex_fastpath_lock_retval(atomic_t *count) { if (unlikely(atomic_dec_return(count) < 0)) - return fail_fn(count); + return -1; else return 0; } @@ -59,6 +71,20 @@ static inline int __mutex_fastpath_lock_retval(atomic_t *count, * * Atomically increments @v and calls <fail_fn> if the result is nonpositive. */ +#ifdef CC_HAVE_ASM_GOTO +static inline void __mutex_fastpath_unlock(atomic_t *v, + void (*fail_fn)(atomic_t *)) +{ + asm volatile goto(LOCK_PREFIX " incl %0\n" + " jg %l[exit]\n" + : : "m" (v->counter) + : "memory", "cc" + : exit); + fail_fn(v); +exit: + return; +} +#else #define __mutex_fastpath_unlock(v, fail_fn) \ do { \ unsigned long dummy; \ @@ -75,6 +101,7 @@ do { \ : "rax", "rsi", "rdx", "rcx", \ "r8", "r9", "r10", "r11", "memory"); \ } while (0) +#endif #define __mutex_slowpath_needs_to_unlock() 1 diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index c0fa356e90de..86f9301903c8 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -18,9 +18,7 @@ extern int proc_nmi_enabled(struct ctl_table *, int , void __user *, size_t *, loff_t *); extern int unknown_nmi_panic; -void arch_trigger_all_cpu_backtrace(void); -#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace -#endif +#endif /* CONFIG_X86_LOCAL_APIC */ #define NMI_FLAG_FIRST 1 diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 1b99ee5c9f00..4064acae625d 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -39,7 +39,7 @@ static inline void set_apicid_to_node(int apicid, s16 node) __apicid_to_node[apicid] = node; } -extern int __cpuinit numa_cpu_node(int cpu); +extern int numa_cpu_node(int cpu); #else /* CONFIG_NUMA */ static inline void set_apicid_to_node(int apicid, s16 node) @@ -60,8 +60,8 @@ static inline int numa_cpu_node(int cpu) extern void numa_set_node(int cpu, int node); extern void numa_clear_node(int cpu); extern void __init init_cpu_to_node(void); -extern void __cpuinit numa_add_cpu(int cpu); -extern void __cpuinit numa_remove_cpu(int cpu); +extern void numa_add_cpu(int cpu); +extern void numa_remove_cpu(int cpu); #else /* CONFIG_NUMA */ static inline void numa_set_node(int cpu, int node) { } static inline void numa_clear_node(int cpu) { } diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index ef17af013475..f48b17df4224 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -15,6 +15,8 @@ */ #define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) +#define __START_KERNEL_map __PAGE_OFFSET + #define THREAD_SIZE_ORDER 1 #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 6c896fbe21db..43dcd804ebd5 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -32,11 +32,6 @@ */ #define __PAGE_OFFSET _AC(0xffff880000000000, UL) -#define __PHYSICAL_START ((CONFIG_PHYSICAL_START + \ - (CONFIG_PHYSICAL_ALIGN - 1)) & \ - ~(CONFIG_PHYSICAL_ALIGN - 1)) - -#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) #define __START_KERNEL_map _AC(0xffffffff80000000, UL) /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 54c97879195e..f97fbe3abb67 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -33,6 +33,11 @@ (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \ VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#define __PHYSICAL_START ALIGN(CONFIG_PHYSICAL_START, \ + CONFIG_PHYSICAL_ALIGN) + +#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) + #ifdef CONFIG_X86_64 #include <asm/page_64_types.h> #else diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index cfdc9ee4c900..401f350ef71b 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -712,36 +712,16 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) -static inline int arch_spin_is_locked(struct arch_spinlock *lock) +static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock, + __ticket_t ticket) { - return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock); + PVOP_VCALLEE2(pv_lock_ops.lock_spinning, lock, ticket); } -static inline int arch_spin_is_contended(struct arch_spinlock *lock) +static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock, + __ticket_t ticket) { - return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock); -} -#define arch_spin_is_contended arch_spin_is_contended - -static __always_inline void arch_spin_lock(struct arch_spinlock *lock) -{ - PVOP_VCALL1(pv_lock_ops.spin_lock, lock); -} - -static __always_inline void arch_spin_lock_flags(struct arch_spinlock *lock, - unsigned long flags) -{ - PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags); -} - -static __always_inline int arch_spin_trylock(struct arch_spinlock *lock) -{ - return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock); -} - -static __always_inline void arch_spin_unlock(struct arch_spinlock *lock) -{ - PVOP_VCALL1(pv_lock_ops.spin_unlock, lock); + PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket); } #endif diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 0db1fcac668c..aab8f671b523 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -327,13 +327,15 @@ struct pv_mmu_ops { }; struct arch_spinlock; +#ifdef CONFIG_SMP +#include <asm/spinlock_types.h> +#else +typedef u16 __ticket_t; +#endif + struct pv_lock_ops { - int (*spin_is_locked)(struct arch_spinlock *lock); - int (*spin_is_contended)(struct arch_spinlock *lock); - void (*spin_lock)(struct arch_spinlock *lock); - void (*spin_lock_flags)(struct arch_spinlock *lock, unsigned long flags); - int (*spin_trylock)(struct arch_spinlock *lock); - void (*spin_unlock)(struct arch_spinlock *lock); + struct paravirt_callee_save lock_spinning; + void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket); }; /* This contains all the paravirt structures: we get a convenient @@ -387,7 +389,8 @@ extern struct pv_lock_ops pv_lock_ops; /* Simple instruction patching code. */ #define DEF_NATIVE(ops, name, code) \ - extern const char start_##ops##_##name[], end_##ops##_##name[]; \ + extern const char start_##ops##_##name[] __visible, \ + end_##ops##_##name[] __visible; \ asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") unsigned paravirt_patch_nop(void); diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index d9e9e6c7ed32..7d7443283a9d 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -100,29 +100,6 @@ static inline void early_quirks(void) { } extern void pci_iommu_alloc(void); #ifdef CONFIG_PCI_MSI -/* MSI arch specific hooks */ -static inline int x86_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) -{ - return x86_msi.setup_msi_irqs(dev, nvec, type); -} - -static inline void x86_teardown_msi_irqs(struct pci_dev *dev) -{ - x86_msi.teardown_msi_irqs(dev); -} - -static inline void x86_teardown_msi_irq(unsigned int irq) -{ - x86_msi.teardown_msi_irq(irq); -} -static inline void x86_restore_msi_irqs(struct pci_dev *dev, int irq) -{ - x86_msi.restore_msi_irqs(dev, irq); -} -#define arch_setup_msi_irqs x86_setup_msi_irqs -#define arch_teardown_msi_irqs x86_teardown_msi_irqs -#define arch_teardown_msi_irq x86_teardown_msi_irq -#define arch_restore_msi_irqs x86_restore_msi_irqs /* implemented in arch/x86/kernel/apic/io_apic. */ struct msi_desc; int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); @@ -130,16 +107,9 @@ void native_teardown_msi_irq(unsigned int irq); void native_restore_msi_irqs(struct pci_dev *dev, int irq); int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, unsigned int irq_base, unsigned int irq_offset); -/* default to the implementation in drivers/lib/msi.c */ -#define HAVE_DEFAULT_MSI_TEARDOWN_IRQS -#define HAVE_DEFAULT_MSI_RESTORE_IRQS -void default_teardown_msi_irqs(struct pci_dev *dev); -void default_restore_msi_irqs(struct pci_dev *dev, int irq); #else #define native_setup_msi_irqs NULL #define native_teardown_msi_irq NULL -#define default_teardown_msi_irqs NULL -#define default_restore_msi_irqs NULL #endif #define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 57cb63402213..8249df45d2f2 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -29,6 +29,9 @@ #define ARCH_PERFMON_EVENTSEL_INV (1ULL << 23) #define ARCH_PERFMON_EVENTSEL_CMASK 0xFF000000ULL +#define HSW_IN_TX (1ULL << 32) +#define HSW_IN_TX_CHECKPOINTED (1ULL << 33) + #define AMD64_EVENTSEL_INT_CORE_ENABLE (1ULL << 36) #define AMD64_EVENTSEL_GUESTONLY (1ULL << 40) #define AMD64_EVENTSEL_HOSTONLY (1ULL << 41) diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index f2b489cf1602..3bf2dd0cf61f 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -55,9 +55,53 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif +#ifdef CONFIG_MEM_SOFT_DIRTY + +/* + * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE, _PAGE_BIT_SOFT_DIRTY and + * _PAGE_BIT_PROTNONE are taken, split up the 28 bits of offset + * into this range. + */ +#define PTE_FILE_MAX_BITS 28 +#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) +#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1) +#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1) +#define PTE_FILE_SHIFT4 (_PAGE_BIT_SOFT_DIRTY + 1) +#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) +#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) +#define PTE_FILE_BITS3 (PTE_FILE_SHIFT4 - PTE_FILE_SHIFT3 - 1) + +#define pte_to_pgoff(pte) \ + ((((pte).pte_low >> (PTE_FILE_SHIFT1)) \ + & ((1U << PTE_FILE_BITS1) - 1))) \ + + ((((pte).pte_low >> (PTE_FILE_SHIFT2)) \ + & ((1U << PTE_FILE_BITS2) - 1)) \ + << (PTE_FILE_BITS1)) \ + + ((((pte).pte_low >> (PTE_FILE_SHIFT3)) \ + & ((1U << PTE_FILE_BITS3) - 1)) \ + << (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ + + ((((pte).pte_low >> (PTE_FILE_SHIFT4))) \ + << (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3)) + +#define pgoff_to_pte(off) \ + ((pte_t) { .pte_low = \ + ((((off)) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ + + ((((off) >> PTE_FILE_BITS1) \ + & ((1U << PTE_FILE_BITS2) - 1)) \ + << PTE_FILE_SHIFT2) \ + + ((((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ + & ((1U << PTE_FILE_BITS3) - 1)) \ + << PTE_FILE_SHIFT3) \ + + ((((off) >> \ + (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3))) \ + << PTE_FILE_SHIFT4) \ + + _PAGE_FILE }) + +#else /* CONFIG_MEM_SOFT_DIRTY */ + /* * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, - * split up the 29 bits of offset into this range: + * split up the 29 bits of offset into this range. */ #define PTE_FILE_MAX_BITS 29 #define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) @@ -88,6 +132,8 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) << PTE_FILE_SHIFT3) \ + _PAGE_FILE }) +#endif /* CONFIG_MEM_SOFT_DIRTY */ + /* Encode and de-code a swap entry */ #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 4cc9f2b7cdc3..81bb91b49a88 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -179,6 +179,9 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) /* * Bits 0, 6 and 7 are taken in the low part of the pte, * put the 32 bits of offset into the high part. + * + * For soft-dirty tracking 11 bit is taken from + * the low part of pte as well. */ #define pte_to_pgoff(pte) ((pte).pte_high) #define pgoff_to_pte(off) \ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1e672234c4ff..3d1999458709 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -22,7 +22,8 @@ * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. */ -extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; +extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] + __visible; #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) extern spinlock_t pgd_lock; @@ -207,7 +208,7 @@ static inline pte_t pte_mkexec(pte_t pte) static inline pte_t pte_mkdirty(pte_t pte) { - return pte_set_flags(pte, _PAGE_DIRTY); + return pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); } static inline pte_t pte_mkyoung(pte_t pte) @@ -271,7 +272,7 @@ static inline pmd_t pmd_wrprotect(pmd_t pmd) static inline pmd_t pmd_mkdirty(pmd_t pmd) { - return pmd_set_flags(pmd, _PAGE_DIRTY); + return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); } static inline pmd_t pmd_mkhuge(pmd_t pmd) @@ -294,6 +295,41 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd) return pmd_clear_flags(pmd, _PAGE_PRESENT); } +static inline int pte_soft_dirty(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SOFT_DIRTY; +} + +static inline int pmd_soft_dirty(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_SOFT_DIRTY; +} + +static inline pte_t pte_mksoft_dirty(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SOFT_DIRTY); +} + +static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); +} + +static inline pte_t pte_file_clear_soft_dirty(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); +} + +static inline pte_t pte_file_mksoft_dirty(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SOFT_DIRTY); +} + +static inline int pte_file_soft_dirty(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SOFT_DIRTY; +} + /* * Mask out unsupported bits in a present pgprot. Non-present pgprots * can use those bits for other purposes, so leave them be. @@ -395,6 +431,7 @@ pte_t *populate_extra_pte(unsigned long vaddr); #ifndef __ASSEMBLY__ #include <linux/mm_types.h> +#include <linux/mmdebug.h> #include <linux/log2.h> static inline int pte_none(pte_t pte) @@ -506,9 +543,6 @@ static inline unsigned long pages_to_mb(unsigned long npg) return npg >> (20 - PAGE_SHIFT); } -#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ - remap_pfn_range(vma, vaddr, pfn, size, prot) - #if PAGETABLE_LEVELS > 2 static inline int pud_none(pud_t pud) { @@ -816,6 +850,24 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, { } +static inline pte_t pte_swp_mksoft_dirty(pte_t pte) +{ + VM_BUG_ON(pte_present(pte)); + return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); +} + +static inline int pte_swp_soft_dirty(pte_t pte) +{ + VM_BUG_ON(pte_present(pte)); + return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; +} + +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) +{ + VM_BUG_ON(pte_present(pte)); + return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); +} + #include <asm-generic/pgtable.h> #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index e6423002c10b..0ecac257fb26 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -55,6 +55,36 @@ #define _PAGE_HIDDEN (_AT(pteval_t, 0)) #endif +/* + * The same hidden bit is used by kmemcheck, but since kmemcheck + * works on kernel pages while soft-dirty engine on user space, + * they do not conflict with each other. + */ + +#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_HIDDEN + +#ifdef CONFIG_MEM_SOFT_DIRTY +#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) +#else +#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) +#endif + +/* + * Tracking soft dirty bit when a page goes to a swap is tricky. + * We need a bit which can be stored in pte _and_ not conflict + * with swap entry format. On x86 bits 6 and 7 are *not* involved + * into swap entry computation, but bit 6 is used for nonlinear + * file mapping, so we borrow bit 7 for soft dirty tracking. + * + * Please note that this bit must be treated as swap dirty page + * mark if and only if the PTE has present bit clear! + */ +#ifdef CONFIG_MEM_SOFT_DIRTY +#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE +#else +#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) +#endif + #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #else diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 22224b3b43bb..987c75ecc334 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -89,9 +89,9 @@ struct cpuinfo_x86 { char wp_works_ok; /* It doesn't on 386's */ /* Problems on some 486Dx4's and old 386's: */ - char hard_math; char rfu; char pad0; + char pad1; #else /* Number of 4K pages in DTLB/ITLB combined(in pages): */ int x86_tlbsize; @@ -164,6 +164,7 @@ extern const struct seq_operations cpuinfo_op; #define cache_line_size() (boot_cpu_data.x86_cache_alignment) extern void cpu_detect(struct cpuinfo_x86 *c); +extern void fpu_detect(struct cpuinfo_x86 *c); extern void early_cpu_init(void); extern void identify_boot_cpu(void); @@ -411,7 +412,7 @@ union irq_stack_union { }; }; -DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union); +DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible; DECLARE_INIT_PER_CPU(irq_stack_union); DECLARE_PER_CPU(char *, irq_stack_ptr); @@ -941,33 +942,19 @@ extern int set_tsc_mode(unsigned int val); extern u16 amd_get_nb_id(int cpu); -struct aperfmperf { - u64 aperf, mperf; -}; - -static inline void get_aperfmperf(struct aperfmperf *am) +static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) { - WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF)); - - rdmsrl(MSR_IA32_APERF, am->aperf); - rdmsrl(MSR_IA32_MPERF, am->mperf); -} + uint32_t base, eax, signature[3]; -#define APERFMPERF_SHIFT 10 + for (base = 0x40000000; base < 0x40010000; base += 0x100) { + cpuid(base, &eax, &signature[0], &signature[1], &signature[2]); -static inline -unsigned long calc_aperfmperf_ratio(struct aperfmperf *old, - struct aperfmperf *new) -{ - u64 aperf = new->aperf - old->aperf; - u64 mperf = new->mperf - old->mperf; - unsigned long ratio = aperf; - - mperf >>= APERFMPERF_SHIFT; - if (mperf) - ratio = div64_u64(aperf, mperf); + if (!memcmp(sig, signature, 12) && + (leaves == 0 || ((eax - base) >= leaves))) + return base; + } - return ratio; + return 0; } extern unsigned long arch_align_stack(unsigned long sp); @@ -981,5 +968,5 @@ bool xen_set_default_idle(void); #endif void stop_this_cpu(void *dummy); - +void df_debug(struct pt_regs *regs, long error_code); #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index 60bef663609a..bade6ac3b14f 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -27,7 +27,7 @@ extern int of_ioapic; extern u64 initial_dtb; extern void add_dtb(u64 data); extern void x86_add_irq_domains(void); -void __cpuinit x86_of_pci_init(void); +void x86_of_pci_init(void); void x86_dtb_init(void); #else static inline void add_dtb(u64 data) { } diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 109a9dd5d454..be8269b00e2a 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -93,7 +93,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, struct pvclock_vsyscall_time_info { struct pvclock_vcpu_time_info pvti; - u32 migrate_count; } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index b7bf3505e1ec..347555492dad 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -6,6 +6,8 @@ #define COMMAND_LINE_SIZE 2048 +#include <linux/linkage.h> + #ifdef __i386__ #include <linux/pfn.h> @@ -108,11 +110,11 @@ void *extend_brk(size_t size, size_t align); extern void probe_roms(void); #ifdef __i386__ -void __init i386_start_kernel(void); +asmlinkage void __init i386_start_kernel(void); #else -void __init x86_64_start_kernel(char *real_mode); -void __init x86_64_start_reservations(char *real_mode_data); +asmlinkage void __init x86_64_start_kernel(char *real_mode); +asmlinkage void __init x86_64_start_reservations(char *real_mode_data); #endif /* __i386__ */ #endif /* _SETUP */ diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index beff97f7df37..7a958164088c 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h @@ -7,10 +7,10 @@ #include <asm/processor-flags.h> -#define __FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \ +#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \ X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \ X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \ - X86_EFLAGS_CF) + X86_EFLAGS_CF | X86_EFLAGS_RF) void signal_fault(struct pt_regs *regs, void __user *frame, char *where); diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index b073aaea747c..4137890e88e3 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -179,7 +179,7 @@ static inline int wbinvd_on_all_cpus(void) } #endif /* CONFIG_SMP */ -extern unsigned disabled_cpus __cpuinitdata; +extern unsigned disabled_cpus; #ifdef CONFIG_X86_32_SMP /* diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 41fc93a2e225..645cad2c95ff 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -16,7 +16,7 @@ static inline void native_clts(void) * all loads stores around it, which can hurt performance. Solution is to * use a variable and mimic reads and writes to it to enforce serialization */ -static unsigned long __force_order; +extern unsigned long __force_order; static inline unsigned long native_read_cr0(void) { @@ -101,7 +101,7 @@ static inline void native_wbinvd(void) asm volatile("wbinvd": : :"memory"); } -extern void native_load_gs_index(unsigned); +extern asmlinkage void native_load_gs_index(unsigned); #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 33692eaabab5..bf156ded74b5 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -1,11 +1,14 @@ #ifndef _ASM_X86_SPINLOCK_H #define _ASM_X86_SPINLOCK_H +#include <linux/jump_label.h> #include <linux/atomic.h> #include <asm/page.h> #include <asm/processor.h> #include <linux/compiler.h> #include <asm/paravirt.h> +#include <asm/bitops.h> + /* * Your basic SMP spinlocks, allowing only a single CPU anywhere * @@ -34,6 +37,36 @@ # define UNLOCK_LOCK_PREFIX #endif +/* How long a lock should spin before we consider blocking */ +#define SPIN_THRESHOLD (1 << 15) + +extern struct static_key paravirt_ticketlocks_enabled; +static __always_inline bool static_key_false(struct static_key *key); + +#ifdef CONFIG_PARAVIRT_SPINLOCKS + +static inline void __ticket_enter_slowpath(arch_spinlock_t *lock) +{ + set_bit(0, (volatile unsigned long *)&lock->tickets.tail); +} + +#else /* !CONFIG_PARAVIRT_SPINLOCKS */ +static __always_inline void __ticket_lock_spinning(arch_spinlock_t *lock, + __ticket_t ticket) +{ +} +static inline void __ticket_unlock_kick(arch_spinlock_t *lock, + __ticket_t ticket) +{ +} + +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock) +{ + return lock.tickets.head == lock.tickets.tail; +} + /* * Ticket locks are conceptually two parts, one indicating the current head of * the queue, and the other indicating the current tail. The lock is acquired @@ -47,81 +80,101 @@ * in the high part, because a wide xadd increment of the low part would carry * up and contaminate the high part. */ -static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) +static __always_inline void arch_spin_lock(arch_spinlock_t *lock) { - register struct __raw_tickets inc = { .tail = 1 }; + register struct __raw_tickets inc = { .tail = TICKET_LOCK_INC }; inc = xadd(&lock->tickets, inc); + if (likely(inc.head == inc.tail)) + goto out; + inc.tail &= ~TICKET_SLOWPATH_FLAG; for (;;) { - if (inc.head == inc.tail) - break; - cpu_relax(); - inc.head = ACCESS_ONCE(lock->tickets.head); + unsigned count = SPIN_THRESHOLD; + + do { + if (ACCESS_ONCE(lock->tickets.head) == inc.tail) + goto out; + cpu_relax(); + } while (--count); + __ticket_lock_spinning(lock, inc.tail); } - barrier(); /* make sure nothing creeps before the lock is taken */ +out: barrier(); /* make sure nothing creeps before the lock is taken */ } -static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) +static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) { arch_spinlock_t old, new; old.tickets = ACCESS_ONCE(lock->tickets); - if (old.tickets.head != old.tickets.tail) + if (old.tickets.head != (old.tickets.tail & ~TICKET_SLOWPATH_FLAG)) return 0; - new.head_tail = old.head_tail + (1 << TICKET_SHIFT); + new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT); /* cmpxchg is a full barrier, so nothing can move before it */ return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail; } -static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) +static inline void __ticket_unlock_slowpath(arch_spinlock_t *lock, + arch_spinlock_t old) { - __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX); + arch_spinlock_t new; + + BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS); + + /* Perform the unlock on the "before" copy */ + old.tickets.head += TICKET_LOCK_INC; + + /* Clear the slowpath flag */ + new.head_tail = old.head_tail & ~(TICKET_SLOWPATH_FLAG << TICKET_SHIFT); + + /* + * If the lock is uncontended, clear the flag - use cmpxchg in + * case it changes behind our back though. + */ + if (new.tickets.head != new.tickets.tail || + cmpxchg(&lock->head_tail, old.head_tail, + new.head_tail) != old.head_tail) { + /* + * Lock still has someone queued for it, so wake up an + * appropriate waiter. + */ + __ticket_unlock_kick(lock, old.tickets.head); + } } -static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) +static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) { - struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); + if (TICKET_SLOWPATH_FLAG && + static_key_false(¶virt_ticketlocks_enabled)) { + arch_spinlock_t prev; - return tmp.tail != tmp.head; -} + prev = *lock; + add_smp(&lock->tickets.head, TICKET_LOCK_INC); -static inline int __ticket_spin_is_contended(arch_spinlock_t *lock) -{ - struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); + /* add_smp() is a full mb() */ - return (__ticket_t)(tmp.tail - tmp.head) > 1; + if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG)) + __ticket_unlock_slowpath(lock, prev); + } else + __add(&lock->tickets.head, TICKET_LOCK_INC, UNLOCK_LOCK_PREFIX); } -#ifndef CONFIG_PARAVIRT_SPINLOCKS - static inline int arch_spin_is_locked(arch_spinlock_t *lock) { - return __ticket_spin_is_locked(lock); -} - -static inline int arch_spin_is_contended(arch_spinlock_t *lock) -{ - return __ticket_spin_is_contended(lock); -} -#define arch_spin_is_contended arch_spin_is_contended + struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); -static __always_inline void arch_spin_lock(arch_spinlock_t *lock) -{ - __ticket_spin_lock(lock); + return tmp.tail != tmp.head; } -static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) +static inline int arch_spin_is_contended(arch_spinlock_t *lock) { - return __ticket_spin_trylock(lock); -} + struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); -static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) -{ - __ticket_spin_unlock(lock); + return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC; } +#define arch_spin_is_contended arch_spin_is_contended static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) @@ -129,8 +182,6 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, arch_spin_lock(lock); } -#endif /* CONFIG_PARAVIRT_SPINLOCKS */ - static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) { while (arch_spin_is_locked(lock)) @@ -233,8 +284,4 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) #define arch_read_relax(lock) cpu_relax() #define arch_write_relax(lock) cpu_relax() -/* The {read|write|spin}_lock() on x86 are full memory barriers. */ -static inline void smp_mb__after_lock(void) { } -#define ARCH_HAS_SMP_MB_AFTER_LOCK - #endif /* _ASM_X86_SPINLOCK_H */ diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index ad0ad07fc006..4f1bea19945b 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -1,13 +1,17 @@ #ifndef _ASM_X86_SPINLOCK_TYPES_H #define _ASM_X86_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H -# error "please don't include this file directly" -#endif - #include <linux/types.h> -#if (CONFIG_NR_CPUS < 256) +#ifdef CONFIG_PARAVIRT_SPINLOCKS +#define __TICKET_LOCK_INC 2 +#define TICKET_SLOWPATH_FLAG ((__ticket_t)1) +#else +#define __TICKET_LOCK_INC 1 +#define TICKET_SLOWPATH_FLAG ((__ticket_t)0) +#endif + +#if (CONFIG_NR_CPUS < (256 / __TICKET_LOCK_INC)) typedef u8 __ticket_t; typedef u16 __ticketpair_t; #else @@ -15,6 +19,8 @@ typedef u16 __ticket_t; typedef u32 __ticketpair_t; #endif +#define TICKET_LOCK_INC ((__ticket_t)__TICKET_LOCK_INC) + #define TICKET_SHIFT (sizeof(__ticket_t) * 8) typedef struct arch_spinlock { diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 4ec45b3abba1..d7f3b3b78ac3 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -2,8 +2,8 @@ #define _ASM_X86_SWITCH_TO_H struct task_struct; /* one of the stranger aspects of C forward declarations */ -struct task_struct *__switch_to(struct task_struct *prev, - struct task_struct *next); +__visible struct task_struct *__switch_to(struct task_struct *prev, + struct task_struct *next); struct tss_struct; void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss); diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h index 9d09b4073b60..05af3b31d522 100644 --- a/arch/x86/include/asm/sync_bitops.h +++ b/arch/x86/include/asm/sync_bitops.h @@ -26,9 +26,9 @@ * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ -static inline void sync_set_bit(int nr, volatile unsigned long *addr) +static inline void sync_set_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; btsl %1,%0" + asm volatile("lock; bts %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -44,9 +44,9 @@ static inline void sync_set_bit(int nr, volatile unsigned long *addr) * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() * in order to ensure changes are visible on other processors. */ -static inline void sync_clear_bit(int nr, volatile unsigned long *addr) +static inline void sync_clear_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; btrl %1,%0" + asm volatile("lock; btr %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -61,9 +61,9 @@ static inline void sync_clear_bit(int nr, volatile unsigned long *addr) * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ -static inline void sync_change_bit(int nr, volatile unsigned long *addr) +static inline void sync_change_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; btcl %1,%0" + asm volatile("lock; btc %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -77,11 +77,11 @@ static inline void sync_change_bit(int nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int sync_test_and_set_bit(int nr, volatile unsigned long *addr) +static inline int sync_test_and_set_bit(long nr, volatile unsigned long *addr) { int oldbit; - asm volatile("lock; btsl %2,%1\n\tsbbl %0,%0" + asm volatile("lock; bts %2,%1\n\tsbbl %0,%0" : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); return oldbit; @@ -95,11 +95,11 @@ static inline int sync_test_and_set_bit(int nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int sync_test_and_clear_bit(int nr, volatile unsigned long *addr) +static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr) { int oldbit; - asm volatile("lock; btrl %2,%1\n\tsbbl %0,%0" + asm volatile("lock; btr %2,%1\n\tsbbl %0,%0" : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); return oldbit; @@ -113,11 +113,11 @@ static inline int sync_test_and_clear_bit(int nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int sync_test_and_change_bit(int nr, volatile unsigned long *addr) +static inline int sync_test_and_change_bit(long nr, volatile unsigned long *addr) { int oldbit; - asm volatile("lock; btcl %2,%1\n\tsbbl %0,%0" + asm volatile("lock; btc %2,%1\n\tsbbl %0,%0" : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); return oldbit; diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 2e188d68397c..aea284b41312 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -20,7 +20,8 @@ #include <asm/thread_info.h> /* for TS_COMPAT */ #include <asm/unistd.h> -extern const unsigned long sys_call_table[]; +typedef void (*sys_call_ptr_t)(void); +extern const sys_call_ptr_t sys_call_table[]; /* * Only the low 32 bits of orig_ax are meaningful, so we return int. diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 2917a6452c49..592a6a672e07 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -24,7 +24,7 @@ asmlinkage long sys_iopl(unsigned int); asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); /* kernel/signal.c */ -long sys_rt_sigreturn(void); +asmlinkage long sys_rt_sigreturn(void); /* kernel/tls.c */ asmlinkage long sys_set_thread_area(struct user_desc __user *); @@ -34,7 +34,7 @@ asmlinkage long sys_get_thread_area(struct user_desc __user *); #ifdef CONFIG_X86_32 /* kernel/signal.c */ -unsigned long sys_sigreturn(void); +asmlinkage unsigned long sys_sigreturn(void); /* kernel/vm86_32.c */ asmlinkage long sys_vm86old(struct vm86_struct __user *); @@ -44,7 +44,7 @@ asmlinkage long sys_vm86(unsigned long, unsigned long); /* X86_64 only */ /* kernel/process_64.c */ -long sys_arch_prctl(int, unsigned long); +asmlinkage long sys_arch_prctl(int, unsigned long); /* kernel/sys_x86_64.c */ asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long, diff --git a/arch/x86/include/asm/sysfb.h b/arch/x86/include/asm/sysfb.h new file mode 100644 index 000000000000..2aeb3e25579c --- /dev/null +++ b/arch/x86/include/asm/sysfb.h @@ -0,0 +1,98 @@ +#ifndef _ARCH_X86_KERNEL_SYSFB_H +#define _ARCH_X86_KERNEL_SYSFB_H + +/* + * Generic System Framebuffers on x86 + * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include <linux/kernel.h> +#include <linux/platform_data/simplefb.h> +#include <linux/screen_info.h> + +enum { + M_I17, /* 17-Inch iMac */ + M_I20, /* 20-Inch iMac */ + M_I20_SR, /* 20-Inch iMac (Santa Rosa) */ + M_I24, /* 24-Inch iMac */ + M_I24_8_1, /* 24-Inch iMac, 8,1th gen */ + M_I24_10_1, /* 24-Inch iMac, 10,1th gen */ + M_I27_11_1, /* 27-Inch iMac, 11,1th gen */ + M_MINI, /* Mac Mini */ + M_MINI_3_1, /* Mac Mini, 3,1th gen */ + M_MINI_4_1, /* Mac Mini, 4,1th gen */ + M_MB, /* MacBook */ + M_MB_2, /* MacBook, 2nd rev. */ + M_MB_3, /* MacBook, 3rd rev. */ + M_MB_5_1, /* MacBook, 5th rev. */ + M_MB_6_1, /* MacBook, 6th rev. */ + M_MB_7_1, /* MacBook, 7th rev. */ + M_MB_SR, /* MacBook, 2nd gen, (Santa Rosa) */ + M_MBA, /* MacBook Air */ + M_MBA_3, /* Macbook Air, 3rd rev */ + M_MBP, /* MacBook Pro */ + M_MBP_2, /* MacBook Pro 2nd gen */ + M_MBP_2_2, /* MacBook Pro 2,2nd gen */ + M_MBP_SR, /* MacBook Pro (Santa Rosa) */ + M_MBP_4, /* MacBook Pro, 4th gen */ + M_MBP_5_1, /* MacBook Pro, 5,1th gen */ + M_MBP_5_2, /* MacBook Pro, 5,2th gen */ + M_MBP_5_3, /* MacBook Pro, 5,3rd gen */ + M_MBP_6_1, /* MacBook Pro, 6,1th gen */ + M_MBP_6_2, /* MacBook Pro, 6,2th gen */ + M_MBP_7_1, /* MacBook Pro, 7,1th gen */ + M_MBP_8_2, /* MacBook Pro, 8,2nd gen */ + M_UNKNOWN /* placeholder */ +}; + +struct efifb_dmi_info { + char *optname; + unsigned long base; + int stride; + int width; + int height; + int flags; +}; + +#ifdef CONFIG_EFI + +extern struct efifb_dmi_info efifb_dmi_list[]; +void sysfb_apply_efi_quirks(void); + +#else /* CONFIG_EFI */ + +static inline void sysfb_apply_efi_quirks(void) +{ +} + +#endif /* CONFIG_EFI */ + +#ifdef CONFIG_X86_SYSFB + +bool parse_mode(const struct screen_info *si, + struct simplefb_platform_data *mode); +int create_simplefb(const struct screen_info *si, + const struct simplefb_platform_data *mode); + +#else /* CONFIG_X86_SYSFB */ + +static inline bool parse_mode(const struct screen_info *si, + struct simplefb_platform_data *mode) +{ + return false; +} + +static inline int create_simplefb(const struct screen_info *si, + const struct simplefb_platform_data *mode) +{ + return -EINVAL; +} + +#endif /* CONFIG_X86_SYSFB */ + +#endif /* _ARCH_X86_KERNEL_SYSFB_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index a1df6e84691f..27811190cbd7 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -89,7 +89,6 @@ struct thread_info { #define TIF_FORK 18 /* ret_from_fork */ #define TIF_NOHZ 19 /* in adaptive nohz mode */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ -#define TIF_DEBUG 21 /* uses debug registers */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ @@ -113,7 +112,6 @@ struct thread_info { #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) #define _TIF_NOHZ (1 << TIF_NOHZ) -#define _TIF_DEBUG (1 << TIF_DEBUG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) @@ -154,7 +152,7 @@ struct thread_info { (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP) #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) -#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) +#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) #define PREEMPT_ACTIVE 0x10000000 diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 50a7fc0f824a..e6d90babc245 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -62,7 +62,8 @@ static inline void __flush_tlb_all(void) static inline void __flush_tlb_one(unsigned long addr) { - __flush_tlb_single(addr); + count_vm_event(NR_TLB_LOCAL_FLUSH_ONE); + __flush_tlb_single(addr); } #define TLB_FLUSH_ALL -1UL @@ -84,14 +85,38 @@ static inline void __flush_tlb_one(unsigned long addr) #ifndef CONFIG_SMP -#define flush_tlb() __flush_tlb() -#define flush_tlb_all() __flush_tlb_all() -#define local_flush_tlb() __flush_tlb() +/* "_up" is for UniProcessor. + * + * This is a helper for other header functions. *Not* intended to be called + * directly. All global TLB flushes need to either call this, or to bump the + * vm statistics themselves. + */ +static inline void __flush_tlb_up(void) +{ + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + __flush_tlb(); +} + +static inline void flush_tlb_all(void) +{ + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + __flush_tlb_all(); +} + +static inline void flush_tlb(void) +{ + __flush_tlb_up(); +} + +static inline void local_flush_tlb(void) +{ + __flush_tlb_up(); +} static inline void flush_tlb_mm(struct mm_struct *mm) { if (mm == current->active_mm) - __flush_tlb(); + __flush_tlb_up(); } static inline void flush_tlb_page(struct vm_area_struct *vma, @@ -105,14 +130,14 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { if (vma->vm_mm == current->active_mm) - __flush_tlb(); + __flush_tlb_up(); } static inline void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long vmflag) { if (mm == current->active_mm) - __flush_tlb(); + __flush_tlb_up(); } static inline void native_flush_tlb_others(const struct cpumask *cpumask, diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 095b21507b6a..d35f24e231cd 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -124,9 +124,6 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu); #define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_thread_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) - -/* indicates that pointers to the topology cpumask_t maps are valid */ -#define arch_provides_topology_pointers yes #endif static inline void arch_fix_phys_package_id(int num, u32 slot) diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h new file mode 100644 index 000000000000..2874df24e7a4 --- /dev/null +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -0,0 +1,104 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM irq_vectors + +#if !defined(_TRACE_IRQ_VECTORS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_IRQ_VECTORS_H + +#include <linux/tracepoint.h> + +extern void trace_irq_vector_regfunc(void); +extern void trace_irq_vector_unregfunc(void); + +DECLARE_EVENT_CLASS(x86_irq_vector, + + TP_PROTO(int vector), + + TP_ARGS(vector), + + TP_STRUCT__entry( + __field( int, vector ) + ), + + TP_fast_assign( + __entry->vector = vector; + ), + + TP_printk("vector=%d", __entry->vector) ); + +#define DEFINE_IRQ_VECTOR_EVENT(name) \ +DEFINE_EVENT_FN(x86_irq_vector, name##_entry, \ + TP_PROTO(int vector), \ + TP_ARGS(vector), \ + trace_irq_vector_regfunc, \ + trace_irq_vector_unregfunc); \ +DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \ + TP_PROTO(int vector), \ + TP_ARGS(vector), \ + trace_irq_vector_regfunc, \ + trace_irq_vector_unregfunc); + + +/* + * local_timer - called when entering/exiting a local timer interrupt + * vector handler + */ +DEFINE_IRQ_VECTOR_EVENT(local_timer); + +/* + * reschedule - called when entering/exiting a reschedule vector handler + */ +DEFINE_IRQ_VECTOR_EVENT(reschedule); + +/* + * spurious_apic - called when entering/exiting a spurious apic vector handler + */ +DEFINE_IRQ_VECTOR_EVENT(spurious_apic); + +/* + * error_apic - called when entering/exiting an error apic vector handler + */ +DEFINE_IRQ_VECTOR_EVENT(error_apic); + +/* + * x86_platform_ipi - called when entering/exiting a x86 platform ipi interrupt + * vector handler + */ +DEFINE_IRQ_VECTOR_EVENT(x86_platform_ipi); + +/* + * irq_work - called when entering/exiting a irq work interrupt + * vector handler + */ +DEFINE_IRQ_VECTOR_EVENT(irq_work); + +/* + * call_function - called when entering/exiting a call function interrupt + * vector handler + */ +DEFINE_IRQ_VECTOR_EVENT(call_function); + +/* + * call_function_single - called when entering/exiting a call function + * single interrupt vector handler + */ +DEFINE_IRQ_VECTOR_EVENT(call_function_single); + +/* + * threshold_apic - called when entering/exiting a threshold apic interrupt + * vector handler + */ +DEFINE_IRQ_VECTOR_EVENT(threshold_apic); + +/* + * thermal_apic - called when entering/exiting a thermal apic interrupt + * vector handler + */ +DEFINE_IRQ_VECTOR_EVENT(thermal_apic); + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE irq_vectors +#endif /* _TRACE_IRQ_VECTORS_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 88eae2aec619..7036cb60cd87 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -6,11 +6,7 @@ #include <asm/debugreg.h> #include <asm/siginfo.h> /* TRAP_TRACE, ... */ -#ifdef CONFIG_X86_32 -#define dotraplinkage -#else -#define dotraplinkage asmlinkage -#endif +#define dotraplinkage __visible asmlinkage void divide_error(void); asmlinkage void debug(void); diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index c91e8b9d588b..235be70d5bb4 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -49,6 +49,7 @@ extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); +extern int check_tsc_disabled(void); extern unsigned long native_calibrate_tsc(void); extern int tsc_clocksource_reliable; diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 5ee26875baea..5838fa911aa0 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -153,16 +153,19 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) * Careful: we have to cast the result to the type of the pointer * for sign reasons. * - * The use of %edx as the register specifier is a bit of a + * The use of _ASM_DX as the register specifier is a bit of a * simplification, as gcc only cares about it as the starting point * and not size: for a 64-bit value it will use %ecx:%edx on 32 bits * (%ecx being the next register in gcc's x86 register sequence), and * %rdx on 64 bits. + * + * Clang/LLVM cares about the size of the register, but still wants + * the base register for something that ends up being a pair. */ #define get_user(x, ptr) \ ({ \ int __ret_gu; \ - register __inttype(*(ptr)) __val_gu asm("%edx"); \ + register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \ __chk_user_ptr(ptr); \ might_fault(); \ asm volatile("call __get_user_%P3" \ diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 142810c457dc..4f7923dd0007 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -235,7 +235,7 @@ extern long __copy_user_nocache(void *dst, const void __user *src, static inline int __copy_from_user_nocache(void *dst, const void __user *src, unsigned size) { - might_sleep(); + might_fault(); return __copy_user_nocache(dst, src, size, 1); } diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index a06983cdc125..0b46ef261c77 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -731,6 +731,9 @@ static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits) } extern void uv_bau_message_intr1(void); +#ifdef CONFIG_TRACING +#define trace_uv_bau_message_intr1 uv_bau_message_intr1 +#endif extern void uv_bau_timeout_intr1(void); struct atomic_short { diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index f3e01a2cbaa1..966502d4682e 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -387,6 +387,7 @@ enum vmcs_field { #define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 #define VMX_EPT_EXTENT_CONTEXT 1 #define VMX_EPT_EXTENT_GLOBAL 2 +#define VMX_EPT_EXTENT_SHIFT 24 #define VMX_EPT_EXECUTE_ONLY_BIT (1ull) #define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6) @@ -394,6 +395,7 @@ enum vmcs_field { #define VMX_EPTP_WB_BIT (1ull << 14) #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) #define VMX_EPT_1GB_PAGE_BIT (1ull << 17) +#define VMX_EPT_INVEPT_BIT (1ull << 20) #define VMX_EPT_AD_BIT (1ull << 21) #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index de656ac2af41..d76ac40da206 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h @@ -35,7 +35,7 @@ #define DEFINE_VVAR(type, name) \ type name \ - __attribute__((section(".vvar_" #name), aligned(16))) + __attribute__((section(".vvar_" #name), aligned(16))) __visible #define VVAR(name) (*vvaraddr_ ## name) diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index d8d99222b36a..828a1565ba57 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -142,6 +142,8 @@ struct x86_cpuinit_ops { void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node); }; +struct timespec; + /** * struct x86_platform_ops - platform specific runtime functions * @calibrate_tsc: calibrate TSC @@ -156,8 +158,8 @@ struct x86_cpuinit_ops { */ struct x86_platform_ops { unsigned long (*calibrate_tsc)(void); - unsigned long (*get_wallclock)(void); - int (*set_wallclock)(unsigned long nowtime); + void (*get_wallclock)(struct timespec *ts); + int (*set_wallclock)(const struct timespec *ts); void (*iommu_shutdown)(void); bool (*is_untracked_pat_range)(u64 start, u64 end); void (*nmi_init)(void); diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h index ca842f2769ef..608a79d5a466 100644 --- a/arch/x86/include/asm/xen/events.h +++ b/arch/x86/include/asm/xen/events.h @@ -7,6 +7,7 @@ enum ipi_vector { XEN_CALL_FUNCTION_SINGLE_VECTOR, XEN_SPIN_UNLOCK_VECTOR, XEN_IRQ_WORK_VECTOR, + XEN_NMI_VECTOR, XEN_NR_IPIS, }; diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 125f344f06a9..d866959e5685 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -40,21 +40,7 @@ extern struct start_info *xen_start_info; static inline uint32_t xen_cpuid_base(void) { - uint32_t base, eax, ebx, ecx, edx; - char signature[13]; - - for (base = 0x40000000; base < 0x40010000; base += 0x100) { - cpuid(base, &eax, &ebx, &ecx, &edx); - *(uint32_t *)(signature + 0) = ebx; - *(uint32_t *)(signature + 4) = ecx; - *(uint32_t *)(signature + 8) = edx; - signature[12] = 0; - - if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2)) - return base; - } - - return 0; + return hypervisor_cpuid_base("XenVMMXenVMM", 2); } #ifdef CONFIG_XEN diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 6aef9fbc09b7..b913915e8e63 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -79,30 +79,38 @@ static inline int phys_to_machine_mapping_valid(unsigned long pfn) return get_phys_to_machine(pfn) != INVALID_P2M_ENTRY; } -static inline unsigned long mfn_to_pfn(unsigned long mfn) +static inline unsigned long mfn_to_pfn_no_overrides(unsigned long mfn) { unsigned long pfn; - int ret = 0; + int ret; if (xen_feature(XENFEAT_auto_translated_physmap)) return mfn; - if (unlikely(mfn >= machine_to_phys_nr)) { - pfn = ~0; - goto try_override; - } - pfn = 0; + if (unlikely(mfn >= machine_to_phys_nr)) + return ~0; + /* * The array access can fail (e.g., device space beyond end of RAM). * In such cases it doesn't matter what we return (we return garbage), * but we must handle the fault without crashing! */ ret = __get_user(pfn, &machine_to_phys_mapping[mfn]); -try_override: - /* ret might be < 0 if there are no entries in the m2p for mfn */ if (ret < 0) - pfn = ~0; - else if (get_phys_to_machine(pfn) != mfn) + return ~0; + + return pfn; +} + +static inline unsigned long mfn_to_pfn(unsigned long mfn) +{ + unsigned long pfn; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return mfn; + + pfn = mfn_to_pfn_no_overrides(mfn); + if (get_phys_to_machine(pfn) != mfn) { /* * If this appears to be a foreign mfn (because the pfn * doesn't map back to the mfn), then check the local override @@ -111,6 +119,7 @@ try_override: * m2p_find_override_pfn returns ~0 if it doesn't find anything. */ pfn = m2p_find_override_pfn(mfn, ~0); + } /* * pfn is ~0 if there are no entries in the m2p for mfn or if the diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h index 7ea79c5fa1f2..492b29802f57 100644 --- a/arch/x86/include/asm/xor_avx.h +++ b/arch/x86/include/asm/xor_avx.h @@ -167,12 +167,12 @@ static struct xor_block_template xor_block_avx = { #define AVX_XOR_SPEED \ do { \ - if (cpu_has_avx) \ + if (cpu_has_avx && cpu_has_osxsave) \ xor_speed(&xor_block_avx); \ } while (0) #define AVX_SELECT(FASTEST) \ - (cpu_has_avx ? &xor_block_avx : FASTEST) + (cpu_has_avx && cpu_has_osxsave ? &xor_block_avx : FASTEST) #else diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 08744242b8d2..c15ddaf90710 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -6,7 +6,6 @@ #define SETUP_E820_EXT 1 #define SETUP_DTB 2 #define SETUP_PCI 3 -#define SETUP_EFI_VARS 4 /* ram_size flags */ #define RAMDISK_IMAGE_START_MASK 0x07FF diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 06fdbd987e97..94dc8ca434e0 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -23,6 +23,7 @@ #define KVM_FEATURE_ASYNC_PF 4 #define KVM_FEATURE_STEAL_TIME 5 #define KVM_FEATURE_PV_EOI 6 +#define KVM_FEATURE_PV_UNHALT 7 /* The last 8 bits are used to indicate how to interpret the flags field * in pvclock structure. If no bits are set, all flags are ignored. diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index 2af848dfa754..bb0465090ae5 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -170,6 +170,9 @@ #define MSR_KNC_EVNTSEL0 0x00000028 #define MSR_KNC_EVNTSEL1 0x00000029 +/* Alternative perfctr range with full access. */ +#define MSR_IA32_PMC0 0x000004c1 + /* AMD64 MSRs. Not complete. See the architecture manual for a more complete list. */ diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index 54991a746043..180a0c3c224d 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -2,75 +2,129 @@ #define _UAPI_ASM_X86_PROCESSOR_FLAGS_H /* Various flags defined: can be included from assembler. */ +#include <linux/const.h> + /* * EFLAGS bits */ -#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ -#define X86_EFLAGS_BIT1 0x00000002 /* Bit 1 - always on */ -#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ -#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */ -#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ -#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ -#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ -#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */ -#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */ -#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */ -#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */ -#define X86_EFLAGS_NT 0x00004000 /* Nested Task */ -#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */ -#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */ -#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */ -#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */ -#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ -#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ +#define X86_EFLAGS_CF_BIT 0 /* Carry Flag */ +#define X86_EFLAGS_CF _BITUL(X86_EFLAGS_CF_BIT) +#define X86_EFLAGS_FIXED_BIT 1 /* Bit 1 - always on */ +#define X86_EFLAGS_FIXED _BITUL(X86_EFLAGS_FIXED_BIT) +#define X86_EFLAGS_PF_BIT 2 /* Parity Flag */ +#define X86_EFLAGS_PF _BITUL(X86_EFLAGS_PF_BIT) +#define X86_EFLAGS_AF_BIT 4 /* Auxiliary carry Flag */ +#define X86_EFLAGS_AF _BITUL(X86_EFLAGS_AF_BIT) +#define X86_EFLAGS_ZF_BIT 6 /* Zero Flag */ +#define X86_EFLAGS_ZF _BITUL(X86_EFLAGS_ZF_BIT) +#define X86_EFLAGS_SF_BIT 7 /* Sign Flag */ +#define X86_EFLAGS_SF _BITUL(X86_EFLAGS_SF_BIT) +#define X86_EFLAGS_TF_BIT 8 /* Trap Flag */ +#define X86_EFLAGS_TF _BITUL(X86_EFLAGS_TF_BIT) +#define X86_EFLAGS_IF_BIT 9 /* Interrupt Flag */ +#define X86_EFLAGS_IF _BITUL(X86_EFLAGS_IF_BIT) +#define X86_EFLAGS_DF_BIT 10 /* Direction Flag */ +#define X86_EFLAGS_DF _BITUL(X86_EFLAGS_DF_BIT) +#define X86_EFLAGS_OF_BIT 11 /* Overflow Flag */ +#define X86_EFLAGS_OF _BITUL(X86_EFLAGS_OF_BIT) +#define X86_EFLAGS_IOPL_BIT 12 /* I/O Privilege Level (2 bits) */ +#define X86_EFLAGS_IOPL (_AC(3,UL) << X86_EFLAGS_IOPL_BIT) +#define X86_EFLAGS_NT_BIT 14 /* Nested Task */ +#define X86_EFLAGS_NT _BITUL(X86_EFLAGS_NT_BIT) +#define X86_EFLAGS_RF_BIT 16 /* Resume Flag */ +#define X86_EFLAGS_RF _BITUL(X86_EFLAGS_RF_BIT) +#define X86_EFLAGS_VM_BIT 17 /* Virtual Mode */ +#define X86_EFLAGS_VM _BITUL(X86_EFLAGS_VM_BIT) +#define X86_EFLAGS_AC_BIT 18 /* Alignment Check/Access Control */ +#define X86_EFLAGS_AC _BITUL(X86_EFLAGS_AC_BIT) +#define X86_EFLAGS_AC_BIT 18 /* Alignment Check/Access Control */ +#define X86_EFLAGS_AC _BITUL(X86_EFLAGS_AC_BIT) +#define X86_EFLAGS_VIF_BIT 19 /* Virtual Interrupt Flag */ +#define X86_EFLAGS_VIF _BITUL(X86_EFLAGS_VIF_BIT) +#define X86_EFLAGS_VIP_BIT 20 /* Virtual Interrupt Pending */ +#define X86_EFLAGS_VIP _BITUL(X86_EFLAGS_VIP_BIT) +#define X86_EFLAGS_ID_BIT 21 /* CPUID detection */ +#define X86_EFLAGS_ID _BITUL(X86_EFLAGS_ID_BIT) /* * Basic CPU control in CR0 */ -#define X86_CR0_PE 0x00000001 /* Protection Enable */ -#define X86_CR0_MP 0x00000002 /* Monitor Coprocessor */ -#define X86_CR0_EM 0x00000004 /* Emulation */ -#define X86_CR0_TS 0x00000008 /* Task Switched */ -#define X86_CR0_ET 0x00000010 /* Extension Type */ -#define X86_CR0_NE 0x00000020 /* Numeric Error */ -#define X86_CR0_WP 0x00010000 /* Write Protect */ -#define X86_CR0_AM 0x00040000 /* Alignment Mask */ -#define X86_CR0_NW 0x20000000 /* Not Write-through */ -#define X86_CR0_CD 0x40000000 /* Cache Disable */ -#define X86_CR0_PG 0x80000000 /* Paging */ +#define X86_CR0_PE_BIT 0 /* Protection Enable */ +#define X86_CR0_PE _BITUL(X86_CR0_PE_BIT) +#define X86_CR0_MP_BIT 1 /* Monitor Coprocessor */ +#define X86_CR0_MP _BITUL(X86_CR0_MP_BIT) +#define X86_CR0_EM_BIT 2 /* Emulation */ +#define X86_CR0_EM _BITUL(X86_CR0_EM_BIT) +#define X86_CR0_TS_BIT 3 /* Task Switched */ +#define X86_CR0_TS _BITUL(X86_CR0_TS_BIT) +#define X86_CR0_ET_BIT 4 /* Extension Type */ +#define X86_CR0_ET _BITUL(X86_CR0_ET_BIT) +#define X86_CR0_NE_BIT 5 /* Numeric Error */ +#define X86_CR0_NE _BITUL(X86_CR0_NE_BIT) +#define X86_CR0_WP_BIT 16 /* Write Protect */ +#define X86_CR0_WP _BITUL(X86_CR0_WP_BIT) +#define X86_CR0_AM_BIT 18 /* Alignment Mask */ +#define X86_CR0_AM _BITUL(X86_CR0_AM_BIT) +#define X86_CR0_NW_BIT 29 /* Not Write-through */ +#define X86_CR0_NW _BITUL(X86_CR0_NW_BIT) +#define X86_CR0_CD_BIT 30 /* Cache Disable */ +#define X86_CR0_CD _BITUL(X86_CR0_CD_BIT) +#define X86_CR0_PG_BIT 31 /* Paging */ +#define X86_CR0_PG _BITUL(X86_CR0_PG_BIT) /* * Paging options in CR3 */ -#define X86_CR3_PWT 0x00000008 /* Page Write Through */ -#define X86_CR3_PCD 0x00000010 /* Page Cache Disable */ -#define X86_CR3_PCID_MASK 0x00000fff /* PCID Mask */ +#define X86_CR3_PWT_BIT 3 /* Page Write Through */ +#define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT) +#define X86_CR3_PCD_BIT 4 /* Page Cache Disable */ +#define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT) +#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */ /* * Intel CPU features in CR4 */ -#define X86_CR4_VME 0x00000001 /* enable vm86 extensions */ -#define X86_CR4_PVI 0x00000002 /* virtual interrupts flag enable */ -#define X86_CR4_TSD 0x00000004 /* disable time stamp at ipl 3 */ -#define X86_CR4_DE 0x00000008 /* enable debugging extensions */ -#define X86_CR4_PSE 0x00000010 /* enable page size extensions */ -#define X86_CR4_PAE 0x00000020 /* enable physical address extensions */ -#define X86_CR4_MCE 0x00000040 /* Machine check enable */ -#define X86_CR4_PGE 0x00000080 /* enable global pages */ -#define X86_CR4_PCE 0x00000100 /* enable performance counters at ipl 3 */ -#define X86_CR4_OSFXSR 0x00000200 /* enable fast FPU save and restore */ -#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ -#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ -#define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */ -#define X86_CR4_PCIDE 0x00020000 /* enable PCID support */ -#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ -#define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ -#define X86_CR4_SMAP 0x00200000 /* enable SMAP support */ +#define X86_CR4_VME_BIT 0 /* enable vm86 extensions */ +#define X86_CR4_VME _BITUL(X86_CR4_VME_BIT) +#define X86_CR4_PVI_BIT 1 /* virtual interrupts flag enable */ +#define X86_CR4_PVI _BITUL(X86_CR4_PVI_BIT) +#define X86_CR4_TSD_BIT 2 /* disable time stamp at ipl 3 */ +#define X86_CR4_TSD _BITUL(X86_CR4_TSD_BIT) +#define X86_CR4_DE_BIT 3 /* enable debugging extensions */ +#define X86_CR4_DE _BITUL(X86_CR4_DE_BIT) +#define X86_CR4_PSE_BIT 4 /* enable page size extensions */ +#define X86_CR4_PSE _BITUL(X86_CR4_PSE_BIT) +#define X86_CR4_PAE_BIT 5 /* enable physical address extensions */ +#define X86_CR4_PAE _BITUL(X86_CR4_PAE_BIT) +#define X86_CR4_MCE_BIT 6 /* Machine check enable */ +#define X86_CR4_MCE _BITUL(X86_CR4_MCE_BIT) +#define X86_CR4_PGE_BIT 7 /* enable global pages */ +#define X86_CR4_PGE _BITUL(X86_CR4_PGE_BIT) +#define X86_CR4_PCE_BIT 8 /* enable performance counters at ipl 3 */ +#define X86_CR4_PCE _BITUL(X86_CR4_PCE_BIT) +#define X86_CR4_OSFXSR_BIT 9 /* enable fast FPU save and restore */ +#define X86_CR4_OSFXSR _BITUL(X86_CR4_OSFXSR_BIT) +#define X86_CR4_OSXMMEXCPT_BIT 10 /* enable unmasked SSE exceptions */ +#define X86_CR4_OSXMMEXCPT _BITUL(X86_CR4_OSXMMEXCPT_BIT) +#define X86_CR4_VMXE_BIT 13 /* enable VMX virtualization */ +#define X86_CR4_VMXE _BITUL(X86_CR4_VMXE_BIT) +#define X86_CR4_SMXE_BIT 14 /* enable safer mode (TXT) */ +#define X86_CR4_SMXE _BITUL(X86_CR4_SMXE_BIT) +#define X86_CR4_FSGSBASE_BIT 16 /* enable RDWRFSGS support */ +#define X86_CR4_FSGSBASE _BITUL(X86_CR4_FSGSBASE_BIT) +#define X86_CR4_PCIDE_BIT 17 /* enable PCID support */ +#define X86_CR4_PCIDE _BITUL(X86_CR4_PCIDE_BIT) +#define X86_CR4_OSXSAVE_BIT 18 /* enable xsave and xrestore */ +#define X86_CR4_OSXSAVE _BITUL(X86_CR4_OSXSAVE_BIT) +#define X86_CR4_SMEP_BIT 20 /* enable SMEP support */ +#define X86_CR4_SMEP _BITUL(X86_CR4_SMEP_BIT) +#define X86_CR4_SMAP_BIT 21 /* enable SMAP support */ +#define X86_CR4_SMAP _BITUL(X86_CR4_SMAP_BIT) /* * x86-64 Task Priority Register, CR8 */ -#define X86_CR8_TPR 0x0000000F /* task priority register */ +#define X86_CR8_TPR _AC(0x0000000f,UL) /* task priority register */ /* * AMD and Transmeta use MSRs for configuration; see <asm/msr-index.h> diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index d651082c7cf7..0e79420376eb 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -65,6 +65,7 @@ #define EXIT_REASON_EOI_INDUCED 45 #define EXIT_REASON_EPT_VIOLATION 48 #define EXIT_REASON_EPT_MISCONFIG 49 +#define EXIT_REASON_INVEPT 50 #define EXIT_REASON_PREEMPTION_TIMER 52 #define EXIT_REASON_WBINVD 54 #define EXIT_REASON_XSETBV 55 @@ -106,12 +107,13 @@ { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ + { EXIT_REASON_INVEPT, "INVEPT" }, \ + { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }, \ { EXIT_REASON_WBINVD, "WBINVD" }, \ { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \ { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ { EXIT_REASON_INVD, "INVD" }, \ - { EXIT_REASON_INVPCID, "INVPCID" }, \ - { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" } + { EXIT_REASON_INVPCID, "INVPCID" } #endif /* _UAPIVMX_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 7bd3bd310106..a5408b965c9d 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -16,6 +16,8 @@ CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg endif +CFLAGS_irq.o := -I$(src)/../include/asm/trace + obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o nmi.o @@ -67,7 +69,7 @@ obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-y += kprobes/ obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o +obj-$(CONFIG_DOUBLEFAULT) += doublefault.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o @@ -93,6 +95,7 @@ obj-$(CONFIG_MICROCODE_INTEL_LIB) += microcode_intel_lib.o microcode-y := microcode_core.o microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o +obj-$(CONFIG_MICROCODE_AMD_EARLY) += microcode_amd_early.o obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o @@ -100,8 +103,12 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o obj-$(CONFIG_OF) += devicetree.o obj-$(CONFIG_UPROBES) += uprobes.o +obj-y += sysfb.o +obj-$(CONFIG_X86_SYSFB) += sysfb_simplefb.o +obj-$(CONFIG_EFI) += sysfb_efi.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o +obj-$(CONFIG_TRACING) += tracepoint.o ### # 64 bit specific files diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 230c8ea878e5..40c76604199f 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -44,6 +44,7 @@ #include <asm/mpspec.h> #include <asm/smp.h> +#include "sleep.h" /* To include x86_acpi_suspend_lowlevel */ static int __initdata acpi_force = 0; u32 acpi_rsdt_forced; int acpi_disabled; @@ -66,6 +67,7 @@ EXPORT_SYMBOL(acpi_pci_disabled); int acpi_lapic; int acpi_ioapic; int acpi_strict; +int acpi_disable_cmcff; u8 acpi_sci_flags __initdata; int acpi_sci_override_gsi __initdata; @@ -140,16 +142,8 @@ static u32 irq_to_gsi(int irq) } /* - * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END, - * to map the target physical address. The problem is that set_fixmap() - * provides a single page, and it is possible that the page is not - * sufficient. - * By using this area, we can map up to MAX_IO_APICS pages temporarily, - * i.e. until the next __va_range() call. - * - * Important Safety Note: The fixed I/O APIC page numbers are *subtracted* - * from the fixed base. That's why we start at FIX_IO_APIC_BASE_END and - * count idx down while incrementing the phys address. + * This is just a simple wrapper around early_ioremap(), + * with sanity checks for phys == 0 and size == 0. */ char *__init __acpi_map_table(unsigned long phys, unsigned long size) { @@ -159,6 +153,7 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size) return early_ioremap(phys, size); } + void __init __acpi_unmap_table(char *map, unsigned long size) { if (!map || !size) @@ -194,11 +189,11 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) return 0; } -static void __cpuinit acpi_register_lapic(int id, u8 enabled) +static void acpi_register_lapic(int id, u8 enabled) { unsigned int ver = 0; - if (id >= (MAX_LOCAL_APIC-1)) { + if (id >= MAX_LOCAL_APIC) { printk(KERN_INFO PREFIX "skipped apicid that is too big\n"); return; } @@ -559,6 +554,12 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, int (*__acpi_register_gsi)(struct device *dev, u32 gsi, int trigger, int polarity) = acpi_register_gsi_pic; +#ifdef CONFIG_ACPI_SLEEP +int (*acpi_suspend_lowlevel)(void) = x86_acpi_suspend_lowlevel; +#else +int (*acpi_suspend_lowlevel)(void); +#endif + /* * success: return IRQ number (>=0) * failure: return < 0 @@ -600,7 +601,7 @@ void __init acpi_set_irq_model_ioapic(void) #ifdef CONFIG_ACPI_HOTPLUG_CPU #include <acpi/processor.h> -static void __cpuinit acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) { #ifdef CONFIG_ACPI_NUMA int nid; @@ -613,7 +614,7 @@ static void __cpuinit acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) #endif } -static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) +static int _acpi_map_lsapic(acpi_handle handle, int *pcpu) { struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; union acpi_object *obj; @@ -1113,6 +1114,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) int ioapic; int ioapic_pin; struct io_apic_irq_attr irq_attr; + int ret; if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; @@ -1142,7 +1144,9 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin, trigger == ACPI_EDGE_SENSITIVE ? 0 : 1, polarity == ACPI_ACTIVE_HIGH ? 0 : 1); - io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr); + ret = io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr); + if (ret < 0) + gsi = INT_MIN; return gsi; } @@ -1619,6 +1623,10 @@ static int __init parse_acpi(char *arg) /* "acpi=copy_dsdt" copys DSDT */ else if (strcmp(arg, "copy_dsdt") == 0) { acpi_gbl_copy_dsdt_locally = 1; + } + /* "acpi=nocmcff" disables FF mode for corrected errors */ + else if (strcmp(arg, "nocmcff") == 0) { + acpi_disable_cmcff = 1; } else { /* Core will printk when we return error. */ return -EINVAL; diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index b44577bc9744..33120100ff5e 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -26,12 +26,12 @@ static char temp_stack[4096]; #endif /** - * acpi_suspend_lowlevel - save kernel state + * x86_acpi_suspend_lowlevel - save kernel state * * Create an identity mapped page table and copy the wakeup routine to * low memory. */ -int acpi_suspend_lowlevel(void) +int x86_acpi_suspend_lowlevel(void) { struct wakeup_header *header = (struct wakeup_header *) __va(real_mode_header->wakeup_header); @@ -48,9 +48,20 @@ int acpi_suspend_lowlevel(void) #ifndef CONFIG_64BIT native_store_gdt((struct desc_ptr *)&header->pmode_gdt); + /* + * We have to check that we can write back the value, and not + * just read it. At least on 90 nm Pentium M (Family 6, Model + * 13), reading an invalid MSR is not guaranteed to trap, see + * Erratum X4 in "Intel Pentium M Processor on 90 nm Process + * with 2-MB L2 Cache and Intel® Processor A100 and A110 on 90 + * nm process with 512-KB L2 Cache Specification Update". + */ if (!rdmsr_safe(MSR_EFER, &header->pmode_efer_low, - &header->pmode_efer_high)) + &header->pmode_efer_high) && + !wrmsr_safe(MSR_EFER, + header->pmode_efer_low, + header->pmode_efer_high)) header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_EFER); #endif /* !CONFIG_64BIT */ @@ -61,7 +72,10 @@ int acpi_suspend_lowlevel(void) } if (!rdmsr_safe(MSR_IA32_MISC_ENABLE, &header->pmode_misc_en_low, - &header->pmode_misc_en_high)) + &header->pmode_misc_en_high) && + !wrmsr_safe(MSR_IA32_MISC_ENABLE, + header->pmode_misc_en_low, + header->pmode_misc_en_high)) header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE); header->realmode_flags = acpi_realmode_flags; diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h index 67f59f8c6956..c9c2c982d5e4 100644 --- a/arch/x86/kernel/acpi/sleep.h +++ b/arch/x86/kernel/acpi/sleep.h @@ -15,3 +15,5 @@ extern unsigned long acpi_copy_wakeup_routine(unsigned long); extern void wakeup_long64(void); extern void do_suspend_lowlevel(void); + +extern int x86_acpi_suspend_lowlevel(void); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c15cf9a25e27..15e8563e5c24 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -11,6 +11,7 @@ #include <linux/memory.h> #include <linux/stop_machine.h> #include <linux/slab.h> +#include <linux/kdebug.h> #include <asm/alternative.h> #include <asm/sections.h> #include <asm/pgtable.h> @@ -596,97 +597,93 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len) return addr; } -/* - * Cross-modifying kernel text with stop_machine(). - * This code originally comes from immediate value. - */ -static atomic_t stop_machine_first; -static int wrote_text; +static void do_sync_core(void *info) +{ + sync_core(); +} -struct text_poke_params { - struct text_poke_param *params; - int nparams; -}; +static bool bp_patching_in_progress; +static void *bp_int3_handler, *bp_int3_addr; -static int __kprobes stop_machine_text_poke(void *data) +int poke_int3_handler(struct pt_regs *regs) { - struct text_poke_params *tpp = data; - struct text_poke_param *p; - int i; + /* bp_patching_in_progress */ + smp_rmb(); - if (atomic_xchg(&stop_machine_first, 0)) { - for (i = 0; i < tpp->nparams; i++) { - p = &tpp->params[i]; - text_poke(p->addr, p->opcode, p->len); - } - smp_wmb(); /* Make sure other cpus see that this has run */ - wrote_text = 1; - } else { - while (!wrote_text) - cpu_relax(); - smp_mb(); /* Load wrote_text before following execution */ - } + if (likely(!bp_patching_in_progress)) + return 0; - for (i = 0; i < tpp->nparams; i++) { - p = &tpp->params[i]; - flush_icache_range((unsigned long)p->addr, - (unsigned long)p->addr + p->len); - } - /* - * Intel Archiecture Software Developer's Manual section 7.1.3 specifies - * that a core serializing instruction such as "cpuid" should be - * executed on _each_ core before the new instruction is made visible. - */ - sync_core(); - return 0; -} + if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr) + return 0; + + /* set up the specified breakpoint handler */ + regs->ip = (unsigned long) bp_int3_handler; + + return 1; -/** - * text_poke_smp - Update instructions on a live kernel on SMP - * @addr: address to modify - * @opcode: source of the copy - * @len: length to copy - * - * Modify multi-byte instruction by using stop_machine() on SMP. This allows - * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying - * should be allowed, since stop_machine() does _not_ protect code against - * NMI and MCE. - * - * Note: Must be called under get_online_cpus() and text_mutex. - */ -void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) -{ - struct text_poke_params tpp; - struct text_poke_param p; - - p.addr = addr; - p.opcode = opcode; - p.len = len; - tpp.params = &p; - tpp.nparams = 1; - atomic_set(&stop_machine_first, 1); - wrote_text = 0; - /* Use __stop_machine() because the caller already got online_cpus. */ - __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask); - return addr; } /** - * text_poke_smp_batch - Update instructions on a live kernel on SMP - * @params: an array of text_poke parameters - * @n: the number of elements in params. + * text_poke_bp() -- update instructions on live kernel on SMP + * @addr: address to patch + * @opcode: opcode of new instruction + * @len: length to copy + * @handler: address to jump to when the temporary breakpoint is hit * - * Modify multi-byte instruction by using stop_machine() on SMP. Since the - * stop_machine() is heavy task, it is better to aggregate text_poke requests - * and do it once if possible. + * Modify multi-byte instruction by using int3 breakpoint on SMP. + * We completely avoid stop_machine() here, and achieve the + * synchronization using int3 breakpoint. * - * Note: Must be called under get_online_cpus() and text_mutex. + * The way it is done: + * - add a int3 trap to the address that will be patched + * - sync cores + * - update all but the first byte of the patched range + * - sync cores + * - replace the first byte (int3) by the first byte of + * replacing opcode + * - sync cores + * + * Note: must be called under text_mutex. */ -void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n) +void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) { - struct text_poke_params tpp = {.params = params, .nparams = n}; + unsigned char int3 = 0xcc; + + bp_int3_handler = handler; + bp_int3_addr = (u8 *)addr + sizeof(int3); + bp_patching_in_progress = true; + /* + * Corresponding read barrier in int3 notifier for + * making sure the in_progress flags is correctly ordered wrt. + * patching + */ + smp_wmb(); + + text_poke(addr, &int3, sizeof(int3)); - atomic_set(&stop_machine_first, 1); - wrote_text = 0; - __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask); + on_each_cpu(do_sync_core, NULL, 1); + + if (len - sizeof(int3) > 0) { + /* patch all but the first byte */ + text_poke((char *)addr + sizeof(int3), + (const char *) opcode + sizeof(int3), + len - sizeof(int3)); + /* + * According to Intel, this core syncing is very likely + * not necessary and we'd be safe even without it. But + * better safe than sorry (plus there's not only Intel). + */ + on_each_cpu(do_sync_core, NULL, 1); + } + + /* patch the first byte */ + text_poke(addr, opcode, sizeof(int3)); + + on_each_cpu(do_sync_core, NULL, 1); + + bp_patching_in_progress = false; + smp_wmb(); + + return addr; } + diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 3048ded1b598..59554dca96ec 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -20,6 +20,7 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, {} }; @@ -27,6 +28,7 @@ EXPORT_SYMBOL(amd_nb_misc_ids); static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, {} }; @@ -81,13 +83,20 @@ int amd_cache_northbridges(void) next_northbridge(misc, amd_nb_misc_ids); node_to_amd_nb(i)->link = link = next_northbridge(link, amd_nb_link_ids); - } + } + /* GART present only on Fam15h upto model 0fh */ if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || - boot_cpu_data.x86 == 0x15) + (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10)) amd_northbridges.flags |= AMD_NB_GART; /* + * Check for L3 cache presence. + */ + if (!cpuid_edx(0x80000006)) + return 0; + + /* * Some CPU families support L3 Cache Index Disable. There are some * limitations because of E382 and E388 on family 0x10. */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 904611bf0e5a..a7eb82d9b012 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -35,6 +35,7 @@ #include <linux/smp.h> #include <linux/mm.h> +#include <asm/trace/irq_vectors.h> #include <asm/irq_remapping.h> #include <asm/perf_event.h> #include <asm/x86_init.h> @@ -57,7 +58,7 @@ unsigned int num_processors; -unsigned disabled_cpus __cpuinitdata; +unsigned disabled_cpus; /* Processor that is doing the boot up */ unsigned int boot_cpu_physical_apicid = -1U; @@ -543,7 +544,7 @@ static DEFINE_PER_CPU(struct clock_event_device, lapic_events); * Setup the local APIC timer for this CPU. Copy the initialized values * of the boot CPU and register the clock event in the framework. */ -static void __cpuinit setup_APIC_timer(void) +static void setup_APIC_timer(void) { struct clock_event_device *levt = &__get_cpu_var(lapic_events); @@ -865,7 +866,7 @@ void __init setup_boot_APIC_clock(void) setup_APIC_timer(); } -void __cpuinit setup_secondary_APIC_clock(void) +void setup_secondary_APIC_clock(void) { setup_APIC_timer(); } @@ -912,24 +913,42 @@ static void local_apic_timer_interrupt(void) * [ if a single-CPU system runs an SMP kernel then we call the local * interrupt as well. Thus we cannot inline the local irq ... ] */ -void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); /* * NOTE! We'd better ACK the irq immediately, * because timer handling can be slow. + * + * update_process_times() expects us to have done irq_enter(). + * Besides, if we don't timer interrupts ignore the global + * interrupt lock, which is the WrongThing (tm) to do. */ - ack_APIC_irq(); + entering_ack_irq(); + local_apic_timer_interrupt(); + exiting_irq(); + + set_irq_regs(old_regs); +} + +__visible void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + /* + * NOTE! We'd better ACK the irq immediately, + * because timer handling can be slow. + * * update_process_times() expects us to have done irq_enter(). * Besides, if we don't timer interrupts ignore the global * interrupt lock, which is the WrongThing (tm) to do. */ - irq_enter(); - exit_idle(); + entering_ack_irq(); + trace_local_timer_entry(LOCAL_TIMER_VECTOR); local_apic_timer_interrupt(); - irq_exit(); + trace_local_timer_exit(LOCAL_TIMER_VECTOR); + exiting_irq(); set_irq_regs(old_regs); } @@ -1210,7 +1229,7 @@ void __init init_bsp_APIC(void) apic_write(APIC_LVT1, value); } -static void __cpuinit lapic_setup_esr(void) +static void lapic_setup_esr(void) { unsigned int oldvalue, value, maxlvt; @@ -1257,7 +1276,7 @@ static void __cpuinit lapic_setup_esr(void) * Used to setup local APIC while initializing BSP or bringin up APs. * Always called with preemption disabled. */ -void __cpuinit setup_local_APIC(void) +void setup_local_APIC(void) { int cpu = smp_processor_id(); unsigned int value, queued; @@ -1452,7 +1471,7 @@ void __cpuinit setup_local_APIC(void) #endif } -void __cpuinit end_local_APIC_setup(void) +void end_local_APIC_setup(void) { lapic_setup_esr(); @@ -1907,12 +1926,10 @@ int __init APIC_init_uniprocessor(void) /* * This interrupt should _never_ happen with our APIC/SMP architecture */ -void smp_spurious_interrupt(struct pt_regs *regs) +static inline void __smp_spurious_interrupt(void) { u32 v; - irq_enter(); - exit_idle(); /* * Check if this really is a spurious interrupt and ACK it * if it is a vectored one. Just in case... @@ -1927,13 +1944,28 @@ void smp_spurious_interrupt(struct pt_regs *regs) /* see sw-dev-man vol 3, chapter 7.4.13.5 */ pr_info("spurious APIC interrupt on CPU#%d, " "should never happen.\n", smp_processor_id()); - irq_exit(); +} + +__visible void smp_spurious_interrupt(struct pt_regs *regs) +{ + entering_irq(); + __smp_spurious_interrupt(); + exiting_irq(); +} + +__visible void smp_trace_spurious_interrupt(struct pt_regs *regs) +{ + entering_irq(); + trace_spurious_apic_entry(SPURIOUS_APIC_VECTOR); + __smp_spurious_interrupt(); + trace_spurious_apic_exit(SPURIOUS_APIC_VECTOR); + exiting_irq(); } /* * This interrupt should never happen with our APIC/SMP architecture */ -void smp_error_interrupt(struct pt_regs *regs) +static inline void __smp_error_interrupt(struct pt_regs *regs) { u32 v0, v1; u32 i = 0; @@ -1948,8 +1980,6 @@ void smp_error_interrupt(struct pt_regs *regs) "Illegal register address", /* APIC Error Bit 7 */ }; - irq_enter(); - exit_idle(); /* First tickle the hardware, only then report what went on. -- REW */ v0 = apic_read(APIC_ESR); apic_write(APIC_ESR, 0); @@ -1970,7 +2000,22 @@ void smp_error_interrupt(struct pt_regs *regs) apic_printk(APIC_DEBUG, KERN_CONT "\n"); - irq_exit(); +} + +__visible void smp_error_interrupt(struct pt_regs *regs) +{ + entering_irq(); + __smp_error_interrupt(regs); + exiting_irq(); +} + +__visible void smp_trace_error_interrupt(struct pt_regs *regs) +{ + entering_irq(); + trace_error_apic_entry(ERROR_APIC_VECTOR); + __smp_error_interrupt(regs); + trace_error_apic_exit(ERROR_APIC_VECTOR); + exiting_irq(); } /** @@ -2062,7 +2107,7 @@ void disconnect_bsp_APIC(int virt_wire_setup) apic_write(APIC_LVT1, value); } -void __cpuinit generic_processor_info(int apicid, int version) +void generic_processor_info(int apicid, int version) { int cpu, max = nr_cpu_ids; bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid, @@ -2302,7 +2347,7 @@ static void lapic_resume(void) apic_write(APIC_SPIV, apic_pm_state.apic_spiv); apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); -#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) +#if defined(CONFIG_X86_MCE_INTEL) if (maxlvt >= 5) apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); #endif @@ -2332,7 +2377,7 @@ static struct syscore_ops lapic_syscore_ops = { .suspend = lapic_suspend, }; -static void __cpuinit apic_pm_activate(void) +static void apic_pm_activate(void) { apic_pm_state.active = 1; } @@ -2357,7 +2402,7 @@ static void apic_pm_activate(void) { } #ifdef CONFIG_X86_64 -static int __cpuinit apic_cluster_num(void) +static int apic_cluster_num(void) { int i, clusters, zeros; unsigned id; @@ -2402,10 +2447,10 @@ static int __cpuinit apic_cluster_num(void) return clusters; } -static int __cpuinitdata multi_checked; -static int __cpuinitdata multi; +static int multi_checked; +static int multi; -static int __cpuinit set_multi(const struct dmi_system_id *d) +static int set_multi(const struct dmi_system_id *d) { if (multi) return 0; @@ -2414,7 +2459,7 @@ static int __cpuinit set_multi(const struct dmi_system_id *d) return 0; } -static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = { +static const struct dmi_system_id multi_dmi_table[] = { { .callback = set_multi, .ident = "IBM System Summit2", @@ -2426,7 +2471,7 @@ static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = { {} }; -static void __cpuinit dmi_check_multi(void) +static void dmi_check_multi(void) { if (multi_checked) return; @@ -2443,7 +2488,7 @@ static void __cpuinit dmi_check_multi(void) * multi-chassis. * Use DMI to check them */ -__cpuinit int apic_is_clustered_box(void) +int apic_is_clustered_box(void) { dmi_check_multi(); if (multi) diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 9a9110918ca7..3e67f9e3d7ef 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -74,7 +74,7 @@ static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) return initial_apic_id >> index_msb; } -static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) +static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) { union numachip_csr_g3_ext_irq_gen int_gen; diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 0874799a98c6..c55224731b2d 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -130,7 +130,7 @@ int es7000_plat; */ -static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) +static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) { unsigned long vect = 0, psaival = 0; diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 31cb9ae992b7..a698d7165c96 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -9,6 +9,7 @@ * */ #include <asm/apic.h> +#include <asm/nmi.h> #include <linux/cpumask.h> #include <linux/kdebug.h> diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9ed796ccc32c..e63a5bd2a78f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1534,6 +1534,11 @@ void intel_ir_io_apic_print_entries(unsigned int apic, } } +void ioapic_zap_locks(void) +{ + raw_spin_lock_init(&ioapic_lock); +} + __apicdebuginit(void) print_IO_APIC(int ioapic_idx) { union IO_APIC_reg_00 reg_00; @@ -3375,12 +3380,15 @@ int io_apic_setup_irq_pin_once(unsigned int irq, int node, { unsigned int ioapic_idx = attr->ioapic, pin = attr->ioapic_pin; int ret; + struct IO_APIC_route_entry orig_entry; /* Avoid redundant programming */ if (test_bit(pin, ioapics[ioapic_idx].pin_programmed)) { - pr_debug("Pin %d-%d already programmed\n", - mpc_ioapic_id(ioapic_idx), pin); - return 0; + pr_debug("Pin %d-%d already programmed\n", mpc_ioapic_id(ioapic_idx), pin); + orig_entry = ioapic_read_entry(attr->ioapic, pin); + if (attr->trigger == orig_entry.trigger && attr->polarity == orig_entry.polarity) + return 0; + return -EBUSY; } ret = io_apic_setup_irq_pin(irq, node, attr); if (!ret) diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index d661ee95cabf..1e42e8f305ee 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -105,7 +105,7 @@ static void __init smp_dump_qct(void) } } -void __cpuinit numaq_tsc_disable(void) +void numaq_tsc_disable(void) { if (!found_numaq) return; diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index c88baa4ff0e5..140e29db478d 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -148,7 +148,7 @@ static void init_x2apic_ldr(void) /* * At CPU state changes, update the x2apic cluster sibling info. */ -static int __cpuinit +static int update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int this_cpu = (unsigned long)hcpu; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 794f6eb54cd3..1191ac1c9d25 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -25,6 +25,7 @@ #include <linux/kdebug.h> #include <linux/delay.h> #include <linux/crash_dump.h> +#include <linux/reboot.h> #include <asm/uv/uv_mmrs.h> #include <asm/uv/uv_hub.h> @@ -36,7 +37,6 @@ #include <asm/ipi.h> #include <asm/smp.h> #include <asm/x86_init.h> -#include <asm/emergency-restart.h> #include <asm/nmi.h> /* BMC sets a bit this MMR non-zero before sending an NMI */ @@ -51,6 +51,8 @@ DEFINE_PER_CPU(int, x2apic_extra_bits); static enum uv_system_type uv_system_type; static u64 gru_start_paddr, gru_end_paddr; +static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr; +static u64 gru_dist_lmask, gru_dist_umask; static union uvh_apicid uvh_apicid; int uv_min_hub_revision_id; EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); @@ -72,7 +74,20 @@ static unsigned long __init uv_early_read_mmr(unsigned long addr) static inline bool is_GRU_range(u64 start, u64 end) { - return start >= gru_start_paddr && end <= gru_end_paddr; + if (gru_dist_base) { + u64 su = start & gru_dist_umask; /* upper (incl pnode) bits */ + u64 sl = start & gru_dist_lmask; /* base offset bits */ + u64 eu = end & gru_dist_umask; + u64 el = end & gru_dist_lmask; + + /* Must reside completely within a single GRU range */ + return (sl == gru_dist_base && el == gru_dist_base && + su >= gru_first_node_paddr && + su <= gru_last_node_paddr && + eu == su); + } else { + return start >= gru_start_paddr && end <= gru_end_paddr; + } } static bool uv_is_untracked_pat_range(u64 start, u64 end) @@ -194,7 +209,7 @@ EXPORT_SYMBOL_GPL(uv_possible_blades); unsigned long sn_rtc_cycles_per_second; EXPORT_SYMBOL(sn_rtc_cycles_per_second); -static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) +static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) { #ifdef CONFIG_SMP unsigned long val; @@ -401,7 +416,7 @@ static struct apic __refdata apic_x2apic_uv_x = { .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, }; -static __cpuinit void set_x2apic_extra_bits(int pnode) +static void set_x2apic_extra_bits(int pnode) { __this_cpu_write(x2apic_extra_bits, pnode << uvh_apicid.s.pnode_shift); } @@ -463,26 +478,63 @@ static __init void map_high(char *id, unsigned long base, int pshift, pr_info("UV: Map %s_HI base address NULL\n", id); return; } - pr_info("UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, paddr + bytes); + pr_debug("UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, paddr + bytes); if (map_type == map_uc) init_extra_mapping_uc(paddr, bytes); else init_extra_mapping_wb(paddr, bytes); } +static __init void map_gru_distributed(unsigned long c) +{ + union uvh_rh_gam_gru_overlay_config_mmr_u gru; + u64 paddr; + unsigned long bytes; + int nid; + + gru.v = c; + /* only base bits 42:28 relevant in dist mode */ + gru_dist_base = gru.v & 0x000007fff0000000UL; + if (!gru_dist_base) { + pr_info("UV: Map GRU_DIST base address NULL\n"); + return; + } + bytes = 1UL << UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; + gru_dist_lmask = ((1UL << uv_hub_info->m_val) - 1) & ~(bytes - 1); + gru_dist_umask = ~((1UL << uv_hub_info->m_val) - 1); + gru_dist_base &= gru_dist_lmask; /* Clear bits above M */ + for_each_online_node(nid) { + paddr = ((u64)uv_node_to_pnode(nid) << uv_hub_info->m_val) | + gru_dist_base; + init_extra_mapping_wb(paddr, bytes); + gru_first_node_paddr = min(paddr, gru_first_node_paddr); + gru_last_node_paddr = max(paddr, gru_last_node_paddr); + } + /* Save upper (63:M) bits of address only for is_GRU_range */ + gru_first_node_paddr &= gru_dist_umask; + gru_last_node_paddr &= gru_dist_umask; + pr_debug("UV: Map GRU_DIST base 0x%016llx 0x%016llx - 0x%016llx\n", + gru_dist_base, gru_first_node_paddr, gru_last_node_paddr); +} + static __init void map_gru_high(int max_pnode) { union uvh_rh_gam_gru_overlay_config_mmr_u gru; int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); - if (gru.s.enable) { - map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb); - gru_start_paddr = ((u64)gru.s.base << shift); - gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1); - } else { + if (!gru.s.enable) { pr_info("UV: GRU disabled\n"); + return; + } + + if (is_uv3_hub() && gru.s3.mode) { + map_gru_distributed(gru.v); + return; } + map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb); + gru_start_paddr = ((u64)gru.s.base << shift); + gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1); } static __init void map_mmr_high(int max_pnode) @@ -683,7 +735,7 @@ static void uv_heartbeat(unsigned long ignored) mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL); } -static void __cpuinit uv_heartbeat_enable(int cpu) +static void uv_heartbeat_enable(int cpu) { while (!uv_cpu_hub_info(cpu)->scir.enabled) { struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; @@ -700,7 +752,7 @@ static void __cpuinit uv_heartbeat_enable(int cpu) } #ifdef CONFIG_HOTPLUG_CPU -static void __cpuinit uv_heartbeat_disable(int cpu) +static void uv_heartbeat_disable(int cpu) { if (uv_cpu_hub_info(cpu)->scir.enabled) { uv_cpu_hub_info(cpu)->scir.enabled = 0; @@ -712,8 +764,8 @@ static void __cpuinit uv_heartbeat_disable(int cpu) /* * cpu hotplug notifier */ -static __cpuinit int uv_scir_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int uv_scir_cpu_notify(struct notifier_block *self, unsigned long action, + void *hcpu) { long cpu = (long)hcpu; @@ -783,7 +835,7 @@ int uv_set_vga_state(struct pci_dev *pdev, bool decode, * Called on each cpu to initialize the per_cpu UV data area. * FIXME: hotplug not supported yet */ -void __cpuinit uv_cpu_init(void) +void uv_cpu_init(void) { /* CPU 0 initilization will be done via uv_system_init. */ if (!uv_blade_info) diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 53a4e2744846..3ab03430211d 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -392,7 +392,7 @@ static struct cpuidle_device apm_cpuidle_device; /* * Local variables */ -static struct { +__visible struct { unsigned long offset; unsigned short segment; } apm_bios_entry; diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 0ef4bba2acb7..d67c4be3e8b1 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -28,7 +28,6 @@ void foo(void) OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor); OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model); OFFSET(CPUINFO_x86_mask, cpuinfo_x86, x86_mask); - OFFSET(CPUINFO_hard_math, cpuinfo_x86, hard_math); OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level); OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability); OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index b0684e4a73aa..47b56a7e99cb 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -31,11 +31,15 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o perf_event_amd_uncore.o +ifdef CONFIG_AMD_IOMMU +obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o +endif obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o endif + obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 5013a48d1aff..903a264af981 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -66,10 +66,10 @@ static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) * performance at the same time.. */ -extern void vide(void); -__asm__(".align 4\nvide: ret"); +extern __visible void vide(void); +__asm__(".globl vide\n\t.align 4\nvide: ret"); -static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c) +static void init_amd_k5(struct cpuinfo_x86 *c) { /* * General Systems BIOSen alias the cpu frequency registers @@ -87,10 +87,10 @@ static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c) } -static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) +static void init_amd_k6(struct cpuinfo_x86 *c) { u32 l, h; - int mbytes = num_physpages >> (20-PAGE_SHIFT); + int mbytes = get_num_physpages() >> (20-PAGE_SHIFT); if (c->x86_model < 6) { /* Based on AMD doc 20734R - June 2000 */ @@ -179,7 +179,7 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) } } -static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) +static void amd_k7_smp_check(struct cpuinfo_x86 *c) { /* calling is from identify_secondary_cpu() ? */ if (!c->cpu_index) @@ -222,7 +222,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) add_taint(TAINT_UNSAFE_SMP, LOCKDEP_NOW_UNRELIABLE); } -static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) +static void init_amd_k7(struct cpuinfo_x86 *c) { u32 l, h; @@ -267,7 +267,7 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) * To workaround broken NUMA config. Read the comment in * srat_detect_node(). */ -static int __cpuinit nearby_node(int apicid) +static int nearby_node(int apicid) { int i, node; @@ -292,7 +292,7 @@ static int __cpuinit nearby_node(int apicid) * (2) AMD processors supporting compute units */ #ifdef CONFIG_X86_HT -static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) +static void amd_get_topology(struct cpuinfo_x86 *c) { u32 nodes, cores_per_cu = 1; u8 node_id; @@ -342,7 +342,7 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) * On a AMD dual core setup the lower bits of the APIC id distingush the cores. * Assumes number of cores is a power of two. */ -static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) +static void amd_detect_cmp(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_HT unsigned bits; @@ -369,7 +369,7 @@ u16 amd_get_nb_id(int cpu) } EXPORT_SYMBOL_GPL(amd_get_nb_id); -static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) +static void srat_detect_node(struct cpuinfo_x86 *c) { #ifdef CONFIG_NUMA int cpu = smp_processor_id(); @@ -421,7 +421,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) #endif } -static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) +static void early_init_amd_mc(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_HT unsigned bits, ecx; @@ -447,7 +447,7 @@ static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) #endif } -static void __cpuinit bsp_init_amd(struct cpuinfo_x86 *c) +static void bsp_init_amd(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { @@ -475,7 +475,7 @@ static void __cpuinit bsp_init_amd(struct cpuinfo_x86 *c) } } -static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) +static void early_init_amd(struct cpuinfo_x86 *c) { early_init_amd_mc(c); @@ -512,9 +512,9 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) static const int amd_erratum_383[]; static const int amd_erratum_400[]; -static bool cpu_has_amd_erratum(const int *erratum); +static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum); -static void __cpuinit init_amd(struct cpuinfo_x86 *c) +static void init_amd(struct cpuinfo_x86 *c) { u32 dummy; unsigned long long value; @@ -729,19 +729,18 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) value &= ~(1ULL << 24); wrmsrl_safe(MSR_AMD64_BU_CFG2, value); - if (cpu_has_amd_erratum(amd_erratum_383)) + if (cpu_has_amd_erratum(c, amd_erratum_383)) set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); } - if (cpu_has_amd_erratum(amd_erratum_400)) + if (cpu_has_amd_erratum(c, amd_erratum_400)) set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); } #ifdef CONFIG_X86_32 -static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, - unsigned int size) +static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) { /* AMD errata T13 (order #21922) */ if ((c->x86 == 6)) { @@ -757,7 +756,7 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, } #endif -static void __cpuinit cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c) +static void cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c) { tlb_flushall_shift = 5; @@ -765,7 +764,7 @@ static void __cpuinit cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c) tlb_flushall_shift = 4; } -static void __cpuinit cpu_detect_tlb_amd(struct cpuinfo_x86 *c) +static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) { u32 ebx, eax, ecx, edx; u16 mask = 0xfff; @@ -820,7 +819,7 @@ static void __cpuinit cpu_detect_tlb_amd(struct cpuinfo_x86 *c) cpu_set_tlb_flushall_shift(c); } -static const struct cpu_dev __cpuinitconst amd_cpu_dev = { +static const struct cpu_dev amd_cpu_dev = { .c_vendor = "AMD", .c_ident = { "AuthenticAMD" }, #ifdef CONFIG_X86_32 @@ -879,23 +878,13 @@ static const int amd_erratum_400[] = static const int amd_erratum_383[] = AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); -static bool cpu_has_amd_erratum(const int *erratum) + +static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) { - struct cpuinfo_x86 *cpu = __this_cpu_ptr(&cpu_info); int osvw_id = *erratum++; u32 range; u32 ms; - /* - * If called early enough that current_cpu_data hasn't been initialized - * yet, fall back to boot_cpu_data. - */ - if (cpu->x86 == 0) - cpu = &boot_cpu_data; - - if (cpu->x86_vendor != X86_VENDOR_AMD) - return false; - if (osvw_id >= 0 && osvw_id < 65536 && cpu_has(cpu, X86_FEATURE_OSVW)) { u64 osvw_len; diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 4112be9a4659..03445346ee0a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -17,15 +17,6 @@ #include <asm/paravirt.h> #include <asm/alternative.h> -static int __init no_387(char *s) -{ - boot_cpu_data.hard_math = 0; - write_cr0(X86_CR0_TS | X86_CR0_EM | X86_CR0_MP | read_cr0()); - return 1; -} - -__setup("no387", no_387); - static double __initdata x = 4195835.0; static double __initdata y = 3145727.0; @@ -44,15 +35,6 @@ static void __init check_fpu(void) { s32 fdiv_bug; - if (!boot_cpu_data.hard_math) { -#ifndef CONFIG_MATH_EMULATION - pr_emerg("No coprocessor found and no math emulation present\n"); - pr_emerg("Giving up\n"); - for (;;) ; -#endif - return; - } - kernel_fpu_begin(); /* @@ -107,5 +89,6 @@ void __init check_bugs(void) * kernel_fpu_begin/end() in check_fpu() relies on the patched * alternative instructions. */ - check_fpu(); + if (cpu_has_fpu) + check_fpu(); } diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 159103c0b1f4..fbf6c3bc2400 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -11,7 +11,7 @@ #ifdef CONFIG_X86_OOSTORE -static u32 __cpuinit power2(u32 x) +static u32 power2(u32 x) { u32 s = 1; @@ -25,7 +25,7 @@ static u32 __cpuinit power2(u32 x) /* * Set up an actual MCR */ -static void __cpuinit centaur_mcr_insert(int reg, u32 base, u32 size, int key) +static void centaur_mcr_insert(int reg, u32 base, u32 size, int key) { u32 lo, hi; @@ -42,7 +42,7 @@ static void __cpuinit centaur_mcr_insert(int reg, u32 base, u32 size, int key) * * Shortcut: We know you can't put 4Gig of RAM on a winchip */ -static u32 __cpuinit ramtop(void) +static u32 ramtop(void) { u32 clip = 0xFFFFFFFFUL; u32 top = 0; @@ -91,7 +91,7 @@ static u32 __cpuinit ramtop(void) /* * Compute a set of MCR's to give maximum coverage */ -static int __cpuinit centaur_mcr_compute(int nr, int key) +static int centaur_mcr_compute(int nr, int key) { u32 mem = ramtop(); u32 root = power2(mem); @@ -157,7 +157,7 @@ static int __cpuinit centaur_mcr_compute(int nr, int key) return ct; } -static void __cpuinit centaur_create_optimal_mcr(void) +static void centaur_create_optimal_mcr(void) { int used; int i; @@ -181,7 +181,7 @@ static void __cpuinit centaur_create_optimal_mcr(void) wrmsr(MSR_IDT_MCR0+i, 0, 0); } -static void __cpuinit winchip2_create_optimal_mcr(void) +static void winchip2_create_optimal_mcr(void) { u32 lo, hi; int used; @@ -217,7 +217,7 @@ static void __cpuinit winchip2_create_optimal_mcr(void) /* * Handle the MCR key on the Winchip 2. */ -static void __cpuinit winchip2_unprotect_mcr(void) +static void winchip2_unprotect_mcr(void) { u32 lo, hi; u32 key; @@ -229,7 +229,7 @@ static void __cpuinit winchip2_unprotect_mcr(void) wrmsr(MSR_IDT_MCR_CTRL, lo, hi); } -static void __cpuinit winchip2_protect_mcr(void) +static void winchip2_protect_mcr(void) { u32 lo, hi; @@ -247,7 +247,7 @@ static void __cpuinit winchip2_protect_mcr(void) #define RNG_ENABLED (1 << 3) #define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */ -static void __cpuinit init_c3(struct cpuinfo_x86 *c) +static void init_c3(struct cpuinfo_x86 *c) { u32 lo, hi; @@ -318,7 +318,7 @@ enum { EAMD3D = 1<<20, }; -static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) +static void early_init_centaur(struct cpuinfo_x86 *c) { switch (c->x86) { #ifdef CONFIG_X86_32 @@ -337,7 +337,7 @@ static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) #endif } -static void __cpuinit init_centaur(struct cpuinfo_x86 *c) +static void init_centaur(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_32 char *name; @@ -468,7 +468,7 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c) #endif } -static unsigned int __cpuinit +static unsigned int centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) { #ifdef CONFIG_X86_32 @@ -488,7 +488,7 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) return size; } -static const struct cpu_dev __cpuinitconst centaur_cpu_dev = { +static const struct cpu_dev centaur_cpu_dev = { .c_vendor = "Centaur", .c_ident = { "CentaurHauls" }, .c_early_init = early_init_centaur, diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 22018f70a671..2793d1f095a2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -63,7 +63,7 @@ void __init setup_cpu_local_masks(void) alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); } -static void __cpuinit default_init(struct cpuinfo_x86 *c) +static void default_init(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_64 cpu_detect_cache_sizes(c); @@ -80,13 +80,13 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c) #endif } -static const struct cpu_dev __cpuinitconst default_cpu = { +static const struct cpu_dev default_cpu = { .c_init = default_init, .c_vendor = "Unknown", .c_x86_vendor = X86_VENDOR_UNKNOWN, }; -static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; +static const struct cpu_dev *this_cpu = &default_cpu; DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { #ifdef CONFIG_X86_64 @@ -160,8 +160,8 @@ static int __init x86_xsaveopt_setup(char *s) __setup("noxsaveopt", x86_xsaveopt_setup); #ifdef CONFIG_X86_32 -static int cachesize_override __cpuinitdata = -1; -static int disable_x86_serial_nr __cpuinitdata = 1; +static int cachesize_override = -1; +static int disable_x86_serial_nr = 1; static int __init cachesize_setup(char *str) { @@ -215,12 +215,12 @@ static inline int flag_is_changeable_p(u32 flag) } /* Probe for the CPUID instruction */ -int __cpuinit have_cpuid_p(void) +int have_cpuid_p(void) { return flag_is_changeable_p(X86_EFLAGS_ID); } -static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +static void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) { unsigned long lo, hi; @@ -298,7 +298,7 @@ struct cpuid_dependent_feature { u32 level; }; -static const struct cpuid_dependent_feature __cpuinitconst +static const struct cpuid_dependent_feature cpuid_dependent_features[] = { { X86_FEATURE_MWAIT, 0x00000005 }, { X86_FEATURE_DCA, 0x00000009 }, @@ -306,7 +306,7 @@ cpuid_dependent_features[] = { { 0, 0 } }; -static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) +static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) { const struct cpuid_dependent_feature *df; @@ -344,7 +344,7 @@ static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) */ /* Look up CPU names by table lookup. */ -static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c) +static const char *table_lookup_model(struct cpuinfo_x86 *c) { const struct cpu_model_info *info; @@ -364,8 +364,8 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c) return NULL; /* Not found */ } -__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata; -__u32 cpu_caps_set[NCAPINTS] __cpuinitdata; +__u32 cpu_caps_cleared[NCAPINTS]; +__u32 cpu_caps_set[NCAPINTS]; void load_percpu_segment(int cpu) { @@ -394,9 +394,9 @@ void switch_to_new_gdt(int cpu) load_percpu_segment(cpu); } -static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {}; +static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; -static void __cpuinit get_model_name(struct cpuinfo_x86 *c) +static void get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; char *p, *q; @@ -425,7 +425,7 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c) } } -void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c) +void cpu_detect_cache_sizes(struct cpuinfo_x86 *c) { unsigned int n, dummy, ebx, ecx, edx, l2size; @@ -479,7 +479,7 @@ u16 __read_mostly tlb_lld_4m[NR_INFO]; */ s8 __read_mostly tlb_flushall_shift = -1; -void __cpuinit cpu_detect_tlb(struct cpuinfo_x86 *c) +void cpu_detect_tlb(struct cpuinfo_x86 *c) { if (this_cpu->c_detect_tlb) this_cpu->c_detect_tlb(c); @@ -493,7 +493,7 @@ void __cpuinit cpu_detect_tlb(struct cpuinfo_x86 *c) tlb_flushall_shift); } -void __cpuinit detect_ht(struct cpuinfo_x86 *c) +void detect_ht(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_HT u32 eax, ebx, ecx, edx; @@ -544,7 +544,7 @@ out: #endif } -static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) +static void get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; int i; @@ -571,7 +571,7 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) this_cpu = &default_cpu; } -void __cpuinit cpu_detect(struct cpuinfo_x86 *c) +void cpu_detect(struct cpuinfo_x86 *c) { /* Get vendor name */ cpuid(0x00000000, (unsigned int *)&c->cpuid_level, @@ -601,7 +601,7 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c) } } -void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) +void get_cpu_cap(struct cpuinfo_x86 *c) { u32 tfms, xlvl; u32 ebx; @@ -652,7 +652,7 @@ void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) init_scattered_cpuid_features(c); } -static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) +static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_32 int i; @@ -711,10 +711,9 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) return; cpu_detect(c); - get_cpu_vendor(c); - get_cpu_cap(c); + fpu_detect(c); if (this_cpu->c_early_init) this_cpu->c_early_init(c); @@ -724,6 +723,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_bsp_init) this_cpu->c_bsp_init(c); + + setup_force_cpu_cap(X86_FEATURE_ALWAYS); } void __init early_cpu_init(void) @@ -768,7 +769,7 @@ void __init early_cpu_init(void) * unless we can find a reliable way to detect all the broken cases. * Enable it explicitly on 64-bit for non-constant inputs of cpu_has(). */ -static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) +static void detect_nopl(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_32 clear_cpu_cap(c, X86_FEATURE_NOPL); @@ -777,7 +778,7 @@ static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) #endif } -static void __cpuinit generic_identify(struct cpuinfo_x86 *c) +static void generic_identify(struct cpuinfo_x86 *c) { c->extended_cpuid_level = 0; @@ -814,7 +815,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) /* * This does the hard work of actually picking apart the CPU stuff... */ -static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) +static void identify_cpu(struct cpuinfo_x86 *c) { int i; @@ -959,7 +960,7 @@ void __init identify_boot_cpu(void) cpu_detect_tlb(&boot_cpu_data); } -void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) +void identify_secondary_cpu(struct cpuinfo_x86 *c) { BUG_ON(c == &boot_cpu_data); identify_cpu(c); @@ -974,14 +975,14 @@ struct msr_range { unsigned max; }; -static const struct msr_range msr_range_array[] __cpuinitconst = { +static const struct msr_range msr_range_array[] = { { 0x00000000, 0x00000418}, { 0xc0000000, 0xc000040b}, { 0xc0010000, 0xc0010142}, { 0xc0011000, 0xc001103b}, }; -static void __cpuinit __print_cpu_msr(void) +static void __print_cpu_msr(void) { unsigned index_min, index_max; unsigned index; @@ -1000,7 +1001,7 @@ static void __cpuinit __print_cpu_msr(void) } } -static int show_msr __cpuinitdata; +static int show_msr; static __init int setup_show_msr(char *arg) { @@ -1021,7 +1022,7 @@ static __init int setup_noclflush(char *arg) } __setup("noclflush", setup_noclflush); -void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) +void print_cpu_info(struct cpuinfo_x86 *c) { const char *vendor = NULL; @@ -1050,7 +1051,7 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) print_cpu_msr(c); } -void __cpuinit print_cpu_msr(struct cpuinfo_x86 *c) +void print_cpu_msr(struct cpuinfo_x86 *c) { if (c->cpu_index < show_msr) __print_cpu_msr(); @@ -1071,11 +1072,11 @@ __setup("clearcpuid=", setup_disablecpuid); #ifdef CONFIG_X86_64 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; -struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1, - (unsigned long) nmi_idt_table }; +struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1, + (unsigned long) debug_idt_table }; DEFINE_PER_CPU_FIRST(union irq_stack_union, - irq_stack_union) __aligned(PAGE_SIZE); + irq_stack_union) __aligned(PAGE_SIZE) __visible; /* * The following four percpu variables are hot. Align current_task to @@ -1092,7 +1093,7 @@ EXPORT_PER_CPU_SYMBOL(kernel_stack); DEFINE_PER_CPU(char *, irq_stack_ptr) = init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; -DEFINE_PER_CPU(unsigned int, irq_count) = -1; +DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); @@ -1148,20 +1149,20 @@ int is_debug_stack(unsigned long addr) addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ)); } -static DEFINE_PER_CPU(u32, debug_stack_use_ctr); +DEFINE_PER_CPU(u32, debug_idt_ctr); void debug_stack_set_zero(void) { - this_cpu_inc(debug_stack_use_ctr); - load_idt((const struct desc_ptr *)&nmi_idt_descr); + this_cpu_inc(debug_idt_ctr); + load_current_idt(); } void debug_stack_reset(void) { - if (WARN_ON(!this_cpu_read(debug_stack_use_ctr))) + if (WARN_ON(!this_cpu_read(debug_idt_ctr))) return; - if (this_cpu_dec_return(debug_stack_use_ctr) == 0) - load_idt((const struct desc_ptr *)&idt_descr); + if (this_cpu_dec_return(debug_idt_ctr) == 0) + load_current_idt(); } #else /* CONFIG_X86_64 */ @@ -1215,7 +1216,7 @@ static void dbg_restore_debug_regs(void) */ #ifdef CONFIG_X86_64 -void __cpuinit cpu_init(void) +void cpu_init(void) { struct orig_ist *oist; struct task_struct *me; @@ -1257,7 +1258,7 @@ void __cpuinit cpu_init(void) switch_to_new_gdt(cpu); loadsegment(fs, 0); - load_idt((const struct desc_ptr *)&idt_descr); + load_current_idt(); memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); syscall_init(); @@ -1314,7 +1315,7 @@ void __cpuinit cpu_init(void) #else -void __cpuinit cpu_init(void) +void cpu_init(void) { int cpu = smp_processor_id(); struct task_struct *curr = current; @@ -1334,7 +1335,7 @@ void __cpuinit cpu_init(void) if (cpu_has_vme || cpu_has_tsc || cpu_has_de) clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - load_idt(&idt_descr); + load_current_idt(); switch_to_new_gdt(cpu); /* @@ -1363,3 +1364,17 @@ void __cpuinit cpu_init(void) fpu_init(); } #endif + +#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS +void warn_pre_alternatives(void) +{ + WARN(1, "You're using static_cpu_has before alternatives have run!\n"); +} +EXPORT_SYMBOL_GPL(warn_pre_alternatives); +#endif + +inline bool __static_cpu_has_safe(u16 bit) +{ + return boot_cpu_has(bit); +} +EXPORT_SYMBOL_GPL(__static_cpu_has_safe); diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index d048d5ca43c1..d0969c75ab54 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -15,7 +15,7 @@ /* * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU */ -static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +static void __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) { unsigned char ccr2, ccr3; @@ -44,7 +44,7 @@ static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) } } -static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +static void do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) { unsigned long flags; @@ -59,25 +59,25 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) * Actually since bugs.h doesn't even reference this perhaps someone should * fix the documentation ??? */ -static unsigned char Cx86_dir0_msb __cpuinitdata = 0; +static unsigned char Cx86_dir0_msb = 0; -static const char __cpuinitconst Cx86_model[][9] = { +static const char Cx86_model[][9] = { "Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ", "M II ", "Unknown" }; -static const char __cpuinitconst Cx486_name[][5] = { +static const char Cx486_name[][5] = { "SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx", "SRx2", "DRx2" }; -static const char __cpuinitconst Cx486S_name[][4] = { +static const char Cx486S_name[][4] = { "S", "S2", "Se", "S2e" }; -static const char __cpuinitconst Cx486D_name[][4] = { +static const char Cx486D_name[][4] = { "DX", "DX2", "?", "?", "?", "DX4" }; -static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock"; -static const char __cpuinitconst cyrix_model_mult1[] = "12??43"; -static const char __cpuinitconst cyrix_model_mult2[] = "12233445"; +static char Cx86_cb[] = "?.5x Core/Bus Clock"; +static const char cyrix_model_mult1[] = "12??43"; +static const char cyrix_model_mult2[] = "12233445"; /* * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old @@ -87,7 +87,7 @@ static const char __cpuinitconst cyrix_model_mult2[] = "12233445"; * FIXME: our newer udelay uses the tsc. We don't need to frob with SLOP */ -static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c) +static void check_cx686_slop(struct cpuinfo_x86 *c) { unsigned long flags; @@ -112,7 +112,7 @@ static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c) } -static void __cpuinit set_cx86_reorder(void) +static void set_cx86_reorder(void) { u8 ccr3; @@ -127,7 +127,7 @@ static void __cpuinit set_cx86_reorder(void) setCx86(CX86_CCR3, ccr3); } -static void __cpuinit set_cx86_memwb(void) +static void set_cx86_memwb(void) { printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); @@ -143,7 +143,7 @@ static void __cpuinit set_cx86_memwb(void) * Configure later MediaGX and/or Geode processor. */ -static void __cpuinit geode_configure(void) +static void geode_configure(void) { unsigned long flags; u8 ccr3; @@ -166,7 +166,7 @@ static void __cpuinit geode_configure(void) local_irq_restore(flags); } -static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c) +static void early_init_cyrix(struct cpuinfo_x86 *c) { unsigned char dir0, dir0_msn, dir1 = 0; @@ -185,7 +185,7 @@ static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c) } } -static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) +static void init_cyrix(struct cpuinfo_x86 *c) { unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0; char *buf = c->x86_model_id; @@ -333,7 +333,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) switch (dir0_lsn) { case 0xd: /* either a 486SLC or DLC w/o DEVID */ dir0_msn = 0; - p = Cx486_name[(c->hard_math) ? 1 : 0]; + p = Cx486_name[(cpu_has_fpu ? 1 : 0)]; break; case 0xe: /* a 486S A step */ @@ -356,7 +356,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) /* * Handle National Semiconductor branded processors */ -static void __cpuinit init_nsc(struct cpuinfo_x86 *c) +static void init_nsc(struct cpuinfo_x86 *c) { /* * There may be GX1 processors in the wild that are branded @@ -405,7 +405,7 @@ static inline int test_cyrix_52div(void) return (unsigned char) (test >> 8) == 0x02; } -static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c) +static void cyrix_identify(struct cpuinfo_x86 *c) { /* Detect Cyrix with disabled CPUID */ if (c->x86 == 4 && test_cyrix_52div()) { @@ -441,7 +441,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c) } } -static const struct cpu_dev __cpuinitconst cyrix_cpu_dev = { +static const struct cpu_dev cyrix_cpu_dev = { .c_vendor = "Cyrix", .c_ident = { "CyrixInstead" }, .c_early_init = early_init_cyrix, @@ -452,7 +452,7 @@ static const struct cpu_dev __cpuinitconst cyrix_cpu_dev = { cpu_dev_register(cyrix_cpu_dev); -static const struct cpu_dev __cpuinitconst nsc_cpu_dev = { +static const struct cpu_dev nsc_cpu_dev = { .c_vendor = "NSC", .c_ident = { "Geode by NSC" }, .c_init = init_nsc, diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 1e7e84a02eba..36ce402a3fa5 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -25,11 +25,6 @@ #include <asm/processor.h> #include <asm/hypervisor.h> -/* - * Hypervisor detect order. This is specified explicitly here because - * some hypervisors might implement compatibility modes for other - * hypervisors and therefore need to be detected in specific sequence. - */ static const __initconst struct hypervisor_x86 * const hypervisors[] = { #ifdef CONFIG_XEN_PVHVM @@ -49,18 +44,22 @@ static inline void __init detect_hypervisor_vendor(void) { const struct hypervisor_x86 *h, * const *p; + uint32_t pri, max_pri = 0; for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) { h = *p; - if (h->detect()) { + pri = h->detect(); + if (pri != 0 && pri > max_pri) { + max_pri = pri; x86_hyper = h; - printk(KERN_INFO "Hypervisor detected: %s\n", h->name); - break; } } + + if (max_pri) + printk(KERN_INFO "Hypervisor detected: %s\n", x86_hyper->name); } -void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) +void init_hypervisor(struct cpuinfo_x86 *c) { if (x86_hyper && x86_hyper->set_cpu_features) x86_hyper->set_cpu_features(c); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 9b0c441c03f5..ec7299566f79 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -26,7 +26,7 @@ #include <asm/apic.h> #endif -static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) +static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; @@ -163,7 +163,7 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) * This is called before we do cpu ident work */ -int __cpuinit ppro_with_ram_bug(void) +int ppro_with_ram_bug(void) { /* Uses data from early_cpu_detect now */ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && @@ -176,7 +176,7 @@ int __cpuinit ppro_with_ram_bug(void) return 0; } -static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) +static void intel_smp_check(struct cpuinfo_x86 *c) { /* calling is from identify_secondary_cpu() ? */ if (!c->cpu_index) @@ -196,7 +196,7 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) } } -static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) +static void intel_workarounds(struct cpuinfo_x86 *c) { unsigned long lo, hi; @@ -275,12 +275,12 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) intel_smp_check(c); } #else -static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) +static void intel_workarounds(struct cpuinfo_x86 *c) { } #endif -static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) +static void srat_detect_node(struct cpuinfo_x86 *c) { #ifdef CONFIG_NUMA unsigned node; @@ -300,7 +300,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) /* * find out the number of processor cores on the die */ -static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) +static int intel_num_cpu_cores(struct cpuinfo_x86 *c) { unsigned int eax, ebx, ecx, edx; @@ -315,7 +315,7 @@ static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) return 1; } -static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c) +static void detect_vmx_virtcap(struct cpuinfo_x86 *c) { /* Intel VMX MSR indicated features */ #define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000 @@ -353,7 +353,7 @@ static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c) } } -static void __cpuinit init_intel(struct cpuinfo_x86 *c) +static void init_intel(struct cpuinfo_x86 *c) { unsigned int l2 = 0; @@ -472,7 +472,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) } #ifdef CONFIG_X86_32 -static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) +static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) { /* * Intel PIII Tualatin. This comes in two flavours. @@ -506,7 +506,7 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i #define STLB_4K 0x41 -static const struct _tlb_table intel_tlb_table[] __cpuinitconst = { +static const struct _tlb_table intel_tlb_table[] = { { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" }, { 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" }, { 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" }, @@ -536,7 +536,7 @@ static const struct _tlb_table intel_tlb_table[] __cpuinitconst = { { 0x00, 0, 0 } }; -static void __cpuinit intel_tlb_lookup(const unsigned char desc) +static void intel_tlb_lookup(const unsigned char desc) { unsigned char k; if (desc == 0) @@ -605,7 +605,7 @@ static void __cpuinit intel_tlb_lookup(const unsigned char desc) } } -static void __cpuinit intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c) +static void intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c) { switch ((c->x86 << 8) + c->x86_model) { case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ @@ -634,7 +634,7 @@ static void __cpuinit intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c) } } -static void __cpuinit intel_detect_tlb(struct cpuinfo_x86 *c) +static void intel_detect_tlb(struct cpuinfo_x86 *c) { int i, j, n; unsigned int regs[4]; @@ -661,7 +661,7 @@ static void __cpuinit intel_detect_tlb(struct cpuinfo_x86 *c) intel_tlb_flushall_shift_set(c); } -static const struct cpu_dev __cpuinitconst intel_cpu_dev = { +static const struct cpu_dev intel_cpu_dev = { .c_vendor = "Intel", .c_ident = { "GenuineIntel" }, #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 7c6f7d548c0f..1414c90feaba 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -37,7 +37,7 @@ struct _cache_table { /* All the cache descriptor types we care about (no TLB or trace cache entries) */ -static const struct _cache_table __cpuinitconst cache_table[] = +static const struct _cache_table cache_table[] = { { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */ @@ -203,7 +203,7 @@ union l3_cache { unsigned val; }; -static const unsigned short __cpuinitconst assocs[] = { +static const unsigned short assocs[] = { [1] = 1, [2] = 2, [4] = 4, @@ -217,10 +217,10 @@ static const unsigned short __cpuinitconst assocs[] = { [0xf] = 0xffff /* fully associative - no way to show this currently */ }; -static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 }; -static const unsigned char __cpuinitconst types[] = { 1, 2, 3, 3 }; +static const unsigned char levels[] = { 1, 1, 2, 3 }; +static const unsigned char types[] = { 1, 2, 3, 3 }; -static void __cpuinit +static void amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, union _cpuid4_leaf_ebx *ebx, union _cpuid4_leaf_ecx *ecx) @@ -302,7 +302,7 @@ struct _cache_attr { /* * L3 cache descriptors */ -static void __cpuinit amd_calc_l3_indices(struct amd_northbridge *nb) +static void amd_calc_l3_indices(struct amd_northbridge *nb) { struct amd_l3_cache *l3 = &nb->l3_cache; unsigned int sc0, sc1, sc2, sc3; @@ -325,7 +325,7 @@ static void __cpuinit amd_calc_l3_indices(struct amd_northbridge *nb) l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; } -static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) +static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) { int node; @@ -528,8 +528,7 @@ static struct _cache_attr subcaches = #endif /* CONFIG_AMD_NB && CONFIG_SYSFS */ static int -__cpuinit cpuid4_cache_lookup_regs(int index, - struct _cpuid4_info_regs *this_leaf) +cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) { union _cpuid4_leaf_eax eax; union _cpuid4_leaf_ebx ebx; @@ -560,7 +559,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index, return 0; } -static int __cpuinit find_num_cache_leaves(struct cpuinfo_x86 *c) +static int find_num_cache_leaves(struct cpuinfo_x86 *c) { unsigned int eax, ebx, ecx, edx, op; union _cpuid4_leaf_eax cache_eax; @@ -580,7 +579,7 @@ static int __cpuinit find_num_cache_leaves(struct cpuinfo_x86 *c) return i; } -void __cpuinit init_amd_cacheinfo(struct cpuinfo_x86 *c) +void init_amd_cacheinfo(struct cpuinfo_x86 *c) { if (cpu_has_topoext) { @@ -593,7 +592,7 @@ void __cpuinit init_amd_cacheinfo(struct cpuinfo_x86 *c) } } -unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) +unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) { /* Cache sizes */ unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; @@ -618,36 +617,34 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) * parameters cpuid leaf to find the cache details */ for (i = 0; i < num_cache_leaves; i++) { - struct _cpuid4_info_regs this_leaf; + struct _cpuid4_info_regs this_leaf = {}; int retval; retval = cpuid4_cache_lookup_regs(i, &this_leaf); - if (retval >= 0) { - switch (this_leaf.eax.split.level) { - case 1: - if (this_leaf.eax.split.type == - CACHE_TYPE_DATA) - new_l1d = this_leaf.size/1024; - else if (this_leaf.eax.split.type == - CACHE_TYPE_INST) - new_l1i = this_leaf.size/1024; - break; - case 2: - new_l2 = this_leaf.size/1024; - num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; - index_msb = get_count_order(num_threads_sharing); - l2_id = c->apicid & ~((1 << index_msb) - 1); - break; - case 3: - new_l3 = this_leaf.size/1024; - num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; - index_msb = get_count_order( - num_threads_sharing); - l3_id = c->apicid & ~((1 << index_msb) - 1); - break; - default: - break; - } + if (retval < 0) + continue; + + switch (this_leaf.eax.split.level) { + case 1: + if (this_leaf.eax.split.type == CACHE_TYPE_DATA) + new_l1d = this_leaf.size/1024; + else if (this_leaf.eax.split.type == CACHE_TYPE_INST) + new_l1i = this_leaf.size/1024; + break; + case 2: + new_l2 = this_leaf.size/1024; + num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + l2_id = c->apicid & ~((1 << index_msb) - 1); + break; + case 3: + new_l3 = this_leaf.size/1024; + num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + l3_id = c->apicid & ~((1 << index_msb) - 1); + break; + default: + break; } } } @@ -746,7 +743,7 @@ static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info); #ifdef CONFIG_SMP -static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index) +static int cache_shared_amd_cpu_map_setup(unsigned int cpu, int index) { struct _cpuid4_info *this_leaf; int i, sibling; @@ -795,7 +792,7 @@ static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index) return 1; } -static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) +static void cache_shared_cpu_map_setup(unsigned int cpu, int index) { struct _cpuid4_info *this_leaf, *sibling_leaf; unsigned long num_threads_sharing; @@ -830,7 +827,7 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) } } } -static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) +static void cache_remove_shared_cpu_map(unsigned int cpu, int index) { struct _cpuid4_info *this_leaf, *sibling_leaf; int sibling; @@ -843,16 +840,16 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) } } #else -static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) +static void cache_shared_cpu_map_setup(unsigned int cpu, int index) { } -static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) +static void cache_remove_shared_cpu_map(unsigned int cpu, int index) { } #endif -static void __cpuinit free_cache_attributes(unsigned int cpu) +static void free_cache_attributes(unsigned int cpu) { int i; @@ -863,7 +860,7 @@ static void __cpuinit free_cache_attributes(unsigned int cpu) per_cpu(ici_cpuid4_info, cpu) = NULL; } -static void __cpuinit get_cpu_leaves(void *_retval) +static void get_cpu_leaves(void *_retval) { int j, *retval = _retval, cpu = smp_processor_id(); @@ -883,7 +880,7 @@ static void __cpuinit get_cpu_leaves(void *_retval) } } -static int __cpuinit detect_cache_attributes(unsigned int cpu) +static int detect_cache_attributes(unsigned int cpu) { int retval; @@ -1017,7 +1014,7 @@ static struct attribute *default_attrs[] = { }; #ifdef CONFIG_AMD_NB -static struct attribute ** __cpuinit amd_l3_attrs(void) +static struct attribute **amd_l3_attrs(void) { static struct attribute **attrs; int n; @@ -1093,7 +1090,7 @@ static struct kobj_type ktype_percpu_entry = { .sysfs_ops = &sysfs_ops, }; -static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu) +static void cpuid4_cache_sysfs_exit(unsigned int cpu) { kfree(per_cpu(ici_cache_kobject, cpu)); kfree(per_cpu(ici_index_kobject, cpu)); @@ -1102,7 +1099,7 @@ static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu) free_cache_attributes(cpu); } -static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu) +static int cpuid4_cache_sysfs_init(unsigned int cpu) { int err; @@ -1134,7 +1131,7 @@ err_out: static DECLARE_BITMAP(cache_dev_map, NR_CPUS); /* Add/Remove cache interface for CPU device */ -static int __cpuinit cache_add_dev(struct device *dev) +static int cache_add_dev(struct device *dev) { unsigned int cpu = dev->id; unsigned long i, j; @@ -1185,7 +1182,7 @@ static int __cpuinit cache_add_dev(struct device *dev) return 0; } -static void __cpuinit cache_remove_dev(struct device *dev) +static void cache_remove_dev(struct device *dev) { unsigned int cpu = dev->id; unsigned long i; @@ -1202,8 +1199,8 @@ static void __cpuinit cache_remove_dev(struct device *dev) cpuid4_cache_sysfs_exit(cpu); } -static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int cacheinfo_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; struct device *dev; @@ -1222,7 +1219,7 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = { +static struct notifier_block cacheinfo_cpu_notifier = { .notifier_call = cacheinfo_cpu_callback, }; diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index ddc72f839332..5ac2d1fb28bc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -153,7 +153,7 @@ static void raise_mce(struct mce *m) return; #ifdef CONFIG_X86_LOCAL_APIC - if (m->inject_flags & (MCJ_IRQ_BRAODCAST | MCJ_NMI_BROADCAST)) { + if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) { unsigned long start; int cpu; @@ -167,7 +167,7 @@ static void raise_mce(struct mce *m) cpumask_clear_cpu(cpu, mce_inject_cpumask); } if (!cpumask_empty(mce_inject_cpumask)) { - if (m->inject_flags & MCJ_IRQ_BRAODCAST) { + if (m->inject_flags & MCJ_IRQ_BROADCAST) { /* * don't wait because mce_irq_ipi is necessary * to be sync with following raise_local diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 5b7d4fa5d3b7..09edd0b65fef 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -25,15 +25,18 @@ int mce_severity(struct mce *a, int tolerant, char **msg); struct dentry *mce_get_debugfs_dir(void); extern struct mce_bank *mce_banks; +extern mce_banks_t mce_banks_ce_disabled; #ifdef CONFIG_X86_MCE_INTEL unsigned long mce_intel_adjust_timer(unsigned long interval); void mce_intel_cmci_poll(void); void mce_intel_hcpu_update(unsigned long cpu); +void cmci_disable_bank(int bank); #else # define mce_intel_adjust_timer mce_adjust_timer_default static inline void mce_intel_cmci_poll(void) { } static inline void mce_intel_hcpu_update(unsigned long cpu) { } +static inline void cmci_disable_bank(int bank) { } #endif void mce_timer_kick(unsigned long interval); diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index beb1f1689e52..c370e1c4468b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -110,22 +110,17 @@ static struct severity { /* known AR MCACODs: */ #ifdef CONFIG_MEMORY_FAILURE MCESEV( - KEEP, "HT thread notices Action required: data load error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), - MCGMASK(MCG_STATUS_EIPV, 0) + KEEP, "Action required but unaffected thread is continuable", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR), + MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV) ), MCESEV( - AR, "Action required: data load error", + AR, "Action required: data load error in a user process", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), USER ), MCESEV( - KEEP, "HT thread notices Action required: instruction fetch error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), - MCGMASK(MCG_STATUS_EIPV, 0) - ), - MCESEV( - AR, "Action required: instruction fetch error", + AR, "Action required: instruction fetch error in a user process", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), USER ), diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 9239504b41cb..b3218cdee95f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -89,11 +89,23 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; -/* MCA banks polled by the period polling timer for corrected events */ +/* + * MCA banks polled by the period polling timer for corrected events. + * With Intel CMCI, this only has MCA banks which do not support CMCI (if any). + */ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL }; +/* + * MCA banks controlled through firmware first for corrected errors. + * This is a global list of banks for which we won't enable CMCI and we + * won't poll. Firmware controls these banks and is responsible for + * reporting corrected errors through GHES. Uncorrected/recoverable + * errors are still notified through a machine check. + */ +mce_banks_t mce_banks_ce_disabled; + static DEFINE_PER_CPU(struct work_struct, mce_work); static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); @@ -1360,7 +1372,7 @@ int mce_notify_irq(void) } EXPORT_SYMBOL_GPL(mce_notify_irq); -static int __cpuinit __mcheck_cpu_mce_banks_init(void) +static int __mcheck_cpu_mce_banks_init(void) { int i; u8 num_banks = mca_cfg.banks; @@ -1381,7 +1393,7 @@ static int __cpuinit __mcheck_cpu_mce_banks_init(void) /* * Initialize Machine Checks for a CPU. */ -static int __cpuinit __mcheck_cpu_cap_init(void) +static int __mcheck_cpu_cap_init(void) { unsigned b; u64 cap; @@ -1480,7 +1492,7 @@ static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) } /* Add per CPU specific workarounds here */ -static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) +static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { struct mca_config *cfg = &mca_cfg; @@ -1590,7 +1602,7 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) return 0; } -static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) +static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) { if (c->x86 != 5) return 0; @@ -1661,7 +1673,7 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) = * Called for each booted CPU to set up machine checks. * Must be called with preempt off: */ -void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) +void mcheck_cpu_init(struct cpuinfo_x86 *c) { if (mca_cfg.disabled) return; @@ -1932,6 +1944,25 @@ static struct miscdevice mce_chrdev_device = { &mce_chrdev_ops, }; +static void __mce_disable_bank(void *arg) +{ + int bank = *((int *)arg); + __clear_bit(bank, __get_cpu_var(mce_poll_banks)); + cmci_disable_bank(bank); +} + +void mce_disable_bank(int bank) +{ + if (bank >= mca_cfg.banks) { + pr_warn(FW_BUG + "Ignoring request to disable invalid MCA bank %d.\n", + bank); + return; + } + set_bit(bank, mce_banks_ce_disabled); + on_each_cpu(__mce_disable_bank, &bank, 1); +} + /* * mce=off Disables machine check * mce=no_cmci Disables CMCI @@ -2079,7 +2110,6 @@ static struct bus_type mce_subsys = { DEFINE_PER_CPU(struct device *, mce_device); -__cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); static inline struct mce_bank *attr_to_bank(struct device_attribute *attr) @@ -2225,7 +2255,7 @@ static void mce_device_release(struct device *dev) } /* Per cpu device init. All of the cpus still share the same ctrl bank: */ -static __cpuinit int mce_device_create(unsigned int cpu) +static int mce_device_create(unsigned int cpu) { struct device *dev; int err; @@ -2271,7 +2301,7 @@ error: return err; } -static __cpuinit void mce_device_remove(unsigned int cpu) +static void mce_device_remove(unsigned int cpu) { struct device *dev = per_cpu(mce_device, cpu); int i; @@ -2291,7 +2321,7 @@ static __cpuinit void mce_device_remove(unsigned int cpu) } /* Make sure there are no machine checks on offlined CPUs. */ -static void __cpuinit mce_disable_cpu(void *h) +static void mce_disable_cpu(void *h) { unsigned long action = *(unsigned long *)h; int i; @@ -2309,7 +2339,7 @@ static void __cpuinit mce_disable_cpu(void *h) } } -static void __cpuinit mce_reenable_cpu(void *h) +static void mce_reenable_cpu(void *h) { unsigned long action = *(unsigned long *)h; int i; @@ -2328,7 +2358,7 @@ static void __cpuinit mce_reenable_cpu(void *h) } /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static int __cpuinit +static int mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; @@ -2364,7 +2394,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -static struct notifier_block mce_cpu_notifier __cpuinitdata = { +static struct notifier_block mce_cpu_notifier = { .notifier_call = mce_cpu_callback, }; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 9cb52767999a..603df4f74640 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -458,10 +458,8 @@ static struct kobj_type threshold_ktype = { .default_attrs = default_attrs, }; -static __cpuinit int allocate_threshold_blocks(unsigned int cpu, - unsigned int bank, - unsigned int block, - u32 address) +static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, + unsigned int block, u32 address) { struct threshold_block *b = NULL; u32 low, high; @@ -543,7 +541,7 @@ out_free: return err; } -static __cpuinit int __threshold_add_blocks(struct threshold_bank *b) +static int __threshold_add_blocks(struct threshold_bank *b) { struct list_head *head = &b->blocks->miscj; struct threshold_block *pos = NULL; @@ -567,7 +565,7 @@ static __cpuinit int __threshold_add_blocks(struct threshold_bank *b) return err; } -static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) +static int threshold_create_bank(unsigned int cpu, unsigned int bank) { struct device *dev = per_cpu(mce_device, cpu); struct amd_northbridge *nb = NULL; @@ -632,7 +630,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) } /* create dir/files for all valid threshold banks */ -static __cpuinit int threshold_create_device(unsigned int cpu) +static int threshold_create_device(unsigned int cpu) { unsigned int bank; struct threshold_bank **bp; @@ -736,7 +734,7 @@ static void threshold_remove_device(unsigned int cpu) } /* get notified when a cpu comes on/off */ -static void __cpuinit +static void amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu) { switch (action) { diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index ae1697c2afe3..4cfe0458ca66 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -24,6 +24,18 @@ * Also supports reliable discovery of shared banks. */ +/* + * CMCI can be delivered to multiple cpus that share a machine check bank + * so we need to designate a single cpu to process errors logged in each bank + * in the interrupt handler (otherwise we would have many races and potential + * double reporting of the same error). + * Note that this can change when a cpu is offlined or brought online since + * some MCA banks are shared across cpus. When a cpu is offlined, cmci_clear() + * disables CMCI on all banks owned by the cpu and clears this bitfield. At + * this point, cmci_rediscover() kicks in and a different cpu may end up + * taking ownership of some of the shared MCA banks that were previously + * owned by the offlined cpu. + */ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); /* @@ -191,6 +203,10 @@ static void cmci_discover(int banks) if (test_bit(i, owned)) continue; + /* Skip banks in firmware first mode */ + if (test_bit(i, mce_banks_ce_disabled)) + continue; + rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Already owned by someone else? */ @@ -259,6 +275,19 @@ void cmci_recheck(void) local_irq_restore(flags); } +/* Caller must hold the lock on cmci_discover_lock */ +static void __cmci_disable_bank(int bank) +{ + u64 val; + + if (!test_bit(bank, __get_cpu_var(mce_banks_owned))) + return; + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + val &= ~MCI_CTL2_CMCI_EN; + wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + __clear_bit(bank, __get_cpu_var(mce_banks_owned)); +} + /* * Disable CMCI on this CPU for all banks it owns when it goes down. * This allows other CPUs to claim the banks on rediscovery. @@ -268,20 +297,12 @@ void cmci_clear(void) unsigned long flags; int i; int banks; - u64 val; if (!cmci_supported(&banks)) return; raw_spin_lock_irqsave(&cmci_discover_lock, flags); - for (i = 0; i < banks; i++) { - if (!test_bit(i, __get_cpu_var(mce_banks_owned))) - continue; - /* Disable CMCI */ - rdmsrl(MSR_IA32_MCx_CTL2(i), val); - val &= ~MCI_CTL2_CMCI_EN; - wrmsrl(MSR_IA32_MCx_CTL2(i), val); - __clear_bit(i, __get_cpu_var(mce_banks_owned)); - } + for (i = 0; i < banks; i++) + __cmci_disable_bank(i); raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } @@ -315,6 +336,19 @@ void cmci_reenable(void) cmci_discover(banks); } +void cmci_disable_bank(int bank) +{ + int banks; + unsigned long flags; + + if (!cmci_supported(&banks)) + return; + + raw_spin_lock_irqsave(&cmci_discover_lock, flags); + __cmci_disable_bank(bank); + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + static void intel_init_cmci(void) { int banks; diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 47a1870279aa..3eec7de76efb 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -29,6 +29,7 @@ #include <asm/idle.h> #include <asm/mce.h> #include <asm/msr.h> +#include <asm/trace/irq_vectors.h> /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) @@ -54,12 +55,24 @@ struct thermal_state { struct _thermal_state package_power_limit; struct _thermal_state core_thresh0; struct _thermal_state core_thresh1; + struct _thermal_state pkg_thresh0; + struct _thermal_state pkg_thresh1; }; /* Callback to handle core threshold interrupts */ int (*platform_thermal_notify)(__u64 msr_val); EXPORT_SYMBOL(platform_thermal_notify); +/* Callback to handle core package threshold_interrupts */ +int (*platform_thermal_package_notify)(__u64 msr_val); +EXPORT_SYMBOL_GPL(platform_thermal_package_notify); + +/* Callback support of rate control, return true, if + * callback has rate control */ +bool (*platform_thermal_package_rate_control)(void); +EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control); + + static DEFINE_PER_CPU(struct thermal_state, thermal_state); static atomic_t therm_throt_en = ATOMIC_INIT(0); @@ -181,11 +194,6 @@ static int therm_throt_process(bool new_event, int event, int level) this_cpu, level == CORE_LEVEL ? "Core" : "Package", state->count); - else - printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n", - this_cpu, - level == CORE_LEVEL ? "Core" : "Package", - state->count); return 1; } if (old_event) { @@ -193,36 +201,46 @@ static int therm_throt_process(bool new_event, int event, int level) printk(KERN_INFO "CPU%d: %s temperature/speed normal\n", this_cpu, level == CORE_LEVEL ? "Core" : "Package"); - else - printk(KERN_INFO "CPU%d: %s power limit normal\n", - this_cpu, - level == CORE_LEVEL ? "Core" : "Package"); return 1; } return 0; } -static int thresh_event_valid(int event) +static int thresh_event_valid(int level, int event) { struct _thermal_state *state; unsigned int this_cpu = smp_processor_id(); struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); u64 now = get_jiffies_64(); - state = (event == 0) ? &pstate->core_thresh0 : &pstate->core_thresh1; + if (level == PACKAGE_LEVEL) + state = (event == 0) ? &pstate->pkg_thresh0 : + &pstate->pkg_thresh1; + else + state = (event == 0) ? &pstate->core_thresh0 : + &pstate->core_thresh1; if (time_before64(now, state->next_check)) return 0; state->next_check = now + CHECK_INTERVAL; + + return 1; +} + +static bool int_pln_enable; +static int __init int_pln_enable_setup(char *s) +{ + int_pln_enable = true; + return 1; } +__setup("int_pln_enable", int_pln_enable_setup); #ifdef CONFIG_SYSFS /* Add/Remove thermal_throttle interface for CPU device: */ -static __cpuinit int thermal_throttle_add_dev(struct device *dev, - unsigned int cpu) +static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu) { int err; struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -231,7 +249,7 @@ static __cpuinit int thermal_throttle_add_dev(struct device *dev, if (err) return err; - if (cpu_has(c, X86_FEATURE_PLN)) + if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) err = sysfs_add_file_to_group(&dev->kobj, &dev_attr_core_power_limit_count.attr, thermal_attr_group.name); @@ -239,7 +257,7 @@ static __cpuinit int thermal_throttle_add_dev(struct device *dev, err = sysfs_add_file_to_group(&dev->kobj, &dev_attr_package_throttle_count.attr, thermal_attr_group.name); - if (cpu_has(c, X86_FEATURE_PLN)) + if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) err = sysfs_add_file_to_group(&dev->kobj, &dev_attr_package_power_limit_count.attr, thermal_attr_group.name); @@ -248,7 +266,7 @@ static __cpuinit int thermal_throttle_add_dev(struct device *dev, return err; } -static __cpuinit void thermal_throttle_remove_dev(struct device *dev) +static void thermal_throttle_remove_dev(struct device *dev) { sysfs_remove_group(&dev->kobj, &thermal_attr_group); } @@ -257,7 +275,7 @@ static __cpuinit void thermal_throttle_remove_dev(struct device *dev) static DEFINE_MUTEX(therm_cpu_lock); /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static __cpuinit int +static int thermal_throttle_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -288,7 +306,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb, return notifier_from_errno(err); } -static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata = +static struct notifier_block thermal_throttle_cpu_notifier = { .notifier_call = thermal_throttle_cpu_callback, }; @@ -321,6 +339,39 @@ device_initcall(thermal_throttle_init_device); #endif /* CONFIG_SYSFS */ +static void notify_package_thresholds(__u64 msr_val) +{ + bool notify_thres_0 = false; + bool notify_thres_1 = false; + + if (!platform_thermal_package_notify) + return; + + /* lower threshold check */ + if (msr_val & THERM_LOG_THRESHOLD0) + notify_thres_0 = true; + /* higher threshold check */ + if (msr_val & THERM_LOG_THRESHOLD1) + notify_thres_1 = true; + + if (!notify_thres_0 && !notify_thres_1) + return; + + if (platform_thermal_package_rate_control && + platform_thermal_package_rate_control()) { + /* Rate control is implemented in callback */ + platform_thermal_package_notify(msr_val); + return; + } + + /* lower threshold reached */ + if (notify_thres_0 && thresh_event_valid(PACKAGE_LEVEL, 0)) + platform_thermal_package_notify(msr_val); + /* higher threshold reached */ + if (notify_thres_1 && thresh_event_valid(PACKAGE_LEVEL, 1)) + platform_thermal_package_notify(msr_val); +} + static void notify_thresholds(__u64 msr_val) { /* check whether the interrupt handler is defined; @@ -330,10 +381,12 @@ static void notify_thresholds(__u64 msr_val) return; /* lower threshold reached */ - if ((msr_val & THERM_LOG_THRESHOLD0) && thresh_event_valid(0)) + if ((msr_val & THERM_LOG_THRESHOLD0) && + thresh_event_valid(CORE_LEVEL, 0)) platform_thermal_notify(msr_val); /* higher threshold reached */ - if ((msr_val & THERM_LOG_THRESHOLD1) && thresh_event_valid(1)) + if ((msr_val & THERM_LOG_THRESHOLD1) && + thresh_event_valid(CORE_LEVEL, 1)) platform_thermal_notify(msr_val); } @@ -352,17 +405,19 @@ static void intel_thermal_interrupt(void) CORE_LEVEL) != 0) mce_log_therm_throt_event(msr_val); - if (this_cpu_has(X86_FEATURE_PLN)) + if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, CORE_LEVEL); if (this_cpu_has(X86_FEATURE_PTS)) { rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); + /* check violations of package thermal thresholds */ + notify_package_thresholds(msr_val); therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, THERMAL_THROTTLING_EVENT, PACKAGE_LEVEL); - if (this_cpu_has(X86_FEATURE_PLN)) + if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) therm_throt_process(msr_val & PACKAGE_THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, @@ -378,15 +433,26 @@ static void unexpected_thermal_interrupt(void) static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; -asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) +static inline void __smp_thermal_interrupt(void) { - irq_enter(); - exit_idle(); inc_irq_stat(irq_thermal_count); smp_thermal_vector(); - irq_exit(); - /* Ack only at the end to avoid potential reentry */ - ack_APIC_irq(); +} + +asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) +{ + entering_irq(); + __smp_thermal_interrupt(); + exiting_ack_irq(); +} + +asmlinkage void smp_trace_thermal_interrupt(struct pt_regs *regs) +{ + entering_irq(); + trace_thermal_apic_entry(THERMAL_APIC_VECTOR); + __smp_thermal_interrupt(); + trace_thermal_apic_exit(THERMAL_APIC_VECTOR); + exiting_ack_irq(); } /* Thermal monitoring depends on APIC, ACPI and clock modulation */ @@ -470,9 +536,13 @@ void intel_init_thermal(struct cpuinfo_x86 *c) apic_write(APIC_LVTTHMR, h); rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - if (cpu_has(c, X86_FEATURE_PLN)) + if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable) + wrmsr(MSR_IA32_THERM_INTERRUPT, + (l | (THERM_INT_LOW_ENABLE + | THERM_INT_HIGH_ENABLE)) & ~THERM_INT_PLN_ENABLE, h); + else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) wrmsr(MSR_IA32_THERM_INTERRUPT, - l | (THERM_INT_LOW_ENABLE + l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h); else wrmsr(MSR_IA32_THERM_INTERRUPT, @@ -480,9 +550,14 @@ void intel_init_thermal(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_PTS)) { rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); - if (cpu_has(c, X86_FEATURE_PLN)) + if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable) wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, - l | (PACKAGE_THERM_INT_LOW_ENABLE + (l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE)) + & ~PACKAGE_THERM_INT_PLN_ENABLE, h); + else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | (PACKAGE_THERM_INT_LOW_ENABLE | PACKAGE_THERM_INT_HIGH_ENABLE | PACKAGE_THERM_INT_PLN_ENABLE), h); else diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index aa578cadb940..fe6b1c86645b 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -8,6 +8,7 @@ #include <asm/apic.h> #include <asm/idle.h> #include <asm/mce.h> +#include <asm/trace/irq_vectors.h> static void default_threshold_interrupt(void) { @@ -17,13 +18,24 @@ static void default_threshold_interrupt(void) void (*mce_threshold_vector)(void) = default_threshold_interrupt; -asmlinkage void smp_threshold_interrupt(void) +static inline void __smp_threshold_interrupt(void) { - irq_enter(); - exit_idle(); inc_irq_stat(irq_threshold_count); mce_threshold_vector(); - irq_exit(); - /* Ack only at the end to avoid potential reentry */ - ack_APIC_irq(); +} + +asmlinkage void smp_threshold_interrupt(void) +{ + entering_irq(); + __smp_threshold_interrupt(); + exiting_ack_irq(); +} + +asmlinkage void smp_trace_threshold_interrupt(void) +{ + entering_irq(); + trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); + __smp_threshold_interrupt(); + trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); + exiting_ack_irq(); } diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 8f4be53ea04b..71a39f3621ba 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -27,20 +27,23 @@ struct ms_hyperv_info ms_hyperv; EXPORT_SYMBOL_GPL(ms_hyperv); -static bool __init ms_hyperv_platform(void) +static uint32_t __init ms_hyperv_platform(void) { u32 eax; u32 hyp_signature[3]; if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) - return false; + return 0; cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]); - return eax >= HYPERV_CPUID_MIN && - eax <= HYPERV_CPUID_MAX && - !memcmp("Microsoft Hv", hyp_signature, 12); + if (eax >= HYPERV_CPUID_MIN && + eax <= HYPERV_CPUID_MAX && + !memcmp("Microsoft Hv", hyp_signature, 12)) + return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS; + + return 0; } static cycle_t read_hv_clock(struct clocksource *arg) diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 35ffda5d0727..5f90b85ff22e 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -714,15 +714,15 @@ int __init mtrr_cleanup(unsigned address_bits) if (mtrr_tom2) x_remove_size = (mtrr_tom2 >> PAGE_SHIFT) - x_remove_base; - nr_range = x86_get_mtrr_mem_range(range, 0, x_remove_base, x_remove_size); /* * [0, 1M) should always be covered by var mtrr with WB * and fixed mtrrs should take effect before var mtrr for it: */ - nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, 0, + nr_range = add_range_with_merge(range, RANGE_NUM, 0, 0, 1ULL<<(20 - PAGE_SHIFT)); - /* Sort the ranges: */ - sort_range(range, nr_range); + /* add from var mtrr at last */ + nr_range = x86_get_mtrr_mem_range(range, nr_range, + x_remove_base, x_remove_size); range_sums = sum_ranges(range, nr_range); printk(KERN_INFO "total RAM covered: %ldM\n", diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index 68a3343e5798..9e451b0876b5 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -167,7 +167,7 @@ static void post_set(void) setCx86(CX86_CCR3, ccr3); /* Enable caches */ - write_cr0(read_cr0() & 0xbfffffff); + write_cr0(read_cr0() & ~X86_CR0_CD); /* Restore value of CR4 */ if (cpu_has_pge) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index fa72a39e5d46..ce2d0a2c3e4f 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -510,8 +510,9 @@ generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) static void generic_get_mtrr(unsigned int reg, unsigned long *base, unsigned long *size, mtrr_type *type) { - unsigned int mask_lo, mask_hi, base_lo, base_hi; - unsigned int tmp, hi; + u32 mask_lo, mask_hi, base_lo, base_hi; + unsigned int hi; + u64 tmp, mask; /* * get_mtrr doesn't need to update mtrr_state, also it could be called @@ -532,18 +533,18 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); /* Work out the shifted address mask: */ - tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT; - mask_lo = size_or_mask | tmp; + tmp = (u64)mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT; + mask = size_or_mask | tmp; /* Expand tmp with high bits to all 1s: */ - hi = fls(tmp); + hi = fls64(tmp); if (hi > 0) { - tmp |= ~((1<<(hi - 1)) - 1); + tmp |= ~((1ULL<<(hi - 1)) - 1); - if (tmp != mask_lo) { + if (tmp != mask) { printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); - mask_lo = tmp; + mask = tmp; } } @@ -551,8 +552,8 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, * This works correctly if size is a power of two, i.e. a * contiguous range: */ - *size = -mask_lo; - *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; + *size = -mask; + *base = (u64)base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; *type = base_lo & 0xff; out_put_cpu: @@ -682,6 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) } /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb(); /* Save MTRR state */ @@ -695,13 +697,14 @@ static void prepare_set(void) __acquires(set_atomicity_lock) static void post_set(void) __releases(set_atomicity_lock) { /* Flush TLBs (no need to flush caches - they are disabled) */ + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb(); /* Intel (P6) standard MTRRs */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); /* Enable caches */ - write_cr0(read_cr0() & 0xbfffffff); + write_cr0(read_cr0() & ~X86_CR0_CD); /* Restore value of CR4 */ if (cpu_has_pge) diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 726bf963c227..f961de9964c7 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -51,9 +51,13 @@ #include <asm/e820.h> #include <asm/mtrr.h> #include <asm/msr.h> +#include <asm/pat.h> #include "mtrr.h" +/* arch_phys_wc_add returns an MTRR register index plus this offset. */ +#define MTRR_TO_PHYS_WC_OFFSET 1000 + u32 num_var_ranges; unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; @@ -305,7 +309,8 @@ int mtrr_add_page(unsigned long base, unsigned long size, return -EINVAL; } - if (base & size_or_mask || size & size_or_mask) { + if ((base | (base + size - 1)) >> + (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) { pr_warning("mtrr: base or size exceeds the MTRR width\n"); return -EINVAL; } @@ -524,6 +529,73 @@ int mtrr_del(int reg, unsigned long base, unsigned long size) } EXPORT_SYMBOL(mtrr_del); +/** + * arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable + * @base: Physical base address + * @size: Size of region + * + * If PAT is available, this does nothing. If PAT is unavailable, it + * attempts to add a WC MTRR covering size bytes starting at base and + * logs an error if this fails. + * + * Drivers must store the return value to pass to mtrr_del_wc_if_needed, + * but drivers should not try to interpret that return value. + */ +int arch_phys_wc_add(unsigned long base, unsigned long size) +{ + int ret; + + if (pat_enabled) + return 0; /* Success! (We don't need to do anything.) */ + + ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true); + if (ret < 0) { + pr_warn("Failed to add WC MTRR for [%p-%p]; performance may suffer.", + (void *)base, (void *)(base + size - 1)); + return ret; + } + return ret + MTRR_TO_PHYS_WC_OFFSET; +} +EXPORT_SYMBOL(arch_phys_wc_add); + +/* + * arch_phys_wc_del - undoes arch_phys_wc_add + * @handle: Return value from arch_phys_wc_add + * + * This cleans up after mtrr_add_wc_if_needed. + * + * The API guarantees that mtrr_del_wc_if_needed(error code) and + * mtrr_del_wc_if_needed(0) do nothing. + */ +void arch_phys_wc_del(int handle) +{ + if (handle >= 1) { + WARN_ON(handle < MTRR_TO_PHYS_WC_OFFSET); + mtrr_del(handle - MTRR_TO_PHYS_WC_OFFSET, 0, 0); + } +} +EXPORT_SYMBOL(arch_phys_wc_del); + +/* + * phys_wc_to_mtrr_index - translates arch_phys_wc_add's return value + * @handle: Return value from arch_phys_wc_add + * + * This will turn the return value from arch_phys_wc_add into an mtrr + * index suitable for debugging. + * + * Note: There is no legitimate use for this function, except possibly + * in printk line. Alas there is an illegitimate use in some ancient + * drm ioctls. + */ +int phys_wc_to_mtrr_index(int handle) +{ + if (handle < MTRR_TO_PHYS_WC_OFFSET) + return -1; + else + return handle - MTRR_TO_PHYS_WC_OFFSET; +} +EXPORT_SYMBOL_GPL(phys_wc_to_mtrr_index); + /* * HACK ALERT! * These should be called implicitly, but we can't yet until all the initcall @@ -583,6 +655,7 @@ static struct syscore_ops mtrr_syscore_ops = { int __initdata changed_by_mtrr_cleanup; +#define SIZE_OR_MASK_BITS(n) (~((1ULL << ((n) - PAGE_SHIFT)) - 1)) /** * mtrr_bp_init - initialize mtrrs on the boot CPU * @@ -600,7 +673,7 @@ void __init mtrr_bp_init(void) if (cpu_has_mtrr) { mtrr_if = &generic_mtrr_ops; - size_or_mask = 0xff000000; /* 36 bits */ + size_or_mask = SIZE_OR_MASK_BITS(36); size_and_mask = 0x00f00000; phys_addr = 36; @@ -619,7 +692,7 @@ void __init mtrr_bp_init(void) boot_cpu_data.x86_mask == 0x4)) phys_addr = 36; - size_or_mask = ~((1ULL << (phys_addr - PAGE_SHIFT)) - 1); + size_or_mask = SIZE_OR_MASK_BITS(phys_addr); size_and_mask = ~size_or_mask & 0xfffff00000ULL; } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR && boot_cpu_data.x86 == 6) { @@ -627,7 +700,7 @@ void __init mtrr_bp_init(void) * VIA C* family have Intel style MTRRs, * but don't support PAE */ - size_or_mask = 0xfff00000; /* 32 bits */ + size_or_mask = SIZE_OR_MASK_BITS(32); size_and_mask = 0; phys_addr = 32; } @@ -637,21 +710,21 @@ void __init mtrr_bp_init(void) if (cpu_has_k6_mtrr) { /* Pre-Athlon (K6) AMD CPU MTRRs */ mtrr_if = mtrr_ops[X86_VENDOR_AMD]; - size_or_mask = 0xfff00000; /* 32 bits */ + size_or_mask = SIZE_OR_MASK_BITS(32); size_and_mask = 0; } break; case X86_VENDOR_CENTAUR: if (cpu_has_centaur_mcr) { mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR]; - size_or_mask = 0xfff00000; /* 32 bits */ + size_or_mask = SIZE_OR_MASK_BITS(32); size_and_mask = 0; } break; case X86_VENDOR_CYRIX: if (cpu_has_cyrix_arr) { mtrr_if = mtrr_ops[X86_VENDOR_CYRIX]; - size_or_mask = 0xfff00000; /* 32 bits */ + size_or_mask = SIZE_OR_MASK_BITS(32); size_and_mask = 0; } break; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 1025f3c99d20..897783b3302a 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -403,7 +403,8 @@ int x86_pmu_hw_config(struct perf_event *event) * check that PEBS LBR correction does not conflict with * whatever the user is asking with attr->branch_sample_type */ - if (event->attr.precise_ip > 1) { + if (event->attr.precise_ip > 1 && + x86_pmu.intel_cap.pebs_format < 2) { u64 *br_type = &event->attr.branch_sample_type; if (has_branch_stack(event)) { @@ -568,7 +569,7 @@ struct sched_state { struct perf_sched { int max_weight; int max_events; - struct event_constraint **constraints; + struct perf_event **events; struct sched_state state; int saved_states; struct sched_state saved[SCHED_STATES_MAX]; @@ -577,7 +578,7 @@ struct perf_sched { /* * Initialize interator that runs through all events and counters. */ -static void perf_sched_init(struct perf_sched *sched, struct event_constraint **c, +static void perf_sched_init(struct perf_sched *sched, struct perf_event **events, int num, int wmin, int wmax) { int idx; @@ -585,10 +586,10 @@ static void perf_sched_init(struct perf_sched *sched, struct event_constraint ** memset(sched, 0, sizeof(*sched)); sched->max_events = num; sched->max_weight = wmax; - sched->constraints = c; + sched->events = events; for (idx = 0; idx < num; idx++) { - if (c[idx]->weight == wmin) + if (events[idx]->hw.constraint->weight == wmin) break; } @@ -635,8 +636,7 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) if (sched->state.event >= sched->max_events) return false; - c = sched->constraints[sched->state.event]; - + c = sched->events[sched->state.event]->hw.constraint; /* Prefer fixed purpose counters */ if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { idx = INTEL_PMC_IDX_FIXED; @@ -694,7 +694,7 @@ static bool perf_sched_next_event(struct perf_sched *sched) if (sched->state.weight > sched->max_weight) return false; } - c = sched->constraints[sched->state.event]; + c = sched->events[sched->state.event]->hw.constraint; } while (c->weight != sched->state.weight); sched->state.counter = 0; /* start with first counter */ @@ -705,12 +705,12 @@ static bool perf_sched_next_event(struct perf_sched *sched) /* * Assign a counter for each event. */ -int perf_assign_events(struct event_constraint **constraints, int n, +int perf_assign_events(struct perf_event **events, int n, int wmin, int wmax, int *assign) { struct perf_sched sched; - perf_sched_init(&sched, constraints, n, wmin, wmax); + perf_sched_init(&sched, events, n, wmin, wmax); do { if (!perf_sched_find_counter(&sched)) @@ -724,16 +724,19 @@ int perf_assign_events(struct event_constraint **constraints, int n, int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { - struct event_constraint *c, *constraints[X86_PMC_IDX_MAX]; + struct event_constraint *c; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + struct perf_event *e; int i, wmin, wmax, num = 0; struct hw_perf_event *hwc; bitmap_zero(used_mask, X86_PMC_IDX_MAX); for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { + hwc = &cpuc->event_list[i]->hw; c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); - constraints[i] = c; + hwc->constraint = c; + wmin = min(wmin, c->weight); wmax = max(wmax, c->weight); } @@ -743,7 +746,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) */ for (i = 0; i < n; i++) { hwc = &cpuc->event_list[i]->hw; - c = constraints[i]; + c = hwc->constraint; /* never assigned */ if (hwc->idx == -1) @@ -764,16 +767,35 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) /* slow path */ if (i != n) - num = perf_assign_events(constraints, n, wmin, wmax, assign); + num = perf_assign_events(cpuc->event_list, n, wmin, + wmax, assign); /* + * Mark the event as committed, so we do not put_constraint() + * in case new events are added and fail scheduling. + */ + if (!num && assign) { + for (i = 0; i < n; i++) { + e = cpuc->event_list[i]; + e->hw.flags |= PERF_X86_EVENT_COMMITTED; + } + } + /* * scheduling failed or is just a simulation, * free resources if necessary */ if (!assign || num) { for (i = 0; i < n; i++) { + e = cpuc->event_list[i]; + /* + * do not put_constraint() on comitted events, + * because they are good to go + */ + if ((e->hw.flags & PERF_X86_EVENT_COMMITTED)) + continue; + if (x86_pmu.put_event_constraints) - x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]); + x86_pmu.put_event_constraints(cpuc, e); } } return num ? -EINVAL : 0; @@ -1153,6 +1175,11 @@ static void x86_pmu_del(struct perf_event *event, int flags) int i; /* + * event is descheduled + */ + event->hw.flags &= ~PERF_X86_EVENT_COMMITTED; + + /* * If we're called during a txn, we don't need to do anything. * The events never got scheduled and ->cancel_txn will truncate * the event_list. @@ -1249,16 +1276,26 @@ void perf_events_lapic_init(void) static int __kprobes perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) { + int ret; + u64 start_clock; + u64 finish_clock; + if (!atomic_read(&active_events)) return NMI_DONE; - return x86_pmu.handle_irq(regs); + start_clock = local_clock(); + ret = x86_pmu.handle_irq(regs); + finish_clock = local_clock(); + + perf_sample_event_took(finish_clock - start_clock); + + return ret; } struct event_constraint emptyconstraint; struct event_constraint unconstrained; -static int __cpuinit +static int x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; @@ -1469,7 +1506,7 @@ static int __init init_hw_perf_events(void) err = amd_pmu_init(); break; default: - return 0; + err = -ENOTSUPP; } if (err != 0) { pr_cont("no PMU driver, software events only.\n"); @@ -1846,8 +1883,9 @@ static struct pmu pmu = { void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { - userpg->cap_usr_time = 0; - userpg->cap_usr_rdpmc = x86_pmu.attr_rdpmc; + userpg->cap_user_time = 0; + userpg->cap_user_time_zero = 0; + userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; userpg->pmc_width = x86_pmu.cntval_bits; if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) @@ -1856,10 +1894,15 @@ void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) return; - userpg->cap_usr_time = 1; + userpg->cap_user_time = 1; userpg->time_mult = this_cpu_read(cyc2ns); userpg->time_shift = CYC2NS_SCALE_FACTOR; userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; + + if (sched_clock_stable && !check_tsc_disabled()) { + userpg->cap_user_time_zero = 1; + userpg->time_zero = this_cpu_read(cyc2ns_offset); + } } /* diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index ba9aadfa683b..cc16faae0538 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -63,10 +63,12 @@ struct event_constraint { int flags; }; /* - * struct event_constraint flags + * struct hw_perf_event.flags flags */ #define PERF_X86_EVENT_PEBS_LDLAT 0x1 /* ld+ldlat data address sampling */ #define PERF_X86_EVENT_PEBS_ST 0x2 /* st data address sampling */ +#define PERF_X86_EVENT_PEBS_ST_HSW 0x4 /* haswell style st data sampling */ +#define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */ struct amd_nb { int nb_id; /* NorthBridge id */ @@ -227,11 +229,14 @@ struct cpu_hw_events { * - inv * - edge * - cnt-mask + * - in_tx + * - in_tx_checkpointed * The other filters are supported by fixed counters. * The any-thread option is supported starting with v3. */ +#define FIXED_EVENT_FLAGS (X86_RAW_EVENT_MASK|HSW_IN_TX|HSW_IN_TX_CHECKPOINTED) #define FIXED_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK) + EVENT_CONSTRAINT(c, (1ULL << (32+n)), FIXED_EVENT_FLAGS) /* * Constraint on the Event code + UMask @@ -247,6 +252,11 @@ struct cpu_hw_events { __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST) +/* DataLA version of store sampling without extra enable bit. */ +#define INTEL_PST_HSW_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) + #define EVENT_CONSTRAINT_END \ EVENT_CONSTRAINT(0, 0, 0) @@ -301,6 +311,11 @@ union perf_capabilities { u64 pebs_arch_reg:1; u64 pebs_format:4; u64 smm_freeze:1; + /* + * PMU supports separate counter range for writing + * values > 32bit. + */ + u64 full_width_write:1; }; u64 capabilities; }; @@ -375,6 +390,7 @@ struct x86_pmu { struct event_constraint *event_constraints; struct x86_pmu_quirk *quirks; int perfctr_second_write; + bool late_ack; /* * sysfs attrs @@ -528,7 +544,7 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, void x86_pmu_enable_all(int added); -int perf_assign_events(struct event_constraint **constraints, int n, +int perf_assign_events(struct perf_event **events, int n, int wmin, int wmax, int *assign); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign); @@ -625,6 +641,8 @@ extern struct event_constraint intel_core2_pebs_event_constraints[]; extern struct event_constraint intel_atom_pebs_event_constraints[]; +extern struct event_constraint intel_slm_pebs_event_constraints[]; + extern struct event_constraint intel_nehalem_pebs_event_constraints[]; extern struct event_constraint intel_westmere_pebs_event_constraints[]; @@ -633,6 +651,8 @@ extern struct event_constraint intel_snb_pebs_event_constraints[]; extern struct event_constraint intel_ivb_pebs_event_constraints[]; +extern struct event_constraint intel_hsw_pebs_event_constraints[]; + struct event_constraint *intel_pebs_constraints(struct perf_event *event); void intel_pmu_pebs_enable(struct perf_event *event); diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 7e28d9467bb4..beeb7cc07044 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -347,8 +347,7 @@ static struct amd_nb *amd_alloc_nb(int cpu) struct amd_nb *nb; int i; - nb = kmalloc_node(sizeof(struct amd_nb), GFP_KERNEL | __GFP_ZERO, - cpu_to_node(cpu)); + nb = kzalloc_node(sizeof(struct amd_nb), GFP_KERNEL, cpu_to_node(cpu)); if (!nb) return NULL; @@ -648,48 +647,48 @@ static __initconst const struct x86_pmu amd_pmu = { .cpu_dead = amd_pmu_cpu_dead, }; -static int setup_event_constraints(void) +static int __init amd_core_pmu_init(void) { - if (boot_cpu_data.x86 == 0x15) + if (!cpu_has_perfctr_core) + return 0; + + switch (boot_cpu_data.x86) { + case 0x15: + pr_cont("Fam15h "); x86_pmu.get_event_constraints = amd_get_event_constraints_f15h; - return 0; -} + break; -static int setup_perfctr_core(void) -{ - if (!cpu_has_perfctr_core) { - WARN(x86_pmu.get_event_constraints == amd_get_event_constraints_f15h, - KERN_ERR "Odd, counter constraints enabled but no core perfctrs detected!"); + default: + pr_err("core perfctr but no constraints; unknown hardware!\n"); return -ENODEV; } - WARN(x86_pmu.get_event_constraints == amd_get_event_constraints, - KERN_ERR "hw perf events core counters need constraints handler!"); - /* * If core performance counter extensions exists, we must use * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also - * x86_pmu_addr_offset(). + * amd_pmu_addr_offset(). */ x86_pmu.eventsel = MSR_F15H_PERF_CTL; x86_pmu.perfctr = MSR_F15H_PERF_CTR; x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE; - printk(KERN_INFO "perf: AMD core performance counters detected\n"); - + pr_cont("core perfctr, "); return 0; } __init int amd_pmu_init(void) { + int ret; + /* Performance-monitoring supported from K7 and later: */ if (boot_cpu_data.x86 < 6) return -ENODEV; x86_pmu = amd_pmu; - setup_event_constraints(); - setup_perfctr_core(); + ret = amd_core_pmu_init(); + if (ret) + return ret; /* Events are common for all AMDs */ memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 5f0581e713c2..e09f0bfb7b8f 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -851,7 +851,7 @@ static void clear_APIC_ibs(void *dummy) setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); } -static int __cpuinit +static int perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.c b/arch/x86/kernel/cpu/perf_event_amd_iommu.c new file mode 100644 index 000000000000..639d1289b1ba --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_amd_iommu.c @@ -0,0 +1,502 @@ +/* + * Copyright (C) 2013 Advanced Micro Devices, Inc. + * + * Author: Steven Kinney <Steven.Kinney@amd.com> + * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com> + * + * Perf: amd_iommu - AMD IOMMU Performance Counter PMU implementation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/perf_event.h> +#include <linux/module.h> +#include <linux/cpumask.h> +#include <linux/slab.h> + +#include "perf_event.h" +#include "perf_event_amd_iommu.h" + +#define COUNTER_SHIFT 16 + +#define _GET_BANK(ev) ((u8)(ev->hw.extra_reg.reg >> 8)) +#define _GET_CNTR(ev) ((u8)(ev->hw.extra_reg.reg)) + +/* iommu pmu config masks */ +#define _GET_CSOURCE(ev) ((ev->hw.config & 0xFFULL)) +#define _GET_DEVID(ev) ((ev->hw.config >> 8) & 0xFFFFULL) +#define _GET_PASID(ev) ((ev->hw.config >> 24) & 0xFFFFULL) +#define _GET_DOMID(ev) ((ev->hw.config >> 40) & 0xFFFFULL) +#define _GET_DEVID_MASK(ev) ((ev->hw.extra_reg.config) & 0xFFFFULL) +#define _GET_PASID_MASK(ev) ((ev->hw.extra_reg.config >> 16) & 0xFFFFULL) +#define _GET_DOMID_MASK(ev) ((ev->hw.extra_reg.config >> 32) & 0xFFFFULL) + +static struct perf_amd_iommu __perf_iommu; + +struct perf_amd_iommu { + struct pmu pmu; + u8 max_banks; + u8 max_counters; + u64 cntr_assign_mask; + raw_spinlock_t lock; + const struct attribute_group *attr_groups[4]; +}; + +#define format_group attr_groups[0] +#define cpumask_group attr_groups[1] +#define events_group attr_groups[2] +#define null_group attr_groups[3] + +/*--------------------------------------------- + * sysfs format attributes + *---------------------------------------------*/ +PMU_FORMAT_ATTR(csource, "config:0-7"); +PMU_FORMAT_ATTR(devid, "config:8-23"); +PMU_FORMAT_ATTR(pasid, "config:24-39"); +PMU_FORMAT_ATTR(domid, "config:40-55"); +PMU_FORMAT_ATTR(devid_mask, "config1:0-15"); +PMU_FORMAT_ATTR(pasid_mask, "config1:16-31"); +PMU_FORMAT_ATTR(domid_mask, "config1:32-47"); + +static struct attribute *iommu_format_attrs[] = { + &format_attr_csource.attr, + &format_attr_devid.attr, + &format_attr_pasid.attr, + &format_attr_domid.attr, + &format_attr_devid_mask.attr, + &format_attr_pasid_mask.attr, + &format_attr_domid_mask.attr, + NULL, +}; + +static struct attribute_group amd_iommu_format_group = { + .name = "format", + .attrs = iommu_format_attrs, +}; + +/*--------------------------------------------- + * sysfs events attributes + *---------------------------------------------*/ +struct amd_iommu_event_desc { + struct kobj_attribute attr; + const char *event; +}; + +static ssize_t _iommu_event_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct amd_iommu_event_desc *event = + container_of(attr, struct amd_iommu_event_desc, attr); + return sprintf(buf, "%s\n", event->event); +} + +#define AMD_IOMMU_EVENT_DESC(_name, _event) \ +{ \ + .attr = __ATTR(_name, 0444, _iommu_event_show, NULL), \ + .event = _event, \ +} + +static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = { + AMD_IOMMU_EVENT_DESC(mem_pass_untrans, "csource=0x01"), + AMD_IOMMU_EVENT_DESC(mem_pass_pretrans, "csource=0x02"), + AMD_IOMMU_EVENT_DESC(mem_pass_excl, "csource=0x03"), + AMD_IOMMU_EVENT_DESC(mem_target_abort, "csource=0x04"), + AMD_IOMMU_EVENT_DESC(mem_trans_total, "csource=0x05"), + AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_hit, "csource=0x06"), + AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_mis, "csource=0x07"), + AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_hit, "csource=0x08"), + AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_mis, "csource=0x09"), + AMD_IOMMU_EVENT_DESC(mem_dte_hit, "csource=0x0a"), + AMD_IOMMU_EVENT_DESC(mem_dte_mis, "csource=0x0b"), + AMD_IOMMU_EVENT_DESC(page_tbl_read_tot, "csource=0x0c"), + AMD_IOMMU_EVENT_DESC(page_tbl_read_nst, "csource=0x0d"), + AMD_IOMMU_EVENT_DESC(page_tbl_read_gst, "csource=0x0e"), + AMD_IOMMU_EVENT_DESC(int_dte_hit, "csource=0x0f"), + AMD_IOMMU_EVENT_DESC(int_dte_mis, "csource=0x10"), + AMD_IOMMU_EVENT_DESC(cmd_processed, "csource=0x11"), + AMD_IOMMU_EVENT_DESC(cmd_processed_inv, "csource=0x12"), + AMD_IOMMU_EVENT_DESC(tlb_inv, "csource=0x13"), + { /* end: all zeroes */ }, +}; + +/*--------------------------------------------- + * sysfs cpumask attributes + *---------------------------------------------*/ +static cpumask_t iommu_cpumask; + +static ssize_t _iommu_cpumask_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &iommu_cpumask); + buf[n++] = '\n'; + buf[n] = '\0'; + return n; +} +static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL); + +static struct attribute *iommu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group amd_iommu_cpumask_group = { + .attrs = iommu_cpumask_attrs, +}; + +/*---------------------------------------------*/ + +static int get_next_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu) +{ + unsigned long flags; + int shift, bank, cntr, retval; + int max_banks = perf_iommu->max_banks; + int max_cntrs = perf_iommu->max_counters; + + raw_spin_lock_irqsave(&perf_iommu->lock, flags); + + for (bank = 0, shift = 0; bank < max_banks; bank++) { + for (cntr = 0; cntr < max_cntrs; cntr++) { + shift = bank + (bank*3) + cntr; + if (perf_iommu->cntr_assign_mask & (1ULL<<shift)) { + continue; + } else { + perf_iommu->cntr_assign_mask |= (1ULL<<shift); + retval = ((u16)((u16)bank<<8) | (u8)(cntr)); + goto out; + } + } + } + retval = -ENOSPC; +out: + raw_spin_unlock_irqrestore(&perf_iommu->lock, flags); + return retval; +} + +static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu, + u8 bank, u8 cntr) +{ + unsigned long flags; + int max_banks, max_cntrs; + int shift = 0; + + max_banks = perf_iommu->max_banks; + max_cntrs = perf_iommu->max_counters; + + if ((bank > max_banks) || (cntr > max_cntrs)) + return -EINVAL; + + shift = bank + cntr + (bank*3); + + raw_spin_lock_irqsave(&perf_iommu->lock, flags); + perf_iommu->cntr_assign_mask &= ~(1ULL<<shift); + raw_spin_unlock_irqrestore(&perf_iommu->lock, flags); + + return 0; +} + +static int perf_iommu_event_init(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_amd_iommu *perf_iommu; + u64 config, config1; + + /* test the event attr type check for PMU enumeration */ + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* + * IOMMU counters are shared across all cores. + * Therefore, it does not support per-process mode. + * Also, it does not support event sampling mode. + */ + if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) + return -EINVAL; + + /* IOMMU counters do not have usr/os/guest/host bits */ + if (event->attr.exclude_user || event->attr.exclude_kernel || + event->attr.exclude_host || event->attr.exclude_guest) + return -EINVAL; + + if (event->cpu < 0) + return -EINVAL; + + perf_iommu = &__perf_iommu; + + if (event->pmu != &perf_iommu->pmu) + return -ENOENT; + + if (perf_iommu) { + config = event->attr.config; + config1 = event->attr.config1; + } else { + return -EINVAL; + } + + /* integrate with iommu base devid (0000), assume one iommu */ + perf_iommu->max_banks = + amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID); + perf_iommu->max_counters = + amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID); + if ((perf_iommu->max_banks == 0) || (perf_iommu->max_counters == 0)) + return -EINVAL; + + /* update the hw_perf_event struct with the iommu config data */ + hwc->config = config; + hwc->extra_reg.config = config1; + + return 0; +} + +static void perf_iommu_enable_event(struct perf_event *ev) +{ + u8 csource = _GET_CSOURCE(ev); + u16 devid = _GET_DEVID(ev); + u64 reg = 0ULL; + + reg = csource; + amd_iommu_pc_get_set_reg_val(devid, + _GET_BANK(ev), _GET_CNTR(ev) , + IOMMU_PC_COUNTER_SRC_REG, ®, true); + + reg = 0ULL | devid | (_GET_DEVID_MASK(ev) << 32); + if (reg) + reg |= (1UL << 31); + amd_iommu_pc_get_set_reg_val(devid, + _GET_BANK(ev), _GET_CNTR(ev) , + IOMMU_PC_DEVID_MATCH_REG, ®, true); + + reg = 0ULL | _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32); + if (reg) + reg |= (1UL << 31); + amd_iommu_pc_get_set_reg_val(devid, + _GET_BANK(ev), _GET_CNTR(ev) , + IOMMU_PC_PASID_MATCH_REG, ®, true); + + reg = 0ULL | _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32); + if (reg) + reg |= (1UL << 31); + amd_iommu_pc_get_set_reg_val(devid, + _GET_BANK(ev), _GET_CNTR(ev) , + IOMMU_PC_DOMID_MATCH_REG, ®, true); +} + +static void perf_iommu_disable_event(struct perf_event *event) +{ + u64 reg = 0ULL; + + amd_iommu_pc_get_set_reg_val(_GET_DEVID(event), + _GET_BANK(event), _GET_CNTR(event), + IOMMU_PC_COUNTER_SRC_REG, ®, true); +} + +static void perf_iommu_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + pr_debug("perf: amd_iommu:perf_iommu_start\n"); + if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) + return; + + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + hwc->state = 0; + + if (flags & PERF_EF_RELOAD) { + u64 prev_raw_count = local64_read(&hwc->prev_count); + amd_iommu_pc_get_set_reg_val(_GET_DEVID(event), + _GET_BANK(event), _GET_CNTR(event), + IOMMU_PC_COUNTER_REG, &prev_raw_count, true); + } + + perf_iommu_enable_event(event); + perf_event_update_userpage(event); + +} + +static void perf_iommu_read(struct perf_event *event) +{ + u64 count = 0ULL; + u64 prev_raw_count = 0ULL; + u64 delta = 0ULL; + struct hw_perf_event *hwc = &event->hw; + pr_debug("perf: amd_iommu:perf_iommu_read\n"); + + amd_iommu_pc_get_set_reg_val(_GET_DEVID(event), + _GET_BANK(event), _GET_CNTR(event), + IOMMU_PC_COUNTER_REG, &count, false); + + /* IOMMU pc counter register is only 48 bits */ + count &= 0xFFFFFFFFFFFFULL; + + prev_raw_count = local64_read(&hwc->prev_count); + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + count) != prev_raw_count) + return; + + /* Handling 48-bit counter overflowing */ + delta = (count << COUNTER_SHIFT) - (prev_raw_count << COUNTER_SHIFT); + delta >>= COUNTER_SHIFT; + local64_add(delta, &event->count); + +} + +static void perf_iommu_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + u64 config; + + pr_debug("perf: amd_iommu:perf_iommu_stop\n"); + + if (hwc->state & PERF_HES_UPTODATE) + return; + + perf_iommu_disable_event(event); + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + + if (hwc->state & PERF_HES_UPTODATE) + return; + + config = hwc->config; + perf_iommu_read(event); + hwc->state |= PERF_HES_UPTODATE; +} + +static int perf_iommu_add(struct perf_event *event, int flags) +{ + int retval; + struct perf_amd_iommu *perf_iommu = + container_of(event->pmu, struct perf_amd_iommu, pmu); + + pr_debug("perf: amd_iommu:perf_iommu_add\n"); + event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + /* request an iommu bank/counter */ + retval = get_next_avail_iommu_bnk_cntr(perf_iommu); + if (retval != -ENOSPC) + event->hw.extra_reg.reg = (u16)retval; + else + return retval; + + if (flags & PERF_EF_START) + perf_iommu_start(event, PERF_EF_RELOAD); + + return 0; +} + +static void perf_iommu_del(struct perf_event *event, int flags) +{ + struct perf_amd_iommu *perf_iommu = + container_of(event->pmu, struct perf_amd_iommu, pmu); + + pr_debug("perf: amd_iommu:perf_iommu_del\n"); + perf_iommu_stop(event, PERF_EF_UPDATE); + + /* clear the assigned iommu bank/counter */ + clear_avail_iommu_bnk_cntr(perf_iommu, + _GET_BANK(event), + _GET_CNTR(event)); + + perf_event_update_userpage(event); +} + +static __init int _init_events_attrs(struct perf_amd_iommu *perf_iommu) +{ + struct attribute **attrs; + struct attribute_group *attr_group; + int i = 0, j; + + while (amd_iommu_v2_event_descs[i].attr.attr.name) + i++; + + attr_group = kzalloc(sizeof(struct attribute *) + * (i + 1) + sizeof(*attr_group), GFP_KERNEL); + if (!attr_group) + return -ENOMEM; + + attrs = (struct attribute **)(attr_group + 1); + for (j = 0; j < i; j++) + attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr; + + attr_group->name = "events"; + attr_group->attrs = attrs; + perf_iommu->events_group = attr_group; + + return 0; +} + +static __init void amd_iommu_pc_exit(void) +{ + if (__perf_iommu.events_group != NULL) { + kfree(__perf_iommu.events_group); + __perf_iommu.events_group = NULL; + } +} + +static __init int _init_perf_amd_iommu( + struct perf_amd_iommu *perf_iommu, char *name) +{ + int ret; + + raw_spin_lock_init(&perf_iommu->lock); + + /* Init format attributes */ + perf_iommu->format_group = &amd_iommu_format_group; + + /* Init cpumask attributes to only core 0 */ + cpumask_set_cpu(0, &iommu_cpumask); + perf_iommu->cpumask_group = &amd_iommu_cpumask_group; + + /* Init events attributes */ + if (_init_events_attrs(perf_iommu) != 0) + pr_err("perf: amd_iommu: Only support raw events.\n"); + + /* Init null attributes */ + perf_iommu->null_group = NULL; + perf_iommu->pmu.attr_groups = perf_iommu->attr_groups; + + ret = perf_pmu_register(&perf_iommu->pmu, name, -1); + if (ret) { + pr_err("perf: amd_iommu: Failed to initialized.\n"); + amd_iommu_pc_exit(); + } else { + pr_info("perf: amd_iommu: Detected. (%d banks, %d counters/bank)\n", + amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID), + amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID)); + } + + return ret; +} + +static struct perf_amd_iommu __perf_iommu = { + .pmu = { + .event_init = perf_iommu_event_init, + .add = perf_iommu_add, + .del = perf_iommu_del, + .start = perf_iommu_start, + .stop = perf_iommu_stop, + .read = perf_iommu_read, + }, + .max_banks = 0x00, + .max_counters = 0x00, + .cntr_assign_mask = 0ULL, + .format_group = NULL, + .cpumask_group = NULL, + .events_group = NULL, + .null_group = NULL, +}; + +static __init int amd_iommu_pc_init(void) +{ + /* Make sure the IOMMU PC resource is available */ + if (!amd_iommu_pc_supported()) + return -ENODEV; + + _init_perf_amd_iommu(&__perf_iommu, "amd_iommu"); + + return 0; +} + +device_initcall(amd_iommu_pc_init); diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.h b/arch/x86/kernel/cpu/perf_event_amd_iommu.h new file mode 100644 index 000000000000..845d173278e3 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_amd_iommu.h @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2013 Advanced Micro Devices, Inc. + * + * Author: Steven Kinney <Steven.Kinney@amd.com> + * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _PERF_EVENT_AMD_IOMMU_H_ +#define _PERF_EVENT_AMD_IOMMU_H_ + +/* iommu pc mmio region register indexes */ +#define IOMMU_PC_COUNTER_REG 0x00 +#define IOMMU_PC_COUNTER_SRC_REG 0x08 +#define IOMMU_PC_PASID_MATCH_REG 0x10 +#define IOMMU_PC_DOMID_MATCH_REG 0x18 +#define IOMMU_PC_DEVID_MATCH_REG 0x20 +#define IOMMU_PC_COUNTER_REPORT_REG 0x28 + +/* maximun specified bank/counters */ +#define PC_MAX_SPEC_BNKS 64 +#define PC_MAX_SPEC_CNTRS 16 + +/* iommu pc reg masks*/ +#define IOMMU_BASE_DEVID 0x0000 + +/* amd_iommu_init.c external support functions */ +extern bool amd_iommu_pc_supported(void); + +extern u8 amd_iommu_pc_get_max_banks(u16 devid); + +extern u8 amd_iommu_pc_get_max_counters(u16 devid); + +extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr, + u8 fxn, u64 *value, bool is_write); + +#endif /*_PERF_EVENT_AMD_IOMMU_H_*/ diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c index c0c661adf03e..754291adec33 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c @@ -288,13 +288,13 @@ static struct pmu amd_l2_pmu = { .read = amd_uncore_read, }; -static struct amd_uncore * __cpuinit amd_uncore_alloc(unsigned int cpu) +static struct amd_uncore *amd_uncore_alloc(unsigned int cpu) { return kzalloc_node(sizeof(struct amd_uncore), GFP_KERNEL, cpu_to_node(cpu)); } -static void __cpuinit amd_uncore_cpu_up_prepare(unsigned int cpu) +static void amd_uncore_cpu_up_prepare(unsigned int cpu) { struct amd_uncore *uncore; @@ -322,8 +322,8 @@ static void __cpuinit amd_uncore_cpu_up_prepare(unsigned int cpu) } static struct amd_uncore * -__cpuinit amd_uncore_find_online_sibling(struct amd_uncore *this, - struct amd_uncore * __percpu *uncores) +amd_uncore_find_online_sibling(struct amd_uncore *this, + struct amd_uncore * __percpu *uncores) { unsigned int cpu; struct amd_uncore *that; @@ -348,7 +348,7 @@ __cpuinit amd_uncore_find_online_sibling(struct amd_uncore *this, return this; } -static void __cpuinit amd_uncore_cpu_starting(unsigned int cpu) +static void amd_uncore_cpu_starting(unsigned int cpu) { unsigned int eax, ebx, ecx, edx; struct amd_uncore *uncore; @@ -376,8 +376,8 @@ static void __cpuinit amd_uncore_cpu_starting(unsigned int cpu) } } -static void __cpuinit uncore_online(unsigned int cpu, - struct amd_uncore * __percpu *uncores) +static void uncore_online(unsigned int cpu, + struct amd_uncore * __percpu *uncores) { struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu); @@ -388,7 +388,7 @@ static void __cpuinit uncore_online(unsigned int cpu, cpumask_set_cpu(cpu, uncore->active_mask); } -static void __cpuinit amd_uncore_cpu_online(unsigned int cpu) +static void amd_uncore_cpu_online(unsigned int cpu) { if (amd_uncore_nb) uncore_online(cpu, amd_uncore_nb); @@ -397,8 +397,8 @@ static void __cpuinit amd_uncore_cpu_online(unsigned int cpu) uncore_online(cpu, amd_uncore_l2); } -static void __cpuinit uncore_down_prepare(unsigned int cpu, - struct amd_uncore * __percpu *uncores) +static void uncore_down_prepare(unsigned int cpu, + struct amd_uncore * __percpu *uncores) { unsigned int i; struct amd_uncore *this = *per_cpu_ptr(uncores, cpu); @@ -423,7 +423,7 @@ static void __cpuinit uncore_down_prepare(unsigned int cpu, } } -static void __cpuinit amd_uncore_cpu_down_prepare(unsigned int cpu) +static void amd_uncore_cpu_down_prepare(unsigned int cpu) { if (amd_uncore_nb) uncore_down_prepare(cpu, amd_uncore_nb); @@ -432,8 +432,7 @@ static void __cpuinit amd_uncore_cpu_down_prepare(unsigned int cpu) uncore_down_prepare(cpu, amd_uncore_l2); } -static void __cpuinit uncore_dead(unsigned int cpu, - struct amd_uncore * __percpu *uncores) +static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores) { struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu); @@ -445,7 +444,7 @@ static void __cpuinit uncore_dead(unsigned int cpu, *per_cpu_ptr(amd_uncore_nb, cpu) = NULL; } -static void __cpuinit amd_uncore_cpu_dead(unsigned int cpu) +static void amd_uncore_cpu_dead(unsigned int cpu) { if (amd_uncore_nb) uncore_dead(cpu, amd_uncore_nb); @@ -454,7 +453,7 @@ static void __cpuinit amd_uncore_cpu_dead(unsigned int cpu) uncore_dead(cpu, amd_uncore_l2); } -static int __cpuinit +static int amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -489,7 +488,7 @@ amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action, return NOTIFY_OK; } -static struct notifier_block amd_uncore_cpu_notifier_block __cpuinitdata = { +static struct notifier_block amd_uncore_cpu_notifier_block = { .notifier_call = amd_uncore_cpu_notifier, .priority = CPU_PRI_PERF + 1, }; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index f60d41ff9a97..f31a1655d1ff 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -13,6 +13,7 @@ #include <linux/slab.h> #include <linux/export.h> +#include <asm/cpufeature.h> #include <asm/hardirq.h> #include <asm/apic.h> @@ -80,7 +81,8 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = { - INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b), EVENT_EXTRA_END }; @@ -122,6 +124,7 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly = INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */ INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMTPY */ INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */ + INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */ INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */ INTEL_UEVENT_CONSTRAINT(0x06a3, 0xf), /* CYCLE_ACTIVITY.STALLS_LDM_PENDING */ @@ -142,8 +145,9 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly = static struct extra_reg intel_westmere_extra_regs[] __read_mostly = { - INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), - INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1), + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1), INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b), EVENT_EXTRA_END }; @@ -161,17 +165,28 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly = EVENT_CONSTRAINT_END }; +static struct event_constraint intel_slm_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */ + EVENT_CONSTRAINT_END +}; + static struct extra_reg intel_snb_extra_regs[] __read_mostly = { - INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0), - INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1), - INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1), INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), EVENT_EXTRA_END }; static struct extra_reg intel_snbep_extra_regs[] __read_mostly = { - INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), - INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), EVENT_EXTRA_END }; @@ -190,6 +205,22 @@ struct attribute *snb_events_attrs[] = { NULL, }; +static struct event_constraint intel_hsw_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.* */ + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ + INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ + /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ + INTEL_EVENT_CONSTRAINT(0x08a3, 0x4), + /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ + INTEL_EVENT_CONSTRAINT(0x0ca3, 0x4), + /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ + INTEL_EVENT_CONSTRAINT(0x04a3, 0xf), + EVENT_CONSTRAINT_END +}; + static u64 intel_pmu_event_map(int hw_event) { return intel_perfmon_event_map[hw_event]; @@ -865,6 +896,140 @@ static __initconst const u64 atom_hw_cache_event_ids }, }; +static struct extra_reg intel_slm_extra_regs[] __read_mostly = +{ + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x768005ffffull, RSP_1), + EVENT_EXTRA_END +}; + +#define SLM_DMND_READ SNB_DMND_DATA_RD +#define SLM_DMND_WRITE SNB_DMND_RFO +#define SLM_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO) + +#define SLM_SNP_ANY (SNB_SNP_NONE|SNB_SNP_MISS|SNB_NO_FWD|SNB_HITM) +#define SLM_LLC_ACCESS SNB_RESP_ANY +#define SLM_LLC_MISS (SLM_SNP_ANY|SNB_NON_DRAM) + +static __initconst const u64 slm_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS, + [ C(RESULT_MISS) ] = SLM_DMND_READ|SLM_LLC_MISS, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS, + [ C(RESULT_MISS) ] = SLM_DMND_WRITE|SLM_LLC_MISS, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = SLM_DMND_PREFETCH|SLM_LLC_ACCESS, + [ C(RESULT_MISS) ] = SLM_DMND_PREFETCH|SLM_LLC_MISS, + }, + }, +}; + +static __initconst const u64 slm_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0104, /* LD_DCU_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* ICACHE.ACCESSES */ + [ C(RESULT_MISS) ] = 0x0280, /* ICACGE.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0804, /* LD_DTLB_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) { /* user explicitly requested branch sampling */ @@ -872,7 +1037,8 @@ static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) return true; /* implicit branch sampling to correct PEBS skid */ - if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1) + if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 && + x86_pmu.intel_cap.pebs_format < 2) return true; return false; @@ -1167,15 +1333,11 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) cpuc = &__get_cpu_var(cpu_hw_events); /* - * Some chipsets need to unmask the LVTPC in a particular spot - * inside the nmi handler. As a result, the unmasking was pushed - * into all the nmi handlers. - * - * This handler doesn't seem to have any issues with the unmasking - * so it was left at the top. + * No known reason to not always do late ACK, + * but just in case do it opt-in. */ - apic_write(APIC_LVTPC, APIC_DM_NMI); - + if (!x86_pmu.late_ack) + apic_write(APIC_LVTPC, APIC_DM_NMI); intel_pmu_disable_all(); handled = intel_pmu_drain_bts_buffer(); status = intel_pmu_get_status(); @@ -1188,8 +1350,12 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) again: intel_pmu_ack_status(status); if (++loops > 100) { - WARN_ONCE(1, "perfevents: irq loop stuck!\n"); - perf_event_print_debug(); + static bool warned = false; + if (!warned) { + WARN(1, "perfevents: irq loop stuck!\n"); + perf_event_print_debug(); + warned = true; + } intel_pmu_reset(); goto done; } @@ -1235,6 +1401,13 @@ again: done: intel_pmu_enable_all(0); + /* + * Only unmask the NMI after the overflow counters + * have been reset. This avoids spurious NMIs on + * Haswell CPUs. + */ + if (x86_pmu.late_ack) + apic_write(APIC_LVTPC, APIC_DM_NMI); return handled; } @@ -1276,11 +1449,11 @@ static void intel_fixup_er(struct perf_event *event, int idx) if (idx == EXTRA_REG_RSP_0) { event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= 0x01b7; + event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event; event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; } else if (idx == EXTRA_REG_RSP_1) { event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= 0x01bb; + event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event; event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; } } @@ -1425,7 +1598,6 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) if (x86_pmu.event_constraints) { for_each_event_constraint(c, x86_pmu.event_constraints) { if ((event->hw.config & c->cmask) == c->code) { - /* hw.flags zeroed at initialization */ event->hw.flags |= c->flags; return c; } @@ -1473,7 +1645,6 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, static void intel_put_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { - event->hw.flags = 0; intel_put_shared_regs_event_constraints(cpuc, event); } @@ -1646,6 +1817,47 @@ static void core_pmu_enable_all(int added) } } +static int hsw_hw_config(struct perf_event *event) +{ + int ret = intel_pmu_hw_config(event); + + if (ret) + return ret; + if (!boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has(X86_FEATURE_HLE)) + return 0; + event->hw.config |= event->attr.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED); + + /* + * IN_TX/IN_TX-CP filters are not supported by the Haswell PMU with + * PEBS or in ANY thread mode. Since the results are non-sensical forbid + * this combination. + */ + if ((event->hw.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)) && + ((event->hw.config & ARCH_PERFMON_EVENTSEL_ANY) || + event->attr.precise_ip > 0)) + return -EOPNOTSUPP; + + return 0; +} + +static struct event_constraint counter2_constraint = + EVENT_CONSTRAINT(0, 0x4, 0); + +static struct event_constraint * +hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct event_constraint *c = intel_get_event_constraints(cpuc, event); + + /* Handle special quirk on in_tx_checkpointed only in counter 2 */ + if (event->hw.config & HSW_IN_TX_CHECKPOINTED) { + if (c->idxmsk64 & (1U << 2)) + return &counter2_constraint; + return &emptyconstraint; + } + + return c; +} + PMU_FORMAT_ATTR(event, "config:0-7" ); PMU_FORMAT_ATTR(umask, "config:8-15" ); PMU_FORMAT_ATTR(edge, "config:18" ); @@ -1653,6 +1865,8 @@ PMU_FORMAT_ATTR(pc, "config:19" ); PMU_FORMAT_ATTR(any, "config:21" ); /* v3 + */ PMU_FORMAT_ATTR(inv, "config:23" ); PMU_FORMAT_ATTR(cmask, "config:24-31" ); +PMU_FORMAT_ATTR(in_tx, "config:32"); +PMU_FORMAT_ATTR(in_tx_cp, "config:33"); static struct attribute *intel_arch_formats_attr[] = { &format_attr_event.attr, @@ -1807,6 +2021,8 @@ static struct attribute *intel_arch3_formats_attr[] = { &format_attr_any.attr, &format_attr_inv.attr, &format_attr_cmask.attr, + &format_attr_in_tx.attr, + &format_attr_in_tx_cp.attr, &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */ &format_attr_ldlat.attr, /* PEBS load latency */ @@ -1966,6 +2182,15 @@ static __init void intel_nehalem_quirk(void) } } +EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3"); +EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82") + +static struct attribute *hsw_events_attrs[] = { + EVENT_PTR(mem_ld_hsw), + EVENT_PTR(mem_st_hsw), + NULL +}; + __init int intel_pmu_init(void) { union cpuid10_edx edx; @@ -2099,6 +2324,22 @@ __init int intel_pmu_init(void) pr_cont("Atom events, "); break; + case 55: /* Atom 22nm "Silvermont" */ + case 77: /* Avoton "Silvermont" */ + memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_atom(); + + x86_pmu.event_constraints = intel_slm_event_constraints; + x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; + x86_pmu.extra_regs = intel_slm_extra_regs; + x86_pmu.er_flags |= ERF_HAS_RSP_1; + pr_cont("Silvermont events, "); + break; + case 37: /* 32 nm nehalem, "Clarkdale" */ case 44: /* 32 nm nehalem, "Gulftown" */ case 47: /* 32 nm Xeon E7 */ @@ -2189,6 +2430,31 @@ __init int intel_pmu_init(void) break; + case 60: /* Haswell Client */ + case 70: + case 71: + case 63: + case 69: + x86_pmu.late_ack = true; + memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_snb(); + + x86_pmu.event_constraints = intel_hsw_event_constraints; + x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; + x86_pmu.extra_regs = intel_snb_extra_regs; + x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.er_flags |= ERF_HAS_RSP_1; + x86_pmu.er_flags |= ERF_NO_HT_SHARING; + + x86_pmu.hw_config = hsw_hw_config; + x86_pmu.get_event_constraints = hsw_get_event_constraints; + x86_pmu.cpu_events = hsw_events_attrs; + pr_cont("Haswell events, "); + break; + default: switch (x86_pmu.version) { case 1: @@ -2227,7 +2493,7 @@ __init int intel_pmu_init(void) * counter, so do not extend mask to generic counters */ for_each_event_constraint(c, x86_pmu.event_constraints) { - if (c->cmask != X86_RAW_EVENT_MASK + if (c->cmask != FIXED_EVENT_FLAGS || c->idxmsk64 == INTEL_PMC_MSK_FIXED_REF_CYCLES) { continue; } @@ -2237,5 +2503,12 @@ __init int intel_pmu_init(void) } } + /* Support full width counters using alternative MSR range */ + if (x86_pmu.intel_cap.full_width_write) { + x86_pmu.max_period = x86_pmu.cntval_mask; + x86_pmu.perfctr = MSR_IA32_PMC0; + pr_cont("full-width counters, "); + } + return 0; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 60250f687052..ab3ba1c1b7dd 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -107,6 +107,19 @@ static u64 precise_store_data(u64 status) return val; } +static u64 precise_store_data_hsw(u64 status) +{ + union perf_mem_data_src dse; + + dse.val = 0; + dse.mem_op = PERF_MEM_OP_STORE; + dse.mem_lvl = PERF_MEM_LVL_NA; + if (status & 1) + dse.mem_lvl = PERF_MEM_LVL_L1; + /* Nothing else supported. Sorry. */ + return dse.val; +} + static u64 load_latency_data(u64 status) { union intel_x86_pebs_dse dse; @@ -165,6 +178,22 @@ struct pebs_record_nhm { u64 status, dla, dse, lat; }; +/* + * Same as pebs_record_nhm, with two additional fields. + */ +struct pebs_record_hsw { + struct pebs_record_nhm nhm; + /* + * Real IP of the event. In the Intel documentation this + * is called eventingrip. + */ + u64 real_ip; + /* + * TSX tuning information field: abort cycles and abort flags. + */ + u64 tsx_tuning; +}; + void init_debug_store_on_cpu(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; @@ -195,7 +224,7 @@ static int alloc_pebs_buffer(int cpu) if (!x86_pmu.pebs) return 0; - buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); + buffer = kzalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL, node); if (unlikely(!buffer)) return -ENOMEM; @@ -233,7 +262,7 @@ static int alloc_bts_buffer(int cpu) if (!x86_pmu.bts) return 0; - buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); + buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL, node); if (unlikely(!buffer)) return -ENOMEM; @@ -266,7 +295,7 @@ static int alloc_ds_buffer(int cpu) int node = cpu_to_node(cpu); struct debug_store *ds; - ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node); + ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node); if (unlikely(!ds)) return -ENOMEM; @@ -488,6 +517,32 @@ struct event_constraint intel_atom_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_slm_pebs_event_constraints[] = { + INTEL_UEVENT_CONSTRAINT(0x0103, 0x1), /* REHABQ.LD_BLOCK_ST_FORWARD_PS */ + INTEL_UEVENT_CONSTRAINT(0x0803, 0x1), /* REHABQ.LD_SPLITS_PS */ + INTEL_UEVENT_CONSTRAINT(0x0204, 0x1), /* MEM_UOPS_RETIRED.L2_HIT_LOADS_PS */ + INTEL_UEVENT_CONSTRAINT(0x0404, 0x1), /* MEM_UOPS_RETIRED.L2_MISS_LOADS_PS */ + INTEL_UEVENT_CONSTRAINT(0x0804, 0x1), /* MEM_UOPS_RETIRED.DTLB_MISS_LOADS_PS */ + INTEL_UEVENT_CONSTRAINT(0x2004, 0x1), /* MEM_UOPS_RETIRED.HITM_PS */ + INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY_PS */ + INTEL_UEVENT_CONSTRAINT(0x00c4, 0x1), /* BR_INST_RETIRED.ALL_BRANCHES_PS */ + INTEL_UEVENT_CONSTRAINT(0x7ec4, 0x1), /* BR_INST_RETIRED.JCC_PS */ + INTEL_UEVENT_CONSTRAINT(0xbfc4, 0x1), /* BR_INST_RETIRED.FAR_BRANCH_PS */ + INTEL_UEVENT_CONSTRAINT(0xebc4, 0x1), /* BR_INST_RETIRED.NON_RETURN_IND_PS */ + INTEL_UEVENT_CONSTRAINT(0xf7c4, 0x1), /* BR_INST_RETIRED.RETURN_PS */ + INTEL_UEVENT_CONSTRAINT(0xf9c4, 0x1), /* BR_INST_RETIRED.CALL_PS */ + INTEL_UEVENT_CONSTRAINT(0xfbc4, 0x1), /* BR_INST_RETIRED.IND_CALL_PS */ + INTEL_UEVENT_CONSTRAINT(0xfdc4, 0x1), /* BR_INST_RETIRED.REL_CALL_PS */ + INTEL_UEVENT_CONSTRAINT(0xfec4, 0x1), /* BR_INST_RETIRED.TAKEN_JCC_PS */ + INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_MISP_RETIRED.ALL_BRANCHES_PS */ + INTEL_UEVENT_CONSTRAINT(0x7ec5, 0x1), /* BR_INST_MISP_RETIRED.JCC_PS */ + INTEL_UEVENT_CONSTRAINT(0xebc5, 0x1), /* BR_INST_MISP_RETIRED.NON_RETURN_IND_PS */ + INTEL_UEVENT_CONSTRAINT(0xf7c5, 0x1), /* BR_INST_MISP_RETIRED.RETURN_PS */ + INTEL_UEVENT_CONSTRAINT(0xfbc5, 0x1), /* BR_INST_MISP_RETIRED.IND_CALL_PS */ + INTEL_UEVENT_CONSTRAINT(0xfec5, 0x1), /* BR_INST_MISP_RETIRED.TAKEN_JCC_PS */ + EVENT_CONSTRAINT_END +}; + struct event_constraint intel_nehalem_pebs_event_constraints[] = { INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ @@ -529,6 +584,7 @@ struct event_constraint intel_snb_pebs_event_constraints[] = { INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */ EVENT_CONSTRAINT_END }; @@ -548,6 +604,42 @@ struct event_constraint intel_ivb_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_hsw_pebs_event_constraints[] = { + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_PST_HSW_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ + INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */ + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x01c5, 0xf), /* BR_MISP_RETIRED.CONDITIONAL */ + INTEL_UEVENT_CONSTRAINT(0x04c5, 0xf), /* BR_MISP_RETIRED.ALL_BRANCHES */ + INTEL_UEVENT_CONSTRAINT(0x20c5, 0xf), /* BR_MISP_RETIRED.NEAR_TAKEN */ + INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.* */ + /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */ + INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), + /* MEM_UOPS_RETIRED.STLB_MISS_STORES */ + INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), + INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */ + INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */ + /* MEM_UOPS_RETIRED.SPLIT_STORES */ + INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), + INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */ + INTEL_PST_HSW_CONSTRAINT(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */ + INTEL_UEVENT_CONSTRAINT(0x01d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L1_HIT */ + INTEL_UEVENT_CONSTRAINT(0x02d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L2_HIT */ + INTEL_UEVENT_CONSTRAINT(0x04d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L3_HIT */ + /* MEM_LOAD_UOPS_RETIRED.HIT_LFB */ + INTEL_UEVENT_CONSTRAINT(0x40d1, 0xf), + /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS */ + INTEL_UEVENT_CONSTRAINT(0x01d2, 0xf), + /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT */ + INTEL_UEVENT_CONSTRAINT(0x02d2, 0xf), + /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM */ + INTEL_UEVENT_CONSTRAINT(0x01d3, 0xf), + INTEL_UEVENT_CONSTRAINT(0x04c8, 0xf), /* HLE_RETIRED.Abort */ + INTEL_UEVENT_CONSTRAINT(0x04c9, 0xf), /* RTM_RETIRED.Abort */ + + EVENT_CONSTRAINT_END +}; + struct event_constraint *intel_pebs_constraints(struct perf_event *event) { struct event_constraint *c; @@ -588,6 +680,12 @@ void intel_pmu_pebs_disable(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; cpuc->pebs_enabled &= ~(1ULL << hwc->idx); + + if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_LDLAT) + cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32)); + else if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_ST) + cpuc->pebs_enabled &= ~(1ULL << 63); + if (cpuc->enabled) wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); @@ -697,6 +795,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, */ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct pebs_record_nhm *pebs = __pebs; + struct pebs_record_hsw *pebs_hsw = __pebs; struct perf_sample_data data; struct pt_regs regs; u64 sample_type; @@ -706,7 +805,8 @@ static void __intel_pmu_pebs_event(struct perf_event *event, return; fll = event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT; - fst = event->hw.flags & PERF_X86_EVENT_PEBS_ST; + fst = event->hw.flags & (PERF_X86_EVENT_PEBS_ST | + PERF_X86_EVENT_PEBS_ST_HSW); perf_sample_data_init(&data, 0, event->hw.last_period); @@ -717,9 +817,6 @@ static void __intel_pmu_pebs_event(struct perf_event *event, * if PEBS-LL or PreciseStore */ if (fll || fst) { - if (sample_type & PERF_SAMPLE_ADDR) - data.addr = pebs->dla; - /* * Use latency for weight (only avail with PEBS-LL) */ @@ -732,6 +829,9 @@ static void __intel_pmu_pebs_event(struct perf_event *event, if (sample_type & PERF_SAMPLE_DATA_SRC) { if (fll) data.data_src.val = load_latency_data(pebs->dse); + else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) + data.data_src.val = + precise_store_data_hsw(pebs->dse); else data.data_src.val = precise_store_data(pebs->dse); } @@ -753,11 +853,18 @@ static void __intel_pmu_pebs_event(struct perf_event *event, regs.bp = pebs->bp; regs.sp = pebs->sp; - if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(®s)) + if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) { + regs.ip = pebs_hsw->real_ip; + regs.flags |= PERF_EFLAGS_EXACT; + } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(®s)) regs.flags |= PERF_EFLAGS_EXACT; else regs.flags &= ~PERF_EFLAGS_EXACT; + if ((event->attr.sample_type & PERF_SAMPLE_ADDR) && + x86_pmu.intel_cap.pebs_format >= 1) + data.addr = pebs->dla; + if (has_branch_stack(event)) data.br_stack = &cpuc->lbr_stack; @@ -806,35 +913,22 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) __intel_pmu_pebs_event(event, iregs, at); } -static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) +static void __intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, void *at, + void *top) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct debug_store *ds = cpuc->ds; - struct pebs_record_nhm *at, *top; struct perf_event *event = NULL; u64 status = 0; - int bit, n; - - if (!x86_pmu.pebs_active) - return; - - at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; - top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index; + int bit; ds->pebs_index = ds->pebs_buffer_base; - n = top - at; - if (n <= 0) - return; + for (; at < top; at += x86_pmu.pebs_record_size) { + struct pebs_record_nhm *p = at; - /* - * Should not happen, we program the threshold at 1 and do not - * set a reset value. - */ - WARN_ONCE(n > x86_pmu.max_pebs_events, "Unexpected number of pebs records %d\n", n); - - for ( ; at < top; at++) { - for_each_set_bit(bit, (unsigned long *)&at->status, x86_pmu.max_pebs_events) { + for_each_set_bit(bit, (unsigned long *)&p->status, + x86_pmu.max_pebs_events) { event = cpuc->events[bit]; if (!test_bit(bit, cpuc->active_mask)) continue; @@ -857,6 +951,61 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) } } +static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct debug_store *ds = cpuc->ds; + struct pebs_record_nhm *at, *top; + int n; + + if (!x86_pmu.pebs_active) + return; + + at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; + top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index; + + ds->pebs_index = ds->pebs_buffer_base; + + n = top - at; + if (n <= 0) + return; + + /* + * Should not happen, we program the threshold at 1 and do not + * set a reset value. + */ + WARN_ONCE(n > x86_pmu.max_pebs_events, + "Unexpected number of pebs records %d\n", n); + + return __intel_pmu_drain_pebs_nhm(iregs, at, top); +} + +static void intel_pmu_drain_pebs_hsw(struct pt_regs *iregs) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct debug_store *ds = cpuc->ds; + struct pebs_record_hsw *at, *top; + int n; + + if (!x86_pmu.pebs_active) + return; + + at = (struct pebs_record_hsw *)(unsigned long)ds->pebs_buffer_base; + top = (struct pebs_record_hsw *)(unsigned long)ds->pebs_index; + + n = top - at; + if (n <= 0) + return; + /* + * Should not happen, we program the threshold at 1 and do not + * set a reset value. + */ + WARN_ONCE(n > x86_pmu.max_pebs_events, + "Unexpected number of pebs records %d\n", n); + + return __intel_pmu_drain_pebs_nhm(iregs, at, top); +} + /* * BTS, PEBS probe and setup */ @@ -888,6 +1037,12 @@ void intel_ds_init(void) x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; break; + case 2: + pr_cont("PEBS fmt2%c, ", pebs_type); + x86_pmu.pebs_record_size = sizeof(struct pebs_record_hsw); + x86_pmu.drain_pebs = intel_pmu_drain_pebs_hsw; + break; + default: printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type); x86_pmu.pebs = 0; diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index d978353c939b..d5be06a5005e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -12,6 +12,16 @@ enum { LBR_FORMAT_LIP = 0x01, LBR_FORMAT_EIP = 0x02, LBR_FORMAT_EIP_FLAGS = 0x03, + LBR_FORMAT_EIP_FLAGS2 = 0x04, + LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_EIP_FLAGS2, +}; + +static enum { + LBR_EIP_FLAGS = 1, + LBR_TSX = 2, +} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = { + [LBR_FORMAT_EIP_FLAGS] = LBR_EIP_FLAGS, + [LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX, }; /* @@ -56,6 +66,8 @@ enum { LBR_FAR) #define LBR_FROM_FLAG_MISPRED (1ULL << 63) +#define LBR_FROM_FLAG_IN_TX (1ULL << 62) +#define LBR_FROM_FLAG_ABORT (1ULL << 61) #define for_each_branch_sample_type(x) \ for ((x) = PERF_SAMPLE_BRANCH_USER; \ @@ -81,9 +93,13 @@ enum { X86_BR_JMP = 1 << 9, /* jump */ X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ X86_BR_IND_CALL = 1 << 11,/* indirect calls */ + X86_BR_ABORT = 1 << 12,/* transaction abort */ + X86_BR_IN_TX = 1 << 13,/* in transaction */ + X86_BR_NO_TX = 1 << 14,/* not in transaction */ }; #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) +#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX) #define X86_BR_ANY \ (X86_BR_CALL |\ @@ -95,6 +111,7 @@ enum { X86_BR_JCC |\ X86_BR_JMP |\ X86_BR_IRQ |\ + X86_BR_ABORT |\ X86_BR_IND_CALL) #define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) @@ -270,21 +287,31 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) for (i = 0; i < x86_pmu.lbr_nr; i++) { unsigned long lbr_idx = (tos - i) & mask; - u64 from, to, mis = 0, pred = 0; + u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0; + int skip = 0; + int lbr_flags = lbr_desc[lbr_format]; rdmsrl(x86_pmu.lbr_from + lbr_idx, from); rdmsrl(x86_pmu.lbr_to + lbr_idx, to); - if (lbr_format == LBR_FORMAT_EIP_FLAGS) { + if (lbr_flags & LBR_EIP_FLAGS) { mis = !!(from & LBR_FROM_FLAG_MISPRED); pred = !mis; - from = (u64)((((s64)from) << 1) >> 1); + skip = 1; + } + if (lbr_flags & LBR_TSX) { + in_tx = !!(from & LBR_FROM_FLAG_IN_TX); + abort = !!(from & LBR_FROM_FLAG_ABORT); + skip = 3; } + from = (u64)((((s64)from) << skip) >> skip); cpuc->lbr_entries[i].from = from; cpuc->lbr_entries[i].to = to; cpuc->lbr_entries[i].mispred = mis; cpuc->lbr_entries[i].predicted = pred; + cpuc->lbr_entries[i].in_tx = in_tx; + cpuc->lbr_entries[i].abort = abort; cpuc->lbr_entries[i].reserved = 0; } cpuc->lbr_stack.nr = i; @@ -310,7 +337,7 @@ void intel_pmu_lbr_read(void) * - in case there is no HW filter * - in case the HW filter has errata or limitations */ -static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event) +static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event) { u64 br_type = event->attr.branch_sample_type; int mask = 0; @@ -318,11 +345,8 @@ static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event) if (br_type & PERF_SAMPLE_BRANCH_USER) mask |= X86_BR_USER; - if (br_type & PERF_SAMPLE_BRANCH_KERNEL) { - if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; + if (br_type & PERF_SAMPLE_BRANCH_KERNEL) mask |= X86_BR_KERNEL; - } /* we ignore BRANCH_HV here */ @@ -337,13 +361,21 @@ static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event) if (br_type & PERF_SAMPLE_BRANCH_IND_CALL) mask |= X86_BR_IND_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_ABORT_TX) + mask |= X86_BR_ABORT; + + if (br_type & PERF_SAMPLE_BRANCH_IN_TX) + mask |= X86_BR_IN_TX; + + if (br_type & PERF_SAMPLE_BRANCH_NO_TX) + mask |= X86_BR_NO_TX; + /* * stash actual user request into reg, it may * be used by fixup code for some CPU */ event->hw.branch_reg.reg = mask; - - return 0; } /* @@ -391,9 +423,7 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event) /* * setup SW LBR filter */ - ret = intel_pmu_setup_sw_lbr_filter(event); - if (ret) - return ret; + intel_pmu_setup_sw_lbr_filter(event); /* * setup HW LBR filter, if any @@ -415,7 +445,7 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event) * decoded (e.g., text page not present), then X86_BR_NONE is * returned. */ -static int branch_type(unsigned long from, unsigned long to) +static int branch_type(unsigned long from, unsigned long to, int abort) { struct insn insn; void *addr; @@ -435,6 +465,9 @@ static int branch_type(unsigned long from, unsigned long to) if (from == 0 || to == 0) return X86_BR_NONE; + if (abort) + return X86_BR_ABORT | to_plm; + if (from_plm == X86_BR_USER) { /* * can happen if measuring at the user level only @@ -581,7 +614,13 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) from = cpuc->lbr_entries[i].from; to = cpuc->lbr_entries[i].to; - type = branch_type(from, to); + type = branch_type(from, to, cpuc->lbr_entries[i].abort); + if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) { + if (cpuc->lbr_entries[i].in_tx) + type |= X86_BR_IN_TX; + else + type |= X86_BR_NO_TX; + } /* if type does not correspond, then discard */ if (type == X86_BR_NONE || (br_sel & type) != type) { diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 52441a2af538..4118f9f68315 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -6,6 +6,8 @@ static struct intel_uncore_type **pci_uncores = empty_uncore; /* pci bus to socket mapping */ static int pcibus_to_physid[256] = { [0 ... 255] = -1, }; +static struct pci_dev *extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; + static DEFINE_RAW_SPINLOCK(uncore_box_lock); /* mask of cpus that collect uncore events */ @@ -45,6 +47,24 @@ DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7"); DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15"); DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23"); DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51"); +DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35"); +DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31"); +DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17"); +DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12"); +DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8"); +DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4"); +DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63"); +DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51"); +DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35"); +DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31"); +DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17"); +DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12"); +DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8"); +DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4"); +DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63"); static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) { @@ -281,7 +301,7 @@ static struct attribute *snbep_uncore_cbox_formats_attr[] = { }; static struct attribute *snbep_uncore_pcu_formats_attr[] = { - &format_attr_event.attr, + &format_attr_event_ext.attr, &format_attr_occ_sel.attr, &format_attr_edge.attr, &format_attr_inv.attr, @@ -301,6 +321,24 @@ static struct attribute *snbep_uncore_qpi_formats_attr[] = { &format_attr_edge.attr, &format_attr_inv.attr, &format_attr_thresh8.attr, + &format_attr_match_rds.attr, + &format_attr_match_rnid30.attr, + &format_attr_match_rnid4.attr, + &format_attr_match_dnid.attr, + &format_attr_match_mc.attr, + &format_attr_match_opc.attr, + &format_attr_match_vnw.attr, + &format_attr_match0.attr, + &format_attr_match1.attr, + &format_attr_mask_rds.attr, + &format_attr_mask_rnid30.attr, + &format_attr_mask_rnid4.attr, + &format_attr_mask_dnid.attr, + &format_attr_mask_mc.attr, + &format_attr_mask_opc.attr, + &format_attr_mask_vnw.attr, + &format_attr_mask0.attr, + &format_attr_mask1.attr, NULL, }; @@ -314,8 +352,8 @@ static struct uncore_event_desc snbep_uncore_imc_events[] = { static struct uncore_event_desc snbep_uncore_qpi_events[] = { INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"), INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"), - INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x02,umask=0x08"), - INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x03,umask=0x04"), + INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x102,umask=0x08"), + INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x103,umask=0x04"), { /* end: all zeroes */ }, }; @@ -356,13 +394,16 @@ static struct intel_uncore_ops snbep_uncore_msr_ops = { SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), }; +#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT() \ + .init_box = snbep_uncore_pci_init_box, \ + .disable_box = snbep_uncore_pci_disable_box, \ + .enable_box = snbep_uncore_pci_enable_box, \ + .disable_event = snbep_uncore_pci_disable_event, \ + .read_counter = snbep_uncore_pci_read_counter + static struct intel_uncore_ops snbep_uncore_pci_ops = { - .init_box = snbep_uncore_pci_init_box, - .disable_box = snbep_uncore_pci_disable_box, - .enable_box = snbep_uncore_pci_enable_box, - .disable_event = snbep_uncore_pci_disable_event, - .enable_event = snbep_uncore_pci_enable_event, - .read_counter = snbep_uncore_pci_read_counter, + SNBEP_UNCORE_PCI_OPS_COMMON_INIT(), + .enable_event = snbep_uncore_pci_enable_event, \ }; static struct event_constraint snbep_uncore_cbox_constraints[] = { @@ -536,7 +577,7 @@ __snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *eve if (!uncore_box_is_fake(box)) reg1->alloc |= alloc; - return 0; + return NULL; fail: for (; i >= 0; i--) { if (alloc & (0x1 << i)) @@ -644,7 +685,7 @@ snbep_pcu_get_constraint(struct intel_uncore_box *box, struct perf_event *event) (!uncore_box_is_fake(box) && reg1->alloc)) return NULL; again: - mask = 0xff << (idx * 8); + mask = 0xffULL << (idx * 8); raw_spin_lock_irqsave(&er->lock, flags); if (!__BITS_VALUE(atomic_read(&er->ref), idx, 8) || !((config1 ^ er->config) & mask)) { @@ -726,6 +767,61 @@ static struct intel_uncore_type *snbep_msr_uncores[] = { NULL, }; +enum { + SNBEP_PCI_QPI_PORT0_FILTER, + SNBEP_PCI_QPI_PORT1_FILTER, +}; + +static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) { + reg1->idx = 0; + reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0; + reg1->config = event->attr.config1; + reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0; + reg2->config = event->attr.config2; + } + return 0; +} + +static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER; + struct pci_dev *filter_pdev = extra_pci_dev[box->phys_id][idx]; + WARN_ON_ONCE(!filter_pdev); + if (filter_pdev) { + pci_write_config_dword(filter_pdev, reg1->reg, + (u32)reg1->config); + pci_write_config_dword(filter_pdev, reg1->reg + 4, + (u32)(reg1->config >> 32)); + pci_write_config_dword(filter_pdev, reg2->reg, + (u32)reg2->config); + pci_write_config_dword(filter_pdev, reg2->reg + 4, + (u32)(reg2->config >> 32)); + } + } + + pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static struct intel_uncore_ops snbep_uncore_qpi_ops = { + SNBEP_UNCORE_PCI_OPS_COMMON_INIT(), + .enable_event = snbep_qpi_enable_event, + .hw_config = snbep_qpi_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + #define SNBEP_UNCORE_PCI_COMMON_INIT() \ .perf_ctr = SNBEP_PCI_PMON_CTR0, \ .event_ctl = SNBEP_PCI_PMON_CTL0, \ @@ -755,17 +851,18 @@ static struct intel_uncore_type snbep_uncore_imc = { }; static struct intel_uncore_type snbep_uncore_qpi = { - .name = "qpi", - .num_counters = 4, - .num_boxes = 2, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCI_PMON_CTR0, - .event_ctl = SNBEP_PCI_PMON_CTL0, - .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .ops = &snbep_uncore_pci_ops, - .event_descs = snbep_uncore_qpi_events, - .format_group = &snbep_uncore_qpi_format_group, + .name = "qpi", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &snbep_uncore_qpi_ops, + .event_descs = snbep_uncore_qpi_events, + .format_group = &snbep_uncore_qpi_format_group, }; @@ -807,43 +904,53 @@ static struct intel_uncore_type *snbep_pci_uncores[] = { static DEFINE_PCI_DEVICE_TABLE(snbep_uncore_pci_ids) = { { /* Home Agent */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA), - .driver_data = SNBEP_PCI_UNCORE_HA, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0), }, { /* MC Channel 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0), - .driver_data = SNBEP_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0), }, { /* MC Channel 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1), - .driver_data = SNBEP_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1), }, { /* MC Channel 2 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2), - .driver_data = SNBEP_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2), }, { /* MC Channel 3 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3), - .driver_data = SNBEP_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3), }, { /* QPI Port 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0), - .driver_data = SNBEP_PCI_UNCORE_QPI, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0), }, { /* QPI Port 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1), - .driver_data = SNBEP_PCI_UNCORE_QPI, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1), }, { /* R2PCIe */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE), - .driver_data = SNBEP_PCI_UNCORE_R2PCIE, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0), }, { /* R3QPI Link 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0), - .driver_data = SNBEP_PCI_UNCORE_R3QPI, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0), }, { /* R3QPI Link 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1), - .driver_data = SNBEP_PCI_UNCORE_R3QPI, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT0_FILTER), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT1_FILTER), }, { /* end: all zeroes */ } }; @@ -1256,71 +1363,71 @@ static struct intel_uncore_type *ivt_pci_uncores[] = { static DEFINE_PCI_DEVICE_TABLE(ivt_uncore_pci_ids) = { { /* Home Agent 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30), - .driver_data = IVT_PCI_UNCORE_HA, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 0), }, { /* Home Agent 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38), - .driver_data = IVT_PCI_UNCORE_HA, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 1), }, { /* MC0 Channel 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 0), }, { /* MC0 Channel 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 1), }, { /* MC0 Channel 3 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 2), }, { /* MC0 Channel 4 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 3), }, { /* MC1 Channel 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 4), }, { /* MC1 Channel 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 5), }, { /* MC1 Channel 3 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 6), }, { /* MC1 Channel 4 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 7), }, { /* QPI0 Port 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32), - .driver_data = IVT_PCI_UNCORE_QPI, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 0), }, { /* QPI0 Port 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33), - .driver_data = IVT_PCI_UNCORE_QPI, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 1), }, { /* QPI1 Port 2 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a), - .driver_data = IVT_PCI_UNCORE_QPI, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 2), }, { /* R2PCIe */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34), - .driver_data = IVT_PCI_UNCORE_R2PCIE, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R2PCIE, 0), }, { /* R3QPI0 Link 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36), - .driver_data = IVT_PCI_UNCORE_R3QPI, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 0), }, { /* R3QPI0 Link 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37), - .driver_data = IVT_PCI_UNCORE_R3QPI, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 1), }, { /* R3QPI1 Link 2 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e), - .driver_data = IVT_PCI_UNCORE_R3QPI, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 2), }, { /* end: all zeroes */ } }; @@ -1923,7 +2030,7 @@ static u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modif { struct hw_perf_event *hwc = &event->hw; struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - int idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8); + u64 idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8); u64 config = reg1->config; /* get the non-shared control bits and shift them */ @@ -2599,14 +2706,14 @@ static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) box->hrtimer.function = uncore_pmu_hrtimer; } -struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int cpu) +static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int node) { struct intel_uncore_box *box; int i, size; size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg); - box = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu)); + box = kzalloc_node(size, GFP_KERNEL, node); if (!box) return NULL; @@ -2701,7 +2808,7 @@ uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *eve return c; } - if (event->hw.config == ~0ULL) + if (event->attr.config == UNCORE_FIXED_EVENT) return &constraint_fixed; if (type->constraints) { @@ -2723,15 +2830,16 @@ static void uncore_put_event_constraint(struct intel_uncore_box *box, struct per static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n) { unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; - struct event_constraint *c, *constraints[UNCORE_PMC_IDX_MAX]; + struct event_constraint *c; int i, wmin, wmax, ret = 0; struct hw_perf_event *hwc; bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX); for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) { + hwc = &box->event_list[i]->hw; c = uncore_get_event_constraint(box, box->event_list[i]); - constraints[i] = c; + hwc->constraint = c; wmin = min(wmin, c->weight); wmax = max(wmax, c->weight); } @@ -2739,7 +2847,7 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int /* fastpath, try to reuse previous register */ for (i = 0; i < n; i++) { hwc = &box->event_list[i]->hw; - c = constraints[i]; + c = hwc->constraint; /* never assigned */ if (hwc->idx == -1) @@ -2759,7 +2867,8 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int } /* slow path */ if (i != n) - ret = perf_assign_events(constraints, n, wmin, wmax, assign); + ret = perf_assign_events(box->event_list, n, + wmin, wmax, assign); if (!assign || ret) { for (i = 0; i < n; i++) @@ -2922,7 +3031,7 @@ static int uncore_validate_group(struct intel_uncore_pmu *pmu, struct intel_uncore_box *fake_box; int ret = -EINVAL, n; - fake_box = uncore_alloc_box(pmu->type, smp_processor_id()); + fake_box = uncore_alloc_box(pmu->type, NUMA_NO_NODE); if (!fake_box) return -ENOMEM; @@ -3003,7 +3112,9 @@ static int uncore_pmu_event_init(struct perf_event *event) */ if (pmu->type->single_fixed && pmu->pmu_idx > 0) return -EINVAL; - hwc->config = ~0ULL; + + /* fixed counters have event field hardcoded to zero */ + hwc->config = 0ULL; } else { hwc->config = event->attr.config & pmu->type->event_mask; if (pmu->type->ops->hw_config) { @@ -3165,17 +3276,25 @@ static bool pcidrv_registered; /* * add a pci uncore device */ -static int uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev) +static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct intel_uncore_pmu *pmu; struct intel_uncore_box *box; - int i, phys_id; + struct intel_uncore_type *type; + int phys_id; phys_id = pcibus_to_physid[pdev->bus->number]; if (phys_id < 0) return -ENODEV; - box = uncore_alloc_box(type, 0); + if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { + extra_pci_dev[phys_id][UNCORE_PCI_DEV_IDX(id->driver_data)] = pdev; + pci_set_drvdata(pdev, NULL); + return 0; + } + + type = pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)]; + box = uncore_alloc_box(type, NUMA_NO_NODE); if (!box) return -ENOMEM; @@ -3183,21 +3302,11 @@ static int uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev) * for performance monitoring unit with multiple boxes, * each box has a different function id. */ - for (i = 0; i < type->num_boxes; i++) { - pmu = &type->pmus[i]; - if (pmu->func_id == pdev->devfn) - break; - if (pmu->func_id < 0) { - pmu->func_id = pdev->devfn; - break; - } - pmu = NULL; - } - - if (!pmu) { - kfree(box); - return -EINVAL; - } + pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; + if (pmu->func_id < 0) + pmu->func_id = pdev->devfn; + else + WARN_ON_ONCE(pmu->func_id != pdev->devfn); box->phys_id = phys_id; box->pci_dev = pdev; @@ -3215,9 +3324,22 @@ static int uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev) static void uncore_pci_remove(struct pci_dev *pdev) { struct intel_uncore_box *box = pci_get_drvdata(pdev); - struct intel_uncore_pmu *pmu = box->pmu; - int cpu, phys_id = pcibus_to_physid[pdev->bus->number]; + struct intel_uncore_pmu *pmu; + int i, cpu, phys_id = pcibus_to_physid[pdev->bus->number]; + box = pci_get_drvdata(pdev); + if (!box) { + for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { + if (extra_pci_dev[phys_id][i] == pdev) { + extra_pci_dev[phys_id][i] = NULL; + break; + } + } + WARN_ON_ONCE(i >= UNCORE_EXTRA_PCI_DEV_MAX); + return; + } + + pmu = box->pmu; if (WARN_ON_ONCE(phys_id != box->phys_id)) return; @@ -3238,12 +3360,6 @@ static void uncore_pci_remove(struct pci_dev *pdev) kfree(box); } -static int uncore_pci_probe(struct pci_dev *pdev, - const struct pci_device_id *id) -{ - return uncore_pci_add(pci_uncores[id->driver_data], pdev); -} - static int __init uncore_pci_init(void) { int ret; @@ -3295,7 +3411,7 @@ static void __init uncore_pci_exit(void) /* CPU hot plug/unplug are serialized by cpu_add_remove_lock mutex */ static LIST_HEAD(boxes_to_free); -static void __cpuinit uncore_kfree_boxes(void) +static void uncore_kfree_boxes(void) { struct intel_uncore_box *box; @@ -3307,7 +3423,7 @@ static void __cpuinit uncore_kfree_boxes(void) } } -static void __cpuinit uncore_cpu_dying(int cpu) +static void uncore_cpu_dying(int cpu) { struct intel_uncore_type *type; struct intel_uncore_pmu *pmu; @@ -3326,7 +3442,7 @@ static void __cpuinit uncore_cpu_dying(int cpu) } } -static int __cpuinit uncore_cpu_starting(int cpu) +static int uncore_cpu_starting(int cpu) { struct intel_uncore_type *type; struct intel_uncore_pmu *pmu; @@ -3369,7 +3485,7 @@ static int __cpuinit uncore_cpu_starting(int cpu) return 0; } -static int __cpuinit uncore_cpu_prepare(int cpu, int phys_id) +static int uncore_cpu_prepare(int cpu, int phys_id) { struct intel_uncore_type *type; struct intel_uncore_pmu *pmu; @@ -3383,7 +3499,7 @@ static int __cpuinit uncore_cpu_prepare(int cpu, int phys_id) if (pmu->func_id < 0) pmu->func_id = j; - box = uncore_alloc_box(type, cpu); + box = uncore_alloc_box(type, cpu_to_node(cpu)); if (!box) return -ENOMEM; @@ -3395,7 +3511,7 @@ static int __cpuinit uncore_cpu_prepare(int cpu, int phys_id) return 0; } -static void __cpuinit +static void uncore_change_context(struct intel_uncore_type **uncores, int old_cpu, int new_cpu) { struct intel_uncore_type *type; @@ -3433,7 +3549,7 @@ uncore_change_context(struct intel_uncore_type **uncores, int old_cpu, int new_c } } -static void __cpuinit uncore_event_exit_cpu(int cpu) +static void uncore_event_exit_cpu(int cpu) { int i, phys_id, target; @@ -3461,7 +3577,7 @@ static void __cpuinit uncore_event_exit_cpu(int cpu) uncore_change_context(pci_uncores, cpu, target); } -static void __cpuinit uncore_event_init_cpu(int cpu) +static void uncore_event_init_cpu(int cpu) { int i, phys_id; @@ -3477,8 +3593,8 @@ static void __cpuinit uncore_event_init_cpu(int cpu) uncore_change_context(pci_uncores, -1, cpu); } -static int - __cpuinit uncore_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) +static int uncore_cpu_notifier(struct notifier_block *self, + unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; @@ -3518,7 +3634,7 @@ static int return NOTIFY_OK; } -static struct notifier_block uncore_cpu_nb __cpuinitdata = { +static struct notifier_block uncore_cpu_nb = { .notifier_call = uncore_cpu_notifier, /* * to migrate uncore events, our notifier should be executed diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index f9528917f6e8..a80ab71a883d 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -12,6 +12,15 @@ #define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC #define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1) +#define UNCORE_PCI_DEV_DATA(type, idx) ((type << 8) | idx) +#define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff) +#define UNCORE_PCI_DEV_IDX(data) (data & 0xff) +#define UNCORE_EXTRA_PCI_DEV 0xff +#define UNCORE_EXTRA_PCI_DEV_MAX 2 + +/* support up to 8 sockets */ +#define UNCORE_SOCKET_MAX 8 + #define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff) /* SNB event control */ @@ -108,6 +117,7 @@ (SNBEP_PMON_CTL_EV_SEL_MASK | \ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_EV_SEL_EXT | \ SNBEP_PMON_CTL_INVERT | \ SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ @@ -337,10 +347,10 @@ NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK) #define NHMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 11) - 1) | (1 << 23)) -#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7 << (11 + 3 * (n))) +#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (11 + 3 * (n))) #define WSMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 12) - 1) | (1 << 24)) -#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7 << (12 + 3 * (n))) +#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (12 + 3 * (n))) /* * use the 9~13 bits to select event If the 7th bit is not set, diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c index 7b3fe56b1c21..31f0f335ed22 100644 --- a/arch/x86/kernel/cpu/powerflags.c +++ b/arch/x86/kernel/cpu/powerflags.c @@ -11,10 +11,10 @@ const char *const x86_power_flags[32] = { "fid", /* frequency id control */ "vid", /* voltage id control */ "ttp", /* thermal trip */ - "tm", - "stc", - "100mhzsteps", - "hwpstate", + "tm", /* hardware thermal control */ + "stc", /* software thermal control */ + "100mhzsteps", /* 100 MHz multiplier control */ + "hwpstate", /* hardware P-state control */ "", /* tsc invariant mapped to constant_tsc */ "cpb", /* core performance boost */ "eff_freq_ro", /* Readonly aperf/mperf */ diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 37a198bd48c8..aee6317b902f 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -37,8 +37,8 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) static_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no", static_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no", static_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no", - c->hard_math ? "yes" : "no", - c->hard_math ? "yes" : "no", + static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", + static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", c->cpuid_level, c->wp_works_ok ? "yes" : "no"); } diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index feca286c2bb4..88db010845cb 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -52,7 +52,7 @@ static inline int rdrand_long(unsigned long *v) */ #define RESEED_LOOP ((512*128)/sizeof(unsigned long)) -void __cpuinit x86_init_rdrand(struct cpuinfo_x86 *c) +void x86_init_rdrand(struct cpuinfo_x86 *c) { #ifdef CONFIG_ARCH_RANDOM unsigned long tmp; diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index d92b5dad15dd..f2cc63e9cf08 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -24,13 +24,13 @@ enum cpuid_regs { CR_EBX }; -void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) +void init_scattered_cpuid_features(struct cpuinfo_x86 *c) { u32 max_level; u32 regs[4]; const struct cpuid_bit *cb; - static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { + static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_DTHERM, CR_EAX, 0, 0x00000006, 0 }, { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 4397e987a1cf..4c60eaf0571c 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -26,7 +26,7 @@ * exists, use it for populating initial_apicid and cpu topology * detection. */ -void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) +void detect_extended_topology(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP unsigned int eax, ebx, ecx, edx, sub_index; diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 28000743bbb0..aa0430d69b90 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -5,7 +5,7 @@ #include <asm/msr.h> #include "cpu.h" -static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c) +static void early_init_transmeta(struct cpuinfo_x86 *c) { u32 xlvl; @@ -17,7 +17,7 @@ static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c) } } -static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) +static void init_transmeta(struct cpuinfo_x86 *c) { unsigned int cap_mask, uk, max, dummy; unsigned int cms_rev1, cms_rev2; @@ -98,7 +98,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) #endif } -static const struct cpu_dev __cpuinitconst transmeta_cpu_dev = { +static const struct cpu_dev transmeta_cpu_dev = { .c_vendor = "Transmeta", .c_ident = { "GenuineTMx86", "TransmetaCPU" }, .c_early_init = early_init_transmeta, diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c index fd2c37bf7acb..202759a14121 100644 --- a/arch/x86/kernel/cpu/umc.c +++ b/arch/x86/kernel/cpu/umc.c @@ -8,7 +8,7 @@ * so no special init takes place. */ -static const struct cpu_dev __cpuinitconst umc_cpu_dev = { +static const struct cpu_dev umc_cpu_dev = { .c_vendor = "UMC", .c_ident = { "UMC UMC UMC" }, .c_models = { diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 03a36321ec54..628a059a9a06 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -93,7 +93,7 @@ static void __init vmware_platform_setup(void) * serial key should be enough, as this will always have a VMware * specific string when running under VMware hypervisor. */ -static bool __init vmware_platform(void) +static uint32_t __init vmware_platform(void) { if (cpu_has_hypervisor) { unsigned int eax; @@ -102,12 +102,12 @@ static bool __init vmware_platform(void) cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0], &hyper_vendor_id[1], &hyper_vendor_id[2]); if (!memcmp(hyper_vendor_id, "VMwareVMware", 12)) - return true; + return CPUID_VMWARE_INFO_LEAF; } else if (dmi_available && dmi_name_in_serial("VMware") && __vmware_platform()) - return true; + return 1; - return false; + return 0; } /* @@ -122,7 +122,7 @@ static bool __init vmware_platform(void) * so that the kernel could just trust the hypervisor with providing a * reliable virtual TSC that is suitable for timekeeping. */ -static void __cpuinit vmware_set_cpu_features(struct cpuinfo_x86 *c) +static void vmware_set_cpu_features(struct cpuinfo_x86 *c) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 1e4dbcfe6d31..7d9481c743f8 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -137,7 +137,7 @@ static const struct file_operations cpuid_fops = { .open = cpuid_open, }; -static __cpuinit int cpuid_device_create(int cpu) +static int cpuid_device_create(int cpu) { struct device *dev; @@ -151,9 +151,8 @@ static void cpuid_device_destroy(int cpu) device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); } -static int __cpuinit cpuid_class_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static int cpuid_class_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; int err = 0; diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 74467feb4dc5..e0e0841eef45 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -128,7 +128,9 @@ void native_machine_crash_shutdown(struct pt_regs *regs) cpu_emergency_svm_disable(); lapic_shutdown(); -#if defined(CONFIG_X86_IO_APIC) +#ifdef CONFIG_X86_IO_APIC + /* Prevent crash_kexec() from deadlocking on ioapic_lock. */ + ioapic_zap_locks(); disable_IO_APIC(); #endif #ifdef CONFIG_HPET_TIMER diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index b1581527a236..376dc7873447 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -52,8 +52,7 @@ void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align) } #ifdef CONFIG_BLK_DEV_INITRD -void __init early_init_dt_setup_initrd_arch(unsigned long start, - unsigned long end) +void __init early_init_dt_setup_initrd_arch(u64 start, u64 end) { initrd_start = (unsigned long)__va(start); initrd_end = (unsigned long)__va(end); @@ -133,7 +132,7 @@ static void x86_of_pci_irq_disable(struct pci_dev *dev) { } -void __cpuinit x86_of_pci_init(void) +void x86_of_pci_init(void) { pcibios_enable_irq = x86_of_pci_irq_enable; pcibios_disable_irq = x86_of_pci_irq_disable; @@ -364,9 +363,7 @@ static void dt_add_ioapic_domain(unsigned int ioapic_num, * and assigned so we can keep the 1:1 mapping which the ioapic * is having. */ - ret = irq_domain_associate_many(id, 0, 0, NR_IRQS_LEGACY); - if (ret) - pr_err("Error mapping legacy IRQs: %d\n", ret); + irq_domain_associate_many(id, 0, 0, NR_IRQS_LEGACY); if (num > NR_IRQS_LEGACY) { ret = irq_create_strict_mappings(id, NR_IRQS_LEGACY, diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault.c index 155a13f33ed8..5d3fe8d36e4a 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault.c @@ -9,6 +9,8 @@ #include <asm/processor.h> #include <asm/desc.h> +#ifdef CONFIG_X86_32 + #define DOUBLEFAULT_STACKSIZE (1024) static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE]; #define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE) @@ -67,3 +69,16 @@ struct tss_struct doublefault_tss __cacheline_aligned = { .__cr3 = __pa_nodebug(swapper_pg_dir), } }; + +/* dummy for do_double_fault() call */ +void df_debug(struct pt_regs *regs, long error_code) {} + +#else /* !CONFIG_X86_32 */ + +void df_debug(struct pt_regs *regs, long error_code) +{ + pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code); + show_regs(regs); + panic("Machine halted."); +} +#endif diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index d32abeabbda5..174da5fc5a7b 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -658,15 +658,18 @@ __init void e820_setup_gap(void) * boot_params.e820_map, others are passed via SETUP_E820_EXT node of * linked list of struct setup_data, which is parsed here. */ -void __init parse_e820_ext(struct setup_data *sdata) +void __init parse_e820_ext(u64 phys_addr, u32 data_len) { int entries; struct e820entry *extmap; + struct setup_data *sdata; + sdata = early_memremap(phys_addr, data_len); entries = sdata->len / sizeof(struct e820entry); extmap = (struct e820entry *)(sdata->data); __append_e820_map(extmap, entries); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + early_iounmap(sdata, data_len); printk(KERN_INFO "e820: extended physical RAM map:\n"); e820_print_map("extended"); } diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 94ab6b90dd3f..b3cd3ebae077 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -12,6 +12,7 @@ #include <linux/pci.h> #include <linux/acpi.h> #include <linux/pci_ids.h> +#include <drm/i915_drm.h> #include <asm/pci-direct.h> #include <asm/dma.h> #include <asm/io_apic.h> @@ -196,16 +197,175 @@ static void __init ati_bugs_contd(int num, int slot, int func) static void __init intel_remapping_check(int num, int slot, int func) { u8 revision; + u16 device; + device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID); revision = read_pci_config_byte(num, slot, func, PCI_REVISION_ID); /* - * Revision 0x13 of this chipset supports irq remapping - * but has an erratum that breaks its behavior, flag it as such + * Revision 13 of all triggering devices id in this quirk have + * a problem draining interrupts when irq remapping is enabled, + * and should be flagged as broken. Additionally revisions 0x12 + * and 0x22 of device id 0x3405 has this problem. */ if (revision == 0x13) set_irq_remapping_broken(); + else if ((device == 0x3405) && + ((revision == 0x12) || + (revision == 0x22))) + set_irq_remapping_broken(); + +} + +/* + * Systems with Intel graphics controllers set aside memory exclusively + * for gfx driver use. This memory is not marked in the E820 as reserved + * or as RAM, and so is subject to overlap from E820 manipulation later + * in the boot process. On some systems, MMIO space is allocated on top, + * despite the efforts of the "RAM buffer" approach, which simply rounds + * memory boundaries up to 64M to try to catch space that may decode + * as RAM and so is not suitable for MMIO. + * + * And yes, so far on current devices the base addr is always under 4G. + */ +static u32 __init intel_stolen_base(int num, int slot, int func) +{ + u32 base; + + /* + * For the PCI IDs in this quirk, the stolen base is always + * in 0x5c, aka the BDSM register (yes that's really what + * it's called). + */ + base = read_pci_config(num, slot, func, 0x5c); + base &= ~((1<<20) - 1); + + return base; +} + +#define KB(x) ((x) * 1024) +#define MB(x) (KB (KB (x))) +#define GB(x) (MB (KB (x))) + +static size_t __init gen3_stolen_size(int num, int slot, int func) +{ + size_t stolen_size; + u16 gmch_ctrl; + + gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL); + + switch (gmch_ctrl & I855_GMCH_GMS_MASK) { + case I855_GMCH_GMS_STOLEN_1M: + stolen_size = MB(1); + break; + case I855_GMCH_GMS_STOLEN_4M: + stolen_size = MB(4); + break; + case I855_GMCH_GMS_STOLEN_8M: + stolen_size = MB(8); + break; + case I855_GMCH_GMS_STOLEN_16M: + stolen_size = MB(16); + break; + case I855_GMCH_GMS_STOLEN_32M: + stolen_size = MB(32); + break; + case I915_GMCH_GMS_STOLEN_48M: + stolen_size = MB(48); + break; + case I915_GMCH_GMS_STOLEN_64M: + stolen_size = MB(64); + break; + case G33_GMCH_GMS_STOLEN_128M: + stolen_size = MB(128); + break; + case G33_GMCH_GMS_STOLEN_256M: + stolen_size = MB(256); + break; + case INTEL_GMCH_GMS_STOLEN_96M: + stolen_size = MB(96); + break; + case INTEL_GMCH_GMS_STOLEN_160M: + stolen_size = MB(160); + break; + case INTEL_GMCH_GMS_STOLEN_224M: + stolen_size = MB(224); + break; + case INTEL_GMCH_GMS_STOLEN_352M: + stolen_size = MB(352); + break; + default: + stolen_size = 0; + break; + } + + return stolen_size; +} + +static size_t __init gen6_stolen_size(int num, int slot, int func) +{ + u16 gmch_ctrl; + + gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); + gmch_ctrl >>= SNB_GMCH_GMS_SHIFT; + gmch_ctrl &= SNB_GMCH_GMS_MASK; + + return gmch_ctrl << 25; /* 32 MB units */ +} + +typedef size_t (*stolen_size_fn)(int num, int slot, int func); + +static struct pci_device_id intel_stolen_ids[] __initdata = { + INTEL_I915G_IDS(gen3_stolen_size), + INTEL_I915GM_IDS(gen3_stolen_size), + INTEL_I945G_IDS(gen3_stolen_size), + INTEL_I945GM_IDS(gen3_stolen_size), + INTEL_VLV_M_IDS(gen3_stolen_size), + INTEL_VLV_D_IDS(gen3_stolen_size), + INTEL_PINEVIEW_IDS(gen3_stolen_size), + INTEL_I965G_IDS(gen3_stolen_size), + INTEL_G33_IDS(gen3_stolen_size), + INTEL_I965GM_IDS(gen3_stolen_size), + INTEL_GM45_IDS(gen3_stolen_size), + INTEL_G45_IDS(gen3_stolen_size), + INTEL_IRONLAKE_D_IDS(gen3_stolen_size), + INTEL_IRONLAKE_M_IDS(gen3_stolen_size), + INTEL_SNB_D_IDS(gen6_stolen_size), + INTEL_SNB_M_IDS(gen6_stolen_size), + INTEL_IVB_M_IDS(gen6_stolen_size), + INTEL_IVB_D_IDS(gen6_stolen_size), + INTEL_HSW_D_IDS(gen6_stolen_size), + INTEL_HSW_M_IDS(gen6_stolen_size), +}; +static void __init intel_graphics_stolen(int num, int slot, int func) +{ + size_t size; + int i; + u32 start; + u16 device, subvendor, subdevice; + + device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID); + subvendor = read_pci_config_16(num, slot, func, + PCI_SUBSYSTEM_VENDOR_ID); + subdevice = read_pci_config_16(num, slot, func, PCI_SUBSYSTEM_ID); + + for (i = 0; i < ARRAY_SIZE(intel_stolen_ids); i++) { + if (intel_stolen_ids[i].device == device) { + stolen_size_fn stolen_size = + (stolen_size_fn)intel_stolen_ids[i].driver_data; + size = stolen_size(num, slot, func); + start = intel_stolen_base(num, slot, func); + if (size && start) { + /* Mark this space as reserved */ + e820_add_region(start, size, E820_RESERVED); + sanitize_e820_map(e820.map, + ARRAY_SIZE(e820.map), + &e820.nr_map); + } + return; + } + } } #define QFLAG_APPLY_ONCE 0x1 @@ -239,8 +399,12 @@ static struct chipset early_qrk[] __initdata = { PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd }, { PCI_VENDOR_ID_INTEL, 0x3403, PCI_CLASS_BRIDGE_HOST, PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, + { PCI_VENDOR_ID_INTEL, 0x3405, PCI_CLASS_BRIDGE_HOST, + PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, { PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST, PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, + { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID, + QFLAG_APPLY_ONCE, intel_graphics_stolen }, {} }; diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 8f3e2dec1df3..f0dcb0ceb6a2 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -801,7 +801,17 @@ ENTRY(name) \ CFI_ENDPROC; \ ENDPROC(name) -#define BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(name, nr, smp_##name) + +#ifdef CONFIG_TRACING +#define TRACE_BUILD_INTERRUPT(name, nr) \ + BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name) +#else +#define TRACE_BUILD_INTERRUPT(name, nr) +#endif + +#define BUILD_INTERRUPT(name, nr) \ + BUILD_INTERRUPT3(name, nr, smp_##name); \ + TRACE_BUILD_INTERRUPT(name, nr) /* The include is where all of the SMP etc. interrupts come from */ #include <asm/entry_arch.h> @@ -1166,6 +1176,9 @@ ftrace_restore_flags: #else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) + cmpl $__PAGE_OFFSET, %esp + jb ftrace_stub /* Paging not enabled yet? */ + cmpl $0, function_trace_stop jne ftrace_stub diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 727208941030..b077f4cc225a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -365,7 +365,7 @@ ENDPROC(native_usergs_sysret64) /*CFI_REL_OFFSET ss,0*/ pushq_cfi %rax /* rsp */ CFI_REL_OFFSET rsp,0 - pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */ + pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) /* eflags - interrupts on */ /*CFI_REL_OFFSET rflags,0*/ pushq_cfi $__KERNEL_CS /* cs */ /*CFI_REL_OFFSET cs,0*/ @@ -487,21 +487,6 @@ ENDPROC(native_usergs_sysret64) TRACE_IRQS_OFF .endm -ENTRY(save_rest) - PARTIAL_FRAME 1 (REST_SKIP+8) - movq 5*8+16(%rsp), %r11 /* save return address */ - movq_cfi rbx, RBX+16 - movq_cfi rbp, RBP+16 - movq_cfi r12, R12+16 - movq_cfi r13, R13+16 - movq_cfi r14, R14+16 - movq_cfi r15, R15+16 - movq %r11, 8(%rsp) /* return address */ - FIXUP_TOP_OF_STACK %r11, 16 - ret - CFI_ENDPROC -END(save_rest) - /* save complete stack frame */ .pushsection .kprobes.text, "ax" ENTRY(save_paranoid) @@ -1138,7 +1123,7 @@ END(common_interrupt) /* * APIC interrupts. */ -.macro apicinterrupt num sym do_sym +.macro apicinterrupt3 num sym do_sym ENTRY(\sym) INTR_FRAME ASM_CLAC @@ -1150,15 +1135,32 @@ ENTRY(\sym) END(\sym) .endm +#ifdef CONFIG_TRACING +#define trace(sym) trace_##sym +#define smp_trace(sym) smp_trace_##sym + +.macro trace_apicinterrupt num sym +apicinterrupt3 \num trace(\sym) smp_trace(\sym) +.endm +#else +.macro trace_apicinterrupt num sym do_sym +.endm +#endif + +.macro apicinterrupt num sym do_sym +apicinterrupt3 \num \sym \do_sym +trace_apicinterrupt \num \sym +.endm + #ifdef CONFIG_SMP -apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ +apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR \ irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt -apicinterrupt REBOOT_VECTOR \ +apicinterrupt3 REBOOT_VECTOR \ reboot_interrupt smp_reboot_interrupt #endif #ifdef CONFIG_X86_UV -apicinterrupt UV_BAU_MESSAGE \ +apicinterrupt3 UV_BAU_MESSAGE \ uv_bau_message_intr1 uv_bau_message_interrupt #endif apicinterrupt LOCAL_TIMER_VECTOR \ @@ -1167,14 +1169,19 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \ x86_platform_ipi smp_x86_platform_ipi #ifdef CONFIG_HAVE_KVM -apicinterrupt POSTED_INTR_VECTOR \ +apicinterrupt3 POSTED_INTR_VECTOR \ kvm_posted_intr_ipi smp_kvm_posted_intr_ipi #endif +#ifdef CONFIG_X86_MCE_THRESHOLD apicinterrupt THRESHOLD_APIC_VECTOR \ threshold_interrupt smp_threshold_interrupt +#endif + +#ifdef CONFIG_X86_THERMAL_VECTOR apicinterrupt THERMAL_APIC_VECTOR \ thermal_interrupt smp_thermal_interrupt +#endif #ifdef CONFIG_SMP apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ @@ -1451,13 +1458,13 @@ ENTRY(xen_failsafe_callback) CFI_ENDPROC END(xen_failsafe_callback) -apicinterrupt HYPERVISOR_CALLBACK_VECTOR \ +apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ xen_hvm_callback_vector xen_evtchn_do_upcall #endif /* CONFIG_XEN */ #if IS_ENABLED(CONFIG_HYPERV) -apicinterrupt HYPERVISOR_CALLBACK_VECTOR \ +apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ hyperv_callback_vector hyperv_vector_handler #endif /* CONFIG_HYPERV */ diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 138463a24877..06f87bece92a 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -29,7 +29,7 @@ static void __init i386_default_early_setup(void) reserve_ebda_region(); } -void __init i386_start_kernel(void) +asmlinkage void __init i386_start_kernel(void) { sanitize_boot_params(&boot_params); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 55b67614ed94..1be8e43b669e 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -137,7 +137,7 @@ static void __init copy_bootdata(char *real_mode_data) } } -void __init x86_64_start_kernel(char * real_mode_data) +asmlinkage void __init x86_64_start_kernel(char * real_mode_data) { int i; diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 73afd11799ca..81ba27679f18 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -292,7 +292,6 @@ ENDPROC(start_cpu0) * If cpu hotplug is not supported then this code can go in init section * which will be freed later */ -__CPUINIT ENTRY(startup_32_smp) cld movl $(__BOOT_DS),%eax @@ -410,6 +409,7 @@ enable_paging: /* * Check if it is 486 */ + movb $4,X86 # at least 486 cmpl $-1,X86_CPUID je is486 @@ -437,14 +437,12 @@ enable_paging: movl %edx,X86_CAPABILITY is486: - movb $4,X86 movl $0x50022,%ecx # set AM, WP, NE and MP movl %cr0,%eax andl $0x80000011,%eax # Save PG,PE,ET orl %ecx,%eax movl %eax,%cr0 - call check_x87 lgdt early_gdt_descr lidt idt_descr ljmp $(__KERNEL_CS),$1f @@ -467,26 +465,6 @@ is486: pushl $0 # fake return address for unwinder jmp *(initial_code) -/* - * We depend on ET to be correct. This checks for 287/387. - */ -check_x87: - movb $0,X86_HARD_MATH - clts - fninit - fstsw %ax - cmpb $0,%al - je 1f - movl %cr0,%eax /* no coprocessor: have to set bits */ - xorl $4,%eax /* set EM */ - movl %eax,%cr0 - ret - ALIGN -1: movb $1,X86_HARD_MATH - .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */ - ret - - #include "verify_cpu.S" /* diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 321d65ebaffe..e1aabdb314c8 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -512,15 +512,6 @@ ENTRY(phys_base) #include "../../x86/xen/xen-head.S" - .section .bss, "aw", @nobits - .align L1_CACHE_BYTES -ENTRY(idt_table) - .skip IDT_ENTRIES * 16 - - .align L1_CACHE_BYTES -ENTRY(nmi_idt_table) - .skip IDT_ENTRIES * 16 - __PAGE_ALIGNED_BSS NEXT_PAGE(empty_zero_page) .skip PAGE_SIZE diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 02f07634d265..f66ff162dce8 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -393,6 +393,9 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk) unregister_hw_breakpoint(t->ptrace_bps[i]); t->ptrace_bps[i] = NULL; } + + t->debugreg6 = 0; + t->ptrace_dr7 = 0; } void hw_breakpoint_restore(void) diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index cb339097b9ea..5d576ab34403 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -108,15 +108,15 @@ EXPORT_SYMBOL(unlazy_fpu); unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; unsigned int xstate_size; EXPORT_SYMBOL_GPL(xstate_size); -static struct i387_fxsave_struct fx_scratch __cpuinitdata; +static struct i387_fxsave_struct fx_scratch; -static void __cpuinit mxcsr_feature_mask_init(void) +static void mxcsr_feature_mask_init(void) { unsigned long mask = 0; if (cpu_has_fxsr) { memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct)); - asm volatile("fxsave %0" : : "m" (fx_scratch)); + asm volatile("fxsave %0" : "+m" (fx_scratch)); mask = fx_scratch.mxcsr_mask; if (mask == 0) mask = 0x0000ffbf; @@ -124,14 +124,14 @@ static void __cpuinit mxcsr_feature_mask_init(void) mxcsr_feature_mask &= mask; } -static void __cpuinit init_thread_xstate(void) +static void init_thread_xstate(void) { /* * Note that xstate_size might be overwriten later during * xsave_init(). */ - if (!HAVE_HWFP) { + if (!cpu_has_fpu) { /* * Disable xsave as we do not support it if i387 * emulation is enabled. @@ -153,11 +153,19 @@ static void __cpuinit init_thread_xstate(void) * into all processes. */ -void __cpuinit fpu_init(void) +void fpu_init(void) { unsigned long cr0; unsigned long cr4_mask = 0; +#ifndef CONFIG_MATH_EMULATION + if (!cpu_has_fpu) { + pr_emerg("No FPU found and no math emulation present\n"); + pr_emerg("Giving up\n"); + for (;;) + asm volatile("hlt"); + } +#endif if (cpu_has_fxsr) cr4_mask |= X86_CR4_OSFXSR; if (cpu_has_xmm) @@ -167,7 +175,7 @@ void __cpuinit fpu_init(void) cr0 = read_cr0(); cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ - if (!HAVE_HWFP) + if (!cpu_has_fpu) cr0 |= X86_CR0_EM; write_cr0(cr0); @@ -185,7 +193,7 @@ void __cpuinit fpu_init(void) void fpu_finit(struct fpu *fpu) { - if (!HAVE_HWFP) { + if (!cpu_has_fpu) { finit_soft_fpu(&fpu->state->soft); return; } @@ -214,7 +222,7 @@ int init_fpu(struct task_struct *tsk) int ret; if (tsk_used_math(tsk)) { - if (HAVE_HWFP && tsk == current) + if (cpu_has_fpu && tsk == current) unlazy_fpu(tsk); tsk->thread.fpu.last_cpu = ~0; return 0; @@ -511,14 +519,13 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; - if (!HAVE_HWFP) + if (!static_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); - if (!cpu_has_fxsr) { + if (!cpu_has_fxsr) return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &target->thread.fpu.state->fsave, 0, -1); - } sanitize_i387_state(target); @@ -545,13 +552,13 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, sanitize_i387_state(target); - if (!HAVE_HWFP) + if (!static_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); - if (!cpu_has_fxsr) { + if (!cpu_has_fxsr) return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu.state->fsave, 0, -1); - } + &target->thread.fpu.state->fsave, 0, + -1); if (pos > 0 || count < sizeof(env)) convert_from_fxsr(&env, target); @@ -592,3 +599,33 @@ int dump_fpu(struct pt_regs *regs, struct user_i387_struct *fpu) EXPORT_SYMBOL(dump_fpu); #endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ + +static int __init no_387(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_FPU); + return 1; +} + +__setup("no387", no_387); + +void fpu_detect(struct cpuinfo_x86 *c) +{ + unsigned long cr0; + u16 fsw, fcw; + + fsw = fcw = 0xffff; + + cr0 = read_cr0(); + cr0 &= ~(X86_CR0_TS | X86_CR0_EM); + write_cr0(cr0); + + asm volatile("fninit ; fnstsw %0 ; fnstcw %1" + : "+m" (fsw), "+m" (fcw)); + + if (fsw == 0 && (fcw & 0x103f) == 0x003f) + set_cpu_cap(c, X86_FEATURE_FPU); + else + clear_cpu_cap(c, X86_FEATURE_FPU); + + /* The final cr0 value is set in fpu_init() */ +} diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index ac0631d8996f..22d0687e7fda 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -18,6 +18,9 @@ #include <asm/mce.h> #include <asm/hw_irq.h> +#define CREATE_TRACE_POINTS +#include <asm/trace/irq_vectors.h> + atomic_t irq_err_count; /* Function pointer for generic interrupt vector handling */ @@ -174,7 +177,7 @@ u64 arch_irq_stat(void) * SMP cross-CPU interrupts have their own specific * handlers). */ -unsigned int __irq_entry do_IRQ(struct pt_regs *regs) +__visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -204,23 +207,21 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) /* * Handler for X86_PLATFORM_IPI_VECTOR. */ -void smp_x86_platform_ipi(struct pt_regs *regs) +void __smp_x86_platform_ipi(void) { - struct pt_regs *old_regs = set_irq_regs(regs); - - ack_APIC_irq(); - - irq_enter(); - - exit_idle(); - inc_irq_stat(x86_platform_ipis); if (x86_platform_ipi_callback) x86_platform_ipi_callback(); +} - irq_exit(); +__visible void smp_x86_platform_ipi(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + entering_ack_irq(); + __smp_x86_platform_ipi(); + exiting_irq(); set_irq_regs(old_regs); } @@ -228,7 +229,7 @@ void smp_x86_platform_ipi(struct pt_regs *regs) /* * Handler for POSTED_INTERRUPT_VECTOR. */ -void smp_kvm_posted_intr_ipi(struct pt_regs *regs) +__visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -246,6 +247,18 @@ void smp_kvm_posted_intr_ipi(struct pt_regs *regs) } #endif +__visible void smp_trace_x86_platform_ipi(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + entering_ack_irq(); + trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); + __smp_x86_platform_ipi(); + trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR); + exiting_irq(); + set_irq_regs(old_regs); +} + EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 344faf8d0d62..4186755f1d7c 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -119,7 +119,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) /* * allocate per-cpu stacks for hardirq and for softirq processing */ -void __cpuinit irq_ctx_init(int cpu) +void irq_ctx_init(int cpu) { union irq_ctx *irqctx; diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index ca8f703a1e70..1de84e3ab4e0 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -8,14 +8,34 @@ #include <linux/irq_work.h> #include <linux/hardirq.h> #include <asm/apic.h> +#include <asm/trace/irq_vectors.h> -void smp_irq_work_interrupt(struct pt_regs *regs) +static inline void irq_work_entering_irq(void) { irq_enter(); ack_APIC_irq(); +} + +static inline void __smp_irq_work_interrupt(void) +{ inc_irq_stat(apic_irq_work_irqs); irq_work_run(); - irq_exit(); +} + +__visible void smp_irq_work_interrupt(struct pt_regs *regs) +{ + irq_work_entering_irq(); + __smp_irq_work_interrupt(); + exiting_irq(); +} + +__visible void smp_trace_irq_work_interrupt(struct pt_regs *regs) +{ + irq_work_entering_irq(); + trace_irq_work_entry(IRQ_WORK_VECTOR); + __smp_irq_work_interrupt(); + trace_irq_work_exit(IRQ_WORK_VECTOR); + exiting_irq(); } void arch_irq_work_raise(void) diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 2889b3d43882..ee11b7dfbfbb 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -24,20 +24,71 @@ union jump_code_union { } __attribute__((packed)); }; +static void bug_at(unsigned char *ip, int line) +{ + /* + * The location is not an op that we were expecting. + * Something went wrong. Crash the box, as something could be + * corrupting the kernel. + */ + pr_warning("Unexpected op at %pS [%p] (%02x %02x %02x %02x %02x) %s:%d\n", + ip, ip, ip[0], ip[1], ip[2], ip[3], ip[4], __FILE__, line); + BUG(); +} + static void __jump_label_transform(struct jump_entry *entry, enum jump_label_type type, - void *(*poker)(void *, const void *, size_t)) + void *(*poker)(void *, const void *, size_t), + int init) { union jump_code_union code; + const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; if (type == JUMP_LABEL_ENABLE) { + /* + * We are enabling this jump label. If it is not a nop + * then something must have gone wrong. + */ + if (unlikely(memcmp((void *)entry->code, ideal_nop, 5) != 0)) + bug_at((void *)entry->code, __LINE__); + code.jump = 0xe9; code.offset = entry->target - (entry->code + JUMP_LABEL_NOP_SIZE); - } else + } else { + /* + * We are disabling this jump label. If it is not what + * we think it is, then something must have gone wrong. + * If this is the first initialization call, then we + * are converting the default nop to the ideal nop. + */ + if (init) { + const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; + if (unlikely(memcmp((void *)entry->code, default_nop, 5) != 0)) + bug_at((void *)entry->code, __LINE__); + } else { + code.jump = 0xe9; + code.offset = entry->target - + (entry->code + JUMP_LABEL_NOP_SIZE); + if (unlikely(memcmp((void *)entry->code, &code, 5) != 0)) + bug_at((void *)entry->code, __LINE__); + } memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE); + } - (*poker)((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE); + /* + * Make text_poke_bp() a default fallback poker. + * + * At the time the change is being done, just ignore whether we + * are doing nop -> jump or jump -> nop transition, and assume + * always nop being the 'currently valid' instruction + * + */ + if (poker) + (*poker)((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE); + else + text_poke_bp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE, + (void *)entry->code + JUMP_LABEL_NOP_SIZE); } void arch_jump_label_transform(struct jump_entry *entry, @@ -45,15 +96,38 @@ void arch_jump_label_transform(struct jump_entry *entry, { get_online_cpus(); mutex_lock(&text_mutex); - __jump_label_transform(entry, type, text_poke_smp); + __jump_label_transform(entry, type, NULL, 0); mutex_unlock(&text_mutex); put_online_cpus(); } +static enum { + JL_STATE_START, + JL_STATE_NO_UPDATE, + JL_STATE_UPDATE, +} jlstate __initdata_or_module = JL_STATE_START; + __init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type) { - __jump_label_transform(entry, type, text_poke_early); + /* + * This function is called at boot up and when modules are + * first loaded. Check if the default nop, the one that is + * inserted at compile time, is the ideal nop. If it is, then + * we do not need to update the nop, and we can leave it as is. + * If it is not, then we need to update the nop to the ideal nop. + */ + if (jlstate == JL_STATE_START) { + const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; + const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; + + if (memcmp(ideal_nop, default_nop, 5) != 0) + jlstate = JL_STATE_UPDATE; + else + jlstate = JL_STATE_NO_UPDATE; + } + if (jlstate == JL_STATE_UPDATE) + __jump_label_transform(entry, type, text_poke_early, 1); } #endif diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index 2e9d4b5af036..c6ee63f927ab 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -82,14 +82,9 @@ extern void synthesize_reljump(void *from, void *to); extern void synthesize_relcall(void *from, void *to); #ifdef CONFIG_OPTPROBES -extern int arch_init_optprobes(void); extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter); extern unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr); #else /* !CONFIG_OPTPROBES */ -static inline int arch_init_optprobes(void) -{ - return 0; -} static inline int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) { return 0; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 9895a9a41380..79a3f9682871 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -365,10 +365,14 @@ int __kprobes __copy_instruction(u8 *dest, u8 *src) return insn.length; } -static void __kprobes arch_copy_kprobe(struct kprobe *p) +static int __kprobes arch_copy_kprobe(struct kprobe *p) { + int ret; + /* Copy an instruction with recovering if other optprobe modifies it.*/ - __copy_instruction(p->ainsn.insn, p->addr); + ret = __copy_instruction(p->ainsn.insn, p->addr); + if (!ret) + return -EINVAL; /* * __copy_instruction can modify the displacement of the instruction, @@ -384,6 +388,8 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p) /* Also, displacement change doesn't affect the first byte */ p->opcode = p->ainsn.insn[0]; + + return 0; } int __kprobes arch_prepare_kprobe(struct kprobe *p) @@ -397,8 +403,8 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) p->ainsn.insn = get_insn_slot(); if (!p->ainsn.insn) return -ENOMEM; - arch_copy_kprobe(p); - return 0; + + return arch_copy_kprobe(p); } void __kprobes arch_arm_kprobe(struct kprobe *p) @@ -655,7 +661,7 @@ static void __used __kprobes kretprobe_trampoline_holder(void) /* * Called from kretprobe_trampoline */ -static __used __kprobes void *trampoline_handler(struct pt_regs *regs) +__visible __used __kprobes void *trampoline_handler(struct pt_regs *regs) { struct kretprobe_instance *ri = NULL; struct hlist_head *head, empty_rp; @@ -1062,7 +1068,7 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) int __init arch_init_kprobes(void) { - return arch_init_optprobes(); + return 0; } int __kprobes arch_trampoline_kprobe(struct kprobe *p) diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 76dc6f095724..898160b42e43 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -88,9 +88,7 @@ static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long v *(unsigned long *)addr = val; } -static void __used __kprobes kprobes_optinsn_template_holder(void) -{ - asm volatile ( +asm ( ".global optprobe_template_entry\n" "optprobe_template_entry:\n" #ifdef CONFIG_X86_64 @@ -129,7 +127,6 @@ static void __used __kprobes kprobes_optinsn_template_holder(void) #endif ".global optprobe_template_end\n" "optprobe_template_end:\n"); -} #define TMPL_MOVE_IDX \ ((long)&optprobe_template_val - (long)&optprobe_template_entry) @@ -371,31 +368,6 @@ int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) return 0; } -#define MAX_OPTIMIZE_PROBES 256 -static struct text_poke_param *jump_poke_params; -static struct jump_poke_buffer { - u8 buf[RELATIVEJUMP_SIZE]; -} *jump_poke_bufs; - -static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm, - u8 *insn_buf, - struct optimized_kprobe *op) -{ - s32 rel = (s32)((long)op->optinsn.insn - - ((long)op->kp.addr + RELATIVEJUMP_SIZE)); - - /* Backup instructions which will be replaced by jump address */ - memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, - RELATIVE_ADDR_SIZE); - - insn_buf[0] = RELATIVEJUMP_OPCODE; - *(s32 *)(&insn_buf[1]) = rel; - - tprm->addr = op->kp.addr; - tprm->opcode = insn_buf; - tprm->len = RELATIVEJUMP_SIZE; -} - /* * Replace breakpoints (int3) with relative jumps. * Caller must call with locking kprobe_mutex and text_mutex. @@ -403,37 +375,38 @@ static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm, void __kprobes arch_optimize_kprobes(struct list_head *oplist) { struct optimized_kprobe *op, *tmp; - int c = 0; + u8 insn_buf[RELATIVEJUMP_SIZE]; list_for_each_entry_safe(op, tmp, oplist, list) { + s32 rel = (s32)((long)op->optinsn.insn - + ((long)op->kp.addr + RELATIVEJUMP_SIZE)); + WARN_ON(kprobe_disabled(&op->kp)); - /* Setup param */ - setup_optimize_kprobe(&jump_poke_params[c], - jump_poke_bufs[c].buf, op); + + /* Backup instructions which will be replaced by jump address */ + memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, + RELATIVE_ADDR_SIZE); + + insn_buf[0] = RELATIVEJUMP_OPCODE; + *(s32 *)(&insn_buf[1]) = rel; + + text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE, + op->optinsn.insn); + list_del_init(&op->list); - if (++c >= MAX_OPTIMIZE_PROBES) - break; } - - /* - * text_poke_smp doesn't support NMI/MCE code modifying. - * However, since kprobes itself also doesn't support NMI/MCE - * code probing, it's not a problem. - */ - text_poke_smp_batch(jump_poke_params, c); } -static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm, - u8 *insn_buf, - struct optimized_kprobe *op) +/* Replace a relative jump with a breakpoint (int3). */ +void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) { + u8 insn_buf[RELATIVEJUMP_SIZE]; + /* Set int3 to first byte for kprobes */ insn_buf[0] = BREAKPOINT_INSTRUCTION; memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); - - tprm->addr = op->kp.addr; - tprm->opcode = insn_buf; - tprm->len = RELATIVEJUMP_SIZE; + text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE, + op->optinsn.insn); } /* @@ -444,34 +417,11 @@ extern void arch_unoptimize_kprobes(struct list_head *oplist, struct list_head *done_list) { struct optimized_kprobe *op, *tmp; - int c = 0; list_for_each_entry_safe(op, tmp, oplist, list) { - /* Setup param */ - setup_unoptimize_kprobe(&jump_poke_params[c], - jump_poke_bufs[c].buf, op); + arch_unoptimize_kprobe(op); list_move(&op->list, done_list); - if (++c >= MAX_OPTIMIZE_PROBES) - break; } - - /* - * text_poke_smp doesn't support NMI/MCE code modifying. - * However, since kprobes itself also doesn't support NMI/MCE - * code probing, it's not a problem. - */ - text_poke_smp_batch(jump_poke_params, c); -} - -/* Replace a relative jump with a breakpoint (int3). */ -void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) -{ - u8 buf[RELATIVEJUMP_SIZE]; - - /* Set int3 to first byte for kprobes */ - buf[0] = BREAKPOINT_INSTRUCTION; - memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); - text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE); } int __kprobes @@ -491,22 +441,3 @@ setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) } return 0; } - -int __kprobes arch_init_optprobes(void) -{ - /* Allocate code buffer and parameter array */ - jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) * - MAX_OPTIMIZE_PROBES, GFP_KERNEL); - if (!jump_poke_bufs) - return -ENOMEM; - - jump_poke_params = kmalloc(sizeof(struct text_poke_param) * - MAX_OPTIMIZE_PROBES, GFP_KERNEL); - if (!jump_poke_params) { - kfree(jump_poke_bufs); - jump_poke_bufs = NULL; - return -ENOMEM; - } - - return 0; -} diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index cd6d9a5a42f6..697b93af02dd 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -34,6 +34,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/kprobes.h> +#include <linux/debugfs.h> #include <asm/timer.h> #include <asm/cpu.h> #include <asm/traps.h> @@ -320,7 +321,7 @@ static void kvm_guest_apic_eoi_write(u32 reg, u32 val) apic_write(APIC_EOI, APIC_EOI_ACK); } -void __cpuinit kvm_guest_cpu_init(void) +void kvm_guest_cpu_init(void) { if (!kvm_para_available()) return; @@ -419,9 +420,10 @@ static void __init kvm_smp_prepare_boot_cpu(void) WARN_ON(kvm_register_clock("primary cpu clock")); kvm_guest_cpu_init(); native_smp_prepare_boot_cpu(); + kvm_spinlock_init(); } -static void __cpuinit kvm_guest_cpu_online(void *dummy) +static void kvm_guest_cpu_online(void *dummy) { kvm_guest_cpu_init(); } @@ -435,8 +437,8 @@ static void kvm_guest_cpu_offline(void *dummy) apf_task_wake_all(); } -static int __cpuinit kvm_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int kvm_cpu_notify(struct notifier_block *self, unsigned long action, + void *hcpu) { int cpu = (unsigned long)hcpu; switch (action) { @@ -455,7 +457,7 @@ static int __cpuinit kvm_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata kvm_cpu_notifier = { +static struct notifier_block kvm_cpu_notifier = { .notifier_call = kvm_cpu_notify, }; #endif @@ -498,11 +500,9 @@ void __init kvm_guest_init(void) #endif } -static bool __init kvm_detect(void) +static uint32_t __init kvm_detect(void) { - if (!kvm_para_available()) - return false; - return true; + return kvm_cpuid_base(); } const struct hypervisor_x86 x86_hyper_kvm __refconst = { @@ -523,3 +523,263 @@ static __init int activate_jump_labels(void) return 0; } arch_initcall(activate_jump_labels); + +#ifdef CONFIG_PARAVIRT_SPINLOCKS + +/* Kick a cpu by its apicid. Used to wake up a halted vcpu */ +static void kvm_kick_cpu(int cpu) +{ + int apicid; + unsigned long flags = 0; + + apicid = per_cpu(x86_cpu_to_apicid, cpu); + kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid); +} + +enum kvm_contention_stat { + TAKEN_SLOW, + TAKEN_SLOW_PICKUP, + RELEASED_SLOW, + RELEASED_SLOW_KICKED, + NR_CONTENTION_STATS +}; + +#ifdef CONFIG_KVM_DEBUG_FS +#define HISTO_BUCKETS 30 + +static struct kvm_spinlock_stats +{ + u32 contention_stats[NR_CONTENTION_STATS]; + u32 histo_spin_blocked[HISTO_BUCKETS+1]; + u64 time_blocked; +} spinlock_stats; + +static u8 zero_stats; + +static inline void check_zero(void) +{ + u8 ret; + u8 old; + + old = ACCESS_ONCE(zero_stats); + if (unlikely(old)) { + ret = cmpxchg(&zero_stats, old, 0); + /* This ensures only one fellow resets the stat */ + if (ret == old) + memset(&spinlock_stats, 0, sizeof(spinlock_stats)); + } +} + +static inline void add_stats(enum kvm_contention_stat var, u32 val) +{ + check_zero(); + spinlock_stats.contention_stats[var] += val; +} + + +static inline u64 spin_time_start(void) +{ + return sched_clock(); +} + +static void __spin_time_accum(u64 delta, u32 *array) +{ + unsigned index; + + index = ilog2(delta); + check_zero(); + + if (index < HISTO_BUCKETS) + array[index]++; + else + array[HISTO_BUCKETS]++; +} + +static inline void spin_time_accum_blocked(u64 start) +{ + u32 delta; + + delta = sched_clock() - start; + __spin_time_accum(delta, spinlock_stats.histo_spin_blocked); + spinlock_stats.time_blocked += delta; +} + +static struct dentry *d_spin_debug; +static struct dentry *d_kvm_debug; + +struct dentry *kvm_init_debugfs(void) +{ + d_kvm_debug = debugfs_create_dir("kvm", NULL); + if (!d_kvm_debug) + printk(KERN_WARNING "Could not create 'kvm' debugfs directory\n"); + + return d_kvm_debug; +} + +static int __init kvm_spinlock_debugfs(void) +{ + struct dentry *d_kvm; + + d_kvm = kvm_init_debugfs(); + if (d_kvm == NULL) + return -ENOMEM; + + d_spin_debug = debugfs_create_dir("spinlocks", d_kvm); + + debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); + + debugfs_create_u32("taken_slow", 0444, d_spin_debug, + &spinlock_stats.contention_stats[TAKEN_SLOW]); + debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug, + &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]); + + debugfs_create_u32("released_slow", 0444, d_spin_debug, + &spinlock_stats.contention_stats[RELEASED_SLOW]); + debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug, + &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]); + + debugfs_create_u64("time_blocked", 0444, d_spin_debug, + &spinlock_stats.time_blocked); + + debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, + spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); + + return 0; +} +fs_initcall(kvm_spinlock_debugfs); +#else /* !CONFIG_KVM_DEBUG_FS */ +static inline void add_stats(enum kvm_contention_stat var, u32 val) +{ +} + +static inline u64 spin_time_start(void) +{ + return 0; +} + +static inline void spin_time_accum_blocked(u64 start) +{ +} +#endif /* CONFIG_KVM_DEBUG_FS */ + +struct kvm_lock_waiting { + struct arch_spinlock *lock; + __ticket_t want; +}; + +/* cpus 'waiting' on a spinlock to become available */ +static cpumask_t waiting_cpus; + +/* Track spinlock on which a cpu is waiting */ +static DEFINE_PER_CPU(struct kvm_lock_waiting, klock_waiting); + +static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want) +{ + struct kvm_lock_waiting *w; + int cpu; + u64 start; + unsigned long flags; + + if (in_nmi()) + return; + + w = &__get_cpu_var(klock_waiting); + cpu = smp_processor_id(); + start = spin_time_start(); + + /* + * Make sure an interrupt handler can't upset things in a + * partially setup state. + */ + local_irq_save(flags); + + /* + * The ordering protocol on this is that the "lock" pointer + * may only be set non-NULL if the "want" ticket is correct. + * If we're updating "want", we must first clear "lock". + */ + w->lock = NULL; + smp_wmb(); + w->want = want; + smp_wmb(); + w->lock = lock; + + add_stats(TAKEN_SLOW, 1); + + /* + * This uses set_bit, which is atomic but we should not rely on its + * reordering gurantees. So barrier is needed after this call. + */ + cpumask_set_cpu(cpu, &waiting_cpus); + + barrier(); + + /* + * Mark entry to slowpath before doing the pickup test to make + * sure we don't deadlock with an unlocker. + */ + __ticket_enter_slowpath(lock); + + /* + * check again make sure it didn't become free while + * we weren't looking. + */ + if (ACCESS_ONCE(lock->tickets.head) == want) { + add_stats(TAKEN_SLOW_PICKUP, 1); + goto out; + } + + /* + * halt until it's our turn and kicked. Note that we do safe halt + * for irq enabled case to avoid hang when lock info is overwritten + * in irq spinlock slowpath and no spurious interrupt occur to save us. + */ + if (arch_irqs_disabled_flags(flags)) + halt(); + else + safe_halt(); + +out: + cpumask_clear_cpu(cpu, &waiting_cpus); + w->lock = NULL; + local_irq_restore(flags); + spin_time_accum_blocked(start); +} +PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning); + +/* Kick vcpu waiting on @lock->head to reach value @ticket */ +static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) +{ + int cpu; + + add_stats(RELEASED_SLOW, 1); + for_each_cpu(cpu, &waiting_cpus) { + const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu); + if (ACCESS_ONCE(w->lock) == lock && + ACCESS_ONCE(w->want) == ticket) { + add_stats(RELEASED_SLOW_KICKED, 1); + kvm_kick_cpu(cpu); + break; + } + } +} + +/* + * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. + */ +void __init kvm_spinlock_init(void) +{ + if (!kvm_para_available()) + return; + /* Does host kernel support KVM_FEATURE_PV_UNHALT? */ + if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) + return; + + printk(KERN_INFO "KVM setup paravirtual spinlock\n"); + + static_key_slow_inc(¶virt_ticketlocks_enabled); + + pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning); + pv_lock_ops.unlock_kick = kvm_unlock_kick; +} +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index d2c381280e3c..1570e0741344 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -48,10 +48,9 @@ static struct pvclock_wall_clock wall_clock; * have elapsed since the hypervisor wrote the data. So we try to account for * that with system time */ -static unsigned long kvm_get_wallclock(void) +static void kvm_get_wallclock(struct timespec *now) { struct pvclock_vcpu_time_info *vcpu_time; - struct timespec ts; int low, high; int cpu; @@ -64,14 +63,12 @@ static unsigned long kvm_get_wallclock(void) cpu = smp_processor_id(); vcpu_time = &hv_clock[cpu].pvti; - pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); + pvclock_read_wallclock(&wall_clock, vcpu_time, now); preempt_enable(); - - return ts.tv_sec; } -static int kvm_set_wallclock(unsigned long now) +static int kvm_set_wallclock(const struct timespec *now) { return -1; } @@ -185,7 +182,7 @@ static void kvm_restore_sched_clock_state(void) } #ifdef CONFIG_X86_LOCAL_APIC -static void __cpuinit kvm_setup_secondary_clock(void) +static void kvm_setup_secondary_clock(void) { /* * Now that the first cpu already had this clocksource initialized, @@ -242,6 +239,7 @@ void __init kvmclock_init(void) if (!mem) return; hv_clock = __va(mem); + memset(hv_clock, 0, size); if (kvm_register_clock("boot clock")) { hv_clock = NULL; diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index efdec7cd8e01..af99f71aeb7f 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -31,48 +31,12 @@ #include <asm/microcode.h> #include <asm/processor.h> #include <asm/msr.h> +#include <asm/microcode_amd.h> MODULE_DESCRIPTION("AMD Microcode Update Driver"); MODULE_AUTHOR("Peter Oruba"); MODULE_LICENSE("GPL v2"); -#define UCODE_MAGIC 0x00414d44 -#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 -#define UCODE_UCODE_TYPE 0x00000001 - -struct equiv_cpu_entry { - u32 installed_cpu; - u32 fixed_errata_mask; - u32 fixed_errata_compare; - u16 equiv_cpu; - u16 res; -} __attribute__((packed)); - -struct microcode_header_amd { - u32 data_code; - u32 patch_id; - u16 mc_patch_data_id; - u8 mc_patch_data_len; - u8 init_flag; - u32 mc_patch_data_checksum; - u32 nb_dev_id; - u32 sb_dev_id; - u16 processor_rev_id; - u8 nb_rev_id; - u8 sb_rev_id; - u8 bios_api_rev; - u8 reserved1[3]; - u32 match_reg[8]; -} __attribute__((packed)); - -struct microcode_amd { - struct microcode_header_amd hdr; - unsigned int mpb[0]; -}; - -#define SECTION_HDR_SIZE 8 -#define CONTAINER_HDR_SZ 12 - static struct equiv_cpu_entry *equiv_cpu_table; struct ucode_patch { @@ -84,21 +48,10 @@ struct ucode_patch { static LIST_HEAD(pcache); -static u16 find_equiv_id(unsigned int cpu) +static u16 __find_equiv_id(unsigned int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - int i = 0; - - if (!equiv_cpu_table) - return 0; - - while (equiv_cpu_table[i].installed_cpu != 0) { - if (uci->cpu_sig.sig == equiv_cpu_table[i].installed_cpu) - return equiv_cpu_table[i].equiv_cpu; - - i++; - } - return 0; + return find_equiv_id(equiv_cpu_table, uci->cpu_sig.sig); } static u32 find_cpu_family_by_equiv_cpu(u16 equiv_cpu) @@ -163,7 +116,7 @@ static struct ucode_patch *find_patch(unsigned int cpu) { u16 equiv_id; - equiv_id = find_equiv_id(cpu); + equiv_id = __find_equiv_id(cpu); if (!equiv_id) return NULL; @@ -173,18 +126,28 @@ static struct ucode_patch *find_patch(unsigned int cpu) static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct ucode_patch *p; csig->sig = cpuid_eax(0x00000001); csig->rev = c->microcode; + + /* + * a patch could have been loaded early, set uci->mc so that + * mc_bp_resume() can call apply_microcode() + */ + p = find_patch(cpu); + if (p && (p->patch_id == csig->rev)) + uci->mc = p->data; + pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev); return 0; } -static unsigned int verify_patch_size(int cpu, u32 patch_size, +static unsigned int verify_patch_size(u8 family, u32 patch_size, unsigned int size) { - struct cpuinfo_x86 *c = &cpu_data(cpu); u32 max_size; #define F1XH_MPB_MAX_SIZE 2048 @@ -192,7 +155,7 @@ static unsigned int verify_patch_size(int cpu, u32 patch_size, #define F15H_MPB_MAX_SIZE 4096 #define F16H_MPB_MAX_SIZE 3458 - switch (c->x86) { + switch (family) { case 0x14: max_size = F14H_MPB_MAX_SIZE; break; @@ -215,7 +178,21 @@ static unsigned int verify_patch_size(int cpu, u32 patch_size, return patch_size; } -static int apply_microcode_amd(int cpu) +int __apply_microcode_amd(struct microcode_amd *mc_amd) +{ + u32 rev, dummy; + + wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); + + /* verify patch application was successful */ + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + if (rev != mc_amd->hdr.patch_id) + return -1; + + return 0; +} + +int apply_microcode_amd(int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); struct microcode_amd *mc_amd; @@ -239,22 +216,20 @@ static int apply_microcode_amd(int cpu) /* need to apply patch? */ if (rev >= mc_amd->hdr.patch_id) { c->microcode = rev; + uci->cpu_sig.rev = rev; return 0; } - wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); - - /* verify patch application was successful */ - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - if (rev != mc_amd->hdr.patch_id) { + if (__apply_microcode_amd(mc_amd)) { pr_err("CPU%d: update failed for patch_level=0x%08x\n", - cpu, mc_amd->hdr.patch_id); + cpu, mc_amd->hdr.patch_id); return -1; } + pr_info("CPU%d: new patch_level=0x%08x\n", cpu, + mc_amd->hdr.patch_id); - pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev); - uci->cpu_sig.rev = rev; - c->microcode = rev; + uci->cpu_sig.rev = mc_amd->hdr.patch_id; + c->microcode = mc_amd->hdr.patch_id; return 0; } @@ -302,9 +277,8 @@ static void cleanup(void) * driver cannot continue functioning normally. In such cases, we tear * down everything we've used up so far and exit. */ -static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover) +static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover) { - struct cpuinfo_x86 *c = &cpu_data(cpu); struct microcode_header_amd *mc_hdr; struct ucode_patch *patch; unsigned int patch_size, crnt_size, ret; @@ -324,7 +298,7 @@ static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover) /* check if patch is for the current family */ proc_fam = ((proc_fam >> 8) & 0xf) + ((proc_fam >> 20) & 0xff); - if (proc_fam != c->x86) + if (proc_fam != family) return crnt_size; if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) { @@ -333,7 +307,7 @@ static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover) return crnt_size; } - ret = verify_patch_size(cpu, patch_size, leftover); + ret = verify_patch_size(family, patch_size, leftover); if (!ret) { pr_err("Patch-ID 0x%08x: size mismatch.\n", mc_hdr->patch_id); return crnt_size; @@ -364,7 +338,8 @@ static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover) return crnt_size; } -static enum ucode_state load_microcode_amd(int cpu, const u8 *data, size_t size) +static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, + size_t size) { enum ucode_state ret = UCODE_ERROR; unsigned int leftover; @@ -387,7 +362,7 @@ static enum ucode_state load_microcode_amd(int cpu, const u8 *data, size_t size) } while (leftover) { - crnt_size = verify_and_add_patch(cpu, fw, leftover); + crnt_size = verify_and_add_patch(family, fw, leftover); if (crnt_size < 0) return ret; @@ -398,6 +373,32 @@ static enum ucode_state load_microcode_amd(int cpu, const u8 *data, size_t size) return UCODE_OK; } +enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) +{ + enum ucode_state ret; + + /* free old equiv table */ + free_equiv_cpu_table(); + + ret = __load_microcode_amd(family, data, size); + + if (ret != UCODE_OK) + cleanup(); + +#if defined(CONFIG_MICROCODE_AMD_EARLY) && defined(CONFIG_X86_32) + /* save BSP's matching patch for early load */ + if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) { + struct ucode_patch *p = find_patch(smp_processor_id()); + if (p) { + memset(amd_bsp_mpb, 0, MPB_MAX_SIZE); + memcpy(amd_bsp_mpb, p->data, min_t(u32, ksize(p->data), + MPB_MAX_SIZE)); + } + } +#endif + return ret; +} + /* * AMD microcode firmware naming convention, up to family 15h they are in * the legacy file: @@ -440,12 +441,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, goto fw_release; } - /* free old equiv table */ - free_equiv_cpu_table(); - - ret = load_microcode_amd(cpu, fw->data, fw->size); - if (ret != UCODE_OK) - cleanup(); + ret = load_microcode_amd(c->x86, fw->data, fw->size); fw_release: release_firmware(fw); diff --git a/arch/x86/kernel/microcode_amd_early.c b/arch/x86/kernel/microcode_amd_early.c new file mode 100644 index 000000000000..6073104ccaa3 --- /dev/null +++ b/arch/x86/kernel/microcode_amd_early.c @@ -0,0 +1,301 @@ +/* + * Copyright (C) 2013 Advanced Micro Devices, Inc. + * + * Author: Jacob Shin <jacob.shin@amd.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/earlycpio.h> +#include <linux/initrd.h> + +#include <asm/cpu.h> +#include <asm/setup.h> +#include <asm/microcode_amd.h> + +static bool ucode_loaded; +static u32 ucode_new_rev; +static unsigned long ucode_offset; +static size_t ucode_size; + +/* + * Microcode patch container file is prepended to the initrd in cpio format. + * See Documentation/x86/early-microcode.txt + */ +static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin"; + +static struct cpio_data __init find_ucode_in_initrd(void) +{ + long offset = 0; + char *path; + void *start; + size_t size; + unsigned long *uoffset; + size_t *usize; + struct cpio_data cd; + +#ifdef CONFIG_X86_32 + struct boot_params *p; + + /* + * On 32-bit, early load occurs before paging is turned on so we need + * to use physical addresses. + */ + p = (struct boot_params *)__pa_nodebug(&boot_params); + path = (char *)__pa_nodebug(ucode_path); + start = (void *)p->hdr.ramdisk_image; + size = p->hdr.ramdisk_size; + uoffset = (unsigned long *)__pa_nodebug(&ucode_offset); + usize = (size_t *)__pa_nodebug(&ucode_size); +#else + path = ucode_path; + start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET); + size = boot_params.hdr.ramdisk_size; + uoffset = &ucode_offset; + usize = &ucode_size; +#endif + + cd = find_cpio_data(path, start, size, &offset); + if (!cd.data) + return cd; + + if (*(u32 *)cd.data != UCODE_MAGIC) { + cd.data = NULL; + cd.size = 0; + return cd; + } + + *uoffset = (u8 *)cd.data - (u8 *)start; + *usize = cd.size; + + return cd; +} + +/* + * Early load occurs before we can vmalloc(). So we look for the microcode + * patch container file in initrd, traverse equivalent cpu table, look for a + * matching microcode patch, and update, all in initrd memory in place. + * When vmalloc() is available for use later -- on 64-bit during first AP load, + * and on 32-bit during save_microcode_in_initrd_amd() -- we can call + * load_microcode_amd() to save equivalent cpu table and microcode patches in + * kernel heap memory. + */ +static void apply_ucode_in_initrd(void *ucode, size_t size) +{ + struct equiv_cpu_entry *eq; + u32 *header; + u8 *data; + u16 eq_id = 0; + int offset, left; + u32 rev, eax; + u32 *new_rev; + unsigned long *uoffset; + size_t *usize; + +#ifdef CONFIG_X86_32 + new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); + uoffset = (unsigned long *)__pa_nodebug(&ucode_offset); + usize = (size_t *)__pa_nodebug(&ucode_size); +#else + new_rev = &ucode_new_rev; + uoffset = &ucode_offset; + usize = &ucode_size; +#endif + + data = ucode; + left = size; + header = (u32 *)data; + + /* find equiv cpu table */ + + if (header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ + header[2] == 0) /* size */ + return; + + eax = cpuid_eax(0x00000001); + + while (left > 0) { + eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ); + + offset = header[2] + CONTAINER_HDR_SZ; + data += offset; + left -= offset; + + eq_id = find_equiv_id(eq, eax); + if (eq_id) + break; + + /* + * support multiple container files appended together. if this + * one does not have a matching equivalent cpu entry, we fast + * forward to the next container file. + */ + while (left > 0) { + header = (u32 *)data; + if (header[0] == UCODE_MAGIC && + header[1] == UCODE_EQUIV_CPU_TABLE_TYPE) + break; + + offset = header[1] + SECTION_HDR_SIZE; + data += offset; + left -= offset; + } + + /* mark where the next microcode container file starts */ + offset = data - (u8 *)ucode; + *uoffset += offset; + *usize -= offset; + ucode = data; + } + + if (!eq_id) { + *usize = 0; + return; + } + + /* find ucode and update if needed */ + + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); + + while (left > 0) { + struct microcode_amd *mc; + + header = (u32 *)data; + if (header[0] != UCODE_UCODE_TYPE || /* type */ + header[1] == 0) /* size */ + break; + + mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE); + if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) + if (__apply_microcode_amd(mc) == 0) { + rev = mc->hdr.patch_id; + *new_rev = rev; + } + + offset = header[1] + SECTION_HDR_SIZE; + data += offset; + left -= offset; + } + + /* mark where this microcode container file ends */ + offset = *usize - (data - (u8 *)ucode); + *usize -= offset; + + if (!(*new_rev)) + *usize = 0; +} + +void __init load_ucode_amd_bsp(void) +{ + struct cpio_data cd = find_ucode_in_initrd(); + if (!cd.data) + return; + + apply_ucode_in_initrd(cd.data, cd.size); +} + +#ifdef CONFIG_X86_32 +u8 amd_bsp_mpb[MPB_MAX_SIZE]; + +/* + * On 32-bit, since AP's early load occurs before paging is turned on, we + * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during + * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During + * save_microcode_in_initrd_amd() BSP's patch is copied to amd_bsp_mpb, which + * is used upon resume from suspend. + */ +void load_ucode_amd_ap(void) +{ + struct microcode_amd *mc; + unsigned long *initrd; + unsigned long *uoffset; + size_t *usize; + void *ucode; + + mc = (struct microcode_amd *)__pa(amd_bsp_mpb); + if (mc->hdr.patch_id && mc->hdr.processor_rev_id) { + __apply_microcode_amd(mc); + return; + } + + initrd = (unsigned long *)__pa(&initrd_start); + uoffset = (unsigned long *)__pa(&ucode_offset); + usize = (size_t *)__pa(&ucode_size); + + if (!*usize || !*initrd) + return; + + ucode = (void *)((unsigned long)__pa(*initrd) + *uoffset); + apply_ucode_in_initrd(ucode, *usize); +} + +static void __init collect_cpu_sig_on_bsp(void *arg) +{ + unsigned int cpu = smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + uci->cpu_sig.sig = cpuid_eax(0x00000001); +} +#else +void load_ucode_amd_ap(void) +{ + unsigned int cpu = smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + u32 rev, eax; + + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); + eax = cpuid_eax(0x00000001); + + uci->cpu_sig.rev = rev; + uci->cpu_sig.sig = eax; + + if (cpu && !ucode_loaded) { + void *ucode; + + if (!ucode_size || !initrd_start) + return; + + ucode = (void *)(initrd_start + ucode_offset); + eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); + if (load_microcode_amd(eax, ucode, ucode_size) != UCODE_OK) + return; + + ucode_loaded = true; + } + + apply_microcode_amd(cpu); +} +#endif + +int __init save_microcode_in_initrd_amd(void) +{ + enum ucode_state ret; + void *ucode; + u32 eax; + +#ifdef CONFIG_X86_32 + unsigned int bsp = boot_cpu_data.cpu_index; + struct ucode_cpu_info *uci = ucode_cpu_info + bsp; + + if (!uci->cpu_sig.sig) + smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1); +#endif + if (ucode_new_rev) + pr_info("microcode: updated early to new patch_level=0x%08x\n", + ucode_new_rev); + + if (ucode_loaded || !ucode_size || !initrd_start) + return 0; + + ucode = (void *)(initrd_start + ucode_offset); + eax = cpuid_eax(0x00000001); + eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); + + ret = load_microcode_amd(eax, ucode, ucode_size); + if (ret != UCODE_OK) + return -EINVAL; + + ucode_loaded = true; + return 0; +} diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 22db92bbdf1a..15c987698b0f 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -468,7 +468,7 @@ static struct syscore_ops mc_syscore_ops = { .resume = mc_bp_resume, }; -static __cpuinit int +static int mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; diff --git a/arch/x86/kernel/microcode_core_early.c b/arch/x86/kernel/microcode_core_early.c index 833d51d6ee06..be7f8514f577 100644 --- a/arch/x86/kernel/microcode_core_early.c +++ b/arch/x86/kernel/microcode_core_early.c @@ -18,6 +18,7 @@ */ #include <linux/module.h> #include <asm/microcode_intel.h> +#include <asm/microcode_amd.h> #include <asm/processor.h> #define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) @@ -40,7 +41,7 @@ * * x86_vendor() gets vendor information directly through cpuid. */ -static int __cpuinit x86_vendor(void) +static int x86_vendor(void) { u32 eax = 0x00000000; u32 ebx, ecx = 0, edx; @@ -56,7 +57,7 @@ static int __cpuinit x86_vendor(void) return X86_VENDOR_UNKNOWN; } -static int __cpuinit x86_family(void) +static int x86_family(void) { u32 eax = 0x00000001; u32 ebx, ecx = 0, edx; @@ -81,11 +82,21 @@ void __init load_ucode_bsp(void) vendor = x86_vendor(); x86 = x86_family(); - if (vendor == X86_VENDOR_INTEL && x86 >= 6) - load_ucode_intel_bsp(); + switch (vendor) { + case X86_VENDOR_INTEL: + if (x86 >= 6) + load_ucode_intel_bsp(); + break; + case X86_VENDOR_AMD: + if (x86 >= 0x10) + load_ucode_amd_bsp(); + break; + default: + break; + } } -void __cpuinit load_ucode_ap(void) +void load_ucode_ap(void) { int vendor, x86; @@ -95,6 +106,36 @@ void __cpuinit load_ucode_ap(void) vendor = x86_vendor(); x86 = x86_family(); - if (vendor == X86_VENDOR_INTEL && x86 >= 6) - load_ucode_intel_ap(); + switch (vendor) { + case X86_VENDOR_INTEL: + if (x86 >= 6) + load_ucode_intel_ap(); + break; + case X86_VENDOR_AMD: + if (x86 >= 0x10) + load_ucode_amd_ap(); + break; + default: + break; + } +} + +int __init save_microcode_in_initrd(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + if (c->x86 >= 6) + save_microcode_in_initrd_intel(); + break; + case X86_VENDOR_AMD: + if (c->x86 >= 0x10) + save_microcode_in_initrd_amd(); + break; + default: + break; + } + + return 0; } diff --git a/arch/x86/kernel/microcode_intel_early.c b/arch/x86/kernel/microcode_intel_early.c index 2e9e12871c2b..1575deb2e636 100644 --- a/arch/x86/kernel/microcode_intel_early.c +++ b/arch/x86/kernel/microcode_intel_early.c @@ -34,7 +34,7 @@ struct mc_saved_data { struct microcode_intel **mc_saved; } mc_saved_data; -static enum ucode_state __cpuinit +static enum ucode_state generic_load_microcode_early(struct microcode_intel **mc_saved_p, unsigned int mc_saved_count, struct ucode_cpu_info *uci) @@ -69,7 +69,7 @@ out: return state; } -static void __cpuinit +static void microcode_pointer(struct microcode_intel **mc_saved, unsigned long *mc_saved_in_initrd, unsigned long initrd_start, int mc_saved_count) @@ -82,7 +82,7 @@ microcode_pointer(struct microcode_intel **mc_saved, } #ifdef CONFIG_X86_32 -static void __cpuinit +static void microcode_phys(struct microcode_intel **mc_saved_tmp, struct mc_saved_data *mc_saved_data) { @@ -101,7 +101,7 @@ microcode_phys(struct microcode_intel **mc_saved_tmp, } #endif -static enum ucode_state __cpuinit +static enum ucode_state load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *mc_saved_in_initrd, unsigned long initrd_start, @@ -375,7 +375,7 @@ do { \ #define native_wrmsr(msr, low, high) \ native_write_msr(msr, low, high); -static int __cpuinit collect_cpu_info_early(struct ucode_cpu_info *uci) +static int collect_cpu_info_early(struct ucode_cpu_info *uci) { unsigned int val[2]; u8 x86, x86_model; @@ -529,7 +529,7 @@ int save_mc_for_early(u8 *mc) */ ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count); if (ret) { - pr_err("Can not save microcode patch.\n"); + pr_err("Cannot save microcode patch.\n"); goto out; } @@ -584,7 +584,7 @@ scan_microcode(unsigned long start, unsigned long end, /* * Print ucode update info. */ -static void __cpuinit +static void print_ucode_info(struct ucode_cpu_info *uci, unsigned int date) { int cpu = smp_processor_id(); @@ -605,7 +605,7 @@ static int current_mc_date; /* * Print early updated ucode info after printk works. This is delayed info dump. */ -void __cpuinit show_ucode_info_early(void) +void show_ucode_info_early(void) { struct ucode_cpu_info uci; @@ -621,7 +621,7 @@ void __cpuinit show_ucode_info_early(void) * mc_saved_data.mc_saved and delay printing microcode info in * show_ucode_info_early() until printk() works. */ -static void __cpuinit print_ucode(struct ucode_cpu_info *uci) +static void print_ucode(struct ucode_cpu_info *uci) { struct microcode_intel *mc_intel; int *delay_ucode_info_p; @@ -643,12 +643,12 @@ static void __cpuinit print_ucode(struct ucode_cpu_info *uci) * Flush global tlb. We only do this in x86_64 where paging has been enabled * already and PGE should be enabled as well. */ -static inline void __cpuinit flush_tlb_early(void) +static inline void flush_tlb_early(void) { __native_flush_tlb_global_irq_disabled(); } -static inline void __cpuinit print_ucode(struct ucode_cpu_info *uci) +static inline void print_ucode(struct ucode_cpu_info *uci) { struct microcode_intel *mc_intel; @@ -660,8 +660,8 @@ static inline void __cpuinit print_ucode(struct ucode_cpu_info *uci) } #endif -static int __cpuinit apply_microcode_early(struct mc_saved_data *mc_saved_data, - struct ucode_cpu_info *uci) +static int apply_microcode_early(struct mc_saved_data *mc_saved_data, + struct ucode_cpu_info *uci) { struct microcode_intel *mc_intel; unsigned int val[2]; @@ -699,7 +699,7 @@ static int __cpuinit apply_microcode_early(struct mc_saved_data *mc_saved_data, * This function converts microcode patch offsets previously stored in * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data. */ -int __init save_microcode_in_initrd(void) +int __init save_microcode_in_initrd_intel(void) { unsigned int count = mc_saved_data.mc_saved_count; struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; @@ -711,7 +711,7 @@ int __init save_microcode_in_initrd(void) microcode_pointer(mc_saved, mc_saved_in_initrd, initrd_start, count); ret = save_microcode(&mc_saved_data, mc_saved, count); if (ret) - pr_err("Can not save microcod patches from initrd"); + pr_err("Cannot save microcode patches from initrd.\n"); show_saved_mc(); @@ -763,7 +763,7 @@ load_ucode_intel_bsp(void) #endif } -void __cpuinit load_ucode_intel_ap(void) +void load_ucode_intel_ap(void) { struct mc_saved_data *mc_saved_data_p; struct ucode_cpu_info uci; diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index ac861b8348e2..f4c886d9165c 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -24,14 +24,14 @@ struct pci_hostbridge_probe { u32 device; }; -static u64 __cpuinitdata fam10h_pci_mmconf_base; +static u64 fam10h_pci_mmconf_base; -static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = { +static struct pci_hostbridge_probe pci_probes[] = { { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, }; -static int __cpuinit cmp_range(const void *x1, const void *x2) +static int cmp_range(const void *x1, const void *x2) { const struct range *r1 = x1; const struct range *r2 = x2; @@ -49,7 +49,7 @@ static int __cpuinit cmp_range(const void *x1, const void *x2) /* need to avoid (0xfd<<32), (0xfe<<32), and (0xff<<32), ht used space */ #define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32) #define BASE_VALID(b) ((b) + MMCONF_SIZE <= (0xfdULL<<32) || (b) >= (1ULL<<40)) -static void __cpuinit get_fam10h_pci_mmconf_base(void) +static void get_fam10h_pci_mmconf_base(void) { int i; unsigned bus; @@ -166,7 +166,7 @@ out: fam10h_pci_mmconf_base = base; } -void __cpuinit fam10h_check_enable_mmcfg(void) +void fam10h_check_enable_mmcfg(void) { u64 val; u32 address; @@ -230,7 +230,7 @@ static const struct dmi_system_id __initconst mmconf_dmi_table[] = { {} }; -/* Called from a __cpuinit function, but only on the BSP. */ +/* Called from a non __init function, but only on the BSP. */ void __ref check_enable_amd_mmconf_dmi(void) { dmi_check_system(mmconf_dmi_table); diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index ce130493b802..88458faea2f8 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -200,7 +200,7 @@ static const struct file_operations msr_fops = { .compat_ioctl = msr_ioctl, }; -static int __cpuinit msr_device_create(int cpu) +static int msr_device_create(int cpu) { struct device *dev; @@ -214,8 +214,8 @@ static void msr_device_destroy(int cpu) device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); } -static int __cpuinit msr_class_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int msr_class_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; int err = 0; diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 60308053fdb2..ba77ebc2c353 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -14,6 +14,7 @@ #include <linux/kprobes.h> #include <linux/kdebug.h> #include <linux/nmi.h> +#include <linux/debugfs.h> #include <linux/delay.h> #include <linux/hardirq.h> #include <linux/slab.h> @@ -29,6 +30,9 @@ #include <asm/nmi.h> #include <asm/x86_init.h> +#define CREATE_TRACE_POINTS +#include <trace/events/nmi.h> + struct nmi_desc { spinlock_t lock; struct list_head head; @@ -82,6 +86,15 @@ __setup("unknown_nmi_panic", setup_unknown_nmi_panic); #define nmi_to_desc(type) (&nmi_desc[type]) +static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC; +static int __init nmi_warning_debugfs(void) +{ + debugfs_create_u64("nmi_longest_ns", 0644, + arch_debugfs_dir, &nmi_longest_ns); + return 0; +} +fs_initcall(nmi_warning_debugfs); + static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) { struct nmi_desc *desc = nmi_to_desc(type); @@ -96,8 +109,28 @@ static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2 * can be latched at any given time. Walk the whole list * to handle those situations. */ - list_for_each_entry_rcu(a, &desc->head, list) - handled += a->handler(type, regs); + list_for_each_entry_rcu(a, &desc->head, list) { + u64 before, delta, whole_msecs; + int remainder_ns, decimal_msecs, thishandled; + + before = local_clock(); + thishandled = a->handler(type, regs); + handled += thishandled; + delta = local_clock() - before; + trace_nmi_handler(a->handler, (int)delta, thishandled); + + if (delta < nmi_longest_ns) + continue; + + nmi_longest_ns = delta; + whole_msecs = delta; + remainder_ns = do_div(whole_msecs, (1000 * 1000)); + decimal_msecs = remainder_ns / 1000; + printk_ratelimited(KERN_INFO + "INFO: NMI handler (%ps) took too long to run: " + "%lld.%03d msecs\n", a->handler, whole_msecs, + decimal_msecs); + } rcu_read_unlock(); diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 676b8c77a976..bbb6c7316341 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -4,25 +4,17 @@ */ #include <linux/spinlock.h> #include <linux/module.h> +#include <linux/jump_label.h> #include <asm/paravirt.h> -static inline void -default_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) -{ - arch_spin_lock(lock); -} - struct pv_lock_ops pv_lock_ops = { #ifdef CONFIG_SMP - .spin_is_locked = __ticket_spin_is_locked, - .spin_is_contended = __ticket_spin_is_contended, - - .spin_lock = __ticket_spin_lock, - .spin_lock_flags = default_spin_lock_flags, - .spin_trylock = __ticket_spin_trylock, - .spin_unlock = __ticket_spin_unlock, + .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop), + .unlock_kick = paravirt_nop, #endif }; EXPORT_SYMBOL(pv_lock_ops); +struct static_key paravirt_ticketlocks_enabled = STATIC_KEY_INIT_FALSE; +EXPORT_SYMBOL(paravirt_ticketlocks_enabled); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index cd6de64cc480..1b10af835c31 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -62,11 +62,6 @@ void __init default_banner(void) pv_info.name); } -/* Simple instruction patching code. */ -#define DEF_NATIVE(ops, name, code) \ - extern const char start_##ops##_##name[], end_##ops##_##name[]; \ - asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") - /* Undefined instruction for dealing with missing ops pointers. */ static const unsigned char ud2a[] = { 0x0f, 0x0b }; @@ -324,7 +319,7 @@ struct pv_time_ops pv_time_ops = { .steal_clock = native_steal_clock, }; -struct pv_irq_ops pv_irq_ops = { +__visible struct pv_irq_ops pv_irq_ops = { .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), @@ -336,7 +331,7 @@ struct pv_irq_ops pv_irq_ops = { #endif }; -struct pv_cpu_ops pv_cpu_ops = { +__visible struct pv_cpu_ops pv_cpu_ops = { .cpuid = native_cpuid, .get_debugreg = native_get_debugreg, .set_debugreg = native_set_debugreg, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 4e7a37ff03ab..c83516be1052 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -36,7 +36,7 @@ * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; +__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; #ifdef CONFIG_X86_64 static DEFINE_PER_CPU(unsigned char, is_idle); @@ -277,18 +277,6 @@ void exit_idle(void) } #endif -void arch_cpu_idle_prepare(void) -{ - /* - * If we're the non-boot CPU, nothing set the stack canary up - * for us. CPU0 already has it initialized but no harm in - * doing it again. This is a good place for updating it, as - * we wont ever return from this function (so the invalid - * canaries already on the stack wont ever trigger). - */ - boot_init_stack_canary(); -} - void arch_cpu_idle_enter(void) { local_touch_nmi(); @@ -410,7 +398,7 @@ static void amd_e400_idle(void) default_idle(); } -void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) +void select_idle_routine(const struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 7305f7dfc7ab..884f98f69354 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -110,11 +110,16 @@ void __show_regs(struct pt_regs *regs, int all) get_debugreg(d1, 1); get_debugreg(d2, 2); get_debugreg(d3, 3); - printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", - d0, d1, d2, d3); - get_debugreg(d6, 6); get_debugreg(d7, 7); + + /* Only print out debug registers if they are in their non-default state. */ + if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && + (d6 == DR6_RESERVED) && (d7 == 0x400)) + return; + + printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", + d0, d1, d2, d3); printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n", d6, d7); } @@ -147,7 +152,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, childregs->bp = arg; childregs->orig_ax = -1; childregs->cs = __KERNEL_CS | get_kernel_rpl(); - childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1; + childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; p->fpu_counter = 0; p->thread.io_bitmap_ptr = NULL; memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); @@ -242,7 +247,7 @@ EXPORT_SYMBOL_GPL(start_thread); * the task-switch, and shows up in ret_from_fork in entry.S, * for example. */ -__notrace_funcgraph struct task_struct * +__visible __notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 355ae06dbf94..bb1dc51bab05 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -52,7 +52,7 @@ asmlinkage extern void ret_from_fork(void); -DEFINE_PER_CPU(unsigned long, old_rsp); +asmlinkage DEFINE_PER_CPU(unsigned long, old_rsp); /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs *regs, int all) @@ -105,11 +105,18 @@ void __show_regs(struct pt_regs *regs, int all) get_debugreg(d0, 0); get_debugreg(d1, 1); get_debugreg(d2, 2); - printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); get_debugreg(d3, 3); get_debugreg(d6, 6); get_debugreg(d7, 7); + + /* Only print out debug registers if they are in their non-default state. */ + if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && + (d6 == DR6_RESERVED) && (d7 == 0x400)) + return; + + printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); + } void release_thread(struct task_struct *dead_task) @@ -176,7 +183,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, childregs->bp = arg; childregs->orig_ax = -1; childregs->cs = __KERNEL_CS | get_kernel_rpl(); - childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1; + childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; return 0; } *childregs = *current_pt_regs(); @@ -267,7 +274,7 @@ void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp) * Kprobes not supported here. Set the probe on schedule instead. * Function graph tracer not supported too. */ -__notrace_funcgraph struct task_struct * +__visible __notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 29a8120e6fe8..7461f50d5bb1 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -601,30 +601,48 @@ static unsigned long ptrace_get_dr7(struct perf_event *bp[]) return dr7; } -static int -ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, - struct task_struct *tsk, int disabled) +static int ptrace_fill_bp_fields(struct perf_event_attr *attr, + int len, int type, bool disabled) +{ + int err, bp_len, bp_type; + + err = arch_bp_generic_fields(len, type, &bp_len, &bp_type); + if (!err) { + attr->bp_len = bp_len; + attr->bp_type = bp_type; + attr->disabled = disabled; + } + + return err; +} + +static struct perf_event * +ptrace_register_breakpoint(struct task_struct *tsk, int len, int type, + unsigned long addr, bool disabled) { - int err; - int gen_len, gen_type; struct perf_event_attr attr; + int err; - /* - * We should have at least an inactive breakpoint at this - * slot. It means the user is writing dr7 without having - * written the address register first - */ - if (!bp) - return -EINVAL; + ptrace_breakpoint_init(&attr); + attr.bp_addr = addr; - err = arch_bp_generic_fields(len, type, &gen_len, &gen_type); + err = ptrace_fill_bp_fields(&attr, len, type, disabled); if (err) - return err; + return ERR_PTR(err); + + return register_user_hw_breakpoint(&attr, ptrace_triggered, + NULL, tsk); +} - attr = bp->attr; - attr.bp_len = gen_len; - attr.bp_type = gen_type; - attr.disabled = disabled; +static int ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, + int disabled) +{ + struct perf_event_attr attr = bp->attr; + int err; + + err = ptrace_fill_bp_fields(&attr, len, type, disabled); + if (err) + return err; return modify_user_hw_breakpoint(bp, &attr); } @@ -634,67 +652,50 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, */ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) { - struct thread_struct *thread = &(tsk->thread); + struct thread_struct *thread = &tsk->thread; unsigned long old_dr7; - int i, orig_ret = 0, rc = 0; - int enabled, second_pass = 0; - unsigned len, type; - struct perf_event *bp; - - if (ptrace_get_breakpoints(tsk) < 0) - return -ESRCH; + bool second_pass = false; + int i, rc, ret = 0; data &= ~DR_CONTROL_RESERVED; old_dr7 = ptrace_get_dr7(thread->ptrace_bps); + restore: - /* - * Loop through all the hardware breakpoints, making the - * appropriate changes to each. - */ + rc = 0; for (i = 0; i < HBP_NUM; i++) { - enabled = decode_dr7(data, i, &len, &type); - bp = thread->ptrace_bps[i]; - - if (!enabled) { - if (bp) { - /* - * Don't unregister the breakpoints right-away, - * unless all register_user_hw_breakpoint() - * requests have succeeded. This prevents - * any window of opportunity for debug - * register grabbing by other users. - */ - if (!second_pass) - continue; - - rc = ptrace_modify_breakpoint(bp, len, type, - tsk, 1); - if (rc) - break; + unsigned len, type; + bool disabled = !decode_dr7(data, i, &len, &type); + struct perf_event *bp = thread->ptrace_bps[i]; + + if (!bp) { + if (disabled) + continue; + + bp = ptrace_register_breakpoint(tsk, + len, type, 0, disabled); + if (IS_ERR(bp)) { + rc = PTR_ERR(bp); + break; } + + thread->ptrace_bps[i] = bp; continue; } - rc = ptrace_modify_breakpoint(bp, len, type, tsk, 0); + rc = ptrace_modify_breakpoint(bp, len, type, disabled); if (rc) break; } - /* - * Make a second pass to free the remaining unused breakpoints - * or to restore the original breakpoints if an error occurred. - */ - if (!second_pass) { - second_pass = 1; - if (rc < 0) { - orig_ret = rc; - data = old_dr7; - } + + /* Restore if the first pass failed, second_pass shouldn't fail. */ + if (rc && !WARN_ON(second_pass)) { + ret = rc; + data = old_dr7; + second_pass = true; goto restore; } - ptrace_put_breakpoints(tsk); - - return ((orig_ret < 0) ? orig_ret : rc); + return ret; } /* @@ -702,25 +703,17 @@ restore: */ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) { - struct thread_struct *thread = &(tsk->thread); + struct thread_struct *thread = &tsk->thread; unsigned long val = 0; if (n < HBP_NUM) { - struct perf_event *bp; + struct perf_event *bp = thread->ptrace_bps[n]; - if (ptrace_get_breakpoints(tsk) < 0) - return -ESRCH; - - bp = thread->ptrace_bps[n]; - if (!bp) - val = 0; - else + if (bp) val = bp->hw.info.address; - - ptrace_put_breakpoints(tsk); } else if (n == 6) { val = thread->debugreg6; - } else if (n == 7) { + } else if (n == 7) { val = thread->ptrace_dr7; } return val; @@ -729,29 +722,14 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, unsigned long addr) { - struct perf_event *bp; struct thread_struct *t = &tsk->thread; - struct perf_event_attr attr; + struct perf_event *bp = t->ptrace_bps[nr]; int err = 0; - if (ptrace_get_breakpoints(tsk) < 0) - return -ESRCH; - - if (!t->ptrace_bps[nr]) { - ptrace_breakpoint_init(&attr); - /* - * Put stub len and type to register (reserve) an inactive but - * correct bp - */ - attr.bp_addr = addr; - attr.bp_len = HW_BREAKPOINT_LEN_1; - attr.bp_type = HW_BREAKPOINT_W; - attr.disabled = 1; - - bp = register_user_hw_breakpoint(&attr, ptrace_triggered, - NULL, tsk); - + if (!bp) { /* + * Put stub len and type to create an inactive but correct bp. + * * CHECKME: the previous code returned -EIO if the addr wasn't * a valid task virtual addr. The new one will return -EINVAL in * this case. @@ -760,22 +738,20 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, * writing for the user. And anyway this is the previous * behaviour. */ - if (IS_ERR(bp)) { + bp = ptrace_register_breakpoint(tsk, + X86_BREAKPOINT_LEN_1, X86_BREAKPOINT_WRITE, + addr, true); + if (IS_ERR(bp)) err = PTR_ERR(bp); - goto put; - } - - t->ptrace_bps[nr] = bp; + else + t->ptrace_bps[nr] = bp; } else { - bp = t->ptrace_bps[nr]; + struct perf_event_attr attr = bp->attr; - attr = bp->attr; attr.bp_addr = addr; err = modify_user_hw_breakpoint(bp, &attr); } -put: - ptrace_put_breakpoints(tsk); return err; } @@ -785,30 +761,20 @@ put: static int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) { - struct thread_struct *thread = &(tsk->thread); - int rc = 0; - + struct thread_struct *thread = &tsk->thread; /* There are no DR4 or DR5 registers */ - if (n == 4 || n == 5) - return -EIO; + int rc = -EIO; - if (n == 6) { - thread->debugreg6 = val; - goto ret_path; - } if (n < HBP_NUM) { rc = ptrace_set_breakpoint_addr(tsk, n, val); - if (rc) - return rc; - } - /* All that's left is DR7 */ - if (n == 7) { + } else if (n == 6) { + thread->debugreg6 = val; + rc = 0; + } else if (n == 7) { rc = ptrace_write_dr7(tsk, val); if (!rc) thread->ptrace_dr7 = val; } - -ret_path: return rc; } diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 2cb9470ea85b..a16bae3f83b3 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -128,46 +128,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); } -static struct pvclock_vsyscall_time_info *pvclock_vdso_info; - -static struct pvclock_vsyscall_time_info * -pvclock_get_vsyscall_user_time_info(int cpu) -{ - if (!pvclock_vdso_info) { - BUG(); - return NULL; - } - - return &pvclock_vdso_info[cpu]; -} - -struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu) -{ - return &pvclock_get_vsyscall_user_time_info(cpu)->pvti; -} - #ifdef CONFIG_X86_64 -static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l, - void *v) -{ - struct task_migration_notifier *mn = v; - struct pvclock_vsyscall_time_info *pvti; - - pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu); - - /* this is NULL when pvclock vsyscall is not initialized */ - if (unlikely(pvti == NULL)) - return NOTIFY_DONE; - - pvti->migrate_count++; - - return NOTIFY_DONE; -} - -static struct notifier_block pvclock_migrate = { - .notifier_call = pvclock_task_migrate, -}; - /* * Initialize the generic pvclock vsyscall state. This will allocate * a/some page(s) for the per-vcpu pvclock information, set up a @@ -181,17 +142,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); - pvclock_vdso_info = i; - for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, __pa(i) + (idx*PAGE_SIZE), PAGE_KERNEL_VVAR); } - - register_task_migration_notifier(&pvclock_migrate); - return 0; } #endif diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 76fa1e9a2b39..e643e744e4d8 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -36,22 +36,6 @@ void (*pm_power_off)(void); EXPORT_SYMBOL(pm_power_off); static const struct desc_ptr no_idt = {}; -static int reboot_mode; -enum reboot_type reboot_type = BOOT_ACPI; -int reboot_force; - -/* - * This variable is used privately to keep track of whether or not - * reboot_type is still set to its default value (i.e., reboot= hasn't - * been set on the command line). This is needed so that we can - * suppress DMI scanning for reboot quirks. Without it, it's - * impossible to override a faulty reboot quirk without recompiling. - */ -static int reboot_default = 1; - -#ifdef CONFIG_SMP -static int reboot_cpu = -1; -#endif /* * This is set if we need to go through the 'emergency' path. @@ -64,79 +48,6 @@ static int reboot_emergency; bool port_cf9_safe = false; /* - * reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci] - * warm Don't set the cold reboot flag - * cold Set the cold reboot flag - * bios Reboot by jumping through the BIOS - * smp Reboot by executing reset on BSP or other CPU - * triple Force a triple fault (init) - * kbd Use the keyboard controller. cold reset (default) - * acpi Use the RESET_REG in the FADT - * efi Use efi reset_system runtime service - * pci Use the so-called "PCI reset register", CF9 - * force Avoid anything that could hang. - */ -static int __init reboot_setup(char *str) -{ - for (;;) { - /* - * Having anything passed on the command line via - * reboot= will cause us to disable DMI checking - * below. - */ - reboot_default = 0; - - switch (*str) { - case 'w': - reboot_mode = 0x1234; - break; - - case 'c': - reboot_mode = 0; - break; - -#ifdef CONFIG_SMP - case 's': - if (isdigit(*(str+1))) { - reboot_cpu = (int) (*(str+1) - '0'); - if (isdigit(*(str+2))) - reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0'); - } - /* - * We will leave sorting out the final value - * when we are ready to reboot, since we might not - * have detected BSP APIC ID or smp_num_cpu - */ - break; -#endif /* CONFIG_SMP */ - - case 'b': - case 'a': - case 'k': - case 't': - case 'e': - case 'p': - reboot_type = *str; - break; - - case 'f': - reboot_force = 1; - break; - } - - str = strchr(str, ','); - if (str) - str++; - else - break; - } - return 1; -} - -__setup("reboot=", reboot_setup); - - -/* * Reboot options and system auto-detection code provided by * Dell Inc. so their systems "just work". :-) */ @@ -441,12 +352,28 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { }, { /* Handle problems with rebooting on the Precision M6600. */ .callback = set_pci_reboot, - .ident = "Dell OptiPlex 990", + .ident = "Dell Precision M6600", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"), }, }, + { /* Handle problems with rebooting on the Dell PowerEdge C6100. */ + .callback = set_pci_reboot, + .ident = "Dell PowerEdge C6100", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "C6100"), + }, + }, + { /* Some C6100 machines were shipped with vendor being 'Dell'. */ + .callback = set_pci_reboot, + .ident = "Dell PowerEdge C6100", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell"), + DMI_MATCH(DMI_PRODUCT_NAME, "C6100"), + }, + }, { } }; @@ -536,6 +463,7 @@ static void native_machine_emergency_restart(void) int i; int attempt = 0; int orig_reboot_type = reboot_type; + unsigned short mode; if (reboot_emergency) emergency_vmx_disable_all(); @@ -543,7 +471,8 @@ static void native_machine_emergency_restart(void) tboot_shutdown(TB_SHUTDOWN_REBOOT); /* Tell the BIOS if we want cold or warm reboot */ - *((unsigned short *)__va(0x472)) = reboot_mode; + mode = reboot_mode == REBOOT_WARM ? 0x1234 : 0; + *((unsigned short *)__va(0x472)) = mode; for (;;) { /* Could also try the reset bit in the Hammer NB */ @@ -585,7 +514,7 @@ static void native_machine_emergency_restart(void) case BOOT_EFI: if (efi_enabled(EFI_RUNTIME_SERVICES)) - efi.reset_system(reboot_mode ? + efi.reset_system(reboot_mode == REBOOT_WARM ? EFI_RESET_WARM : EFI_RESET_COLD, EFI_SUCCESS, 0, NULL); @@ -614,26 +543,10 @@ void native_machine_shutdown(void) { /* Stop the cpus and apics */ #ifdef CONFIG_SMP - - /* The boot cpu is always logical cpu 0 */ - int reboot_cpu_id = 0; - - /* See if there has been given a command line override */ - if ((reboot_cpu != -1) && (reboot_cpu < nr_cpu_ids) && - cpu_online(reboot_cpu)) - reboot_cpu_id = reboot_cpu; - - /* Make certain the cpu I'm about to reboot on is online */ - if (!cpu_online(reboot_cpu_id)) - reboot_cpu_id = smp_processor_id(); - - /* Make certain I only run on the appropriate processor */ - set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id)); - /* - * O.K Now that I'm on the appropriate processor, stop all of the - * others. Also disable the local irq to not receive the per-cpu - * timer interrupt which may trigger scheduler's load balance. + * Stop all of the others. Also disable the local irq to + * not receive the per-cpu timer interrupt which may trigger + * scheduler's load balance. */ local_irq_disable(); stop_other_cpus(); diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index 36818f8ec2be..e13f8e7c22a6 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -186,7 +186,7 @@ identity_mapped: movl CP_PA_PGD(%ebx), %eax movl %eax, %cr3 movl %cr0, %eax - orl $(1<<31), %eax + orl $X86_CR0_PG, %eax movl %eax, %cr0 lea PAGE_SIZE(%edi), %esp movl %edi, %eax diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 7a6f3b3be3cf..3fd2c693e475 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -151,21 +151,21 @@ identity_mapped: testq %r11, %r11 jnz 1f - xorq %rax, %rax - xorq %rbx, %rbx - xorq %rcx, %rcx - xorq %rdx, %rdx - xorq %rsi, %rsi - xorq %rdi, %rdi - xorq %rbp, %rbp - xorq %r8, %r8 - xorq %r9, %r9 - xorq %r10, %r9 - xorq %r11, %r11 - xorq %r12, %r12 - xorq %r13, %r13 - xorq %r14, %r14 - xorq %r15, %r15 + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %esi, %esi + xorl %edi, %edi + xorl %ebp, %ebp + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + xorl %r15d, %r15d ret @@ -212,8 +212,8 @@ virtual_mapped: /* Do the copies */ swap_pages: movq %rdi, %rcx /* Put the page_list in %rcx */ - xorq %rdi, %rdi - xorq %rsi, %rsi + xorl %edi, %edi + xorl %esi, %esi jmp 1f 0: /* top, read another word for the indirection page */ diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 198eb201ed3b..0aa29394ed6f 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -38,8 +38,9 @@ EXPORT_SYMBOL(rtc_lock); * jump to the next second precisely 500 ms later. Check the Motorola * MC146818A or Dallas DS12887 data sheet for details. */ -int mach_set_rtc_mmss(unsigned long nowtime) +int mach_set_rtc_mmss(const struct timespec *now) { + unsigned long nowtime = now->tv_sec; struct rtc_time tm; int retval = 0; @@ -58,7 +59,7 @@ int mach_set_rtc_mmss(unsigned long nowtime) return retval; } -unsigned long mach_get_cmos_time(void) +void mach_get_cmos_time(struct timespec *now) { unsigned int status, year, mon, day, hour, min, sec, century = 0; unsigned long flags; @@ -107,7 +108,8 @@ unsigned long mach_get_cmos_time(void) } else year += CMOS_YEARS_OFFS; - return mktime(year, mon, day, hour, min, sec); + now->tv_sec = mktime(year, mon, day, hour, min, sec); + now->tv_nsec = 0; } /* Routines for accessing the CMOS RAM/RTC. */ @@ -135,18 +137,13 @@ EXPORT_SYMBOL(rtc_cmos_write); int update_persistent_clock(struct timespec now) { - return x86_platform.set_wallclock(now.tv_sec); + return x86_platform.set_wallclock(&now); } /* not static: needed by APM */ void read_persistent_clock(struct timespec *ts) { - unsigned long retval; - - retval = x86_platform.get_wallclock(); - - ts->tv_sec = retval; - ts->tv_nsec = 0; + x86_platform.get_wallclock(ts); } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 56f7fcfe7fa2..f0de6294b955 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -170,7 +170,7 @@ static struct resource bss_resource = { #ifdef CONFIG_X86_32 /* cpu data as detected by the assembly code in head.S */ -struct cpuinfo_x86 new_cpu_data __cpuinitdata = { +struct cpuinfo_x86 new_cpu_data = { .wp_works_ok = -1, }; /* common cpu data for all cpus */ @@ -206,9 +206,9 @@ EXPORT_SYMBOL(boot_cpu_data); #if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) -unsigned long mmu_cr4_features; +__visible unsigned long mmu_cr4_features; #else -unsigned long mmu_cr4_features = X86_CR4_PAE; +__visible unsigned long mmu_cr4_features = X86_CR4_PAE; #endif /* Boot loader ID and version as integers, for the benefit of proc_dointvec */ @@ -426,25 +426,23 @@ static void __init reserve_initrd(void) static void __init parse_setup_data(void) { struct setup_data *data; - u64 pa_data; + u64 pa_data, pa_next; pa_data = boot_params.hdr.setup_data; while (pa_data) { - u32 data_len, map_len; + u32 data_len, map_len, data_type; map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK), (u64)sizeof(struct setup_data)); data = early_memremap(pa_data, map_len); data_len = data->len + sizeof(struct setup_data); - if (data_len > map_len) { - early_iounmap(data, map_len); - data = early_memremap(pa_data, data_len); - map_len = data_len; - } + data_type = data->type; + pa_next = data->next; + early_iounmap(data, map_len); - switch (data->type) { + switch (data_type) { case SETUP_E820_EXT: - parse_e820_ext(data); + parse_e820_ext(pa_data, data_len); break; case SETUP_DTB: add_dtb(pa_data); @@ -452,8 +450,7 @@ static void __init parse_setup_data(void) default: break; } - pa_data = data->next; - early_iounmap(data, map_len); + pa_data = pa_next; } } @@ -1040,8 +1037,6 @@ void __init setup_arch(char **cmdline_p) /* max_low_pfn get updated here */ find_low_pfn_range(); #else - num_physpages = max_pfn; - check_x2apic(); /* How many end-of-memory variables you have, grandma! */ @@ -1072,7 +1067,7 @@ void __init setup_arch(char **cmdline_p) cleanup_highmap(); - memblock.current_limit = ISA_END_ADDRESS; + memblock_set_current_limit(ISA_END_ADDRESS); memblock_x86_fill(); /* @@ -1105,7 +1100,7 @@ void __init setup_arch(char **cmdline_p) setup_real_mode(); - memblock.current_limit = get_max_mapped(); + memblock_set_current_limit(get_max_mapped()); dma_contiguous_reserve(0); /* diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 69562992e457..9e5de6813e1f 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -43,12 +43,6 @@ #include <asm/sigframe.h> -#ifdef CONFIG_X86_32 -# define FIX_EFLAGS (__FIX_EFLAGS | X86_EFLAGS_RF) -#else -# define FIX_EFLAGS __FIX_EFLAGS -#endif - #define COPY(x) do { \ get_user_ex(regs->x, &sc->x); \ } while (0) @@ -364,7 +358,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, else put_user_ex(0, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); - err |= __save_altstack(&frame->uc.uc_stack, regs->sp); + save_altstack_ex(&frame->uc.uc_stack, regs->sp); /* Set up to return from userspace. */ restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); @@ -429,7 +423,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, else put_user_ex(0, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); - err |= __save_altstack(&frame->uc.uc_stack, regs->sp); + save_altstack_ex(&frame->uc.uc_stack, regs->sp); /* Set up to return from userspace. If provided, use a stub already in userspace. */ @@ -496,7 +490,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, else put_user_ex(0, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); - err |= __compat_save_altstack(&frame->uc.uc_stack, regs->sp); + compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); put_user_ex(0, &frame->uc.uc__pad0); if (ksig->ka.sa.sa_flags & SA_RESTORER) { @@ -539,7 +533,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, * Do a signal return; undo the signal stack. */ #ifdef CONFIG_X86_32 -unsigned long sys_sigreturn(void) +asmlinkage unsigned long sys_sigreturn(void) { struct pt_regs *regs = current_pt_regs(); struct sigframe __user *frame; @@ -568,7 +562,7 @@ badframe: } #endif /* CONFIG_X86_32 */ -long sys_rt_sigreturn(void) +asmlinkage long sys_rt_sigreturn(void) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; @@ -668,15 +662,17 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) if (!failed) { /* * Clear the direction flag as per the ABI for function entry. - */ - regs->flags &= ~X86_EFLAGS_DF; - /* + * + * Clear RF when entering the signal handler, because + * it might disable possible debug exception from the + * signal handler. + * * Clear TF when entering the signal handler, but * notify any tracer that was single-stepping it. * The tracer may want to single-step inside the * handler too. */ - regs->flags &= ~X86_EFLAGS_TF; + regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF); } signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP)); } @@ -732,7 +728,7 @@ static void do_signal(struct pt_regs *regs) * notification of userspace execution resumption * - triggered by the TIF_WORK_MASK flags */ -void +__visible void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { user_exit(); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 48d2b7ded422..7c3a5a61f2e4 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -30,6 +30,7 @@ #include <asm/proto.h> #include <asm/apic.h> #include <asm/nmi.h> +#include <asm/trace/irq_vectors.h> /* * Some notes on x86 processor bugs affecting SMP operation: * @@ -249,32 +250,87 @@ finish: /* * Reschedule call back. */ -void smp_reschedule_interrupt(struct pt_regs *regs) +static inline void __smp_reschedule_interrupt(void) { - ack_APIC_irq(); inc_irq_stat(irq_resched_count); scheduler_ipi(); +} + +__visible void smp_reschedule_interrupt(struct pt_regs *regs) +{ + ack_APIC_irq(); + __smp_reschedule_interrupt(); /* * KVM uses this interrupt to force a cpu out of guest mode */ } -void smp_call_function_interrupt(struct pt_regs *regs) +static inline void smp_entering_irq(void) { ack_APIC_irq(); irq_enter(); +} + +__visible void smp_trace_reschedule_interrupt(struct pt_regs *regs) +{ + /* + * Need to call irq_enter() before calling the trace point. + * __smp_reschedule_interrupt() calls irq_enter/exit() too (in + * scheduler_ipi(). This is OK, since those functions are allowed + * to nest. + */ + smp_entering_irq(); + trace_reschedule_entry(RESCHEDULE_VECTOR); + __smp_reschedule_interrupt(); + trace_reschedule_exit(RESCHEDULE_VECTOR); + exiting_irq(); + /* + * KVM uses this interrupt to force a cpu out of guest mode + */ +} + +static inline void __smp_call_function_interrupt(void) +{ generic_smp_call_function_interrupt(); inc_irq_stat(irq_call_count); - irq_exit(); } -void smp_call_function_single_interrupt(struct pt_regs *regs) +__visible void smp_call_function_interrupt(struct pt_regs *regs) +{ + smp_entering_irq(); + __smp_call_function_interrupt(); + exiting_irq(); +} + +__visible void smp_trace_call_function_interrupt(struct pt_regs *regs) +{ + smp_entering_irq(); + trace_call_function_entry(CALL_FUNCTION_VECTOR); + __smp_call_function_interrupt(); + trace_call_function_exit(CALL_FUNCTION_VECTOR); + exiting_irq(); +} + +static inline void __smp_call_function_single_interrupt(void) { - ack_APIC_irq(); - irq_enter(); generic_smp_call_function_single_interrupt(); inc_irq_stat(irq_call_count); - irq_exit(); +} + +__visible void smp_call_function_single_interrupt(struct pt_regs *regs) +{ + smp_entering_irq(); + __smp_call_function_single_interrupt(); + exiting_irq(); +} + +__visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs) +{ + smp_entering_irq(); + trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); + __smp_call_function_single_interrupt(); + trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); + exiting_irq(); } static int __init nonmi_ipi_setup(char *str) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 9c73b51817e4..6cacab671f9b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -130,7 +130,7 @@ atomic_t init_deasserted; * Report back to the Boot Processor during boot time or to the caller processor * during CPU online. */ -static void __cpuinit smp_callin(void) +static void smp_callin(void) { int cpuid, phys_id; unsigned long timeout; @@ -237,7 +237,7 @@ static int enable_start_cpu0; /* * Activate a secondary processor. */ -notrace static void __cpuinit start_secondary(void *unused) +static void notrace start_secondary(void *unused) { /* * Don't put *anything* before cpu_init(), SMP booting is too @@ -300,7 +300,7 @@ void __init smp_store_boot_cpu_info(void) * The bootstrap kernel entry code has set these up. Save them for * a given CPU */ -void __cpuinit smp_store_cpu_info(int id) +void smp_store_cpu_info(int id) { struct cpuinfo_x86 *c = &cpu_data(id); @@ -313,7 +313,7 @@ void __cpuinit smp_store_cpu_info(int id) identify_secondary_cpu(c); } -static bool __cpuinit +static bool topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name) { int cpu1 = c->cpu_index, cpu2 = o->cpu_index; @@ -330,7 +330,7 @@ do { \ cpumask_set_cpu((c2), cpu_##_m##_mask(c1)); \ } while (0) -static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { if (cpu_has_topoext) { int cpu1 = c->cpu_index, cpu2 = o->cpu_index; @@ -348,7 +348,7 @@ static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } -static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { int cpu1 = c->cpu_index, cpu2 = o->cpu_index; @@ -359,7 +359,7 @@ static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } -static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +static bool match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { if (c->phys_proc_id == o->phys_proc_id) { if (cpu_has(c, X86_FEATURE_AMD_DCM)) @@ -370,17 +370,17 @@ static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } -void __cpuinit set_cpu_sibling_map(int cpu) +void set_cpu_sibling_map(int cpu) { - bool has_mc = boot_cpu_data.x86_max_cores > 1; bool has_smt = smp_num_siblings > 1; + bool has_mp = has_smt || boot_cpu_data.x86_max_cores > 1; struct cpuinfo_x86 *c = &cpu_data(cpu); struct cpuinfo_x86 *o; int i; cpumask_set_cpu(cpu, cpu_sibling_setup_mask); - if (!has_smt && !has_mc) { + if (!has_mp) { cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); cpumask_set_cpu(cpu, cpu_core_mask(cpu)); @@ -394,7 +394,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) if ((i == cpu) || (has_smt && match_smt(c, o))) link_mask(sibling, cpu, i); - if ((i == cpu) || (has_mc && match_llc(c, o))) + if ((i == cpu) || (has_mp && match_llc(c, o))) link_mask(llc_shared, cpu, i); } @@ -406,7 +406,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) for_each_cpu(i, cpu_sibling_setup_mask) { o = &cpu_data(i); - if ((i == cpu) || (has_mc && match_mc(c, o))) { + if ((i == cpu) || (has_mp && match_mc(c, o))) { link_mask(core, cpu, i); /* @@ -499,7 +499,7 @@ void __inquire_remote_apic(int apicid) * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this * won't ... remember to clear down the APIC, etc later. */ -int __cpuinit +int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; @@ -533,7 +533,7 @@ wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) return (send_status | accept_status); } -static int __cpuinit +static int wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; @@ -649,10 +649,11 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) } /* reduce the number of lines printed when booting a large cpu count system */ -static void __cpuinit announce_cpu(int cpu, int apicid) +static void announce_cpu(int cpu, int apicid) { static int current_node = -1; int node = early_cpu_to_node(cpu); + int max_cpu_present = find_last_bit(cpumask_bits(cpu_present_mask), NR_CPUS); if (system_state == SYSTEM_BOOTING) { if (node != current_node) { @@ -661,7 +662,7 @@ static void __cpuinit announce_cpu(int cpu, int apicid) current_node = node; pr_info("Booting Node %3d, Processors ", node); } - pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " OK\n" : ""); + pr_cont(" #%4d%s", cpu, cpu == max_cpu_present ? " OK\n" : ""); return; } else pr_info("Booting Node %d Processor %d APIC 0x%x\n", @@ -691,7 +692,7 @@ static int wakeup_cpu0_nmi(unsigned int cmd, struct pt_regs *regs) * We'll change this code in the future to wake up hard offlined CPU0 if * real platform and request are available. */ -static int __cpuinit +static int wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid, int *cpu0_nmi_registered) { @@ -731,7 +732,7 @@ wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid, * Returns zero if CPU booted OK, else error code from * ->wakeup_secondary_cpu. */ -static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) +static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) { volatile u32 *trampoline_status = (volatile u32 *) __va(real_mode_header->trampoline_status); @@ -872,7 +873,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) return boot_error; } -int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle) +int native_cpu_up(unsigned int cpu, struct task_struct *tidle) { int apicid = apic->cpu_present_to_apicid(cpu); unsigned long flags; diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index dbded5aedb81..30277e27431a 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -101,7 +101,7 @@ static void find_start_end(unsigned long flags, unsigned long *begin, *begin = new_begin; } } else { - *begin = TASK_UNMAPPED_BASE; + *begin = current->mm->mmap_legacy_base; *end = TASK_SIZE; } } diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c index 147fcd4941c4..e9bcd57d8a9e 100644 --- a/arch/x86/kernel/syscall_32.c +++ b/arch/x86/kernel/syscall_32.c @@ -15,7 +15,7 @@ typedef asmlinkage void (*sys_call_ptr_t)(void); extern asmlinkage void sys_ni_syscall(void); -const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { +__visible const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c index 5c7f8c20da74..4ac730b37f0b 100644 --- a/arch/x86/kernel/syscall_64.c +++ b/arch/x86/kernel/syscall_64.c @@ -4,6 +4,7 @@ #include <linux/sys.h> #include <linux/cache.h> #include <asm/asm-offsets.h> +#include <asm/syscall.h> #define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) @@ -19,11 +20,9 @@ #define __SYSCALL_64(nr, sym, compat) [nr] = sym, -typedef void (*sys_call_ptr_t)(void); - extern void sys_ni_syscall(void); -const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { +asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. diff --git a/arch/x86/kernel/sysfb.c b/arch/x86/kernel/sysfb.c new file mode 100644 index 000000000000..193ec2ce46c7 --- /dev/null +++ b/arch/x86/kernel/sysfb.c @@ -0,0 +1,74 @@ +/* + * Generic System Framebuffers on x86 + * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +/* + * Simple-Framebuffer support for x86 systems + * Create a platform-device for any available boot framebuffer. The + * simple-framebuffer platform device is already available on DT systems, so + * this module parses the global "screen_info" object and creates a suitable + * platform device compatible with the "simple-framebuffer" DT object. If + * the framebuffer is incompatible, we instead create a legacy + * "vesa-framebuffer", "efi-framebuffer" or "platform-framebuffer" device and + * pass the screen_info as platform_data. This allows legacy drivers + * to pick these devices up without messing with simple-framebuffer drivers. + * The global "screen_info" is still valid at all times. + * + * If CONFIG_X86_SYSFB is not selected, we never register "simple-framebuffer" + * platform devices, but only use legacy framebuffer devices for + * backwards compatibility. + * + * TODO: We set the dev_id field of all platform-devices to 0. This allows + * other x86 OF/DT parsers to create such devices, too. However, they must + * start at offset 1 for this to work. + */ + +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/platform_data/simplefb.h> +#include <linux/platform_device.h> +#include <linux/screen_info.h> +#include <asm/sysfb.h> + +static __init int sysfb_init(void) +{ + struct screen_info *si = &screen_info; + struct simplefb_platform_data mode; + struct platform_device *pd; + const char *name; + bool compatible; + int ret; + + sysfb_apply_efi_quirks(); + + /* try to create a simple-framebuffer device */ + compatible = parse_mode(si, &mode); + if (compatible) { + ret = create_simplefb(si, &mode); + if (!ret) + return 0; + } + + /* if the FB is incompatible, create a legacy framebuffer device */ + if (si->orig_video_isVGA == VIDEO_TYPE_EFI) + name = "efi-framebuffer"; + else if (si->orig_video_isVGA == VIDEO_TYPE_VLFB) + name = "vesa-framebuffer"; + else + name = "platform-framebuffer"; + + pd = platform_device_register_resndata(NULL, name, 0, + NULL, 0, si, sizeof(*si)); + return IS_ERR(pd) ? PTR_ERR(pd) : 0; +} + +/* must execute after PCI subsystem for EFI quirks */ +device_initcall(sysfb_init); diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c new file mode 100644 index 000000000000..b285d4e8c68e --- /dev/null +++ b/arch/x86/kernel/sysfb_efi.c @@ -0,0 +1,214 @@ +/* + * Generic System Framebuffers on x86 + * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com> + * + * EFI Quirks Copyright (c) 2006 Edgar Hucek <gimli@dark-green.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +/* + * EFI Quirks + * Several EFI systems do not correctly advertise their boot framebuffers. + * Hence, we use this static table of known broken machines and fix up the + * information so framebuffer drivers can load corectly. + */ + +#include <linux/dmi.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/pci.h> +#include <linux/screen_info.h> +#include <video/vga.h> +#include <asm/sysfb.h> + +enum { + OVERRIDE_NONE = 0x0, + OVERRIDE_BASE = 0x1, + OVERRIDE_STRIDE = 0x2, + OVERRIDE_HEIGHT = 0x4, + OVERRIDE_WIDTH = 0x8, +}; + +struct efifb_dmi_info efifb_dmi_list[] = { + [M_I17] = { "i17", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE }, + [M_I20] = { "i20", 0x80010000, 1728 * 4, 1680, 1050, OVERRIDE_NONE }, /* guess */ + [M_I20_SR] = { "imac7", 0x40010000, 1728 * 4, 1680, 1050, OVERRIDE_NONE }, + [M_I24] = { "i24", 0x80010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, /* guess */ + [M_I24_8_1] = { "imac8", 0xc0060000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, + [M_I24_10_1] = { "imac10", 0xc0010000, 2048 * 4, 1920, 1080, OVERRIDE_NONE }, + [M_I27_11_1] = { "imac11", 0xc0010000, 2560 * 4, 2560, 1440, OVERRIDE_NONE }, + [M_MINI]= { "mini", 0x80000000, 2048 * 4, 1024, 768, OVERRIDE_NONE }, + [M_MINI_3_1] = { "mini31", 0x40010000, 1024 * 4, 1024, 768, OVERRIDE_NONE }, + [M_MINI_4_1] = { "mini41", 0xc0010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, + [M_MB] = { "macbook", 0x80000000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, + [M_MB_5_1] = { "macbook51", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, + [M_MB_6_1] = { "macbook61", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, + [M_MB_7_1] = { "macbook71", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, + [M_MBA] = { "mba", 0x80000000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, + /* 11" Macbook Air 3,1 passes the wrong stride */ + [M_MBA_3] = { "mba3", 0, 2048 * 4, 0, 0, OVERRIDE_STRIDE }, + [M_MBP] = { "mbp", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE }, + [M_MBP_2] = { "mbp2", 0, 0, 0, 0, OVERRIDE_NONE }, /* placeholder */ + [M_MBP_2_2] = { "mbp22", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE }, + [M_MBP_SR] = { "mbp3", 0x80030000, 2048 * 4, 1440, 900, OVERRIDE_NONE }, + [M_MBP_4] = { "mbp4", 0xc0060000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, + [M_MBP_5_1] = { "mbp51", 0xc0010000, 2048 * 4, 1440, 900, OVERRIDE_NONE }, + [M_MBP_5_2] = { "mbp52", 0xc0010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, + [M_MBP_5_3] = { "mbp53", 0xd0010000, 2048 * 4, 1440, 900, OVERRIDE_NONE }, + [M_MBP_6_1] = { "mbp61", 0x90030000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, + [M_MBP_6_2] = { "mbp62", 0x90030000, 2048 * 4, 1680, 1050, OVERRIDE_NONE }, + [M_MBP_7_1] = { "mbp71", 0xc0010000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, + [M_MBP_8_2] = { "mbp82", 0x90010000, 1472 * 4, 1440, 900, OVERRIDE_NONE }, + [M_UNKNOWN] = { NULL, 0, 0, 0, 0, OVERRIDE_NONE } +}; + +#define choose_value(dmivalue, fwvalue, field, flags) ({ \ + typeof(fwvalue) _ret_ = fwvalue; \ + if ((flags) & (field)) \ + _ret_ = dmivalue; \ + else if ((fwvalue) == 0) \ + _ret_ = dmivalue; \ + _ret_; \ + }) + +static int __init efifb_set_system(const struct dmi_system_id *id) +{ + struct efifb_dmi_info *info = id->driver_data; + + if (info->base == 0 && info->height == 0 && info->width == 0 && + info->stride == 0) + return 0; + + /* Trust the bootloader over the DMI tables */ + if (screen_info.lfb_base == 0) { +#if defined(CONFIG_PCI) + struct pci_dev *dev = NULL; + int found_bar = 0; +#endif + if (info->base) { + screen_info.lfb_base = choose_value(info->base, + screen_info.lfb_base, OVERRIDE_BASE, + info->flags); + +#if defined(CONFIG_PCI) + /* make sure that the address in the table is actually + * on a VGA device's PCI BAR */ + + for_each_pci_dev(dev) { + int i; + if ((dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) + continue; + for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { + resource_size_t start, end; + + start = pci_resource_start(dev, i); + if (start == 0) + break; + end = pci_resource_end(dev, i); + if (screen_info.lfb_base >= start && + screen_info.lfb_base < end) { + found_bar = 1; + } + } + } + if (!found_bar) + screen_info.lfb_base = 0; +#endif + } + } + if (screen_info.lfb_base) { + screen_info.lfb_linelength = choose_value(info->stride, + screen_info.lfb_linelength, OVERRIDE_STRIDE, + info->flags); + screen_info.lfb_width = choose_value(info->width, + screen_info.lfb_width, OVERRIDE_WIDTH, + info->flags); + screen_info.lfb_height = choose_value(info->height, + screen_info.lfb_height, OVERRIDE_HEIGHT, + info->flags); + if (screen_info.orig_video_isVGA == 0) + screen_info.orig_video_isVGA = VIDEO_TYPE_EFI; + } else { + screen_info.lfb_linelength = 0; + screen_info.lfb_width = 0; + screen_info.lfb_height = 0; + screen_info.orig_video_isVGA = 0; + return 0; + } + + printk(KERN_INFO "efifb: dmi detected %s - framebuffer at 0x%08x " + "(%dx%d, stride %d)\n", id->ident, + screen_info.lfb_base, screen_info.lfb_width, + screen_info.lfb_height, screen_info.lfb_linelength); + + return 1; +} + +#define EFIFB_DMI_SYSTEM_ID(vendor, name, enumid) \ + { \ + efifb_set_system, \ + name, \ + { \ + DMI_MATCH(DMI_BIOS_VENDOR, vendor), \ + DMI_MATCH(DMI_PRODUCT_NAME, name) \ + }, \ + &efifb_dmi_list[enumid] \ + } + +static const struct dmi_system_id efifb_dmi_system_table[] __initconst = { + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac4,1", M_I17), + /* At least one of these two will be right; maybe both? */ + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac5,1", M_I20), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac5,1", M_I20), + /* At least one of these two will be right; maybe both? */ + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac6,1", M_I24), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac6,1", M_I24), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac7,1", M_I20_SR), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac8,1", M_I24_8_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac10,1", M_I24_10_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac11,1", M_I27_11_1), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "Macmini1,1", M_MINI), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "Macmini3,1", M_MINI_3_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "Macmini4,1", M_MINI_4_1), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook1,1", M_MB), + /* At least one of these two will be right; maybe both? */ + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook2,1", M_MB), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook2,1", M_MB), + /* At least one of these two will be right; maybe both? */ + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook3,1", M_MB), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook3,1", M_MB), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook4,1", M_MB), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook5,1", M_MB_5_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook6,1", M_MB_6_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook7,1", M_MB_7_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookAir1,1", M_MBA), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookAir3,1", M_MBA_3), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro1,1", M_MBP), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro2,1", M_MBP_2), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro2,2", M_MBP_2_2), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro2,1", M_MBP_2), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro3,1", M_MBP_SR), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro3,1", M_MBP_SR), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro4,1", M_MBP_4), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,1", M_MBP_5_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,2", M_MBP_5_2), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,3", M_MBP_5_3), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro6,1", M_MBP_6_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro6,2", M_MBP_6_2), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro7,1", M_MBP_7_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro8,2", M_MBP_8_2), + {}, +}; + +__init void sysfb_apply_efi_quirks(void) +{ + if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI || + !(screen_info.capabilities & VIDEO_CAPABILITY_SKIP_QUIRKS)) + dmi_check_system(efifb_dmi_system_table); +} diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c new file mode 100644 index 000000000000..86179d409893 --- /dev/null +++ b/arch/x86/kernel/sysfb_simplefb.c @@ -0,0 +1,95 @@ +/* + * Generic System Framebuffers on x86 + * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +/* + * simple-framebuffer probing + * Try to convert "screen_info" into a "simple-framebuffer" compatible mode. + * If the mode is incompatible, we return "false" and let the caller create + * legacy nodes instead. + */ + +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/platform_data/simplefb.h> +#include <linux/platform_device.h> +#include <linux/screen_info.h> +#include <asm/sysfb.h> + +static const char simplefb_resname[] = "BOOTFB"; +static const struct simplefb_format formats[] = SIMPLEFB_FORMATS; + +/* try parsing x86 screen_info into a simple-framebuffer mode struct */ +__init bool parse_mode(const struct screen_info *si, + struct simplefb_platform_data *mode) +{ + const struct simplefb_format *f; + __u8 type; + unsigned int i; + + type = si->orig_video_isVGA; + if (type != VIDEO_TYPE_VLFB && type != VIDEO_TYPE_EFI) + return false; + + for (i = 0; i < ARRAY_SIZE(formats); ++i) { + f = &formats[i]; + if (si->lfb_depth == f->bits_per_pixel && + si->red_size == f->red.length && + si->red_pos == f->red.offset && + si->green_size == f->green.length && + si->green_pos == f->green.offset && + si->blue_size == f->blue.length && + si->blue_pos == f->blue.offset && + si->rsvd_size == f->transp.length && + si->rsvd_pos == f->transp.offset) { + mode->format = f->name; + mode->width = si->lfb_width; + mode->height = si->lfb_height; + mode->stride = si->lfb_linelength; + return true; + } + } + + return false; +} + +__init int create_simplefb(const struct screen_info *si, + const struct simplefb_platform_data *mode) +{ + struct platform_device *pd; + struct resource res; + unsigned long len; + + /* don't use lfb_size as it may contain the whole VMEM instead of only + * the part that is occupied by the framebuffer */ + len = mode->height * mode->stride; + len = PAGE_ALIGN(len); + if (len > (u64)si->lfb_size << 16) { + printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n"); + return -EINVAL; + } + + /* setup IORESOURCE_MEM as framebuffer memory */ + memset(&res, 0, sizeof(res)); + res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res.name = simplefb_resname; + res.start = si->lfb_base; + res.end = si->lfb_base + len - 1; + if (res.end <= res.start) + return -EINVAL; + + pd = platform_device_register_resndata(NULL, "simple-framebuffer", 0, + &res, 1, mode, sizeof(*mode)); + if (IS_ERR(pd)) + return PTR_ERR(pd); + + return 0; +} diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index f84fe00fad48..91a4496db434 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -31,6 +31,7 @@ #include <linux/pfn.h> #include <linux/mm.h> #include <linux/tboot.h> +#include <linux/debugfs.h> #include <asm/realmode.h> #include <asm/processor.h> @@ -300,6 +301,15 @@ static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) return 0; } +static int tboot_extended_sleep(u8 sleep_state, u32 val_a, u32 val_b) +{ + if (!tboot_enabled()) + return 0; + + pr_warning("tboot is not able to suspend on platforms with reduced hardware sleep (ACPIv5)"); + return -ENODEV; +} + static atomic_t ap_wfs_count; static int tboot_wait_for_aps(int num_aps) @@ -319,8 +329,8 @@ static int tboot_wait_for_aps(int num_aps) return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps); } -static int __cpuinit tboot_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int tboot_cpu_callback(struct notifier_block *nfb, unsigned long action, + void *hcpu) { switch (action) { case CPU_DYING: @@ -333,11 +343,78 @@ static int __cpuinit tboot_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block tboot_cpu_notifier __cpuinitdata = +static struct notifier_block tboot_cpu_notifier = { .notifier_call = tboot_cpu_callback, }; +#ifdef CONFIG_DEBUG_FS + +#define TBOOT_LOG_UUID { 0x26, 0x25, 0x19, 0xc0, 0x30, 0x6b, 0xb4, 0x4d, \ + 0x4c, 0x84, 0xa3, 0xe9, 0x53, 0xb8, 0x81, 0x74 } + +#define TBOOT_SERIAL_LOG_ADDR 0x60000 +#define TBOOT_SERIAL_LOG_SIZE 0x08000 +#define LOG_MAX_SIZE_OFF 16 +#define LOG_BUF_OFF 24 + +static uint8_t tboot_log_uuid[16] = TBOOT_LOG_UUID; + +static ssize_t tboot_log_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) +{ + void __iomem *log_base; + u8 log_uuid[16]; + u32 max_size; + void *kbuf; + int ret = -EFAULT; + + log_base = ioremap_nocache(TBOOT_SERIAL_LOG_ADDR, TBOOT_SERIAL_LOG_SIZE); + if (!log_base) + return ret; + + memcpy_fromio(log_uuid, log_base, sizeof(log_uuid)); + if (memcmp(&tboot_log_uuid, log_uuid, sizeof(log_uuid))) + goto err_iounmap; + + max_size = readl(log_base + LOG_MAX_SIZE_OFF); + if (*ppos >= max_size) { + ret = 0; + goto err_iounmap; + } + + if (*ppos + count > max_size) + count = max_size - *ppos; + + kbuf = kmalloc(count, GFP_KERNEL); + if (!kbuf) { + ret = -ENOMEM; + goto err_iounmap; + } + + memcpy_fromio(kbuf, log_base + LOG_BUF_OFF + *ppos, count); + if (copy_to_user(user_buf, kbuf, count)) + goto err_kfree; + + *ppos += count; + + ret = count; + +err_kfree: + kfree(kbuf); + +err_iounmap: + iounmap(log_base); + + return ret; +} + +static const struct file_operations tboot_log_fops = { + .read = tboot_log_read, + .llseek = default_llseek, +}; + +#endif /* CONFIG_DEBUG_FS */ + static __init int tboot_late_init(void) { if (!tboot_enabled()) @@ -348,7 +425,13 @@ static __init int tboot_late_init(void) atomic_set(&ap_wfs_count, 0); register_hotcpu_notifier(&tboot_cpu_notifier); +#ifdef CONFIG_DEBUG_FS + debugfs_create_file("tboot_log", S_IRUSR, + arch_debugfs_dir, NULL, &tboot_log_fops); +#endif + acpi_os_set_prepare_sleep(&tboot_sleep); + acpi_os_set_prepare_extended_sleep(&tboot_extended_sleep); return 0; } diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c new file mode 100644 index 000000000000..1c113db9ed57 --- /dev/null +++ b/arch/x86/kernel/tracepoint.c @@ -0,0 +1,59 @@ +/* + * Code for supporting irq vector tracepoints. + * + * Copyright (C) 2013 Seiji Aguchi <seiji.aguchi@hds.com> + * + */ +#include <asm/hw_irq.h> +#include <asm/desc.h> +#include <linux/atomic.h> + +atomic_t trace_idt_ctr = ATOMIC_INIT(0); +struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, + (unsigned long) trace_idt_table }; + +/* No need to be aligned, but done to keep all IDTs defined the same way. */ +gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss; + +static int trace_irq_vector_refcount; +static DEFINE_MUTEX(irq_vector_mutex); + +static void set_trace_idt_ctr(int val) +{ + atomic_set(&trace_idt_ctr, val); + /* Ensure the trace_idt_ctr is set before sending IPI */ + wmb(); +} + +static void switch_idt(void *arg) +{ + unsigned long flags; + + local_irq_save(flags); + load_current_idt(); + local_irq_restore(flags); +} + +void trace_irq_vector_regfunc(void) +{ + mutex_lock(&irq_vector_mutex); + if (!trace_irq_vector_refcount) { + set_trace_idt_ctr(1); + smp_call_function(switch_idt, NULL, 0); + switch_idt(NULL); + } + trace_irq_vector_refcount++; + mutex_unlock(&irq_vector_mutex); +} + +void trace_irq_vector_unregfunc(void) +{ + mutex_lock(&irq_vector_mutex); + trace_irq_vector_refcount--; + if (!trace_irq_vector_refcount) { + set_trace_idt_ctr(0); + smp_call_function(switch_idt, NULL, 0); + switch_idt(NULL); + } + mutex_unlock(&irq_vector_mutex); +} diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 772e2a846dec..8c8093b146ca 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -58,24 +58,25 @@ #include <asm/mce.h> #include <asm/fixmap.h> #include <asm/mach_traps.h> +#include <asm/alternative.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> #include <asm/pgalloc.h> #include <asm/proto.h> + +/* No need to be aligned, but done to keep all IDTs defined the same way. */ +gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss; #else #include <asm/processor-flags.h> #include <asm/setup.h> asmlinkage int system_call(void); - -/* - * The IDT has to be page-aligned to simplify the Pentium - * F0 0F bug workaround. - */ -gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, }; #endif +/* Must be page-aligned because the real IDT is used in a fixmap. */ +gate_desc idt_table[NR_VECTORS] __page_aligned_bss; + DECLARE_BITMAP(used_vectors, NR_VECTORS); EXPORT_SYMBOL_GPL(used_vectors); @@ -254,6 +255,9 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_DF; +#ifdef CONFIG_DOUBLEFAULT + df_debug(regs, error_code); +#endif /* * This is always a kernel trap and never fixable (and thus must * never return). @@ -324,6 +328,9 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co ftrace_int3_handler(regs)) return; #endif + if (poke_int3_handler(regs)) + return; + prev_state = exception_enter(); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, @@ -437,7 +444,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; - if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code, + if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code, SIGTRAP) == NOTIFY_STOP) goto exit; @@ -785,7 +792,7 @@ void __init trap_init(void) x86_init.irqs.trap_init(); #ifdef CONFIG_X86_64 - memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16); + memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16); set_nmi_gate(X86_TRAP_DB, &debug); set_nmi_gate(X86_TRAP_BP, &int3); #endif diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 098b3cfda72e..930e5d48f560 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -89,6 +89,12 @@ int check_tsc_unstable(void) } EXPORT_SYMBOL_GPL(check_tsc_unstable); +int check_tsc_disabled(void) +{ + return tsc_disabled; +} +EXPORT_SYMBOL_GPL(check_tsc_disabled); + #ifdef CONFIG_X86_TSC int __init notsc_setup(char *str) { @@ -824,7 +830,7 @@ static void __init check_system_tsc_reliable(void) * Make an educated guess if the TSC is trustworthy and synchronized * over all CPUs. */ -__cpuinit int unsynchronized_tsc(void) +int unsynchronized_tsc(void) { if (!cpu_has_tsc || tsc_unstable) return 1; @@ -1020,7 +1026,7 @@ void __init tsc_init(void) * been calibrated. This assumes that CONSTANT_TSC applies to all * cpus in the socket - this should be a safe assumption. */ -unsigned long __cpuinit calibrate_delay_is_known(void) +unsigned long calibrate_delay_is_known(void) { int i, cpu = smp_processor_id(); diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index fc25e60a5884..adfdf56a3714 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -25,24 +25,24 @@ * Entry/exit counters that make sure that both CPUs * run the measurement code at once: */ -static __cpuinitdata atomic_t start_count; -static __cpuinitdata atomic_t stop_count; +static atomic_t start_count; +static atomic_t stop_count; /* * We use a raw spinlock in this exceptional case, because * we want to have the fastest, inlined, non-debug version * of a critical section, to be able to prove TSC time-warps: */ -static __cpuinitdata arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED; -static __cpuinitdata cycles_t last_tsc; -static __cpuinitdata cycles_t max_warp; -static __cpuinitdata int nr_warps; +static cycles_t last_tsc; +static cycles_t max_warp; +static int nr_warps; /* * TSC-warp measurement loop running on both CPUs: */ -static __cpuinit void check_tsc_warp(unsigned int timeout) +static void check_tsc_warp(unsigned int timeout) { cycles_t start, now, prev, end; int i; @@ -121,7 +121,7 @@ static inline unsigned int loop_timeout(int cpu) * Source CPU calls into this - it waits for the freshly booted * target CPU to arrive and then starts the measurement: */ -void __cpuinit check_tsc_sync_source(int cpu) +void check_tsc_sync_source(int cpu) { int cpus = 2; @@ -187,7 +187,7 @@ void __cpuinit check_tsc_sync_source(int cpu) /* * Freshly booted CPUs call into this: */ -void __cpuinit check_tsc_sync_target(void) +void check_tsc_sync_target(void) { int cpus = 2; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 9a907a67be8f..1f96f9347ed9 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -331,7 +331,7 @@ sigsegv: * Assume __initcall executes before all user space. Hopefully kmod * doesn't violate that. We'll find out if it does. */ -static void __cpuinit vsyscall_set_cpu(int cpu) +static void vsyscall_set_cpu(int cpu) { unsigned long d; unsigned long node = 0; @@ -353,13 +353,13 @@ static void __cpuinit vsyscall_set_cpu(int cpu) write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); } -static void __cpuinit cpu_vsyscall_init(void *arg) +static void cpu_vsyscall_init(void *arg) { /* preemption should be already off */ vsyscall_set_cpu(raw_smp_processor_id()); } -static int __cpuinit +static int cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) { long cpu = (long)arg; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 45a14dbbddaf..8ce0072cd700 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -25,7 +25,7 @@ #include <asm/iommu.h> #include <asm/mach_traps.h> -void __cpuinit x86_init_noop(void) { } +void x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } int __init iommu_init_noop(void) { return 0; } void iommu_shutdown_noop(void) { } @@ -85,7 +85,7 @@ struct x86_init_ops x86_init __initdata = { }, }; -struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { +struct x86_cpuinit_ops x86_cpuinit = { .early_percpu_clock_init = x86_init_noop, .setup_percpu_clockev = setup_secondary_APIC_clock, }; @@ -107,6 +107,8 @@ struct x86_platform_ops x86_platform = { }; EXPORT_SYMBOL_GPL(x86_platform); + +#if defined(CONFIG_PCI_MSI) struct x86_msi_ops x86_msi = { .setup_msi_irqs = native_setup_msi_irqs, .compose_msi_msg = native_compose_msi_msg, @@ -116,6 +118,28 @@ struct x86_msi_ops x86_msi = { .setup_hpet_msi = default_setup_hpet_msi, }; +/* MSI arch specific hooks */ +int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + return x86_msi.setup_msi_irqs(dev, nvec, type); +} + +void arch_teardown_msi_irqs(struct pci_dev *dev) +{ + x86_msi.teardown_msi_irqs(dev); +} + +void arch_teardown_msi_irq(unsigned int irq) +{ + x86_msi.teardown_msi_irq(irq); +} + +void arch_restore_msi_irqs(struct pci_dev *dev, int irq) +{ + x86_msi.restore_msi_irqs(dev, irq); +} +#endif + struct x86_io_apic_ops x86_io_apic_ops = { .init = native_io_apic_init_mappings, .read = native_io_apic_read, diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index ada87a329edc..422fd8223470 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -243,7 +243,7 @@ int save_xstate_sig(void __user *buf, void __user *buf_fx, int size) if (!access_ok(VERIFY_WRITE, buf, size)) return -EACCES; - if (!HAVE_HWFP) + if (!static_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_get(current, NULL, 0, sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_ia32 __user *) buf) ? -1 : 1; @@ -350,11 +350,10 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) if (!used_math() && init_fpu(tsk)) return -1; - if (!HAVE_HWFP) { + if (!static_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_set(current, NULL, 0, sizeof(struct user_i387_ia32_struct), NULL, buf) != 0; - } if (use_xsave()) { struct _fpx_sw_bytes fx_sw_user; @@ -574,7 +573,7 @@ static void __init xstate_enable_boot_cpu(void) * This is somewhat obfuscated due to the lack of powerful enough * overrides for the section checks. */ -void __cpuinit xsave_init(void) +void xsave_init(void) { static __refdata void (*next_func)(void) = xstate_enable_boot_cpu; void (*this_func)(void); @@ -595,7 +594,7 @@ static inline void __init eager_fpu_init_bp(void) setup_init_fpu_buf(); } -void __cpuinit eager_fpu_init(void) +void eager_fpu_init(void) { static __refdata void (*boot_func)(void) = eager_fpu_init_bp; diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index d609e1d84048..bf4fb04d0112 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -5,12 +5,13 @@ CFLAGS_x86.o := -I. CFLAGS_svm.o := -I. CFLAGS_vmx.o := -I. -kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ - coalesced_mmio.o irq_comm.o eventfd.o \ - irqchip.o) -kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(addprefix ../../../virt/kvm/, \ - assigned-dev.o iommu.o) -kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) +KVM := ../../../virt/kvm + +kvm-y += $(KVM)/kvm_main.o $(KVM)/ioapic.o \ + $(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o \ + $(KVM)/eventfd.o $(KVM)/irqchip.o +kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(KVM)/assigned-dev.o $(KVM)/iommu.o +kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o cpuid.o pmu.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index a20ecb5b6cbf..b110fe6c03d4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -413,7 +413,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 << KVM_FEATURE_CLOCKSOURCE2) | (1 << KVM_FEATURE_ASYNC_PF) | (1 << KVM_FEATURE_PV_EOI) | - (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); + (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) | + (1 << KVM_FEATURE_PV_UNHALT); if (sched_info_on()) entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5953dcea752d..ddc3f3d2afdb 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -61,6 +61,8 @@ #define OpMem8 26ull /* 8-bit zero extended memory operand */ #define OpImm64 27ull /* Sign extended 16/32/64-bit immediate */ #define OpXLat 28ull /* memory at BX/EBX/RBX + zero-extended AL */ +#define OpAccLo 29ull /* Low part of extended acc (AX/AX/EAX/RAX) */ +#define OpAccHi 30ull /* High part of extended acc (-/DX/EDX/RDX) */ #define OpBits 5 /* Width of operand field */ #define OpMask ((1ull << OpBits) - 1) @@ -86,6 +88,7 @@ #define DstMem64 (OpMem64 << DstShift) #define DstImmUByte (OpImmUByte << DstShift) #define DstDX (OpDX << DstShift) +#define DstAccLo (OpAccLo << DstShift) #define DstMask (OpMask << DstShift) /* Source operand type. */ #define SrcShift 6 @@ -108,6 +111,7 @@ #define SrcImm64 (OpImm64 << SrcShift) #define SrcDX (OpDX << SrcShift) #define SrcMem8 (OpMem8 << SrcShift) +#define SrcAccHi (OpAccHi << SrcShift) #define SrcMask (OpMask << SrcShift) #define BitOp (1<<11) #define MemAbs (1<<12) /* Memory operand is absolute displacement */ @@ -138,6 +142,7 @@ /* Source 2 operand type */ #define Src2Shift (31) #define Src2None (OpNone << Src2Shift) +#define Src2Mem (OpMem << Src2Shift) #define Src2CL (OpCL << Src2Shift) #define Src2ImmByte (OpImmByte << Src2Shift) #define Src2One (OpOne << Src2Shift) @@ -155,6 +160,9 @@ #define Avx ((u64)1 << 43) /* Advanced Vector Extensions */ #define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */ #define NoWrite ((u64)1 << 45) /* No writeback */ +#define SrcWrite ((u64)1 << 46) /* Write back src operand */ + +#define DstXacc (DstAccLo | SrcAccHi | SrcWrite) #define X2(x...) x, x #define X3(x...) X2(x), x @@ -171,10 +179,11 @@ /* * fastop functions have a special calling convention: * - * dst: [rdx]:rax (in/out) - * src: rbx (in/out) + * dst: rax (in/out) + * src: rdx (in/out) * src2: rcx (in) * flags: rflags (in/out) + * ex: rsi (in:fastop pointer, out:zero if exception) * * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for * different operand sizes can be reached by calculation, rather than a jump @@ -276,174 +285,17 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt) } /* - * Instruction emulation: - * Most instructions are emulated directly via a fragment of inline assembly - * code. This allows us to save/restore EFLAGS and thus very easily pick up - * any modified flags. - */ - -#if defined(CONFIG_X86_64) -#define _LO32 "k" /* force 32-bit operand */ -#define _STK "%%rsp" /* stack pointer */ -#elif defined(__i386__) -#define _LO32 "" /* force 32-bit operand */ -#define _STK "%%esp" /* stack pointer */ -#endif - -/* * These EFLAGS bits are restored from saved value during emulation, and * any changes are written back to the saved value after emulation. */ #define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) -/* Before executing instruction: restore necessary bits in EFLAGS. */ -#define _PRE_EFLAGS(_sav, _msk, _tmp) \ - /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \ - "movl %"_sav",%"_LO32 _tmp"; " \ - "push %"_tmp"; " \ - "push %"_tmp"; " \ - "movl %"_msk",%"_LO32 _tmp"; " \ - "andl %"_LO32 _tmp",("_STK"); " \ - "pushf; " \ - "notl %"_LO32 _tmp"; " \ - "andl %"_LO32 _tmp",("_STK"); " \ - "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \ - "pop %"_tmp"; " \ - "orl %"_LO32 _tmp",("_STK"); " \ - "popf; " \ - "pop %"_sav"; " - -/* After executing instruction: write-back necessary bits in EFLAGS. */ -#define _POST_EFLAGS(_sav, _msk, _tmp) \ - /* _sav |= EFLAGS & _msk; */ \ - "pushf; " \ - "pop %"_tmp"; " \ - "andl %"_msk",%"_LO32 _tmp"; " \ - "orl %"_LO32 _tmp",%"_sav"; " - #ifdef CONFIG_X86_64 #define ON64(x) x #else #define ON64(x) #endif -#define ____emulate_2op(ctxt, _op, _x, _y, _suffix, _dsttype) \ - do { \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op _suffix " %"_x"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" ((ctxt)->eflags), \ - "+q" (*(_dsttype*)&(ctxt)->dst.val), \ - "=&r" (_tmp) \ - : _y ((ctxt)->src.val), "i" (EFLAGS_MASK)); \ - } while (0) - - -/* Raw emulation: instruction has two explicit operands. */ -#define __emulate_2op_nobyte(ctxt,_op,_wx,_wy,_lx,_ly,_qx,_qy) \ - do { \ - unsigned long _tmp; \ - \ - switch ((ctxt)->dst.bytes) { \ - case 2: \ - ____emulate_2op(ctxt,_op,_wx,_wy,"w",u16); \ - break; \ - case 4: \ - ____emulate_2op(ctxt,_op,_lx,_ly,"l",u32); \ - break; \ - case 8: \ - ON64(____emulate_2op(ctxt,_op,_qx,_qy,"q",u64)); \ - break; \ - } \ - } while (0) - -#define __emulate_2op(ctxt,_op,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ - do { \ - unsigned long _tmp; \ - switch ((ctxt)->dst.bytes) { \ - case 1: \ - ____emulate_2op(ctxt,_op,_bx,_by,"b",u8); \ - break; \ - default: \ - __emulate_2op_nobyte(ctxt, _op, \ - _wx, _wy, _lx, _ly, _qx, _qy); \ - break; \ - } \ - } while (0) - -/* Source operand is byte-sized and may be restricted to just %cl. */ -#define emulate_2op_SrcB(ctxt, _op) \ - __emulate_2op(ctxt, _op, "b", "c", "b", "c", "b", "c", "b", "c") - -/* Source operand is byte, word, long or quad sized. */ -#define emulate_2op_SrcV(ctxt, _op) \ - __emulate_2op(ctxt, _op, "b", "q", "w", "r", _LO32, "r", "", "r") - -/* Source operand is word, long or quad sized. */ -#define emulate_2op_SrcV_nobyte(ctxt, _op) \ - __emulate_2op_nobyte(ctxt, _op, "w", "r", _LO32, "r", "", "r") - -/* Instruction has three operands and one operand is stored in ECX register */ -#define __emulate_2op_cl(ctxt, _op, _suffix, _type) \ - do { \ - unsigned long _tmp; \ - _type _clv = (ctxt)->src2.val; \ - _type _srcv = (ctxt)->src.val; \ - _type _dstv = (ctxt)->dst.val; \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "5", "2") \ - _op _suffix " %4,%1 \n" \ - _POST_EFLAGS("0", "5", "2") \ - : "=m" ((ctxt)->eflags), "+r" (_dstv), "=&r" (_tmp) \ - : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ - ); \ - \ - (ctxt)->src2.val = (unsigned long) _clv; \ - (ctxt)->src2.val = (unsigned long) _srcv; \ - (ctxt)->dst.val = (unsigned long) _dstv; \ - } while (0) - -#define emulate_2op_cl(ctxt, _op) \ - do { \ - switch ((ctxt)->dst.bytes) { \ - case 2: \ - __emulate_2op_cl(ctxt, _op, "w", u16); \ - break; \ - case 4: \ - __emulate_2op_cl(ctxt, _op, "l", u32); \ - break; \ - case 8: \ - ON64(__emulate_2op_cl(ctxt, _op, "q", ulong)); \ - break; \ - } \ - } while (0) - -#define __emulate_1op(ctxt, _op, _suffix) \ - do { \ - unsigned long _tmp; \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op _suffix " %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" ((ctxt)->eflags), "+m" ((ctxt)->dst.val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - } while (0) - -/* Instruction has only one explicit operand (no source operand). */ -#define emulate_1op(ctxt, _op) \ - do { \ - switch ((ctxt)->dst.bytes) { \ - case 1: __emulate_1op(ctxt, _op, "b"); break; \ - case 2: __emulate_1op(ctxt, _op, "w"); break; \ - case 4: __emulate_1op(ctxt, _op, "l"); break; \ - case 8: ON64(__emulate_1op(ctxt, _op, "q")); break; \ - } \ - } while (0) - static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); #define FOP_ALIGN ".align " __stringify(FASTOP_SIZE) " \n\t" @@ -462,7 +314,10 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); #define FOPNOP() FOP_ALIGN FOP_RET #define FOP1E(op, dst) \ - FOP_ALIGN #op " %" #dst " \n\t" FOP_RET + FOP_ALIGN "10: " #op " %" #dst " \n\t" FOP_RET + +#define FOP1EEX(op, dst) \ + FOP1E(op, dst) _ASM_EXTABLE(10b, kvm_fastop_exception) #define FASTOP1(op) \ FOP_START(op) \ @@ -472,24 +327,42 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); ON64(FOP1E(op##q, rax)) \ FOP_END +/* 1-operand, using src2 (for MUL/DIV r/m) */ +#define FASTOP1SRC2(op, name) \ + FOP_START(name) \ + FOP1E(op, cl) \ + FOP1E(op, cx) \ + FOP1E(op, ecx) \ + ON64(FOP1E(op, rcx)) \ + FOP_END + +/* 1-operand, using src2 (for MUL/DIV r/m), with exceptions */ +#define FASTOP1SRC2EX(op, name) \ + FOP_START(name) \ + FOP1EEX(op, cl) \ + FOP1EEX(op, cx) \ + FOP1EEX(op, ecx) \ + ON64(FOP1EEX(op, rcx)) \ + FOP_END + #define FOP2E(op, dst, src) \ FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET #define FASTOP2(op) \ FOP_START(op) \ - FOP2E(op##b, al, bl) \ - FOP2E(op##w, ax, bx) \ - FOP2E(op##l, eax, ebx) \ - ON64(FOP2E(op##q, rax, rbx)) \ + FOP2E(op##b, al, dl) \ + FOP2E(op##w, ax, dx) \ + FOP2E(op##l, eax, edx) \ + ON64(FOP2E(op##q, rax, rdx)) \ FOP_END /* 2 operand, word only */ #define FASTOP2W(op) \ FOP_START(op) \ FOPNOP() \ - FOP2E(op##w, ax, bx) \ - FOP2E(op##l, eax, ebx) \ - ON64(FOP2E(op##q, rax, rbx)) \ + FOP2E(op##w, ax, dx) \ + FOP2E(op##l, eax, edx) \ + ON64(FOP2E(op##q, rax, rdx)) \ FOP_END /* 2 operand, src is CL */ @@ -508,14 +381,17 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); #define FASTOP3WCL(op) \ FOP_START(op) \ FOPNOP() \ - FOP3E(op##w, ax, bx, cl) \ - FOP3E(op##l, eax, ebx, cl) \ - ON64(FOP3E(op##q, rax, rbx, cl)) \ + FOP3E(op##w, ax, dx, cl) \ + FOP3E(op##l, eax, edx, cl) \ + ON64(FOP3E(op##q, rax, rdx, cl)) \ FOP_END /* Special case for SETcc - 1 instruction per cc */ #define FOP_SETCC(op) ".align 4; " #op " %al; ret \n\t" +asm(".global kvm_fastop_exception \n" + "kvm_fastop_exception: xor %esi, %esi; ret"); + FOP_START(setcc) FOP_SETCC(seto) FOP_SETCC(setno) @@ -538,47 +414,6 @@ FOP_END; FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET FOP_END; -#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \ - do { \ - unsigned long _tmp; \ - ulong *rax = reg_rmw((ctxt), VCPU_REGS_RAX); \ - ulong *rdx = reg_rmw((ctxt), VCPU_REGS_RDX); \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "5", "1") \ - "1: \n\t" \ - _op _suffix " %6; " \ - "2: \n\t" \ - _POST_EFLAGS("0", "5", "1") \ - ".pushsection .fixup,\"ax\" \n\t" \ - "3: movb $1, %4 \n\t" \ - "jmp 2b \n\t" \ - ".popsection \n\t" \ - _ASM_EXTABLE(1b, 3b) \ - : "=m" ((ctxt)->eflags), "=&r" (_tmp), \ - "+a" (*rax), "+d" (*rdx), "+qm"(_ex) \ - : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val)); \ - } while (0) - -/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ -#define emulate_1op_rax_rdx(ctxt, _op, _ex) \ - do { \ - switch((ctxt)->src.bytes) { \ - case 1: \ - __emulate_1op_rax_rdx(ctxt, _op, "b", _ex); \ - break; \ - case 2: \ - __emulate_1op_rax_rdx(ctxt, _op, "w", _ex); \ - break; \ - case 4: \ - __emulate_1op_rax_rdx(ctxt, _op, "l", _ex); \ - break; \ - case 8: ON64( \ - __emulate_1op_rax_rdx(ctxt, _op, "q", _ex)); \ - break; \ - } \ - } while (0) - static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, enum x86_intercept intercept, enum x86_intercept_stage stage) @@ -988,6 +823,11 @@ FASTOP2(xor); FASTOP2(cmp); FASTOP2(test); +FASTOP1SRC2(mul, mul_ex); +FASTOP1SRC2(imul, imul_ex); +FASTOP1SRC2EX(div, div_ex); +FASTOP1SRC2EX(idiv, idiv_ex); + FASTOP3WCL(shld); FASTOP3WCL(shrd); @@ -1013,6 +853,8 @@ FASTOP2W(bts); FASTOP2W(btr); FASTOP2W(btc); +FASTOP2(xadd); + static u8 test_cc(unsigned int condition, unsigned long flags) { u8 rc; @@ -1726,45 +1568,42 @@ static void write_register_operand(struct operand *op) } } -static int writeback(struct x86_emulate_ctxt *ctxt) +static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) { int rc; - if (ctxt->d & NoWrite) - return X86EMUL_CONTINUE; - - switch (ctxt->dst.type) { + switch (op->type) { case OP_REG: - write_register_operand(&ctxt->dst); + write_register_operand(op); break; case OP_MEM: if (ctxt->lock_prefix) rc = segmented_cmpxchg(ctxt, - ctxt->dst.addr.mem, - &ctxt->dst.orig_val, - &ctxt->dst.val, - ctxt->dst.bytes); + op->addr.mem, + &op->orig_val, + &op->val, + op->bytes); else rc = segmented_write(ctxt, - ctxt->dst.addr.mem, - &ctxt->dst.val, - ctxt->dst.bytes); + op->addr.mem, + &op->val, + op->bytes); if (rc != X86EMUL_CONTINUE) return rc; break; case OP_MEM_STR: rc = segmented_write(ctxt, - ctxt->dst.addr.mem, - ctxt->dst.data, - ctxt->dst.bytes * ctxt->dst.count); + op->addr.mem, + op->data, + op->bytes * op->count); if (rc != X86EMUL_CONTINUE) return rc; break; case OP_XMM: - write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm); + write_sse_reg(ctxt, &op->vec_val, op->addr.xmm); break; case OP_MM: - write_mmx_reg(ctxt, &ctxt->dst.mm_val, ctxt->dst.addr.mm); + write_mmx_reg(ctxt, &op->mm_val, op->addr.mm); break; case OP_NONE: /* no writeback */ @@ -2117,42 +1956,6 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_mul_ex(struct x86_emulate_ctxt *ctxt) -{ - u8 ex = 0; - - emulate_1op_rax_rdx(ctxt, "mul", ex); - return X86EMUL_CONTINUE; -} - -static int em_imul_ex(struct x86_emulate_ctxt *ctxt) -{ - u8 ex = 0; - - emulate_1op_rax_rdx(ctxt, "imul", ex); - return X86EMUL_CONTINUE; -} - -static int em_div_ex(struct x86_emulate_ctxt *ctxt) -{ - u8 de = 0; - - emulate_1op_rax_rdx(ctxt, "div", de); - if (de) - return emulate_de(ctxt); - return X86EMUL_CONTINUE; -} - -static int em_idiv_ex(struct x86_emulate_ctxt *ctxt) -{ - u8 de = 0; - - emulate_1op_rax_rdx(ctxt, "idiv", de); - if (de) - return emulate_de(ctxt); - return X86EMUL_CONTINUE; -} - static int em_grp45(struct x86_emulate_ctxt *ctxt) { int rc = X86EMUL_CONTINUE; @@ -2222,6 +2025,17 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) return rc; } +static int em_ret_far_imm(struct x86_emulate_ctxt *ctxt) +{ + int rc; + + rc = em_ret_far(ctxt); + if (rc != X86EMUL_CONTINUE) + return rc; + rsp_increment(ctxt, ctxt->src.val); + return X86EMUL_CONTINUE; +} + static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) { /* Save real source value, then compare EAX against destination. */ @@ -3734,10 +3548,10 @@ static const struct opcode group3[] = { F(DstMem | SrcImm | NoWrite, em_test), F(DstMem | SrcNone | Lock, em_not), F(DstMem | SrcNone | Lock, em_neg), - I(SrcMem, em_mul_ex), - I(SrcMem, em_imul_ex), - I(SrcMem, em_div_ex), - I(SrcMem, em_idiv_ex), + F(DstXacc | Src2Mem, em_mul_ex), + F(DstXacc | Src2Mem, em_imul_ex), + F(DstXacc | Src2Mem, em_div_ex), + F(DstXacc | Src2Mem, em_idiv_ex), }; static const struct opcode group4[] = { @@ -3960,7 +3774,8 @@ static const struct opcode opcode_table[256] = { G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave), - N, I(ImplicitOps | Stack, em_ret_far), + I(ImplicitOps | Stack | SrcImmU16, em_ret_far_imm), + I(ImplicitOps | Stack, em_ret_far), D(ImplicitOps), DI(SrcImmByte, intn), D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), /* 0xD0 - 0xD7 */ @@ -4064,7 +3879,7 @@ static const struct opcode twobyte_table[256] = { F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr), D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xC7 */ - D2bv(DstMem | SrcReg | ModRM | Lock), + F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd), N, D(DstMem | SrcReg | ModRM | Mov), N, N, N, GD(0, &group9), /* 0xC8 - 0xCF */ @@ -4172,6 +3987,24 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, fetch_register_operand(op); op->orig_val = op->val; break; + case OpAccLo: + op->type = OP_REG; + op->bytes = (ctxt->d & ByteOp) ? 2 : ctxt->op_bytes; + op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); + fetch_register_operand(op); + op->orig_val = op->val; + break; + case OpAccHi: + if (ctxt->d & ByteOp) { + op->type = OP_NONE; + break; + } + op->type = OP_REG; + op->bytes = ctxt->op_bytes; + op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX); + fetch_register_operand(op); + op->orig_val = op->val; + break; case OpDI: op->type = OP_MEM; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; @@ -4553,11 +4386,15 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) { ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; - fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; + if (!(ctxt->d & ByteOp)) + fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n" - : "+a"(ctxt->dst.val), "+b"(ctxt->src.val), [flags]"+D"(flags) - : "c"(ctxt->src2.val), [fastop]"S"(fop)); + : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags), + [fastop]"+S"(fop) + : "c"(ctxt->src2.val)); ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); + if (!fop) /* exception is returned in fop variable */ + return emulate_de(ctxt); return X86EMUL_CONTINUE; } @@ -4773,9 +4610,17 @@ special_insn: goto done; writeback: - rc = writeback(ctxt); - if (rc != X86EMUL_CONTINUE) - goto done; + if (!(ctxt->d & NoWrite)) { + rc = writeback(ctxt, &ctxt->dst); + if (rc != X86EMUL_CONTINUE) + goto done; + } + if (ctxt->d & SrcWrite) { + BUG_ON(ctxt->src.type == OP_MEM || ctxt->src.type == OP_MEM_STR); + rc = writeback(ctxt, &ctxt->src); + if (rc != X86EMUL_CONTINUE) + goto done; + } /* * restore dst type in case the decoding will be reused @@ -4872,12 +4717,6 @@ twobyte_insn: ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val : (s16) ctxt->src.val; break; - case 0xc0 ... 0xc1: /* xadd */ - fastop(ctxt, em_add); - /* Write back the register source. */ - ctxt->src.val = ctxt->dst.orig_val; - write_register_operand(&ctxt->src); - break; case 0xc3: /* movnti */ ctxt->dst.bytes = ctxt->op_bytes; ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val : diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0eee2c8b64d1..5439117d5c4c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -79,16 +79,6 @@ static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) *((u32 *) (apic->regs + reg_off)) = val; } -static inline int apic_test_and_set_vector(int vec, void *bitmap) -{ - return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); -} - -static inline int apic_test_and_clear_vector(int vec, void *bitmap) -{ - return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); -} - static inline int apic_test_vector(int vec, void *bitmap) { return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); @@ -331,10 +321,10 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) } EXPORT_SYMBOL_GPL(kvm_apic_update_irr); -static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) +static inline void apic_set_irr(int vec, struct kvm_lapic *apic) { apic->irr_pending = true; - return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); + apic_set_vector(vec, apic->regs + APIC_IRR); } static inline int apic_search_irr(struct kvm_lapic *apic) @@ -681,32 +671,28 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (unlikely(!apic_enabled(apic))) break; + result = 1; + if (dest_map) __set_bit(vcpu->vcpu_id, dest_map); - if (kvm_x86_ops->deliver_posted_interrupt) { - result = 1; + if (kvm_x86_ops->deliver_posted_interrupt) kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); - } else { - result = !apic_test_and_set_irr(vector, apic); - - if (!result) { - if (trig_mode) - apic_debug("level trig mode repeatedly " - "for vector %d", vector); - goto out; - } + else { + apic_set_irr(vector, apic); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } -out: trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, - trig_mode, vector, !result); + trig_mode, vector, false); break; case APIC_DM_REMRD: - apic_debug("Ignoring delivery mode 3\n"); + result = 1; + vcpu->arch.pv.pv_unhalted = 1; + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); break; case APIC_DM_SMI: @@ -1608,8 +1594,8 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) return; if (atomic_read(&apic->lapic_timer.pending) > 0) { - if (kvm_apic_local_deliver(apic, APIC_LVTT)) - atomic_dec(&apic->lapic_timer.pending); + kvm_apic_local_deliver(apic, APIC_LVTT); + atomic_set(&apic->lapic_timer.pending, 0); } } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 004cc87b781c..dce0df8150df 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -132,8 +132,8 @@ module_param(dbg, bool, 0644); (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ * PT32_LEVEL_BITS))) - 1)) -#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ - | PT64_NX_MASK) +#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ + | shadow_x_mask | shadow_nx_mask) #define ACC_EXEC_MASK 1 #define ACC_WRITE_MASK PT_WRITABLE_MASK @@ -197,15 +197,63 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); -static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access) +/* + * spte bits of bit 3 ~ bit 11 are used as low 9 bits of generation number, + * the bits of bits 52 ~ bit 61 are used as high 10 bits of generation + * number. + */ +#define MMIO_SPTE_GEN_LOW_SHIFT 3 +#define MMIO_SPTE_GEN_HIGH_SHIFT 52 + +#define MMIO_GEN_SHIFT 19 +#define MMIO_GEN_LOW_SHIFT 9 +#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 1) +#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1) +#define MMIO_MAX_GEN ((1 << MMIO_GEN_SHIFT) - 1) + +static u64 generation_mmio_spte_mask(unsigned int gen) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); + u64 mask; + + WARN_ON(gen > MMIO_MAX_GEN); + + mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT; + mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT; + return mask; +} + +static unsigned int get_mmio_spte_generation(u64 spte) +{ + unsigned int gen; + + spte &= ~shadow_mmio_mask; + + gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK; + gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT; + return gen; +} + +static unsigned int kvm_current_mmio_generation(struct kvm *kvm) +{ + /* + * Init kvm generation close to MMIO_MAX_GEN to easily test the + * code of handling generation number wrap-around. + */ + return (kvm_memslots(kvm)->generation + + MMIO_MAX_GEN - 150) & MMIO_GEN_MASK; +} + +static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn, + unsigned access) +{ + unsigned int gen = kvm_current_mmio_generation(kvm); + u64 mask = generation_mmio_spte_mask(gen); access &= ACC_WRITE_MASK | ACC_USER_MASK; + mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT; - sp->mmio_cached = true; - trace_mark_mmio_spte(sptep, gfn, access); - mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT); + trace_mark_mmio_spte(sptep, gfn, access, gen); + mmu_spte_set(sptep, mask); } static bool is_mmio_spte(u64 spte) @@ -215,24 +263,38 @@ static bool is_mmio_spte(u64 spte) static gfn_t get_mmio_spte_gfn(u64 spte) { - return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT; + u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask; + return (spte & ~mask) >> PAGE_SHIFT; } static unsigned get_mmio_spte_access(u64 spte) { - return (spte & ~shadow_mmio_mask) & ~PAGE_MASK; + u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask; + return (spte & ~mask) & ~PAGE_MASK; } -static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access) +static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, + pfn_t pfn, unsigned access) { if (unlikely(is_noslot_pfn(pfn))) { - mark_mmio_spte(sptep, gfn, access); + mark_mmio_spte(kvm, sptep, gfn, access); return true; } return false; } +static bool check_mmio_spte(struct kvm *kvm, u64 spte) +{ + unsigned int kvm_gen, spte_gen; + + kvm_gen = kvm_current_mmio_generation(kvm); + spte_gen = get_mmio_spte_generation(spte); + + trace_check_mmio_spte(spte, kvm_gen, spte_gen); + return likely(kvm_gen == spte_gen); +} + static inline u64 rsvd_bits(int s, int e) { return ((1ULL << (e - s + 1)) - 1) << s; @@ -269,11 +331,6 @@ static int is_large_pte(u64 pte) return pte & PT_PAGE_SIZE_MASK; } -static int is_dirty_gpte(unsigned long pte) -{ - return pte & PT_DIRTY_MASK; -} - static int is_rmap_spte(u64 pte) { return is_shadow_present_pte(pte); @@ -404,9 +461,20 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) /* * The idea using the light way get the spte on x86_32 guest is from * gup_get_pte(arch/x86/mm/gup.c). - * The difference is we can not catch the spte tlb flush if we leave - * guest mode, so we emulate it by increase clear_spte_count when spte - * is cleared. + * + * An spte tlb flush may be pending, because kvm_set_pte_rmapp + * coalesces them and we are running out of the MMU lock. Therefore + * we need to protect against in-progress updates of the spte. + * + * Reading the spte while an update is in progress may get the old value + * for the high part of the spte. The race is fine for a present->non-present + * change (because the high part of the spte is ignored for non-present spte), + * but for a present->present change we must reread the spte. + * + * All such changes are done in two steps (present->non-present and + * non-present->present), hence it is enough to count the number of + * present->non-present updates: if it changed while reading the spte, + * we might have hit the race. This is done using clear_spte_count. */ static u64 __get_spte_lockless(u64 *sptep) { @@ -1511,6 +1579,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, if (!direct) sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + + /* + * The active_mmu_pages list is the FIFO list, do not move the + * page until it is zapped. kvm_zap_obsolete_pages depends on + * this feature. See the comments in kvm_zap_obsolete_pages(). + */ list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); sp->parent_ptes = 0; mmu_page_add_parent_pte(vcpu, sp, parent_pte); @@ -1648,6 +1722,16 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); +/* + * NOTE: we should pay more attention on the zapped-obsolete page + * (is_obsolete_sp(sp) && sp->role.invalid) when you do hash list walk + * since it has been deleted from active_mmu_pages but still can be found + * at hast list. + * + * for_each_gfn_indirect_valid_sp has skipped that kind of page and + * kvm_mmu_get_page(), the only user of for_each_gfn_sp(), has skipped + * all the obsolete pages. + */ #define for_each_gfn_sp(_kvm, _sp, _gfn) \ hlist_for_each_entry(_sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ @@ -1838,6 +1922,11 @@ static void clear_sp_write_flooding_count(u64 *spte) __clear_sp_write_flooding_count(sp); } +static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); +} + static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gaddr, @@ -1864,6 +1953,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, role.quadrant = quadrant; } for_each_gfn_sp(vcpu->kvm, sp, gfn) { + if (is_obsolete_sp(vcpu->kvm, sp)) + continue; + if (!need_sync && sp->unsync) need_sync = true; @@ -1900,6 +1992,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, account_shadowed(vcpu->kvm, gfn); } + sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; init_shadow_page_table(sp); trace_kvm_mmu_get_page(sp, true); return sp; @@ -1954,12 +2047,18 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) return __shadow_walk_next(iterator, *iterator->sptep); } -static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) +static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed) { u64 spte; + BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK || + VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); + spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask | shadow_accessed_mask; + shadow_user_mask | shadow_x_mask; + + if (accessed) + spte |= shadow_accessed_mask; mmu_spte_set(sptep, spte); } @@ -2070,8 +2169,10 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, ret = mmu_zap_unsync_children(kvm, sp, invalid_list); kvm_mmu_page_unlink_children(kvm, sp); kvm_mmu_unlink_parents(kvm, sp); + if (!sp->role.invalid && !sp->role.direct) unaccount_shadowed(kvm, sp->gfn); + if (sp->unsync) kvm_unlink_unsync_page(kvm, sp); if (!sp->root_count) { @@ -2081,7 +2182,13 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, kvm_mod_used_mmu_pages(kvm, -1); } else { list_move(&sp->link, &kvm->arch.active_mmu_pages); - kvm_reload_remote_mmus(kvm); + + /* + * The obsolete pages can not be used on any vcpus. + * See the comments in kvm_mmu_invalidate_zap_all_pages(). + */ + if (!sp->role.invalid && !is_obsolete_sp(kvm, sp)) + kvm_reload_remote_mmus(kvm); } sp->role.invalid = 1; @@ -2331,7 +2438,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte; int ret = 0; - if (set_mmio_spte(sptep, gfn, pfn, pte_access)) + if (set_mmio_spte(vcpu->kvm, sptep, gfn, pfn, pte_access)) return 0; spte = PT_PRESENT_MASK; @@ -2468,14 +2575,6 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) mmu_free_roots(vcpu); } -static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) -{ - int bit7; - - bit7 = (gpte >> 7) & 1; - return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; -} - static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { @@ -2488,26 +2587,6 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, return gfn_to_pfn_memslot_atomic(slot, gfn); } -static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, u64 *spte, - u64 gpte) -{ - if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) - goto no_present; - - if (!is_present_gpte(gpte)) - goto no_present; - - if (!(gpte & PT_ACCESSED_MASK)) - goto no_present; - - return false; - -no_present: - drop_spte(vcpu->kvm, spte); - return true; -} - static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *start, u64 *end) @@ -2604,7 +2683,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, iterator.level - 1, 1, ACC_ALL, iterator.sptep); - link_shadow_page(iterator.sptep, sp); + link_shadow_page(iterator.sptep, sp, true); } } return emulate; @@ -2702,9 +2781,16 @@ exit: return ret; } -static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code) +static bool page_fault_can_be_fast(u32 error_code) { /* + * Do not fix the mmio spte with invalid generation number which + * need to be updated by slow page fault path. + */ + if (unlikely(error_code & PFERR_RSVD_MASK)) + return false; + + /* * #PF can be fast only if the shadow page table is present and it * is caused by write-protect, that means we just need change the * W bit of the spte which can be done out of mmu-lock. @@ -2748,7 +2834,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, bool ret = false; u64 spte = 0ull; - if (!page_fault_can_be_fast(vcpu, error_code)) + if (!page_fault_can_be_fast(error_code)) return false; walk_shadow_page_lockless_begin(vcpu); @@ -2869,22 +2955,25 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; - spin_lock(&vcpu->kvm->mmu_lock); + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL && (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL || vcpu->arch.mmu.direct_map)) { hpa_t root = vcpu->arch.mmu.root_hpa; + spin_lock(&vcpu->kvm->mmu_lock); sp = page_header(root); --sp->root_count; if (!sp->root_count && sp->role.invalid) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); } - vcpu->arch.mmu.root_hpa = INVALID_PAGE; spin_unlock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.root_hpa = INVALID_PAGE; return; } + + spin_lock(&vcpu->kvm->mmu_lock); for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; @@ -3093,6 +3182,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) mmu_sync_roots(vcpu); spin_unlock(&vcpu->kvm->mmu_lock); } +EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots); static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, struct x86_exception *exception) @@ -3148,17 +3238,12 @@ static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr) return spte; } -/* - * If it is a real mmio page fault, return 1 and emulat the instruction - * directly, return 0 to let CPU fault again on the address, -1 is - * returned if bug is detected. - */ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) { u64 spte; if (quickly_check_mmio_pf(vcpu, addr, direct)) - return 1; + return RET_MMIO_PF_EMULATE; spte = walk_shadow_page_get_mmio_spte(vcpu, addr); @@ -3166,12 +3251,15 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) gfn_t gfn = get_mmio_spte_gfn(spte); unsigned access = get_mmio_spte_access(spte); + if (!check_mmio_spte(vcpu->kvm, spte)) + return RET_MMIO_PF_INVALID; + if (direct) addr = 0; trace_handle_mmio_page_fault(addr, gfn, access); vcpu_cache_mmio_info(vcpu, addr, gfn, access); - return 1; + return RET_MMIO_PF_EMULATE; } /* @@ -3179,13 +3267,13 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) * it's a BUG if the gfn is not a mmio page. */ if (direct && !check_direct_spte_mmio_pf(spte)) - return -1; + return RET_MMIO_PF_BUG; /* * If the page table is zapped by other cpus, let CPU fault again on * the address. */ - return 0; + return RET_MMIO_PF_RETRY; } EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common); @@ -3195,7 +3283,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, int ret; ret = handle_mmio_page_fault_common(vcpu, addr, direct); - WARN_ON(ret < 0); + WARN_ON(ret == RET_MMIO_PF_BUG); return ret; } @@ -3207,8 +3295,12 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); - if (unlikely(error_code & PFERR_RSVD_MASK)) - return handle_mmio_page_fault(vcpu, gva, error_code, true); + if (unlikely(error_code & PFERR_RSVD_MASK)) { + r = handle_mmio_page_fault(vcpu, gva, error_code, true); + + if (likely(r != RET_MMIO_PF_INVALID)) + return r; + } r = mmu_topup_memory_caches(vcpu); if (r) @@ -3284,8 +3376,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, ASSERT(vcpu); ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); - if (unlikely(error_code & PFERR_RSVD_MASK)) - return handle_mmio_page_fault(vcpu, gpa, error_code, true); + if (unlikely(error_code & PFERR_RSVD_MASK)) { + r = handle_mmio_page_fault(vcpu, gpa, error_code, true); + + if (likely(r != RET_MMIO_PF_INVALID)) + return r; + } r = mmu_topup_memory_caches(vcpu); if (r) @@ -3356,6 +3452,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) ++vcpu->stat.tlb_flush; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } +EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb); static void paging_new_cr3(struct kvm_vcpu *vcpu) { @@ -3379,20 +3476,8 @@ static void paging_free(struct kvm_vcpu *vcpu) nonpaging_free(vcpu); } -static inline void protect_clean_gpte(unsigned *access, unsigned gpte) -{ - unsigned mask; - - BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); - - mask = (unsigned)~ACC_WRITE_MASK; - /* Allow write access to dirty gptes */ - mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK; - *access &= mask; -} - -static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, - int *nr_present) +static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, + unsigned access, int *nr_present) { if (unlikely(is_mmio_spte(*sptep))) { if (gfn != get_mmio_spte_gfn(*sptep)) { @@ -3401,23 +3486,13 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, } (*nr_present)++; - mark_mmio_spte(sptep, gfn, access); + mark_mmio_spte(kvm, sptep, gfn, access); return true; } return false; } -static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte) -{ - unsigned access; - - access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; - access &= ~(gpte >> PT64_NX_SHIFT); - - return access; -} - static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) { unsigned index; @@ -3427,6 +3502,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp return mmu->last_pte_bitmap & (1 << index); } +#define PTTYPE_EPT 18 /* arbitrary */ +#define PTTYPE PTTYPE_EPT +#include "paging_tmpl.h" +#undef PTTYPE + #define PTTYPE 64 #include "paging_tmpl.h" #undef PTTYPE @@ -3441,6 +3521,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; + context->bad_mt_xwr = 0; + if (!context->nx) exb_bit_rsvd = rsvd_bits(63, 63); switch (context->root_level) { @@ -3496,7 +3578,40 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, } } -static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, bool execonly) +{ + int maxphyaddr = cpuid_maxphyaddr(vcpu); + int pte; + + context->rsvd_bits_mask[0][3] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); + context->rsvd_bits_mask[0][2] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); + context->rsvd_bits_mask[0][1] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); + context->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); + + /* large page */ + context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; + context->rsvd_bits_mask[1][2] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29); + context->rsvd_bits_mask[1][1] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + + for (pte = 0; pte < 64; pte++) { + int rwx_bits = pte & 7; + int mt = pte >> 3; + if (mt == 0x2 || mt == 0x3 || mt == 0x7 || + rwx_bits == 0x2 || rwx_bits == 0x6 || + (rwx_bits == 0x4 && !execonly)) + context->bad_mt_xwr |= (1ull << pte); + } +} + +static void update_permission_bitmask(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu, bool ept) { unsigned bit, byte, pfec; u8 map; @@ -3514,12 +3629,16 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu w = bit & ACC_WRITE_MASK; u = bit & ACC_USER_MASK; - /* Not really needed: !nx will cause pte.nx to fault */ - x |= !mmu->nx; - /* Allow supervisor writes if !cr0.wp */ - w |= !is_write_protection(vcpu) && !uf; - /* Disallow supervisor fetches of user code if cr4.smep */ - x &= !(smep && u && !uf); + if (!ept) { + /* Not really needed: !nx will cause pte.nx to fault */ + x |= !mmu->nx; + /* Allow supervisor writes if !cr0.wp */ + w |= !is_write_protection(vcpu) && !uf; + /* Disallow supervisor fetches of user code if cr4.smep */ + x &= !(smep && u && !uf); + } else + /* Not really needed: no U/S accesses on ept */ + u = 1; fault = (ff && !x) || (uf && !u) || (wf && !w); map |= fault << bit; @@ -3554,7 +3673,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, context->root_level = level; reset_rsvds_bits_mask(vcpu, context); - update_permission_bitmask(vcpu, context); + update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); ASSERT(is_pae(vcpu)); @@ -3584,7 +3703,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, context->root_level = PT32_ROOT_LEVEL; reset_rsvds_bits_mask(vcpu, context); - update_permission_bitmask(vcpu, context); + update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); context->new_cr3 = paging_new_cr3; @@ -3646,7 +3765,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->gva_to_gpa = paging32_gva_to_gpa; } - update_permission_bitmask(vcpu, context); + update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); return 0; @@ -3678,6 +3797,33 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); +int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, + bool execonly) +{ + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + context->shadow_root_level = kvm_x86_ops->get_tdp_level(); + + context->nx = true; + context->new_cr3 = paging_new_cr3; + context->page_fault = ept_page_fault; + context->gva_to_gpa = ept_gva_to_gpa; + context->sync_page = ept_sync_page; + context->invlpg = ept_invlpg; + context->update_pte = ept_update_pte; + context->free = paging_free; + context->root_level = context->shadow_root_level; + context->root_hpa = INVALID_PAGE; + context->direct_map = false; + + update_permission_bitmask(vcpu, context, true); + reset_rsvds_bits_mask_ept(vcpu, context, execonly); + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); + static int init_kvm_softmmu(struct kvm_vcpu *vcpu) { int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); @@ -3725,7 +3871,7 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) g_context->gva_to_gpa = paging32_gva_to_gpa_nested; } - update_permission_bitmask(vcpu, g_context); + update_permission_bitmask(vcpu, g_context, false); update_last_pte_bitmap(vcpu, g_context); return 0; @@ -3764,9 +3910,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) if (r) goto out; r = mmu_alloc_roots(vcpu); - spin_lock(&vcpu->kvm->mmu_lock); - mmu_sync_roots(vcpu); - spin_unlock(&vcpu->kvm->mmu_lock); + kvm_mmu_sync_roots(vcpu); if (r) goto out; /* set_cr3() should ensure TLB has been flushed */ @@ -3803,8 +3947,8 @@ static bool need_remote_flush(u64 old, u64 new) return true; if ((old ^ new) & PT64_BASE_ADDR_MASK) return true; - old ^= PT64_NX_MASK; - new ^= PT64_NX_MASK; + old ^= shadow_nx_mask; + new ^= shadow_nx_mask; return (old & ~new & PT64_PERM_MASK) != 0; } @@ -4062,7 +4206,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, switch (er) { case EMULATE_DONE: return 1; - case EMULATE_DO_MMIO: + case EMULATE_USER_EXIT: ++vcpu->stat.mmio_exits; /* fall through */ case EMULATE_FAIL: @@ -4179,46 +4323,110 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) spin_unlock(&kvm->mmu_lock); } -void kvm_mmu_zap_all(struct kvm *kvm) +#define BATCH_ZAP_PAGES 10 +static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; - LIST_HEAD(invalid_list); + int batch = 0; - spin_lock(&kvm->mmu_lock); restart: - list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) + list_for_each_entry_safe_reverse(sp, node, + &kvm->arch.active_mmu_pages, link) { + int ret; + + /* + * No obsolete page exists before new created page since + * active_mmu_pages is the FIFO list. + */ + if (!is_obsolete_sp(kvm, sp)) + break; + + /* + * Since we are reversely walking the list and the invalid + * list will be moved to the head, skip the invalid page + * can help us to avoid the infinity list walking. + */ + if (sp->role.invalid) + continue; + + /* + * Need not flush tlb since we only zap the sp with invalid + * generation number. + */ + if (batch >= BATCH_ZAP_PAGES && + cond_resched_lock(&kvm->mmu_lock)) { + batch = 0; goto restart; + } - kvm_mmu_commit_zap_page(kvm, &invalid_list); - spin_unlock(&kvm->mmu_lock); + ret = kvm_mmu_prepare_zap_page(kvm, sp, + &kvm->arch.zapped_obsolete_pages); + batch += ret; + + if (ret) + goto restart; + } + + /* + * Should flush tlb before free page tables since lockless-walking + * may use the pages. + */ + kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); } -void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) +/* + * Fast invalidate all shadow pages and use lock-break technique + * to zap obsolete pages. + * + * It's required when memslot is being deleted or VM is being + * destroyed, in these cases, we should ensure that KVM MMU does + * not use any resource of the being-deleted slot or all slots + * after calling the function. + */ +void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm) { - struct kvm_mmu_page *sp, *node; - LIST_HEAD(invalid_list); - spin_lock(&kvm->mmu_lock); -restart: - list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { - if (!sp->mmio_cached) - continue; - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) - goto restart; - } + trace_kvm_mmu_invalidate_zap_all_pages(kvm); + kvm->arch.mmu_valid_gen++; - kvm_mmu_commit_zap_page(kvm, &invalid_list); + /* + * Notify all vcpus to reload its shadow page table + * and flush TLB. Then all vcpus will switch to new + * shadow page table with the new mmu_valid_gen. + * + * Note: we should do this under the protection of + * mmu-lock, otherwise, vcpu would purge shadow page + * but miss tlb flush. + */ + kvm_reload_remote_mmus(kvm); + + kvm_zap_obsolete_pages(kvm); spin_unlock(&kvm->mmu_lock); } -static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) +static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) +{ + return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); +} + +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm) +{ + /* + * The very rare case: if the generation-number is round, + * zap all shadow pages. + */ + if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) { + printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n"); + kvm_mmu_invalidate_zap_all_pages(kvm); + } +} + +static unsigned long +mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { struct kvm *kvm; int nr_to_scan = sc->nr_to_scan; - - if (nr_to_scan == 0) - goto out; + unsigned long freed = 0; raw_spin_lock(&kvm_lock); @@ -4240,30 +4448,50 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) * want to shrink a VM that only started to populate its MMU * anyway. */ - if (!kvm->arch.n_used_mmu_pages) + if (!kvm->arch.n_used_mmu_pages && + !kvm_has_zapped_obsolete_pages(kvm)) continue; idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); - prepare_zap_oldest_mmu_page(kvm, &invalid_list); + if (kvm_has_zapped_obsolete_pages(kvm)) { + kvm_mmu_commit_zap_page(kvm, + &kvm->arch.zapped_obsolete_pages); + goto unlock; + } + + if (prepare_zap_oldest_mmu_page(kvm, &invalid_list)) + freed++; kvm_mmu_commit_zap_page(kvm, &invalid_list); +unlock: spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); + /* + * unfair on small ones + * per-vm shrinkers cry out + * sadness comes quickly + */ list_move_tail(&kvm->vm_list, &vm_list); break; } raw_spin_unlock(&kvm_lock); + return freed; -out: +} + +static unsigned long +mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ return percpu_counter_read_positive(&kvm_total_used_mmu_pages); } static struct shrinker mmu_shrinker = { - .shrink = mmu_shrink, + .count_objects = mmu_shrink_count, + .scan_objects = mmu_shrink_scan, .seeks = DEFAULT_SEEKS * 10, }; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 2adcbc2cac6d..77e044a0f5f7 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -52,8 +52,27 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); + +/* + * Return values of handle_mmio_page_fault_common: + * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction + * directly. + * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page + * fault path update the mmio spte. + * RET_MMIO_PF_RETRY: let CPU fault again on the address. + * RET_MMIO_PF_BUG: bug is detected. + */ +enum { + RET_MMIO_PF_EMULATE = 1, + RET_MMIO_PF_INVALID = 2, + RET_MMIO_PF_RETRY = 0, + RET_MMIO_PF_BUG = -1 +}; + int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); +int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, + bool execonly); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { @@ -97,4 +116,5 @@ static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access, return (mmu->permissions[pfec >> 1] >> pte_access) & 1; } +void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); #endif diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index b8f6172f4174..9d2e0ffcb190 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -7,16 +7,18 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kvmmmu -#define KVM_MMU_PAGE_FIELDS \ - __field(__u64, gfn) \ - __field(__u32, role) \ - __field(__u32, root_count) \ +#define KVM_MMU_PAGE_FIELDS \ + __field(unsigned long, mmu_valid_gen) \ + __field(__u64, gfn) \ + __field(__u32, role) \ + __field(__u32, root_count) \ __field(bool, unsync) -#define KVM_MMU_PAGE_ASSIGN(sp) \ - __entry->gfn = sp->gfn; \ - __entry->role = sp->role.word; \ - __entry->root_count = sp->root_count; \ +#define KVM_MMU_PAGE_ASSIGN(sp) \ + __entry->mmu_valid_gen = sp->mmu_valid_gen; \ + __entry->gfn = sp->gfn; \ + __entry->role = sp->role.word; \ + __entry->root_count = sp->root_count; \ __entry->unsync = sp->unsync; #define KVM_MMU_PAGE_PRINTK() ({ \ @@ -28,8 +30,8 @@ \ role.word = __entry->role; \ \ - trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s" \ - " %snxe root %u %s%c", \ + trace_seq_printf(p, "sp gen %lx gfn %llx %u%s q%u%s %s%s" \ + " %snxe root %u %s%c", __entry->mmu_valid_gen, \ __entry->gfn, role.level, \ role.cr4_pae ? " pae" : "", \ role.quadrant, \ @@ -197,23 +199,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, TRACE_EVENT( mark_mmio_spte, - TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access), - TP_ARGS(sptep, gfn, access), + TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access, unsigned int gen), + TP_ARGS(sptep, gfn, access, gen), TP_STRUCT__entry( __field(void *, sptep) __field(gfn_t, gfn) __field(unsigned, access) + __field(unsigned int, gen) ), TP_fast_assign( __entry->sptep = sptep; __entry->gfn = gfn; __entry->access = access; + __entry->gen = gen; ), - TP_printk("sptep:%p gfn %llx access %x", __entry->sptep, __entry->gfn, - __entry->access) + TP_printk("sptep:%p gfn %llx access %x gen %x", __entry->sptep, + __entry->gfn, __entry->access, __entry->gen) ); TRACE_EVENT( @@ -274,6 +278,50 @@ TRACE_EVENT( __spte_satisfied(old_spte), __spte_satisfied(new_spte) ) ); + +TRACE_EVENT( + kvm_mmu_invalidate_zap_all_pages, + TP_PROTO(struct kvm *kvm), + TP_ARGS(kvm), + + TP_STRUCT__entry( + __field(unsigned long, mmu_valid_gen) + __field(unsigned int, mmu_used_pages) + ), + + TP_fast_assign( + __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen; + __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages; + ), + + TP_printk("kvm-mmu-valid-gen %lx used_pages %x", + __entry->mmu_valid_gen, __entry->mmu_used_pages + ) +); + + +TRACE_EVENT( + check_mmio_spte, + TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen), + TP_ARGS(spte, kvm_gen, spte_gen), + + TP_STRUCT__entry( + __field(unsigned int, kvm_gen) + __field(unsigned int, spte_gen) + __field(u64, spte) + ), + + TP_fast_assign( + __entry->kvm_gen = kvm_gen; + __entry->spte_gen = spte_gen; + __entry->spte = spte; + ), + + TP_printk("spte %llx kvm_gen %x spte-gen %x valid %d", __entry->spte, + __entry->kvm_gen, __entry->spte_gen, + __entry->kvm_gen == __entry->spte_gen + ) +); #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index da20860b457a..ad75d77999d0 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -23,6 +23,13 @@ * so the code in this file is compiled twice, once per pte size. */ +/* + * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro + * uses for EPT without A/D paging type. + */ +extern u64 __pure __using_nonexistent_pte_bit(void) + __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT"); + #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 @@ -32,6 +39,10 @@ #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK + #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK + #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT + #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS 4 #define CMPXCHG cmpxchg @@ -49,7 +60,26 @@ #define PT_INDEX(addr, level) PT32_INDEX(addr, level) #define PT_LEVEL_BITS PT32_LEVEL_BITS #define PT_MAX_FULL_LEVELS 2 + #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK + #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK + #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT + #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define CMPXCHG cmpxchg +#elif PTTYPE == PTTYPE_EPT + #define pt_element_t u64 + #define guest_walker guest_walkerEPT + #define FNAME(name) ept_##name + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK + #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) + #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) + #define PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_GUEST_ACCESSED_MASK 0 + #define PT_GUEST_DIRTY_MASK 0 + #define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit() + #define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit() + #define CMPXCHG cmpxchg64 + #define PT_MAX_FULL_LEVELS 4 #else #error Invalid PTTYPE value #endif @@ -69,6 +99,7 @@ struct guest_walker { pt_element_t prefetch_ptes[PTE_PREFETCH_NUM]; gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS]; + bool pte_writable[PT_MAX_FULL_LEVELS]; unsigned pt_access; unsigned pte_access; gfn_t gfn; @@ -80,6 +111,40 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; } +static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte) +{ + unsigned mask; + + /* dirty bit is not supported, so no need to track it */ + if (!PT_GUEST_DIRTY_MASK) + return; + + BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); + + mask = (unsigned)~ACC_WRITE_MASK; + /* Allow write access to dirty gptes */ + mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & + PT_WRITABLE_MASK; + *access &= mask; +} + +static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level) +{ + int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f; + + return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) | + ((mmu->bad_mt_xwr & (1ull << low6)) != 0); +} + +static inline int FNAME(is_present_gpte)(unsigned long pte) +{ +#if PTTYPE != PTTYPE_EPT + return is_present_gpte(pte); +#else + return pte & 7; +#endif +} + static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pt_element_t __user *ptep_user, unsigned index, pt_element_t orig_pte, pt_element_t new_pte) @@ -103,6 +168,42 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return (ret != orig_pte); } +static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *spte, + u64 gpte) +{ + if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + goto no_present; + + if (!FNAME(is_present_gpte)(gpte)) + goto no_present; + + /* if accessed bit is not supported prefetch non accessed gpte */ + if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK)) + goto no_present; + + return false; + +no_present: + drop_spte(vcpu->kvm, spte); + return true; +} + +static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) +{ + unsigned access; +#if PTTYPE == PTTYPE_EPT + access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) | + ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) | + ACC_USER_MASK; +#else + access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; + access &= ~(gpte >> PT64_NX_SHIFT); +#endif + + return access; +} + static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, struct guest_walker *walker, @@ -114,22 +215,43 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, gfn_t table_gfn; int ret; + /* dirty/accessed bits are not supported, so no need to update them */ + if (!PT_GUEST_DIRTY_MASK) + return 0; + for (level = walker->max_level; level >= walker->level; --level) { pte = orig_pte = walker->ptes[level - 1]; table_gfn = walker->table_gfn[level - 1]; ptep_user = walker->ptep_user[level - 1]; index = offset_in_page(ptep_user) / sizeof(pt_element_t); - if (!(pte & PT_ACCESSED_MASK)) { + if (!(pte & PT_GUEST_ACCESSED_MASK)) { trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); - pte |= PT_ACCESSED_MASK; + pte |= PT_GUEST_ACCESSED_MASK; } - if (level == walker->level && write_fault && !is_dirty_gpte(pte)) { + if (level == walker->level && write_fault && + !(pte & PT_GUEST_DIRTY_MASK)) { trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); - pte |= PT_DIRTY_MASK; + pte |= PT_GUEST_DIRTY_MASK; } if (pte == orig_pte) continue; + /* + * If the slot is read-only, simply do not process the accessed + * and dirty bits. This is the correct thing to do if the slot + * is ROM, and page tables in read-as-ROM/write-as-MMIO slots + * are only supported if the accessed and dirty bits are already + * set in the ROM (so that MMIO writes are never needed). + * + * Note that NPT does not allow this at all and faults, since + * it always wants nested page table entries for the guest + * page tables to be writable. And EPT works but will simply + * overwrite the read-only memory to set the accessed and dirty + * bits. + */ + if (unlikely(!walker->pte_writable[level - 1])) + continue; + ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte); if (ret) return ret; @@ -170,7 +292,7 @@ retry_walk: if (walker->level == PT32E_ROOT_LEVEL) { pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); - if (!is_present_gpte(pte)) + if (!FNAME(is_present_gpte)(pte)) goto error; --walker->level; } @@ -179,7 +301,7 @@ retry_walk: ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); - accessed_dirty = PT_ACCESSED_MASK; + accessed_dirty = PT_GUEST_ACCESSED_MASK; pt_access = pte_access = ACC_ALL; ++walker->level; @@ -204,7 +326,8 @@ retry_walk: goto error; real_gfn = gpa_to_gfn(real_gfn); - host_addr = gfn_to_hva(vcpu->kvm, real_gfn); + host_addr = gfn_to_hva_prot(vcpu->kvm, real_gfn, + &walker->pte_writable[walker->level - 1]); if (unlikely(kvm_is_error_hva(host_addr))) goto error; @@ -215,17 +338,17 @@ retry_walk: trace_kvm_mmu_paging_element(pte, walker->level); - if (unlikely(!is_present_gpte(pte))) + if (unlikely(!FNAME(is_present_gpte)(pte))) goto error; - if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte, - walker->level))) { + if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, + walker->level))) { errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK; goto error; } accessed_dirty &= pte; - pte_access = pt_access & gpte_access(vcpu, pte); + pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); walker->ptes[walker->level - 1] = pte; } while (!is_last_gpte(mmu, walker->level, pte)); @@ -248,13 +371,15 @@ retry_walk: walker->gfn = real_gpa >> PAGE_SHIFT; if (!write_fault) - protect_clean_gpte(&pte_access, pte); + FNAME(protect_clean_gpte)(&pte_access, pte); else /* - * On a write fault, fold the dirty bit into accessed_dirty by - * shifting it one place right. + * On a write fault, fold the dirty bit into accessed_dirty. + * For modes without A/D bits support accessed_dirty will be + * always clear. */ - accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT); + accessed_dirty &= pte >> + (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT); if (unlikely(!accessed_dirty)) { ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); @@ -279,6 +404,25 @@ error: walker->fault.vector = PF_VECTOR; walker->fault.error_code_valid = true; walker->fault.error_code = errcode; + +#if PTTYPE == PTTYPE_EPT + /* + * Use PFERR_RSVD_MASK in error_code to to tell if EPT + * misconfiguration requires to be injected. The detection is + * done by is_rsvd_bits_set() above. + * + * We set up the value of exit_qualification to inject: + * [2:0] - Derive from [2:0] of real exit_qualification at EPT violation + * [5:3] - Calculated by the page walk of the guest EPT page tables + * [7:8] - Derived from [7:8] of real exit_qualification + * + * The other bits are set to 0. + */ + if (!(errcode & PFERR_RSVD_MASK)) { + vcpu->arch.exit_qualification &= 0x187; + vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3; + } +#endif walker->fault.address = addr; walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; @@ -293,6 +437,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker, access); } +#if PTTYPE != PTTYPE_EPT static int FNAME(walk_addr_nested)(struct guest_walker *walker, struct kvm_vcpu *vcpu, gva_t addr, u32 access) @@ -300,6 +445,7 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker, return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, addr, access); } +#endif static bool FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, @@ -309,14 +455,14 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, gfn_t gfn; pfn_t pfn; - if (prefetch_invalid_gpte(vcpu, sp, spte, gpte)) + if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) return false; pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); gfn = gpte_to_gfn(gpte); - pte_access = sp->role.access & gpte_access(vcpu, gpte); - protect_clean_gpte(&pte_access, gpte); + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + FNAME(protect_clean_gpte)(&pte_access, gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, no_dirty_log && (pte_access & ACC_WRITE_MASK)); if (is_error_pfn(pfn)) @@ -446,7 +592,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, goto out_gpte_changed; if (sp) - link_shadow_page(it.sptep, sp); + link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK); } for (; @@ -466,7 +612,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, true, direct_access, it.sptep); - link_shadow_page(it.sptep, sp); + link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK); } clear_sp_write_flooding_count(it.sptep); @@ -552,9 +698,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); - if (unlikely(error_code & PFERR_RSVD_MASK)) - return handle_mmio_page_fault(vcpu, addr, error_code, + if (unlikely(error_code & PFERR_RSVD_MASK)) { + r = handle_mmio_page_fault(vcpu, addr, error_code, mmu_is_nested(vcpu)); + if (likely(r != RET_MMIO_PF_INVALID)) + return r; + }; r = mmu_topup_memory_caches(vcpu); if (r) @@ -724,6 +873,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, return gpa; } +#if PTTYPE != PTTYPE_EPT static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, struct x86_exception *exception) @@ -742,6 +892,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, return gpa; } +#endif /* * Using the cached information from sp->gfns is safe because: @@ -782,17 +933,18 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) sizeof(pt_element_t))) return -EINVAL; - if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) { + if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { vcpu->kvm->tlbs_dirty++; continue; } gfn = gpte_to_gfn(gpte); pte_access = sp->role.access; - pte_access &= gpte_access(vcpu, gpte); - protect_clean_gpte(&pte_access, gpte); + pte_access &= FNAME(gpte_access)(vcpu, gpte); + FNAME(protect_clean_gpte)(&pte_access, gpte); - if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present)) + if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access, + &nr_present)) continue; if (gfn != sp->gfns[i]) { @@ -826,3 +978,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef gpte_to_gfn #undef gpte_to_gfn_lvl #undef CMPXCHG +#undef PT_GUEST_ACCESSED_MASK +#undef PT_GUEST_DIRTY_MASK +#undef PT_GUEST_DIRTY_SHIFT +#undef PT_GUEST_ACCESSED_SHIFT diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index c53e797e7369..5c4f63151b4d 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -160,7 +160,7 @@ static void stop_counter(struct kvm_pmc *pmc) static void reprogram_counter(struct kvm_pmc *pmc, u32 type, unsigned config, bool exclude_user, bool exclude_kernel, - bool intr) + bool intr, bool in_tx, bool in_tx_cp) { struct perf_event *event; struct perf_event_attr attr = { @@ -173,6 +173,10 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type, .exclude_kernel = exclude_kernel, .config = config, }; + if (in_tx) + attr.config |= HSW_IN_TX; + if (in_tx_cp) + attr.config |= HSW_IN_TX_CHECKPOINTED; attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc); @@ -226,7 +230,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | ARCH_PERFMON_EVENTSEL_INV | - ARCH_PERFMON_EVENTSEL_CMASK))) { + ARCH_PERFMON_EVENTSEL_CMASK | + HSW_IN_TX | + HSW_IN_TX_CHECKPOINTED))) { config = find_arch_event(&pmc->vcpu->arch.pmu, event_select, unit_mask); if (config != PERF_COUNT_HW_MAX) @@ -239,7 +245,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) reprogram_counter(pmc, type, config, !(eventsel & ARCH_PERFMON_EVENTSEL_USR), !(eventsel & ARCH_PERFMON_EVENTSEL_OS), - eventsel & ARCH_PERFMON_EVENTSEL_INT); + eventsel & ARCH_PERFMON_EVENTSEL_INT, + (eventsel & HSW_IN_TX), + (eventsel & HSW_IN_TX_CHECKPOINTED)); } static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) @@ -256,7 +264,7 @@ static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) arch_events[fixed_pmc_events[idx]].event_type, !(en & 0x2), /* exclude user */ !(en & 0x1), /* exclude kernel */ - pmi); + pmi, false, false); } static inline u8 fixed_en_pmi(u64 ctrl, int idx) @@ -408,7 +416,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { if (data == pmc->eventsel) return 0; - if (!(data & 0xffffffff00200000ull)) { + if (!(data & pmu->reserved_bits)) { reprogram_gp_counter(pmc, data); return 0; } @@ -450,6 +458,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) pmu->counter_bitmask[KVM_PMC_GP] = 0; pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->version = 0; + pmu->reserved_bits = 0xffffffff00200000ull; entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); if (!entry) @@ -478,6 +487,12 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); pmu->global_ctrl_mask = ~pmu->global_ctrl; + + entry = kvm_find_cpuid_entry(vcpu, 7, 0); + if (entry && + (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && + (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) + pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED; } void kvm_pmu_init(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a14a6eaf871d..c0bc80391e40 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1026,7 +1026,10 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) g_tsc_offset = svm->vmcb->control.tsc_offset - svm->nested.hsave->control.tsc_offset; svm->nested.hsave->control.tsc_offset = offset; - } + } else + trace_kvm_write_tsc_offset(vcpu->vcpu_id, + svm->vmcb->control.tsc_offset, + offset); svm->vmcb->control.tsc_offset = offset + g_tsc_offset; @@ -1044,6 +1047,11 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho svm->vmcb->control.tsc_offset += adjustment; if (is_guest_mode(vcpu)) svm->nested.hsave->control.tsc_offset += adjustment; + else + trace_kvm_write_tsc_offset(vcpu->vcpu_id, + svm->vmcb->control.tsc_offset - adjustment, + svm->vmcb->control.tsc_offset); + mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index fe5e00ed7036..545245d7cc63 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -756,6 +756,27 @@ TRACE_EVENT( __entry->gpa_match ? "GPA" : "GVA") ); +TRACE_EVENT(kvm_write_tsc_offset, + TP_PROTO(unsigned int vcpu_id, __u64 previous_tsc_offset, + __u64 next_tsc_offset), + TP_ARGS(vcpu_id, previous_tsc_offset, next_tsc_offset), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( __u64, previous_tsc_offset ) + __field( __u64, next_tsc_offset ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->previous_tsc_offset = previous_tsc_offset; + __entry->next_tsc_offset = next_tsc_offset; + ), + + TP_printk("vcpu=%u prev=%llu next=%llu", __entry->vcpu_id, + __entry->previous_tsc_offset, __entry->next_tsc_offset) +); + #ifdef CONFIG_X86_64 #define host_clocks \ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 260a91939555..3b8e7459dd4d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -373,6 +373,7 @@ struct nested_vmx { * we must keep them pinned while L2 runs. */ struct page *apic_access_page; + u64 msr_ia32_feature_control; }; #define POSTED_INTR_ON 0 @@ -711,10 +712,10 @@ static void nested_release_page_clean(struct page *page) kvm_release_page_clean(page); } +static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); -static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -1039,12 +1040,16 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) (vmcs12->secondary_vm_exec_control & bit); } -static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12, - struct kvm_vcpu *vcpu) +static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) { return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; } +static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); +} + static inline bool is_exception(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -2096,6 +2101,8 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ? vmcs12->tsc_offset : 0)); } else { + trace_kvm_write_tsc_offset(vcpu->vcpu_id, + vmcs_read64(TSC_OFFSET), offset); vmcs_write64(TSC_OFFSET, offset); } } @@ -2103,11 +2110,14 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) { u64 offset = vmcs_read64(TSC_OFFSET); + vmcs_write64(TSC_OFFSET, offset + adjustment); if (is_guest_mode(vcpu)) { /* Even when running L2, the adjustment needs to apply to L1 */ to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment; - } + } else + trace_kvm_write_tsc_offset(vcpu->vcpu_id, offset, + offset + adjustment); } static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) @@ -2150,6 +2160,7 @@ static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; static u32 nested_vmx_misc_low, nested_vmx_misc_high; +static u32 nested_vmx_ept_caps; static __init void nested_vmx_setup_ctls_msrs(void) { /* @@ -2185,14 +2196,17 @@ static __init void nested_vmx_setup_ctls_msrs(void) * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and * 17 must be 1. */ + rdmsr(MSR_IA32_VMX_EXIT_CTLS, + nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ + nested_vmx_exit_ctls_high &= #ifdef CONFIG_X86_64 - nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE; -#else - nested_vmx_exit_ctls_high = 0; + VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif - nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; + VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; + nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | + VM_EXIT_LOAD_IA32_EFER); /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2200,8 +2214,12 @@ static __init void nested_vmx_setup_ctls_msrs(void) /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */ nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; nested_vmx_entry_ctls_high &= - VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; - nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; +#ifdef CONFIG_X86_64 + VM_ENTRY_IA32E_MODE | +#endif + VM_ENTRY_LOAD_IA32_PAT; + nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | + VM_ENTRY_LOAD_IA32_EFER); /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, @@ -2236,6 +2254,22 @@ static __init void nested_vmx_setup_ctls_msrs(void) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_WBINVD_EXITING; + if (enable_ept) { + /* nested EPT: emulate EPT also to L1 */ + nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; + nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | + VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; + nested_vmx_ept_caps &= vmx_capability.ept; + /* + * Since invept is completely emulated we support both global + * and context invalidation independent of what host cpu + * supports + */ + nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | + VMX_EPT_EXTENT_CONTEXT_BIT; + } else + nested_vmx_ept_caps = 0; + /* miscellaneous data */ rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK | @@ -2277,8 +2311,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) switch (msr_index) { case MSR_IA32_FEATURE_CONTROL: - *pdata = 0; - break; + if (nested_vmx_allowed(vcpu)) { + *pdata = to_vmx(vcpu)->nested.msr_ia32_feature_control; + break; + } + return 0; case MSR_IA32_VMX_BASIC: /* * This MSR reports some information about VMX support. We @@ -2341,8 +2378,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) nested_vmx_secondary_ctls_high); break; case MSR_IA32_VMX_EPT_VPID_CAP: - /* Currently, no nested ept or nested vpid */ - *pdata = 0; + /* Currently, no nested vpid support */ + *pdata = nested_vmx_ept_caps; break; default: return 0; @@ -2351,14 +2388,24 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) return 1; } -static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { + u32 msr_index = msr_info->index; + u64 data = msr_info->data; + bool host_initialized = msr_info->host_initiated; + if (!nested_vmx_allowed(vcpu)) return 0; - if (msr_index == MSR_IA32_FEATURE_CONTROL) - /* TODO: the right thing. */ + if (msr_index == MSR_IA32_FEATURE_CONTROL) { + if (!host_initialized && + to_vmx(vcpu)->nested.msr_ia32_feature_control + & FEATURE_CONTROL_LOCKED) + return 0; + to_vmx(vcpu)->nested.msr_ia32_feature_control = data; return 1; + } + /* * No need to treat VMX capability MSRs specially: If we don't handle * them, handle_wrmsr will #GP(0), which is correct (they are readonly) @@ -2489,7 +2536,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; /* Otherwise falls through */ default: - if (vmx_set_vmx_msr(vcpu, msr_index, data)) + if (vmx_set_vmx_msr(vcpu, msr_info)) break; msr = find_msr_entry(vmx, msr_index); if (msr) { @@ -3399,15 +3446,22 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, var->limit = vmx_read_guest_seg_limit(vmx, seg); var->selector = vmx_read_guest_seg_selector(vmx, seg); ar = vmx_read_guest_seg_ar(vmx, seg); + var->unusable = (ar >> 16) & 1; var->type = ar & 15; var->s = (ar >> 4) & 1; var->dpl = (ar >> 5) & 3; - var->present = (ar >> 7) & 1; + /* + * Some userspaces do not preserve unusable property. Since usable + * segment has to be present according to VMX spec we can use present + * property to amend userspace bug by making unusable segment always + * nonpresent. vmx_segment_access_rights() already marks nonpresent + * segment as unusable. + */ + var->present = !var->unusable; var->avl = (ar >> 12) & 1; var->l = (ar >> 13) & 1; var->db = (ar >> 14) & 1; var->g = (ar >> 15) & 1; - var->unusable = (ar >> 16) & 1; } static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) @@ -4176,10 +4230,10 @@ static void ept_set_mmio_spte_mask(void) /* * EPT Misconfigurations can be generated if the value of bits 2:0 * of an EPT paging-structure entry is 110b (write/execute). - * Also, magic bits (0xffull << 49) is set to quickly identify mmio + * Also, magic bits (0x3ull << 62) is set to quickly identify mmio * spte. */ - kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull); + kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull); } /* @@ -5285,14 +5339,29 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) return 0; } + /* + * EPT violation happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. + * There are errata that may cause this bit to not be set: + * AAK134, BY25. + */ + if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + cpu_has_virtual_nmis() && + (exit_qualification & INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(gpa, exit_qualification); /* It is a write fault? */ error_code = exit_qualification & (1U << 1); + /* It is a fetch fault? */ + error_code |= (exit_qualification & (1U << 2)) << 2; /* ept page table is present? */ error_code |= (exit_qualification >> 3) & 0x1; + vcpu->arch.exit_qualification = exit_qualification; + return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); } @@ -5366,10 +5435,14 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); ret = handle_mmio_page_fault_common(vcpu, gpa, true); - if (likely(ret == 1)) + if (likely(ret == RET_MMIO_PF_EMULATE)) return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == EMULATE_DONE; - if (unlikely(!ret)) + + if (unlikely(ret == RET_MMIO_PF_INVALID)) + return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0); + + if (unlikely(ret == RET_MMIO_PF_RETRY)) return 1; /* It is the real ept misconfig */ @@ -5422,7 +5495,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); - if (err == EMULATE_DO_MMIO) { + if (err == EMULATE_USER_EXIT) { + ++vcpu->stat.mmio_exits; ret = 0; goto out; } @@ -5551,8 +5625,47 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) free_loaded_vmcs(&vmx->vmcs01); } +/* + * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), + * set the success or error code of an emulated VMX instruction, as specified + * by Vol 2B, VMX Instruction Reference, "Conventions". + */ +static void nested_vmx_succeed(struct kvm_vcpu *vcpu) +{ + vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); +} + +static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu) +{ + vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | + X86_EFLAGS_SF | X86_EFLAGS_OF)) + | X86_EFLAGS_CF); +} + static void nested_vmx_failValid(struct kvm_vcpu *vcpu, - u32 vm_instruction_error); + u32 vm_instruction_error) +{ + if (to_vmx(vcpu)->nested.current_vmptr == -1ull) { + /* + * failValid writes the error number to the current VMCS, which + * can't be done there isn't a current VMCS. + */ + nested_vmx_failInvalid(vcpu); + return; + } + vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_SF | X86_EFLAGS_OF)) + | X86_EFLAGS_ZF); + get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; + /* + * We don't need to force a shadow sync because + * VM_INSTRUCTION_ERROR is not shadowed + */ +} /* * Emulate the VMXON instruction. @@ -5567,6 +5680,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu) struct kvm_segment cs; struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs *shadow_vmcs; + const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED + | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; /* The Intel VMX Instruction Reference lists a bunch of bits that * are prerequisite to running VMXON, most notably cr4.VMXE must be @@ -5595,6 +5710,13 @@ static int handle_vmon(struct kvm_vcpu *vcpu) skip_emulated_instruction(vcpu); return 1; } + + if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES) + != VMXON_NEEDED_FEATURES) { + kvm_inject_gp(vcpu, 0); + return 1; + } + if (enable_shadow_vmcs) { shadow_vmcs = alloc_vmcs(); if (!shadow_vmcs) @@ -5612,6 +5734,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) vmx->nested.vmxon = true; skip_emulated_instruction(vcpu); + nested_vmx_succeed(vcpu); return 1; } @@ -5696,6 +5819,7 @@ static int handle_vmoff(struct kvm_vcpu *vcpu) return 1; free_nested(to_vmx(vcpu)); skip_emulated_instruction(vcpu); + nested_vmx_succeed(vcpu); return 1; } @@ -5752,48 +5876,6 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, return 0; } -/* - * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), - * set the success or error code of an emulated VMX instruction, as specified - * by Vol 2B, VMX Instruction Reference, "Conventions". - */ -static void nested_vmx_succeed(struct kvm_vcpu *vcpu) -{ - vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | - X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); -} - -static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu) -{ - vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | - X86_EFLAGS_SF | X86_EFLAGS_OF)) - | X86_EFLAGS_CF); -} - -static void nested_vmx_failValid(struct kvm_vcpu *vcpu, - u32 vm_instruction_error) -{ - if (to_vmx(vcpu)->nested.current_vmptr == -1ull) { - /* - * failValid writes the error number to the current VMCS, which - * can't be done there isn't a current VMCS. - */ - nested_vmx_failInvalid(vcpu); - return; - } - vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | - X86_EFLAGS_SF | X86_EFLAGS_OF)) - | X86_EFLAGS_ZF); - get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; - /* - * We don't need to force a shadow sync because - * VM_INSTRUCTION_ERROR is not shadowed - */ -} - /* Emulate the VMCLEAR instruction */ static int handle_vmclear(struct kvm_vcpu *vcpu) { @@ -5956,8 +6038,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) unsigned long field; u64 field_value; struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; - unsigned long *fields = (unsigned long *)shadow_read_write_fields; - int num_fields = max_shadow_read_write_fields; + const unsigned long *fields = shadow_read_write_fields; + const int num_fields = max_shadow_read_write_fields; vmcs_load(shadow_vmcs); @@ -5986,12 +6068,11 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) { - unsigned long *fields[] = { - (unsigned long *)shadow_read_write_fields, - (unsigned long *)shadow_read_only_fields + const unsigned long *fields[] = { + shadow_read_write_fields, + shadow_read_only_fields }; - int num_lists = ARRAY_SIZE(fields); - int max_fields[] = { + const int max_fields[] = { max_shadow_read_write_fields, max_shadow_read_only_fields }; @@ -6002,7 +6083,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) vmcs_load(shadow_vmcs); - for (q = 0; q < num_lists; q++) { + for (q = 0; q < ARRAY_SIZE(fields); q++) { for (i = 0; i < max_fields[q]; i++) { field = fields[q][i]; vmcs12_read_any(&vmx->vcpu, field, &field_value); @@ -6232,6 +6313,74 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) return 1; } +/* Emulate the INVEPT instruction */ +static int handle_invept(struct kvm_vcpu *vcpu) +{ + u32 vmx_instruction_info, types; + unsigned long type; + gva_t gva; + struct x86_exception e; + struct { + u64 eptp, gpa; + } operand; + u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK; + + if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || + !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); + + types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; + + if (!(types & (1UL << type))) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return 1; + } + + /* According to the Intel VMX instruction reference, the memory + * operand is read even if it isn't needed (e.g., for type==global) + */ + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmx_instruction_info, &gva)) + return 1; + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand, + sizeof(operand), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + switch (type) { + case VMX_EPT_EXTENT_CONTEXT: + if ((operand.eptp & eptp_mask) != + (nested_ept_get_cr3(vcpu) & eptp_mask)) + break; + case VMX_EPT_EXTENT_GLOBAL: + kvm_mmu_sync_roots(vcpu); + kvm_mmu_flush_tlb(vcpu); + nested_vmx_succeed(vcpu); + break; + default: + BUG_ON(1); + break; + } + + skip_emulated_instruction(vcpu); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -6276,6 +6425,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, + [EXIT_REASON_INVEPT] = handle_invept, }; static const int kvm_vmx_max_exit_handlers = @@ -6502,6 +6652,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: + case EXIT_REASON_INVEPT: /* * VMX instructions trap unconditionally. This allows L1 to * emulate them for its L2 guest, i.e., allows 3-level nesting! @@ -6534,7 +6685,20 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); case EXIT_REASON_EPT_VIOLATION: + /* + * L0 always deals with the EPT violation. If nested EPT is + * used, and the nested mmu code discovers that the address is + * missing in the guest EPT table (EPT12), the EPT violation + * will be injected with nested_ept_inject_page_fault() + */ + return 0; case EXIT_REASON_EPT_MISCONFIG: + /* + * L2 never uses directly L1's EPT, but rather L0's own EPT + * table (shadow on EPT) or a merged EPT table that L0 built + * (EPT on EPT). So any problems with the structure of the + * table is L0's fault. + */ return 0; case EXIT_REASON_PREEMPTION_TIMER: return vmcs12->pin_based_vm_exec_control & @@ -6622,7 +6786,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( - get_vmcs12(vcpu), vcpu)))) { + get_vmcs12(vcpu))))) { if (vmx_interrupt_allowed(vcpu)) { vmx->soft_vnmi_blocked = 0; } else if (vmx->vnmi_blocked_time > 1000000000LL && @@ -7310,6 +7474,48 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) entry->ecx |= bit(X86_FEATURE_VMX); } +static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault) +{ + struct vmcs12 *vmcs12; + nested_vmx_vmexit(vcpu); + vmcs12 = get_vmcs12(vcpu); + + if (fault->error_code & PFERR_RSVD_MASK) + vmcs12->vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; + else + vmcs12->vm_exit_reason = EXIT_REASON_EPT_VIOLATION; + vmcs12->exit_qualification = vcpu->arch.exit_qualification; + vmcs12->guest_physical_address = fault->address; +} + +/* Callbacks for nested_ept_init_mmu_context: */ + +static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) +{ + /* return the page table to be shadowed - in our case, EPT12 */ + return get_vmcs12(vcpu)->ept_pointer; +} + +static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) +{ + int r = kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, + nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT); + + vcpu->arch.mmu.set_cr3 = vmx_set_cr3; + vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; + vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; + + vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; + + return r; +} + +static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) +{ + vcpu->arch.walk_mmu = &vcpu->arch.mmu; +} + /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it @@ -7372,7 +7578,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_interruptibility_info); vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); - vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags); + vmx_set_rflags(vcpu, vmcs12->guest_rflags); vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, vmcs12->guest_pending_dbg_exceptions); vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); @@ -7492,15 +7698,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); - /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */ - vmcs_write32(VM_EXIT_CONTROLS, - vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl); - vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls | + /* L2->L1 exit controls are emulated - the hardware exit is to L0 so + * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER + * bits are further modified by vmx_set_efer() below. + */ + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); + + /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are + * emulated by vmx_set_efer(), below. + */ + vmcs_write32(VM_ENTRY_CONTROLS, + (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER & + ~VM_ENTRY_IA32E_MODE) | (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); - else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) + vcpu->arch.pat = vmcs12->guest_ia32_pat; + } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); @@ -7522,6 +7737,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmx_flush_tlb(vcpu); } + if (nested_cpu_has_ept(vmcs12)) { + kvm_mmu_unload(vcpu); + nested_ept_init_mmu_context(vcpu); + } + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->guest_ia32_efer; else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) @@ -7549,6 +7769,20 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) kvm_set_cr3(vcpu, vmcs12->guest_cr3); kvm_mmu_reset_context(vcpu); + /* + * L1 may access the L2's PDPTR, so save them to construct vmcs12 + */ + if (enable_ept) { + vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); + vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); + vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); + vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); + __clear_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_avail); + __clear_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_dirty); + } + kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); } @@ -7871,6 +8105,22 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_pending_dbg_exceptions = vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); + /* + * In some cases (usually, nested EPT), L2 is allowed to change its + * own CR3 without exiting. If it has changed it, we must keep it. + * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined + * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. + * + * Additionally, restore L2's PDPTR to vmcs12. + */ + if (enable_ept) { + vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3); + vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); + vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); + vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); + vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); + } + vmcs12->vm_entry_controls = (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE); @@ -7932,6 +8182,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { + struct kvm_segment seg; + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->host_ia32_efer; else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) @@ -7942,7 +8194,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); - vmx_set_rflags(vcpu, X86_EFLAGS_BIT1); + vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); /* * Note that calling vmx_set_cr0 is important, even if cr0 hasn't * actually changed, because it depends on the current state of @@ -7966,7 +8218,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); kvm_set_cr4(vcpu, vmcs12->host_cr4); - /* shadow page tables on either EPT or shadow page tables */ + if (nested_cpu_has_ept(vmcs12)) + nested_ept_uninit_mmu_context(vcpu); + kvm_set_cr3(vcpu, vmcs12->host_cr3); kvm_mmu_reset_context(vcpu); @@ -7985,23 +8239,61 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); - vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base); - vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base); - vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base); - vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector); - vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector); - vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector); - vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector); - vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector); - vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector); - vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector); - - if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) + + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); + vcpu->arch.pat = vmcs12->host_ia32_pat; + } if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, vmcs12->host_ia32_perf_global_ctrl); + /* Set L1 segment info according to Intel SDM + 27.5.2 Loading Host Segment and Descriptor-Table Registers */ + seg = (struct kvm_segment) { + .base = 0, + .limit = 0xFFFFFFFF, + .selector = vmcs12->host_cs_selector, + .type = 11, + .present = 1, + .s = 1, + .g = 1 + }; + if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) + seg.l = 1; + else + seg.db = 1; + vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); + seg = (struct kvm_segment) { + .base = 0, + .limit = 0xFFFFFFFF, + .type = 3, + .present = 1, + .s = 1, + .db = 1, + .g = 1 + }; + seg.selector = vmcs12->host_ds_selector; + vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); + seg.selector = vmcs12->host_es_selector; + vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); + seg.selector = vmcs12->host_ss_selector; + vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); + seg.selector = vmcs12->host_fs_selector; + seg.base = vmcs12->host_fs_base; + vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); + seg.selector = vmcs12->host_gs_selector; + seg.base = vmcs12->host_gs_base; + vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); + seg = (struct kvm_segment) { + .base = vmcs12->host_tr_base, + .limit = 0x67, + .selector = vmcs12->host_tr_selector, + .type = 11, + .present = 1 + }; + vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); + kvm_set_dr(vcpu, 7, 0x400); vmcs_write64(GUEST_IA32_DEBUGCTL, 0); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 094b5d96ab14..e5ca72a5cdb6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -582,8 +582,6 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) if (index != XCR_XFEATURE_ENABLED_MASK) return 1; xcr0 = xcr; - if (kvm_x86_ops->get_cpl(vcpu) != 0) - return 1; if (!(xcr0 & XSTATE_FP)) return 1; if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) @@ -597,7 +595,8 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { - if (__kvm_set_xcr(vcpu, index, xcr)) { + if (kvm_x86_ops->get_cpl(vcpu) != 0 || + __kvm_set_xcr(vcpu, index, xcr)) { kvm_inject_gp(vcpu, 0); return 1; } @@ -619,7 +618,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP)) return 1; - if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS)) + if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE)) return 1; if (is_long_mode(vcpu)) { @@ -683,17 +682,6 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) */ } - /* - * Does the new cr3 value map to physical memory? (Note, we - * catch an invalid cr3 even in real-mode, because it would - * cause trouble later on when we turn on paging anyway.) - * - * A real CPU would silently accept an invalid cr3 and would - * attempt to use it - with largely undefined (and often hard - * to debug) behavior on the guest side. - */ - if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) - return 1; vcpu->arch.cr3 = cr3; __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); vcpu->arch.mmu.new_cr3(vcpu); @@ -851,7 +839,8 @@ static u32 msrs_to_save[] = { #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, + MSR_IA32_FEATURE_CONTROL }; static unsigned num_msrs_to_save; @@ -1194,20 +1183,37 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) elapsed = ns - kvm->arch.last_tsc_nsec; if (vcpu->arch.virtual_tsc_khz) { + int faulted = 0; + /* n.b - signed multiplication and division required */ usdiff = data - kvm->arch.last_tsc_write; #ifdef CONFIG_X86_64 usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; #else /* do_div() only does unsigned */ - asm("idivl %2; xor %%edx, %%edx" - : "=A"(usdiff) - : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); + asm("1: idivl %[divisor]\n" + "2: xor %%edx, %%edx\n" + " movl $0, %[faulted]\n" + "3:\n" + ".section .fixup,\"ax\"\n" + "4: movl $1, %[faulted]\n" + " jmp 3b\n" + ".previous\n" + + _ASM_EXTABLE(1b, 4b) + + : "=A"(usdiff), [faulted] "=r" (faulted) + : "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz)); + #endif do_div(elapsed, 1000); usdiff -= elapsed; if (usdiff < 0) usdiff = -usdiff; + + /* idivl overflow => difference is larger than USEC_PER_SEC */ + if (faulted) + usdiff = USEC_PER_SEC; } else usdiff = USEC_PER_SEC; /* disable TSC match window below */ @@ -1441,6 +1447,29 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) #endif } +static void kvm_gen_update_masterclock(struct kvm *kvm) +{ +#ifdef CONFIG_X86_64 + int i; + struct kvm_vcpu *vcpu; + struct kvm_arch *ka = &kvm->arch; + + spin_lock(&ka->pvclock_gtod_sync_lock); + kvm_make_mclock_inprogress_request(kvm); + /* no guest entries from this point */ + pvclock_update_vm_gtod_copy(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + + /* guest entries allowed */ + kvm_for_each_vcpu(i, vcpu, kvm) + clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); + + spin_unlock(&ka->pvclock_gtod_sync_lock); +#endif +} + static int kvm_guest_time_update(struct kvm_vcpu *v) { unsigned long flags, this_tsc_khz; @@ -1588,6 +1617,30 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) return 0; } +/* + * kvmclock updates which are isolated to a given vcpu, such as + * vcpu->cpu migration, should not allow system_timestamp from + * the rest of the vcpus to remain static. Otherwise ntp frequency + * correction applies to one vcpu's system_timestamp but not + * the others. + * + * So in those cases, request a kvmclock update for all vcpus. + * The worst case for a remote vcpu to update its kvmclock + * is then bounded by maximum nohz sleep latency. + */ + +static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) +{ + int i; + struct kvm *kvm = v->kvm; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + kvm_vcpu_kick(vcpu); + } +} + static bool msr_mtrr_valid(unsigned msr) { switch (msr) { @@ -1985,7 +2038,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) kvmclock_reset(vcpu); vcpu->arch.time = data; - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); /* we verify if the enable bit is set... */ if (!(data & 1)) @@ -2702,7 +2755,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * kvmclock on vcpu->cpu migration */ if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); if (vcpu->cpu != cpu) kvm_migrate_timers(vcpu); vcpu->cpu = cpu; @@ -3766,6 +3819,7 @@ long kvm_arch_vm_ioctl(struct file *filp, delta = user_ns.clock - now_ns; local_irq_enable(); kvm->arch.kvmclock_offset = delta; + kvm_gen_update_masterclock(kvm); break; } case KVM_GET_CLOCK: { @@ -4915,6 +4969,97 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, static int complete_emulated_mmio(struct kvm_vcpu *vcpu); static int complete_emulated_pio(struct kvm_vcpu *vcpu); +static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, + unsigned long *db) +{ + u32 dr6 = 0; + int i; + u32 enable, rwlen; + + enable = dr7; + rwlen = dr7 >> 16; + for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4) + if ((enable & 3) && (rwlen & 15) == type && db[i] == addr) + dr6 |= (1 << i); + return dr6; +} + +static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r) +{ + struct kvm_run *kvm_run = vcpu->run; + + /* + * Use the "raw" value to see if TF was passed to the processor. + * Note that the new value of the flags has not been saved yet. + * + * This is correct even for TF set by the guest, because "the + * processor will not generate this exception after the instruction + * that sets the TF flag". + */ + unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + + if (unlikely(rflags & X86_EFLAGS_TF)) { + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { + kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1; + kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; + kvm_run->debug.arch.exception = DB_VECTOR; + kvm_run->exit_reason = KVM_EXIT_DEBUG; + *r = EMULATE_USER_EXIT; + } else { + vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF; + /* + * "Certain debug exceptions may clear bit 0-3. The + * remaining contents of the DR6 register are never + * cleared by the processor". + */ + vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 |= DR6_BS; + kvm_queue_exception(vcpu, DB_VECTOR); + } + } +} + +static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) +{ + struct kvm_run *kvm_run = vcpu->run; + unsigned long eip = vcpu->arch.emulate_ctxt.eip; + u32 dr6 = 0; + + if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) && + (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) { + dr6 = kvm_vcpu_check_hw_bp(eip, 0, + vcpu->arch.guest_debug_dr7, + vcpu->arch.eff_db); + + if (dr6 != 0) { + kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; + kvm_run->debug.arch.pc = kvm_rip_read(vcpu) + + get_segment_base(vcpu, VCPU_SREG_CS); + + kvm_run->debug.arch.exception = DB_VECTOR; + kvm_run->exit_reason = KVM_EXIT_DEBUG; + *r = EMULATE_USER_EXIT; + return true; + } + } + + if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK)) { + dr6 = kvm_vcpu_check_hw_bp(eip, 0, + vcpu->arch.dr7, + vcpu->arch.db); + + if (dr6 != 0) { + vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 |= dr6; + kvm_queue_exception(vcpu, DB_VECTOR); + *r = EMULATE_DONE; + return true; + } + } + + return false; +} + int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, int emulation_type, @@ -4935,6 +5080,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, if (!(emulation_type & EMULTYPE_NO_DECODE)) { init_emulate_ctxt(vcpu); + + /* + * We will reenter on the same instruction since + * we do not set complete_userspace_io. This does not + * handle watchpoints yet, those would be handled in + * the emulate_ops. + */ + if (kvm_vcpu_check_breakpoint(vcpu, &r)) + return r; + ctxt->interruptibility = 0; ctxt->have_exception = false; ctxt->perm_ok = false; @@ -4991,17 +5146,18 @@ restart: inject_emulated_exception(vcpu); r = EMULATE_DONE; } else if (vcpu->arch.pio.count) { - if (!vcpu->arch.pio.in) + if (!vcpu->arch.pio.in) { + /* FIXME: return into emulator if single-stepping. */ vcpu->arch.pio.count = 0; - else { + } else { writeback = false; vcpu->arch.complete_userspace_io = complete_emulated_pio; } - r = EMULATE_DO_MMIO; + r = EMULATE_USER_EXIT; } else if (vcpu->mmio_needed) { if (!vcpu->mmio_is_write) writeback = false; - r = EMULATE_DO_MMIO; + r = EMULATE_USER_EXIT; vcpu->arch.complete_userspace_io = complete_emulated_mmio; } else if (r == EMULATION_RESTART) goto restart; @@ -5010,10 +5166,12 @@ restart: if (writeback) { toggle_interruptibility(vcpu, ctxt->interruptibility); - kvm_set_rflags(vcpu, ctxt->eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; kvm_rip_write(vcpu, ctxt->eip); + if (r == EMULATE_DONE) + kvm_vcpu_check_singlestep(vcpu, &r); + kvm_set_rflags(vcpu, ctxt->eflags); } else vcpu->arch.emulate_regs_need_sync_to_vcpu = true; @@ -5239,7 +5397,13 @@ static void kvm_set_mmio_spte_mask(void) * Set the reserved bits and the present bit of an paging-structure * entry to generate page fault with PFER.RSV = 1. */ - mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr; + /* Mask the reserved physical address bits. */ + mask = ((1ull << (51 - maxphyaddr + 1)) - 1) << maxphyaddr; + + /* Bit 62 is always reserved for 32bit host. */ + mask |= 0x3ull << 62; + + /* Set the present bit. */ mask |= 1ull; #ifdef CONFIG_X86_64 @@ -5301,7 +5465,7 @@ static struct notifier_block pvclock_gtod_notifier = { int kvm_arch_init(void *opaque) { int r; - struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; + struct kvm_x86_ops *ops = opaque; if (kvm_x86_ops) { printk(KERN_ERR "kvm: already loaded the other module\n"); @@ -5449,6 +5613,23 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) return 1; } +/* + * kvm_pv_kick_cpu_op: Kick a vcpu. + * + * @apicid - apicid of vcpu to be kicked. + */ +static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) +{ + struct kvm_lapic_irq lapic_irq; + + lapic_irq.shorthand = 0; + lapic_irq.dest_mode = 0; + lapic_irq.dest_id = apicid; + + lapic_irq.delivery_mode = APIC_DM_REMRD; + kvm_irq_delivery_to_apic(kvm, 0, &lapic_irq, NULL); +} + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; @@ -5482,6 +5663,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) case KVM_HC_VAPIC_POLL_IRQ: ret = 0; break; + case KVM_HC_KICK_CPU: + kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); + ret = 0; + break; default: ret = -KVM_ENOSYS; break; @@ -5499,13 +5684,6 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) char instruction[3]; unsigned long rip = kvm_rip_read(vcpu); - /* - * Blow out the MMU to ensure that no other VCPU has an active mapping - * to ensure that the updated hypercall appears atomically across all - * VCPUs. - */ - kvm_mmu_zap_all(vcpu->kvm); - kvm_x86_ops->patch_hypercall(vcpu, instruction); return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); @@ -5650,29 +5828,6 @@ static void process_nmi(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); } -static void kvm_gen_update_masterclock(struct kvm *kvm) -{ -#ifdef CONFIG_X86_64 - int i; - struct kvm_vcpu *vcpu; - struct kvm_arch *ka = &kvm->arch; - - spin_lock(&ka->pvclock_gtod_sync_lock); - kvm_make_mclock_inprogress_request(kvm); - /* no guest entries from this point */ - pvclock_update_vm_gtod_copy(kvm); - - kvm_for_each_vcpu(i, vcpu, kvm) - set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); - - /* guest entries allowed */ - kvm_for_each_vcpu(i, vcpu, kvm) - clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); - - spin_unlock(&ka->pvclock_gtod_sync_lock); -#endif -} - static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { u64 eoi_exit_bitmap[4]; @@ -5703,6 +5858,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) __kvm_migrate_timers(vcpu); if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu)) kvm_gen_update_masterclock(vcpu->kvm); + if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu)) + kvm_gen_kvmclock_update(vcpu); if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { r = kvm_guest_time_update(vcpu); if (unlikely(r)) @@ -5909,6 +6066,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) kvm_apic_accept_events(vcpu); switch(vcpu->arch.mp_state) { case KVM_MP_STATE_HALTED: + vcpu->arch.pv.pv_unhalted = false; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; case KVM_MP_STATE_RUNNABLE: @@ -6020,6 +6178,8 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { vcpu->mmio_needed = 0; + + /* FIXME: return into emulator if single-stepping. */ if (vcpu->mmio_is_write) return 1; vcpu->mmio_read_completed = 1; @@ -6208,7 +6368,12 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { kvm_apic_accept_events(vcpu); - mp_state->mp_state = vcpu->arch.mp_state; + if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED && + vcpu->arch.pv.pv_unhalted) + mp_state->mp_state = KVM_MP_STATE_RUNNABLE; + else + mp_state->mp_state = vcpu->arch.mp_state; + return 0; } @@ -6729,6 +6894,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) BUG_ON(vcpu->kvm == NULL); kvm = vcpu->kvm; + vcpu->arch.pv.pv_unhalted = false; vcpu->arch.emulate_ctxt.ops = &emulate_ops; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -6813,6 +6979,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) return -EINVAL; INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ @@ -6977,6 +7144,15 @@ out_free: return -ENOMEM; } +void kvm_arch_memslots_updated(struct kvm *kvm) +{ + /* + * memslots->generation has been incremented. + * mmio generation may have reached its maximum value. + */ + kvm_mmu_invalidate_mmio_sptes(kvm); +} + int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_userspace_memory_region *mem, @@ -7037,26 +7213,17 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, */ if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) kvm_mmu_slot_remove_write_access(kvm, mem->slot); - /* - * If memory slot is created, or moved, we need to clear all - * mmio sptes. - */ - if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { - kvm_mmu_zap_mmio_sptes(kvm); - kvm_reload_remote_mmus(kvm); - } } void kvm_arch_flush_shadow_all(struct kvm *kvm) { - kvm_mmu_zap_all(kvm); - kvm_reload_remote_mmus(kvm); + kvm_mmu_invalidate_zap_all_pages(kvm); } void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { - kvm_arch_flush_shadow_all(kvm); + kvm_mmu_invalidate_zap_all_pages(kvm); } int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) @@ -7065,6 +7232,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) !vcpu->arch.apf.halted) || !list_empty_careful(&vcpu->async_pf.done) || kvm_apic_has_events(vcpu) + || vcpu->arch.pv.pv_unhalted || atomic_read(&vcpu->arch.nmi_queued) || (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)); @@ -7264,3 +7432,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile index 94e0e54056a9..8f38d577a2fa 100644 --- a/arch/x86/lguest/Makefile +++ b/arch/x86/lguest/Makefile @@ -1,2 +1,2 @@ -obj-y := i386_head.o boot.o +obj-y := head_32.o boot.o CFLAGS_boot.o := $(call cc-option, -fno-stack-protector) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 7114c63f047d..bdf8532494fe 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -7,8 +7,7 @@ * kernel and insert a module (lg.ko) which allows us to run other Linux * kernels the same way we'd run processes. We call the first kernel the Host, * and the others the Guests. The program which sets up and configures Guests - * (such as the example in Documentation/virtual/lguest/lguest.c) is called the - * Launcher. + * (such as the example in tools/lguest/lguest.c) is called the Launcher. * * Secondly, we only run specially modified Guests, not normal kernels: setting * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows @@ -882,9 +881,9 @@ int lguest_setup_irq(unsigned int irq) * It would be far better for everyone if the Guest had its own clock, but * until then the Host gives us the time on every interrupt. */ -static unsigned long lguest_get_wallclock(void) +static void lguest_get_wallclock(struct timespec *now) { - return lguest_data.time.tv_sec; + *now = lguest_data.time; } /* @@ -1057,6 +1056,12 @@ static void lguest_load_sp0(struct tss_struct *tss, } /* Let's just say, I wouldn't do debugging under a Guest. */ +static unsigned long lguest_get_debugreg(int regno) +{ + /* FIXME: Implement */ + return 0; +} + static void lguest_set_debugreg(int regno, unsigned long value) { /* FIXME: Implement */ @@ -1304,6 +1309,7 @@ __init void lguest_init(void) pv_cpu_ops.load_tr_desc = lguest_load_tr_desc; pv_cpu_ops.set_ldt = lguest_set_ldt; pv_cpu_ops.load_tls = lguest_load_tls; + pv_cpu_ops.get_debugreg = lguest_get_debugreg; pv_cpu_ops.set_debugreg = lguest_set_debugreg; pv_cpu_ops.clts = lguest_clts; pv_cpu_ops.read_cr0 = lguest_read_cr0; @@ -1410,7 +1416,7 @@ __init void lguest_init(void) new_cpu_data.x86_capability[0] = cpuid_edx(1); /* Math is always hard! */ - new_cpu_data.hard_math = 1; + set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU); /* We don't have features. We have puppies! Puppies! */ #ifdef CONFIG_X86_MCE diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/head_32.S index 6ddfe4fc23c3..6ddfe4fc23c3 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/head_32.S diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index 25b7ae8d058a..7609e0e421ec 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -6,6 +6,7 @@ */ #include <asm/checksum.h> #include <linux/module.h> +#include <asm/smap.h> /** * csum_partial_copy_from_user - Copy and checksum from user space. @@ -52,8 +53,10 @@ csum_partial_copy_from_user(const void __user *src, void *dst, len -= 2; } } + stac(); isum = csum_partial_copy_generic((__force const void *)src, dst, len, isum, errp, NULL); + clac(); if (unlikely(*errp)) goto out_err; @@ -82,6 +85,8 @@ __wsum csum_partial_copy_to_user(const void *src, void __user *dst, int len, __wsum isum, int *errp) { + __wsum ret; + might_sleep(); if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) { @@ -105,8 +110,11 @@ csum_partial_copy_to_user(const void *src, void __user *dst, } *errp = 0; - return csum_partial_copy_generic(src, (void __force *)dst, - len, isum, NULL, errp); + stac(); + ret = csum_partial_copy_generic(src, (void __force *)dst, + len, isum, NULL, errp); + clac(); + return ret; } EXPORT_SYMBOL(csum_partial_copy_to_user); diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 906fea315791..c905e89e19fe 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -68,7 +68,7 @@ EXPORT_SYMBOL(copy_in_user); * Since protection fault in copy_from/to_user is not a normal situation, * it is not necessary to optimize tail handling. */ -unsigned long +__visible unsigned long copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest) { char c; diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 5d7e51f3fd28..533a85e3a07e 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -1,10 +1,8 @@ # x86 Opcode Maps # # This is (mostly) based on following documentations. -# - Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2 -# (#325383-040US, October 2011) -# - Intel(R) Advanced Vector Extensions Programming Reference -# (#319433-011,JUNE 2011). +# - Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2C +# (#326018-047US, June 2013) # #<Opcode maps> # Table: table-name @@ -29,6 +27,7 @@ # - (F3): the last prefix is 0xF3 # - (F2): the last prefix is 0xF2 # - (!F3) : the last prefix is not 0xF3 (including non-last prefix case) +# - (66&F2): Both 0x66 and 0xF2 prefixes are specified. Table: one byte opcode Referrer: @@ -246,8 +245,8 @@ c2: RETN Iw (f64) c3: RETN c4: LES Gz,Mp (i64) | VEX+2byte (Prefix) c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix) -c6: Grp11 Eb,Ib (1A) -c7: Grp11 Ev,Iz (1A) +c6: Grp11A Eb,Ib (1A) +c7: Grp11B Ev,Iz (1A) c8: ENTER Iw,Ib c9: LEAVE (d64) ca: RETF Iw @@ -293,8 +292,8 @@ ef: OUT DX,eAX # 0xf0 - 0xff f0: LOCK (Prefix) f1: -f2: REPNE (Prefix) -f3: REP/REPE (Prefix) +f2: REPNE (Prefix) | XACQUIRE (Prefix) +f3: REP/REPE (Prefix) | XRELEASE (Prefix) f4: HLT f5: CMC f6: Grp3_1 Eb (1A) @@ -326,7 +325,8 @@ AVXcode: 1 0a: 0b: UD2 (1B) 0c: -0d: NOP Ev | GrpP +# AMD's prefetch group. Intel supports prefetchw(/1) only. +0d: GrpP 0e: FEMMS # 3DNow! uses the last imm byte as opcode extension. 0f: 3DNow! Pq,Qq,Ib @@ -729,12 +729,12 @@ dc: VAESENC Vdq,Hdq,Wdq (66),(v1) dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1) de: VAESDEC Vdq,Hdq,Wdq (66),(v1) df: VAESDECLAST Vdq,Hdq,Wdq (66),(v1) -f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) -f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) +f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) | CRC32 Gd,Eb (66&F2) +f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) | CRC32 Gd,Ew (66&F2) f2: ANDN Gy,By,Ey (v) f3: Grp17 (1A) f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) -f6: MULX By,Gy,rDX,Ey (F2),(v) +f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v) f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v) EndTable @@ -861,8 +861,8 @@ EndTable GrpTable: Grp7 0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) -1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001) -2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) +1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) +2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) 3: LIDT Ms 4: SMSW Mw/Rv 5: @@ -880,15 +880,21 @@ EndTable GrpTable: Grp9 1: CMPXCHG8B/16B Mq/Mdq 6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B) -7: VMPTRST Mq | VMPTRST Mq (F3) +7: VMPTRST Mq | VMPTRST Mq (F3) | RDSEED Rv (11B) EndTable GrpTable: Grp10 EndTable -GrpTable: Grp11 -# Note: the operands are given by group opcode -0: MOV +# Grp11A and Grp11B are expressed as Grp11 in Intel SDM +GrpTable: Grp11A +0: MOV Eb,Ib +7: XABORT Ib (000),(11B) +EndTable + +GrpTable: Grp11B +0: MOV Eb,Iz +7: XBEGIN Jz (000),(11B) EndTable GrpTable: Grp12 diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 654be4ae3047..3aaeffcfd67a 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -842,23 +842,15 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, force_sig_info_fault(SIGBUS, code, address, tsk, fault); } -static noinline int +static noinline void mm_fault_error(struct pt_regs *regs, unsigned long error_code, unsigned long address, unsigned int fault) { - /* - * Pagefault was interrupted by SIGKILL. We have no reason to - * continue pagefault. - */ - if (fatal_signal_pending(current)) { - if (!(fault & VM_FAULT_RETRY)) - up_read(¤t->mm->mmap_sem); - if (!(error_code & PF_USER)) - no_context(regs, error_code, address, 0, 0); - return 1; + if (fatal_signal_pending(current) && !(error_code & PF_USER)) { + up_read(¤t->mm->mmap_sem); + no_context(regs, error_code, address, 0, 0); + return; } - if (!(fault & VM_FAULT_ERROR)) - return 0; if (fault & VM_FAULT_OOM) { /* Kernel mode? Handle exceptions or die: */ @@ -866,7 +858,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, up_read(¤t->mm->mmap_sem); no_context(regs, error_code, address, SIGSEGV, SEGV_MAPERR); - return 1; + return; } up_read(¤t->mm->mmap_sem); @@ -884,7 +876,6 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, else BUG(); } - return 1; } static int spurious_fault_check(unsigned long error_code, pte_t *pte) @@ -1011,9 +1002,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) unsigned long address; struct mm_struct *mm; int fault; - int write = error_code & PF_WRITE; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (write ? FAULT_FLAG_WRITE : 0); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; tsk = current; mm = tsk->mm; @@ -1083,6 +1072,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) if (user_mode_vm(regs)) { local_irq_enable(); error_code |= PF_USER; + flags |= FAULT_FLAG_USER; } else { if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); @@ -1109,6 +1099,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) return; } + if (error_code & PF_WRITE) + flags |= FAULT_FLAG_WRITE; + /* * When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in @@ -1187,9 +1180,17 @@ good_area: */ fault = handle_mm_fault(mm, vma, address, flags); - if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { - if (mm_fault_error(regs, error_code, address, fault)) - return; + /* + * If we need to retry but a fatal signal is pending, handle the + * signal first. We do not need to release the mmap_sem because it + * would already be released in __lock_page_or_retry in mm/filemap.c. + */ + if (unlikely((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))) + return; + + if (unlikely(fault & VM_FAULT_ERROR)) { + mm_fault_error(regs, error_code, address, fault); + return; } /* diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 252b8f5489ba..4500142bc4aa 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -1,6 +1,7 @@ #include <linux/highmem.h> #include <linux/module.h> #include <linux/swap.h> /* for totalram_pages */ +#include <linux/bootmem.h> void *kmap(struct page *page) { @@ -121,6 +122,11 @@ void __init set_highmem_pages_init(void) struct zone *zone; int nid; + /* + * Explicitly reset zone->managed_pages because set_highmem_pages_init() + * is invoked before free_all_bootmem() + */ + reset_all_zones_managed_pages(); for_each_zone(zone) { unsigned long zone_start_pfn, zone_end_pfn; diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index ae1aa71d0115..9d980d88b747 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -16,169 +16,6 @@ #include <asm/tlbflush.h> #include <asm/pgalloc.h> -static unsigned long page_table_shareable(struct vm_area_struct *svma, - struct vm_area_struct *vma, - unsigned long addr, pgoff_t idx) -{ - unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + - svma->vm_start; - unsigned long sbase = saddr & PUD_MASK; - unsigned long s_end = sbase + PUD_SIZE; - - /* Allow segments to share if only one is marked locked */ - unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED; - unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED; - - /* - * match the virtual addresses, permission and the alignment of the - * page table page. - */ - if (pmd_index(addr) != pmd_index(saddr) || - vm_flags != svm_flags || - sbase < svma->vm_start || svma->vm_end < s_end) - return 0; - - return saddr; -} - -static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) -{ - unsigned long base = addr & PUD_MASK; - unsigned long end = base + PUD_SIZE; - - /* - * check on proper vm_flags and page table alignment - */ - if (vma->vm_flags & VM_MAYSHARE && - vma->vm_start <= base && end <= vma->vm_end) - return 1; - return 0; -} - -/* - * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() - * and returns the corresponding pte. While this is not necessary for the - * !shared pmd case because we can allocate the pmd later as well, it makes the - * code much cleaner. pmd allocation is essential for the shared case because - * pud has to be populated inside the same i_mmap_mutex section - otherwise - * racing tasks could either miss the sharing (see huge_pte_offset) or select a - * bad pmd for sharing. - */ -static pte_t * -huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) -{ - struct vm_area_struct *vma = find_vma(mm, addr); - struct address_space *mapping = vma->vm_file->f_mapping; - pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + - vma->vm_pgoff; - struct vm_area_struct *svma; - unsigned long saddr; - pte_t *spte = NULL; - pte_t *pte; - - if (!vma_shareable(vma, addr)) - return (pte_t *)pmd_alloc(mm, pud, addr); - - mutex_lock(&mapping->i_mmap_mutex); - vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { - if (svma == vma) - continue; - - saddr = page_table_shareable(svma, vma, addr, idx); - if (saddr) { - spte = huge_pte_offset(svma->vm_mm, saddr); - if (spte) { - get_page(virt_to_page(spte)); - break; - } - } - } - - if (!spte) - goto out; - - spin_lock(&mm->page_table_lock); - if (pud_none(*pud)) - pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); - else - put_page(virt_to_page(spte)); - spin_unlock(&mm->page_table_lock); -out: - pte = (pte_t *)pmd_alloc(mm, pud, addr); - mutex_unlock(&mapping->i_mmap_mutex); - return pte; -} - -/* - * unmap huge page backed by shared pte. - * - * Hugetlb pte page is ref counted at the time of mapping. If pte is shared - * indicated by page_count > 1, unmap is achieved by clearing pud and - * decrementing the ref count. If count == 1, the pte page is not shared. - * - * called with vma->vm_mm->page_table_lock held. - * - * returns: 1 successfully unmapped a shared pte page - * 0 the underlying pte page is not shared, or it is the last user - */ -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) -{ - pgd_t *pgd = pgd_offset(mm, *addr); - pud_t *pud = pud_offset(pgd, *addr); - - BUG_ON(page_count(virt_to_page(ptep)) == 0); - if (page_count(virt_to_page(ptep)) == 1) - return 0; - - pud_clear(pud); - put_page(virt_to_page(ptep)); - *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; - return 1; -} - -pte_t *huge_pte_alloc(struct mm_struct *mm, - unsigned long addr, unsigned long sz) -{ - pgd_t *pgd; - pud_t *pud; - pte_t *pte = NULL; - - pgd = pgd_offset(mm, addr); - pud = pud_alloc(mm, pgd, addr); - if (pud) { - if (sz == PUD_SIZE) { - pte = (pte_t *)pud; - } else { - BUG_ON(sz != PMD_SIZE); - if (pud_none(*pud)) - pte = huge_pmd_share(mm, addr, pud); - else - pte = (pte_t *)pmd_alloc(mm, pud, addr); - } - } - BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); - - return pte; -} - -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd = NULL; - - pgd = pgd_offset(mm, addr); - if (pgd_present(*pgd)) { - pud = pud_offset(pgd, addr); - if (pud_present(*pud)) { - if (pud_large(*pud)) - return (pte_t *)pud; - pmd = pmd_offset(pud, addr); - } - } - return (pte_t *) pmd; -} - #if 0 /* This is just for testing */ struct page * follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) @@ -222,6 +59,10 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, return NULL; } +int pmd_huge_support(void) +{ + return 0; +} #else struct page * @@ -240,30 +81,10 @@ int pud_huge(pud_t pud) return !!(pud_val(pud) & _PAGE_PSE); } -struct page * -follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write) +int pmd_huge_support(void) { - struct page *page; - - page = pte_page(*(pte_t *)pmd); - if (page) - page += ((address & ~PMD_MASK) >> PAGE_SHIFT); - return page; -} - -struct page * -follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int write) -{ - struct page *page; - - page = pte_page(*(pte_t *)pud); - if (page) - page += ((address & ~PUD_MASK) >> PAGE_SHIFT); - return page; + return 1; } - #endif /* x86_64 also uses this file */ diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index eaac1743def7..04664cdb7fda 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -78,8 +78,8 @@ __ref void *alloc_low_pages(unsigned int num) return __va(pfn << PAGE_SHIFT); } -/* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */ -#define INIT_PGT_BUF_SIZE (5 * PAGE_SIZE) +/* need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS */ +#define INIT_PGT_BUF_SIZE (6 * PAGE_SIZE) RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); void __init early_alloc_pgt_buf(void) { @@ -277,6 +277,9 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, end_pfn = limit_pfn; nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + if (!after_bootmem) + adjust_range_page_size_mask(mr, nr_range); + /* try to merge same page size and continuous */ for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { unsigned long old_start; @@ -291,9 +294,6 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, nr_range--; } - if (!after_bootmem) - adjust_range_page_size_mask(mr, nr_range); - for (i = 0; i < nr_range; i++) printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n", mr[i].start, mr[i].end - 1, @@ -494,7 +494,6 @@ int devmem_is_allowed(unsigned long pagenr) void free_init_pages(char *what, unsigned long begin, unsigned long end) { - unsigned long addr; unsigned long begin_aligned, end_aligned; /* Make sure boundaries are page aligned */ @@ -509,8 +508,6 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) if (begin >= end) return; - addr = begin; - /* * If debugging page accesses then do not free this memory but * mark them not present - any buggy init-section access will @@ -529,18 +526,13 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); - printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); - - for (; addr < end; addr += PAGE_SIZE) { - memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); - free_reserved_page(virt_to_page(addr)); - } + free_reserved_area((void *)begin, (void *)end, POISON_FREE_INITMEM, what); #endif } void free_initmem(void) { - free_init_pages("unused kernel memory", + free_init_pages("unused kernel", (unsigned long)(&__init_begin), (unsigned long)(&__init_end)); } @@ -566,7 +558,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end) * - relocate_initrd() * So here We can do PAGE_ALIGN() safely to get partial page to be freed */ - free_init_pages("initrd memory", start, PAGE_ALIGN(end)); + free_init_pages("initrd", start, PAGE_ALIGN(end)); } #endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 3ac7e319918d..4287f1ffba7e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -660,10 +660,8 @@ void __init initmem_init(void) highstart_pfn = max_low_pfn; printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); - num_physpages = highend_pfn; high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; #else - num_physpages = max_low_pfn; high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif @@ -671,7 +669,7 @@ void __init initmem_init(void) sparse_memory_present_with_active_regions(0); #ifdef CONFIG_FLATMEM - max_mapnr = num_physpages; + max_mapnr = IS_ENABLED(CONFIG_HIGHMEM) ? highend_pfn : max_low_pfn; #endif __vmalloc_start_set = true; @@ -739,9 +737,6 @@ static void __init test_wp_bit(void) void __init mem_init(void) { - int codesize, reservedpages, datasize, initsize; - int tmp; - pci_iommu_alloc(); #ifdef CONFIG_FLATMEM @@ -759,32 +754,11 @@ void __init mem_init(void) set_highmem_pages_init(); /* this will put all low memory onto the freelists */ - totalram_pages += free_all_bootmem(); - - reservedpages = 0; - for (tmp = 0; tmp < max_low_pfn; tmp++) - /* - * Only count reserved RAM pages: - */ - if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) - reservedpages++; + free_all_bootmem(); after_bootmem = 1; - codesize = (unsigned long) &_etext - (unsigned long) &_text; - datasize = (unsigned long) &_edata - (unsigned long) &_etext; - initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; - - printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, " - "%dk reserved, %dk data, %dk init, %ldk highmem)\n", - nr_free_pages() << (PAGE_SHIFT-10), - num_physpages << (PAGE_SHIFT-10), - codesize >> 10, - reservedpages << (PAGE_SHIFT-10), - datasize >> 10, - initsize >> 10, - totalhigh_pages << (PAGE_SHIFT-10)); - + mem_init_print_info(NULL); printk(KERN_INFO "virtual kernel memory layout:\n" " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" #ifdef CONFIG_HIGHMEM diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index bb00c4672ad6..104d56a9245f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -368,7 +368,7 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size) * * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text) * - * phys_addr holds the negative offset to the kernel, which is added + * phys_base holds the negative offset to the kernel, which is added * to the compile time generated pmds. This results in invalid pmds up * to the point where we hit the physaddr 0 mapping. * @@ -712,36 +712,22 @@ EXPORT_SYMBOL_GPL(arch_add_memory); static void __meminit free_pagetable(struct page *page, int order) { - struct zone *zone; - bool bootmem = false; unsigned long magic; unsigned int nr_pages = 1 << order; /* bootmem page has reserved flag */ if (PageReserved(page)) { __ClearPageReserved(page); - bootmem = true; magic = (unsigned long)page->lru.next; if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { while (nr_pages--) put_page_bootmem(page++); } else - __free_pages_bootmem(page, order); + while (nr_pages--) + free_reserved_page(page++); } else free_pages((unsigned long)page_address(page), order); - - /* - * SECTION_INFO pages and MIX_SECTION_INFO pages - * are all allocated by bootmem. - */ - if (bootmem) { - zone = page_zone(page); - zone_span_writelock(zone); - zone->present_pages += nr_pages; - zone_span_writeunlock(zone); - totalram_pages += nr_pages; - } } static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) @@ -1058,9 +1044,6 @@ static void __init register_page_bootmem_info(void) void __init mem_init(void) { - long codesize, reservedpages, datasize, initsize; - unsigned long absent_pages; - pci_iommu_alloc(); /* clear_bss() already clear the empty_zero_page */ @@ -1068,29 +1051,14 @@ void __init mem_init(void) register_page_bootmem_info(); /* this will put all memory onto the freelists */ - totalram_pages = free_all_bootmem(); - - absent_pages = absent_pages_in_range(0, max_pfn); - reservedpages = max_pfn - totalram_pages - absent_pages; + free_all_bootmem(); after_bootmem = 1; - codesize = (unsigned long) &_etext - (unsigned long) &_text; - datasize = (unsigned long) &_edata - (unsigned long) &_etext; - initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; - /* Register memory areas for /proc/kcore */ kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, VSYSCALL_END - VSYSCALL_START, KCORE_OTHER); - printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " - "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n", - nr_free_pages() << (PAGE_SHIFT-10), - max_pfn << (PAGE_SHIFT-10), - codesize >> 10, - absent_pages << (PAGE_SHIFT-10), - reservedpages << (PAGE_SHIFT-10), - datasize >> 10, - initsize >> 10); + mem_init_print_info(NULL); } #ifdef CONFIG_DEBUG_RODATA @@ -1166,11 +1134,10 @@ void mark_rodata_ro(void) set_memory_ro(start, (end-start) >> PAGE_SHIFT); #endif - free_init_pages("unused kernel memory", + free_init_pages("unused kernel", (unsigned long) __va(__pa_symbol(text_end)), (unsigned long) __va(__pa_symbol(rodata_start))); - - free_init_pages("unused kernel memory", + free_init_pages("unused kernel", (unsigned long) __va(__pa_symbol(rodata_end)), (unsigned long) __va(__pa_symbol(_sdata))); } diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 9a1e6583910c..799580cabc78 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -487,7 +487,7 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) unsigned long offset; resource_size_t last_addr; unsigned int nrpages; - enum fixed_addresses idx0, idx; + enum fixed_addresses idx; int i, slot; WARN_ON(system_state != SYSTEM_BOOTING); @@ -501,15 +501,15 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) } if (slot < 0) { - printk(KERN_INFO "early_iomap(%08llx, %08lx) not found slot\n", - (u64)phys_addr, size); + printk(KERN_INFO "%s(%08llx, %08lx) not found slot\n", + __func__, (u64)phys_addr, size); WARN_ON(1); return NULL; } if (early_ioremap_debug) { - printk(KERN_INFO "early_ioremap(%08llx, %08lx) [%d] => ", - (u64)phys_addr, size, slot); + printk(KERN_INFO "%s(%08llx, %08lx) [%d] => ", + __func__, (u64)phys_addr, size, slot); dump_stack(); } @@ -540,8 +540,7 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) /* * Ok, go for it.. */ - idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; - idx = idx0; + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; while (nrpages > 0) { early_set_fixmap(idx, phys_addr, prot); phys_addr += PAGE_SIZE; diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 845df6835f9f..25e7e1372bb2 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -112,13 +112,13 @@ static unsigned long mmap_legacy_base(void) */ void arch_pick_mmap_layout(struct mm_struct *mm) { + mm->mmap_legacy_base = mmap_legacy_base(); + mm->mmap_base = mmap_base(); + if (mmap_is_legacy()) { - mm->mmap_base = mmap_legacy_base(); + mm->mmap_base = mm->mmap_legacy_base; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { - mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index dc0b727742f4..0057a7accfb1 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -410,9 +410,7 @@ out: pr_warning("multiple CPUs still online, may miss events.\n"); } -/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit, - but this whole function is ifdefed CONFIG_HOTPLUG_CPU */ -static void __ref leave_uniprocessor(void) +static void leave_uniprocessor(void) { int cpu; int err; diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index a71c4e207679..8bf93bae1f13 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -60,7 +60,7 @@ s16 __apicid_to_node[MAX_LOCAL_APIC] = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; -int __cpuinit numa_cpu_node(int cpu) +int numa_cpu_node(int cpu) { int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); @@ -691,12 +691,12 @@ void __init init_cpu_to_node(void) #ifndef CONFIG_DEBUG_PER_CPU_MAPS # ifndef CONFIG_NUMA_EMU -void __cpuinit numa_add_cpu(int cpu) +void numa_add_cpu(int cpu) { cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); } -void __cpuinit numa_remove_cpu(int cpu) +void numa_remove_cpu(int cpu) { cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); } @@ -763,17 +763,17 @@ void debug_cpumask_set_cpu(int cpu, int node, bool enable) } # ifndef CONFIG_NUMA_EMU -static void __cpuinit numa_set_cpumask(int cpu, bool enable) +static void numa_set_cpumask(int cpu, bool enable) { debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable); } -void __cpuinit numa_add_cpu(int cpu) +void numa_add_cpu(int cpu) { numa_set_cpumask(cpu, true); } -void __cpuinit numa_remove_cpu(int cpu) +void numa_remove_cpu(int cpu) { numa_set_cpumask(cpu, false); } diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 73a6d7395bd3..0342d27ca798 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -83,10 +83,8 @@ void __init initmem_init(void) highstart_pfn = max_low_pfn; printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); - num_physpages = highend_pfn; high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; #else - num_physpages = max_low_pfn; high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif printk(KERN_NOTICE "%ldMB LOWMEM available.\n", diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index dbbbb47260cc..a8f90ce3dedf 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -10,7 +10,7 @@ #include "numa_internal.h" -static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata; +static int emu_nid_to_phys[MAX_NUMNODES]; static char *emu_cmdline __initdata; void __init numa_emu_cmdline(char *str) @@ -444,7 +444,7 @@ no_emu: } #ifndef CONFIG_DEBUG_PER_CPU_MAPS -void __cpuinit numa_add_cpu(int cpu) +void numa_add_cpu(int cpu) { int physnid, nid; @@ -462,7 +462,7 @@ void __cpuinit numa_add_cpu(int cpu) cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); } -void __cpuinit numa_remove_cpu(int cpu) +void numa_remove_cpu(int cpu) { int i; @@ -470,7 +470,7 @@ void __cpuinit numa_remove_cpu(int cpu) cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); } #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ -static void __cpuinit numa_set_cpumask(int cpu, bool enable) +static void numa_set_cpumask(int cpu, bool enable) { int nid, physnid; @@ -490,12 +490,12 @@ static void __cpuinit numa_set_cpumask(int cpu, bool enable) } } -void __cpuinit numa_add_cpu(int cpu) +void numa_add_cpu(int cpu) { numa_set_cpumask(cpu, true); } -void __cpuinit numa_remove_cpu(int cpu) +void numa_remove_cpu(int cpu) { numa_set_cpumask(cpu, false); } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 17fda6a8b3c2..dfa537a03be1 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -240,7 +240,6 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) { pud_t *pud; - unsigned long addr; int i; if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ @@ -248,8 +247,7 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) pud = pud_offset(pgd, 0); - for (addr = i = 0; i < PREALLOCATED_PMDS; - i++, pud++, addr += PUD_SIZE) { + for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { pmd_t *pmd = pmds[i]; if (i >= KERNEL_PGD_BOUNDARY) diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index 410531d3c292..90555bf60aa4 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -5,7 +5,7 @@ #include <asm/pgtable.h> #include <asm/proto.h> -static int disable_nx __cpuinitdata; +static int disable_nx; /* * noexec = on|off @@ -29,7 +29,7 @@ static int __init noexec_setup(char *str) } early_param("noexec", noexec_setup); -void __cpuinit x86_configure_nx(void) +void x86_configure_nx(void) { if (cpu_has_nx && !disable_nx) __supported_pte_mask |= _PAGE_NX; diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index cdd0da9dd530..266ca912f62e 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -146,6 +146,7 @@ int __init acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) { u64 start, end; + u32 hotpluggable; int node, pxm; if (srat_disabled()) @@ -154,7 +155,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) goto out_err_bad_srat; if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) goto out_err; - if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) + hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE; + if (hotpluggable && !save_add_info()) goto out_err; start = ma->base_address; @@ -174,9 +176,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) node_set(node, numa_nodes_parsed); - printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n", - node, pxm, - (unsigned long long) start, (unsigned long long) end - 1); + pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s\n", + node, pxm, + (unsigned long long) start, (unsigned long long) end - 1, + hotpluggable ? " hotplug" : ""); return 0; out_err_bad_srat: diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 282375f13c7e..ae699b3bbac8 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -103,6 +103,7 @@ static void flush_tlb_func(void *info) if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) return; + count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { if (f->flush_end == TLB_FLUSH_ALL) local_flush_tlb(); @@ -130,6 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, info.flush_start = start; info.flush_end = end; + count_vm_event(NR_TLB_REMOTE_FLUSH); if (is_uv_system()) { unsigned int cpu; @@ -149,6 +151,7 @@ void flush_tlb_current_task(void) preempt_disable(); + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); @@ -211,16 +214,19 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm; /* tlb_flushall_shift is on balance point, details in commit log */ - if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) + if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) { + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); - else { + } else { if (has_large_page(mm, start, end)) { local_flush_tlb(); goto flush_all; } /* flush range by one by one 'invlpg' */ - for (addr = start; addr < end; addr += PAGE_SIZE) + for (addr = start; addr < end; addr += PAGE_SIZE) { + count_vm_event(NR_TLB_LOCAL_FLUSH_ONE); __flush_tlb_single(addr); + } if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) @@ -256,6 +262,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) static void do_flush_tlb_all(void *info) { + count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); __flush_tlb_all(); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) leave_mm(smp_processor_id()); @@ -263,6 +270,7 @@ static void do_flush_tlb_all(void *info) void flush_tlb_all(void) { + count_vm_event(NR_TLB_REMOTE_FLUSH); on_each_cpu(do_flush_tlb_all, NULL, 1); } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index f66b54086ce5..79c216aa0e2b 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -12,6 +12,7 @@ #include <linux/netdevice.h> #include <linux/filter.h> #include <linux/if_vlan.h> +#include <linux/random.h> /* * Conventions : @@ -144,6 +145,39 @@ static int pkt_type_offset(void) return -1; } +struct bpf_binary_header { + unsigned int pages; + /* Note : for security reasons, bpf code will follow a randomly + * sized amount of int3 instructions + */ + u8 image[]; +}; + +static struct bpf_binary_header *bpf_alloc_binary(unsigned int proglen, + u8 **image_ptr) +{ + unsigned int sz, hole; + struct bpf_binary_header *header; + + /* Most of BPF filters are really small, + * but if some of them fill a page, allow at least + * 128 extra bytes to insert a random section of int3 + */ + sz = round_up(proglen + sizeof(*header) + 128, PAGE_SIZE); + header = module_alloc(sz); + if (!header) + return NULL; + + memset(header, 0xcc, sz); /* fill whole space with int3 instructions */ + + header->pages = sz / PAGE_SIZE; + hole = sz - (proglen + sizeof(*header)); + + /* insert a random number of int3 instructions before BPF code */ + *image_ptr = &header->image[prandom_u32() % hole]; + return header; +} + void bpf_jit_compile(struct sk_filter *fp) { u8 temp[64]; @@ -153,6 +187,7 @@ void bpf_jit_compile(struct sk_filter *fp) int t_offset, f_offset; u8 t_op, f_op, seen = 0, pass; u8 *image = NULL; + struct bpf_binary_header *header = NULL; u8 *func; int pc_ret0 = -1; /* bpf index of first RET #0 instruction (if any) */ unsigned int cleanup_addr; /* epilogue code offset */ @@ -693,7 +728,7 @@ cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i]; if (unlikely(proglen + ilen > oldproglen)) { pr_err("bpb_jit_compile fatal error\n"); kfree(addrs); - module_free(NULL, image); + module_free(NULL, header); return; } memcpy(image + proglen, temp, ilen); @@ -717,10 +752,8 @@ cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i]; break; } if (proglen == oldproglen) { - image = module_alloc(max_t(unsigned int, - proglen, - sizeof(struct work_struct))); - if (!image) + header = bpf_alloc_binary(proglen, &image); + if (!header) goto out; } oldproglen = proglen; @@ -730,7 +763,8 @@ cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i]; bpf_jit_dump(flen, proglen, pass, image); if (image) { - bpf_flush_icache(image, image + proglen); + bpf_flush_icache(header, image + proglen); + set_memory_ro((unsigned long)header, header->pages); fp->bpf_func = (void *)image; } out: @@ -738,20 +772,13 @@ out: return; } -static void jit_free_defer(struct work_struct *arg) -{ - module_free(NULL, arg); -} - -/* run from softirq, we must use a work_struct to call - * module_free() from process context - */ void bpf_jit_free(struct sk_filter *fp) { if (fp->bpf_func != sk_run_filter) { - struct work_struct *work = (struct work_struct *)fp->bpf_func; + unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; + struct bpf_binary_header *header = (void *)addr; - INIT_WORK(work, jit_free_defer); - schedule_work(work); + set_memory_rw(addr, header->pages); + module_free(NULL, header); } } diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 48768df2471a..6890d8498e0b 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -403,7 +403,7 @@ static void nmi_cpu_down(void *dummy) nmi_cpu_shutdown(dummy); } -static int nmi_create_files(struct super_block *sb, struct dentry *root) +static int nmi_create_files(struct dentry *root) { unsigned int i; @@ -420,14 +420,14 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root) continue; snprintf(buf, sizeof(buf), "%d", i); - dir = oprofilefs_mkdir(sb, root, buf); - oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled); - oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event); - oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count); - oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); - oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); - oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); - oprofilefs_create_ulong(sb, dir, "extra", &counter_config[i].extra); + dir = oprofilefs_mkdir(root, buf); + oprofilefs_create_ulong(dir, "enabled", &counter_config[i].enabled); + oprofilefs_create_ulong(dir, "event", &counter_config[i].event); + oprofilefs_create_ulong(dir, "count", &counter_config[i].count); + oprofilefs_create_ulong(dir, "unit_mask", &counter_config[i].unit_mask); + oprofilefs_create_ulong(dir, "kernel", &counter_config[i].kernel); + oprofilefs_create_ulong(dir, "user", &counter_config[i].user); + oprofilefs_create_ulong(dir, "extra", &counter_config[i].extra); } return 0; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index b2b94438ff05..50d86c0e9ba4 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -454,16 +454,16 @@ static void init_ibs(void) printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps); } -static int (*create_arch_files)(struct super_block *sb, struct dentry *root); +static int (*create_arch_files)(struct dentry *root); -static int setup_ibs_files(struct super_block *sb, struct dentry *root) +static int setup_ibs_files(struct dentry *root) { struct dentry *dir; int ret = 0; /* architecture specific files */ if (create_arch_files) - ret = create_arch_files(sb, root); + ret = create_arch_files(root); if (ret) return ret; @@ -479,26 +479,26 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) ibs_config.max_cnt_op = 250000; if (ibs_caps & IBS_CAPS_FETCHSAM) { - dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); - oprofilefs_create_ulong(sb, dir, "enable", + dir = oprofilefs_mkdir(root, "ibs_fetch"); + oprofilefs_create_ulong(dir, "enable", &ibs_config.fetch_enabled); - oprofilefs_create_ulong(sb, dir, "max_count", + oprofilefs_create_ulong(dir, "max_count", &ibs_config.max_cnt_fetch); - oprofilefs_create_ulong(sb, dir, "rand_enable", + oprofilefs_create_ulong(dir, "rand_enable", &ibs_config.rand_en); } if (ibs_caps & IBS_CAPS_OPSAM) { - dir = oprofilefs_mkdir(sb, root, "ibs_op"); - oprofilefs_create_ulong(sb, dir, "enable", + dir = oprofilefs_mkdir(root, "ibs_op"); + oprofilefs_create_ulong(dir, "enable", &ibs_config.op_enabled); - oprofilefs_create_ulong(sb, dir, "max_count", + oprofilefs_create_ulong(dir, "max_count", &ibs_config.max_cnt_op); if (ibs_caps & IBS_CAPS_OPCNT) - oprofilefs_create_ulong(sb, dir, "dispatched_ops", + oprofilefs_create_ulong(dir, "dispatched_ops", &ibs_config.dispatched_ops); if (ibs_caps & IBS_CAPS_BRNTRGT) - oprofilefs_create_ulong(sb, dir, "branch_target", + oprofilefs_create_ulong(dir, "branch_target", &ibs_config.branch_target); } diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 3e724256dbee..b30e937689d6 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -324,14 +324,11 @@ setup_resource(struct acpi_resource *acpi_res, void *data) res->start = start; res->end = end; info->res_offset[info->res_num] = addr.translation_offset; + info->res_num++; - if (!pci_use_crs) { + if (!pci_use_crs) dev_printk(KERN_DEBUG, &info->bridge->dev, "host bridge window %pR (ignored)\n", res); - return AE_OK; - } - - info->res_num++; return AE_OK; } @@ -571,13 +568,8 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) */ if (bus) { struct pci_bus *child; - list_for_each_entry(child, &bus->children, node) { - struct pci_dev *self = child->self; - if (!self) - continue; - - pcie_bus_configure_settings(child, self->pcie_mpss); - } + list_for_each_entry(child, &bus->children, node) + pcie_bus_configure_settings(child); } if (bus && node != -1) { diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index e9e6ed5cdf94..a48be98e9ded 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -312,7 +312,7 @@ static int __init early_fill_mp_bus_info(void) #define ENABLE_CF8_EXT_CFG (1ULL << 46) -static void __cpuinit enable_pci_io_ecs(void *unused) +static void enable_pci_io_ecs(void *unused) { u64 reg; rdmsrl(MSR_AMD64_NB_CFG, reg); @@ -322,8 +322,8 @@ static void __cpuinit enable_pci_io_ecs(void *unused) } } -static int __cpuinit amd_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int amd_cpu_notify(struct notifier_block *self, unsigned long action, + void *hcpu) { int cpu = (long)hcpu; switch (action) { @@ -337,7 +337,7 @@ static int __cpuinit amd_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata amd_cpu_notifier = { +static struct notifier_block amd_cpu_notifier = { .notifier_call = amd_cpu_notify, }; diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 94919e307f8e..db6b1ab43255 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -210,6 +210,8 @@ static void pcibios_allocate_bridge_resources(struct pci_dev *dev) r = &dev->resource[idx]; if (!r->flags) continue; + if (r->parent) /* Already allocated */ + continue; if (!r->start || pci_claim_resource(dev, idx) < 0) { /* * Something is wrong with the region. @@ -318,6 +320,8 @@ static void pcibios_allocate_dev_rom_resource(struct pci_dev *dev) r = &dev->resource[PCI_ROM_RESOURCE]; if (!r->flags || !r->start) return; + if (r->parent) /* Already allocated */ + return; if (pci_claim_resource(dev, PCI_ROM_RESOURCE) < 0) { r->end -= r->start; diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c index 6eb18c42a28a..903fded50786 100644 --- a/arch/x86/pci/mrst.c +++ b/arch/x86/pci/mrst.c @@ -23,11 +23,11 @@ #include <linux/ioport.h> #include <linux/init.h> #include <linux/dmi.h> +#include <linux/acpi.h> +#include <linux/io.h> +#include <linux/smp.h> -#include <asm/acpi.h> #include <asm/segment.h> -#include <asm/io.h> -#include <asm/smp.h> #include <asm/pci_x86.h> #include <asm/hw_irq.h> #include <asm/io_apic.h> @@ -43,7 +43,7 @@ #define PCI_FIXED_BAR_4_SIZE 0x14 #define PCI_FIXED_BAR_5_SIZE 0x1c -static int pci_soc_mode = 0; +static int pci_soc_mode; /** * fixed_bar_cap - return the offset of the fixed BAR cap if found @@ -141,7 +141,8 @@ static int pci_device_update_fixed(struct pci_bus *bus, unsigned int devfn, */ static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg) { - /* This is a workaround for A0 LNC bug where PCI status register does + /* + * This is a workaround for A0 LNC bug where PCI status register does * not have new CAP bit set. can not be written by SW either. * * PCI header type in real LNC indicates a single function device, this @@ -154,7 +155,7 @@ static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg) || devfn == PCI_DEVFN(0, 0) || devfn == PCI_DEVFN(3, 0))) return 1; - return 0; /* langwell on others */ + return 0; /* Langwell on others */ } static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, @@ -172,7 +173,8 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, { int offset; - /* On MRST, there is no PCI ROM BAR, this will cause a subsequent read + /* + * On MRST, there is no PCI ROM BAR, this will cause a subsequent read * to ROM BAR return 0 then being ignored. */ if (where == PCI_ROM_ADDRESS) @@ -210,7 +212,8 @@ static int mrst_pci_irq_enable(struct pci_dev *dev) pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); - /* MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to + /* + * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to * IOAPIC RTE entries, so we just enable RTE for the device. */ irq_attr.ioapic = mp_find_ioapic(dev->irq); @@ -235,7 +238,7 @@ struct pci_ops pci_mrst_ops = { */ int __init pci_mrst_init(void) { - printk(KERN_INFO "Intel MID platform detected, using MID PCI ops\n"); + pr_info("Intel MID platform detected, using MID PCI ops\n"); pci_mmcfg_late_init(); pcibios_enable_irq = mrst_pci_irq_enable; pci_root_ops = pci_mrst_ops; @@ -244,17 +247,21 @@ int __init pci_mrst_init(void) return 1; } -/* Langwell devices are not true pci devices, they are not subject to 10 ms - * d3 to d0 delay required by pci spec. +/* + * Langwell devices are not true PCI devices; they are not subject to 10 ms + * d3 to d0 delay required by PCI spec. */ static void pci_d3delay_fixup(struct pci_dev *dev) { - /* PCI fixups are effectively decided compile time. If we have a dual - SoC/non-SoC kernel we don't want to mangle d3 on non SoC devices */ - if (!pci_soc_mode) - return; - /* true pci devices in lincroft should allow type 1 access, the rest - * are langwell fake pci devices. + /* + * PCI fixups are effectively decided compile time. If we have a dual + * SoC/non-SoC kernel we don't want to mangle d3 on non-SoC devices. + */ + if (!pci_soc_mode) + return; + /* + * True PCI devices in Lincroft should allow type 1 access, the rest + * are Langwell fake PCI devices. */ if (type1_access_ok(dev->bus->number, dev->devfn, PCI_DEVICE_ID)) return; diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c index f8ab4945892e..8244f5ec2f4c 100644 --- a/arch/x86/platform/ce4100/ce4100.c +++ b/arch/x86/platform/ce4100/ce4100.c @@ -12,8 +12,10 @@ #include <linux/kernel.h> #include <linux/irq.h> #include <linux/module.h> +#include <linux/reboot.h> #include <linux/serial_reg.h> #include <linux/serial_8250.h> +#include <linux/reboot.h> #include <asm/ce4100.h> #include <asm/prom.h> @@ -134,7 +136,7 @@ static void __init sdv_arch_setup(void) } #ifdef CONFIG_X86_IO_APIC -static void __cpuinit sdv_pci_init(void) +static void sdv_pci_init(void) { x86_of_pci_init(); /* We can't set this earlier, because we need to calibrate the timer */ diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 82089d8b1954..c7e22ab29a5a 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -42,7 +42,6 @@ #include <linux/io.h> #include <linux/reboot.h> #include <linux/bcd.h> -#include <linux/ucs2_string.h> #include <asm/setup.h> #include <asm/efi.h> @@ -54,12 +53,12 @@ #define EFI_DEBUG 1 -/* - * There's some additional metadata associated with each - * variable. Intel's reference implementation is 60 bytes - bump that - * to account for potential alignment constraints - */ -#define VAR_METADATA_SIZE 64 +#define EFI_MIN_RESERVE 5120 + +#define EFI_DUMMY_GUID \ + EFI_GUID(0x4424ac57, 0xbe4b, 0x47dd, 0x9e, 0x97, 0xed, 0x50, 0xf0, 0x9f, 0x92, 0xa9) + +static efi_char16_t efi_dummy_name[6] = { 'D', 'U', 'M', 'M', 'Y', 0 }; struct efi __read_mostly efi = { .mps = EFI_INVALID_TABLE_ADDR, @@ -79,13 +78,6 @@ struct efi_memory_map memmap; static struct efi efi_phys __initdata; static efi_system_table_t efi_systab __initdata; -static u64 efi_var_store_size; -static u64 efi_var_remaining_size; -static u64 efi_var_max_var_size; -static u64 boot_used_size; -static u64 boot_var_size; -static u64 active_size; - unsigned long x86_efi_facility; /* @@ -188,53 +180,8 @@ static efi_status_t virt_efi_get_next_variable(unsigned long *name_size, efi_char16_t *name, efi_guid_t *vendor) { - efi_status_t status; - static bool finished = false; - static u64 var_size; - - status = efi_call_virt3(get_next_variable, - name_size, name, vendor); - - if (status == EFI_NOT_FOUND) { - finished = true; - if (var_size < boot_used_size) { - boot_var_size = boot_used_size - var_size; - active_size += boot_var_size; - } else { - printk(KERN_WARNING FW_BUG "efi: Inconsistent initial sizes\n"); - } - } - - if (boot_used_size && !finished) { - unsigned long size = 0; - u32 attr; - efi_status_t s; - void *tmp; - - s = virt_efi_get_variable(name, vendor, &attr, &size, NULL); - - if (s != EFI_BUFFER_TOO_SMALL || !size) - return status; - - tmp = kmalloc(size, GFP_ATOMIC); - - if (!tmp) - return status; - - s = virt_efi_get_variable(name, vendor, &attr, &size, tmp); - - if (s == EFI_SUCCESS && (attr & EFI_VARIABLE_NON_VOLATILE)) { - var_size += size; - var_size += ucs2_strsize(name, 1024); - active_size += size; - active_size += VAR_METADATA_SIZE; - active_size += ucs2_strsize(name, 1024); - } - - kfree(tmp); - } - - return status; + return efi_call_virt3(get_next_variable, + name_size, name, vendor); } static efi_status_t virt_efi_set_variable(efi_char16_t *name, @@ -243,34 +190,9 @@ static efi_status_t virt_efi_set_variable(efi_char16_t *name, unsigned long data_size, void *data) { - efi_status_t status; - u32 orig_attr = 0; - unsigned long orig_size = 0; - - status = virt_efi_get_variable(name, vendor, &orig_attr, &orig_size, - NULL); - - if (status != EFI_BUFFER_TOO_SMALL) - orig_size = 0; - - status = efi_call_virt5(set_variable, - name, vendor, attr, - data_size, data); - - if (status == EFI_SUCCESS) { - if (orig_size) { - active_size -= orig_size; - active_size -= ucs2_strsize(name, 1024); - active_size -= VAR_METADATA_SIZE; - } - if (data_size) { - active_size += data_size; - active_size += ucs2_strsize(name, 1024); - active_size += VAR_METADATA_SIZE; - } - } - - return status; + return efi_call_virt5(set_variable, + name, vendor, attr, + data_size, data); } static efi_status_t virt_efi_query_variable_info(u32 attr, @@ -352,8 +274,9 @@ static efi_status_t __init phys_efi_get_time(efi_time_t *tm, return status; } -int efi_set_rtc_mmss(unsigned long nowtime) +int efi_set_rtc_mmss(const struct timespec *now) { + unsigned long nowtime = now->tv_sec; efi_status_t status; efi_time_t eft; efi_time_cap_t cap; @@ -388,7 +311,7 @@ int efi_set_rtc_mmss(unsigned long nowtime) return 0; } -unsigned long efi_get_time(void) +void efi_get_time(struct timespec *now) { efi_status_t status; efi_time_t eft; @@ -398,8 +321,9 @@ unsigned long efi_get_time(void) if (status != EFI_SUCCESS) pr_err("Oops: efitime: can't read time!\n"); - return mktime(eft.year, eft.month, eft.day, eft.hour, - eft.minute, eft.second); + now->tv_sec = mktime(eft.year, eft.month, eft.day, eft.hour, + eft.minute, eft.second); + now->tv_nsec = 0; } /* @@ -786,9 +710,6 @@ void __init efi_init(void) char vendor[100] = "unknown"; int i = 0; void *tmp; - struct setup_data *data; - struct efi_var_bootdata *efi_var_data; - u64 pa_data; #ifdef CONFIG_X86_32 if (boot_params.efi_info.efi_systab_hi || @@ -806,22 +727,6 @@ void __init efi_init(void) if (efi_systab_init(efi_phys.systab)) return; - pa_data = boot_params.hdr.setup_data; - while (pa_data) { - data = early_ioremap(pa_data, sizeof(*efi_var_data)); - if (data->type == SETUP_EFI_VARS) { - efi_var_data = (struct efi_var_bootdata *)data; - - efi_var_store_size = efi_var_data->store_size; - efi_var_remaining_size = efi_var_data->remaining_size; - efi_var_max_var_size = efi_var_data->max_var_size; - } - pa_data = data->next; - early_iounmap(data, sizeof(*efi_var_data)); - } - - boot_used_size = efi_var_store_size - efi_var_remaining_size; - set_bit(EFI_SYSTEM_TABLES, &x86_efi_facility); /* @@ -1007,10 +912,13 @@ void __init efi_enter_virtual_mode(void) for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { md = p; - if (!(md->attribute & EFI_MEMORY_RUNTIME) && - md->type != EFI_BOOT_SERVICES_CODE && - md->type != EFI_BOOT_SERVICES_DATA) - continue; + if (!(md->attribute & EFI_MEMORY_RUNTIME)) { +#ifdef CONFIG_X86_64 + if (md->type != EFI_BOOT_SERVICES_CODE && + md->type != EFI_BOOT_SERVICES_DATA) +#endif + continue; + } size = md->num_pages << EFI_PAGE_SHIFT; end = md->phys_addr + size; @@ -1085,6 +993,13 @@ void __init efi_enter_virtual_mode(void) runtime_code_page_mkexec(); kfree(new_memmap); + + /* clean DUMMY object */ + efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID, + EFI_VARIABLE_NON_VOLATILE | + EFI_VARIABLE_BOOTSERVICE_ACCESS | + EFI_VARIABLE_RUNTIME_ACCESS, + 0, NULL); } /* @@ -1136,33 +1051,70 @@ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size) efi_status_t status; u64 storage_size, remaining_size, max_size; + if (!(attributes & EFI_VARIABLE_NON_VOLATILE)) + return 0; + status = efi.query_variable_info(attributes, &storage_size, &remaining_size, &max_size); if (status != EFI_SUCCESS) return status; - if (!max_size && remaining_size > size) - printk_once(KERN_ERR FW_BUG "Broken EFI implementation" - " is returning MaxVariableSize=0\n"); /* * Some firmware implementations refuse to boot if there's insufficient * space in the variable store. We account for that by refusing the * write if permitting it would reduce the available space to under - * 50%. However, some firmware won't reclaim variable space until - * after the used (not merely the actively used) space drops below - * a threshold. We can approximate that case with the value calculated - * above. If both the firmware and our calculations indicate that the - * available space would drop below 50%, refuse the write. + * 5KB. This figure was provided by Samsung, so should be safe. */ + if ((remaining_size - size < EFI_MIN_RESERVE) && + !efi_no_storage_paranoia) { + + /* + * Triggering garbage collection may require that the firmware + * generate a real EFI_OUT_OF_RESOURCES error. We can force + * that by attempting to use more space than is available. + */ + unsigned long dummy_size = remaining_size + 1024; + void *dummy = kzalloc(dummy_size, GFP_ATOMIC); + + if (!dummy) + return EFI_OUT_OF_RESOURCES; + + status = efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID, + EFI_VARIABLE_NON_VOLATILE | + EFI_VARIABLE_BOOTSERVICE_ACCESS | + EFI_VARIABLE_RUNTIME_ACCESS, + dummy_size, dummy); + + if (status == EFI_SUCCESS) { + /* + * This should have failed, so if it didn't make sure + * that we delete it... + */ + efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID, + EFI_VARIABLE_NON_VOLATILE | + EFI_VARIABLE_BOOTSERVICE_ACCESS | + EFI_VARIABLE_RUNTIME_ACCESS, + 0, dummy); + } + + kfree(dummy); - if (!storage_size || size > remaining_size || - (max_size && size > max_size)) - return EFI_OUT_OF_RESOURCES; + /* + * The runtime code may now have triggered a garbage collection + * run, so check the variable info again + */ + status = efi.query_variable_info(attributes, &storage_size, + &remaining_size, &max_size); - if (!efi_no_storage_paranoia && - ((active_size + size + VAR_METADATA_SIZE > storage_size / 2) && - (remaining_size - size < storage_size / 2))) - return EFI_OUT_OF_RESOURCES; + if (status != EFI_SUCCESS) + return status; + + /* + * There still isn't enough room, so return an error + */ + if (remaining_size - size < EFI_MIN_RESERVE) + return EFI_OUT_OF_RESOURCES; + } return EFI_SUCCESS; } diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index a0a0a4389bbd..3ca5957b7a34 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -20,7 +20,7 @@ #include <linux/intel_pmic_gpio.h> #include <linux/spi/spi.h> #include <linux/i2c.h> -#include <linux/i2c/pca953x.h> +#include <linux/platform_data/pca953x.h> #include <linux/gpio_keys.h> #include <linux/input.h> #include <linux/platform_device.h> @@ -65,7 +65,7 @@ * lapic (always-on,ARAT) ------ 150 */ -__cpuinitdata enum mrst_timer_options mrst_timer_options; +enum mrst_timer_options mrst_timer_options; static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; @@ -248,7 +248,7 @@ static void __init mrst_time_init(void) apbt_time_init(); } -static void __cpuinit mrst_arch_setup(void) +static void mrst_arch_setup(void) { if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c index d62b0a3b5c14..5e355b134ba4 100644 --- a/arch/x86/platform/mrst/vrtc.c +++ b/arch/x86/platform/mrst/vrtc.c @@ -56,7 +56,7 @@ void vrtc_cmos_write(unsigned char val, unsigned char reg) } EXPORT_SYMBOL_GPL(vrtc_cmos_write); -unsigned long vrtc_get_time(void) +void vrtc_get_time(struct timespec *now) { u8 sec, min, hour, mday, mon; unsigned long flags; @@ -82,17 +82,18 @@ unsigned long vrtc_get_time(void) printk(KERN_INFO "vRTC: sec: %d min: %d hour: %d day: %d " "mon: %d year: %d\n", sec, min, hour, mday, mon, year); - return mktime(year, mon, mday, hour, min, sec); + now->tv_sec = mktime(year, mon, mday, hour, min, sec); + now->tv_nsec = 0; } -int vrtc_set_mmss(unsigned long nowtime) +int vrtc_set_mmss(const struct timespec *now) { unsigned long flags; struct rtc_time tm; int year; int retval = 0; - rtc_time_to_tm(nowtime, &tm); + rtc_time_to_tm(now->tv_sec, &tm); if (!rtc_valid_tm(&tm) && tm.tm_year >= 72) { /* * tm.year is the number of years since 1900, and the @@ -110,7 +111,7 @@ int vrtc_set_mmss(unsigned long nowtime) } else { printk(KERN_ERR "%s: Invalid vRTC value: write of %lx to vRTC failed\n", - __FUNCTION__, nowtime); + __FUNCTION__, now->tv_sec); retval = -EINVAL; } return retval; diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 1cf5b300305e..424f4c97a44d 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -25,10 +25,10 @@ #include <asm/cpu.h> #ifdef CONFIG_X86_32 -unsigned long saved_context_ebx; -unsigned long saved_context_esp, saved_context_ebp; -unsigned long saved_context_esi, saved_context_edi; -unsigned long saved_context_eflags; +__visible unsigned long saved_context_ebx; +__visible unsigned long saved_context_esp, saved_context_ebp; +__visible unsigned long saved_context_esi, saved_context_edi; +__visible unsigned long saved_context_eflags; #endif struct saved_context saved_context; diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index a0fde91c16cf..304fca20d96e 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -20,26 +20,26 @@ #include <asm/suspend.h> /* References to section boundaries */ -extern const void __nosave_begin, __nosave_end; +extern __visible const void __nosave_begin, __nosave_end; /* Defined in hibernate_asm_64.S */ -extern int restore_image(void); +extern asmlinkage int restore_image(void); /* * Address to jump to in the last phase of restore in order to get to the image * kernel's text (this value is passed in the image header). */ -unsigned long restore_jump_address; +unsigned long restore_jump_address __visible; /* * Value of the cr3 register from before the hibernation (this value is passed * in the image header). */ -unsigned long restore_cr3; +unsigned long restore_cr3 __visible; -pgd_t *temp_level4_pgt; +pgd_t *temp_level4_pgt __visible; -void *relocated_restore_code; +void *relocated_restore_code __visible; static void *alloc_pgt_page(void *context) { diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index e6773dc8ac41..093a892026f9 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -68,7 +68,7 @@ BEGIN { lprefix1_expr = "\\((66|!F3)\\)" lprefix2_expr = "\\(F3\\)" - lprefix3_expr = "\\((F2|!F3)\\)" + lprefix3_expr = "\\((F2|!F3|66\\&F2)\\)" lprefix_expr = "\\((66|F2|F3)\\)" max_lprefix = 4 @@ -83,6 +83,8 @@ BEGIN { prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" prefix_num["REPNE"] = "INAT_PFX_REPNE" prefix_num["REP/REPE"] = "INAT_PFX_REPE" + prefix_num["XACQUIRE"] = "INAT_PFX_REPNE" + prefix_num["XRELEASE"] = "INAT_PFX_REPE" prefix_num["LOCK"] = "INAT_PFX_LOCK" prefix_num["SEG=CS"] = "INAT_PFX_CS" prefix_num["SEG=DS"] = "INAT_PFX_DS" diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 590be1090892..f7bab68a4b83 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -42,9 +42,6 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = { "^(xen_irq_disable_direct_reloc$|" "xen_save_fl_direct_reloc$|" "VDSO|" -#if ELF_BITS == 64 - "__vvar_page|" -#endif "__crc_)", /* @@ -72,6 +69,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = { "__per_cpu_load|" "init_per_cpu__.*|" "__end_rodata_hpage_align|" + "__vvar_page|" #endif "_end)$" }; diff --git a/arch/x86/um/os-Linux/prctl.c b/arch/x86/um/os-Linux/prctl.c index 9d34eddb517f..96eb2bd28832 100644 --- a/arch/x86/um/os-Linux/prctl.c +++ b/arch/x86/um/os-Linux/prctl.c @@ -4,7 +4,7 @@ */ #include <sys/ptrace.h> -#include <linux/ptrace.h> +#include <asm/ptrace.h> int os_arch_prctl(int pid, int code, unsigned long *addr) { diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index ae7319db18ee..5e04a1c899fa 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -508,7 +508,6 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, { struct rt_sigframe __user *frame; int err = 0; - struct task_struct *me = current; frame = (struct rt_sigframe __user *) round_down(stack_top - sizeof(struct rt_sigframe), 16); diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index c74436e687bf..72074d528400 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -85,15 +85,18 @@ static notrace cycle_t vread_pvclock(int *mode) cycle_t ret; u64 last; u32 version; - u32 migrate_count; u8 flags; unsigned cpu, cpu1; /* - * When looping to get a consistent (time-info, tsc) pair, we - * also need to deal with the possibility we can switch vcpus, - * so make sure we always re-fetch time-info for the current vcpu. + * Note: hypervisor must guarantee that: + * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. + * 2. that per-CPU pvclock time info is updated if the + * underlying CPU changes. + * 3. that version is increased whenever underlying CPU + * changes. + * */ do { cpu = __getcpu() & VGETCPU_CPU_MASK; @@ -104,8 +107,6 @@ static notrace cycle_t vread_pvclock(int *mode) pvti = get_pvti(cpu); - migrate_count = pvti->migrate_count; - version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); /* @@ -117,8 +118,7 @@ static notrace cycle_t vread_pvclock(int *mode) cpu1 = __getcpu() & VGETCPU_CPU_MASK; } while (unlikely(cpu != cpu1 || (pvti->pvti.version & 1) || - pvti->pvti.version != version || - pvti->migrate_count != migrate_count)); + pvti->pvti.version != version)); if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) *mode = VCLOCK_NONE; diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 0faad646f5fd..d6bfb876cfb0 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -372,7 +372,7 @@ subsys_initcall(sysenter_setup); /* Register vsyscall32 into the ABI table */ #include <linux/sysctl.h> -static ctl_table abi_table2[] = { +static struct ctl_table abi_table2[] = { { .procname = "vsyscall32", .data = &sysctl_vsyscall32, @@ -383,7 +383,7 @@ static ctl_table abi_table2[] = { {} }; -static ctl_table abi_root_table2[] = { +static struct ctl_table abi_root_table2[] = { { .procname = "abi", .mode = 0555, diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a492be2635ac..fa6ade76ef3f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -427,8 +427,7 @@ static void __init xen_init_cpuid_mask(void) if (!xen_initial_domain()) cpuid_leaf1_edx_mask &= - ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ - (1 << X86_FEATURE_ACPI)); /* disable ACPI */ + ~((1 << X86_FEATURE_ACPI)); /* disable ACPI */ cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_X2APIC % 32)); @@ -735,8 +734,7 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, addr = (unsigned long)xen_int3; else if (addr == (unsigned long)stack_segment) addr = (unsigned long)xen_stack_segment; - else if (addr == (unsigned long)double_fault || - addr == (unsigned long)nmi) { + else if (addr == (unsigned long)double_fault) { /* Don't need to handle these */ return 0; #ifdef CONFIG_X86_MCE @@ -747,7 +745,12 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, */ ; #endif - } else { + } else if (addr == (unsigned long)nmi) + /* + * Use the native version as well. + */ + ; + else { /* Some other trap using IST? */ if (WARN_ON(val->ist != 0)) return 0; @@ -1557,7 +1560,7 @@ asmlinkage void __init xen_start_kernel(void) #ifdef CONFIG_X86_32 /* set up basic CPUID stuff */ cpu_detect(&new_cpu_data); - new_cpu_data.hard_math = 1; + set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU); new_cpu_data.wp_works_ok = 1; new_cpu_data.x86_capability[0] = cpuid_edx(1); #endif @@ -1681,15 +1684,14 @@ static void __init init_hvm_pv_info(void) xen_domain_type = XEN_HVM_DOMAIN; } -static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int xen_hvm_cpu_notify(struct notifier_block *self, unsigned long action, + void *hcpu) { int cpu = (long)hcpu; switch (action) { case CPU_UP_PREPARE: xen_vcpu_setup(cpu); if (xen_have_vector_callback) { - xen_init_lock_cpu(cpu); if (xen_feature(XENFEAT_hvm_safe_pvclock)) xen_setup_timer(cpu); } @@ -1700,7 +1702,7 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = { +static struct notifier_block xen_hvm_cpu_notifier = { .notifier_call = xen_hvm_cpu_notify, }; @@ -1710,6 +1712,8 @@ static void __init xen_hvm_guest_init(void) xen_hvm_init_shared_info(); + xen_panic_handler_init(); + if (xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; xen_hvm_smp_init(); @@ -1720,15 +1724,12 @@ static void __init xen_hvm_guest_init(void) xen_hvm_init_mmu_ops(); } -static bool __init xen_hvm_platform(void) +static uint32_t __init xen_hvm_platform(void) { if (xen_pv_domain()) - return false; - - if (!xen_cpuid_base()) - return false; + return 0; - return true; + return xen_cpuid_base(); } bool xen_hvm_need_lapic(void) diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 01a4dc015ae1..0da7f863056f 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -47,23 +47,18 @@ static void xen_restore_fl(unsigned long flags) /* convert from IF type flag */ flags = !(flags & X86_EFLAGS_IF); - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ + /* See xen_irq_enable() for why preemption must be disabled. */ preempt_disable(); vcpu = this_cpu_read(xen_vcpu); vcpu->evtchn_upcall_mask = flags; - preempt_enable_no_resched(); - - /* Doesn't matter if we get preempted here, because any - pending event will get dealt with anyway. */ if (flags == 0) { - preempt_check_resched(); barrier(); /* unmask then check (avoid races) */ if (unlikely(vcpu->evtchn_upcall_pending)) xen_force_evtchn_callback(); - } + preempt_enable(); + } else + preempt_enable_no_resched(); } PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl); @@ -82,10 +77,12 @@ static void xen_irq_enable(void) { struct vcpu_info *vcpu; - /* We don't need to worry about being preempted here, since - either a) interrupts are disabled, so no preemption, or b) - the caller is confused and is trying to re-enable interrupts - on an indeterminate processor. */ + /* + * We may be preempted as soon as vcpu->evtchn_upcall_mask is + * cleared, so disable preemption to ensure we check for + * events on the VCPU we are still running on. + */ + preempt_disable(); vcpu = this_cpu_read(xen_vcpu); vcpu->evtchn_upcall_mask = 0; @@ -96,6 +93,8 @@ static void xen_irq_enable(void) barrier(); /* unmask then check (avoid races) */ if (unlikely(vcpu->evtchn_upcall_pending)) xen_force_evtchn_callback(); + + preempt_enable(); } PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 95fb2aa5927e..a61c7d5811be 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -161,6 +161,7 @@ #include <asm/xen/page.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> +#include <xen/balloon.h> #include <xen/grant_table.h> #include "multicalls.h" @@ -878,7 +879,6 @@ int m2p_add_override(unsigned long mfn, struct page *page, unsigned long uninitialized_var(address); unsigned level; pte_t *ptep = NULL; - int ret = 0; pfn = page_to_pfn(page); if (!PageHighMem(page)) { @@ -925,8 +925,8 @@ int m2p_add_override(unsigned long mfn, struct page *page, * frontend pages while they are being shared with the backend, * because mfn_to_pfn (that ends up being called by GUPF) will * return the backend pfn rather than the frontend pfn. */ - ret = __get_user(pfn, &machine_to_phys_mapping[mfn]); - if (ret == 0 && get_phys_to_machine(pfn) == mfn) + pfn = mfn_to_pfn_no_overrides(mfn); + if (get_phys_to_machine(pfn) == mfn) set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); return 0; @@ -941,7 +941,6 @@ int m2p_remove_override(struct page *page, unsigned long uninitialized_var(address); unsigned level; pte_t *ptep = NULL; - int ret = 0; pfn = page_to_pfn(page); mfn = get_phys_to_machine(pfn); @@ -967,7 +966,10 @@ int m2p_remove_override(struct page *page, if (kmap_op != NULL) { if (!PageHighMem(page)) { struct multicall_space mcs; - struct gnttab_unmap_grant_ref *unmap_op; + struct gnttab_unmap_and_replace *unmap_op; + struct page *scratch_page = get_balloon_scratch_page(); + unsigned long scratch_page_address = (unsigned long) + __va(page_to_pfn(scratch_page) << PAGE_SHIFT); /* * It might be that we queued all the m2p grant table @@ -986,25 +988,31 @@ int m2p_remove_override(struct page *page, printk(KERN_WARNING "m2p_remove_override: " "pfn %lx mfn %lx, failed to modify kernel mappings", pfn, mfn); + put_balloon_scratch_page(); return -1; } - mcs = xen_mc_entry( - sizeof(struct gnttab_unmap_grant_ref)); + xen_mc_batch(); + + mcs = __xen_mc_entry( + sizeof(struct gnttab_unmap_and_replace)); unmap_op = mcs.args; unmap_op->host_addr = kmap_op->host_addr; + unmap_op->new_addr = scratch_page_address; unmap_op->handle = kmap_op->handle; - unmap_op->dev_bus_addr = 0; MULTI_grant_table_op(mcs.mc, - GNTTABOP_unmap_grant_ref, unmap_op, 1); + GNTTABOP_unmap_and_replace, unmap_op, 1); + + mcs = __xen_mc_entry(0); + MULTI_update_va_mapping(mcs.mc, scratch_page_address, + pfn_pte(page_to_pfn(scratch_page), + PAGE_KERNEL_RO), 0); xen_mc_issue(PARAVIRT_LAZY_MMU); - set_pte_at(&init_mm, address, ptep, - pfn_pte(pfn, PAGE_KERNEL)); - __flush_tlb_single(address); kmap_op->host_addr = 0; + put_balloon_scratch_page(); } } @@ -1019,8 +1027,8 @@ int m2p_remove_override(struct page *page, * the original pfn causes mfn_to_pfn(mfn) to return the frontend * pfn again. */ mfn &= ~FOREIGN_FRAME_BIT; - ret = __get_user(pfn, &machine_to_phys_mapping[mfn]); - if (ret == 0 && get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn) && + pfn = mfn_to_pfn_no_overrides(mfn); + if (get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn) && m2p_find_override(mfn) == NULL) set_phys_to_machine(pfn, mfn); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 94eac5c85cdc..09f3059cb00b 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -33,6 +33,9 @@ /* These are code, but not functions. Defined in entry.S */ extern const char xen_hypervisor_callback[]; extern const char xen_failsafe_callback[]; +#ifdef CONFIG_X86_64 +extern const char nmi[]; +#endif extern void xen_sysenter_target(void); extern void xen_syscall_target(void); extern void xen_syscall32_target(void); @@ -215,13 +218,19 @@ static void __init xen_set_identity_and_release_chunk( unsigned long pfn; /* - * If the PFNs are currently mapped, the VA mapping also needs - * to be updated to be 1:1. + * If the PFNs are currently mapped, clear the mappings + * (except for the ISA region which must be 1:1 mapped) to + * release the refcounts (in Xen) on the original frames. */ - for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) + for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) { + pte_t pte = __pte_ma(0); + + if (pfn < PFN_UP(ISA_END_ADDRESS)) + pte = mfn_pte(pfn, PAGE_KERNEL_IO); + (void)HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), - mfn_pte(pfn, PAGE_KERNEL_IO), 0); + (unsigned long)__va(pfn << PAGE_SHIFT), pte, 0); + } if (start_pfn < nr_pages) *released += xen_release_chunk( @@ -313,6 +322,17 @@ static void xen_align_and_add_e820_region(u64 start, u64 size, int type) e820_add_region(start, end - start, type); } +void xen_ignore_unusable(struct e820entry *list, size_t map_size) +{ + struct e820entry *entry; + unsigned int i; + + for (i = 0, entry = list; i < map_size; i++, entry++) { + if (entry->type == E820_UNUSABLE) + entry->type = E820_RAM; + } +} + /** * machine_specific_memory_setup - Hook for machine specific memory setup. **/ @@ -353,6 +373,17 @@ char * __init xen_memory_setup(void) } BUG_ON(rc); + /* + * Xen won't allow a 1:1 mapping to be created to UNUSABLE + * regions, so if we're using the machine memory map leave the + * region as RAM as it is in the pseudo-physical map. + * + * UNUSABLE regions in domUs are not handled and will need + * a patch in the future. + */ + if (xen_initial_domain()) + xen_ignore_unusable(map, memmap.nr_entries); + /* Make sure the Xen-supplied memory map is well-ordered. */ sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries); @@ -475,7 +506,7 @@ static void __init fiddle_vdso(void) #endif } -static int __cpuinit register_callback(unsigned type, const void *func) +static int register_callback(unsigned type, const void *func) { struct callback_register callback = { .type = type, @@ -486,7 +517,7 @@ static int __cpuinit register_callback(unsigned type, const void *func) return HYPERVISOR_callback_op(CALLBACKOP_register, &callback); } -void __cpuinit xen_enable_sysenter(void) +void xen_enable_sysenter(void) { int ret; unsigned sysenter_feature; @@ -505,7 +536,7 @@ void __cpuinit xen_enable_sysenter(void) setup_clear_cpu_cap(sysenter_feature); } -void __cpuinit xen_enable_syscall(void) +void xen_enable_syscall(void) { #ifdef CONFIG_X86_64 int ret; @@ -525,7 +556,13 @@ void __cpuinit xen_enable_syscall(void) } #endif /* CONFIG_X86_64 */ } - +void __cpuinit xen_enable_nmi(void) +{ +#ifdef CONFIG_X86_64 + if (register_callback(CALLBACKTYPE_nmi, nmi)) + BUG(); +#endif +} void __init xen_arch_setup(void) { xen_panic_handler_init(); @@ -543,7 +580,7 @@ void __init xen_arch_setup(void) xen_enable_sysenter(); xen_enable_syscall(); - + xen_enable_nmi(); #ifdef CONFIG_ACPI if (!(xen_start_info->flags & SIF_INITDOMAIN)) { printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index fb44426fe931..d1e4777b4e75 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -17,6 +17,7 @@ #include <linux/slab.h> #include <linux/smp.h> #include <linux/irq_work.h> +#include <linux/tick.h> #include <asm/paravirt.h> #include <asm/desc.h> @@ -39,11 +40,15 @@ cpumask_var_t xen_cpu_initialized_map; -static DEFINE_PER_CPU(int, xen_resched_irq); -static DEFINE_PER_CPU(int, xen_callfunc_irq); -static DEFINE_PER_CPU(int, xen_callfuncsingle_irq); -static DEFINE_PER_CPU(int, xen_irq_work); -static DEFINE_PER_CPU(int, xen_debug_irq) = -1; +struct xen_common_irq { + int irq; + char *name; +}; +static DEFINE_PER_CPU(struct xen_common_irq, xen_resched_irq) = { .irq = -1 }; +static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 }; +static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 }; +static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 }; +static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 }; static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); @@ -60,7 +65,7 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static void __cpuinit cpu_bringup(void) +static void cpu_bringup(void) { int cpu; @@ -92,16 +97,53 @@ static void __cpuinit cpu_bringup(void) wmb(); /* make sure everything is out */ } -static void __cpuinit cpu_bringup_and_idle(void) +static void cpu_bringup_and_idle(void) { cpu_bringup(); cpu_startup_entry(CPUHP_ONLINE); } +static void xen_smp_intr_free(unsigned int cpu) +{ + if (per_cpu(xen_resched_irq, cpu).irq >= 0) { + unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu).irq, NULL); + per_cpu(xen_resched_irq, cpu).irq = -1; + kfree(per_cpu(xen_resched_irq, cpu).name); + per_cpu(xen_resched_irq, cpu).name = NULL; + } + if (per_cpu(xen_callfunc_irq, cpu).irq >= 0) { + unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu).irq, NULL); + per_cpu(xen_callfunc_irq, cpu).irq = -1; + kfree(per_cpu(xen_callfunc_irq, cpu).name); + per_cpu(xen_callfunc_irq, cpu).name = NULL; + } + if (per_cpu(xen_debug_irq, cpu).irq >= 0) { + unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu).irq, NULL); + per_cpu(xen_debug_irq, cpu).irq = -1; + kfree(per_cpu(xen_debug_irq, cpu).name); + per_cpu(xen_debug_irq, cpu).name = NULL; + } + if (per_cpu(xen_callfuncsingle_irq, cpu).irq >= 0) { + unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu).irq, + NULL); + per_cpu(xen_callfuncsingle_irq, cpu).irq = -1; + kfree(per_cpu(xen_callfuncsingle_irq, cpu).name); + per_cpu(xen_callfuncsingle_irq, cpu).name = NULL; + } + if (xen_hvm_domain()) + return; + + if (per_cpu(xen_irq_work, cpu).irq >= 0) { + unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL); + per_cpu(xen_irq_work, cpu).irq = -1; + kfree(per_cpu(xen_irq_work, cpu).name); + per_cpu(xen_irq_work, cpu).name = NULL; + } +}; static int xen_smp_intr_init(unsigned int cpu) { int rc; - const char *resched_name, *callfunc_name, *debug_name; + char *resched_name, *callfunc_name, *debug_name; resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, @@ -112,7 +154,8 @@ static int xen_smp_intr_init(unsigned int cpu) NULL); if (rc < 0) goto fail; - per_cpu(xen_resched_irq, cpu) = rc; + per_cpu(xen_resched_irq, cpu).irq = rc; + per_cpu(xen_resched_irq, cpu).name = resched_name; callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu); rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR, @@ -123,7 +166,8 @@ static int xen_smp_intr_init(unsigned int cpu) NULL); if (rc < 0) goto fail; - per_cpu(xen_callfunc_irq, cpu) = rc; + per_cpu(xen_callfunc_irq, cpu).irq = rc; + per_cpu(xen_callfunc_irq, cpu).name = callfunc_name; debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu); rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt, @@ -131,7 +175,8 @@ static int xen_smp_intr_init(unsigned int cpu) debug_name, NULL); if (rc < 0) goto fail; - per_cpu(xen_debug_irq, cpu) = rc; + per_cpu(xen_debug_irq, cpu).irq = rc; + per_cpu(xen_debug_irq, cpu).name = debug_name; callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu); rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR, @@ -142,7 +187,8 @@ static int xen_smp_intr_init(unsigned int cpu) NULL); if (rc < 0) goto fail; - per_cpu(xen_callfuncsingle_irq, cpu) = rc; + per_cpu(xen_callfuncsingle_irq, cpu).irq = rc; + per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name; /* * The IRQ worker on PVHVM goes through the native path and uses the @@ -160,26 +206,13 @@ static int xen_smp_intr_init(unsigned int cpu) NULL); if (rc < 0) goto fail; - per_cpu(xen_irq_work, cpu) = rc; + per_cpu(xen_irq_work, cpu).irq = rc; + per_cpu(xen_irq_work, cpu).name = callfunc_name; return 0; fail: - if (per_cpu(xen_resched_irq, cpu) >= 0) - unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL); - if (per_cpu(xen_callfunc_irq, cpu) >= 0) - unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); - if (per_cpu(xen_debug_irq, cpu) >= 0) - unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); - if (per_cpu(xen_callfuncsingle_irq, cpu) >= 0) - unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), - NULL); - if (xen_hvm_domain()) - return rc; - - if (per_cpu(xen_irq_work, cpu) >= 0) - unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL); - + xen_smp_intr_free(cpu); return rc; } @@ -240,12 +273,21 @@ static void __init xen_smp_prepare_boot_cpu(void) BUG_ON(smp_processor_id() != 0); native_smp_prepare_boot_cpu(); - /* We've switched to the "real" per-cpu gdt, so make sure the - old memory can be recycled */ - make_lowmem_page_readwrite(xen_initial_gdt); + if (xen_pv_domain()) { + /* We've switched to the "real" per-cpu gdt, so make sure the + old memory can be recycled */ + make_lowmem_page_readwrite(xen_initial_gdt); - xen_filter_cpu_maps(); - xen_setup_vcpu_info_placement(); + xen_filter_cpu_maps(); + xen_setup_vcpu_info_placement(); + } + /* + * The alternative logic (which patches the unlock/lock) runs before + * the smp bootup up code is activated. Hence we need to set this up + * the core kernel is being patched. Otherwise we will have only + * modules patched but not core code. + */ + xen_init_spinlocks(); } static void __init xen_smp_prepare_cpus(unsigned int max_cpus) @@ -293,7 +335,7 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) set_cpu_present(cpu, true); } -static int __cpuinit +static int cpu_initialize_context(unsigned int cpu, struct task_struct *idle) { struct vcpu_guest_context *ctxt; @@ -364,7 +406,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) return 0; } -static int __cpuinit xen_cpu_up(unsigned int cpu, struct task_struct *idle) +static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) { int rc; @@ -432,21 +474,23 @@ static void xen_cpu_die(unsigned int cpu) current->state = TASK_UNINTERRUPTIBLE; schedule_timeout(HZ/10); } - unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL); - unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); - unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); - unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL); - if (!xen_hvm_domain()) - unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL); + xen_smp_intr_free(cpu); xen_uninit_lock_cpu(cpu); xen_teardown_timer(cpu); } -static void __cpuinit xen_play_dead(void) /* used only with HOTPLUG_CPU */ +static void xen_play_dead(void) /* used only with HOTPLUG_CPU */ { play_dead_common(); HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); cpu_bringup(); + /* + * commit 4b0c0f294 (tick: Cleanup NOHZ per cpu data on cpu down) + * clears certain data that the cpu_idle loop (which called us + * and that we return from) expects. The only way to get that + * data back is to call: + */ + tick_nohz_idle_enter(); } #else /* !CONFIG_HOTPLUG_CPU */ @@ -537,6 +581,12 @@ static inline int xen_map_vector(int vector) case IRQ_WORK_VECTOR: xen_vector = XEN_IRQ_WORK_VECTOR; break; +#ifdef CONFIG_X86_64 + case NMI_VECTOR: + case APIC_DM_NMI: /* Some use that instead of NMI_VECTOR */ + xen_vector = XEN_NMI_VECTOR; + break; +#endif default: xen_vector = -1; printk(KERN_ERR "xen: vector 0x%x is not implemented\n", @@ -645,7 +695,6 @@ void __init xen_smp_init(void) { smp_ops = xen_smp_ops; xen_fill_possible_map(); - xen_init_spinlocks(); } static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) @@ -656,11 +705,27 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) xen_init_lock_cpu(0); } -static int __cpuinit xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle) +static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle) { int rc; - rc = native_cpu_up(cpu, tidle); - WARN_ON (xen_smp_intr_init(cpu)); + /* + * xen_smp_intr_init() needs to run before native_cpu_up() + * so that IPI vectors are set up on the booting CPU before + * it is marked online in native_cpu_up(). + */ + rc = xen_smp_intr_init(cpu); + WARN_ON(rc); + if (!rc) + rc = native_cpu_up(cpu, tidle); + + /* + * We must initialize the slowpath CPU kicker _after_ the native + * path has executed. If we initialized it before none of the + * unlocker IPI kicks would reach the booting CPU as the booting + * CPU had not set itself 'online' in cpu_online_mask. That mask + * is checked when IPIs are sent (on HVM at least). + */ + xen_init_lock_cpu(cpu); return rc; } @@ -680,4 +745,5 @@ void __init xen_hvm_smp_init(void) smp_ops.cpu_die = xen_hvm_cpu_die; smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; + smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu; } diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 3002ec1bb71a..be6b86078957 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -7,6 +7,7 @@ #include <linux/debugfs.h> #include <linux/log2.h> #include <linux/gfp.h> +#include <linux/slab.h> #include <asm/paravirt.h> @@ -16,45 +17,44 @@ #include "xen-ops.h" #include "debugfs.h" -#ifdef CONFIG_XEN_DEBUG_FS -static struct xen_spinlock_stats -{ - u64 taken; - u32 taken_slow; - u32 taken_slow_nested; - u32 taken_slow_pickup; - u32 taken_slow_spurious; - u32 taken_slow_irqenable; +enum xen_contention_stat { + TAKEN_SLOW, + TAKEN_SLOW_PICKUP, + TAKEN_SLOW_SPURIOUS, + RELEASED_SLOW, + RELEASED_SLOW_KICKED, + NR_CONTENTION_STATS +}; - u64 released; - u32 released_slow; - u32 released_slow_kicked; +#ifdef CONFIG_XEN_DEBUG_FS #define HISTO_BUCKETS 30 - u32 histo_spin_total[HISTO_BUCKETS+1]; - u32 histo_spin_spinning[HISTO_BUCKETS+1]; +static struct xen_spinlock_stats +{ + u32 contention_stats[NR_CONTENTION_STATS]; u32 histo_spin_blocked[HISTO_BUCKETS+1]; - - u64 time_total; - u64 time_spinning; u64 time_blocked; } spinlock_stats; static u8 zero_stats; -static unsigned lock_timeout = 1 << 10; -#define TIMEOUT lock_timeout - static inline void check_zero(void) { - if (unlikely(zero_stats)) { - memset(&spinlock_stats, 0, sizeof(spinlock_stats)); - zero_stats = 0; + u8 ret; + u8 old = ACCESS_ONCE(zero_stats); + if (unlikely(old)) { + ret = cmpxchg(&zero_stats, old, 0); + /* This ensures only one fellow resets the stat */ + if (ret == old) + memset(&spinlock_stats, 0, sizeof(spinlock_stats)); } } -#define ADD_STATS(elem, val) \ - do { check_zero(); spinlock_stats.elem += (val); } while(0) +static inline void add_stats(enum xen_contention_stat var, u32 val) +{ + check_zero(); + spinlock_stats.contention_stats[var] += val; +} static inline u64 spin_time_start(void) { @@ -73,22 +73,6 @@ static void __spin_time_accum(u64 delta, u32 *array) array[HISTO_BUCKETS]++; } -static inline void spin_time_accum_spinning(u64 start) -{ - u32 delta = xen_clocksource_read() - start; - - __spin_time_accum(delta, spinlock_stats.histo_spin_spinning); - spinlock_stats.time_spinning += delta; -} - -static inline void spin_time_accum_total(u64 start) -{ - u32 delta = xen_clocksource_read() - start; - - __spin_time_accum(delta, spinlock_stats.histo_spin_total); - spinlock_stats.time_total += delta; -} - static inline void spin_time_accum_blocked(u64 start) { u32 delta = xen_clocksource_read() - start; @@ -97,283 +81,155 @@ static inline void spin_time_accum_blocked(u64 start) spinlock_stats.time_blocked += delta; } #else /* !CONFIG_XEN_DEBUG_FS */ -#define TIMEOUT (1 << 10) -#define ADD_STATS(elem, val) do { (void)(val); } while(0) +static inline void add_stats(enum xen_contention_stat var, u32 val) +{ +} static inline u64 spin_time_start(void) { return 0; } -static inline void spin_time_accum_total(u64 start) -{ -} -static inline void spin_time_accum_spinning(u64 start) -{ -} static inline void spin_time_accum_blocked(u64 start) { } #endif /* CONFIG_XEN_DEBUG_FS */ -/* - * Size struct xen_spinlock so it's the same as arch_spinlock_t. - */ -#if NR_CPUS < 256 -typedef u8 xen_spinners_t; -# define inc_spinners(xl) \ - asm(LOCK_PREFIX " incb %0" : "+m" ((xl)->spinners) : : "memory"); -# define dec_spinners(xl) \ - asm(LOCK_PREFIX " decb %0" : "+m" ((xl)->spinners) : : "memory"); -#else -typedef u16 xen_spinners_t; -# define inc_spinners(xl) \ - asm(LOCK_PREFIX " incw %0" : "+m" ((xl)->spinners) : : "memory"); -# define dec_spinners(xl) \ - asm(LOCK_PREFIX " decw %0" : "+m" ((xl)->spinners) : : "memory"); -#endif - -struct xen_spinlock { - unsigned char lock; /* 0 -> free; 1 -> locked */ - xen_spinners_t spinners; /* count of waiting cpus */ +struct xen_lock_waiting { + struct arch_spinlock *lock; + __ticket_t want; }; -static int xen_spin_is_locked(struct arch_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - return xl->lock != 0; -} - -static int xen_spin_is_contended(struct arch_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - /* Not strictly true; this is only the count of contended - lock-takers entering the slow path. */ - return xl->spinners != 0; -} - -static int xen_spin_trylock(struct arch_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - u8 old = 1; - - asm("xchgb %b0,%1" - : "+q" (old), "+m" (xl->lock) : : "memory"); - - return old == 0; -} - static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; -static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners); - -/* - * Mark a cpu as interested in a lock. Returns the CPU's previous - * lock of interest, in case we got preempted by an interrupt. - */ -static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl) -{ - struct xen_spinlock *prev; - - prev = __this_cpu_read(lock_spinners); - __this_cpu_write(lock_spinners, xl); - - wmb(); /* set lock of interest before count */ +static DEFINE_PER_CPU(char *, irq_name); +static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting); +static cpumask_t waiting_cpus; - inc_spinners(xl); - - return prev; -} - -/* - * Mark a cpu as no longer interested in a lock. Restores previous - * lock of interest (NULL for none). - */ -static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev) -{ - dec_spinners(xl); - wmb(); /* decrement count before restoring lock */ - __this_cpu_write(lock_spinners, prev); -} - -static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable) +static bool xen_pvspin = true; +static void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) { - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - struct xen_spinlock *prev; int irq = __this_cpu_read(lock_kicker_irq); - int ret; + struct xen_lock_waiting *w = &__get_cpu_var(lock_waiting); + int cpu = smp_processor_id(); u64 start; + unsigned long flags; /* If kicker interrupts not initialized yet, just spin */ if (irq == -1) - return 0; + return; start = spin_time_start(); - /* announce we're spinning */ - prev = spinning_lock(xl); + /* + * Make sure an interrupt handler can't upset things in a + * partially setup state. + */ + local_irq_save(flags); + /* + * We don't really care if we're overwriting some other + * (lock,want) pair, as that would mean that we're currently + * in an interrupt context, and the outer context had + * interrupts enabled. That has already kicked the VCPU out + * of xen_poll_irq(), so it will just return spuriously and + * retry with newly setup (lock,want). + * + * The ordering protocol on this is that the "lock" pointer + * may only be set non-NULL if the "want" ticket is correct. + * If we're updating "want", we must first clear "lock". + */ + w->lock = NULL; + smp_wmb(); + w->want = want; + smp_wmb(); + w->lock = lock; - ADD_STATS(taken_slow, 1); - ADD_STATS(taken_slow_nested, prev != NULL); + /* This uses set_bit, which atomic and therefore a barrier */ + cpumask_set_cpu(cpu, &waiting_cpus); + add_stats(TAKEN_SLOW, 1); - do { - unsigned long flags; + /* clear pending */ + xen_clear_irq_pending(irq); - /* clear pending */ - xen_clear_irq_pending(irq); + /* Only check lock once pending cleared */ + barrier(); - /* check again make sure it didn't become free while - we weren't looking */ - ret = xen_spin_trylock(lock); - if (ret) { - ADD_STATS(taken_slow_pickup, 1); + /* + * Mark entry to slowpath before doing the pickup test to make + * sure we don't deadlock with an unlocker. + */ + __ticket_enter_slowpath(lock); - /* - * If we interrupted another spinlock while it - * was blocking, make sure it doesn't block - * without rechecking the lock. - */ - if (prev != NULL) - xen_set_irq_pending(irq); - goto out; - } + /* + * check again make sure it didn't become free while + * we weren't looking + */ + if (ACCESS_ONCE(lock->tickets.head) == want) { + add_stats(TAKEN_SLOW_PICKUP, 1); + goto out; + } - flags = arch_local_save_flags(); - if (irq_enable) { - ADD_STATS(taken_slow_irqenable, 1); - raw_local_irq_enable(); - } + /* Allow interrupts while blocked */ + local_irq_restore(flags); - /* - * Block until irq becomes pending. If we're - * interrupted at this point (after the trylock but - * before entering the block), then the nested lock - * handler guarantees that the irq will be left - * pending if there's any chance the lock became free; - * xen_poll_irq() returns immediately if the irq is - * pending. - */ - xen_poll_irq(irq); + /* + * If an interrupt happens here, it will leave the wakeup irq + * pending, which will cause xen_poll_irq() to return + * immediately. + */ - raw_local_irq_restore(flags); + /* Block until irq becomes pending (or perhaps a spurious wakeup) */ + xen_poll_irq(irq); + add_stats(TAKEN_SLOW_SPURIOUS, !xen_test_irq_pending(irq)); - ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq)); - } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ + local_irq_save(flags); kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); - out: - unspinning_lock(xl, prev); - spin_time_accum_blocked(start); - - return ret; -} - -static inline void __xen_spin_lock(struct arch_spinlock *lock, bool irq_enable) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - unsigned timeout; - u8 oldval; - u64 start_spin; - - ADD_STATS(taken, 1); - - start_spin = spin_time_start(); - - do { - u64 start_spin_fast = spin_time_start(); - - timeout = TIMEOUT; + cpumask_clear_cpu(cpu, &waiting_cpus); + w->lock = NULL; - asm("1: xchgb %1,%0\n" - " testb %1,%1\n" - " jz 3f\n" - "2: rep;nop\n" - " cmpb $0,%0\n" - " je 1b\n" - " dec %2\n" - " jnz 2b\n" - "3:\n" - : "+m" (xl->lock), "=q" (oldval), "+r" (timeout) - : "1" (1) - : "memory"); + local_irq_restore(flags); - spin_time_accum_spinning(start_spin_fast); - - } while (unlikely(oldval != 0 && - (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable)))); - - spin_time_accum_total(start_spin); -} - -static void xen_spin_lock(struct arch_spinlock *lock) -{ - __xen_spin_lock(lock, false); -} - -static void xen_spin_lock_flags(struct arch_spinlock *lock, unsigned long flags) -{ - __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags)); + spin_time_accum_blocked(start); } +PV_CALLEE_SAVE_REGS_THUNK(xen_lock_spinning); -static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) +static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next) { int cpu; - ADD_STATS(released_slow, 1); + add_stats(RELEASED_SLOW, 1); + + for_each_cpu(cpu, &waiting_cpus) { + const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu); - for_each_online_cpu(cpu) { - /* XXX should mix up next cpu selection */ - if (per_cpu(lock_spinners, cpu) == xl) { - ADD_STATS(released_slow_kicked, 1); + /* Make sure we read lock before want */ + if (ACCESS_ONCE(w->lock) == lock && + ACCESS_ONCE(w->want) == next) { + add_stats(RELEASED_SLOW_KICKED, 1); xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); + break; } } } -static void xen_spin_unlock(struct arch_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - ADD_STATS(released, 1); - - smp_wmb(); /* make sure no writes get moved after unlock */ - xl->lock = 0; /* release lock */ - - /* - * Make sure unlock happens before checking for waiting - * spinners. We need a strong barrier to enforce the - * write-read ordering to different memory locations, as the - * CPU makes no implied guarantees about their ordering. - */ - mb(); - - if (unlikely(xl->spinners)) - xen_spin_unlock_slow(xl); -} - static irqreturn_t dummy_handler(int irq, void *dev_id) { BUG(); return IRQ_HANDLED; } -void __cpuinit xen_init_lock_cpu(int cpu) +void xen_init_lock_cpu(int cpu) { int irq; - const char *name; + char *name; + + if (!xen_pvspin) + return; WARN(per_cpu(lock_kicker_irq, cpu) >= 0, "spinlock on CPU%d exists on IRQ%d!\n", cpu, per_cpu(lock_kicker_irq, cpu)); - /* - * See git commit f10cd522c5fbfec9ae3cc01967868c9c2401ed23 - * (xen: disable PV spinlocks on HVM) - */ - if (xen_hvm_domain()) - return; - name = kasprintf(GFP_KERNEL, "spinlock%d", cpu); irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR, cpu, @@ -385,6 +241,7 @@ void __cpuinit xen_init_lock_cpu(int cpu) if (irq >= 0) { disable_irq(irq); /* make sure it's never delivered */ per_cpu(lock_kicker_irq, cpu) = irq; + per_cpu(irq_name, cpu) = name; } printk("cpu %d spinlock event irq %d\n", cpu, irq); @@ -392,35 +249,58 @@ void __cpuinit xen_init_lock_cpu(int cpu) void xen_uninit_lock_cpu(int cpu) { - /* - * See git commit f10cd522c5fbfec9ae3cc01967868c9c2401ed23 - * (xen: disable PV spinlocks on HVM) - */ - if (xen_hvm_domain()) + if (!xen_pvspin) return; unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL); per_cpu(lock_kicker_irq, cpu) = -1; + kfree(per_cpu(irq_name, cpu)); + per_cpu(irq_name, cpu) = NULL; } + +/* + * Our init of PV spinlocks is split in two init functions due to us + * using paravirt patching and jump labels patching and having to do + * all of this before SMP code is invoked. + * + * The paravirt patching needs to be done _before_ the alternative asm code + * is started, otherwise we would not patch the core kernel code. + */ void __init xen_init_spinlocks(void) { - /* - * See git commit f10cd522c5fbfec9ae3cc01967868c9c2401ed23 - * (xen: disable PV spinlocks on HVM) - */ - if (xen_hvm_domain()) + + if (!xen_pvspin) { + printk(KERN_DEBUG "xen: PV spinlocks disabled\n"); return; + } + + pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning); + pv_lock_ops.unlock_kick = xen_unlock_kick; +} + +/* + * While the jump_label init code needs to happend _after_ the jump labels are + * enabled and before SMP is started. Hence we use pre-SMP initcall level + * init. We cannot do it in xen_init_spinlocks as that is done before + * jump labels are activated. + */ +static __init int xen_init_spinlocks_jump(void) +{ + if (!xen_pvspin) + return 0; - BUILD_BUG_ON(sizeof(struct xen_spinlock) > sizeof(arch_spinlock_t)); + static_key_slow_inc(¶virt_ticketlocks_enabled); + return 0; +} +early_initcall(xen_init_spinlocks_jump); - pv_lock_ops.spin_is_locked = xen_spin_is_locked; - pv_lock_ops.spin_is_contended = xen_spin_is_contended; - pv_lock_ops.spin_lock = xen_spin_lock; - pv_lock_ops.spin_lock_flags = xen_spin_lock_flags; - pv_lock_ops.spin_trylock = xen_spin_trylock; - pv_lock_ops.spin_unlock = xen_spin_unlock; +static __init int xen_parse_nopvspin(char *arg) +{ + xen_pvspin = false; + return 0; } +early_param("xen_nopvspin", xen_parse_nopvspin); #ifdef CONFIG_XEN_DEBUG_FS @@ -433,41 +313,28 @@ static int __init xen_spinlock_debugfs(void) if (d_xen == NULL) return -ENOMEM; + if (!xen_pvspin) + return 0; + d_spin_debug = debugfs_create_dir("spinlocks", d_xen); debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); - debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout); - - debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken); debugfs_create_u32("taken_slow", 0444, d_spin_debug, - &spinlock_stats.taken_slow); - debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug, - &spinlock_stats.taken_slow_nested); + &spinlock_stats.contention_stats[TAKEN_SLOW]); debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug, - &spinlock_stats.taken_slow_pickup); + &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]); debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug, - &spinlock_stats.taken_slow_spurious); - debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug, - &spinlock_stats.taken_slow_irqenable); + &spinlock_stats.contention_stats[TAKEN_SLOW_SPURIOUS]); - debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released); debugfs_create_u32("released_slow", 0444, d_spin_debug, - &spinlock_stats.released_slow); + &spinlock_stats.contention_stats[RELEASED_SLOW]); debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug, - &spinlock_stats.released_slow_kicked); + &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]); - debugfs_create_u64("time_spinning", 0444, d_spin_debug, - &spinlock_stats.time_spinning); debugfs_create_u64("time_blocked", 0444, d_spin_debug, &spinlock_stats.time_blocked); - debugfs_create_u64("time_total", 0444, d_spin_debug, - &spinlock_stats.time_total); - debugfs_create_u32_array("histo_total", 0444, d_spin_debug, - spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1); - debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug, - spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1); debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 3d88bfdf9e1c..ee365895b06b 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -14,6 +14,8 @@ #include <linux/kernel_stat.h> #include <linux/math64.h> #include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/pvclock_gtod.h> #include <asm/pvclock.h> #include <asm/xen/hypervisor.h> @@ -36,9 +38,8 @@ static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate); /* snapshots of runstate info */ static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot); -/* unused ns of stolen and blocked time */ +/* unused ns of stolen time */ static DEFINE_PER_CPU(u64, xen_residual_stolen); -static DEFINE_PER_CPU(u64, xen_residual_blocked); /* return an consistent snapshot of 64-bit time/counter value */ static u64 get64(const u64 *p) @@ -115,7 +116,7 @@ static void do_stolen_accounting(void) { struct vcpu_runstate_info state; struct vcpu_runstate_info *snap; - s64 blocked, runnable, offline, stolen; + s64 runnable, offline, stolen; cputime_t ticks; get_runstate_snapshot(&state); @@ -125,7 +126,6 @@ static void do_stolen_accounting(void) snap = &__get_cpu_var(xen_runstate_snapshot); /* work out how much time the VCPU has not been runn*ing* */ - blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked]; runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable]; offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline]; @@ -141,17 +141,6 @@ static void do_stolen_accounting(void) ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); __this_cpu_write(xen_residual_stolen, stolen); account_steal_ticks(ticks); - - /* Add the appropriate number of ticks of blocked time, - including any left-overs from last time. */ - blocked += __this_cpu_read(xen_residual_blocked); - - if (blocked < 0) - blocked = 0; - - ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); - __this_cpu_write(xen_residual_blocked, blocked); - account_idle_ticks(ticks); } /* Get the TSC speed from Xen */ @@ -191,34 +180,56 @@ static void xen_read_wallclock(struct timespec *ts) put_cpu_var(xen_vcpu); } -static unsigned long xen_get_wallclock(void) +static void xen_get_wallclock(struct timespec *now) { - struct timespec ts; + xen_read_wallclock(now); +} - xen_read_wallclock(&ts); - return ts.tv_sec; +static int xen_set_wallclock(const struct timespec *now) +{ + return -1; } -static int xen_set_wallclock(unsigned long now) +static int xen_pvclock_gtod_notify(struct notifier_block *nb, + unsigned long was_set, void *priv) { + /* Protected by the calling core code serialization */ + static struct timespec next_sync; + struct xen_platform_op op; - int rc; + struct timespec now; - /* do nothing for domU */ - if (!xen_initial_domain()) - return -1; + now = __current_kernel_time(); + + /* + * We only take the expensive HV call when the clock was set + * or when the 11 minutes RTC synchronization time elapsed. + */ + if (!was_set && timespec_compare(&now, &next_sync) < 0) + return NOTIFY_OK; op.cmd = XENPF_settime; - op.u.settime.secs = now; - op.u.settime.nsecs = 0; + op.u.settime.secs = now.tv_sec; + op.u.settime.nsecs = now.tv_nsec; op.u.settime.system_time = xen_clocksource_read(); - rc = HYPERVISOR_dom0_op(&op); - WARN(rc != 0, "XENPF_settime failed: now=%ld\n", now); + (void)HYPERVISOR_dom0_op(&op); + + /* + * Move the next drift compensation time 11 minutes + * ahead. That's emulating the sync_cmos_clock() update for + * the hardware RTC. + */ + next_sync = now; + next_sync.tv_sec += 11 * 60; - return rc; + return NOTIFY_OK; } +static struct notifier_block xen_pvclock_gtod_notifier = { + .notifier_call = xen_pvclock_gtod_notify, +}; + static struct clocksource xen_clocksource __read_mostly = { .name = "xen", .rating = 400, @@ -377,11 +388,16 @@ static const struct clock_event_device xen_vcpuop_clockevent = { static const struct clock_event_device *xen_clockevent = &xen_timerop_clockevent; -static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events) = { .irq = -1 }; + +struct xen_clock_event_device { + struct clock_event_device evt; + char *name; +}; +static DEFINE_PER_CPU(struct xen_clock_event_device, xen_clock_events) = { .evt.irq = -1 }; static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) { - struct clock_event_device *evt = &__get_cpu_var(xen_clock_events); + struct clock_event_device *evt = &__get_cpu_var(xen_clock_events).evt; irqreturn_t ret; ret = IRQ_NONE; @@ -395,14 +411,30 @@ static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) return ret; } +void xen_teardown_timer(int cpu) +{ + struct clock_event_device *evt; + BUG_ON(cpu == 0); + evt = &per_cpu(xen_clock_events, cpu).evt; + + if (evt->irq >= 0) { + unbind_from_irqhandler(evt->irq, NULL); + evt->irq = -1; + kfree(per_cpu(xen_clock_events, cpu).name); + per_cpu(xen_clock_events, cpu).name = NULL; + } +} + void xen_setup_timer(int cpu) { - const char *name; + char *name; struct clock_event_device *evt; int irq; - evt = &per_cpu(xen_clock_events, cpu); + evt = &per_cpu(xen_clock_events, cpu).evt; WARN(evt->irq >= 0, "IRQ%d for CPU%d is already allocated\n", evt->irq, cpu); + if (evt->irq >= 0) + xen_teardown_timer(cpu); printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu); @@ -420,22 +452,15 @@ void xen_setup_timer(int cpu) evt->cpumask = cpumask_of(cpu); evt->irq = irq; + per_cpu(xen_clock_events, cpu).name = name; } -void xen_teardown_timer(int cpu) -{ - struct clock_event_device *evt; - BUG_ON(cpu == 0); - evt = &per_cpu(xen_clock_events, cpu); - unbind_from_irqhandler(evt->irq, NULL); - evt->irq = -1; -} void xen_setup_cpu_clockevents(void) { BUG_ON(preemptible()); - clockevents_register_device(&__get_cpu_var(xen_clock_events)); + clockevents_register_device(&__get_cpu_var(xen_clock_events).evt); } void xen_timer_resume(void) @@ -480,6 +505,9 @@ static void __init xen_time_init(void) xen_setup_runstate_info(cpu); xen_setup_timer(cpu); xen_setup_cpu_clockevents(); + + if (xen_initial_domain()) + pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier); } void __init xen_init_time_ops(void) @@ -492,7 +520,9 @@ void __init xen_init_time_ops(void) x86_platform.calibrate_tsc = xen_tsc_khz; x86_platform.get_wallclock = xen_get_wallclock; - x86_platform.set_wallclock = xen_set_wallclock; + /* Dom0 uses the native method to set the hardware RTC. */ + if (!xen_initial_domain()) + x86_platform.set_wallclock = xen_set_wallclock; } #ifdef CONFIG_XEN_PVHVM diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index a95b41744ad0..95f8c6142328 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -73,7 +73,7 @@ static inline void xen_hvm_smp_init(void) {} #ifdef CONFIG_PARAVIRT_SPINLOCKS void __init xen_init_spinlocks(void); -void __cpuinit xen_init_lock_cpu(int cpu); +void xen_init_lock_cpu(int cpu); void xen_uninit_lock_cpu(int cpu); #else static inline void xen_init_spinlocks(void) @@ -105,9 +105,9 @@ static inline void __init xen_init_apic(void) /* Declare an asm function, along with symbols needed to make it inlineable */ #define DECL_ASM(ret, name, ...) \ - ret name(__VA_ARGS__); \ - extern char name##_end[]; \ - extern char name##_reloc[] \ + __visible ret name(__VA_ARGS__); \ + extern char name##_end[] __visible; \ + extern char name##_reloc[] __visible DECL_ASM(void, xen_irq_enable_direct, void); DECL_ASM(void, xen_irq_disable_direct, void); @@ -115,11 +115,11 @@ DECL_ASM(unsigned long, xen_save_fl_direct, void); DECL_ASM(void, xen_restore_fl_direct, unsigned long); /* These are not functions, and cannot be called normally */ -void xen_iret(void); -void xen_sysexit(void); -void xen_sysret32(void); -void xen_sysret64(void); -void xen_adjust_exception_frame(void); +__visible void xen_iret(void); +__visible void xen_sysexit(void); +__visible void xen_sysret32(void); +__visible void xen_sysret64(void); +__visible void xen_adjust_exception_frame(void); extern int xen_panic_handler_init(void); |