summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig64
-rw-r--r--arch/x86/Makefile8
-rw-r--r--arch/x86/boot/boot.h1
-rw-r--r--arch/x86/boot/compressed/eboot.c2
-rw-r--r--arch/x86/boot/compressed/head_32.S31
-rw-r--r--arch/x86/boot/compressed/head_64.S1
-rw-r--r--arch/x86/boot/compressed/misc.c77
-rw-r--r--arch/x86/boot/printf.c2
-rw-r--r--arch/x86/crypto/Makefile2
-rw-r--r--arch/x86/crypto/camellia_glue.c64
-rw-r--r--arch/x86/crypto/crct10dif-pcl-asm_64.S643
-rw-r--r--arch/x86/crypto/crct10dif-pclmul_glue.c151
-rw-r--r--arch/x86/ia32/ia32_signal.c2
-rw-r--r--arch/x86/ia32/ia32entry.S2
-rw-r--r--arch/x86/include/asm/acpi.h2
-rw-r--r--arch/x86/include/asm/alternative.h14
-rw-r--r--arch/x86/include/asm/apic.h2
-rw-r--r--arch/x86/include/asm/asm.h6
-rw-r--r--arch/x86/include/asm/bitops.h46
-rw-r--r--arch/x86/include/asm/bootparam_utils.h4
-rw-r--r--arch/x86/include/asm/checksum_32.h22
-rw-r--r--arch/x86/include/asm/cpufeature.h17
-rw-r--r--arch/x86/include/asm/dma-contiguous.h1
-rw-r--r--arch/x86/include/asm/e820.h2
-rw-r--r--arch/x86/include/asm/hypervisor.h2
-rw-r--r--arch/x86/include/asm/jump_label.h9
-rw-r--r--arch/x86/include/asm/kvm_host.h14
-rw-r--r--arch/x86/include/asm/kvm_para.h38
-rw-r--r--arch/x86/include/asm/mce.h16
-rw-r--r--arch/x86/include/asm/microcode_amd.h2
-rw-r--r--arch/x86/include/asm/mmu_context.h20
-rw-r--r--arch/x86/include/asm/mutex_64.h30
-rw-r--r--arch/x86/include/asm/page_32_types.h2
-rw-r--r--arch/x86/include/asm/page_64_types.h5
-rw-r--r--arch/x86/include/asm/page_types.h5
-rw-r--r--arch/x86/include/asm/paravirt.h32
-rw-r--r--arch/x86/include/asm/paravirt_types.h14
-rw-r--r--arch/x86/include/asm/pci.h30
-rw-r--r--arch/x86/include/asm/pgtable-2level.h48
-rw-r--r--arch/x86/include/asm/pgtable-3level.h3
-rw-r--r--arch/x86/include/asm/pgtable.h34
-rw-r--r--arch/x86/include/asm/pgtable_types.h20
-rw-r--r--arch/x86/include/asm/processor.h32
-rw-r--r--arch/x86/include/asm/pvclock.h1
-rw-r--r--arch/x86/include/asm/spinlock.h137
-rw-r--r--arch/x86/include/asm/spinlock_types.h16
-rw-r--r--arch/x86/include/asm/sync_bitops.h24
-rw-r--r--arch/x86/include/asm/sysfb.h98
-rw-r--r--arch/x86/include/asm/tlbflush.h37
-rw-r--r--arch/x86/include/asm/topology.h3
-rw-r--r--arch/x86/include/asm/tsc.h1
-rw-r--r--arch/x86/include/asm/uaccess.h7
-rw-r--r--arch/x86/include/asm/vmx.h2
-rw-r--r--arch/x86/include/asm/xen/events.h1
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h16
-rw-r--r--arch/x86/include/asm/xor_avx.h4
-rw-r--r--arch/x86/include/uapi/asm/kvm_para.h1
-rw-r--r--arch/x86/include/uapi/asm/vmx.h6
-rw-r--r--arch/x86/kernel/Makefile3
-rw-r--r--arch/x86/kernel/acpi/boot.c25
-rw-r--r--arch/x86/kernel/alternative.c155
-rw-r--r--arch/x86/kernel/amd_nb.c13
-rw-r--r--arch/x86/kernel/apic/io_apic.c14
-rw-r--r--arch/x86/kernel/cpu/amd.c20
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c15
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h3
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c28
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c42
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c13
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c2
-rw-r--r--arch/x86/kernel/cpu/perf_event.c6
-rw-r--r--arch/x86/kernel/cpu/perf_event.h2
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c3
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c182
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c32
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.c264
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.h10
-rw-r--r--arch/x86/kernel/cpu/vmware.c8
-rw-r--r--arch/x86/kernel/crash.c4
-rw-r--r--arch/x86/kernel/devicetree.c3
-rw-r--r--arch/x86/kernel/e820.c5
-rw-r--r--arch/x86/kernel/early-quirks.c168
-rw-r--r--arch/x86/kernel/entry_32.S3
-rw-r--r--arch/x86/kernel/head_32.S2
-rw-r--r--arch/x86/kernel/i387.c2
-rw-r--r--arch/x86/kernel/jump_label.c84
-rw-r--r--arch/x86/kernel/kprobes/common.h5
-rw-r--r--arch/x86/kernel/kprobes/core.c2
-rw-r--r--arch/x86/kernel/kprobes/opt.c110
-rw-r--r--arch/x86/kernel/kvm.c268
-rw-r--r--arch/x86/kernel/microcode_amd.c36
-rw-r--r--arch/x86/kernel/microcode_amd_early.c27
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c18
-rw-r--r--arch/x86/kernel/pvclock.c44
-rw-r--r--arch/x86/kernel/setup.c23
-rw-r--r--arch/x86/kernel/signal.c6
-rw-r--r--arch/x86/kernel/sys_x86_64.c2
-rw-r--r--arch/x86/kernel/sysfb.c74
-rw-r--r--arch/x86/kernel/sysfb_efi.c214
-rw-r--r--arch/x86/kernel/sysfb_simplefb.c95
-rw-r--r--arch/x86/kernel/tboot.c10
-rw-r--r--arch/x86/kernel/traps.c4
-rw-r--r--arch/x86/kernel/tsc.c6
-rw-r--r--arch/x86/kernel/x86_init.c24
-rw-r--r--arch/x86/kvm/cpuid.c3
-rw-r--r--arch/x86/kvm/emulate.c14
-rw-r--r--arch/x86/kvm/lapic.c38
-rw-r--r--arch/x86/kvm/mmu.c206
-rw-r--r--arch/x86/kvm/mmu.h2
-rw-r--r--arch/x86/kvm/paging_tmpl.h198
-rw-r--r--arch/x86/kvm/pmu.c25
-rw-r--r--arch/x86/kvm/vmx.c454
-rw-r--r--arch/x86/kvm/x86.c224
-rw-r--r--arch/x86/lguest/boot.c10
-rw-r--r--arch/x86/lib/csum-wrappers_64.c12
-rw-r--r--arch/x86/lib/x86-opcode-map.txt42
-rw-r--r--arch/x86/mm/fault.c43
-rw-r--r--arch/x86/mm/hugetlbpage.c8
-rw-r--r--arch/x86/mm/init.c4
-rw-r--r--arch/x86/mm/ioremap.c5
-rw-r--r--arch/x86/mm/mmap.c6
-rw-r--r--arch/x86/mm/srat.c11
-rw-r--r--arch/x86/mm/tlb.c14
-rw-r--r--arch/x86/oprofile/nmi_int.c18
-rw-r--r--arch/x86/oprofile/op_model_amd.c24
-rw-r--r--arch/x86/pci/acpi.c9
-rw-r--r--arch/x86/pci/i386.c4
-rw-r--r--arch/x86/pci/mmconfig-shared.c7
-rw-r--r--arch/x86/pci/mrst.c41
-rw-r--r--arch/x86/platform/mrst/mrst.c2
-rw-r--r--arch/x86/tools/gen-insn-attr-x86.awk4
-rw-r--r--arch/x86/um/os-Linux/prctl.c2
-rw-r--r--arch/x86/vdso/vclock_gettime.c16
-rw-r--r--arch/x86/xen/enlighten.c25
-rw-r--r--arch/x86/xen/irq.c25
-rw-r--r--arch/x86/xen/p2m.c26
-rw-r--r--arch/x86/xen/setup.c51
-rw-r--r--arch/x86/xen/smp.c47
-rw-r--r--arch/x86/xen/spinlock.c428
139 files changed, 4473 insertions, 1530 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4d5843d5f64d..ee2fb9d37745 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -16,6 +16,7 @@ config X86_64
def_bool y
depends on 64BIT
select X86_DEV_DMA_OPS
+ select ARCH_USE_CMPXCHG_LOCKREF
### Arch settings
config X86
@@ -81,8 +82,6 @@ config X86
select HAVE_USER_RETURN_NOTIFIER
select ARCH_BINFMT_ELF_RANDOMIZE_PIE
select HAVE_ARCH_JUMP_LABEL
- select HAVE_TEXT_POKE_SMP
- select HAVE_GENERIC_HARDIRQS
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select SPARSE_IRQ
select GENERIC_FIND_FIRST_BIT
@@ -633,6 +632,7 @@ config PARAVIRT_DEBUG
config PARAVIRT_SPINLOCKS
bool "Paravirtualization layer for spinlocks"
depends on PARAVIRT && SMP
+ select UNINLINE_SPIN_UNLOCK
---help---
Paravirtualized spinlocks allow a pvops backend to replace the
spinlock implementation with something virtualization-friendly
@@ -657,6 +657,15 @@ config KVM_GUEST
underlying device model, the host provides the guest with
timing infrastructure such as time of day, and system time
+config KVM_DEBUG_FS
+ bool "Enable debug information for KVM Guests in debugfs"
+ depends on KVM_GUEST && DEBUG_FS
+ default n
+ ---help---
+ This option enables collection of various statistics for KVM guest.
+ Statistics are displayed in debugfs filesystem. Enabling this option
+ may incur significant overhead.
+
source "arch/x86/lguest/Kconfig"
config PARAVIRT_TIME_ACCOUNTING
@@ -1345,8 +1354,12 @@ config ARCH_SELECT_MEMORY_MODEL
depends on ARCH_SPARSEMEM_ENABLE
config ARCH_MEMORY_PROBE
- def_bool y
+ bool "Enable sysfs memory/probe interface"
depends on X86_64 && MEMORY_HOTPLUG
+ help
+ This option enables a sysfs memory/probe interface for testing.
+ See Documentation/memory-hotplug.txt for more information.
+ If you are unsure how to answer this question, answer N.
config ARCH_PROC_KCORE_TEXT
def_bool y
@@ -1628,9 +1641,9 @@ config KEXEC
It is an ongoing process to be certain the hardware in a machine
is properly shutdown, so do not be surprised if this code does not
- initially work for you. It may help to enable device hotplugging
- support. As of this writing the exact hardware interface is
- strongly in flux, so no good recommendation can be made.
+ initially work for you. As of this writing the exact hardware
+ interface is strongly in flux, so no good recommendation can be
+ made.
config CRASH_DUMP
bool "kernel crash dumps"
@@ -1717,9 +1730,10 @@ config X86_NEED_RELOCS
depends on X86_32 && RELOCATABLE
config PHYSICAL_ALIGN
- hex "Alignment value to which kernel should be aligned" if X86_32
+ hex "Alignment value to which kernel should be aligned"
default "0x1000000"
- range 0x2000 0x1000000
+ range 0x2000 0x1000000 if X86_32
+ range 0x200000 0x1000000 if X86_64
---help---
This value puts the alignment restrictions on physical address
where kernel is loaded and run from. Kernel is compiled for an
@@ -1737,6 +1751,9 @@ config PHYSICAL_ALIGN
end result is that kernel runs from a physical address meeting
above alignment restrictions.
+ On 32-bit this value must be a multiple of 0x2000. On 64-bit
+ this value must be a multiple of 0x200000.
+
Don't change this unless you know what you are doing.
config HOTPLUG_CPU
@@ -2015,7 +2032,6 @@ menu "Bus options (PCI etc.)"
config PCI
bool "PCI support"
default y
- select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC)
---help---
Find out whether you have a PCI motherboard. PCI is the name of a
bus system, i.e. the way the CPU talks to the other stuff inside
@@ -2271,6 +2287,32 @@ config RAPIDIO
source "drivers/rapidio/Kconfig"
+config X86_SYSFB
+ bool "Mark VGA/VBE/EFI FB as generic system framebuffer"
+ help
+ Firmwares often provide initial graphics framebuffers so the BIOS,
+ bootloader or kernel can show basic video-output during boot for
+ user-guidance and debugging. Historically, x86 used the VESA BIOS
+ Extensions and EFI-framebuffers for this, which are mostly limited
+ to x86.
+ This option, if enabled, marks VGA/VBE/EFI framebuffers as generic
+ framebuffers so the new generic system-framebuffer drivers can be
+ used on x86. If the framebuffer is not compatible with the generic
+ modes, it is adverticed as fallback platform framebuffer so legacy
+ drivers like efifb, vesafb and uvesafb can pick it up.
+ If this option is not selected, all system framebuffers are always
+ marked as fallback platform framebuffers as usual.
+
+ Note: Legacy fbdev drivers, including vesafb, efifb, uvesafb, will
+ not be able to pick up generic system framebuffers if this option
+ is selected. You are highly encouraged to enable simplefb as
+ replacement if you select this option. simplefb can correctly deal
+ with generic system framebuffers. But you should still keep vesafb
+ and others enabled as fallback if a system framebuffer is
+ incompatible with simplefb.
+
+ If unsure, say Y.
+
endmenu
@@ -2333,10 +2375,6 @@ config HAVE_ATOMIC_IOMAP
def_bool y
depends on X86_32
-config HAVE_TEXT_POKE_SMP
- bool
- select STOP_MACHINE if SMP
-
config X86_DEV_DMA_OPS
bool
depends on X86_64 || STA2X11
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 07639c656fcd..41250fb33985 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -16,6 +16,10 @@ endif
# e.g.: obj-y += foo_$(BITS).o
export BITS
+ifdef CONFIG_X86_NEED_RELOCS
+ LDFLAGS_vmlinux := --emit-relocs
+endif
+
ifeq ($(CONFIG_X86_32),y)
BITS := 32
UTS_MACHINE := i386
@@ -25,10 +29,6 @@ ifeq ($(CONFIG_X86_32),y)
KBUILD_AFLAGS += $(biarch)
KBUILD_CFLAGS += $(biarch)
- ifdef CONFIG_RELOCATABLE
- LDFLAGS_vmlinux := --emit-relocs
- endif
-
KBUILD_CFLAGS += -msoft-float -mregparm=3 -freg-struct-return
# Never want PIC in a 32-bit kernel, prevent breakage with GCC built
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 5b7531966b84..ef72baeff484 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -355,6 +355,7 @@ int strncmp(const char *cs, const char *ct, size_t count);
size_t strnlen(const char *s, size_t maxlen);
unsigned int atou(const char *s);
unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base);
+size_t strlen(const char *s);
/* tty.c */
void puts(const char *);
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index d606463aa6d6..b7388a425f09 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -225,7 +225,7 @@ static void low_free(unsigned long size, unsigned long addr)
unsigned long nr_pages;
nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
- efi_call_phys2(sys_table->boottime->free_pages, addr, size);
+ efi_call_phys2(sys_table->boottime->free_pages, addr, nr_pages);
}
static void find_bits(unsigned long mask, u8 *pos, u8 *size)
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 1e3184f6072f..5d6f6891b188 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -181,8 +181,9 @@ relocated:
/*
* Do the decompression, and jump to the new kernel..
*/
- leal z_extract_offset_negative(%ebx), %ebp
/* push arguments for decompress_kernel: */
+ pushl $z_output_len /* decompressed length */
+ leal z_extract_offset_negative(%ebx), %ebp
pushl %ebp /* output address */
pushl $z_input_len /* input_len */
leal input_data(%ebx), %eax
@@ -191,33 +192,7 @@ relocated:
pushl %eax /* heap area */
pushl %esi /* real mode pointer */
call decompress_kernel
- addl $20, %esp
-
-#if CONFIG_RELOCATABLE
-/*
- * Find the address of the relocations.
- */
- leal z_output_len(%ebp), %edi
-
-/*
- * Calculate the delta between where vmlinux was compiled to run
- * and where it was actually loaded.
- */
- movl %ebp, %ebx
- subl $LOAD_PHYSICAL_ADDR, %ebx
- jz 2f /* Nothing to be done if loaded at compiled addr. */
-/*
- * Process relocations.
- */
-
-1: subl $4, %edi
- movl (%edi), %ecx
- testl %ecx, %ecx
- jz 2f
- addl %ebx, -__PAGE_OFFSET(%ebx, %ecx)
- jmp 1b
-2:
-#endif
+ addl $24, %esp
/*
* Jump to the decompressed kernel.
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 06e71c2c16bf..c337422b575d 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -338,6 +338,7 @@ relocated:
leaq input_data(%rip), %rdx /* input_data */
movl $z_input_len, %ecx /* input_len */
movq %rbp, %r8 /* output target address */
+ movq $z_output_len, %r9 /* decompressed length */
call decompress_kernel
popq %rsi
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 0319c88290a5..434f077d2c4d 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -271,6 +271,79 @@ static void error(char *x)
asm("hlt");
}
+#if CONFIG_X86_NEED_RELOCS
+static void handle_relocations(void *output, unsigned long output_len)
+{
+ int *reloc;
+ unsigned long delta, map, ptr;
+ unsigned long min_addr = (unsigned long)output;
+ unsigned long max_addr = min_addr + output_len;
+
+ /*
+ * Calculate the delta between where vmlinux was linked to load
+ * and where it was actually loaded.
+ */
+ delta = min_addr - LOAD_PHYSICAL_ADDR;
+ if (!delta) {
+ debug_putstr("No relocation needed... ");
+ return;
+ }
+ debug_putstr("Performing relocations... ");
+
+ /*
+ * The kernel contains a table of relocation addresses. Those
+ * addresses have the final load address of the kernel in virtual
+ * memory. We are currently working in the self map. So we need to
+ * create an adjustment for kernel memory addresses to the self map.
+ * This will involve subtracting out the base address of the kernel.
+ */
+ map = delta - __START_KERNEL_map;
+
+ /*
+ * Process relocations: 32 bit relocations first then 64 bit after.
+ * Two sets of binary relocations are added to the end of the kernel
+ * before compression. Each relocation table entry is the kernel
+ * address of the location which needs to be updated stored as a
+ * 32-bit value which is sign extended to 64 bits.
+ *
+ * Format is:
+ *
+ * kernel bits...
+ * 0 - zero terminator for 64 bit relocations
+ * 64 bit relocation repeated
+ * 0 - zero terminator for 32 bit relocations
+ * 32 bit relocation repeated
+ *
+ * So we work backwards from the end of the decompressed image.
+ */
+ for (reloc = output + output_len - sizeof(*reloc); *reloc; reloc--) {
+ int extended = *reloc;
+ extended += map;
+
+ ptr = (unsigned long)extended;
+ if (ptr < min_addr || ptr > max_addr)
+ error("32-bit relocation outside of kernel!\n");
+
+ *(uint32_t *)ptr += delta;
+ }
+#ifdef CONFIG_X86_64
+ for (reloc--; *reloc; reloc--) {
+ long extended = *reloc;
+ extended += map;
+
+ ptr = (unsigned long)extended;
+ if (ptr < min_addr || ptr > max_addr)
+ error("64-bit relocation outside of kernel!\n");
+
+ *(uint64_t *)ptr += delta;
+ }
+#endif
+}
+#else
+static inline void handle_relocations(void *output, unsigned long output_len)
+{ }
+#endif
+
static void parse_elf(void *output)
{
#ifdef CONFIG_X86_64
@@ -325,7 +398,8 @@ static void parse_elf(void *output)
asmlinkage void decompress_kernel(void *rmode, memptr heap,
unsigned char *input_data,
unsigned long input_len,
- unsigned char *output)
+ unsigned char *output,
+ unsigned long output_len)
{
real_mode = rmode;
@@ -365,6 +439,7 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
debug_putstr("\nDecompressing Linux... ");
decompress(input_data, input_len, NULL, NULL, output, NULL, error);
parse_elf(output);
+ handle_relocations(output, output_len);
debug_putstr("done.\nBooting the kernel.\n");
return;
}
diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c
index cdac91ca55d3..565083c16e5c 100644
--- a/arch/x86/boot/printf.c
+++ b/arch/x86/boot/printf.c
@@ -55,7 +55,7 @@ static char *number(char *str, long num, int base, int size, int precision,
locase = (type & SMALL);
if (type & LEFT)
type &= ~ZEROPAD;
- if (base < 2 || base > 36)
+ if (base < 2 || base > 16)
return NULL;
c = (type & ZEROPAD) ? '0' : ' ';
sign = 0;
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 6c63c358a7e6..7d6ba9db1be9 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
+obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
# These modules require assembler to support AVX.
ifeq ($(avx_supported),yes)
@@ -81,3 +82,4 @@ crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
+crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index 5cb86ccd4acb..c171dcbf192d 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -62,7 +62,7 @@ static void camellia_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
}
/* camellia sboxes */
-const u64 camellia_sp10011110[256] = {
+__visible const u64 camellia_sp10011110[256] = {
0x7000007070707000ULL, 0x8200008282828200ULL, 0x2c00002c2c2c2c00ULL,
0xec0000ecececec00ULL, 0xb30000b3b3b3b300ULL, 0x2700002727272700ULL,
0xc00000c0c0c0c000ULL, 0xe50000e5e5e5e500ULL, 0xe40000e4e4e4e400ULL,
@@ -151,7 +151,7 @@ const u64 camellia_sp10011110[256] = {
0x9e00009e9e9e9e00ULL,
};
-const u64 camellia_sp22000222[256] = {
+__visible const u64 camellia_sp22000222[256] = {
0xe0e0000000e0e0e0ULL, 0x0505000000050505ULL, 0x5858000000585858ULL,
0xd9d9000000d9d9d9ULL, 0x6767000000676767ULL, 0x4e4e0000004e4e4eULL,
0x8181000000818181ULL, 0xcbcb000000cbcbcbULL, 0xc9c9000000c9c9c9ULL,
@@ -240,7 +240,7 @@ const u64 camellia_sp22000222[256] = {
0x3d3d0000003d3d3dULL,
};
-const u64 camellia_sp03303033[256] = {
+__visible const u64 camellia_sp03303033[256] = {
0x0038380038003838ULL, 0x0041410041004141ULL, 0x0016160016001616ULL,
0x0076760076007676ULL, 0x00d9d900d900d9d9ULL, 0x0093930093009393ULL,
0x0060600060006060ULL, 0x00f2f200f200f2f2ULL, 0x0072720072007272ULL,
@@ -329,7 +329,7 @@ const u64 camellia_sp03303033[256] = {
0x004f4f004f004f4fULL,
};
-const u64 camellia_sp00444404[256] = {
+__visible const u64 camellia_sp00444404[256] = {
0x0000707070700070ULL, 0x00002c2c2c2c002cULL, 0x0000b3b3b3b300b3ULL,
0x0000c0c0c0c000c0ULL, 0x0000e4e4e4e400e4ULL, 0x0000575757570057ULL,
0x0000eaeaeaea00eaULL, 0x0000aeaeaeae00aeULL, 0x0000232323230023ULL,
@@ -418,7 +418,7 @@ const u64 camellia_sp00444404[256] = {
0x00009e9e9e9e009eULL,
};
-const u64 camellia_sp02220222[256] = {
+__visible const u64 camellia_sp02220222[256] = {
0x00e0e0e000e0e0e0ULL, 0x0005050500050505ULL, 0x0058585800585858ULL,
0x00d9d9d900d9d9d9ULL, 0x0067676700676767ULL, 0x004e4e4e004e4e4eULL,
0x0081818100818181ULL, 0x00cbcbcb00cbcbcbULL, 0x00c9c9c900c9c9c9ULL,
@@ -507,7 +507,7 @@ const u64 camellia_sp02220222[256] = {
0x003d3d3d003d3d3dULL,
};
-const u64 camellia_sp30333033[256] = {
+__visible const u64 camellia_sp30333033[256] = {
0x3800383838003838ULL, 0x4100414141004141ULL, 0x1600161616001616ULL,
0x7600767676007676ULL, 0xd900d9d9d900d9d9ULL, 0x9300939393009393ULL,
0x6000606060006060ULL, 0xf200f2f2f200f2f2ULL, 0x7200727272007272ULL,
@@ -596,7 +596,7 @@ const u64 camellia_sp30333033[256] = {
0x4f004f4f4f004f4fULL,
};
-const u64 camellia_sp44044404[256] = {
+__visible const u64 camellia_sp44044404[256] = {
0x7070007070700070ULL, 0x2c2c002c2c2c002cULL, 0xb3b300b3b3b300b3ULL,
0xc0c000c0c0c000c0ULL, 0xe4e400e4e4e400e4ULL, 0x5757005757570057ULL,
0xeaea00eaeaea00eaULL, 0xaeae00aeaeae00aeULL, 0x2323002323230023ULL,
@@ -685,7 +685,7 @@ const u64 camellia_sp44044404[256] = {
0x9e9e009e9e9e009eULL,
};
-const u64 camellia_sp11101110[256] = {
+__visible const u64 camellia_sp11101110[256] = {
0x7070700070707000ULL, 0x8282820082828200ULL, 0x2c2c2c002c2c2c00ULL,
0xececec00ececec00ULL, 0xb3b3b300b3b3b300ULL, 0x2727270027272700ULL,
0xc0c0c000c0c0c000ULL, 0xe5e5e500e5e5e500ULL, 0xe4e4e400e4e4e400ULL,
@@ -828,8 +828,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
subRL[1] ^= (subRL[1] & ~subRL[9]) << 32;
/* modified for FLinv(kl2) */
- dw = (subRL[1] & subRL[9]) >> 32,
- subRL[1] ^= rol32(dw, 1);
+ dw = (subRL[1] & subRL[9]) >> 32;
+ subRL[1] ^= rol32(dw, 1);
/* round 8 */
subRL[11] ^= subRL[1];
@@ -840,8 +840,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
subRL[1] ^= (subRL[1] & ~subRL[17]) << 32;
/* modified for FLinv(kl4) */
- dw = (subRL[1] & subRL[17]) >> 32,
- subRL[1] ^= rol32(dw, 1);
+ dw = (subRL[1] & subRL[17]) >> 32;
+ subRL[1] ^= rol32(dw, 1);
/* round 14 */
subRL[19] ^= subRL[1];
@@ -859,8 +859,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
} else {
subRL[1] ^= (subRL[1] & ~subRL[25]) << 32;
/* modified for FLinv(kl6) */
- dw = (subRL[1] & subRL[25]) >> 32,
- subRL[1] ^= rol32(dw, 1);
+ dw = (subRL[1] & subRL[25]) >> 32;
+ subRL[1] ^= rol32(dw, 1);
/* round 20 */
subRL[27] ^= subRL[1];
@@ -882,8 +882,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
kw4 ^= (kw4 & ~subRL[24]) << 32;
/* modified for FL(kl5) */
- dw = (kw4 & subRL[24]) >> 32,
- kw4 ^= rol32(dw, 1);
+ dw = (kw4 & subRL[24]) >> 32;
+ kw4 ^= rol32(dw, 1);
}
/* round 17 */
@@ -895,8 +895,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
kw4 ^= (kw4 & ~subRL[16]) << 32;
/* modified for FL(kl3) */
- dw = (kw4 & subRL[16]) >> 32,
- kw4 ^= rol32(dw, 1);
+ dw = (kw4 & subRL[16]) >> 32;
+ kw4 ^= rol32(dw, 1);
/* round 11 */
subRL[14] ^= kw4;
@@ -907,8 +907,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
kw4 ^= (kw4 & ~subRL[8]) << 32;
/* modified for FL(kl1) */
- dw = (kw4 & subRL[8]) >> 32,
- kw4 ^= rol32(dw, 1);
+ dw = (kw4 & subRL[8]) >> 32;
+ kw4 ^= rol32(dw, 1);
/* round 5 */
subRL[6] ^= kw4;
@@ -928,8 +928,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
SET_SUBKEY_LR(6, subRL[5] ^ subRL[7]); /* round 5 */
tl = (subRL[10] >> 32) ^ (subRL[10] & ~subRL[8]);
- dw = tl & (subRL[8] >> 32), /* FL(kl1) */
- tr = subRL[10] ^ rol32(dw, 1);
+ dw = tl & (subRL[8] >> 32); /* FL(kl1) */
+ tr = subRL[10] ^ rol32(dw, 1);
tt = (tr | ((u64)tl << 32));
SET_SUBKEY_LR(7, subRL[6] ^ tt); /* round 6 */
@@ -937,8 +937,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
SET_SUBKEY_LR(9, subRL[9]); /* FLinv(kl2) */
tl = (subRL[7] >> 32) ^ (subRL[7] & ~subRL[9]);
- dw = tl & (subRL[9] >> 32), /* FLinv(kl2) */
- tr = subRL[7] ^ rol32(dw, 1);
+ dw = tl & (subRL[9] >> 32); /* FLinv(kl2) */
+ tr = subRL[7] ^ rol32(dw, 1);
tt = (tr | ((u64)tl << 32));
SET_SUBKEY_LR(10, subRL[11] ^ tt); /* round 7 */
@@ -948,8 +948,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
SET_SUBKEY_LR(14, subRL[13] ^ subRL[15]); /* round 11 */
tl = (subRL[18] >> 32) ^ (subRL[18] & ~subRL[16]);
- dw = tl & (subRL[16] >> 32), /* FL(kl3) */
- tr = subRL[18] ^ rol32(dw, 1);
+ dw = tl & (subRL[16] >> 32); /* FL(kl3) */
+ tr = subRL[18] ^ rol32(dw, 1);
tt = (tr | ((u64)tl << 32));
SET_SUBKEY_LR(15, subRL[14] ^ tt); /* round 12 */
@@ -957,8 +957,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
SET_SUBKEY_LR(17, subRL[17]); /* FLinv(kl4) */
tl = (subRL[15] >> 32) ^ (subRL[15] & ~subRL[17]);
- dw = tl & (subRL[17] >> 32), /* FLinv(kl4) */
- tr = subRL[15] ^ rol32(dw, 1);
+ dw = tl & (subRL[17] >> 32); /* FLinv(kl4) */
+ tr = subRL[15] ^ rol32(dw, 1);
tt = (tr | ((u64)tl << 32));
SET_SUBKEY_LR(18, subRL[19] ^ tt); /* round 13 */
@@ -972,8 +972,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
SET_SUBKEY_LR(24, subRL[24] ^ subRL[23]); /* kw3 */
} else {
tl = (subRL[26] >> 32) ^ (subRL[26] & ~subRL[24]);
- dw = tl & (subRL[24] >> 32), /* FL(kl5) */
- tr = subRL[26] ^ rol32(dw, 1);
+ dw = tl & (subRL[24] >> 32); /* FL(kl5) */
+ tr = subRL[26] ^ rol32(dw, 1);
tt = (tr | ((u64)tl << 32));
SET_SUBKEY_LR(23, subRL[22] ^ tt); /* round 18 */
@@ -981,8 +981,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
SET_SUBKEY_LR(25, subRL[25]); /* FLinv(kl6) */
tl = (subRL[23] >> 32) ^ (subRL[23] & ~subRL[25]);
- dw = tl & (subRL[25] >> 32), /* FLinv(kl6) */
- tr = subRL[23] ^ rol32(dw, 1);
+ dw = tl & (subRL[25] >> 32); /* FLinv(kl6) */
+ tr = subRL[23] ^ rol32(dw, 1);
tt = (tr | ((u64)tl << 32));
SET_SUBKEY_LR(26, subRL[27] ^ tt); /* round 19 */
diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S
new file mode 100644
index 000000000000..35e97569d05f
--- /dev/null
+++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S
@@ -0,0 +1,643 @@
+########################################################################
+# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
+#
+# Copyright (c) 2013, Intel Corporation
+#
+# Authors:
+# Erdinc Ozturk <erdinc.ozturk@intel.com>
+# Vinodh Gopal <vinodh.gopal@intel.com>
+# James Guilford <james.guilford@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the
+# distribution.
+#
+# * Neither the name of the Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+#
+# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+# Function API:
+# UINT16 crc_t10dif_pcl(
+# UINT16 init_crc, //initial CRC value, 16 bits
+# const unsigned char *buf, //buffer pointer to calculate CRC on
+# UINT64 len //buffer length in bytes (64-bit data)
+# );
+#
+# Reference paper titled "Fast CRC Computation for Generic
+# Polynomials Using PCLMULQDQ Instruction"
+# URL: http://www.intel.com/content/dam/www/public/us/en/documents
+# /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+#
+#
+
+#include <linux/linkage.h>
+
+.text
+
+#define arg1 %rdi
+#define arg2 %rsi
+#define arg3 %rdx
+
+#define arg1_low32 %edi
+
+ENTRY(crc_t10dif_pcl)
+.align 16
+
+ # adjust the 16-bit initial_crc value, scale it to 32 bits
+ shl $16, arg1_low32
+
+ # Allocate Stack Space
+ mov %rsp, %rcx
+ sub $16*2, %rsp
+ # align stack to 16 byte boundary
+ and $~(0x10 - 1), %rsp
+
+ # check if smaller than 256
+ cmp $256, arg3
+
+ # for sizes less than 128, we can't fold 64B at a time...
+ jl _less_than_128
+
+
+ # load the initial crc value
+ movd arg1_low32, %xmm10 # initial crc
+
+ # crc value does not need to be byte-reflected, but it needs
+ # to be moved to the high part of the register.
+ # because data will be byte-reflected and will align with
+ # initial crc at correct place.
+ pslldq $12, %xmm10
+
+ movdqa SHUF_MASK(%rip), %xmm11
+ # receive the initial 64B data, xor the initial crc value
+ movdqu 16*0(arg2), %xmm0
+ movdqu 16*1(arg2), %xmm1
+ movdqu 16*2(arg2), %xmm2
+ movdqu 16*3(arg2), %xmm3
+ movdqu 16*4(arg2), %xmm4
+ movdqu 16*5(arg2), %xmm5
+ movdqu 16*6(arg2), %xmm6
+ movdqu 16*7(arg2), %xmm7
+
+ pshufb %xmm11, %xmm0
+ # XOR the initial_crc value
+ pxor %xmm10, %xmm0
+ pshufb %xmm11, %xmm1
+ pshufb %xmm11, %xmm2
+ pshufb %xmm11, %xmm3
+ pshufb %xmm11, %xmm4
+ pshufb %xmm11, %xmm5
+ pshufb %xmm11, %xmm6
+ pshufb %xmm11, %xmm7
+
+ movdqa rk3(%rip), %xmm10 #xmm10 has rk3 and rk4
+ #imm value of pclmulqdq instruction
+ #will determine which constant to use
+
+ #################################################################
+ # we subtract 256 instead of 128 to save one instruction from the loop
+ sub $256, arg3
+
+ # at this section of the code, there is 64*x+y (0<=y<64) bytes of
+ # buffer. The _fold_64_B_loop will fold 64B at a time
+ # until we have 64+y Bytes of buffer
+
+
+ # fold 64B at a time. This section of the code folds 4 xmm
+ # registers in parallel
+_fold_64_B_loop:
+
+ # update the buffer pointer
+ add $128, arg2 # buf += 64#
+
+ movdqu 16*0(arg2), %xmm9
+ movdqu 16*1(arg2), %xmm12
+ pshufb %xmm11, %xmm9
+ pshufb %xmm11, %xmm12
+ movdqa %xmm0, %xmm8
+ movdqa %xmm1, %xmm13
+ pclmulqdq $0x0 , %xmm10, %xmm0
+ pclmulqdq $0x11, %xmm10, %xmm8
+ pclmulqdq $0x0 , %xmm10, %xmm1
+ pclmulqdq $0x11, %xmm10, %xmm13
+ pxor %xmm9 , %xmm0
+ xorps %xmm8 , %xmm0
+ pxor %xmm12, %xmm1
+ xorps %xmm13, %xmm1
+
+ movdqu 16*2(arg2), %xmm9
+ movdqu 16*3(arg2), %xmm12
+ pshufb %xmm11, %xmm9
+ pshufb %xmm11, %xmm12
+ movdqa %xmm2, %xmm8
+ movdqa %xmm3, %xmm13
+ pclmulqdq $0x0, %xmm10, %xmm2
+ pclmulqdq $0x11, %xmm10, %xmm8
+ pclmulqdq $0x0, %xmm10, %xmm3
+ pclmulqdq $0x11, %xmm10, %xmm13
+ pxor %xmm9 , %xmm2
+ xorps %xmm8 , %xmm2
+ pxor %xmm12, %xmm3
+ xorps %xmm13, %xmm3
+
+ movdqu 16*4(arg2), %xmm9
+ movdqu 16*5(arg2), %xmm12
+ pshufb %xmm11, %xmm9
+ pshufb %xmm11, %xmm12
+ movdqa %xmm4, %xmm8
+ movdqa %xmm5, %xmm13
+ pclmulqdq $0x0, %xmm10, %xmm4
+ pclmulqdq $0x11, %xmm10, %xmm8
+ pclmulqdq $0x0, %xmm10, %xmm5
+ pclmulqdq $0x11, %xmm10, %xmm13
+ pxor %xmm9 , %xmm4
+ xorps %xmm8 , %xmm4
+ pxor %xmm12, %xmm5
+ xorps %xmm13, %xmm5
+
+ movdqu 16*6(arg2), %xmm9
+ movdqu 16*7(arg2), %xmm12
+ pshufb %xmm11, %xmm9
+ pshufb %xmm11, %xmm12
+ movdqa %xmm6 , %xmm8
+ movdqa %xmm7 , %xmm13
+ pclmulqdq $0x0 , %xmm10, %xmm6
+ pclmulqdq $0x11, %xmm10, %xmm8
+ pclmulqdq $0x0 , %xmm10, %xmm7
+ pclmulqdq $0x11, %xmm10, %xmm13
+ pxor %xmm9 , %xmm6
+ xorps %xmm8 , %xmm6
+ pxor %xmm12, %xmm7
+ xorps %xmm13, %xmm7
+
+ sub $128, arg3
+
+ # check if there is another 64B in the buffer to be able to fold
+ jge _fold_64_B_loop
+ ##################################################################
+
+
+ add $128, arg2
+ # at this point, the buffer pointer is pointing at the last y Bytes
+ # of the buffer the 64B of folded data is in 4 of the xmm
+ # registers: xmm0, xmm1, xmm2, xmm3
+
+
+ # fold the 8 xmm registers to 1 xmm register with different constants
+
+ movdqa rk9(%rip), %xmm10
+ movdqa %xmm0, %xmm8
+ pclmulqdq $0x11, %xmm10, %xmm0
+ pclmulqdq $0x0 , %xmm10, %xmm8
+ pxor %xmm8, %xmm7
+ xorps %xmm0, %xmm7
+
+ movdqa rk11(%rip), %xmm10
+ movdqa %xmm1, %xmm8
+ pclmulqdq $0x11, %xmm10, %xmm1
+ pclmulqdq $0x0 , %xmm10, %xmm8
+ pxor %xmm8, %xmm7
+ xorps %xmm1, %xmm7
+
+ movdqa rk13(%rip), %xmm10
+ movdqa %xmm2, %xmm8
+ pclmulqdq $0x11, %xmm10, %xmm2
+ pclmulqdq $0x0 , %xmm10, %xmm8
+ pxor %xmm8, %xmm7
+ pxor %xmm2, %xmm7
+
+ movdqa rk15(%rip), %xmm10
+ movdqa %xmm3, %xmm8
+ pclmulqdq $0x11, %xmm10, %xmm3
+ pclmulqdq $0x0 , %xmm10, %xmm8
+ pxor %xmm8, %xmm7
+ xorps %xmm3, %xmm7
+
+ movdqa rk17(%rip), %xmm10
+ movdqa %xmm4, %xmm8
+ pclmulqdq $0x11, %xmm10, %xmm4
+ pclmulqdq $0x0 , %xmm10, %xmm8
+ pxor %xmm8, %xmm7
+ pxor %xmm4, %xmm7
+
+ movdqa rk19(%rip), %xmm10
+ movdqa %xmm5, %xmm8
+ pclmulqdq $0x11, %xmm10, %xmm5
+ pclmulqdq $0x0 , %xmm10, %xmm8
+ pxor %xmm8, %xmm7
+ xorps %xmm5, %xmm7
+
+ movdqa rk1(%rip), %xmm10 #xmm10 has rk1 and rk2
+ #imm value of pclmulqdq instruction
+ #will determine which constant to use
+ movdqa %xmm6, %xmm8
+ pclmulqdq $0x11, %xmm10, %xmm6
+ pclmulqdq $0x0 , %xmm10, %xmm8
+ pxor %xmm8, %xmm7
+ pxor %xmm6, %xmm7
+
+
+ # instead of 64, we add 48 to the loop counter to save 1 instruction
+ # from the loop instead of a cmp instruction, we use the negative
+ # flag with the jl instruction
+ add $128-16, arg3
+ jl _final_reduction_for_128
+
+ # now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7
+ # and the rest is in memory. We can fold 16 bytes at a time if y>=16
+ # continue folding 16B at a time
+
+_16B_reduction_loop:
+ movdqa %xmm7, %xmm8
+ pclmulqdq $0x11, %xmm10, %xmm7
+ pclmulqdq $0x0 , %xmm10, %xmm8
+ pxor %xmm8, %xmm7
+ movdqu (arg2), %xmm0
+ pshufb %xmm11, %xmm0
+ pxor %xmm0 , %xmm7
+ add $16, arg2
+ sub $16, arg3
+ # instead of a cmp instruction, we utilize the flags with the
+ # jge instruction equivalent of: cmp arg3, 16-16
+ # check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ #now we have 16+z bytes left to reduce, where 0<= z < 16.
+ #first, we reduce the data in the xmm7 register
+
+
+_final_reduction_for_128:
+ # check if any more data to fold. If not, compute the CRC of
+ # the final 128 bits
+ add $16, arg3
+ je _128_done
+
+ # here we are getting data that is less than 16 bytes.
+ # since we know that there was data before the pointer, we can
+ # offset the input pointer before the actual point, to receive
+ # exactly 16 bytes. after that the registers need to be adjusted.
+_get_last_two_xmms:
+ movdqa %xmm7, %xmm2
+
+ movdqu -16(arg2, arg3), %xmm1
+ pshufb %xmm11, %xmm1
+
+ # get rid of the extra data that was loaded before
+ # load the shift constant
+ lea pshufb_shf_table+16(%rip), %rax
+ sub arg3, %rax
+ movdqu (%rax), %xmm0
+
+ # shift xmm2 to the left by arg3 bytes
+ pshufb %xmm0, %xmm2
+
+ # shift xmm7 to the right by 16-arg3 bytes
+ pxor mask1(%rip), %xmm0
+ pshufb %xmm0, %xmm7
+ pblendvb %xmm2, %xmm1 #xmm0 is implicit
+
+ # fold 16 Bytes
+ movdqa %xmm1, %xmm2
+ movdqa %xmm7, %xmm8
+ pclmulqdq $0x11, %xmm10, %xmm7
+ pclmulqdq $0x0 , %xmm10, %xmm8
+ pxor %xmm8, %xmm7
+ pxor %xmm2, %xmm7
+
+_128_done:
+ # compute crc of a 128-bit value
+ movdqa rk5(%rip), %xmm10 # rk5 and rk6 in xmm10
+ movdqa %xmm7, %xmm0
+
+ #64b fold
+ pclmulqdq $0x1, %xmm10, %xmm7
+ pslldq $8 , %xmm0
+ pxor %xmm0, %xmm7
+
+ #32b fold
+ movdqa %xmm7, %xmm0
+
+ pand mask2(%rip), %xmm0
+
+ psrldq $12, %xmm7
+ pclmulqdq $0x10, %xmm10, %xmm7
+ pxor %xmm0, %xmm7
+
+ #barrett reduction
+_barrett:
+ movdqa rk7(%rip), %xmm10 # rk7 and rk8 in xmm10
+ movdqa %xmm7, %xmm0
+ pclmulqdq $0x01, %xmm10, %xmm7
+ pslldq $4, %xmm7
+ pclmulqdq $0x11, %xmm10, %xmm7
+
+ pslldq $4, %xmm7
+ pxor %xmm0, %xmm7
+ pextrd $1, %xmm7, %eax
+
+_cleanup:
+ # scale the result back to 16 bits
+ shr $16, %eax
+ mov %rcx, %rsp
+ ret
+
+########################################################################
+
+.align 16
+_less_than_128:
+
+ # check if there is enough buffer to be able to fold 16B at a time
+ cmp $32, arg3
+ jl _less_than_32
+ movdqa SHUF_MASK(%rip), %xmm11
+
+ # now if there is, load the constants
+ movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10
+
+ movd arg1_low32, %xmm0 # get the initial crc value
+ pslldq $12, %xmm0 # align it to its correct place
+ movdqu (arg2), %xmm7 # load the plaintext
+ pshufb %xmm11, %xmm7 # byte-reflect the plaintext
+ pxor %xmm0, %xmm7
+
+
+ # update the buffer pointer
+ add $16, arg2
+
+ # update the counter. subtract 32 instead of 16 to save one
+ # instruction from the loop
+ sub $32, arg3
+
+ jmp _16B_reduction_loop
+
+
+.align 16
+_less_than_32:
+ # mov initial crc to the return value. this is necessary for
+ # zero-length buffers.
+ mov arg1_low32, %eax
+ test arg3, arg3
+ je _cleanup
+
+ movdqa SHUF_MASK(%rip), %xmm11
+
+ movd arg1_low32, %xmm0 # get the initial crc value
+ pslldq $12, %xmm0 # align it to its correct place
+
+ cmp $16, arg3
+ je _exact_16_left
+ jl _less_than_16_left
+
+ movdqu (arg2), %xmm7 # load the plaintext
+ pshufb %xmm11, %xmm7 # byte-reflect the plaintext
+ pxor %xmm0 , %xmm7 # xor the initial crc value
+ add $16, arg2
+ sub $16, arg3
+ movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10
+ jmp _get_last_two_xmms
+
+
+.align 16
+_less_than_16_left:
+ # use stack space to load data less than 16 bytes, zero-out
+ # the 16B in memory first.
+
+ pxor %xmm1, %xmm1
+ mov %rsp, %r11
+ movdqa %xmm1, (%r11)
+
+ cmp $4, arg3
+ jl _only_less_than_4
+
+ # backup the counter value
+ mov arg3, %r9
+ cmp $8, arg3
+ jl _less_than_8_left
+
+ # load 8 Bytes
+ mov (arg2), %rax
+ mov %rax, (%r11)
+ add $8, %r11
+ sub $8, arg3
+ add $8, arg2
+_less_than_8_left:
+
+ cmp $4, arg3
+ jl _less_than_4_left
+
+ # load 4 Bytes
+ mov (arg2), %eax
+ mov %eax, (%r11)
+ add $4, %r11
+ sub $4, arg3
+ add $4, arg2
+_less_than_4_left:
+
+ cmp $2, arg3
+ jl _less_than_2_left
+
+ # load 2 Bytes
+ mov (arg2), %ax
+ mov %ax, (%r11)
+ add $2, %r11
+ sub $2, arg3
+ add $2, arg2
+_less_than_2_left:
+ cmp $1, arg3
+ jl _zero_left
+
+ # load 1 Byte
+ mov (arg2), %al
+ mov %al, (%r11)
+_zero_left:
+ movdqa (%rsp), %xmm7
+ pshufb %xmm11, %xmm7
+ pxor %xmm0 , %xmm7 # xor the initial crc value
+
+ # shl r9, 4
+ lea pshufb_shf_table+16(%rip), %rax
+ sub %r9, %rax
+ movdqu (%rax), %xmm0
+ pxor mask1(%rip), %xmm0
+
+ pshufb %xmm0, %xmm7
+ jmp _128_done
+
+.align 16
+_exact_16_left:
+ movdqu (arg2), %xmm7
+ pshufb %xmm11, %xmm7
+ pxor %xmm0 , %xmm7 # xor the initial crc value
+
+ jmp _128_done
+
+_only_less_than_4:
+ cmp $3, arg3
+ jl _only_less_than_3
+
+ # load 3 Bytes
+ mov (arg2), %al
+ mov %al, (%r11)
+
+ mov 1(arg2), %al
+ mov %al, 1(%r11)
+
+ mov 2(arg2), %al
+ mov %al, 2(%r11)
+
+ movdqa (%rsp), %xmm7
+ pshufb %xmm11, %xmm7
+ pxor %xmm0 , %xmm7 # xor the initial crc value
+
+ psrldq $5, %xmm7
+
+ jmp _barrett
+_only_less_than_3:
+ cmp $2, arg3
+ jl _only_less_than_2
+
+ # load 2 Bytes
+ mov (arg2), %al
+ mov %al, (%r11)
+
+ mov 1(arg2), %al
+ mov %al, 1(%r11)
+
+ movdqa (%rsp), %xmm7
+ pshufb %xmm11, %xmm7
+ pxor %xmm0 , %xmm7 # xor the initial crc value
+
+ psrldq $6, %xmm7
+
+ jmp _barrett
+_only_less_than_2:
+
+ # load 1 Byte
+ mov (arg2), %al
+ mov %al, (%r11)
+
+ movdqa (%rsp), %xmm7
+ pshufb %xmm11, %xmm7
+ pxor %xmm0 , %xmm7 # xor the initial crc value
+
+ psrldq $7, %xmm7
+
+ jmp _barrett
+
+ENDPROC(crc_t10dif_pcl)
+
+.data
+
+# precomputed constants
+# these constants are precomputed from the poly:
+# 0x8bb70000 (0x8bb7 scaled to 32 bits)
+.align 16
+# Q = 0x18BB70000
+# rk1 = 2^(32*3) mod Q << 32
+# rk2 = 2^(32*5) mod Q << 32
+# rk3 = 2^(32*15) mod Q << 32
+# rk4 = 2^(32*17) mod Q << 32
+# rk5 = 2^(32*3) mod Q << 32
+# rk6 = 2^(32*2) mod Q << 32
+# rk7 = floor(2^64/Q)
+# rk8 = Q
+rk1:
+.quad 0x2d56000000000000
+rk2:
+.quad 0x06df000000000000
+rk3:
+.quad 0x9d9d000000000000
+rk4:
+.quad 0x7cf5000000000000
+rk5:
+.quad 0x2d56000000000000
+rk6:
+.quad 0x1368000000000000
+rk7:
+.quad 0x00000001f65a57f8
+rk8:
+.quad 0x000000018bb70000
+
+rk9:
+.quad 0xceae000000000000
+rk10:
+.quad 0xbfd6000000000000
+rk11:
+.quad 0x1e16000000000000
+rk12:
+.quad 0x713c000000000000
+rk13:
+.quad 0xf7f9000000000000
+rk14:
+.quad 0x80a6000000000000
+rk15:
+.quad 0x044c000000000000
+rk16:
+.quad 0xe658000000000000
+rk17:
+.quad 0xad18000000000000
+rk18:
+.quad 0xa497000000000000
+rk19:
+.quad 0x6ee3000000000000
+rk20:
+.quad 0xe7b5000000000000
+
+
+
+mask1:
+.octa 0x80808080808080808080808080808080
+mask2:
+.octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
+
+SHUF_MASK:
+.octa 0x000102030405060708090A0B0C0D0E0F
+
+pshufb_shf_table:
+# use these values for shift constants for the pshufb instruction
+# different alignments result in values as shown:
+# DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
+# DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
+# DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
+# DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
+# DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
+# DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
+# DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
+# DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
+# DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
+# DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
+# DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
+# DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
+# DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
+# DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
+# DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
+.octa 0x8f8e8d8c8b8a89888786858483828100
+.octa 0x000e0d0c0b0a09080706050403020100
diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c
new file mode 100644
index 000000000000..7845d7fd54c0
--- /dev/null
+++ b/arch/x86/crypto/crct10dif-pclmul_glue.c
@@ -0,0 +1,151 @@
+/*
+ * Cryptographic API.
+ *
+ * T10 Data Integrity Field CRC16 Crypto Transform using PCLMULQDQ Instructions
+ *
+ * Copyright (C) 2013 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/crc-t10dif.h>
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <asm/i387.h>
+#include <asm/cpufeature.h>
+#include <asm/cpu_device_id.h>
+
+asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf,
+ size_t len);
+
+struct chksum_desc_ctx {
+ __u16 crc;
+};
+
+/*
+ * Steps through buffer one byte at at time, calculates reflected
+ * crc using table.
+ */
+
+static int chksum_init(struct shash_desc *desc)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ ctx->crc = 0;
+
+ return 0;
+}
+
+static int chksum_update(struct shash_desc *desc, const u8 *data,
+ unsigned int length)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ if (irq_fpu_usable()) {
+ kernel_fpu_begin();
+ ctx->crc = crc_t10dif_pcl(ctx->crc, data, length);
+ kernel_fpu_end();
+ } else
+ ctx->crc = crc_t10dif_generic(ctx->crc, data, length);
+ return 0;
+}
+
+static int chksum_final(struct shash_desc *desc, u8 *out)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ *(__u16 *)out = ctx->crc;
+ return 0;
+}
+
+static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len,
+ u8 *out)
+{
+ if (irq_fpu_usable()) {
+ kernel_fpu_begin();
+ *(__u16 *)out = crc_t10dif_pcl(*crcp, data, len);
+ kernel_fpu_end();
+ } else
+ *(__u16 *)out = crc_t10dif_generic(*crcp, data, len);
+ return 0;
+}
+
+static int chksum_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ return __chksum_finup(&ctx->crc, data, len, out);
+}
+
+static int chksum_digest(struct shash_desc *desc, const u8 *data,
+ unsigned int length, u8 *out)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ return __chksum_finup(&ctx->crc, data, length, out);
+}
+
+static struct shash_alg alg = {
+ .digestsize = CRC_T10DIF_DIGEST_SIZE,
+ .init = chksum_init,
+ .update = chksum_update,
+ .final = chksum_final,
+ .finup = chksum_finup,
+ .digest = chksum_digest,
+ .descsize = sizeof(struct chksum_desc_ctx),
+ .base = {
+ .cra_name = "crct10dif",
+ .cra_driver_name = "crct10dif-pclmul",
+ .cra_priority = 200,
+ .cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static const struct x86_cpu_id crct10dif_cpu_id[] = {
+ X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, crct10dif_cpu_id);
+
+static int __init crct10dif_intel_mod_init(void)
+{
+ if (!x86_match_cpu(crct10dif_cpu_id))
+ return -ENODEV;
+
+ return crypto_register_shash(&alg);
+}
+
+static void __exit crct10dif_intel_mod_fini(void)
+{
+ crypto_unregister_shash(&alg);
+}
+
+module_init(crct10dif_intel_mod_init);
+module_exit(crct10dif_intel_mod_fini);
+
+MODULE_AUTHOR("Tim Chen <tim.c.chen@linux.intel.com>");
+MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with PCLMULQDQ.");
+MODULE_LICENSE("GPL");
+
+MODULE_ALIAS("crct10dif");
+MODULE_ALIAS("crct10dif-pclmul");
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index bccfca68430e..665a730307f2 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -457,7 +457,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
else
put_user_ex(0, &frame->uc.uc_flags);
put_user_ex(0, &frame->uc.uc_link);
- err |= __compat_save_altstack(&frame->uc.uc_stack, regs->sp);
+ compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp);
if (ksig->ka.sa.sa_flags & SA_RESTORER)
restorer = ksig->ka.sa.sa_restorer;
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 474dc1b59f72..4299eb05023c 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -452,7 +452,7 @@ ia32_badsys:
CFI_ENDPROC
- .macro PTREGSCALL label, func, arg
+ .macro PTREGSCALL label, func
ALIGN
GLOBAL(\label)
leaq \func(%rip),%rax
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 2dfac58f3b11..b1977bad5435 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -86,6 +86,7 @@ extern int acpi_pci_disabled;
extern int acpi_skip_timer_override;
extern int acpi_use_timer_override;
extern int acpi_fix_pin2_polarity;
+extern int acpi_disable_cmcff;
extern u8 acpi_sci_flags;
extern int acpi_sci_override_gsi;
@@ -168,6 +169,7 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf)
#define acpi_lapic 0
#define acpi_ioapic 0
+#define acpi_disable_cmcff 0
static inline void acpi_noirq_set(void) { }
static inline void acpi_disable_pci(void) { }
static inline void disable_acpi(void) { }
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 58ed6d96a6ac..0a3f9c9f98d5 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -5,6 +5,7 @@
#include <linux/stddef.h>
#include <linux/stringify.h>
#include <asm/asm.h>
+#include <asm/ptrace.h>
/*
* Alternative inline assembly for SMP.
@@ -220,20 +221,11 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len);
* no thread can be preempted in the instructions being modified (no iret to an
* invalid instruction possible) or if the instructions are changed from a
* consistent state to another consistent state atomically.
- * More care must be taken when modifying code in the SMP case because of
- * Intel's errata. text_poke_smp() takes care that errata, but still
- * doesn't support NMI/MCE handler code modifying.
* On the local CPU you need to be protected again NMI or MCE handlers seeing an
* inconsistent instruction while you patch.
*/
-struct text_poke_param {
- void *addr;
- const void *opcode;
- size_t len;
-};
-
extern void *text_poke(void *addr, const void *opcode, size_t len);
-extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
-extern void text_poke_smp_batch(struct text_poke_param *params, int n);
+extern int poke_int3_handler(struct pt_regs *regs);
+extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
#endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index f8119b582c3c..1d2091a226bc 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -715,4 +715,6 @@ static inline void exiting_ack_irq(void)
ack_APIC_irq();
}
+extern void ioapic_zap_locks(void);
+
#endif /* _ASM_X86_APIC_H */
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 1c2d247f65ce..4582e8e1cd1a 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -3,21 +3,25 @@
#ifdef __ASSEMBLY__
# define __ASM_FORM(x) x
+# define __ASM_FORM_RAW(x) x
# define __ASM_FORM_COMMA(x) x,
#else
# define __ASM_FORM(x) " " #x " "
+# define __ASM_FORM_RAW(x) #x
# define __ASM_FORM_COMMA(x) " " #x ","
#endif
#ifdef CONFIG_X86_32
# define __ASM_SEL(a,b) __ASM_FORM(a)
+# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a)
#else
# define __ASM_SEL(a,b) __ASM_FORM(b)
+# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b)
#endif
#define __ASM_SIZE(inst, ...) __ASM_SEL(inst##l##__VA_ARGS__, \
inst##q##__VA_ARGS__)
-#define __ASM_REG(reg) __ASM_SEL(e##reg, r##reg)
+#define __ASM_REG(reg) __ASM_SEL_RAW(e##reg, r##reg)
#define _ASM_PTR __ASM_SEL(.long, .quad)
#define _ASM_ALIGN __ASM_SEL(.balign 4, .balign 8)
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 6dfd0195bb55..41639ce8fd63 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -15,6 +15,14 @@
#include <linux/compiler.h>
#include <asm/alternative.h>
+#if BITS_PER_LONG == 32
+# define _BITOPS_LONG_SHIFT 5
+#elif BITS_PER_LONG == 64
+# define _BITOPS_LONG_SHIFT 6
+#else
+# error "Unexpected BITS_PER_LONG"
+#endif
+
#define BIT_64(n) (U64_C(1) << (n))
/*
@@ -59,7 +67,7 @@
* restricted to acting on a single-word quantity.
*/
static __always_inline void
-set_bit(unsigned int nr, volatile unsigned long *addr)
+set_bit(long nr, volatile unsigned long *addr)
{
if (IS_IMMEDIATE(nr)) {
asm volatile(LOCK_PREFIX "orb %1,%0"
@@ -81,7 +89,7 @@ set_bit(unsigned int nr, volatile unsigned long *addr)
* If it's called on the same region of memory simultaneously, the effect
* may be that only one operation succeeds.
*/
-static inline void __set_bit(int nr, volatile unsigned long *addr)
+static inline void __set_bit(long nr, volatile unsigned long *addr)
{
asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory");
}
@@ -97,7 +105,7 @@ static inline void __set_bit(int nr, volatile unsigned long *addr)
* in order to ensure changes are visible on other processors.
*/
static __always_inline void
-clear_bit(int nr, volatile unsigned long *addr)
+clear_bit(long nr, volatile unsigned long *addr)
{
if (IS_IMMEDIATE(nr)) {
asm volatile(LOCK_PREFIX "andb %1,%0"
@@ -118,13 +126,13 @@ clear_bit(int nr, volatile unsigned long *addr)
* clear_bit() is atomic and implies release semantics before the memory
* operation. It can be used for an unlock.
*/
-static inline void clear_bit_unlock(unsigned nr, volatile unsigned long *addr)
+static inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
{
barrier();
clear_bit(nr, addr);
}
-static inline void __clear_bit(int nr, volatile unsigned long *addr)
+static inline void __clear_bit(long nr, volatile unsigned long *addr)
{
asm volatile("btr %1,%0" : ADDR : "Ir" (nr));
}
@@ -141,7 +149,7 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr)
* No memory barrier is required here, because x86 cannot reorder stores past
* older loads. Same principle as spin_unlock.
*/
-static inline void __clear_bit_unlock(unsigned nr, volatile unsigned long *addr)
+static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
{
barrier();
__clear_bit(nr, addr);
@@ -159,7 +167,7 @@ static inline void __clear_bit_unlock(unsigned nr, volatile unsigned long *addr)
* If it's called on the same region of memory simultaneously, the effect
* may be that only one operation succeeds.
*/
-static inline void __change_bit(int nr, volatile unsigned long *addr)
+static inline void __change_bit(long nr, volatile unsigned long *addr)
{
asm volatile("btc %1,%0" : ADDR : "Ir" (nr));
}
@@ -173,7 +181,7 @@ static inline void __change_bit(int nr, volatile unsigned long *addr)
* Note that @nr may be almost arbitrarily large; this function is not
* restricted to acting on a single-word quantity.
*/
-static inline void change_bit(int nr, volatile unsigned long *addr)
+static inline void change_bit(long nr, volatile unsigned long *addr)
{
if (IS_IMMEDIATE(nr)) {
asm volatile(LOCK_PREFIX "xorb %1,%0"
@@ -194,7 +202,7 @@ static inline void change_bit(int nr, volatile unsigned long *addr)
* This operation is atomic and cannot be reordered.
* It also implies a memory barrier.
*/
-static inline int test_and_set_bit(int nr, volatile unsigned long *addr)
+static inline int test_and_set_bit(long nr, volatile unsigned long *addr)
{
int oldbit;
@@ -212,7 +220,7 @@ static inline int test_and_set_bit(int nr, volatile unsigned long *addr)
* This is the same as test_and_set_bit on x86.
*/
static __always_inline int
-test_and_set_bit_lock(int nr, volatile unsigned long *addr)
+test_and_set_bit_lock(long nr, volatile unsigned long *addr)
{
return test_and_set_bit(nr, addr);
}
@@ -226,7 +234,7 @@ test_and_set_bit_lock(int nr, volatile unsigned long *addr)
* If two examples of this operation race, one can appear to succeed
* but actually fail. You must protect multiple accesses with a lock.
*/
-static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
+static inline int __test_and_set_bit(long nr, volatile unsigned long *addr)
{
int oldbit;
@@ -245,7 +253,7 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
* This operation is atomic and cannot be reordered.
* It also implies a memory barrier.
*/
-static inline int test_and_clear_bit(int nr, volatile unsigned long *addr)
+static inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
{
int oldbit;
@@ -272,7 +280,7 @@ static inline int test_and_clear_bit(int nr, volatile unsigned long *addr)
* accessed from a hypervisor on the same CPU if running in a VM: don't change
* this without also updating arch/x86/kernel/kvm.c
*/
-static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
+static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr)
{
int oldbit;
@@ -284,7 +292,7 @@ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
}
/* WARNING: non atomic and it can be reordered! */
-static inline int __test_and_change_bit(int nr, volatile unsigned long *addr)
+static inline int __test_and_change_bit(long nr, volatile unsigned long *addr)
{
int oldbit;
@@ -304,7 +312,7 @@ static inline int __test_and_change_bit(int nr, volatile unsigned long *addr)
* This operation is atomic and cannot be reordered.
* It also implies a memory barrier.
*/
-static inline int test_and_change_bit(int nr, volatile unsigned long *addr)
+static inline int test_and_change_bit(long nr, volatile unsigned long *addr)
{
int oldbit;
@@ -315,13 +323,13 @@ static inline int test_and_change_bit(int nr, volatile unsigned long *addr)
return oldbit;
}
-static __always_inline int constant_test_bit(unsigned int nr, const volatile unsigned long *addr)
+static __always_inline int constant_test_bit(long nr, const volatile unsigned long *addr)
{
- return ((1UL << (nr % BITS_PER_LONG)) &
- (addr[nr / BITS_PER_LONG])) != 0;
+ return ((1UL << (nr & (BITS_PER_LONG-1))) &
+ (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
}
-static inline int variable_test_bit(int nr, volatile const unsigned long *addr)
+static inline int variable_test_bit(long nr, volatile const unsigned long *addr)
{
int oldbit;
diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h
index 653668d140f9..4a8cb8d7cbd5 100644
--- a/arch/x86/include/asm/bootparam_utils.h
+++ b/arch/x86/include/asm/bootparam_utils.h
@@ -35,9 +35,9 @@ static void sanitize_boot_params(struct boot_params *boot_params)
*/
if (boot_params->sentinel) {
/* fields in boot_params are left uninitialized, clear them */
- memset(&boot_params->olpc_ofw_header, 0,
+ memset(&boot_params->ext_ramdisk_image, 0,
(char *)&boot_params->efi_info -
- (char *)&boot_params->olpc_ofw_header);
+ (char *)&boot_params->ext_ramdisk_image);
memset(&boot_params->kbd_status, 0,
(char *)&boot_params->hdr -
(char *)&boot_params->kbd_status);
diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h
index 46fc474fd819..f50de6951738 100644
--- a/arch/x86/include/asm/checksum_32.h
+++ b/arch/x86/include/asm/checksum_32.h
@@ -49,9 +49,15 @@ static inline __wsum csum_partial_copy_from_user(const void __user *src,
int len, __wsum sum,
int *err_ptr)
{
+ __wsum ret;
+
might_sleep();
- return csum_partial_copy_generic((__force void *)src, dst,
- len, sum, err_ptr, NULL);
+ stac();
+ ret = csum_partial_copy_generic((__force void *)src, dst,
+ len, sum, err_ptr, NULL);
+ clac();
+
+ return ret;
}
/*
@@ -176,10 +182,16 @@ static inline __wsum csum_and_copy_to_user(const void *src,
int len, __wsum sum,
int *err_ptr)
{
+ __wsum ret;
+
might_sleep();
- if (access_ok(VERIFY_WRITE, dst, len))
- return csum_partial_copy_generic(src, (__force void *)dst,
- len, sum, NULL, err_ptr);
+ if (access_ok(VERIFY_WRITE, dst, len)) {
+ stac();
+ ret = csum_partial_copy_generic(src, (__force void *)dst,
+ len, sum, NULL, err_ptr);
+ clac();
+ return ret;
+ }
if (len)
*err_ptr = -EFAULT;
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 47538a61c91b..d3f5c63078d8 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -366,9 +366,10 @@ extern bool __static_cpu_has_safe(u16 bit);
*/
static __always_inline __pure bool __static_cpu_has(u16 bit)
{
-#if __GNUC__ > 4 || __GNUC_MINOR__ >= 5
+#ifdef CC_HAVE_ASM_GOTO
#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS
+
/*
* Catch too early usage of this before alternatives
* have run.
@@ -384,6 +385,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
".previous\n"
/* skipping size check since replacement size = 0 */
: : "i" (X86_FEATURE_ALWAYS) : : t_warn);
+
#endif
asm goto("1: jmp %l[t_no]\n"
@@ -406,7 +408,9 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
warn_pre_alternatives();
return false;
#endif
-#else /* GCC_VERSION >= 40500 */
+
+#else /* CC_HAVE_ASM_GOTO */
+
u8 flag;
/* Open-coded due to __stringify() in ALTERNATIVE() */
asm volatile("1: movb $0,%0\n"
@@ -427,7 +431,8 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
".previous\n"
: "=qm" (flag) : "i" (bit));
return flag;
-#endif
+
+#endif /* CC_HAVE_ASM_GOTO */
}
#define static_cpu_has(bit) \
@@ -441,7 +446,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
{
-#if __GNUC__ > 4 || __GNUC_MINOR__ >= 5
+#ifdef CC_HAVE_ASM_GOTO
/*
* We need to spell the jumps to the compiler because, depending on the offset,
* the replacement jump can be bigger than the original jump, and this we cannot
@@ -475,7 +480,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
return false;
t_dynamic:
return __static_cpu_has_safe(bit);
-#else /* GCC_VERSION >= 40500 */
+#else
u8 flag;
/* Open-coded due to __stringify() in ALTERNATIVE() */
asm volatile("1: movb $2,%0\n"
@@ -511,7 +516,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
: "=qm" (flag)
: "i" (bit), "i" (X86_FEATURE_ALWAYS));
return (flag == 2 ? __static_cpu_has_safe(bit) : flag);
-#endif
+#endif /* CC_HAVE_ASM_GOTO */
}
#define static_cpu_has_safe(bit) \
diff --git a/arch/x86/include/asm/dma-contiguous.h b/arch/x86/include/asm/dma-contiguous.h
index c09241659971..b4b38bacb404 100644
--- a/arch/x86/include/asm/dma-contiguous.h
+++ b/arch/x86/include/asm/dma-contiguous.h
@@ -4,7 +4,6 @@
#ifdef __KERNEL__
#include <linux/types.h>
-#include <asm-generic/dma-contiguous.h>
static inline void
dma_contiguous_early_fixup(phys_addr_t base, unsigned long size) { }
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index cccd07fa5e3a..779c2efe2e97 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -29,7 +29,7 @@ extern void e820_setup_gap(void);
extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
unsigned long start_addr, unsigned long long end_addr);
struct setup_data;
-extern void parse_e820_ext(struct setup_data *data);
+extern void parse_e820_ext(u64 phys_addr, u32 data_len);
#if defined(CONFIG_X86_64) || \
(defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 2d4b5e6107cd..e42f758a0fbd 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -33,7 +33,7 @@ struct hypervisor_x86 {
const char *name;
/* Detection routine */
- bool (*detect)(void);
+ uint32_t (*detect)(void);
/* Adjust CPU feature bits (run once per CPU) */
void (*set_cpu_features)(struct cpuinfo_x86 *);
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 3a16c1483b45..64507f35800c 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -3,18 +3,23 @@
#ifdef __KERNEL__
+#include <linux/stringify.h>
#include <linux/types.h>
#include <asm/nops.h>
#include <asm/asm.h>
#define JUMP_LABEL_NOP_SIZE 5
-#define STATIC_KEY_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t"
+#ifdef CONFIG_X86_64
+# define STATIC_KEY_INIT_NOP P6_NOP5_ATOMIC
+#else
+# define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC
+#endif
static __always_inline bool arch_static_branch(struct static_key *key)
{
asm goto("1:"
- STATIC_KEY_INITIAL_NOP
+ ".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t"
".pushsection __jump_table, \"aw\" \n\t"
_ASM_ALIGN "\n\t"
_ASM_PTR "1b, %l[l_yes], %c0 \n\t"
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f87f7fcefa0a..c76ff74a98f2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -286,6 +286,7 @@ struct kvm_mmu {
u64 *pae_root;
u64 *lm_root;
u64 rsvd_bits_mask[2][4];
+ u64 bad_mt_xwr;
/*
* Bitmap: bit set = last pte in walk
@@ -323,6 +324,7 @@ struct kvm_pmu {
u64 global_ovf_ctrl;
u64 counter_bitmask[2];
u64 global_ctrl_mask;
+ u64 reserved_bits;
u8 version;
struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED];
@@ -511,6 +513,14 @@ struct kvm_vcpu_arch {
* instruction.
*/
bool write_fault_to_shadow_pgtable;
+
+ /* set at EPT violation at this point */
+ unsigned long exit_qualification;
+
+ /* pv related host specific info */
+ struct {
+ bool pv_unhalted;
+ } pv;
};
struct kvm_lpage_info {
@@ -802,8 +812,8 @@ extern u32 kvm_min_guest_tsc_khz;
extern u32 kvm_max_guest_tsc_khz;
enum emulation_result {
- EMULATE_DONE, /* no further processing */
- EMULATE_DO_MMIO, /* kvm_run filled with mmio request */
+ EMULATE_DONE, /* no further processing */
+ EMULATE_USER_EXIT, /* kvm_run ready for userspace exit */
EMULATE_FAIL, /* can't emulate this instruction */
};
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 695399f2d5eb..1df115909758 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -85,26 +85,20 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
return ret;
}
-static inline bool kvm_para_available(void)
+static inline uint32_t kvm_cpuid_base(void)
{
- unsigned int eax, ebx, ecx, edx;
- char signature[13];
-
if (boot_cpu_data.cpuid_level < 0)
- return false; /* So we don't blow up on old processors */
+ return 0; /* So we don't blow up on old processors */
- if (cpu_has_hypervisor) {
- cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx);
- memcpy(signature + 0, &ebx, 4);
- memcpy(signature + 4, &ecx, 4);
- memcpy(signature + 8, &edx, 4);
- signature[12] = 0;
+ if (cpu_has_hypervisor)
+ return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0);
- if (strcmp(signature, "KVMKVMKVM") == 0)
- return true;
- }
+ return 0;
+}
- return false;
+static inline bool kvm_para_available(void)
+{
+ return kvm_cpuid_base() != 0;
}
static inline unsigned int kvm_arch_para_features(void)
@@ -118,10 +112,20 @@ void kvm_async_pf_task_wait(u32 token);
void kvm_async_pf_task_wake(u32 token);
u32 kvm_read_and_reset_pf_reason(void);
extern void kvm_disable_steal_time(void);
-#else
-#define kvm_guest_init() do { } while (0)
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+void __init kvm_spinlock_init(void);
+#else /* !CONFIG_PARAVIRT_SPINLOCKS */
+static inline void kvm_spinlock_init(void)
+{
+}
+#endif /* CONFIG_PARAVIRT_SPINLOCKS */
+
+#else /* CONFIG_KVM_GUEST */
+#define kvm_guest_init() do {} while (0)
#define kvm_async_pf_task_wait(T) do {} while(0)
#define kvm_async_pf_task_wake(T) do {} while(0)
+
static inline u32 kvm_read_and_reset_pf_reason(void)
{
return 0;
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 29e3093bbd21..cbe6b9e404ce 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -32,11 +32,20 @@
#define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */
#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */
#define MCI_STATUS_AR (1ULL<<55) /* Action required */
-#define MCACOD 0xffff /* MCA Error Code */
+
+/*
+ * Note that the full MCACOD field of IA32_MCi_STATUS MSR is
+ * bits 15:0. But bit 12 is the 'F' bit, defined for corrected
+ * errors to indicate that errors are being filtered by hardware.
+ * We should mask out bit 12 when looking for specific signatures
+ * of uncorrected errors - so the F bit is deliberately skipped
+ * in this #define.
+ */
+#define MCACOD 0xefff /* MCA Error Code */
/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */
#define MCACOD_SCRUB 0x00C0 /* 0xC0-0xCF Memory Scrubbing */
-#define MCACOD_SCRUBMSK 0xfff0
+#define MCACOD_SCRUBMSK 0xeff0 /* Skip bit 12 ('F' bit) */
#define MCACOD_L3WB 0x017A /* L3 Explicit Writeback */
#define MCACOD_DATA 0x0134 /* Data Load */
#define MCACOD_INSTR 0x0150 /* Instruction Fetch */
@@ -188,6 +197,9 @@ extern void register_mce_write_callback(ssize_t (*)(struct file *filp,
const char __user *ubuf,
size_t usize, loff_t *off));
+/* Disable CMCI/polling for MCA bank claimed by firmware */
+extern void mce_disable_bank(int bank);
+
/*
* Exception handler
*/
diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h
index 50e5c58ced23..4c019179a57d 100644
--- a/arch/x86/include/asm/microcode_amd.h
+++ b/arch/x86/include/asm/microcode_amd.h
@@ -59,7 +59,7 @@ static inline u16 find_equiv_id(struct equiv_cpu_entry *equiv_cpu_table,
extern int __apply_microcode_amd(struct microcode_amd *mc_amd);
extern int apply_microcode_amd(int cpu);
-extern enum ucode_state load_microcode_amd(int cpu, const u8 *data, size_t size);
+extern enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size);
#ifdef CONFIG_MICROCODE_AMD_EARLY
#ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index cdbf36776106..be12c534fd59 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -45,22 +45,28 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
/* Re-load page tables */
load_cr3(next->pgd);
- /* stop flush ipis for the previous mm */
+ /* Stop flush ipis for the previous mm */
cpumask_clear_cpu(cpu, mm_cpumask(prev));
- /*
- * load the LDT, if the LDT is different:
- */
+ /* Load the LDT, if the LDT is different: */
if (unlikely(prev->context.ldt != next->context.ldt))
load_LDT_nolock(&next->context);
}
#ifdef CONFIG_SMP
- else {
+ else {
this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
- if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) {
- /* We were in lazy tlb mode and leave_mm disabled
+ if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
+ /*
+ * On established mms, the mm_cpumask is only changed
+ * from irq context, from ptep_clear_flush() while in
+ * lazy tlb mode, and here. Irqs are blocked during
+ * schedule, protecting us from simultaneous changes.
+ */
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+ /*
+ * We were in lazy tlb mode and leave_mm disabled
* tlb flush IPI delivery. We must reload CR3
* to make sure to use no freed page tables.
*/
diff --git a/arch/x86/include/asm/mutex_64.h b/arch/x86/include/asm/mutex_64.h
index 2c543fff241b..e7e6751648ed 100644
--- a/arch/x86/include/asm/mutex_64.h
+++ b/arch/x86/include/asm/mutex_64.h
@@ -16,6 +16,20 @@
*
* Atomically decrements @v and calls <fail_fn> if the result is negative.
*/
+#ifdef CC_HAVE_ASM_GOTO
+static inline void __mutex_fastpath_lock(atomic_t *v,
+ void (*fail_fn)(atomic_t *))
+{
+ asm volatile goto(LOCK_PREFIX " decl %0\n"
+ " jns %l[exit]\n"
+ : : "m" (v->counter)
+ : "memory", "cc"
+ : exit);
+ fail_fn(v);
+exit:
+ return;
+}
+#else
#define __mutex_fastpath_lock(v, fail_fn) \
do { \
unsigned long dummy; \
@@ -32,6 +46,7 @@ do { \
: "rax", "rsi", "rdx", "rcx", \
"r8", "r9", "r10", "r11", "memory"); \
} while (0)
+#endif
/**
* __mutex_fastpath_lock_retval - try to take the lock by moving the count
@@ -56,6 +71,20 @@ static inline int __mutex_fastpath_lock_retval(atomic_t *count)
*
* Atomically increments @v and calls <fail_fn> if the result is nonpositive.
*/
+#ifdef CC_HAVE_ASM_GOTO
+static inline void __mutex_fastpath_unlock(atomic_t *v,
+ void (*fail_fn)(atomic_t *))
+{
+ asm volatile goto(LOCK_PREFIX " incl %0\n"
+ " jg %l[exit]\n"
+ : : "m" (v->counter)
+ : "memory", "cc"
+ : exit);
+ fail_fn(v);
+exit:
+ return;
+}
+#else
#define __mutex_fastpath_unlock(v, fail_fn) \
do { \
unsigned long dummy; \
@@ -72,6 +101,7 @@ do { \
: "rax", "rsi", "rdx", "rcx", \
"r8", "r9", "r10", "r11", "memory"); \
} while (0)
+#endif
#define __mutex_slowpath_needs_to_unlock() 1
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index ef17af013475..f48b17df4224 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -15,6 +15,8 @@
*/
#define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL)
+#define __START_KERNEL_map __PAGE_OFFSET
+
#define THREAD_SIZE_ORDER 1
#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 6c896fbe21db..43dcd804ebd5 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -32,11 +32,6 @@
*/
#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
-#define __PHYSICAL_START ((CONFIG_PHYSICAL_START + \
- (CONFIG_PHYSICAL_ALIGN - 1)) & \
- ~(CONFIG_PHYSICAL_ALIGN - 1))
-
-#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 54c97879195e..f97fbe3abb67 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -33,6 +33,11 @@
(((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+#define __PHYSICAL_START ALIGN(CONFIG_PHYSICAL_START, \
+ CONFIG_PHYSICAL_ALIGN)
+
+#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
+
#ifdef CONFIG_X86_64
#include <asm/page_64_types.h>
#else
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index cfdc9ee4c900..401f350ef71b 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -712,36 +712,16 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
-static inline int arch_spin_is_locked(struct arch_spinlock *lock)
+static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock,
+ __ticket_t ticket)
{
- return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock);
+ PVOP_VCALLEE2(pv_lock_ops.lock_spinning, lock, ticket);
}
-static inline int arch_spin_is_contended(struct arch_spinlock *lock)
+static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock,
+ __ticket_t ticket)
{
- return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock);
-}
-#define arch_spin_is_contended arch_spin_is_contended
-
-static __always_inline void arch_spin_lock(struct arch_spinlock *lock)
-{
- PVOP_VCALL1(pv_lock_ops.spin_lock, lock);
-}
-
-static __always_inline void arch_spin_lock_flags(struct arch_spinlock *lock,
- unsigned long flags)
-{
- PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags);
-}
-
-static __always_inline int arch_spin_trylock(struct arch_spinlock *lock)
-{
- return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock);
-}
-
-static __always_inline void arch_spin_unlock(struct arch_spinlock *lock)
-{
- PVOP_VCALL1(pv_lock_ops.spin_unlock, lock);
+ PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket);
}
#endif
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 0617ff241e8f..aab8f671b523 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -327,13 +327,15 @@ struct pv_mmu_ops {
};
struct arch_spinlock;
+#ifdef CONFIG_SMP
+#include <asm/spinlock_types.h>
+#else
+typedef u16 __ticket_t;
+#endif
+
struct pv_lock_ops {
- int (*spin_is_locked)(struct arch_spinlock *lock);
- int (*spin_is_contended)(struct arch_spinlock *lock);
- void (*spin_lock)(struct arch_spinlock *lock);
- void (*spin_lock_flags)(struct arch_spinlock *lock, unsigned long flags);
- int (*spin_trylock)(struct arch_spinlock *lock);
- void (*spin_unlock)(struct arch_spinlock *lock);
+ struct paravirt_callee_save lock_spinning;
+ void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket);
};
/* This contains all the paravirt structures: we get a convenient
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index d9e9e6c7ed32..7d7443283a9d 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -100,29 +100,6 @@ static inline void early_quirks(void) { }
extern void pci_iommu_alloc(void);
#ifdef CONFIG_PCI_MSI
-/* MSI arch specific hooks */
-static inline int x86_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
-{
- return x86_msi.setup_msi_irqs(dev, nvec, type);
-}
-
-static inline void x86_teardown_msi_irqs(struct pci_dev *dev)
-{
- x86_msi.teardown_msi_irqs(dev);
-}
-
-static inline void x86_teardown_msi_irq(unsigned int irq)
-{
- x86_msi.teardown_msi_irq(irq);
-}
-static inline void x86_restore_msi_irqs(struct pci_dev *dev, int irq)
-{
- x86_msi.restore_msi_irqs(dev, irq);
-}
-#define arch_setup_msi_irqs x86_setup_msi_irqs
-#define arch_teardown_msi_irqs x86_teardown_msi_irqs
-#define arch_teardown_msi_irq x86_teardown_msi_irq
-#define arch_restore_msi_irqs x86_restore_msi_irqs
/* implemented in arch/x86/kernel/apic/io_apic. */
struct msi_desc;
int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
@@ -130,16 +107,9 @@ void native_teardown_msi_irq(unsigned int irq);
void native_restore_msi_irqs(struct pci_dev *dev, int irq);
int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
unsigned int irq_base, unsigned int irq_offset);
-/* default to the implementation in drivers/lib/msi.c */
-#define HAVE_DEFAULT_MSI_TEARDOWN_IRQS
-#define HAVE_DEFAULT_MSI_RESTORE_IRQS
-void default_teardown_msi_irqs(struct pci_dev *dev);
-void default_restore_msi_irqs(struct pci_dev *dev, int irq);
#else
#define native_setup_msi_irqs NULL
#define native_teardown_msi_irq NULL
-#define default_teardown_msi_irqs NULL
-#define default_restore_msi_irqs NULL
#endif
#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index f2b489cf1602..3bf2dd0cf61f 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -55,9 +55,53 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
#endif
+#ifdef CONFIG_MEM_SOFT_DIRTY
+
+/*
+ * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE, _PAGE_BIT_SOFT_DIRTY and
+ * _PAGE_BIT_PROTNONE are taken, split up the 28 bits of offset
+ * into this range.
+ */
+#define PTE_FILE_MAX_BITS 28
+#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1)
+#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1)
+#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1)
+#define PTE_FILE_SHIFT4 (_PAGE_BIT_SOFT_DIRTY + 1)
+#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1)
+#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1)
+#define PTE_FILE_BITS3 (PTE_FILE_SHIFT4 - PTE_FILE_SHIFT3 - 1)
+
+#define pte_to_pgoff(pte) \
+ ((((pte).pte_low >> (PTE_FILE_SHIFT1)) \
+ & ((1U << PTE_FILE_BITS1) - 1))) \
+ + ((((pte).pte_low >> (PTE_FILE_SHIFT2)) \
+ & ((1U << PTE_FILE_BITS2) - 1)) \
+ << (PTE_FILE_BITS1)) \
+ + ((((pte).pte_low >> (PTE_FILE_SHIFT3)) \
+ & ((1U << PTE_FILE_BITS3) - 1)) \
+ << (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \
+ + ((((pte).pte_low >> (PTE_FILE_SHIFT4))) \
+ << (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3))
+
+#define pgoff_to_pte(off) \
+ ((pte_t) { .pte_low = \
+ ((((off)) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \
+ + ((((off) >> PTE_FILE_BITS1) \
+ & ((1U << PTE_FILE_BITS2) - 1)) \
+ << PTE_FILE_SHIFT2) \
+ + ((((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \
+ & ((1U << PTE_FILE_BITS3) - 1)) \
+ << PTE_FILE_SHIFT3) \
+ + ((((off) >> \
+ (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3))) \
+ << PTE_FILE_SHIFT4) \
+ + _PAGE_FILE })
+
+#else /* CONFIG_MEM_SOFT_DIRTY */
+
/*
* Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken,
- * split up the 29 bits of offset into this range:
+ * split up the 29 bits of offset into this range.
*/
#define PTE_FILE_MAX_BITS 29
#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1)
@@ -88,6 +132,8 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
<< PTE_FILE_SHIFT3) \
+ _PAGE_FILE })
+#endif /* CONFIG_MEM_SOFT_DIRTY */
+
/* Encode and de-code a swap entry */
#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 4cc9f2b7cdc3..81bb91b49a88 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -179,6 +179,9 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
/*
* Bits 0, 6 and 7 are taken in the low part of the pte,
* put the 32 bits of offset into the high part.
+ *
+ * For soft-dirty tracking 11 bit is taken from
+ * the low part of pte as well.
*/
#define pte_to_pgoff(pte) ((pte).pte_high)
#define pgoff_to_pte(off) \
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 4e4765908af5..3d1999458709 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -315,6 +315,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);
}
+static inline pte_t pte_file_clear_soft_dirty(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_SOFT_DIRTY);
+}
+
+static inline pte_t pte_file_mksoft_dirty(pte_t pte)
+{
+ return pte_set_flags(pte, _PAGE_SOFT_DIRTY);
+}
+
+static inline int pte_file_soft_dirty(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_SOFT_DIRTY;
+}
+
/*
* Mask out unsupported bits in a present pgprot. Non-present pgprots
* can use those bits for other purposes, so leave them be.
@@ -416,6 +431,7 @@ pte_t *populate_extra_pte(unsigned long vaddr);
#ifndef __ASSEMBLY__
#include <linux/mm_types.h>
+#include <linux/mmdebug.h>
#include <linux/log2.h>
static inline int pte_none(pte_t pte)
@@ -834,6 +850,24 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
{
}
+static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
+{
+ VM_BUG_ON(pte_present(pte));
+ return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);
+}
+
+static inline int pte_swp_soft_dirty(pte_t pte)
+{
+ VM_BUG_ON(pte_present(pte));
+ return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;
+}
+
+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
+{
+ VM_BUG_ON(pte_present(pte));
+ return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
+}
+
#include <asm-generic/pgtable.h>
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index c98ac63aae48..0ecac257fb26 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -61,12 +61,30 @@
* they do not conflict with each other.
*/
+#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_HIDDEN
+
#ifdef CONFIG_MEM_SOFT_DIRTY
-#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
+#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY)
#else
#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0))
#endif
+/*
+ * Tracking soft dirty bit when a page goes to a swap is tricky.
+ * We need a bit which can be stored in pte _and_ not conflict
+ * with swap entry format. On x86 bits 6 and 7 are *not* involved
+ * into swap entry computation, but bit 6 is used for nonlinear
+ * file mapping, so we borrow bit 7 for soft dirty tracking.
+ *
+ * Please note that this bit must be treated as swap dirty page
+ * mark if and only if the PTE has present bit clear!
+ */
+#ifdef CONFIG_MEM_SOFT_DIRTY
+#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE
+#else
+#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0))
+#endif
+
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
#else
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 573c1ad4994e..987c75ecc334 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -942,33 +942,19 @@ extern int set_tsc_mode(unsigned int val);
extern u16 amd_get_nb_id(int cpu);
-struct aperfmperf {
- u64 aperf, mperf;
-};
-
-static inline void get_aperfmperf(struct aperfmperf *am)
+static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
{
- WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF));
-
- rdmsrl(MSR_IA32_APERF, am->aperf);
- rdmsrl(MSR_IA32_MPERF, am->mperf);
-}
+ uint32_t base, eax, signature[3];
-#define APERFMPERF_SHIFT 10
+ for (base = 0x40000000; base < 0x40010000; base += 0x100) {
+ cpuid(base, &eax, &signature[0], &signature[1], &signature[2]);
-static inline
-unsigned long calc_aperfmperf_ratio(struct aperfmperf *old,
- struct aperfmperf *new)
-{
- u64 aperf = new->aperf - old->aperf;
- u64 mperf = new->mperf - old->mperf;
- unsigned long ratio = aperf;
-
- mperf >>= APERFMPERF_SHIFT;
- if (mperf)
- ratio = div64_u64(aperf, mperf);
+ if (!memcmp(sig, signature, 12) &&
+ (leaves == 0 || ((eax - base) >= leaves)))
+ return base;
+ }
- return ratio;
+ return 0;
}
extern unsigned long arch_align_stack(unsigned long sp);
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 109a9dd5d454..be8269b00e2a 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -93,7 +93,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
struct pvclock_vsyscall_time_info {
struct pvclock_vcpu_time_info pvti;
- u32 migrate_count;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 33692eaabab5..bf156ded74b5 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -1,11 +1,14 @@
#ifndef _ASM_X86_SPINLOCK_H
#define _ASM_X86_SPINLOCK_H
+#include <linux/jump_label.h>
#include <linux/atomic.h>
#include <asm/page.h>
#include <asm/processor.h>
#include <linux/compiler.h>
#include <asm/paravirt.h>
+#include <asm/bitops.h>
+
/*
* Your basic SMP spinlocks, allowing only a single CPU anywhere
*
@@ -34,6 +37,36 @@
# define UNLOCK_LOCK_PREFIX
#endif
+/* How long a lock should spin before we consider blocking */
+#define SPIN_THRESHOLD (1 << 15)
+
+extern struct static_key paravirt_ticketlocks_enabled;
+static __always_inline bool static_key_false(struct static_key *key);
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+
+static inline void __ticket_enter_slowpath(arch_spinlock_t *lock)
+{
+ set_bit(0, (volatile unsigned long *)&lock->tickets.tail);
+}
+
+#else /* !CONFIG_PARAVIRT_SPINLOCKS */
+static __always_inline void __ticket_lock_spinning(arch_spinlock_t *lock,
+ __ticket_t ticket)
+{
+}
+static inline void __ticket_unlock_kick(arch_spinlock_t *lock,
+ __ticket_t ticket)
+{
+}
+
+#endif /* CONFIG_PARAVIRT_SPINLOCKS */
+
+static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
+{
+ return lock.tickets.head == lock.tickets.tail;
+}
+
/*
* Ticket locks are conceptually two parts, one indicating the current head of
* the queue, and the other indicating the current tail. The lock is acquired
@@ -47,81 +80,101 @@
* in the high part, because a wide xadd increment of the low part would carry
* up and contaminate the high part.
*/
-static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
+static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
{
- register struct __raw_tickets inc = { .tail = 1 };
+ register struct __raw_tickets inc = { .tail = TICKET_LOCK_INC };
inc = xadd(&lock->tickets, inc);
+ if (likely(inc.head == inc.tail))
+ goto out;
+ inc.tail &= ~TICKET_SLOWPATH_FLAG;
for (;;) {
- if (inc.head == inc.tail)
- break;
- cpu_relax();
- inc.head = ACCESS_ONCE(lock->tickets.head);
+ unsigned count = SPIN_THRESHOLD;
+
+ do {
+ if (ACCESS_ONCE(lock->tickets.head) == inc.tail)
+ goto out;
+ cpu_relax();
+ } while (--count);
+ __ticket_lock_spinning(lock, inc.tail);
}
- barrier(); /* make sure nothing creeps before the lock is taken */
+out: barrier(); /* make sure nothing creeps before the lock is taken */
}
-static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
+static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
{
arch_spinlock_t old, new;
old.tickets = ACCESS_ONCE(lock->tickets);
- if (old.tickets.head != old.tickets.tail)
+ if (old.tickets.head != (old.tickets.tail & ~TICKET_SLOWPATH_FLAG))
return 0;
- new.head_tail = old.head_tail + (1 << TICKET_SHIFT);
+ new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT);
/* cmpxchg is a full barrier, so nothing can move before it */
return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
}
-static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
+static inline void __ticket_unlock_slowpath(arch_spinlock_t *lock,
+ arch_spinlock_t old)
{
- __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
+ arch_spinlock_t new;
+
+ BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS);
+
+ /* Perform the unlock on the "before" copy */
+ old.tickets.head += TICKET_LOCK_INC;
+
+ /* Clear the slowpath flag */
+ new.head_tail = old.head_tail & ~(TICKET_SLOWPATH_FLAG << TICKET_SHIFT);
+
+ /*
+ * If the lock is uncontended, clear the flag - use cmpxchg in
+ * case it changes behind our back though.
+ */
+ if (new.tickets.head != new.tickets.tail ||
+ cmpxchg(&lock->head_tail, old.head_tail,
+ new.head_tail) != old.head_tail) {
+ /*
+ * Lock still has someone queued for it, so wake up an
+ * appropriate waiter.
+ */
+ __ticket_unlock_kick(lock, old.tickets.head);
+ }
}
-static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
+static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
{
- struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
+ if (TICKET_SLOWPATH_FLAG &&
+ static_key_false(&paravirt_ticketlocks_enabled)) {
+ arch_spinlock_t prev;
- return tmp.tail != tmp.head;
-}
+ prev = *lock;
+ add_smp(&lock->tickets.head, TICKET_LOCK_INC);
-static inline int __ticket_spin_is_contended(arch_spinlock_t *lock)
-{
- struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
+ /* add_smp() is a full mb() */
- return (__ticket_t)(tmp.tail - tmp.head) > 1;
+ if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG))
+ __ticket_unlock_slowpath(lock, prev);
+ } else
+ __add(&lock->tickets.head, TICKET_LOCK_INC, UNLOCK_LOCK_PREFIX);
}
-#ifndef CONFIG_PARAVIRT_SPINLOCKS
-
static inline int arch_spin_is_locked(arch_spinlock_t *lock)
{
- return __ticket_spin_is_locked(lock);
-}
-
-static inline int arch_spin_is_contended(arch_spinlock_t *lock)
-{
- return __ticket_spin_is_contended(lock);
-}
-#define arch_spin_is_contended arch_spin_is_contended
+ struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
-static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
-{
- __ticket_spin_lock(lock);
+ return tmp.tail != tmp.head;
}
-static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
+static inline int arch_spin_is_contended(arch_spinlock_t *lock)
{
- return __ticket_spin_trylock(lock);
-}
+ struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
-static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
-{
- __ticket_spin_unlock(lock);
+ return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC;
}
+#define arch_spin_is_contended arch_spin_is_contended
static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
unsigned long flags)
@@ -129,8 +182,6 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
arch_spin_lock(lock);
}
-#endif /* CONFIG_PARAVIRT_SPINLOCKS */
-
static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
{
while (arch_spin_is_locked(lock))
@@ -233,8 +284,4 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
#define arch_read_relax(lock) cpu_relax()
#define arch_write_relax(lock) cpu_relax()
-/* The {read|write|spin}_lock() on x86 are full memory barriers. */
-static inline void smp_mb__after_lock(void) { }
-#define ARCH_HAS_SMP_MB_AFTER_LOCK
-
#endif /* _ASM_X86_SPINLOCK_H */
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index ad0ad07fc006..4f1bea19945b 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -1,13 +1,17 @@
#ifndef _ASM_X86_SPINLOCK_TYPES_H
#define _ASM_X86_SPINLOCK_TYPES_H
-#ifndef __LINUX_SPINLOCK_TYPES_H
-# error "please don't include this file directly"
-#endif
-
#include <linux/types.h>
-#if (CONFIG_NR_CPUS < 256)
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define __TICKET_LOCK_INC 2
+#define TICKET_SLOWPATH_FLAG ((__ticket_t)1)
+#else
+#define __TICKET_LOCK_INC 1
+#define TICKET_SLOWPATH_FLAG ((__ticket_t)0)
+#endif
+
+#if (CONFIG_NR_CPUS < (256 / __TICKET_LOCK_INC))
typedef u8 __ticket_t;
typedef u16 __ticketpair_t;
#else
@@ -15,6 +19,8 @@ typedef u16 __ticket_t;
typedef u32 __ticketpair_t;
#endif
+#define TICKET_LOCK_INC ((__ticket_t)__TICKET_LOCK_INC)
+
#define TICKET_SHIFT (sizeof(__ticket_t) * 8)
typedef struct arch_spinlock {
diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h
index 9d09b4073b60..05af3b31d522 100644
--- a/arch/x86/include/asm/sync_bitops.h
+++ b/arch/x86/include/asm/sync_bitops.h
@@ -26,9 +26,9 @@
* Note that @nr may be almost arbitrarily large; this function is not
* restricted to acting on a single-word quantity.
*/
-static inline void sync_set_bit(int nr, volatile unsigned long *addr)
+static inline void sync_set_bit(long nr, volatile unsigned long *addr)
{
- asm volatile("lock; btsl %1,%0"
+ asm volatile("lock; bts %1,%0"
: "+m" (ADDR)
: "Ir" (nr)
: "memory");
@@ -44,9 +44,9 @@ static inline void sync_set_bit(int nr, volatile unsigned long *addr)
* you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
* in order to ensure changes are visible on other processors.
*/
-static inline void sync_clear_bit(int nr, volatile unsigned long *addr)
+static inline void sync_clear_bit(long nr, volatile unsigned long *addr)
{
- asm volatile("lock; btrl %1,%0"
+ asm volatile("lock; btr %1,%0"
: "+m" (ADDR)
: "Ir" (nr)
: "memory");
@@ -61,9 +61,9 @@ static inline void sync_clear_bit(int nr, volatile unsigned long *addr)
* Note that @nr may be almost arbitrarily large; this function is not
* restricted to acting on a single-word quantity.
*/
-static inline void sync_change_bit(int nr, volatile unsigned long *addr)
+static inline void sync_change_bit(long nr, volatile unsigned long *addr)
{
- asm volatile("lock; btcl %1,%0"
+ asm volatile("lock; btc %1,%0"
: "+m" (ADDR)
: "Ir" (nr)
: "memory");
@@ -77,11 +77,11 @@ static inline void sync_change_bit(int nr, volatile unsigned long *addr)
* This operation is atomic and cannot be reordered.
* It also implies a memory barrier.
*/
-static inline int sync_test_and_set_bit(int nr, volatile unsigned long *addr)
+static inline int sync_test_and_set_bit(long nr, volatile unsigned long *addr)
{
int oldbit;
- asm volatile("lock; btsl %2,%1\n\tsbbl %0,%0"
+ asm volatile("lock; bts %2,%1\n\tsbbl %0,%0"
: "=r" (oldbit), "+m" (ADDR)
: "Ir" (nr) : "memory");
return oldbit;
@@ -95,11 +95,11 @@ static inline int sync_test_and_set_bit(int nr, volatile unsigned long *addr)
* This operation is atomic and cannot be reordered.
* It also implies a memory barrier.
*/
-static inline int sync_test_and_clear_bit(int nr, volatile unsigned long *addr)
+static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr)
{
int oldbit;
- asm volatile("lock; btrl %2,%1\n\tsbbl %0,%0"
+ asm volatile("lock; btr %2,%1\n\tsbbl %0,%0"
: "=r" (oldbit), "+m" (ADDR)
: "Ir" (nr) : "memory");
return oldbit;
@@ -113,11 +113,11 @@ static inline int sync_test_and_clear_bit(int nr, volatile unsigned long *addr)
* This operation is atomic and cannot be reordered.
* It also implies a memory barrier.
*/
-static inline int sync_test_and_change_bit(int nr, volatile unsigned long *addr)
+static inline int sync_test_and_change_bit(long nr, volatile unsigned long *addr)
{
int oldbit;
- asm volatile("lock; btcl %2,%1\n\tsbbl %0,%0"
+ asm volatile("lock; btc %2,%1\n\tsbbl %0,%0"
: "=r" (oldbit), "+m" (ADDR)
: "Ir" (nr) : "memory");
return oldbit;
diff --git a/arch/x86/include/asm/sysfb.h b/arch/x86/include/asm/sysfb.h
new file mode 100644
index 000000000000..2aeb3e25579c
--- /dev/null
+++ b/arch/x86/include/asm/sysfb.h
@@ -0,0 +1,98 @@
+#ifndef _ARCH_X86_KERNEL_SYSFB_H
+#define _ARCH_X86_KERNEL_SYSFB_H
+
+/*
+ * Generic System Framebuffers on x86
+ * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/platform_data/simplefb.h>
+#include <linux/screen_info.h>
+
+enum {
+ M_I17, /* 17-Inch iMac */
+ M_I20, /* 20-Inch iMac */
+ M_I20_SR, /* 20-Inch iMac (Santa Rosa) */
+ M_I24, /* 24-Inch iMac */
+ M_I24_8_1, /* 24-Inch iMac, 8,1th gen */
+ M_I24_10_1, /* 24-Inch iMac, 10,1th gen */
+ M_I27_11_1, /* 27-Inch iMac, 11,1th gen */
+ M_MINI, /* Mac Mini */
+ M_MINI_3_1, /* Mac Mini, 3,1th gen */
+ M_MINI_4_1, /* Mac Mini, 4,1th gen */
+ M_MB, /* MacBook */
+ M_MB_2, /* MacBook, 2nd rev. */
+ M_MB_3, /* MacBook, 3rd rev. */
+ M_MB_5_1, /* MacBook, 5th rev. */
+ M_MB_6_1, /* MacBook, 6th rev. */
+ M_MB_7_1, /* MacBook, 7th rev. */
+ M_MB_SR, /* MacBook, 2nd gen, (Santa Rosa) */
+ M_MBA, /* MacBook Air */
+ M_MBA_3, /* Macbook Air, 3rd rev */
+ M_MBP, /* MacBook Pro */
+ M_MBP_2, /* MacBook Pro 2nd gen */
+ M_MBP_2_2, /* MacBook Pro 2,2nd gen */
+ M_MBP_SR, /* MacBook Pro (Santa Rosa) */
+ M_MBP_4, /* MacBook Pro, 4th gen */
+ M_MBP_5_1, /* MacBook Pro, 5,1th gen */
+ M_MBP_5_2, /* MacBook Pro, 5,2th gen */
+ M_MBP_5_3, /* MacBook Pro, 5,3rd gen */
+ M_MBP_6_1, /* MacBook Pro, 6,1th gen */
+ M_MBP_6_2, /* MacBook Pro, 6,2th gen */
+ M_MBP_7_1, /* MacBook Pro, 7,1th gen */
+ M_MBP_8_2, /* MacBook Pro, 8,2nd gen */
+ M_UNKNOWN /* placeholder */
+};
+
+struct efifb_dmi_info {
+ char *optname;
+ unsigned long base;
+ int stride;
+ int width;
+ int height;
+ int flags;
+};
+
+#ifdef CONFIG_EFI
+
+extern struct efifb_dmi_info efifb_dmi_list[];
+void sysfb_apply_efi_quirks(void);
+
+#else /* CONFIG_EFI */
+
+static inline void sysfb_apply_efi_quirks(void)
+{
+}
+
+#endif /* CONFIG_EFI */
+
+#ifdef CONFIG_X86_SYSFB
+
+bool parse_mode(const struct screen_info *si,
+ struct simplefb_platform_data *mode);
+int create_simplefb(const struct screen_info *si,
+ const struct simplefb_platform_data *mode);
+
+#else /* CONFIG_X86_SYSFB */
+
+static inline bool parse_mode(const struct screen_info *si,
+ struct simplefb_platform_data *mode)
+{
+ return false;
+}
+
+static inline int create_simplefb(const struct screen_info *si,
+ const struct simplefb_platform_data *mode)
+{
+ return -EINVAL;
+}
+
+#endif /* CONFIG_X86_SYSFB */
+
+#endif /* _ARCH_X86_KERNEL_SYSFB_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index cf512003e663..e6d90babc245 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -62,6 +62,7 @@ static inline void __flush_tlb_all(void)
static inline void __flush_tlb_one(unsigned long addr)
{
+ count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
__flush_tlb_single(addr);
}
@@ -84,14 +85,38 @@ static inline void __flush_tlb_one(unsigned long addr)
#ifndef CONFIG_SMP
-#define flush_tlb() __flush_tlb()
-#define flush_tlb_all() __flush_tlb_all()
-#define local_flush_tlb() __flush_tlb()
+/* "_up" is for UniProcessor.
+ *
+ * This is a helper for other header functions. *Not* intended to be called
+ * directly. All global TLB flushes need to either call this, or to bump the
+ * vm statistics themselves.
+ */
+static inline void __flush_tlb_up(void)
+{
+ count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+ __flush_tlb();
+}
+
+static inline void flush_tlb_all(void)
+{
+ count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+ __flush_tlb_all();
+}
+
+static inline void flush_tlb(void)
+{
+ __flush_tlb_up();
+}
+
+static inline void local_flush_tlb(void)
+{
+ __flush_tlb_up();
+}
static inline void flush_tlb_mm(struct mm_struct *mm)
{
if (mm == current->active_mm)
- __flush_tlb();
+ __flush_tlb_up();
}
static inline void flush_tlb_page(struct vm_area_struct *vma,
@@ -105,14 +130,14 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
if (vma->vm_mm == current->active_mm)
- __flush_tlb();
+ __flush_tlb_up();
}
static inline void flush_tlb_mm_range(struct mm_struct *mm,
unsigned long start, unsigned long end, unsigned long vmflag)
{
if (mm == current->active_mm)
- __flush_tlb();
+ __flush_tlb_up();
}
static inline void native_flush_tlb_others(const struct cpumask *cpumask,
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 095b21507b6a..d35f24e231cd 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -124,9 +124,6 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu);
#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id)
#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
#define topology_thread_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu))
-
-/* indicates that pointers to the topology cpumask_t maps are valid */
-#define arch_provides_topology_pointers yes
#endif
static inline void arch_fix_phys_package_id(int num, u32 slot)
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index c91e8b9d588b..235be70d5bb4 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -49,6 +49,7 @@ extern void tsc_init(void);
extern void mark_tsc_unstable(char *reason);
extern int unsynchronized_tsc(void);
extern int check_tsc_unstable(void);
+extern int check_tsc_disabled(void);
extern unsigned long native_calibrate_tsc(void);
extern int tsc_clocksource_reliable;
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 5ee26875baea..5838fa911aa0 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -153,16 +153,19 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
* Careful: we have to cast the result to the type of the pointer
* for sign reasons.
*
- * The use of %edx as the register specifier is a bit of a
+ * The use of _ASM_DX as the register specifier is a bit of a
* simplification, as gcc only cares about it as the starting point
* and not size: for a 64-bit value it will use %ecx:%edx on 32 bits
* (%ecx being the next register in gcc's x86 register sequence), and
* %rdx on 64 bits.
+ *
+ * Clang/LLVM cares about the size of the register, but still wants
+ * the base register for something that ends up being a pair.
*/
#define get_user(x, ptr) \
({ \
int __ret_gu; \
- register __inttype(*(ptr)) __val_gu asm("%edx"); \
+ register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \
__chk_user_ptr(ptr); \
might_fault(); \
asm volatile("call __get_user_%P3" \
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index f3e01a2cbaa1..966502d4682e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -387,6 +387,7 @@ enum vmcs_field {
#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0
#define VMX_EPT_EXTENT_CONTEXT 1
#define VMX_EPT_EXTENT_GLOBAL 2
+#define VMX_EPT_EXTENT_SHIFT 24
#define VMX_EPT_EXECUTE_ONLY_BIT (1ull)
#define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6)
@@ -394,6 +395,7 @@ enum vmcs_field {
#define VMX_EPTP_WB_BIT (1ull << 14)
#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
#define VMX_EPT_1GB_PAGE_BIT (1ull << 17)
+#define VMX_EPT_INVEPT_BIT (1ull << 20)
#define VMX_EPT_AD_BIT (1ull << 21)
#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h
index ca842f2769ef..608a79d5a466 100644
--- a/arch/x86/include/asm/xen/events.h
+++ b/arch/x86/include/asm/xen/events.h
@@ -7,6 +7,7 @@ enum ipi_vector {
XEN_CALL_FUNCTION_SINGLE_VECTOR,
XEN_SPIN_UNLOCK_VECTOR,
XEN_IRQ_WORK_VECTOR,
+ XEN_NMI_VECTOR,
XEN_NR_IPIS,
};
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index 125f344f06a9..d866959e5685 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -40,21 +40,7 @@ extern struct start_info *xen_start_info;
static inline uint32_t xen_cpuid_base(void)
{
- uint32_t base, eax, ebx, ecx, edx;
- char signature[13];
-
- for (base = 0x40000000; base < 0x40010000; base += 0x100) {
- cpuid(base, &eax, &ebx, &ecx, &edx);
- *(uint32_t *)(signature + 0) = ebx;
- *(uint32_t *)(signature + 4) = ecx;
- *(uint32_t *)(signature + 8) = edx;
- signature[12] = 0;
-
- if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2))
- return base;
- }
-
- return 0;
+ return hypervisor_cpuid_base("XenVMMXenVMM", 2);
}
#ifdef CONFIG_XEN
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
index 7ea79c5fa1f2..492b29802f57 100644
--- a/arch/x86/include/asm/xor_avx.h
+++ b/arch/x86/include/asm/xor_avx.h
@@ -167,12 +167,12 @@ static struct xor_block_template xor_block_avx = {
#define AVX_XOR_SPEED \
do { \
- if (cpu_has_avx) \
+ if (cpu_has_avx && cpu_has_osxsave) \
xor_speed(&xor_block_avx); \
} while (0)
#define AVX_SELECT(FASTEST) \
- (cpu_has_avx ? &xor_block_avx : FASTEST)
+ (cpu_has_avx && cpu_has_osxsave ? &xor_block_avx : FASTEST)
#else
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 06fdbd987e97..94dc8ca434e0 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -23,6 +23,7 @@
#define KVM_FEATURE_ASYNC_PF 4
#define KVM_FEATURE_STEAL_TIME 5
#define KVM_FEATURE_PV_EOI 6
+#define KVM_FEATURE_PV_UNHALT 7
/* The last 8 bits are used to indicate how to interpret the flags field
* in pvclock structure. If no bits are set, all flags are ignored.
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index d651082c7cf7..0e79420376eb 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -65,6 +65,7 @@
#define EXIT_REASON_EOI_INDUCED 45
#define EXIT_REASON_EPT_VIOLATION 48
#define EXIT_REASON_EPT_MISCONFIG 49
+#define EXIT_REASON_INVEPT 50
#define EXIT_REASON_PREEMPTION_TIMER 52
#define EXIT_REASON_WBINVD 54
#define EXIT_REASON_XSETBV 55
@@ -106,12 +107,13 @@
{ EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \
{ EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \
{ EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \
+ { EXIT_REASON_INVEPT, "INVEPT" }, \
+ { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }, \
{ EXIT_REASON_WBINVD, "WBINVD" }, \
{ EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \
{ EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \
{ EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
{ EXIT_REASON_INVD, "INVD" }, \
- { EXIT_REASON_INVPCID, "INVPCID" }, \
- { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }
+ { EXIT_REASON_INVPCID, "INVPCID" }
#endif /* _UAPIVMX_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 88d99ea77723..a5408b965c9d 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -103,6 +103,9 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
obj-$(CONFIG_OF) += devicetree.o
obj-$(CONFIG_UPROBES) += uprobes.o
+obj-y += sysfb.o
+obj-$(CONFIG_X86_SYSFB) += sysfb_simplefb.o
+obj-$(CONFIG_EFI) += sysfb_efi.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 2627a81253ee..40c76604199f 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -67,6 +67,7 @@ EXPORT_SYMBOL(acpi_pci_disabled);
int acpi_lapic;
int acpi_ioapic;
int acpi_strict;
+int acpi_disable_cmcff;
u8 acpi_sci_flags __initdata;
int acpi_sci_override_gsi __initdata;
@@ -141,16 +142,8 @@ static u32 irq_to_gsi(int irq)
}
/*
- * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
- * to map the target physical address. The problem is that set_fixmap()
- * provides a single page, and it is possible that the page is not
- * sufficient.
- * By using this area, we can map up to MAX_IO_APICS pages temporarily,
- * i.e. until the next __va_range() call.
- *
- * Important Safety Note: The fixed I/O APIC page numbers are *subtracted*
- * from the fixed base. That's why we start at FIX_IO_APIC_BASE_END and
- * count idx down while incrementing the phys address.
+ * This is just a simple wrapper around early_ioremap(),
+ * with sanity checks for phys == 0 and size == 0.
*/
char *__init __acpi_map_table(unsigned long phys, unsigned long size)
{
@@ -160,6 +153,7 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
return early_ioremap(phys, size);
}
+
void __init __acpi_unmap_table(char *map, unsigned long size)
{
if (!map || !size)
@@ -199,7 +193,7 @@ static void acpi_register_lapic(int id, u8 enabled)
{
unsigned int ver = 0;
- if (id >= (MAX_LOCAL_APIC-1)) {
+ if (id >= MAX_LOCAL_APIC) {
printk(KERN_INFO PREFIX "skipped apicid that is too big\n");
return;
}
@@ -1120,6 +1114,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
int ioapic;
int ioapic_pin;
struct io_apic_irq_attr irq_attr;
+ int ret;
if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
return gsi;
@@ -1149,7 +1144,9 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
- io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr);
+ ret = io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr);
+ if (ret < 0)
+ gsi = INT_MIN;
return gsi;
}
@@ -1626,6 +1623,10 @@ static int __init parse_acpi(char *arg)
/* "acpi=copy_dsdt" copys DSDT */
else if (strcmp(arg, "copy_dsdt") == 0) {
acpi_gbl_copy_dsdt_locally = 1;
+ }
+ /* "acpi=nocmcff" disables FF mode for corrected errors */
+ else if (strcmp(arg, "nocmcff") == 0) {
+ acpi_disable_cmcff = 1;
} else {
/* Core will printk when we return error. */
return -EINVAL;
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index c15cf9a25e27..15e8563e5c24 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -11,6 +11,7 @@
#include <linux/memory.h>
#include <linux/stop_machine.h>
#include <linux/slab.h>
+#include <linux/kdebug.h>
#include <asm/alternative.h>
#include <asm/sections.h>
#include <asm/pgtable.h>
@@ -596,97 +597,93 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
return addr;
}
-/*
- * Cross-modifying kernel text with stop_machine().
- * This code originally comes from immediate value.
- */
-static atomic_t stop_machine_first;
-static int wrote_text;
+static void do_sync_core(void *info)
+{
+ sync_core();
+}
-struct text_poke_params {
- struct text_poke_param *params;
- int nparams;
-};
+static bool bp_patching_in_progress;
+static void *bp_int3_handler, *bp_int3_addr;
-static int __kprobes stop_machine_text_poke(void *data)
+int poke_int3_handler(struct pt_regs *regs)
{
- struct text_poke_params *tpp = data;
- struct text_poke_param *p;
- int i;
+ /* bp_patching_in_progress */
+ smp_rmb();
- if (atomic_xchg(&stop_machine_first, 0)) {
- for (i = 0; i < tpp->nparams; i++) {
- p = &tpp->params[i];
- text_poke(p->addr, p->opcode, p->len);
- }
- smp_wmb(); /* Make sure other cpus see that this has run */
- wrote_text = 1;
- } else {
- while (!wrote_text)
- cpu_relax();
- smp_mb(); /* Load wrote_text before following execution */
- }
+ if (likely(!bp_patching_in_progress))
+ return 0;
- for (i = 0; i < tpp->nparams; i++) {
- p = &tpp->params[i];
- flush_icache_range((unsigned long)p->addr,
- (unsigned long)p->addr + p->len);
- }
- /*
- * Intel Archiecture Software Developer's Manual section 7.1.3 specifies
- * that a core serializing instruction such as "cpuid" should be
- * executed on _each_ core before the new instruction is made visible.
- */
- sync_core();
- return 0;
-}
+ if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr)
+ return 0;
+
+ /* set up the specified breakpoint handler */
+ regs->ip = (unsigned long) bp_int3_handler;
+
+ return 1;
-/**
- * text_poke_smp - Update instructions on a live kernel on SMP
- * @addr: address to modify
- * @opcode: source of the copy
- * @len: length to copy
- *
- * Modify multi-byte instruction by using stop_machine() on SMP. This allows
- * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying
- * should be allowed, since stop_machine() does _not_ protect code against
- * NMI and MCE.
- *
- * Note: Must be called under get_online_cpus() and text_mutex.
- */
-void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
-{
- struct text_poke_params tpp;
- struct text_poke_param p;
-
- p.addr = addr;
- p.opcode = opcode;
- p.len = len;
- tpp.params = &p;
- tpp.nparams = 1;
- atomic_set(&stop_machine_first, 1);
- wrote_text = 0;
- /* Use __stop_machine() because the caller already got online_cpus. */
- __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask);
- return addr;
}
/**
- * text_poke_smp_batch - Update instructions on a live kernel on SMP
- * @params: an array of text_poke parameters
- * @n: the number of elements in params.
+ * text_poke_bp() -- update instructions on live kernel on SMP
+ * @addr: address to patch
+ * @opcode: opcode of new instruction
+ * @len: length to copy
+ * @handler: address to jump to when the temporary breakpoint is hit
*
- * Modify multi-byte instruction by using stop_machine() on SMP. Since the
- * stop_machine() is heavy task, it is better to aggregate text_poke requests
- * and do it once if possible.
+ * Modify multi-byte instruction by using int3 breakpoint on SMP.
+ * We completely avoid stop_machine() here, and achieve the
+ * synchronization using int3 breakpoint.
*
- * Note: Must be called under get_online_cpus() and text_mutex.
+ * The way it is done:
+ * - add a int3 trap to the address that will be patched
+ * - sync cores
+ * - update all but the first byte of the patched range
+ * - sync cores
+ * - replace the first byte (int3) by the first byte of
+ * replacing opcode
+ * - sync cores
+ *
+ * Note: must be called under text_mutex.
*/
-void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n)
+void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
{
- struct text_poke_params tpp = {.params = params, .nparams = n};
+ unsigned char int3 = 0xcc;
+
+ bp_int3_handler = handler;
+ bp_int3_addr = (u8 *)addr + sizeof(int3);
+ bp_patching_in_progress = true;
+ /*
+ * Corresponding read barrier in int3 notifier for
+ * making sure the in_progress flags is correctly ordered wrt.
+ * patching
+ */
+ smp_wmb();
+
+ text_poke(addr, &int3, sizeof(int3));
- atomic_set(&stop_machine_first, 1);
- wrote_text = 0;
- __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask);
+ on_each_cpu(do_sync_core, NULL, 1);
+
+ if (len - sizeof(int3) > 0) {
+ /* patch all but the first byte */
+ text_poke((char *)addr + sizeof(int3),
+ (const char *) opcode + sizeof(int3),
+ len - sizeof(int3));
+ /*
+ * According to Intel, this core syncing is very likely
+ * not necessary and we'd be safe even without it. But
+ * better safe than sorry (plus there's not only Intel).
+ */
+ on_each_cpu(do_sync_core, NULL, 1);
+ }
+
+ /* patch the first byte */
+ text_poke(addr, opcode, sizeof(int3));
+
+ on_each_cpu(do_sync_core, NULL, 1);
+
+ bp_patching_in_progress = false;
+ smp_wmb();
+
+ return addr;
}
+
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 3048ded1b598..59554dca96ec 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -20,6 +20,7 @@ const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
{}
};
@@ -27,6 +28,7 @@ EXPORT_SYMBOL(amd_nb_misc_ids);
static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) },
{}
};
@@ -81,13 +83,20 @@ int amd_cache_northbridges(void)
next_northbridge(misc, amd_nb_misc_ids);
node_to_amd_nb(i)->link = link =
next_northbridge(link, amd_nb_link_ids);
- }
+ }
+ /* GART present only on Fam15h upto model 0fh */
if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
- boot_cpu_data.x86 == 0x15)
+ (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10))
amd_northbridges.flags |= AMD_NB_GART;
/*
+ * Check for L3 cache presence.
+ */
+ if (!cpuid_edx(0x80000006))
+ return 0;
+
+ /*
* Some CPU families support L3 Cache Index Disable. There are some
* limitations because of E382 and E388 on family 0x10.
*/
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 9ed796ccc32c..e63a5bd2a78f 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1534,6 +1534,11 @@ void intel_ir_io_apic_print_entries(unsigned int apic,
}
}
+void ioapic_zap_locks(void)
+{
+ raw_spin_lock_init(&ioapic_lock);
+}
+
__apicdebuginit(void) print_IO_APIC(int ioapic_idx)
{
union IO_APIC_reg_00 reg_00;
@@ -3375,12 +3380,15 @@ int io_apic_setup_irq_pin_once(unsigned int irq, int node,
{
unsigned int ioapic_idx = attr->ioapic, pin = attr->ioapic_pin;
int ret;
+ struct IO_APIC_route_entry orig_entry;
/* Avoid redundant programming */
if (test_bit(pin, ioapics[ioapic_idx].pin_programmed)) {
- pr_debug("Pin %d-%d already programmed\n",
- mpc_ioapic_id(ioapic_idx), pin);
- return 0;
+ pr_debug("Pin %d-%d already programmed\n", mpc_ioapic_id(ioapic_idx), pin);
+ orig_entry = ioapic_read_entry(attr->ioapic, pin);
+ if (attr->trigger == orig_entry.trigger && attr->polarity == orig_entry.polarity)
+ return 0;
+ return -EBUSY;
}
ret = io_apic_setup_irq_pin(irq, node, attr);
if (!ret)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 466e3d15de12..903a264af981 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -512,7 +512,7 @@ static void early_init_amd(struct cpuinfo_x86 *c)
static const int amd_erratum_383[];
static const int amd_erratum_400[];
-static bool cpu_has_amd_erratum(const int *erratum);
+static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
static void init_amd(struct cpuinfo_x86 *c)
{
@@ -729,11 +729,11 @@ static void init_amd(struct cpuinfo_x86 *c)
value &= ~(1ULL << 24);
wrmsrl_safe(MSR_AMD64_BU_CFG2, value);
- if (cpu_has_amd_erratum(amd_erratum_383))
+ if (cpu_has_amd_erratum(c, amd_erratum_383))
set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
}
- if (cpu_has_amd_erratum(amd_erratum_400))
+ if (cpu_has_amd_erratum(c, amd_erratum_400))
set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
@@ -878,23 +878,13 @@ static const int amd_erratum_400[] =
static const int amd_erratum_383[] =
AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
-static bool cpu_has_amd_erratum(const int *erratum)
+
+static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
{
- struct cpuinfo_x86 *cpu = __this_cpu_ptr(&cpu_info);
int osvw_id = *erratum++;
u32 range;
u32 ms;
- /*
- * If called early enough that current_cpu_data hasn't been initialized
- * yet, fall back to boot_cpu_data.
- */
- if (cpu->x86 == 0)
- cpu = &boot_cpu_data;
-
- if (cpu->x86_vendor != X86_VENDOR_AMD)
- return false;
-
if (osvw_id >= 0 && osvw_id < 65536 &&
cpu_has(cpu, X86_FEATURE_OSVW)) {
u64 osvw_len;
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 87279212d318..36ce402a3fa5 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -25,11 +25,6 @@
#include <asm/processor.h>
#include <asm/hypervisor.h>
-/*
- * Hypervisor detect order. This is specified explicitly here because
- * some hypervisors might implement compatibility modes for other
- * hypervisors and therefore need to be detected in specific sequence.
- */
static const __initconst struct hypervisor_x86 * const hypervisors[] =
{
#ifdef CONFIG_XEN_PVHVM
@@ -49,15 +44,19 @@ static inline void __init
detect_hypervisor_vendor(void)
{
const struct hypervisor_x86 *h, * const *p;
+ uint32_t pri, max_pri = 0;
for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
h = *p;
- if (h->detect()) {
+ pri = h->detect();
+ if (pri != 0 && pri > max_pri) {
+ max_pri = pri;
x86_hyper = h;
- printk(KERN_INFO "Hypervisor detected: %s\n", h->name);
- break;
}
}
+
+ if (max_pri)
+ printk(KERN_INFO "Hypervisor detected: %s\n", x86_hyper->name);
}
void init_hypervisor(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 5b7d4fa5d3b7..09edd0b65fef 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -25,15 +25,18 @@ int mce_severity(struct mce *a, int tolerant, char **msg);
struct dentry *mce_get_debugfs_dir(void);
extern struct mce_bank *mce_banks;
+extern mce_banks_t mce_banks_ce_disabled;
#ifdef CONFIG_X86_MCE_INTEL
unsigned long mce_intel_adjust_timer(unsigned long interval);
void mce_intel_cmci_poll(void);
void mce_intel_hcpu_update(unsigned long cpu);
+void cmci_disable_bank(int bank);
#else
# define mce_intel_adjust_timer mce_adjust_timer_default
static inline void mce_intel_cmci_poll(void) { }
static inline void mce_intel_hcpu_update(unsigned long cpu) { }
+static inline void cmci_disable_bank(int bank) { }
#endif
void mce_timer_kick(unsigned long interval);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 87a65c939bcd..b3218cdee95f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -97,6 +97,15 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};
+/*
+ * MCA banks controlled through firmware first for corrected errors.
+ * This is a global list of banks for which we won't enable CMCI and we
+ * won't poll. Firmware controls these banks and is responsible for
+ * reporting corrected errors through GHES. Uncorrected/recoverable
+ * errors are still notified through a machine check.
+ */
+mce_banks_t mce_banks_ce_disabled;
+
static DEFINE_PER_CPU(struct work_struct, mce_work);
static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
@@ -1935,6 +1944,25 @@ static struct miscdevice mce_chrdev_device = {
&mce_chrdev_ops,
};
+static void __mce_disable_bank(void *arg)
+{
+ int bank = *((int *)arg);
+ __clear_bit(bank, __get_cpu_var(mce_poll_banks));
+ cmci_disable_bank(bank);
+}
+
+void mce_disable_bank(int bank)
+{
+ if (bank >= mca_cfg.banks) {
+ pr_warn(FW_BUG
+ "Ignoring request to disable invalid MCA bank %d.\n",
+ bank);
+ return;
+ }
+ set_bit(bank, mce_banks_ce_disabled);
+ on_each_cpu(__mce_disable_bank, &bank, 1);
+}
+
/*
* mce=off Disables machine check
* mce=no_cmci Disables CMCI
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index d56405309dc1..4cfe0458ca66 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -203,6 +203,10 @@ static void cmci_discover(int banks)
if (test_bit(i, owned))
continue;
+ /* Skip banks in firmware first mode */
+ if (test_bit(i, mce_banks_ce_disabled))
+ continue;
+
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
/* Already owned by someone else? */
@@ -271,6 +275,19 @@ void cmci_recheck(void)
local_irq_restore(flags);
}
+/* Caller must hold the lock on cmci_discover_lock */
+static void __cmci_disable_bank(int bank)
+{
+ u64 val;
+
+ if (!test_bit(bank, __get_cpu_var(mce_banks_owned)))
+ return;
+ rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ val &= ~MCI_CTL2_CMCI_EN;
+ wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ __clear_bit(bank, __get_cpu_var(mce_banks_owned));
+}
+
/*
* Disable CMCI on this CPU for all banks it owns when it goes down.
* This allows other CPUs to claim the banks on rediscovery.
@@ -280,20 +297,12 @@ void cmci_clear(void)
unsigned long flags;
int i;
int banks;
- u64 val;
if (!cmci_supported(&banks))
return;
raw_spin_lock_irqsave(&cmci_discover_lock, flags);
- for (i = 0; i < banks; i++) {
- if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
- continue;
- /* Disable CMCI */
- rdmsrl(MSR_IA32_MCx_CTL2(i), val);
- val &= ~MCI_CTL2_CMCI_EN;
- wrmsrl(MSR_IA32_MCx_CTL2(i), val);
- __clear_bit(i, __get_cpu_var(mce_banks_owned));
- }
+ for (i = 0; i < banks; i++)
+ __cmci_disable_bank(i);
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
@@ -327,6 +336,19 @@ void cmci_reenable(void)
cmci_discover(banks);
}
+void cmci_disable_bank(int bank)
+{
+ int banks;
+ unsigned long flags;
+
+ if (!cmci_supported(&banks))
+ return;
+
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ __cmci_disable_bank(bank);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
static void intel_init_cmci(void)
{
int banks;
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 8f4be53ea04b..71a39f3621ba 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -27,20 +27,23 @@
struct ms_hyperv_info ms_hyperv;
EXPORT_SYMBOL_GPL(ms_hyperv);
-static bool __init ms_hyperv_platform(void)
+static uint32_t __init ms_hyperv_platform(void)
{
u32 eax;
u32 hyp_signature[3];
if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
- return false;
+ return 0;
cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
&eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
- return eax >= HYPERV_CPUID_MIN &&
- eax <= HYPERV_CPUID_MAX &&
- !memcmp("Microsoft Hv", hyp_signature, 12);
+ if (eax >= HYPERV_CPUID_MIN &&
+ eax <= HYPERV_CPUID_MAX &&
+ !memcmp("Microsoft Hv", hyp_signature, 12))
+ return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
+
+ return 0;
}
static cycle_t read_hv_clock(struct clocksource *arg)
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index d4cdfa67509e..ce2d0a2c3e4f 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -683,6 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
}
/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
+ count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb();
/* Save MTRR state */
@@ -696,6 +697,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
static void post_set(void) __releases(set_atomicity_lock)
{
/* Flush TLBs (no need to flush caches - they are disabled) */
+ count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb();
/* Intel (P6) standard MTRRs */
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index a7c7305030cc..8355c84b9729 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1884,6 +1884,7 @@ static struct pmu pmu = {
void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
{
userpg->cap_usr_time = 0;
+ userpg->cap_usr_time_zero = 0;
userpg->cap_usr_rdpmc = x86_pmu.attr_rdpmc;
userpg->pmc_width = x86_pmu.cntval_bits;
@@ -1897,6 +1898,11 @@ void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
userpg->time_mult = this_cpu_read(cyc2ns);
userpg->time_shift = CYC2NS_SCALE_FACTOR;
userpg->time_offset = this_cpu_read(cyc2ns_offset) - now;
+
+ if (sched_clock_stable && !check_tsc_disabled()) {
+ userpg->cap_usr_time_zero = 1;
+ userpg->time_zero = this_cpu_read(cyc2ns_offset);
+ }
}
/*
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 97e557bc4c91..cc16faae0538 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -641,6 +641,8 @@ extern struct event_constraint intel_core2_pebs_event_constraints[];
extern struct event_constraint intel_atom_pebs_event_constraints[];
+extern struct event_constraint intel_slm_pebs_event_constraints[];
+
extern struct event_constraint intel_nehalem_pebs_event_constraints[];
extern struct event_constraint intel_westmere_pebs_event_constraints[];
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 4cbe03287b08..beeb7cc07044 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -347,8 +347,7 @@ static struct amd_nb *amd_alloc_nb(int cpu)
struct amd_nb *nb;
int i;
- nb = kmalloc_node(sizeof(struct amd_nb), GFP_KERNEL | __GFP_ZERO,
- cpu_to_node(cpu));
+ nb = kzalloc_node(sizeof(struct amd_nb), GFP_KERNEL, cpu_to_node(cpu));
if (!nb)
return NULL;
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index fbc9210b45bc..9db76c31b3c3 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -81,7 +81,8 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
{
- INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
EVENT_EXTRA_END
};
@@ -123,6 +124,7 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */
INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMTPY */
INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */
+ INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */
INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
INTEL_UEVENT_CONSTRAINT(0x06a3, 0xf), /* CYCLE_ACTIVITY.STALLS_LDM_PENDING */
@@ -143,8 +145,9 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
{
- INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
- INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
EVENT_EXTRA_END
};
@@ -162,16 +165,27 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly =
EVENT_CONSTRAINT_END
};
+static struct event_constraint intel_slm_event_constraints[] __read_mostly =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */
+ EVENT_CONSTRAINT_END
+};
+
static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
- INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
- INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1),
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1),
INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
EVENT_EXTRA_END
};
static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {
- INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
- INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
EVENT_EXTRA_END
};
@@ -882,6 +896,140 @@ static __initconst const u64 atom_hw_cache_event_ids
},
};
+static struct extra_reg intel_slm_extra_regs[] __read_mostly =
+{
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x768005ffffull, RSP_1),
+ EVENT_EXTRA_END
+};
+
+#define SLM_DMND_READ SNB_DMND_DATA_RD
+#define SLM_DMND_WRITE SNB_DMND_RFO
+#define SLM_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO)
+
+#define SLM_SNP_ANY (SNB_SNP_NONE|SNB_SNP_MISS|SNB_NO_FWD|SNB_HITM)
+#define SLM_LLC_ACCESS SNB_RESP_ANY
+#define SLM_LLC_MISS (SLM_SNP_ANY|SNB_NON_DRAM)
+
+static __initconst const u64 slm_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS,
+ [ C(RESULT_MISS) ] = SLM_DMND_READ|SLM_LLC_MISS,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS,
+ [ C(RESULT_MISS) ] = SLM_DMND_WRITE|SLM_LLC_MISS,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = SLM_DMND_PREFETCH|SLM_LLC_ACCESS,
+ [ C(RESULT_MISS) ] = SLM_DMND_PREFETCH|SLM_LLC_MISS,
+ },
+ },
+};
+
+static __initconst const u64 slm_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0x0104, /* LD_DCU_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0380, /* ICACHE.ACCESSES */
+ [ C(RESULT_MISS) ] = 0x0280, /* ICACGE.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_WRITE) ] = {
+ /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_PREFETCH) ] = {
+ /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0x0804, /* LD_DTLB_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
+ [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
{
/* user explicitly requested branch sampling */
@@ -1301,11 +1449,11 @@ static void intel_fixup_er(struct perf_event *event, int idx)
if (idx == EXTRA_REG_RSP_0) {
event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
- event->hw.config |= 0x01b7;
+ event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event;
event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
} else if (idx == EXTRA_REG_RSP_1) {
event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
- event->hw.config |= 0x01bb;
+ event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event;
event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
}
}
@@ -2176,6 +2324,21 @@ __init int intel_pmu_init(void)
pr_cont("Atom events, ");
break;
+ case 55: /* Atom 22nm "Silvermont" */
+ memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
+ sizeof(hw_cache_extra_regs));
+
+ intel_pmu_lbr_init_atom();
+
+ x86_pmu.event_constraints = intel_slm_event_constraints;
+ x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_slm_extra_regs;
+ x86_pmu.er_flags |= ERF_HAS_RSP_1;
+ pr_cont("Silvermont events, ");
+ break;
+
case 37: /* 32 nm nehalem, "Clarkdale" */
case 44: /* 32 nm nehalem, "Gulftown" */
case 47: /* 32 nm Xeon E7 */
@@ -2270,6 +2433,7 @@ __init int intel_pmu_init(void)
case 70:
case 71:
case 63:
+ case 69:
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 4ab70acd3a8d..ab3ba1c1b7dd 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -224,7 +224,7 @@ static int alloc_pebs_buffer(int cpu)
if (!x86_pmu.pebs)
return 0;
- buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
+ buffer = kzalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL, node);
if (unlikely(!buffer))
return -ENOMEM;
@@ -262,7 +262,7 @@ static int alloc_bts_buffer(int cpu)
if (!x86_pmu.bts)
return 0;
- buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
+ buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL, node);
if (unlikely(!buffer))
return -ENOMEM;
@@ -295,7 +295,7 @@ static int alloc_ds_buffer(int cpu)
int node = cpu_to_node(cpu);
struct debug_store *ds;
- ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node);
+ ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
if (unlikely(!ds))
return -ENOMEM;
@@ -517,6 +517,32 @@ struct event_constraint intel_atom_pebs_event_constraints[] = {
EVENT_CONSTRAINT_END
};
+struct event_constraint intel_slm_pebs_event_constraints[] = {
+ INTEL_UEVENT_CONSTRAINT(0x0103, 0x1), /* REHABQ.LD_BLOCK_ST_FORWARD_PS */
+ INTEL_UEVENT_CONSTRAINT(0x0803, 0x1), /* REHABQ.LD_SPLITS_PS */
+ INTEL_UEVENT_CONSTRAINT(0x0204, 0x1), /* MEM_UOPS_RETIRED.L2_HIT_LOADS_PS */
+ INTEL_UEVENT_CONSTRAINT(0x0404, 0x1), /* MEM_UOPS_RETIRED.L2_MISS_LOADS_PS */
+ INTEL_UEVENT_CONSTRAINT(0x0804, 0x1), /* MEM_UOPS_RETIRED.DTLB_MISS_LOADS_PS */
+ INTEL_UEVENT_CONSTRAINT(0x2004, 0x1), /* MEM_UOPS_RETIRED.HITM_PS */
+ INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY_PS */
+ INTEL_UEVENT_CONSTRAINT(0x00c4, 0x1), /* BR_INST_RETIRED.ALL_BRANCHES_PS */
+ INTEL_UEVENT_CONSTRAINT(0x7ec4, 0x1), /* BR_INST_RETIRED.JCC_PS */
+ INTEL_UEVENT_CONSTRAINT(0xbfc4, 0x1), /* BR_INST_RETIRED.FAR_BRANCH_PS */
+ INTEL_UEVENT_CONSTRAINT(0xebc4, 0x1), /* BR_INST_RETIRED.NON_RETURN_IND_PS */
+ INTEL_UEVENT_CONSTRAINT(0xf7c4, 0x1), /* BR_INST_RETIRED.RETURN_PS */
+ INTEL_UEVENT_CONSTRAINT(0xf9c4, 0x1), /* BR_INST_RETIRED.CALL_PS */
+ INTEL_UEVENT_CONSTRAINT(0xfbc4, 0x1), /* BR_INST_RETIRED.IND_CALL_PS */
+ INTEL_UEVENT_CONSTRAINT(0xfdc4, 0x1), /* BR_INST_RETIRED.REL_CALL_PS */
+ INTEL_UEVENT_CONSTRAINT(0xfec4, 0x1), /* BR_INST_RETIRED.TAKEN_JCC_PS */
+ INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_MISP_RETIRED.ALL_BRANCHES_PS */
+ INTEL_UEVENT_CONSTRAINT(0x7ec5, 0x1), /* BR_INST_MISP_RETIRED.JCC_PS */
+ INTEL_UEVENT_CONSTRAINT(0xebc5, 0x1), /* BR_INST_MISP_RETIRED.NON_RETURN_IND_PS */
+ INTEL_UEVENT_CONSTRAINT(0xf7c5, 0x1), /* BR_INST_MISP_RETIRED.RETURN_PS */
+ INTEL_UEVENT_CONSTRAINT(0xfbc5, 0x1), /* BR_INST_MISP_RETIRED.IND_CALL_PS */
+ INTEL_UEVENT_CONSTRAINT(0xfec5, 0x1), /* BR_INST_MISP_RETIRED.TAKEN_JCC_PS */
+ EVENT_CONSTRAINT_END
+};
+
struct event_constraint intel_nehalem_pebs_event_constraints[] = {
INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */
INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index cad791dbde95..8ed44589b0e4 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -6,6 +6,8 @@ static struct intel_uncore_type **pci_uncores = empty_uncore;
/* pci bus to socket mapping */
static int pcibus_to_physid[256] = { [0 ... 255] = -1, };
+static struct pci_dev *extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
+
static DEFINE_RAW_SPINLOCK(uncore_box_lock);
/* mask of cpus that collect uncore events */
@@ -45,6 +47,24 @@ DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7");
DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15");
DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23");
DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51");
+DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35");
+DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31");
+DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17");
+DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12");
+DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4");
+DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31");
+DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17");
+DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12");
+DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4");
+DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63");
static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
{
@@ -281,7 +301,7 @@ static struct attribute *snbep_uncore_cbox_formats_attr[] = {
};
static struct attribute *snbep_uncore_pcu_formats_attr[] = {
- &format_attr_event.attr,
+ &format_attr_event_ext.attr,
&format_attr_occ_sel.attr,
&format_attr_edge.attr,
&format_attr_inv.attr,
@@ -301,6 +321,24 @@ static struct attribute *snbep_uncore_qpi_formats_attr[] = {
&format_attr_edge.attr,
&format_attr_inv.attr,
&format_attr_thresh8.attr,
+ &format_attr_match_rds.attr,
+ &format_attr_match_rnid30.attr,
+ &format_attr_match_rnid4.attr,
+ &format_attr_match_dnid.attr,
+ &format_attr_match_mc.attr,
+ &format_attr_match_opc.attr,
+ &format_attr_match_vnw.attr,
+ &format_attr_match0.attr,
+ &format_attr_match1.attr,
+ &format_attr_mask_rds.attr,
+ &format_attr_mask_rnid30.attr,
+ &format_attr_mask_rnid4.attr,
+ &format_attr_mask_dnid.attr,
+ &format_attr_mask_mc.attr,
+ &format_attr_mask_opc.attr,
+ &format_attr_mask_vnw.attr,
+ &format_attr_mask0.attr,
+ &format_attr_mask1.attr,
NULL,
};
@@ -314,8 +352,8 @@ static struct uncore_event_desc snbep_uncore_imc_events[] = {
static struct uncore_event_desc snbep_uncore_qpi_events[] = {
INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"),
INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"),
- INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x02,umask=0x08"),
- INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x03,umask=0x04"),
+ INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x102,umask=0x08"),
+ INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x103,umask=0x04"),
{ /* end: all zeroes */ },
};
@@ -356,13 +394,16 @@ static struct intel_uncore_ops snbep_uncore_msr_ops = {
SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
};
+#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT() \
+ .init_box = snbep_uncore_pci_init_box, \
+ .disable_box = snbep_uncore_pci_disable_box, \
+ .enable_box = snbep_uncore_pci_enable_box, \
+ .disable_event = snbep_uncore_pci_disable_event, \
+ .read_counter = snbep_uncore_pci_read_counter
+
static struct intel_uncore_ops snbep_uncore_pci_ops = {
- .init_box = snbep_uncore_pci_init_box,
- .disable_box = snbep_uncore_pci_disable_box,
- .enable_box = snbep_uncore_pci_enable_box,
- .disable_event = snbep_uncore_pci_disable_event,
- .enable_event = snbep_uncore_pci_enable_event,
- .read_counter = snbep_uncore_pci_read_counter,
+ SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
+ .enable_event = snbep_uncore_pci_enable_event, \
};
static struct event_constraint snbep_uncore_cbox_constraints[] = {
@@ -726,6 +767,61 @@ static struct intel_uncore_type *snbep_msr_uncores[] = {
NULL,
};
+enum {
+ SNBEP_PCI_QPI_PORT0_FILTER,
+ SNBEP_PCI_QPI_PORT1_FILTER,
+};
+
+static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+ if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) {
+ reg1->idx = 0;
+ reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0;
+ reg1->config = event->attr.config1;
+ reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0;
+ reg2->config = event->attr.config2;
+ }
+ return 0;
+}
+
+static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE) {
+ int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER;
+ struct pci_dev *filter_pdev = extra_pci_dev[box->phys_id][idx];
+ WARN_ON_ONCE(!filter_pdev);
+ if (filter_pdev) {
+ pci_write_config_dword(filter_pdev, reg1->reg,
+ (u32)reg1->config);
+ pci_write_config_dword(filter_pdev, reg1->reg + 4,
+ (u32)(reg1->config >> 32));
+ pci_write_config_dword(filter_pdev, reg2->reg,
+ (u32)reg2->config);
+ pci_write_config_dword(filter_pdev, reg2->reg + 4,
+ (u32)(reg2->config >> 32));
+ }
+ }
+
+ pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops snbep_uncore_qpi_ops = {
+ SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
+ .enable_event = snbep_qpi_enable_event,
+ .hw_config = snbep_qpi_hw_config,
+ .get_constraint = uncore_get_constraint,
+ .put_constraint = uncore_put_constraint,
+};
+
#define SNBEP_UNCORE_PCI_COMMON_INIT() \
.perf_ctr = SNBEP_PCI_PMON_CTR0, \
.event_ctl = SNBEP_PCI_PMON_CTL0, \
@@ -755,17 +851,18 @@ static struct intel_uncore_type snbep_uncore_imc = {
};
static struct intel_uncore_type snbep_uncore_qpi = {
- .name = "qpi",
- .num_counters = 4,
- .num_boxes = 2,
- .perf_ctr_bits = 48,
- .perf_ctr = SNBEP_PCI_PMON_CTR0,
- .event_ctl = SNBEP_PCI_PMON_CTL0,
- .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
- .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
- .ops = &snbep_uncore_pci_ops,
- .event_descs = snbep_uncore_qpi_events,
- .format_group = &snbep_uncore_qpi_format_group,
+ .name = "qpi",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &snbep_uncore_qpi_ops,
+ .event_descs = snbep_uncore_qpi_events,
+ .format_group = &snbep_uncore_qpi_format_group,
};
@@ -807,43 +904,53 @@ static struct intel_uncore_type *snbep_pci_uncores[] = {
static DEFINE_PCI_DEVICE_TABLE(snbep_uncore_pci_ids) = {
{ /* Home Agent */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA),
- .driver_data = SNBEP_PCI_UNCORE_HA,
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0),
},
{ /* MC Channel 0 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0),
- .driver_data = SNBEP_PCI_UNCORE_IMC,
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0),
},
{ /* MC Channel 1 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1),
- .driver_data = SNBEP_PCI_UNCORE_IMC,
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1),
},
{ /* MC Channel 2 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2),
- .driver_data = SNBEP_PCI_UNCORE_IMC,
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2),
},
{ /* MC Channel 3 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3),
- .driver_data = SNBEP_PCI_UNCORE_IMC,
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3),
},
{ /* QPI Port 0 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0),
- .driver_data = SNBEP_PCI_UNCORE_QPI,
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0),
},
{ /* QPI Port 1 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1),
- .driver_data = SNBEP_PCI_UNCORE_QPI,
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1),
},
{ /* R2PCIe */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE),
- .driver_data = SNBEP_PCI_UNCORE_R2PCIE,
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0),
},
{ /* R3QPI Link 0 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0),
- .driver_data = SNBEP_PCI_UNCORE_R3QPI,
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0),
},
{ /* R3QPI Link 1 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1),
- .driver_data = SNBEP_PCI_UNCORE_R3QPI,
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1),
+ },
+ { /* QPI Port 0 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT0_FILTER),
+ },
+ { /* QPI Port 0 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT1_FILTER),
},
{ /* end: all zeroes */ }
};
@@ -1256,71 +1363,71 @@ static struct intel_uncore_type *ivt_pci_uncores[] = {
static DEFINE_PCI_DEVICE_TABLE(ivt_uncore_pci_ids) = {
{ /* Home Agent 0 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30),
- .driver_data = IVT_PCI_UNCORE_HA,
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 0),
},
{ /* Home Agent 1 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38),
- .driver_data = IVT_PCI_UNCORE_HA,
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 1),
},
{ /* MC0 Channel 0 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4),
- .driver_data = IVT_PCI_UNCORE_IMC,
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 0),
},
{ /* MC0 Channel 1 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5),
- .driver_data = IVT_PCI_UNCORE_IMC,
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 1),
},
{ /* MC0 Channel 3 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0),
- .driver_data = IVT_PCI_UNCORE_IMC,
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 2),
},
{ /* MC0 Channel 4 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1),
- .driver_data = IVT_PCI_UNCORE_IMC,
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 3),
},
{ /* MC1 Channel 0 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4),
- .driver_data = IVT_PCI_UNCORE_IMC,
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 4),
},
{ /* MC1 Channel 1 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5),
- .driver_data = IVT_PCI_UNCORE_IMC,
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 5),
},
{ /* MC1 Channel 3 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0),
- .driver_data = IVT_PCI_UNCORE_IMC,
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 6),
},
{ /* MC1 Channel 4 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1),
- .driver_data = IVT_PCI_UNCORE_IMC,
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 7),
},
{ /* QPI0 Port 0 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32),
- .driver_data = IVT_PCI_UNCORE_QPI,
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 0),
},
{ /* QPI0 Port 1 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33),
- .driver_data = IVT_PCI_UNCORE_QPI,
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 1),
},
{ /* QPI1 Port 2 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a),
- .driver_data = IVT_PCI_UNCORE_QPI,
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 2),
},
{ /* R2PCIe */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34),
- .driver_data = IVT_PCI_UNCORE_R2PCIE,
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R2PCIE, 0),
},
{ /* R3QPI0 Link 0 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36),
- .driver_data = IVT_PCI_UNCORE_R3QPI,
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 0),
},
{ /* R3QPI0 Link 1 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37),
- .driver_data = IVT_PCI_UNCORE_R3QPI,
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 1),
},
{ /* R3QPI1 Link 2 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e),
- .driver_data = IVT_PCI_UNCORE_R3QPI,
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 2),
},
{ /* end: all zeroes */ }
};
@@ -2606,7 +2713,7 @@ struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int cp
size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg);
- box = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu));
+ box = kzalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
if (!box)
return NULL;
@@ -2701,7 +2808,7 @@ uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *eve
return c;
}
- if (event->hw.config == ~0ULL)
+ if (event->attr.config == UNCORE_FIXED_EVENT)
return &constraint_fixed;
if (type->constraints) {
@@ -3005,7 +3112,9 @@ static int uncore_pmu_event_init(struct perf_event *event)
*/
if (pmu->type->single_fixed && pmu->pmu_idx > 0)
return -EINVAL;
- hwc->config = ~0ULL;
+
+ /* fixed counters have event field hardcoded to zero */
+ hwc->config = 0ULL;
} else {
hwc->config = event->attr.config & pmu->type->event_mask;
if (pmu->type->ops->hw_config) {
@@ -3167,16 +3276,24 @@ static bool pcidrv_registered;
/*
* add a pci uncore device
*/
-static int uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev)
+static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
struct intel_uncore_pmu *pmu;
struct intel_uncore_box *box;
- int i, phys_id;
+ struct intel_uncore_type *type;
+ int phys_id;
phys_id = pcibus_to_physid[pdev->bus->number];
if (phys_id < 0)
return -ENODEV;
+ if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) {
+ extra_pci_dev[phys_id][UNCORE_PCI_DEV_IDX(id->driver_data)] = pdev;
+ pci_set_drvdata(pdev, NULL);
+ return 0;
+ }
+
+ type = pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)];
box = uncore_alloc_box(type, 0);
if (!box)
return -ENOMEM;
@@ -3185,21 +3302,11 @@ static int uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev)
* for performance monitoring unit with multiple boxes,
* each box has a different function id.
*/
- for (i = 0; i < type->num_boxes; i++) {
- pmu = &type->pmus[i];
- if (pmu->func_id == pdev->devfn)
- break;
- if (pmu->func_id < 0) {
- pmu->func_id = pdev->devfn;
- break;
- }
- pmu = NULL;
- }
-
- if (!pmu) {
- kfree(box);
- return -EINVAL;
- }
+ pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
+ if (pmu->func_id < 0)
+ pmu->func_id = pdev->devfn;
+ else
+ WARN_ON_ONCE(pmu->func_id != pdev->devfn);
box->phys_id = phys_id;
box->pci_dev = pdev;
@@ -3217,9 +3324,22 @@ static int uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev)
static void uncore_pci_remove(struct pci_dev *pdev)
{
struct intel_uncore_box *box = pci_get_drvdata(pdev);
- struct intel_uncore_pmu *pmu = box->pmu;
- int cpu, phys_id = pcibus_to_physid[pdev->bus->number];
+ struct intel_uncore_pmu *pmu;
+ int i, cpu, phys_id = pcibus_to_physid[pdev->bus->number];
+
+ box = pci_get_drvdata(pdev);
+ if (!box) {
+ for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
+ if (extra_pci_dev[phys_id][i] == pdev) {
+ extra_pci_dev[phys_id][i] = NULL;
+ break;
+ }
+ }
+ WARN_ON_ONCE(i >= UNCORE_EXTRA_PCI_DEV_MAX);
+ return;
+ }
+ pmu = box->pmu;
if (WARN_ON_ONCE(phys_id != box->phys_id))
return;
@@ -3240,12 +3360,6 @@ static void uncore_pci_remove(struct pci_dev *pdev)
kfree(box);
}
-static int uncore_pci_probe(struct pci_dev *pdev,
- const struct pci_device_id *id)
-{
- return uncore_pci_add(pci_uncores[id->driver_data], pdev);
-}
-
static int __init uncore_pci_init(void)
{
int ret;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 47b3d00c9d89..a80ab71a883d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -12,6 +12,15 @@
#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC
#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1)
+#define UNCORE_PCI_DEV_DATA(type, idx) ((type << 8) | idx)
+#define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff)
+#define UNCORE_PCI_DEV_IDX(data) (data & 0xff)
+#define UNCORE_EXTRA_PCI_DEV 0xff
+#define UNCORE_EXTRA_PCI_DEV_MAX 2
+
+/* support up to 8 sockets */
+#define UNCORE_SOCKET_MAX 8
+
#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff)
/* SNB event control */
@@ -108,6 +117,7 @@
(SNBEP_PMON_CTL_EV_SEL_MASK | \
SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PMON_CTL_EV_SEL_EXT | \
SNBEP_PMON_CTL_INVERT | \
SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 7076878404ec..628a059a9a06 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -93,7 +93,7 @@ static void __init vmware_platform_setup(void)
* serial key should be enough, as this will always have a VMware
* specific string when running under VMware hypervisor.
*/
-static bool __init vmware_platform(void)
+static uint32_t __init vmware_platform(void)
{
if (cpu_has_hypervisor) {
unsigned int eax;
@@ -102,12 +102,12 @@ static bool __init vmware_platform(void)
cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0],
&hyper_vendor_id[1], &hyper_vendor_id[2]);
if (!memcmp(hyper_vendor_id, "VMwareVMware", 12))
- return true;
+ return CPUID_VMWARE_INFO_LEAF;
} else if (dmi_available && dmi_name_in_serial("VMware") &&
__vmware_platform())
- return true;
+ return 1;
- return false;
+ return 0;
}
/*
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 74467feb4dc5..e0e0841eef45 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -128,7 +128,9 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
cpu_emergency_svm_disable();
lapic_shutdown();
-#if defined(CONFIG_X86_IO_APIC)
+#ifdef CONFIG_X86_IO_APIC
+ /* Prevent crash_kexec() from deadlocking on ioapic_lock. */
+ ioapic_zap_locks();
disable_IO_APIC();
#endif
#ifdef CONFIG_HPET_TIMER
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 69eb2fa25494..376dc7873447 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -52,8 +52,7 @@ void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
}
#ifdef CONFIG_BLK_DEV_INITRD
-void __init early_init_dt_setup_initrd_arch(unsigned long start,
- unsigned long end)
+void __init early_init_dt_setup_initrd_arch(u64 start, u64 end)
{
initrd_start = (unsigned long)__va(start);
initrd_end = (unsigned long)__va(end);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index d32abeabbda5..174da5fc5a7b 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -658,15 +658,18 @@ __init void e820_setup_gap(void)
* boot_params.e820_map, others are passed via SETUP_E820_EXT node of
* linked list of struct setup_data, which is parsed here.
*/
-void __init parse_e820_ext(struct setup_data *sdata)
+void __init parse_e820_ext(u64 phys_addr, u32 data_len)
{
int entries;
struct e820entry *extmap;
+ struct setup_data *sdata;
+ sdata = early_memremap(phys_addr, data_len);
entries = sdata->len / sizeof(struct e820entry);
extmap = (struct e820entry *)(sdata->data);
__append_e820_map(extmap, entries);
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ early_iounmap(sdata, data_len);
printk(KERN_INFO "e820: extended physical RAM map:\n");
e820_print_map("extended");
}
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 94ab6b90dd3f..b3cd3ebae077 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -12,6 +12,7 @@
#include <linux/pci.h>
#include <linux/acpi.h>
#include <linux/pci_ids.h>
+#include <drm/i915_drm.h>
#include <asm/pci-direct.h>
#include <asm/dma.h>
#include <asm/io_apic.h>
@@ -196,16 +197,175 @@ static void __init ati_bugs_contd(int num, int slot, int func)
static void __init intel_remapping_check(int num, int slot, int func)
{
u8 revision;
+ u16 device;
+ device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
revision = read_pci_config_byte(num, slot, func, PCI_REVISION_ID);
/*
- * Revision 0x13 of this chipset supports irq remapping
- * but has an erratum that breaks its behavior, flag it as such
+ * Revision 13 of all triggering devices id in this quirk have
+ * a problem draining interrupts when irq remapping is enabled,
+ * and should be flagged as broken. Additionally revisions 0x12
+ * and 0x22 of device id 0x3405 has this problem.
*/
if (revision == 0x13)
set_irq_remapping_broken();
+ else if ((device == 0x3405) &&
+ ((revision == 0x12) ||
+ (revision == 0x22)))
+ set_irq_remapping_broken();
+
+}
+
+/*
+ * Systems with Intel graphics controllers set aside memory exclusively
+ * for gfx driver use. This memory is not marked in the E820 as reserved
+ * or as RAM, and so is subject to overlap from E820 manipulation later
+ * in the boot process. On some systems, MMIO space is allocated on top,
+ * despite the efforts of the "RAM buffer" approach, which simply rounds
+ * memory boundaries up to 64M to try to catch space that may decode
+ * as RAM and so is not suitable for MMIO.
+ *
+ * And yes, so far on current devices the base addr is always under 4G.
+ */
+static u32 __init intel_stolen_base(int num, int slot, int func)
+{
+ u32 base;
+
+ /*
+ * For the PCI IDs in this quirk, the stolen base is always
+ * in 0x5c, aka the BDSM register (yes that's really what
+ * it's called).
+ */
+ base = read_pci_config(num, slot, func, 0x5c);
+ base &= ~((1<<20) - 1);
+
+ return base;
+}
+
+#define KB(x) ((x) * 1024)
+#define MB(x) (KB (KB (x)))
+#define GB(x) (MB (KB (x)))
+
+static size_t __init gen3_stolen_size(int num, int slot, int func)
+{
+ size_t stolen_size;
+ u16 gmch_ctrl;
+
+ gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL);
+
+ switch (gmch_ctrl & I855_GMCH_GMS_MASK) {
+ case I855_GMCH_GMS_STOLEN_1M:
+ stolen_size = MB(1);
+ break;
+ case I855_GMCH_GMS_STOLEN_4M:
+ stolen_size = MB(4);
+ break;
+ case I855_GMCH_GMS_STOLEN_8M:
+ stolen_size = MB(8);
+ break;
+ case I855_GMCH_GMS_STOLEN_16M:
+ stolen_size = MB(16);
+ break;
+ case I855_GMCH_GMS_STOLEN_32M:
+ stolen_size = MB(32);
+ break;
+ case I915_GMCH_GMS_STOLEN_48M:
+ stolen_size = MB(48);
+ break;
+ case I915_GMCH_GMS_STOLEN_64M:
+ stolen_size = MB(64);
+ break;
+ case G33_GMCH_GMS_STOLEN_128M:
+ stolen_size = MB(128);
+ break;
+ case G33_GMCH_GMS_STOLEN_256M:
+ stolen_size = MB(256);
+ break;
+ case INTEL_GMCH_GMS_STOLEN_96M:
+ stolen_size = MB(96);
+ break;
+ case INTEL_GMCH_GMS_STOLEN_160M:
+ stolen_size = MB(160);
+ break;
+ case INTEL_GMCH_GMS_STOLEN_224M:
+ stolen_size = MB(224);
+ break;
+ case INTEL_GMCH_GMS_STOLEN_352M:
+ stolen_size = MB(352);
+ break;
+ default:
+ stolen_size = 0;
+ break;
+ }
+
+ return stolen_size;
+}
+
+static size_t __init gen6_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+
+ gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
+ gmch_ctrl >>= SNB_GMCH_GMS_SHIFT;
+ gmch_ctrl &= SNB_GMCH_GMS_MASK;
+
+ return gmch_ctrl << 25; /* 32 MB units */
+}
+
+typedef size_t (*stolen_size_fn)(int num, int slot, int func);
+
+static struct pci_device_id intel_stolen_ids[] __initdata = {
+ INTEL_I915G_IDS(gen3_stolen_size),
+ INTEL_I915GM_IDS(gen3_stolen_size),
+ INTEL_I945G_IDS(gen3_stolen_size),
+ INTEL_I945GM_IDS(gen3_stolen_size),
+ INTEL_VLV_M_IDS(gen3_stolen_size),
+ INTEL_VLV_D_IDS(gen3_stolen_size),
+ INTEL_PINEVIEW_IDS(gen3_stolen_size),
+ INTEL_I965G_IDS(gen3_stolen_size),
+ INTEL_G33_IDS(gen3_stolen_size),
+ INTEL_I965GM_IDS(gen3_stolen_size),
+ INTEL_GM45_IDS(gen3_stolen_size),
+ INTEL_G45_IDS(gen3_stolen_size),
+ INTEL_IRONLAKE_D_IDS(gen3_stolen_size),
+ INTEL_IRONLAKE_M_IDS(gen3_stolen_size),
+ INTEL_SNB_D_IDS(gen6_stolen_size),
+ INTEL_SNB_M_IDS(gen6_stolen_size),
+ INTEL_IVB_M_IDS(gen6_stolen_size),
+ INTEL_IVB_D_IDS(gen6_stolen_size),
+ INTEL_HSW_D_IDS(gen6_stolen_size),
+ INTEL_HSW_M_IDS(gen6_stolen_size),
+};
+static void __init intel_graphics_stolen(int num, int slot, int func)
+{
+ size_t size;
+ int i;
+ u32 start;
+ u16 device, subvendor, subdevice;
+
+ device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
+ subvendor = read_pci_config_16(num, slot, func,
+ PCI_SUBSYSTEM_VENDOR_ID);
+ subdevice = read_pci_config_16(num, slot, func, PCI_SUBSYSTEM_ID);
+
+ for (i = 0; i < ARRAY_SIZE(intel_stolen_ids); i++) {
+ if (intel_stolen_ids[i].device == device) {
+ stolen_size_fn stolen_size =
+ (stolen_size_fn)intel_stolen_ids[i].driver_data;
+ size = stolen_size(num, slot, func);
+ start = intel_stolen_base(num, slot, func);
+ if (size && start) {
+ /* Mark this space as reserved */
+ e820_add_region(start, size, E820_RESERVED);
+ sanitize_e820_map(e820.map,
+ ARRAY_SIZE(e820.map),
+ &e820.nr_map);
+ }
+ return;
+ }
+ }
}
#define QFLAG_APPLY_ONCE 0x1
@@ -239,8 +399,12 @@ static struct chipset early_qrk[] __initdata = {
PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd },
{ PCI_VENDOR_ID_INTEL, 0x3403, PCI_CLASS_BRIDGE_HOST,
PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
+ { PCI_VENDOR_ID_INTEL, 0x3405, PCI_CLASS_BRIDGE_HOST,
+ PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
{ PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST,
PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
+ { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID,
+ QFLAG_APPLY_ONCE, intel_graphics_stolen },
{}
};
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 2cfbc3a3a2dd..f0dcb0ceb6a2 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1176,6 +1176,9 @@ ftrace_restore_flags:
#else /* ! CONFIG_DYNAMIC_FTRACE */
ENTRY(mcount)
+ cmpl $__PAGE_OFFSET, %esp
+ jb ftrace_stub /* Paging not enabled yet? */
+
cmpl $0, function_trace_stop
jne ftrace_stub
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 5dd87a89f011..81ba27679f18 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -409,6 +409,7 @@ enable_paging:
/*
* Check if it is 486
*/
+ movb $4,X86 # at least 486
cmpl $-1,X86_CPUID
je is486
@@ -436,7 +437,6 @@ enable_paging:
movl %edx,X86_CAPABILITY
is486:
- movb $4,X86
movl $0x50022,%ecx # set AM, WP, NE and MP
movl %cr0,%eax
andl $0x80000011,%eax # Save PG,PE,ET
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 202d24f0f7e7..5d576ab34403 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -116,7 +116,7 @@ static void mxcsr_feature_mask_init(void)
if (cpu_has_fxsr) {
memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct));
- asm volatile("fxsave %0" : : "m" (fx_scratch));
+ asm volatile("fxsave %0" : "+m" (fx_scratch));
mask = fx_scratch.mxcsr_mask;
if (mask == 0)
mask = 0x0000ffbf;
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 2889b3d43882..ee11b7dfbfbb 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -24,20 +24,71 @@ union jump_code_union {
} __attribute__((packed));
};
+static void bug_at(unsigned char *ip, int line)
+{
+ /*
+ * The location is not an op that we were expecting.
+ * Something went wrong. Crash the box, as something could be
+ * corrupting the kernel.
+ */
+ pr_warning("Unexpected op at %pS [%p] (%02x %02x %02x %02x %02x) %s:%d\n",
+ ip, ip, ip[0], ip[1], ip[2], ip[3], ip[4], __FILE__, line);
+ BUG();
+}
+
static void __jump_label_transform(struct jump_entry *entry,
enum jump_label_type type,
- void *(*poker)(void *, const void *, size_t))
+ void *(*poker)(void *, const void *, size_t),
+ int init)
{
union jump_code_union code;
+ const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
if (type == JUMP_LABEL_ENABLE) {
+ /*
+ * We are enabling this jump label. If it is not a nop
+ * then something must have gone wrong.
+ */
+ if (unlikely(memcmp((void *)entry->code, ideal_nop, 5) != 0))
+ bug_at((void *)entry->code, __LINE__);
+
code.jump = 0xe9;
code.offset = entry->target -
(entry->code + JUMP_LABEL_NOP_SIZE);
- } else
+ } else {
+ /*
+ * We are disabling this jump label. If it is not what
+ * we think it is, then something must have gone wrong.
+ * If this is the first initialization call, then we
+ * are converting the default nop to the ideal nop.
+ */
+ if (init) {
+ const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
+ if (unlikely(memcmp((void *)entry->code, default_nop, 5) != 0))
+ bug_at((void *)entry->code, __LINE__);
+ } else {
+ code.jump = 0xe9;
+ code.offset = entry->target -
+ (entry->code + JUMP_LABEL_NOP_SIZE);
+ if (unlikely(memcmp((void *)entry->code, &code, 5) != 0))
+ bug_at((void *)entry->code, __LINE__);
+ }
memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE);
+ }
- (*poker)((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
+ /*
+ * Make text_poke_bp() a default fallback poker.
+ *
+ * At the time the change is being done, just ignore whether we
+ * are doing nop -> jump or jump -> nop transition, and assume
+ * always nop being the 'currently valid' instruction
+ *
+ */
+ if (poker)
+ (*poker)((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
+ else
+ text_poke_bp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE,
+ (void *)entry->code + JUMP_LABEL_NOP_SIZE);
}
void arch_jump_label_transform(struct jump_entry *entry,
@@ -45,15 +96,38 @@ void arch_jump_label_transform(struct jump_entry *entry,
{
get_online_cpus();
mutex_lock(&text_mutex);
- __jump_label_transform(entry, type, text_poke_smp);
+ __jump_label_transform(entry, type, NULL, 0);
mutex_unlock(&text_mutex);
put_online_cpus();
}
+static enum {
+ JL_STATE_START,
+ JL_STATE_NO_UPDATE,
+ JL_STATE_UPDATE,
+} jlstate __initdata_or_module = JL_STATE_START;
+
__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,
enum jump_label_type type)
{
- __jump_label_transform(entry, type, text_poke_early);
+ /*
+ * This function is called at boot up and when modules are
+ * first loaded. Check if the default nop, the one that is
+ * inserted at compile time, is the ideal nop. If it is, then
+ * we do not need to update the nop, and we can leave it as is.
+ * If it is not, then we need to update the nop to the ideal nop.
+ */
+ if (jlstate == JL_STATE_START) {
+ const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
+ const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
+
+ if (memcmp(ideal_nop, default_nop, 5) != 0)
+ jlstate = JL_STATE_UPDATE;
+ else
+ jlstate = JL_STATE_NO_UPDATE;
+ }
+ if (jlstate == JL_STATE_UPDATE)
+ __jump_label_transform(entry, type, text_poke_early, 1);
}
#endif
diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h
index 2e9d4b5af036..c6ee63f927ab 100644
--- a/arch/x86/kernel/kprobes/common.h
+++ b/arch/x86/kernel/kprobes/common.h
@@ -82,14 +82,9 @@ extern void synthesize_reljump(void *from, void *to);
extern void synthesize_relcall(void *from, void *to);
#ifdef CONFIG_OPTPROBES
-extern int arch_init_optprobes(void);
extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter);
extern unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr);
#else /* !CONFIG_OPTPROBES */
-static inline int arch_init_optprobes(void)
-{
- return 0;
-}
static inline int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
{
return 0;
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 048852d06447..79a3f9682871 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -1068,7 +1068,7 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
int __init arch_init_kprobes(void)
{
- return arch_init_optprobes();
+ return 0;
}
int __kprobes arch_trampoline_kprobe(struct kprobe *p)
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 2c1ac2893365..898160b42e43 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -368,31 +368,6 @@ int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
return 0;
}
-#define MAX_OPTIMIZE_PROBES 256
-static struct text_poke_param *jump_poke_params;
-static struct jump_poke_buffer {
- u8 buf[RELATIVEJUMP_SIZE];
-} *jump_poke_bufs;
-
-static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
- u8 *insn_buf,
- struct optimized_kprobe *op)
-{
- s32 rel = (s32)((long)op->optinsn.insn -
- ((long)op->kp.addr + RELATIVEJUMP_SIZE));
-
- /* Backup instructions which will be replaced by jump address */
- memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
- RELATIVE_ADDR_SIZE);
-
- insn_buf[0] = RELATIVEJUMP_OPCODE;
- *(s32 *)(&insn_buf[1]) = rel;
-
- tprm->addr = op->kp.addr;
- tprm->opcode = insn_buf;
- tprm->len = RELATIVEJUMP_SIZE;
-}
-
/*
* Replace breakpoints (int3) with relative jumps.
* Caller must call with locking kprobe_mutex and text_mutex.
@@ -400,37 +375,38 @@ static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
void __kprobes arch_optimize_kprobes(struct list_head *oplist)
{
struct optimized_kprobe *op, *tmp;
- int c = 0;
+ u8 insn_buf[RELATIVEJUMP_SIZE];
list_for_each_entry_safe(op, tmp, oplist, list) {
+ s32 rel = (s32)((long)op->optinsn.insn -
+ ((long)op->kp.addr + RELATIVEJUMP_SIZE));
+
WARN_ON(kprobe_disabled(&op->kp));
- /* Setup param */
- setup_optimize_kprobe(&jump_poke_params[c],
- jump_poke_bufs[c].buf, op);
+
+ /* Backup instructions which will be replaced by jump address */
+ memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
+ RELATIVE_ADDR_SIZE);
+
+ insn_buf[0] = RELATIVEJUMP_OPCODE;
+ *(s32 *)(&insn_buf[1]) = rel;
+
+ text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE,
+ op->optinsn.insn);
+
list_del_init(&op->list);
- if (++c >= MAX_OPTIMIZE_PROBES)
- break;
}
-
- /*
- * text_poke_smp doesn't support NMI/MCE code modifying.
- * However, since kprobes itself also doesn't support NMI/MCE
- * code probing, it's not a problem.
- */
- text_poke_smp_batch(jump_poke_params, c);
}
-static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm,
- u8 *insn_buf,
- struct optimized_kprobe *op)
+/* Replace a relative jump with a breakpoint (int3). */
+void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
{
+ u8 insn_buf[RELATIVEJUMP_SIZE];
+
/* Set int3 to first byte for kprobes */
insn_buf[0] = BREAKPOINT_INSTRUCTION;
memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
-
- tprm->addr = op->kp.addr;
- tprm->opcode = insn_buf;
- tprm->len = RELATIVEJUMP_SIZE;
+ text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE,
+ op->optinsn.insn);
}
/*
@@ -441,34 +417,11 @@ extern void arch_unoptimize_kprobes(struct list_head *oplist,
struct list_head *done_list)
{
struct optimized_kprobe *op, *tmp;
- int c = 0;
list_for_each_entry_safe(op, tmp, oplist, list) {
- /* Setup param */
- setup_unoptimize_kprobe(&jump_poke_params[c],
- jump_poke_bufs[c].buf, op);
+ arch_unoptimize_kprobe(op);
list_move(&op->list, done_list);
- if (++c >= MAX_OPTIMIZE_PROBES)
- break;
}
-
- /*
- * text_poke_smp doesn't support NMI/MCE code modifying.
- * However, since kprobes itself also doesn't support NMI/MCE
- * code probing, it's not a problem.
- */
- text_poke_smp_batch(jump_poke_params, c);
-}
-
-/* Replace a relative jump with a breakpoint (int3). */
-void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
-{
- u8 buf[RELATIVEJUMP_SIZE];
-
- /* Set int3 to first byte for kprobes */
- buf[0] = BREAKPOINT_INSTRUCTION;
- memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
- text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
}
int __kprobes
@@ -488,22 +441,3 @@ setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
}
return 0;
}
-
-int __kprobes arch_init_optprobes(void)
-{
- /* Allocate code buffer and parameter array */
- jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
- MAX_OPTIMIZE_PROBES, GFP_KERNEL);
- if (!jump_poke_bufs)
- return -ENOMEM;
-
- jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
- MAX_OPTIMIZE_PROBES, GFP_KERNEL);
- if (!jump_poke_params) {
- kfree(jump_poke_bufs);
- jump_poke_bufs = NULL;
- return -ENOMEM;
- }
-
- return 0;
-}
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index a96d32cc55b8..697b93af02dd 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -34,6 +34,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/kprobes.h>
+#include <linux/debugfs.h>
#include <asm/timer.h>
#include <asm/cpu.h>
#include <asm/traps.h>
@@ -419,6 +420,7 @@ static void __init kvm_smp_prepare_boot_cpu(void)
WARN_ON(kvm_register_clock("primary cpu clock"));
kvm_guest_cpu_init();
native_smp_prepare_boot_cpu();
+ kvm_spinlock_init();
}
static void kvm_guest_cpu_online(void *dummy)
@@ -498,11 +500,9 @@ void __init kvm_guest_init(void)
#endif
}
-static bool __init kvm_detect(void)
+static uint32_t __init kvm_detect(void)
{
- if (!kvm_para_available())
- return false;
- return true;
+ return kvm_cpuid_base();
}
const struct hypervisor_x86 x86_hyper_kvm __refconst = {
@@ -523,3 +523,263 @@ static __init int activate_jump_labels(void)
return 0;
}
arch_initcall(activate_jump_labels);
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+
+/* Kick a cpu by its apicid. Used to wake up a halted vcpu */
+static void kvm_kick_cpu(int cpu)
+{
+ int apicid;
+ unsigned long flags = 0;
+
+ apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
+}
+
+enum kvm_contention_stat {
+ TAKEN_SLOW,
+ TAKEN_SLOW_PICKUP,
+ RELEASED_SLOW,
+ RELEASED_SLOW_KICKED,
+ NR_CONTENTION_STATS
+};
+
+#ifdef CONFIG_KVM_DEBUG_FS
+#define HISTO_BUCKETS 30
+
+static struct kvm_spinlock_stats
+{
+ u32 contention_stats[NR_CONTENTION_STATS];
+ u32 histo_spin_blocked[HISTO_BUCKETS+1];
+ u64 time_blocked;
+} spinlock_stats;
+
+static u8 zero_stats;
+
+static inline void check_zero(void)
+{
+ u8 ret;
+ u8 old;
+
+ old = ACCESS_ONCE(zero_stats);
+ if (unlikely(old)) {
+ ret = cmpxchg(&zero_stats, old, 0);
+ /* This ensures only one fellow resets the stat */
+ if (ret == old)
+ memset(&spinlock_stats, 0, sizeof(spinlock_stats));
+ }
+}
+
+static inline void add_stats(enum kvm_contention_stat var, u32 val)
+{
+ check_zero();
+ spinlock_stats.contention_stats[var] += val;
+}
+
+
+static inline u64 spin_time_start(void)
+{
+ return sched_clock();
+}
+
+static void __spin_time_accum(u64 delta, u32 *array)
+{
+ unsigned index;
+
+ index = ilog2(delta);
+ check_zero();
+
+ if (index < HISTO_BUCKETS)
+ array[index]++;
+ else
+ array[HISTO_BUCKETS]++;
+}
+
+static inline void spin_time_accum_blocked(u64 start)
+{
+ u32 delta;
+
+ delta = sched_clock() - start;
+ __spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
+ spinlock_stats.time_blocked += delta;
+}
+
+static struct dentry *d_spin_debug;
+static struct dentry *d_kvm_debug;
+
+struct dentry *kvm_init_debugfs(void)
+{
+ d_kvm_debug = debugfs_create_dir("kvm", NULL);
+ if (!d_kvm_debug)
+ printk(KERN_WARNING "Could not create 'kvm' debugfs directory\n");
+
+ return d_kvm_debug;
+}
+
+static int __init kvm_spinlock_debugfs(void)
+{
+ struct dentry *d_kvm;
+
+ d_kvm = kvm_init_debugfs();
+ if (d_kvm == NULL)
+ return -ENOMEM;
+
+ d_spin_debug = debugfs_create_dir("spinlocks", d_kvm);
+
+ debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
+
+ debugfs_create_u32("taken_slow", 0444, d_spin_debug,
+ &spinlock_stats.contention_stats[TAKEN_SLOW]);
+ debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
+ &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]);
+
+ debugfs_create_u32("released_slow", 0444, d_spin_debug,
+ &spinlock_stats.contention_stats[RELEASED_SLOW]);
+ debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
+ &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]);
+
+ debugfs_create_u64("time_blocked", 0444, d_spin_debug,
+ &spinlock_stats.time_blocked);
+
+ debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
+ spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
+
+ return 0;
+}
+fs_initcall(kvm_spinlock_debugfs);
+#else /* !CONFIG_KVM_DEBUG_FS */
+static inline void add_stats(enum kvm_contention_stat var, u32 val)
+{
+}
+
+static inline u64 spin_time_start(void)
+{
+ return 0;
+}
+
+static inline void spin_time_accum_blocked(u64 start)
+{
+}
+#endif /* CONFIG_KVM_DEBUG_FS */
+
+struct kvm_lock_waiting {
+ struct arch_spinlock *lock;
+ __ticket_t want;
+};
+
+/* cpus 'waiting' on a spinlock to become available */
+static cpumask_t waiting_cpus;
+
+/* Track spinlock on which a cpu is waiting */
+static DEFINE_PER_CPU(struct kvm_lock_waiting, klock_waiting);
+
+static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
+{
+ struct kvm_lock_waiting *w;
+ int cpu;
+ u64 start;
+ unsigned long flags;
+
+ if (in_nmi())
+ return;
+
+ w = &__get_cpu_var(klock_waiting);
+ cpu = smp_processor_id();
+ start = spin_time_start();
+
+ /*
+ * Make sure an interrupt handler can't upset things in a
+ * partially setup state.
+ */
+ local_irq_save(flags);
+
+ /*
+ * The ordering protocol on this is that the "lock" pointer
+ * may only be set non-NULL if the "want" ticket is correct.
+ * If we're updating "want", we must first clear "lock".
+ */
+ w->lock = NULL;
+ smp_wmb();
+ w->want = want;
+ smp_wmb();
+ w->lock = lock;
+
+ add_stats(TAKEN_SLOW, 1);
+
+ /*
+ * This uses set_bit, which is atomic but we should not rely on its
+ * reordering gurantees. So barrier is needed after this call.
+ */
+ cpumask_set_cpu(cpu, &waiting_cpus);
+
+ barrier();
+
+ /*
+ * Mark entry to slowpath before doing the pickup test to make
+ * sure we don't deadlock with an unlocker.
+ */
+ __ticket_enter_slowpath(lock);
+
+ /*
+ * check again make sure it didn't become free while
+ * we weren't looking.
+ */
+ if (ACCESS_ONCE(lock->tickets.head) == want) {
+ add_stats(TAKEN_SLOW_PICKUP, 1);
+ goto out;
+ }
+
+ /*
+ * halt until it's our turn and kicked. Note that we do safe halt
+ * for irq enabled case to avoid hang when lock info is overwritten
+ * in irq spinlock slowpath and no spurious interrupt occur to save us.
+ */
+ if (arch_irqs_disabled_flags(flags))
+ halt();
+ else
+ safe_halt();
+
+out:
+ cpumask_clear_cpu(cpu, &waiting_cpus);
+ w->lock = NULL;
+ local_irq_restore(flags);
+ spin_time_accum_blocked(start);
+}
+PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning);
+
+/* Kick vcpu waiting on @lock->head to reach value @ticket */
+static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
+{
+ int cpu;
+
+ add_stats(RELEASED_SLOW, 1);
+ for_each_cpu(cpu, &waiting_cpus) {
+ const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu);
+ if (ACCESS_ONCE(w->lock) == lock &&
+ ACCESS_ONCE(w->want) == ticket) {
+ add_stats(RELEASED_SLOW_KICKED, 1);
+ kvm_kick_cpu(cpu);
+ break;
+ }
+ }
+}
+
+/*
+ * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
+ */
+void __init kvm_spinlock_init(void)
+{
+ if (!kvm_para_available())
+ return;
+ /* Does host kernel support KVM_FEATURE_PV_UNHALT? */
+ if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
+ return;
+
+ printk(KERN_INFO "KVM setup paravirtual spinlock\n");
+
+ static_key_slow_inc(&paravirt_ticketlocks_enabled);
+
+ pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning);
+ pv_lock_ops.unlock_kick = kvm_unlock_kick;
+}
+#endif /* CONFIG_PARAVIRT_SPINLOCKS */
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 47ebb1dbfbcb..7123b5df479d 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -145,10 +145,9 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
return 0;
}
-static unsigned int verify_patch_size(int cpu, u32 patch_size,
+static unsigned int verify_patch_size(u8 family, u32 patch_size,
unsigned int size)
{
- struct cpuinfo_x86 *c = &cpu_data(cpu);
u32 max_size;
#define F1XH_MPB_MAX_SIZE 2048
@@ -156,7 +155,7 @@ static unsigned int verify_patch_size(int cpu, u32 patch_size,
#define F15H_MPB_MAX_SIZE 4096
#define F16H_MPB_MAX_SIZE 3458
- switch (c->x86) {
+ switch (family) {
case 0x14:
max_size = F14H_MPB_MAX_SIZE;
break;
@@ -220,12 +219,13 @@ int apply_microcode_amd(int cpu)
return 0;
}
- if (__apply_microcode_amd(mc_amd))
+ if (__apply_microcode_amd(mc_amd)) {
pr_err("CPU%d: update failed for patch_level=0x%08x\n",
cpu, mc_amd->hdr.patch_id);
- else
- pr_info("CPU%d: new patch_level=0x%08x\n", cpu,
- mc_amd->hdr.patch_id);
+ return -1;
+ }
+ pr_info("CPU%d: new patch_level=0x%08x\n", cpu,
+ mc_amd->hdr.patch_id);
uci->cpu_sig.rev = mc_amd->hdr.patch_id;
c->microcode = mc_amd->hdr.patch_id;
@@ -276,9 +276,8 @@ static void cleanup(void)
* driver cannot continue functioning normally. In such cases, we tear
* down everything we've used up so far and exit.
*/
-static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover)
+static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover)
{
- struct cpuinfo_x86 *c = &cpu_data(cpu);
struct microcode_header_amd *mc_hdr;
struct ucode_patch *patch;
unsigned int patch_size, crnt_size, ret;
@@ -298,7 +297,7 @@ static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover)
/* check if patch is for the current family */
proc_fam = ((proc_fam >> 8) & 0xf) + ((proc_fam >> 20) & 0xff);
- if (proc_fam != c->x86)
+ if (proc_fam != family)
return crnt_size;
if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
@@ -307,7 +306,7 @@ static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover)
return crnt_size;
}
- ret = verify_patch_size(cpu, patch_size, leftover);
+ ret = verify_patch_size(family, patch_size, leftover);
if (!ret) {
pr_err("Patch-ID 0x%08x: size mismatch.\n", mc_hdr->patch_id);
return crnt_size;
@@ -338,7 +337,8 @@ static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover)
return crnt_size;
}
-static enum ucode_state __load_microcode_amd(int cpu, const u8 *data, size_t size)
+static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
+ size_t size)
{
enum ucode_state ret = UCODE_ERROR;
unsigned int leftover;
@@ -361,7 +361,7 @@ static enum ucode_state __load_microcode_amd(int cpu, const u8 *data, size_t siz
}
while (leftover) {
- crnt_size = verify_and_add_patch(cpu, fw, leftover);
+ crnt_size = verify_and_add_patch(family, fw, leftover);
if (crnt_size < 0)
return ret;
@@ -372,22 +372,22 @@ static enum ucode_state __load_microcode_amd(int cpu, const u8 *data, size_t siz
return UCODE_OK;
}
-enum ucode_state load_microcode_amd(int cpu, const u8 *data, size_t size)
+enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size)
{
enum ucode_state ret;
/* free old equiv table */
free_equiv_cpu_table();
- ret = __load_microcode_amd(cpu, data, size);
+ ret = __load_microcode_amd(family, data, size);
if (ret != UCODE_OK)
cleanup();
#if defined(CONFIG_MICROCODE_AMD_EARLY) && defined(CONFIG_X86_32)
/* save BSP's matching patch for early load */
- if (cpu_data(cpu).cpu_index == boot_cpu_data.cpu_index) {
- struct ucode_patch *p = find_patch(cpu);
+ if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) {
+ struct ucode_patch *p = find_patch(smp_processor_id());
if (p) {
memset(amd_bsp_mpb, 0, MPB_MAX_SIZE);
memcpy(amd_bsp_mpb, p->data, min_t(u32, ksize(p->data),
@@ -440,7 +440,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
goto fw_release;
}
- ret = load_microcode_amd(cpu, fw->data, fw->size);
+ ret = load_microcode_amd(c->x86, fw->data, fw->size);
fw_release:
release_firmware(fw);
diff --git a/arch/x86/kernel/microcode_amd_early.c b/arch/x86/kernel/microcode_amd_early.c
index 1d14ffee5749..6073104ccaa3 100644
--- a/arch/x86/kernel/microcode_amd_early.c
+++ b/arch/x86/kernel/microcode_amd_early.c
@@ -238,25 +238,17 @@ static void __init collect_cpu_sig_on_bsp(void *arg)
uci->cpu_sig.sig = cpuid_eax(0x00000001);
}
#else
-static void collect_cpu_info_amd_early(struct cpuinfo_x86 *c,
- struct ucode_cpu_info *uci)
+void load_ucode_amd_ap(void)
{
+ unsigned int cpu = smp_processor_id();
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
u32 rev, eax;
rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
eax = cpuid_eax(0x00000001);
- uci->cpu_sig.sig = eax;
uci->cpu_sig.rev = rev;
- c->microcode = rev;
- c->x86 = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
-}
-
-void load_ucode_amd_ap(void)
-{
- unsigned int cpu = smp_processor_id();
-
- collect_cpu_info_amd_early(&cpu_data(cpu), ucode_cpu_info + cpu);
+ uci->cpu_sig.sig = eax;
if (cpu && !ucode_loaded) {
void *ucode;
@@ -265,8 +257,10 @@ void load_ucode_amd_ap(void)
return;
ucode = (void *)(initrd_start + ucode_offset);
- if (load_microcode_amd(0, ucode, ucode_size) != UCODE_OK)
+ eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+ if (load_microcode_amd(eax, ucode, ucode_size) != UCODE_OK)
return;
+
ucode_loaded = true;
}
@@ -278,6 +272,8 @@ int __init save_microcode_in_initrd_amd(void)
{
enum ucode_state ret;
void *ucode;
+ u32 eax;
+
#ifdef CONFIG_X86_32
unsigned int bsp = boot_cpu_data.cpu_index;
struct ucode_cpu_info *uci = ucode_cpu_info + bsp;
@@ -293,7 +289,10 @@ int __init save_microcode_in_initrd_amd(void)
return 0;
ucode = (void *)(initrd_start + ucode_offset);
- ret = load_microcode_amd(0, ucode, ucode_size);
+ eax = cpuid_eax(0x00000001);
+ eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+
+ ret = load_microcode_amd(eax, ucode, ucode_size);
if (ret != UCODE_OK)
return -EINVAL;
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 676b8c77a976..bbb6c7316341 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -4,25 +4,17 @@
*/
#include <linux/spinlock.h>
#include <linux/module.h>
+#include <linux/jump_label.h>
#include <asm/paravirt.h>
-static inline void
-default_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
-{
- arch_spin_lock(lock);
-}
-
struct pv_lock_ops pv_lock_ops = {
#ifdef CONFIG_SMP
- .spin_is_locked = __ticket_spin_is_locked,
- .spin_is_contended = __ticket_spin_is_contended,
-
- .spin_lock = __ticket_spin_lock,
- .spin_lock_flags = default_spin_lock_flags,
- .spin_trylock = __ticket_spin_trylock,
- .spin_unlock = __ticket_spin_unlock,
+ .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop),
+ .unlock_kick = paravirt_nop,
#endif
};
EXPORT_SYMBOL(pv_lock_ops);
+struct static_key paravirt_ticketlocks_enabled = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL(paravirt_ticketlocks_enabled);
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 2cb9470ea85b..a16bae3f83b3 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -128,46 +128,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}
-static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
-
-static struct pvclock_vsyscall_time_info *
-pvclock_get_vsyscall_user_time_info(int cpu)
-{
- if (!pvclock_vdso_info) {
- BUG();
- return NULL;
- }
-
- return &pvclock_vdso_info[cpu];
-}
-
-struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
-{
- return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
-}
-
#ifdef CONFIG_X86_64
-static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
- void *v)
-{
- struct task_migration_notifier *mn = v;
- struct pvclock_vsyscall_time_info *pvti;
-
- pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
-
- /* this is NULL when pvclock vsyscall is not initialized */
- if (unlikely(pvti == NULL))
- return NOTIFY_DONE;
-
- pvti->migrate_count++;
-
- return NOTIFY_DONE;
-}
-
-static struct notifier_block pvclock_migrate = {
- .notifier_call = pvclock_task_migrate,
-};
-
/*
* Initialize the generic pvclock vsyscall state. This will allocate
* a/some page(s) for the per-vcpu pvclock information, set up a
@@ -181,17 +142,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
- pvclock_vdso_info = i;
-
for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
__set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
__pa(i) + (idx*PAGE_SIZE),
PAGE_KERNEL_VVAR);
}
-
- register_task_migration_notifier(&pvclock_migrate);
-
return 0;
}
#endif
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index dfa55afccf5e..f0de6294b955 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -426,25 +426,23 @@ static void __init reserve_initrd(void)
static void __init parse_setup_data(void)
{
struct setup_data *data;
- u64 pa_data;
+ u64 pa_data, pa_next;
pa_data = boot_params.hdr.setup_data;
while (pa_data) {
- u32 data_len, map_len;
+ u32 data_len, map_len, data_type;
map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK),
(u64)sizeof(struct setup_data));
data = early_memremap(pa_data, map_len);
data_len = data->len + sizeof(struct setup_data);
- if (data_len > map_len) {
- early_iounmap(data, map_len);
- data = early_memremap(pa_data, data_len);
- map_len = data_len;
- }
+ data_type = data->type;
+ pa_next = data->next;
+ early_iounmap(data, map_len);
- switch (data->type) {
+ switch (data_type) {
case SETUP_E820_EXT:
- parse_e820_ext(data);
+ parse_e820_ext(pa_data, data_len);
break;
case SETUP_DTB:
add_dtb(pa_data);
@@ -452,8 +450,7 @@ static void __init parse_setup_data(void)
default:
break;
}
- pa_data = data->next;
- early_iounmap(data, map_len);
+ pa_data = pa_next;
}
}
@@ -1070,7 +1067,7 @@ void __init setup_arch(char **cmdline_p)
cleanup_highmap();
- memblock.current_limit = ISA_END_ADDRESS;
+ memblock_set_current_limit(ISA_END_ADDRESS);
memblock_x86_fill();
/*
@@ -1103,7 +1100,7 @@ void __init setup_arch(char **cmdline_p)
setup_real_mode();
- memblock.current_limit = get_max_mapped();
+ memblock_set_current_limit(get_max_mapped());
dma_contiguous_reserve(0);
/*
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 6a9acc667d20..9e5de6813e1f 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -358,7 +358,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
else
put_user_ex(0, &frame->uc.uc_flags);
put_user_ex(0, &frame->uc.uc_link);
- err |= __save_altstack(&frame->uc.uc_stack, regs->sp);
+ save_altstack_ex(&frame->uc.uc_stack, regs->sp);
/* Set up to return from userspace. */
restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
@@ -423,7 +423,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
else
put_user_ex(0, &frame->uc.uc_flags);
put_user_ex(0, &frame->uc.uc_link);
- err |= __save_altstack(&frame->uc.uc_stack, regs->sp);
+ save_altstack_ex(&frame->uc.uc_stack, regs->sp);
/* Set up to return from userspace. If provided, use a stub
already in userspace. */
@@ -490,7 +490,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig,
else
put_user_ex(0, &frame->uc.uc_flags);
put_user_ex(0, &frame->uc.uc_link);
- err |= __compat_save_altstack(&frame->uc.uc_stack, regs->sp);
+ compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp);
put_user_ex(0, &frame->uc.uc__pad0);
if (ksig->ka.sa.sa_flags & SA_RESTORER) {
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index dbded5aedb81..30277e27431a 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -101,7 +101,7 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
*begin = new_begin;
}
} else {
- *begin = TASK_UNMAPPED_BASE;
+ *begin = current->mm->mmap_legacy_base;
*end = TASK_SIZE;
}
}
diff --git a/arch/x86/kernel/sysfb.c b/arch/x86/kernel/sysfb.c
new file mode 100644
index 000000000000..193ec2ce46c7
--- /dev/null
+++ b/arch/x86/kernel/sysfb.c
@@ -0,0 +1,74 @@
+/*
+ * Generic System Framebuffers on x86
+ * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+/*
+ * Simple-Framebuffer support for x86 systems
+ * Create a platform-device for any available boot framebuffer. The
+ * simple-framebuffer platform device is already available on DT systems, so
+ * this module parses the global "screen_info" object and creates a suitable
+ * platform device compatible with the "simple-framebuffer" DT object. If
+ * the framebuffer is incompatible, we instead create a legacy
+ * "vesa-framebuffer", "efi-framebuffer" or "platform-framebuffer" device and
+ * pass the screen_info as platform_data. This allows legacy drivers
+ * to pick these devices up without messing with simple-framebuffer drivers.
+ * The global "screen_info" is still valid at all times.
+ *
+ * If CONFIG_X86_SYSFB is not selected, we never register "simple-framebuffer"
+ * platform devices, but only use legacy framebuffer devices for
+ * backwards compatibility.
+ *
+ * TODO: We set the dev_id field of all platform-devices to 0. This allows
+ * other x86 OF/DT parsers to create such devices, too. However, they must
+ * start at offset 1 for this to work.
+ */
+
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/platform_data/simplefb.h>
+#include <linux/platform_device.h>
+#include <linux/screen_info.h>
+#include <asm/sysfb.h>
+
+static __init int sysfb_init(void)
+{
+ struct screen_info *si = &screen_info;
+ struct simplefb_platform_data mode;
+ struct platform_device *pd;
+ const char *name;
+ bool compatible;
+ int ret;
+
+ sysfb_apply_efi_quirks();
+
+ /* try to create a simple-framebuffer device */
+ compatible = parse_mode(si, &mode);
+ if (compatible) {
+ ret = create_simplefb(si, &mode);
+ if (!ret)
+ return 0;
+ }
+
+ /* if the FB is incompatible, create a legacy framebuffer device */
+ if (si->orig_video_isVGA == VIDEO_TYPE_EFI)
+ name = "efi-framebuffer";
+ else if (si->orig_video_isVGA == VIDEO_TYPE_VLFB)
+ name = "vesa-framebuffer";
+ else
+ name = "platform-framebuffer";
+
+ pd = platform_device_register_resndata(NULL, name, 0,
+ NULL, 0, si, sizeof(*si));
+ return IS_ERR(pd) ? PTR_ERR(pd) : 0;
+}
+
+/* must execute after PCI subsystem for EFI quirks */
+device_initcall(sysfb_init);
diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c
new file mode 100644
index 000000000000..b285d4e8c68e
--- /dev/null
+++ b/arch/x86/kernel/sysfb_efi.c
@@ -0,0 +1,214 @@
+/*
+ * Generic System Framebuffers on x86
+ * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
+ *
+ * EFI Quirks Copyright (c) 2006 Edgar Hucek <gimli@dark-green.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+/*
+ * EFI Quirks
+ * Several EFI systems do not correctly advertise their boot framebuffers.
+ * Hence, we use this static table of known broken machines and fix up the
+ * information so framebuffer drivers can load corectly.
+ */
+
+#include <linux/dmi.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/pci.h>
+#include <linux/screen_info.h>
+#include <video/vga.h>
+#include <asm/sysfb.h>
+
+enum {
+ OVERRIDE_NONE = 0x0,
+ OVERRIDE_BASE = 0x1,
+ OVERRIDE_STRIDE = 0x2,
+ OVERRIDE_HEIGHT = 0x4,
+ OVERRIDE_WIDTH = 0x8,
+};
+
+struct efifb_dmi_info efifb_dmi_list[] = {
+ [M_I17] = { "i17", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_I20] = { "i20", 0x80010000, 1728 * 4, 1680, 1050, OVERRIDE_NONE }, /* guess */
+ [M_I20_SR] = { "imac7", 0x40010000, 1728 * 4, 1680, 1050, OVERRIDE_NONE },
+ [M_I24] = { "i24", 0x80010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, /* guess */
+ [M_I24_8_1] = { "imac8", 0xc0060000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
+ [M_I24_10_1] = { "imac10", 0xc0010000, 2048 * 4, 1920, 1080, OVERRIDE_NONE },
+ [M_I27_11_1] = { "imac11", 0xc0010000, 2560 * 4, 2560, 1440, OVERRIDE_NONE },
+ [M_MINI]= { "mini", 0x80000000, 2048 * 4, 1024, 768, OVERRIDE_NONE },
+ [M_MINI_3_1] = { "mini31", 0x40010000, 1024 * 4, 1024, 768, OVERRIDE_NONE },
+ [M_MINI_4_1] = { "mini41", 0xc0010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
+ [M_MB] = { "macbook", 0x80000000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+ [M_MB_5_1] = { "macbook51", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+ [M_MB_6_1] = { "macbook61", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+ [M_MB_7_1] = { "macbook71", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+ [M_MBA] = { "mba", 0x80000000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+ /* 11" Macbook Air 3,1 passes the wrong stride */
+ [M_MBA_3] = { "mba3", 0, 2048 * 4, 0, 0, OVERRIDE_STRIDE },
+ [M_MBP] = { "mbp", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_MBP_2] = { "mbp2", 0, 0, 0, 0, OVERRIDE_NONE }, /* placeholder */
+ [M_MBP_2_2] = { "mbp22", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_MBP_SR] = { "mbp3", 0x80030000, 2048 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_MBP_4] = { "mbp4", 0xc0060000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
+ [M_MBP_5_1] = { "mbp51", 0xc0010000, 2048 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_MBP_5_2] = { "mbp52", 0xc0010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
+ [M_MBP_5_3] = { "mbp53", 0xd0010000, 2048 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_MBP_6_1] = { "mbp61", 0x90030000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
+ [M_MBP_6_2] = { "mbp62", 0x90030000, 2048 * 4, 1680, 1050, OVERRIDE_NONE },
+ [M_MBP_7_1] = { "mbp71", 0xc0010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+ [M_MBP_8_2] = { "mbp82", 0x90010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_UNKNOWN] = { NULL, 0, 0, 0, 0, OVERRIDE_NONE }
+};
+
+#define choose_value(dmivalue, fwvalue, field, flags) ({ \
+ typeof(fwvalue) _ret_ = fwvalue; \
+ if ((flags) & (field)) \
+ _ret_ = dmivalue; \
+ else if ((fwvalue) == 0) \
+ _ret_ = dmivalue; \
+ _ret_; \
+ })
+
+static int __init efifb_set_system(const struct dmi_system_id *id)
+{
+ struct efifb_dmi_info *info = id->driver_data;
+
+ if (info->base == 0 && info->height == 0 && info->width == 0 &&
+ info->stride == 0)
+ return 0;
+
+ /* Trust the bootloader over the DMI tables */
+ if (screen_info.lfb_base == 0) {
+#if defined(CONFIG_PCI)
+ struct pci_dev *dev = NULL;
+ int found_bar = 0;
+#endif
+ if (info->base) {
+ screen_info.lfb_base = choose_value(info->base,
+ screen_info.lfb_base, OVERRIDE_BASE,
+ info->flags);
+
+#if defined(CONFIG_PCI)
+ /* make sure that the address in the table is actually
+ * on a VGA device's PCI BAR */
+
+ for_each_pci_dev(dev) {
+ int i;
+ if ((dev->class >> 8) != PCI_CLASS_DISPLAY_VGA)
+ continue;
+ for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
+ resource_size_t start, end;
+
+ start = pci_resource_start(dev, i);
+ if (start == 0)
+ break;
+ end = pci_resource_end(dev, i);
+ if (screen_info.lfb_base >= start &&
+ screen_info.lfb_base < end) {
+ found_bar = 1;
+ }
+ }
+ }
+ if (!found_bar)
+ screen_info.lfb_base = 0;
+#endif
+ }
+ }
+ if (screen_info.lfb_base) {
+ screen_info.lfb_linelength = choose_value(info->stride,
+ screen_info.lfb_linelength, OVERRIDE_STRIDE,
+ info->flags);
+ screen_info.lfb_width = choose_value(info->width,
+ screen_info.lfb_width, OVERRIDE_WIDTH,
+ info->flags);
+ screen_info.lfb_height = choose_value(info->height,
+ screen_info.lfb_height, OVERRIDE_HEIGHT,
+ info->flags);
+ if (screen_info.orig_video_isVGA == 0)
+ screen_info.orig_video_isVGA = VIDEO_TYPE_EFI;
+ } else {
+ screen_info.lfb_linelength = 0;
+ screen_info.lfb_width = 0;
+ screen_info.lfb_height = 0;
+ screen_info.orig_video_isVGA = 0;
+ return 0;
+ }
+
+ printk(KERN_INFO "efifb: dmi detected %s - framebuffer at 0x%08x "
+ "(%dx%d, stride %d)\n", id->ident,
+ screen_info.lfb_base, screen_info.lfb_width,
+ screen_info.lfb_height, screen_info.lfb_linelength);
+
+ return 1;
+}
+
+#define EFIFB_DMI_SYSTEM_ID(vendor, name, enumid) \
+ { \
+ efifb_set_system, \
+ name, \
+ { \
+ DMI_MATCH(DMI_BIOS_VENDOR, vendor), \
+ DMI_MATCH(DMI_PRODUCT_NAME, name) \
+ }, \
+ &efifb_dmi_list[enumid] \
+ }
+
+static const struct dmi_system_id efifb_dmi_system_table[] __initconst = {
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac4,1", M_I17),
+ /* At least one of these two will be right; maybe both? */
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac5,1", M_I20),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac5,1", M_I20),
+ /* At least one of these two will be right; maybe both? */
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac6,1", M_I24),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac6,1", M_I24),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac7,1", M_I20_SR),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac8,1", M_I24_8_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac10,1", M_I24_10_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac11,1", M_I27_11_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "Macmini1,1", M_MINI),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "Macmini3,1", M_MINI_3_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "Macmini4,1", M_MINI_4_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook1,1", M_MB),
+ /* At least one of these two will be right; maybe both? */
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook2,1", M_MB),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook2,1", M_MB),
+ /* At least one of these two will be right; maybe both? */
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook3,1", M_MB),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook3,1", M_MB),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook4,1", M_MB),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook5,1", M_MB_5_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook6,1", M_MB_6_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook7,1", M_MB_7_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookAir1,1", M_MBA),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookAir3,1", M_MBA_3),
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro1,1", M_MBP),
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro2,1", M_MBP_2),
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro2,2", M_MBP_2_2),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro2,1", M_MBP_2),
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro3,1", M_MBP_SR),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro3,1", M_MBP_SR),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro4,1", M_MBP_4),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,1", M_MBP_5_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,2", M_MBP_5_2),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,3", M_MBP_5_3),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro6,1", M_MBP_6_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro6,2", M_MBP_6_2),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro7,1", M_MBP_7_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro8,2", M_MBP_8_2),
+ {},
+};
+
+__init void sysfb_apply_efi_quirks(void)
+{
+ if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI ||
+ !(screen_info.capabilities & VIDEO_CAPABILITY_SKIP_QUIRKS))
+ dmi_check_system(efifb_dmi_system_table);
+}
diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c
new file mode 100644
index 000000000000..22513e96b012
--- /dev/null
+++ b/arch/x86/kernel/sysfb_simplefb.c
@@ -0,0 +1,95 @@
+/*
+ * Generic System Framebuffers on x86
+ * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+/*
+ * simple-framebuffer probing
+ * Try to convert "screen_info" into a "simple-framebuffer" compatible mode.
+ * If the mode is incompatible, we return "false" and let the caller create
+ * legacy nodes instead.
+ */
+
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/platform_data/simplefb.h>
+#include <linux/platform_device.h>
+#include <linux/screen_info.h>
+#include <asm/sysfb.h>
+
+static const char simplefb_resname[] = "BOOTFB";
+static const struct simplefb_format formats[] = SIMPLEFB_FORMATS;
+
+/* try parsing x86 screen_info into a simple-framebuffer mode struct */
+__init bool parse_mode(const struct screen_info *si,
+ struct simplefb_platform_data *mode)
+{
+ const struct simplefb_format *f;
+ __u8 type;
+ unsigned int i;
+
+ type = si->orig_video_isVGA;
+ if (type != VIDEO_TYPE_VLFB && type != VIDEO_TYPE_EFI)
+ return false;
+
+ for (i = 0; i < ARRAY_SIZE(formats); ++i) {
+ f = &formats[i];
+ if (si->lfb_depth == f->bits_per_pixel &&
+ si->red_size == f->red.length &&
+ si->red_pos == f->red.offset &&
+ si->green_size == f->green.length &&
+ si->green_pos == f->green.offset &&
+ si->blue_size == f->blue.length &&
+ si->blue_pos == f->blue.offset &&
+ si->rsvd_size == f->transp.length &&
+ si->rsvd_pos == f->transp.offset) {
+ mode->format = f->name;
+ mode->width = si->lfb_width;
+ mode->height = si->lfb_height;
+ mode->stride = si->lfb_linelength;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+__init int create_simplefb(const struct screen_info *si,
+ const struct simplefb_platform_data *mode)
+{
+ struct platform_device *pd;
+ struct resource res;
+ unsigned long len;
+
+ /* don't use lfb_size as it may contain the whole VMEM instead of only
+ * the part that is occupied by the framebuffer */
+ len = mode->height * mode->stride;
+ len = PAGE_ALIGN(len);
+ if (len > si->lfb_size << 16) {
+ printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n");
+ return -EINVAL;
+ }
+
+ /* setup IORESOURCE_MEM as framebuffer memory */
+ memset(&res, 0, sizeof(res));
+ res.flags = IORESOURCE_MEM;
+ res.name = simplefb_resname;
+ res.start = si->lfb_base;
+ res.end = si->lfb_base + len - 1;
+ if (res.end <= res.start)
+ return -EINVAL;
+
+ pd = platform_device_register_resndata(NULL, "simple-framebuffer", 0,
+ &res, 1, mode, sizeof(*mode));
+ if (IS_ERR(pd))
+ return PTR_ERR(pd);
+
+ return 0;
+}
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index addf7b58f4e8..91a4496db434 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -301,6 +301,15 @@ static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
return 0;
}
+static int tboot_extended_sleep(u8 sleep_state, u32 val_a, u32 val_b)
+{
+ if (!tboot_enabled())
+ return 0;
+
+ pr_warning("tboot is not able to suspend on platforms with reduced hardware sleep (ACPIv5)");
+ return -ENODEV;
+}
+
static atomic_t ap_wfs_count;
static int tboot_wait_for_aps(int num_aps)
@@ -422,6 +431,7 @@ static __init int tboot_late_init(void)
#endif
acpi_os_set_prepare_sleep(&tboot_sleep);
+ acpi_os_set_prepare_extended_sleep(&tboot_extended_sleep);
return 0;
}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1b23a1c92746..8c8093b146ca 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -58,6 +58,7 @@
#include <asm/mce.h>
#include <asm/fixmap.h>
#include <asm/mach_traps.h>
+#include <asm/alternative.h>
#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
@@ -327,6 +328,9 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co
ftrace_int3_handler(regs))
return;
#endif
+ if (poke_int3_handler(regs))
+ return;
+
prev_state = exception_enter();
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 6ff49247edf8..930e5d48f560 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -89,6 +89,12 @@ int check_tsc_unstable(void)
}
EXPORT_SYMBOL_GPL(check_tsc_unstable);
+int check_tsc_disabled(void)
+{
+ return tsc_disabled;
+}
+EXPORT_SYMBOL_GPL(check_tsc_disabled);
+
#ifdef CONFIG_X86_TSC
int __init notsc_setup(char *str)
{
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 5f24c71accaa..8ce0072cd700 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -107,6 +107,8 @@ struct x86_platform_ops x86_platform = {
};
EXPORT_SYMBOL_GPL(x86_platform);
+
+#if defined(CONFIG_PCI_MSI)
struct x86_msi_ops x86_msi = {
.setup_msi_irqs = native_setup_msi_irqs,
.compose_msi_msg = native_compose_msi_msg,
@@ -116,6 +118,28 @@ struct x86_msi_ops x86_msi = {
.setup_hpet_msi = default_setup_hpet_msi,
};
+/* MSI arch specific hooks */
+int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+ return x86_msi.setup_msi_irqs(dev, nvec, type);
+}
+
+void arch_teardown_msi_irqs(struct pci_dev *dev)
+{
+ x86_msi.teardown_msi_irqs(dev);
+}
+
+void arch_teardown_msi_irq(unsigned int irq)
+{
+ x86_msi.teardown_msi_irq(irq);
+}
+
+void arch_restore_msi_irqs(struct pci_dev *dev, int irq)
+{
+ x86_msi.restore_msi_irqs(dev, irq);
+}
+#endif
+
struct x86_io_apic_ops x86_io_apic_ops = {
.init = native_io_apic_init_mappings,
.read = native_io_apic_read,
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index a20ecb5b6cbf..b110fe6c03d4 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -413,7 +413,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
(1 << KVM_FEATURE_CLOCKSOURCE2) |
(1 << KVM_FEATURE_ASYNC_PF) |
(1 << KVM_FEATURE_PV_EOI) |
- (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
+ (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
+ (1 << KVM_FEATURE_PV_UNHALT);
if (sched_info_on())
entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 2bc1e81045b0..ddc3f3d2afdb 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2025,6 +2025,17 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
return rc;
}
+static int em_ret_far_imm(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+
+ rc = em_ret_far(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rsp_increment(ctxt, ctxt->src.val);
+ return X86EMUL_CONTINUE;
+}
+
static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
{
/* Save real source value, then compare EAX against destination. */
@@ -3763,7 +3774,8 @@ static const struct opcode opcode_table[256] = {
G(ByteOp, group11), G(0, group11),
/* 0xC8 - 0xCF */
I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave),
- N, I(ImplicitOps | Stack, em_ret_far),
+ I(ImplicitOps | Stack | SrcImmU16, em_ret_far_imm),
+ I(ImplicitOps | Stack, em_ret_far),
D(ImplicitOps), DI(SrcImmByte, intn),
D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
/* 0xD0 - 0xD7 */
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index afc11245827c..5439117d5c4c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -79,16 +79,6 @@ static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
*((u32 *) (apic->regs + reg_off)) = val;
}
-static inline int apic_test_and_set_vector(int vec, void *bitmap)
-{
- return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline int apic_test_and_clear_vector(int vec, void *bitmap)
-{
- return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
static inline int apic_test_vector(int vec, void *bitmap)
{
return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -331,10 +321,10 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
}
EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
-static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
+static inline void apic_set_irr(int vec, struct kvm_lapic *apic)
{
apic->irr_pending = true;
- return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
+ apic_set_vector(vec, apic->regs + APIC_IRR);
}
static inline int apic_search_irr(struct kvm_lapic *apic)
@@ -681,32 +671,28 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
if (unlikely(!apic_enabled(apic)))
break;
+ result = 1;
+
if (dest_map)
__set_bit(vcpu->vcpu_id, dest_map);
- if (kvm_x86_ops->deliver_posted_interrupt) {
- result = 1;
+ if (kvm_x86_ops->deliver_posted_interrupt)
kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
- } else {
- result = !apic_test_and_set_irr(vector, apic);
-
- if (!result) {
- if (trig_mode)
- apic_debug("level trig mode repeatedly "
- "for vector %d", vector);
- goto out;
- }
+ else {
+ apic_set_irr(vector, apic);
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
}
-out:
trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
- trig_mode, vector, !result);
+ trig_mode, vector, false);
break;
case APIC_DM_REMRD:
- apic_debug("Ignoring delivery mode 3\n");
+ result = 1;
+ vcpu->arch.pv.pv_unhalted = 1;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
break;
case APIC_DM_SMI:
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9e9285ae9b94..dce0df8150df 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -132,8 +132,8 @@ module_param(dbg, bool, 0644);
(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
* PT32_LEVEL_BITS))) - 1))
-#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
- | PT64_NX_MASK)
+#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
+ | shadow_x_mask | shadow_nx_mask)
#define ACC_EXEC_MASK 1
#define ACC_WRITE_MASK PT_WRITABLE_MASK
@@ -331,11 +331,6 @@ static int is_large_pte(u64 pte)
return pte & PT_PAGE_SIZE_MASK;
}
-static int is_dirty_gpte(unsigned long pte)
-{
- return pte & PT_DIRTY_MASK;
-}
-
static int is_rmap_spte(u64 pte)
{
return is_shadow_present_pte(pte);
@@ -2052,12 +2047,18 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
return __shadow_walk_next(iterator, *iterator->sptep);
}
-static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
+static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed)
{
u64 spte;
+ BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK ||
+ VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
+
spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
- shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
+ shadow_user_mask | shadow_x_mask;
+
+ if (accessed)
+ spte |= shadow_accessed_mask;
mmu_spte_set(sptep, spte);
}
@@ -2574,14 +2575,6 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
mmu_free_roots(vcpu);
}
-static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
-{
- int bit7;
-
- bit7 = (gpte >> 7) & 1;
- return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
-}
-
static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
bool no_dirty_log)
{
@@ -2594,26 +2587,6 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
return gfn_to_pfn_memslot_atomic(slot, gfn);
}
-static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *spte,
- u64 gpte)
-{
- if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
- goto no_present;
-
- if (!is_present_gpte(gpte))
- goto no_present;
-
- if (!(gpte & PT_ACCESSED_MASK))
- goto no_present;
-
- return false;
-
-no_present:
- drop_spte(vcpu->kvm, spte);
- return true;
-}
-
static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp,
u64 *start, u64 *end)
@@ -2710,7 +2683,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
iterator.level - 1,
1, ACC_ALL, iterator.sptep);
- link_shadow_page(iterator.sptep, sp);
+ link_shadow_page(iterator.sptep, sp, true);
}
}
return emulate;
@@ -2808,7 +2781,7 @@ exit:
return ret;
}
-static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code)
+static bool page_fault_can_be_fast(u32 error_code)
{
/*
* Do not fix the mmio spte with invalid generation number which
@@ -2861,7 +2834,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
bool ret = false;
u64 spte = 0ull;
- if (!page_fault_can_be_fast(vcpu, error_code))
+ if (!page_fault_can_be_fast(error_code))
return false;
walk_shadow_page_lockless_begin(vcpu);
@@ -3209,6 +3182,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
mmu_sync_roots(vcpu);
spin_unlock(&vcpu->kvm->mmu_lock);
}
+EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
u32 access, struct x86_exception *exception)
@@ -3478,6 +3452,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
++vcpu->stat.tlb_flush;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
+EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb);
static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
@@ -3501,18 +3476,6 @@ static void paging_free(struct kvm_vcpu *vcpu)
nonpaging_free(vcpu);
}
-static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
-{
- unsigned mask;
-
- BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
-
- mask = (unsigned)~ACC_WRITE_MASK;
- /* Allow write access to dirty gptes */
- mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK;
- *access &= mask;
-}
-
static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
unsigned access, int *nr_present)
{
@@ -3530,16 +3493,6 @@ static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
return false;
}
-static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
-{
- unsigned access;
-
- access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
- access &= ~(gpte >> PT64_NX_SHIFT);
-
- return access;
-}
-
static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
{
unsigned index;
@@ -3549,6 +3502,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp
return mmu->last_pte_bitmap & (1 << index);
}
+#define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE PTTYPE_EPT
+#include "paging_tmpl.h"
+#undef PTTYPE
+
#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE
@@ -3563,6 +3521,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
int maxphyaddr = cpuid_maxphyaddr(vcpu);
u64 exb_bit_rsvd = 0;
+ context->bad_mt_xwr = 0;
+
if (!context->nx)
exb_bit_rsvd = rsvd_bits(63, 63);
switch (context->root_level) {
@@ -3618,7 +3578,40 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
}
}
-static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context, bool execonly)
+{
+ int maxphyaddr = cpuid_maxphyaddr(vcpu);
+ int pte;
+
+ context->rsvd_bits_mask[0][3] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
+ context->rsvd_bits_mask[0][2] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
+ context->rsvd_bits_mask[0][1] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
+ context->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
+
+ /* large page */
+ context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
+ context->rsvd_bits_mask[1][2] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
+ context->rsvd_bits_mask[1][1] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
+ context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
+
+ for (pte = 0; pte < 64; pte++) {
+ int rwx_bits = pte & 7;
+ int mt = pte >> 3;
+ if (mt == 0x2 || mt == 0x3 || mt == 0x7 ||
+ rwx_bits == 0x2 || rwx_bits == 0x6 ||
+ (rwx_bits == 0x4 && !execonly))
+ context->bad_mt_xwr |= (1ull << pte);
+ }
+}
+
+static void update_permission_bitmask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu, bool ept)
{
unsigned bit, byte, pfec;
u8 map;
@@ -3636,12 +3629,16 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu
w = bit & ACC_WRITE_MASK;
u = bit & ACC_USER_MASK;
- /* Not really needed: !nx will cause pte.nx to fault */
- x |= !mmu->nx;
- /* Allow supervisor writes if !cr0.wp */
- w |= !is_write_protection(vcpu) && !uf;
- /* Disallow supervisor fetches of user code if cr4.smep */
- x &= !(smep && u && !uf);
+ if (!ept) {
+ /* Not really needed: !nx will cause pte.nx to fault */
+ x |= !mmu->nx;
+ /* Allow supervisor writes if !cr0.wp */
+ w |= !is_write_protection(vcpu) && !uf;
+ /* Disallow supervisor fetches of user code if cr4.smep */
+ x &= !(smep && u && !uf);
+ } else
+ /* Not really needed: no U/S accesses on ept */
+ u = 1;
fault = (ff && !x) || (uf && !u) || (wf && !w);
map |= fault << bit;
@@ -3676,7 +3673,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
context->root_level = level;
reset_rsvds_bits_mask(vcpu, context);
- update_permission_bitmask(vcpu, context);
+ update_permission_bitmask(vcpu, context, false);
update_last_pte_bitmap(vcpu, context);
ASSERT(is_pae(vcpu));
@@ -3706,7 +3703,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
context->root_level = PT32_ROOT_LEVEL;
reset_rsvds_bits_mask(vcpu, context);
- update_permission_bitmask(vcpu, context);
+ update_permission_bitmask(vcpu, context, false);
update_last_pte_bitmap(vcpu, context);
context->new_cr3 = paging_new_cr3;
@@ -3768,7 +3765,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->gva_to_gpa = paging32_gva_to_gpa;
}
- update_permission_bitmask(vcpu, context);
+ update_permission_bitmask(vcpu, context, false);
update_last_pte_bitmap(vcpu, context);
return 0;
@@ -3800,6 +3797,33 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
+int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
+ bool execonly)
+{
+ ASSERT(vcpu);
+ ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+ context->shadow_root_level = kvm_x86_ops->get_tdp_level();
+
+ context->nx = true;
+ context->new_cr3 = paging_new_cr3;
+ context->page_fault = ept_page_fault;
+ context->gva_to_gpa = ept_gva_to_gpa;
+ context->sync_page = ept_sync_page;
+ context->invlpg = ept_invlpg;
+ context->update_pte = ept_update_pte;
+ context->free = paging_free;
+ context->root_level = context->shadow_root_level;
+ context->root_hpa = INVALID_PAGE;
+ context->direct_map = false;
+
+ update_permission_bitmask(vcpu, context, true);
+ reset_rsvds_bits_mask_ept(vcpu, context, execonly);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
+
static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
{
int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
@@ -3847,7 +3871,7 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
}
- update_permission_bitmask(vcpu, g_context);
+ update_permission_bitmask(vcpu, g_context, false);
update_last_pte_bitmap(vcpu, g_context);
return 0;
@@ -3923,8 +3947,8 @@ static bool need_remote_flush(u64 old, u64 new)
return true;
if ((old ^ new) & PT64_BASE_ADDR_MASK)
return true;
- old ^= PT64_NX_MASK;
- new ^= PT64_NX_MASK;
+ old ^= shadow_nx_mask;
+ new ^= shadow_nx_mask;
return (old & ~new & PT64_PERM_MASK) != 0;
}
@@ -4182,7 +4206,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
switch (er) {
case EMULATE_DONE:
return 1;
- case EMULATE_DO_MMIO:
+ case EMULATE_USER_EXIT:
++vcpu->stat.mmio_exits;
/* fall through */
case EMULATE_FAIL:
@@ -4390,23 +4414,19 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
/*
* The very rare case: if the generation-number is round,
* zap all shadow pages.
- *
- * The max value is MMIO_MAX_GEN - 1 since it is not called
- * when mark memslot invalid.
*/
- if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) {
+ if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) {
printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
kvm_mmu_invalidate_zap_all_pages(kvm);
}
}
-static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long
+mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
struct kvm *kvm;
int nr_to_scan = sc->nr_to_scan;
-
- if (nr_to_scan == 0)
- goto out;
+ unsigned long freed = 0;
raw_spin_lock(&kvm_lock);
@@ -4441,25 +4461,37 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
goto unlock;
}
- prepare_zap_oldest_mmu_page(kvm, &invalid_list);
+ if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
+ freed++;
kvm_mmu_commit_zap_page(kvm, &invalid_list);
unlock:
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
+ /*
+ * unfair on small ones
+ * per-vm shrinkers cry out
+ * sadness comes quickly
+ */
list_move_tail(&kvm->vm_list, &vm_list);
break;
}
raw_spin_unlock(&kvm_lock);
+ return freed;
-out:
+}
+
+static unsigned long
+mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
}
static struct shrinker mmu_shrinker = {
- .shrink = mmu_shrink,
+ .count_objects = mmu_shrink_count,
+ .scan_objects = mmu_shrink_scan,
.seeks = DEFAULT_SEEKS * 10,
};
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 5b59c573aba7..77e044a0f5f7 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -71,6 +71,8 @@ enum {
int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
+int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
+ bool execonly);
static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
{
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 7769699d48a8..ad75d77999d0 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -23,6 +23,13 @@
* so the code in this file is compiled twice, once per pte size.
*/
+/*
+ * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro
+ * uses for EPT without A/D paging type.
+ */
+extern u64 __pure __using_nonexistent_pte_bit(void)
+ __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT");
+
#if PTTYPE == 64
#define pt_element_t u64
#define guest_walker guest_walker64
@@ -32,6 +39,10 @@
#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_BITS PT64_LEVEL_BITS
+ #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
+ #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
+ #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
+ #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
#ifdef CONFIG_X86_64
#define PT_MAX_FULL_LEVELS 4
#define CMPXCHG cmpxchg
@@ -49,7 +60,26 @@
#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
#define PT_LEVEL_BITS PT32_LEVEL_BITS
#define PT_MAX_FULL_LEVELS 2
+ #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
+ #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
+ #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
+ #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
#define CMPXCHG cmpxchg
+#elif PTTYPE == PTTYPE_EPT
+ #define pt_element_t u64
+ #define guest_walker guest_walkerEPT
+ #define FNAME(name) ept_##name
+ #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+ #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+ #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+ #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+ #define PT_LEVEL_BITS PT64_LEVEL_BITS
+ #define PT_GUEST_ACCESSED_MASK 0
+ #define PT_GUEST_DIRTY_MASK 0
+ #define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit()
+ #define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit()
+ #define CMPXCHG cmpxchg64
+ #define PT_MAX_FULL_LEVELS 4
#else
#error Invalid PTTYPE value
#endif
@@ -69,6 +99,7 @@ struct guest_walker {
pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
+ bool pte_writable[PT_MAX_FULL_LEVELS];
unsigned pt_access;
unsigned pte_access;
gfn_t gfn;
@@ -80,6 +111,40 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
}
+static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
+{
+ unsigned mask;
+
+ /* dirty bit is not supported, so no need to track it */
+ if (!PT_GUEST_DIRTY_MASK)
+ return;
+
+ BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
+
+ mask = (unsigned)~ACC_WRITE_MASK;
+ /* Allow write access to dirty gptes */
+ mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
+ PT_WRITABLE_MASK;
+ *access &= mask;
+}
+
+static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+ int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f;
+
+ return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) |
+ ((mmu->bad_mt_xwr & (1ull << low6)) != 0);
+}
+
+static inline int FNAME(is_present_gpte)(unsigned long pte)
+{
+#if PTTYPE != PTTYPE_EPT
+ return is_present_gpte(pte);
+#else
+ return pte & 7;
+#endif
+}
+
static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
pt_element_t __user *ptep_user, unsigned index,
pt_element_t orig_pte, pt_element_t new_pte)
@@ -103,6 +168,42 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
return (ret != orig_pte);
}
+static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *spte,
+ u64 gpte)
+{
+ if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+ goto no_present;
+
+ if (!FNAME(is_present_gpte)(gpte))
+ goto no_present;
+
+ /* if accessed bit is not supported prefetch non accessed gpte */
+ if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK))
+ goto no_present;
+
+ return false;
+
+no_present:
+ drop_spte(vcpu->kvm, spte);
+ return true;
+}
+
+static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
+{
+ unsigned access;
+#if PTTYPE == PTTYPE_EPT
+ access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
+ ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
+ ACC_USER_MASK;
+#else
+ access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+ access &= ~(gpte >> PT64_NX_SHIFT);
+#endif
+
+ return access;
+}
+
static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
struct kvm_mmu *mmu,
struct guest_walker *walker,
@@ -114,22 +215,43 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
gfn_t table_gfn;
int ret;
+ /* dirty/accessed bits are not supported, so no need to update them */
+ if (!PT_GUEST_DIRTY_MASK)
+ return 0;
+
for (level = walker->max_level; level >= walker->level; --level) {
pte = orig_pte = walker->ptes[level - 1];
table_gfn = walker->table_gfn[level - 1];
ptep_user = walker->ptep_user[level - 1];
index = offset_in_page(ptep_user) / sizeof(pt_element_t);
- if (!(pte & PT_ACCESSED_MASK)) {
+ if (!(pte & PT_GUEST_ACCESSED_MASK)) {
trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
- pte |= PT_ACCESSED_MASK;
+ pte |= PT_GUEST_ACCESSED_MASK;
}
- if (level == walker->level && write_fault && !is_dirty_gpte(pte)) {
+ if (level == walker->level && write_fault &&
+ !(pte & PT_GUEST_DIRTY_MASK)) {
trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
- pte |= PT_DIRTY_MASK;
+ pte |= PT_GUEST_DIRTY_MASK;
}
if (pte == orig_pte)
continue;
+ /*
+ * If the slot is read-only, simply do not process the accessed
+ * and dirty bits. This is the correct thing to do if the slot
+ * is ROM, and page tables in read-as-ROM/write-as-MMIO slots
+ * are only supported if the accessed and dirty bits are already
+ * set in the ROM (so that MMIO writes are never needed).
+ *
+ * Note that NPT does not allow this at all and faults, since
+ * it always wants nested page table entries for the guest
+ * page tables to be writable. And EPT works but will simply
+ * overwrite the read-only memory to set the accessed and dirty
+ * bits.
+ */
+ if (unlikely(!walker->pte_writable[level - 1]))
+ continue;
+
ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte);
if (ret)
return ret;
@@ -170,7 +292,7 @@ retry_walk:
if (walker->level == PT32E_ROOT_LEVEL) {
pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
trace_kvm_mmu_paging_element(pte, walker->level);
- if (!is_present_gpte(pte))
+ if (!FNAME(is_present_gpte)(pte))
goto error;
--walker->level;
}
@@ -179,7 +301,7 @@ retry_walk:
ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
(mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
- accessed_dirty = PT_ACCESSED_MASK;
+ accessed_dirty = PT_GUEST_ACCESSED_MASK;
pt_access = pte_access = ACC_ALL;
++walker->level;
@@ -204,7 +326,8 @@ retry_walk:
goto error;
real_gfn = gpa_to_gfn(real_gfn);
- host_addr = gfn_to_hva(vcpu->kvm, real_gfn);
+ host_addr = gfn_to_hva_prot(vcpu->kvm, real_gfn,
+ &walker->pte_writable[walker->level - 1]);
if (unlikely(kvm_is_error_hva(host_addr)))
goto error;
@@ -215,17 +338,17 @@ retry_walk:
trace_kvm_mmu_paging_element(pte, walker->level);
- if (unlikely(!is_present_gpte(pte)))
+ if (unlikely(!FNAME(is_present_gpte)(pte)))
goto error;
- if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
- walker->level))) {
+ if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte,
+ walker->level))) {
errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
goto error;
}
accessed_dirty &= pte;
- pte_access = pt_access & gpte_access(vcpu, pte);
+ pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
walker->ptes[walker->level - 1] = pte;
} while (!is_last_gpte(mmu, walker->level, pte));
@@ -248,13 +371,15 @@ retry_walk:
walker->gfn = real_gpa >> PAGE_SHIFT;
if (!write_fault)
- protect_clean_gpte(&pte_access, pte);
+ FNAME(protect_clean_gpte)(&pte_access, pte);
else
/*
- * On a write fault, fold the dirty bit into accessed_dirty by
- * shifting it one place right.
+ * On a write fault, fold the dirty bit into accessed_dirty.
+ * For modes without A/D bits support accessed_dirty will be
+ * always clear.
*/
- accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);
+ accessed_dirty &= pte >>
+ (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
if (unlikely(!accessed_dirty)) {
ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
@@ -279,6 +404,25 @@ error:
walker->fault.vector = PF_VECTOR;
walker->fault.error_code_valid = true;
walker->fault.error_code = errcode;
+
+#if PTTYPE == PTTYPE_EPT
+ /*
+ * Use PFERR_RSVD_MASK in error_code to to tell if EPT
+ * misconfiguration requires to be injected. The detection is
+ * done by is_rsvd_bits_set() above.
+ *
+ * We set up the value of exit_qualification to inject:
+ * [2:0] - Derive from [2:0] of real exit_qualification at EPT violation
+ * [5:3] - Calculated by the page walk of the guest EPT page tables
+ * [7:8] - Derived from [7:8] of real exit_qualification
+ *
+ * The other bits are set to 0.
+ */
+ if (!(errcode & PFERR_RSVD_MASK)) {
+ vcpu->arch.exit_qualification &= 0x187;
+ vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3;
+ }
+#endif
walker->fault.address = addr;
walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
@@ -293,6 +437,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
access);
}
+#if PTTYPE != PTTYPE_EPT
static int FNAME(walk_addr_nested)(struct guest_walker *walker,
struct kvm_vcpu *vcpu, gva_t addr,
u32 access)
@@ -300,6 +445,7 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
addr, access);
}
+#endif
static bool
FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -309,14 +455,14 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
gfn_t gfn;
pfn_t pfn;
- if (prefetch_invalid_gpte(vcpu, sp, spte, gpte))
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
return false;
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
gfn = gpte_to_gfn(gpte);
- pte_access = sp->role.access & gpte_access(vcpu, gpte);
- protect_clean_gpte(&pte_access, gpte);
+ pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ FNAME(protect_clean_gpte)(&pte_access, gpte);
pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
no_dirty_log && (pte_access & ACC_WRITE_MASK));
if (is_error_pfn(pfn))
@@ -446,7 +592,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
goto out_gpte_changed;
if (sp)
- link_shadow_page(it.sptep, sp);
+ link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
}
for (;
@@ -466,7 +612,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
true, direct_access, it.sptep);
- link_shadow_page(it.sptep, sp);
+ link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
}
clear_sp_write_flooding_count(it.sptep);
@@ -727,6 +873,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
return gpa;
}
+#if PTTYPE != PTTYPE_EPT
static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
u32 access,
struct x86_exception *exception)
@@ -745,6 +892,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
return gpa;
}
+#endif
/*
* Using the cached information from sp->gfns is safe because:
@@ -785,15 +933,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
sizeof(pt_element_t)))
return -EINVAL;
- if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) {
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
vcpu->kvm->tlbs_dirty++;
continue;
}
gfn = gpte_to_gfn(gpte);
pte_access = sp->role.access;
- pte_access &= gpte_access(vcpu, gpte);
- protect_clean_gpte(&pte_access, gpte);
+ pte_access &= FNAME(gpte_access)(vcpu, gpte);
+ FNAME(protect_clean_gpte)(&pte_access, gpte);
if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access,
&nr_present))
@@ -830,3 +978,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
#undef gpte_to_gfn
#undef gpte_to_gfn_lvl
#undef CMPXCHG
+#undef PT_GUEST_ACCESSED_MASK
+#undef PT_GUEST_DIRTY_MASK
+#undef PT_GUEST_DIRTY_SHIFT
+#undef PT_GUEST_ACCESSED_SHIFT
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index c53e797e7369..5c4f63151b4d 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -160,7 +160,7 @@ static void stop_counter(struct kvm_pmc *pmc)
static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
unsigned config, bool exclude_user, bool exclude_kernel,
- bool intr)
+ bool intr, bool in_tx, bool in_tx_cp)
{
struct perf_event *event;
struct perf_event_attr attr = {
@@ -173,6 +173,10 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
.exclude_kernel = exclude_kernel,
.config = config,
};
+ if (in_tx)
+ attr.config |= HSW_IN_TX;
+ if (in_tx_cp)
+ attr.config |= HSW_IN_TX_CHECKPOINTED;
attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);
@@ -226,7 +230,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
ARCH_PERFMON_EVENTSEL_INV |
- ARCH_PERFMON_EVENTSEL_CMASK))) {
+ ARCH_PERFMON_EVENTSEL_CMASK |
+ HSW_IN_TX |
+ HSW_IN_TX_CHECKPOINTED))) {
config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
unit_mask);
if (config != PERF_COUNT_HW_MAX)
@@ -239,7 +245,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
reprogram_counter(pmc, type, config,
!(eventsel & ARCH_PERFMON_EVENTSEL_USR),
!(eventsel & ARCH_PERFMON_EVENTSEL_OS),
- eventsel & ARCH_PERFMON_EVENTSEL_INT);
+ eventsel & ARCH_PERFMON_EVENTSEL_INT,
+ (eventsel & HSW_IN_TX),
+ (eventsel & HSW_IN_TX_CHECKPOINTED));
}
static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
@@ -256,7 +264,7 @@ static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
arch_events[fixed_pmc_events[idx]].event_type,
!(en & 0x2), /* exclude user */
!(en & 0x1), /* exclude kernel */
- pmi);
+ pmi, false, false);
}
static inline u8 fixed_en_pmi(u64 ctrl, int idx)
@@ -408,7 +416,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
} else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
if (data == pmc->eventsel)
return 0;
- if (!(data & 0xffffffff00200000ull)) {
+ if (!(data & pmu->reserved_bits)) {
reprogram_gp_counter(pmc, data);
return 0;
}
@@ -450,6 +458,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
pmu->counter_bitmask[KVM_PMC_GP] = 0;
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
pmu->version = 0;
+ pmu->reserved_bits = 0xffffffff00200000ull;
entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
if (!entry)
@@ -478,6 +487,12 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
(((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
pmu->global_ctrl_mask = ~pmu->global_ctrl;
+
+ entry = kvm_find_cpuid_entry(vcpu, 7, 0);
+ if (entry &&
+ (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
+ (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM)))
+ pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED;
}
void kvm_pmu_init(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 064d0be67ecc..a1216de9ffda 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -373,6 +373,7 @@ struct nested_vmx {
* we must keep them pinned while L2 runs.
*/
struct page *apic_access_page;
+ u64 msr_ia32_feature_control;
};
#define POSTED_INTR_ON 0
@@ -711,10 +712,10 @@ static void nested_release_page_clean(struct page *page)
kvm_release_page_clean(page);
}
+static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
static u64 construct_eptp(unsigned long root_hpa);
static void kvm_cpu_vmxon(u64 addr);
static void kvm_cpu_vmxoff(void);
-static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -1039,12 +1040,16 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
(vmcs12->secondary_vm_exec_control & bit);
}
-static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
- struct kvm_vcpu *vcpu)
+static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
{
return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
}
+static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
+}
+
static inline bool is_exception(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2155,6 +2160,7 @@ static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
static u32 nested_vmx_misc_low, nested_vmx_misc_high;
+static u32 nested_vmx_ept_caps;
static __init void nested_vmx_setup_ctls_msrs(void)
{
/*
@@ -2190,14 +2196,17 @@ static __init void nested_vmx_setup_ctls_msrs(void)
* If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and
* 17 must be 1.
*/
+ rdmsr(MSR_IA32_VMX_EXIT_CTLS,
+ nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
/* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
+ nested_vmx_exit_ctls_high &=
#ifdef CONFIG_X86_64
- nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE;
-#else
- nested_vmx_exit_ctls_high = 0;
+ VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
- nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+ VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
+ nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+ VM_EXIT_LOAD_IA32_EFER);
/* entry controls */
rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2205,8 +2214,12 @@ static __init void nested_vmx_setup_ctls_msrs(void)
/* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */
nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
nested_vmx_entry_ctls_high &=
- VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
- nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
+#ifdef CONFIG_X86_64
+ VM_ENTRY_IA32E_MODE |
+#endif
+ VM_ENTRY_LOAD_IA32_PAT;
+ nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
+ VM_ENTRY_LOAD_IA32_EFER);
/* cpu-based controls */
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@@ -2241,6 +2254,22 @@ static __init void nested_vmx_setup_ctls_msrs(void)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_WBINVD_EXITING;
+ if (enable_ept) {
+ /* nested EPT: emulate EPT also to L1 */
+ nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
+ nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
+ VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
+ nested_vmx_ept_caps &= vmx_capability.ept;
+ /*
+ * Since invept is completely emulated we support both global
+ * and context invalidation independent of what host cpu
+ * supports
+ */
+ nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
+ VMX_EPT_EXTENT_CONTEXT_BIT;
+ } else
+ nested_vmx_ept_caps = 0;
+
/* miscellaneous data */
rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
@@ -2282,8 +2311,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
switch (msr_index) {
case MSR_IA32_FEATURE_CONTROL:
- *pdata = 0;
- break;
+ if (nested_vmx_allowed(vcpu)) {
+ *pdata = to_vmx(vcpu)->nested.msr_ia32_feature_control;
+ break;
+ }
+ return 0;
case MSR_IA32_VMX_BASIC:
/*
* This MSR reports some information about VMX support. We
@@ -2346,8 +2378,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
nested_vmx_secondary_ctls_high);
break;
case MSR_IA32_VMX_EPT_VPID_CAP:
- /* Currently, no nested ept or nested vpid */
- *pdata = 0;
+ /* Currently, no nested vpid support */
+ *pdata = nested_vmx_ept_caps;
break;
default:
return 0;
@@ -2356,14 +2388,24 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
return 1;
}
-static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
+ u32 msr_index = msr_info->index;
+ u64 data = msr_info->data;
+ bool host_initialized = msr_info->host_initiated;
+
if (!nested_vmx_allowed(vcpu))
return 0;
- if (msr_index == MSR_IA32_FEATURE_CONTROL)
- /* TODO: the right thing. */
+ if (msr_index == MSR_IA32_FEATURE_CONTROL) {
+ if (!host_initialized &&
+ to_vmx(vcpu)->nested.msr_ia32_feature_control
+ & FEATURE_CONTROL_LOCKED)
+ return 0;
+ to_vmx(vcpu)->nested.msr_ia32_feature_control = data;
return 1;
+ }
+
/*
* No need to treat VMX capability MSRs specially: If we don't handle
* them, handle_wrmsr will #GP(0), which is correct (they are readonly)
@@ -2494,7 +2536,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
/* Otherwise falls through */
default:
- if (vmx_set_vmx_msr(vcpu, msr_index, data))
+ if (vmx_set_vmx_msr(vcpu, msr_info))
break;
msr = find_msr_entry(vmx, msr_index);
if (msr) {
@@ -5297,14 +5339,27 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
return 0;
}
+ /*
+ * EPT violation happened while executing iret from NMI,
+ * "blocked by NMI" bit has to be set before next VM entry.
+ * There are errata that may cause this bit to not be set:
+ * AAK134, BY25.
+ */
+ if (exit_qualification & INTR_INFO_UNBLOCK_NMI)
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
+
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
trace_kvm_page_fault(gpa, exit_qualification);
/* It is a write fault? */
error_code = exit_qualification & (1U << 1);
+ /* It is a fetch fault? */
+ error_code |= (exit_qualification & (1U << 2)) << 2;
/* ept page table is present? */
error_code |= (exit_qualification >> 3) & 0x1;
+ vcpu->arch.exit_qualification = exit_qualification;
+
return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
}
@@ -5438,7 +5493,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE);
- if (err == EMULATE_DO_MMIO) {
+ if (err == EMULATE_USER_EXIT) {
+ ++vcpu->stat.mmio_exits;
ret = 0;
goto out;
}
@@ -5567,8 +5623,47 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
free_loaded_vmcs(&vmx->vmcs01);
}
+/*
+ * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
+ * set the success or error code of an emulated VMX instruction, as specified
+ * by Vol 2B, VMX Instruction Reference, "Conventions".
+ */
+static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
+{
+ vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
+ & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
+}
+
+static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
+{
+ vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
+ & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
+ X86_EFLAGS_SF | X86_EFLAGS_OF))
+ | X86_EFLAGS_CF);
+}
+
static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
- u32 vm_instruction_error);
+ u32 vm_instruction_error)
+{
+ if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
+ /*
+ * failValid writes the error number to the current VMCS, which
+ * can't be done there isn't a current VMCS.
+ */
+ nested_vmx_failInvalid(vcpu);
+ return;
+ }
+ vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
+ & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_SF | X86_EFLAGS_OF))
+ | X86_EFLAGS_ZF);
+ get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
+ /*
+ * We don't need to force a shadow sync because
+ * VM_INSTRUCTION_ERROR is not shadowed
+ */
+}
/*
* Emulate the VMXON instruction.
@@ -5583,6 +5678,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
struct kvm_segment cs;
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs *shadow_vmcs;
+ const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
+ | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
/* The Intel VMX Instruction Reference lists a bunch of bits that
* are prerequisite to running VMXON, most notably cr4.VMXE must be
@@ -5611,6 +5708,13 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
skip_emulated_instruction(vcpu);
return 1;
}
+
+ if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
+ != VMXON_NEEDED_FEATURES) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
if (enable_shadow_vmcs) {
shadow_vmcs = alloc_vmcs();
if (!shadow_vmcs)
@@ -5628,6 +5732,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
vmx->nested.vmxon = true;
skip_emulated_instruction(vcpu);
+ nested_vmx_succeed(vcpu);
return 1;
}
@@ -5712,6 +5817,7 @@ static int handle_vmoff(struct kvm_vcpu *vcpu)
return 1;
free_nested(to_vmx(vcpu));
skip_emulated_instruction(vcpu);
+ nested_vmx_succeed(vcpu);
return 1;
}
@@ -5768,48 +5874,6 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
return 0;
}
-/*
- * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
- * set the success or error code of an emulated VMX instruction, as specified
- * by Vol 2B, VMX Instruction Reference, "Conventions".
- */
-static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
-{
- vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
- & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
- X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
-}
-
-static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
-{
- vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
- & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
- X86_EFLAGS_SF | X86_EFLAGS_OF))
- | X86_EFLAGS_CF);
-}
-
-static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
- u32 vm_instruction_error)
-{
- if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
- /*
- * failValid writes the error number to the current VMCS, which
- * can't be done there isn't a current VMCS.
- */
- nested_vmx_failInvalid(vcpu);
- return;
- }
- vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
- & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
- X86_EFLAGS_SF | X86_EFLAGS_OF))
- | X86_EFLAGS_ZF);
- get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
- /*
- * We don't need to force a shadow sync because
- * VM_INSTRUCTION_ERROR is not shadowed
- */
-}
-
/* Emulate the VMCLEAR instruction */
static int handle_vmclear(struct kvm_vcpu *vcpu)
{
@@ -5972,8 +6036,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
unsigned long field;
u64 field_value;
struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
- unsigned long *fields = (unsigned long *)shadow_read_write_fields;
- int num_fields = max_shadow_read_write_fields;
+ const unsigned long *fields = shadow_read_write_fields;
+ const int num_fields = max_shadow_read_write_fields;
vmcs_load(shadow_vmcs);
@@ -6002,12 +6066,11 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
{
- unsigned long *fields[] = {
- (unsigned long *)shadow_read_write_fields,
- (unsigned long *)shadow_read_only_fields
+ const unsigned long *fields[] = {
+ shadow_read_write_fields,
+ shadow_read_only_fields
};
- int num_lists = ARRAY_SIZE(fields);
- int max_fields[] = {
+ const int max_fields[] = {
max_shadow_read_write_fields,
max_shadow_read_only_fields
};
@@ -6018,7 +6081,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
vmcs_load(shadow_vmcs);
- for (q = 0; q < num_lists; q++) {
+ for (q = 0; q < ARRAY_SIZE(fields); q++) {
for (i = 0; i < max_fields[q]; i++) {
field = fields[q][i];
vmcs12_read_any(&vmx->vcpu, field, &field_value);
@@ -6248,6 +6311,74 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
return 1;
}
+/* Emulate the INVEPT instruction */
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
+ u32 vmx_instruction_info, types;
+ unsigned long type;
+ gva_t gva;
+ struct x86_exception e;
+ struct {
+ u64 eptp, gpa;
+ } operand;
+ u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK;
+
+ if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
+ !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+ types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
+
+ if (!(types & (1UL << type))) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ return 1;
+ }
+
+ /* According to the Intel VMX instruction reference, the memory
+ * operand is read even if it isn't needed (e.g., for type==global)
+ */
+ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+ vmx_instruction_info, &gva))
+ return 1;
+ if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
+ sizeof(operand), &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
+
+ switch (type) {
+ case VMX_EPT_EXTENT_CONTEXT:
+ if ((operand.eptp & eptp_mask) !=
+ (nested_ept_get_cr3(vcpu) & eptp_mask))
+ break;
+ case VMX_EPT_EXTENT_GLOBAL:
+ kvm_mmu_sync_roots(vcpu);
+ kvm_mmu_flush_tlb(vcpu);
+ nested_vmx_succeed(vcpu);
+ break;
+ default:
+ BUG_ON(1);
+ break;
+ }
+
+ skip_emulated_instruction(vcpu);
+ return 1;
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -6292,6 +6423,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
[EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op,
[EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op,
+ [EXIT_REASON_INVEPT] = handle_invept,
};
static const int kvm_vmx_max_exit_handlers =
@@ -6518,6 +6650,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
+ case EXIT_REASON_INVEPT:
/*
* VMX instructions trap unconditionally. This allows L1 to
* emulate them for its L2 guest, i.e., allows 3-level nesting!
@@ -6550,7 +6683,20 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
return nested_cpu_has2(vmcs12,
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
case EXIT_REASON_EPT_VIOLATION:
+ /*
+ * L0 always deals with the EPT violation. If nested EPT is
+ * used, and the nested mmu code discovers that the address is
+ * missing in the guest EPT table (EPT12), the EPT violation
+ * will be injected with nested_ept_inject_page_fault()
+ */
+ return 0;
case EXIT_REASON_EPT_MISCONFIG:
+ /*
+ * L2 never uses directly L1's EPT, but rather L0's own EPT
+ * table (shadow on EPT) or a merged EPT table that L0 built
+ * (EPT on EPT). So any problems with the structure of the
+ * table is L0's fault.
+ */
return 0;
case EXIT_REASON_PREEMPTION_TIMER:
return vmcs12->pin_based_vm_exec_control &
@@ -6638,7 +6784,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
!(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
- get_vmcs12(vcpu), vcpu)))) {
+ get_vmcs12(vcpu))))) {
if (vmx_interrupt_allowed(vcpu)) {
vmx->soft_vnmi_blocked = 0;
} else if (vmx->vnmi_blocked_time > 1000000000LL &&
@@ -7326,6 +7472,48 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
entry->ecx |= bit(X86_FEATURE_VMX);
}
+static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
+{
+ struct vmcs12 *vmcs12;
+ nested_vmx_vmexit(vcpu);
+ vmcs12 = get_vmcs12(vcpu);
+
+ if (fault->error_code & PFERR_RSVD_MASK)
+ vmcs12->vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
+ else
+ vmcs12->vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
+ vmcs12->exit_qualification = vcpu->arch.exit_qualification;
+ vmcs12->guest_physical_address = fault->address;
+}
+
+/* Callbacks for nested_ept_init_mmu_context: */
+
+static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
+{
+ /* return the page table to be shadowed - in our case, EPT12 */
+ return get_vmcs12(vcpu)->ept_pointer;
+}
+
+static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+ int r = kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu,
+ nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT);
+
+ vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
+ vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3;
+ vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
+
+ vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
+
+ return r;
+}
+
+static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+}
+
/*
* prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
* L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -7388,7 +7576,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs12->guest_interruptibility_info);
vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
- vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags);
+ vmx_set_rflags(vcpu, vmcs12->guest_rflags);
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
vmcs12->guest_pending_dbg_exceptions);
vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
@@ -7508,15 +7696,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
- /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */
- vmcs_write32(VM_EXIT_CONTROLS,
- vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl);
- vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls |
+ /* L2->L1 exit controls are emulated - the hardware exit is to L0 so
+ * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
+ * bits are further modified by vmx_set_efer() below.
+ */
+ vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+
+ /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
+ * emulated by vmx_set_efer(), below.
+ */
+ vmcs_write32(VM_ENTRY_CONTROLS,
+ (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
+ ~VM_ENTRY_IA32E_MODE) |
(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
- if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)
+ if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) {
vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
- else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+ vcpu->arch.pat = vmcs12->guest_ia32_pat;
+ } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
@@ -7538,6 +7735,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmx_flush_tlb(vcpu);
}
+ if (nested_cpu_has_ept(vmcs12)) {
+ kvm_mmu_unload(vcpu);
+ nested_ept_init_mmu_context(vcpu);
+ }
+
if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
vcpu->arch.efer = vmcs12->guest_ia32_efer;
else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
@@ -7565,6 +7767,20 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
kvm_set_cr3(vcpu, vmcs12->guest_cr3);
kvm_mmu_reset_context(vcpu);
+ /*
+ * L1 may access the L2's PDPTR, so save them to construct vmcs12
+ */
+ if (enable_ept) {
+ vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+ vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+ vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+ vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+ __clear_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail);
+ __clear_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_dirty);
+ }
+
kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
}
@@ -7887,6 +8103,22 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs12->guest_pending_dbg_exceptions =
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+ /*
+ * In some cases (usually, nested EPT), L2 is allowed to change its
+ * own CR3 without exiting. If it has changed it, we must keep it.
+ * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
+ * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
+ *
+ * Additionally, restore L2's PDPTR to vmcs12.
+ */
+ if (enable_ept) {
+ vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3);
+ vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
+ vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
+ vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
+ vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+ }
+
vmcs12->vm_entry_controls =
(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
(vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE);
@@ -7948,6 +8180,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
+ struct kvm_segment seg;
+
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
vcpu->arch.efer = vmcs12->host_ia32_efer;
else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
@@ -7982,7 +8216,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
kvm_set_cr4(vcpu, vmcs12->host_cr4);
- /* shadow page tables on either EPT or shadow page tables */
+ if (nested_cpu_has_ept(vmcs12))
+ nested_ept_uninit_mmu_context(vcpu);
+
kvm_set_cr3(vcpu, vmcs12->host_cr3);
kvm_mmu_reset_context(vcpu);
@@ -8001,23 +8237,61 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
- vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base);
- vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base);
- vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base);
- vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector);
- vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector);
- vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector);
- vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector);
- vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector);
- vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector);
- vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector);
-
- if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT)
+
+ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
+ vcpu->arch.pat = vmcs12->host_ia32_pat;
+ }
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
vmcs12->host_ia32_perf_global_ctrl);
+ /* Set L1 segment info according to Intel SDM
+ 27.5.2 Loading Host Segment and Descriptor-Table Registers */
+ seg = (struct kvm_segment) {
+ .base = 0,
+ .limit = 0xFFFFFFFF,
+ .selector = vmcs12->host_cs_selector,
+ .type = 11,
+ .present = 1,
+ .s = 1,
+ .g = 1
+ };
+ if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
+ seg.l = 1;
+ else
+ seg.db = 1;
+ vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
+ seg = (struct kvm_segment) {
+ .base = 0,
+ .limit = 0xFFFFFFFF,
+ .type = 3,
+ .present = 1,
+ .s = 1,
+ .db = 1,
+ .g = 1
+ };
+ seg.selector = vmcs12->host_ds_selector;
+ vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
+ seg.selector = vmcs12->host_es_selector;
+ vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
+ seg.selector = vmcs12->host_ss_selector;
+ vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
+ seg.selector = vmcs12->host_fs_selector;
+ seg.base = vmcs12->host_fs_base;
+ vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
+ seg.selector = vmcs12->host_gs_selector;
+ seg.base = vmcs12->host_gs_base;
+ vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
+ seg = (struct kvm_segment) {
+ .base = vmcs12->host_tr_base,
+ .limit = 0x67,
+ .selector = vmcs12->host_tr_selector,
+ .type = 11,
+ .present = 1
+ };
+ vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
+
kvm_set_dr(vcpu, 7, 0x400);
vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d21bce505315..e5ca72a5cdb6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -682,17 +682,6 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
*/
}
- /*
- * Does the new cr3 value map to physical memory? (Note, we
- * catch an invalid cr3 even in real-mode, because it would
- * cause trouble later on when we turn on paging anyway.)
- *
- * A real CPU would silently accept an invalid cr3 and would
- * attempt to use it - with largely undefined (and often hard
- * to debug) behavior on the guest side.
- */
- if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
- return 1;
vcpu->arch.cr3 = cr3;
__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
vcpu->arch.mmu.new_cr3(vcpu);
@@ -850,7 +839,8 @@ static u32 msrs_to_save[] = {
#ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
- MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+ MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
+ MSR_IA32_FEATURE_CONTROL
};
static unsigned num_msrs_to_save;
@@ -1457,6 +1447,29 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
#endif
}
+static void kvm_gen_update_masterclock(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+ int i;
+ struct kvm_vcpu *vcpu;
+ struct kvm_arch *ka = &kvm->arch;
+
+ spin_lock(&ka->pvclock_gtod_sync_lock);
+ kvm_make_mclock_inprogress_request(kvm);
+ /* no guest entries from this point */
+ pvclock_update_vm_gtod_copy(kvm);
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+
+ /* guest entries allowed */
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
+
+ spin_unlock(&ka->pvclock_gtod_sync_lock);
+#endif
+}
+
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
unsigned long flags, this_tsc_khz;
@@ -3806,6 +3819,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
delta = user_ns.clock - now_ns;
local_irq_enable();
kvm->arch.kvmclock_offset = delta;
+ kvm_gen_update_masterclock(kvm);
break;
}
case KVM_GET_CLOCK: {
@@ -4955,6 +4969,97 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
static int complete_emulated_pio(struct kvm_vcpu *vcpu);
+static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
+ unsigned long *db)
+{
+ u32 dr6 = 0;
+ int i;
+ u32 enable, rwlen;
+
+ enable = dr7;
+ rwlen = dr7 >> 16;
+ for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
+ if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
+ dr6 |= (1 << i);
+ return dr6;
+}
+
+static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r)
+{
+ struct kvm_run *kvm_run = vcpu->run;
+
+ /*
+ * Use the "raw" value to see if TF was passed to the processor.
+ * Note that the new value of the flags has not been saved yet.
+ *
+ * This is correct even for TF set by the guest, because "the
+ * processor will not generate this exception after the instruction
+ * that sets the TF flag".
+ */
+ unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
+
+ if (unlikely(rflags & X86_EFLAGS_TF)) {
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+ kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1;
+ kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ *r = EMULATE_USER_EXIT;
+ } else {
+ vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF;
+ /*
+ * "Certain debug exceptions may clear bit 0-3. The
+ * remaining contents of the DR6 register are never
+ * cleared by the processor".
+ */
+ vcpu->arch.dr6 &= ~15;
+ vcpu->arch.dr6 |= DR6_BS;
+ kvm_queue_exception(vcpu, DB_VECTOR);
+ }
+ }
+}
+
+static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
+{
+ struct kvm_run *kvm_run = vcpu->run;
+ unsigned long eip = vcpu->arch.emulate_ctxt.eip;
+ u32 dr6 = 0;
+
+ if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
+ (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
+ dr6 = kvm_vcpu_check_hw_bp(eip, 0,
+ vcpu->arch.guest_debug_dr7,
+ vcpu->arch.eff_db);
+
+ if (dr6 != 0) {
+ kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
+ kvm_run->debug.arch.pc = kvm_rip_read(vcpu) +
+ get_segment_base(vcpu, VCPU_SREG_CS);
+
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ *r = EMULATE_USER_EXIT;
+ return true;
+ }
+ }
+
+ if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK)) {
+ dr6 = kvm_vcpu_check_hw_bp(eip, 0,
+ vcpu->arch.dr7,
+ vcpu->arch.db);
+
+ if (dr6 != 0) {
+ vcpu->arch.dr6 &= ~15;
+ vcpu->arch.dr6 |= dr6;
+ kvm_queue_exception(vcpu, DB_VECTOR);
+ *r = EMULATE_DONE;
+ return true;
+ }
+ }
+
+ return false;
+}
+
int x86_emulate_instruction(struct kvm_vcpu *vcpu,
unsigned long cr2,
int emulation_type,
@@ -4975,6 +5080,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
init_emulate_ctxt(vcpu);
+
+ /*
+ * We will reenter on the same instruction since
+ * we do not set complete_userspace_io. This does not
+ * handle watchpoints yet, those would be handled in
+ * the emulate_ops.
+ */
+ if (kvm_vcpu_check_breakpoint(vcpu, &r))
+ return r;
+
ctxt->interruptibility = 0;
ctxt->have_exception = false;
ctxt->perm_ok = false;
@@ -5031,17 +5146,18 @@ restart:
inject_emulated_exception(vcpu);
r = EMULATE_DONE;
} else if (vcpu->arch.pio.count) {
- if (!vcpu->arch.pio.in)
+ if (!vcpu->arch.pio.in) {
+ /* FIXME: return into emulator if single-stepping. */
vcpu->arch.pio.count = 0;
- else {
+ } else {
writeback = false;
vcpu->arch.complete_userspace_io = complete_emulated_pio;
}
- r = EMULATE_DO_MMIO;
+ r = EMULATE_USER_EXIT;
} else if (vcpu->mmio_needed) {
if (!vcpu->mmio_is_write)
writeback = false;
- r = EMULATE_DO_MMIO;
+ r = EMULATE_USER_EXIT;
vcpu->arch.complete_userspace_io = complete_emulated_mmio;
} else if (r == EMULATION_RESTART)
goto restart;
@@ -5050,10 +5166,12 @@ restart:
if (writeback) {
toggle_interruptibility(vcpu, ctxt->interruptibility);
- kvm_set_rflags(vcpu, ctxt->eflags);
kvm_make_request(KVM_REQ_EVENT, vcpu);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
kvm_rip_write(vcpu, ctxt->eip);
+ if (r == EMULATE_DONE)
+ kvm_vcpu_check_singlestep(vcpu, &r);
+ kvm_set_rflags(vcpu, ctxt->eflags);
} else
vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
@@ -5347,7 +5465,7 @@ static struct notifier_block pvclock_gtod_notifier = {
int kvm_arch_init(void *opaque)
{
int r;
- struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
+ struct kvm_x86_ops *ops = opaque;
if (kvm_x86_ops) {
printk(KERN_ERR "kvm: already loaded the other module\n");
@@ -5495,6 +5613,23 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
return 1;
}
+/*
+ * kvm_pv_kick_cpu_op: Kick a vcpu.
+ *
+ * @apicid - apicid of vcpu to be kicked.
+ */
+static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
+{
+ struct kvm_lapic_irq lapic_irq;
+
+ lapic_irq.shorthand = 0;
+ lapic_irq.dest_mode = 0;
+ lapic_irq.dest_id = apicid;
+
+ lapic_irq.delivery_mode = APIC_DM_REMRD;
+ kvm_irq_delivery_to_apic(kvm, 0, &lapic_irq, NULL);
+}
+
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
unsigned long nr, a0, a1, a2, a3, ret;
@@ -5528,6 +5663,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
case KVM_HC_VAPIC_POLL_IRQ:
ret = 0;
break;
+ case KVM_HC_KICK_CPU:
+ kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
+ ret = 0;
+ break;
default:
ret = -KVM_ENOSYS;
break;
@@ -5689,29 +5828,6 @@ static void process_nmi(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
-static void kvm_gen_update_masterclock(struct kvm *kvm)
-{
-#ifdef CONFIG_X86_64
- int i;
- struct kvm_vcpu *vcpu;
- struct kvm_arch *ka = &kvm->arch;
-
- spin_lock(&ka->pvclock_gtod_sync_lock);
- kvm_make_mclock_inprogress_request(kvm);
- /* no guest entries from this point */
- pvclock_update_vm_gtod_copy(kvm);
-
- kvm_for_each_vcpu(i, vcpu, kvm)
- set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
-
- /* guest entries allowed */
- kvm_for_each_vcpu(i, vcpu, kvm)
- clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
-
- spin_unlock(&ka->pvclock_gtod_sync_lock);
-#endif
-}
-
static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
{
u64 eoi_exit_bitmap[4];
@@ -5950,6 +6066,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
kvm_apic_accept_events(vcpu);
switch(vcpu->arch.mp_state) {
case KVM_MP_STATE_HALTED:
+ vcpu->arch.pv.pv_unhalted = false;
vcpu->arch.mp_state =
KVM_MP_STATE_RUNNABLE;
case KVM_MP_STATE_RUNNABLE:
@@ -6061,6 +6178,8 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
vcpu->mmio_needed = 0;
+
+ /* FIXME: return into emulator if single-stepping. */
if (vcpu->mmio_is_write)
return 1;
vcpu->mmio_read_completed = 1;
@@ -6249,7 +6368,12 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
kvm_apic_accept_events(vcpu);
- mp_state->mp_state = vcpu->arch.mp_state;
+ if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
+ vcpu->arch.pv.pv_unhalted)
+ mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
+ else
+ mp_state->mp_state = vcpu->arch.mp_state;
+
return 0;
}
@@ -6770,6 +6894,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
BUG_ON(vcpu->kvm == NULL);
kvm = vcpu->kvm;
+ vcpu->arch.pv.pv_unhalted = false;
vcpu->arch.emulate_ctxt.ops = &emulate_ops;
if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -7019,6 +7144,15 @@ out_free:
return -ENOMEM;
}
+void kvm_arch_memslots_updated(struct kvm *kvm)
+{
+ /*
+ * memslots->generation has been incremented.
+ * mmio generation may have reached its maximum value.
+ */
+ kvm_mmu_invalidate_mmio_sptes(kvm);
+}
+
int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *memslot,
struct kvm_userspace_memory_region *mem,
@@ -7079,11 +7213,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
*/
if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
kvm_mmu_slot_remove_write_access(kvm, mem->slot);
- /*
- * If memory slot is created, or moved, we need to clear all
- * mmio sptes.
- */
- kvm_mmu_invalidate_mmio_sptes(kvm);
}
void kvm_arch_flush_shadow_all(struct kvm *kvm)
@@ -7103,6 +7232,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
!vcpu->arch.apf.halted)
|| !list_empty_careful(&vcpu->async_pf.done)
|| kvm_apic_has_events(vcpu)
+ || vcpu->arch.pv.pv_unhalted
|| atomic_read(&vcpu->arch.nmi_queued) ||
(kvm_arch_interrupt_allowed(vcpu) &&
kvm_cpu_has_interrupt(vcpu));
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 6a22c19da663..bdf8532494fe 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -7,8 +7,7 @@
* kernel and insert a module (lg.ko) which allows us to run other Linux
* kernels the same way we'd run processes. We call the first kernel the Host,
* and the others the Guests. The program which sets up and configures Guests
- * (such as the example in Documentation/virtual/lguest/lguest.c) is called the
- * Launcher.
+ * (such as the example in tools/lguest/lguest.c) is called the Launcher.
*
* Secondly, we only run specially modified Guests, not normal kernels: setting
* CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows
@@ -1057,6 +1056,12 @@ static void lguest_load_sp0(struct tss_struct *tss,
}
/* Let's just say, I wouldn't do debugging under a Guest. */
+static unsigned long lguest_get_debugreg(int regno)
+{
+ /* FIXME: Implement */
+ return 0;
+}
+
static void lguest_set_debugreg(int regno, unsigned long value)
{
/* FIXME: Implement */
@@ -1304,6 +1309,7 @@ __init void lguest_init(void)
pv_cpu_ops.load_tr_desc = lguest_load_tr_desc;
pv_cpu_ops.set_ldt = lguest_set_ldt;
pv_cpu_ops.load_tls = lguest_load_tls;
+ pv_cpu_ops.get_debugreg = lguest_get_debugreg;
pv_cpu_ops.set_debugreg = lguest_set_debugreg;
pv_cpu_ops.clts = lguest_clts;
pv_cpu_ops.read_cr0 = lguest_read_cr0;
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
index 25b7ae8d058a..7609e0e421ec 100644
--- a/arch/x86/lib/csum-wrappers_64.c
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -6,6 +6,7 @@
*/
#include <asm/checksum.h>
#include <linux/module.h>
+#include <asm/smap.h>
/**
* csum_partial_copy_from_user - Copy and checksum from user space.
@@ -52,8 +53,10 @@ csum_partial_copy_from_user(const void __user *src, void *dst,
len -= 2;
}
}
+ stac();
isum = csum_partial_copy_generic((__force const void *)src,
dst, len, isum, errp, NULL);
+ clac();
if (unlikely(*errp))
goto out_err;
@@ -82,6 +85,8 @@ __wsum
csum_partial_copy_to_user(const void *src, void __user *dst,
int len, __wsum isum, int *errp)
{
+ __wsum ret;
+
might_sleep();
if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) {
@@ -105,8 +110,11 @@ csum_partial_copy_to_user(const void *src, void __user *dst,
}
*errp = 0;
- return csum_partial_copy_generic(src, (void __force *)dst,
- len, isum, NULL, errp);
+ stac();
+ ret = csum_partial_copy_generic(src, (void __force *)dst,
+ len, isum, NULL, errp);
+ clac();
+ return ret;
}
EXPORT_SYMBOL(csum_partial_copy_to_user);
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 5d7e51f3fd28..533a85e3a07e 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -1,10 +1,8 @@
# x86 Opcode Maps
#
# This is (mostly) based on following documentations.
-# - Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2
-# (#325383-040US, October 2011)
-# - Intel(R) Advanced Vector Extensions Programming Reference
-# (#319433-011,JUNE 2011).
+# - Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2C
+# (#326018-047US, June 2013)
#
#<Opcode maps>
# Table: table-name
@@ -29,6 +27,7 @@
# - (F3): the last prefix is 0xF3
# - (F2): the last prefix is 0xF2
# - (!F3) : the last prefix is not 0xF3 (including non-last prefix case)
+# - (66&F2): Both 0x66 and 0xF2 prefixes are specified.
Table: one byte opcode
Referrer:
@@ -246,8 +245,8 @@ c2: RETN Iw (f64)
c3: RETN
c4: LES Gz,Mp (i64) | VEX+2byte (Prefix)
c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix)
-c6: Grp11 Eb,Ib (1A)
-c7: Grp11 Ev,Iz (1A)
+c6: Grp11A Eb,Ib (1A)
+c7: Grp11B Ev,Iz (1A)
c8: ENTER Iw,Ib
c9: LEAVE (d64)
ca: RETF Iw
@@ -293,8 +292,8 @@ ef: OUT DX,eAX
# 0xf0 - 0xff
f0: LOCK (Prefix)
f1:
-f2: REPNE (Prefix)
-f3: REP/REPE (Prefix)
+f2: REPNE (Prefix) | XACQUIRE (Prefix)
+f3: REP/REPE (Prefix) | XRELEASE (Prefix)
f4: HLT
f5: CMC
f6: Grp3_1 Eb (1A)
@@ -326,7 +325,8 @@ AVXcode: 1
0a:
0b: UD2 (1B)
0c:
-0d: NOP Ev | GrpP
+# AMD's prefetch group. Intel supports prefetchw(/1) only.
+0d: GrpP
0e: FEMMS
# 3DNow! uses the last imm byte as opcode extension.
0f: 3DNow! Pq,Qq,Ib
@@ -729,12 +729,12 @@ dc: VAESENC Vdq,Hdq,Wdq (66),(v1)
dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1)
de: VAESDEC Vdq,Hdq,Wdq (66),(v1)
df: VAESDECLAST Vdq,Hdq,Wdq (66),(v1)
-f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2)
-f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2)
+f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) | CRC32 Gd,Eb (66&F2)
+f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) | CRC32 Gd,Ew (66&F2)
f2: ANDN Gy,By,Ey (v)
f3: Grp17 (1A)
f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v)
-f6: MULX By,Gy,rDX,Ey (F2),(v)
+f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v)
f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v)
EndTable
@@ -861,8 +861,8 @@ EndTable
GrpTable: Grp7
0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B)
-1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001)
-2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B)
+1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B)
+2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B)
3: LIDT Ms
4: SMSW Mw/Rv
5:
@@ -880,15 +880,21 @@ EndTable
GrpTable: Grp9
1: CMPXCHG8B/16B Mq/Mdq
6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B)
-7: VMPTRST Mq | VMPTRST Mq (F3)
+7: VMPTRST Mq | VMPTRST Mq (F3) | RDSEED Rv (11B)
EndTable
GrpTable: Grp10
EndTable
-GrpTable: Grp11
-# Note: the operands are given by group opcode
-0: MOV
+# Grp11A and Grp11B are expressed as Grp11 in Intel SDM
+GrpTable: Grp11A
+0: MOV Eb,Ib
+7: XABORT Ib (000),(11B)
+EndTable
+
+GrpTable: Grp11B
+0: MOV Eb,Iz
+7: XBEGIN Jz (000),(11B)
EndTable
GrpTable: Grp12
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 654be4ae3047..3aaeffcfd67a 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -842,23 +842,15 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
force_sig_info_fault(SIGBUS, code, address, tsk, fault);
}
-static noinline int
+static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
unsigned long address, unsigned int fault)
{
- /*
- * Pagefault was interrupted by SIGKILL. We have no reason to
- * continue pagefault.
- */
- if (fatal_signal_pending(current)) {
- if (!(fault & VM_FAULT_RETRY))
- up_read(&current->mm->mmap_sem);
- if (!(error_code & PF_USER))
- no_context(regs, error_code, address, 0, 0);
- return 1;
+ if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
+ up_read(&current->mm->mmap_sem);
+ no_context(regs, error_code, address, 0, 0);
+ return;
}
- if (!(fault & VM_FAULT_ERROR))
- return 0;
if (fault & VM_FAULT_OOM) {
/* Kernel mode? Handle exceptions or die: */
@@ -866,7 +858,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
up_read(&current->mm->mmap_sem);
no_context(regs, error_code, address,
SIGSEGV, SEGV_MAPERR);
- return 1;
+ return;
}
up_read(&current->mm->mmap_sem);
@@ -884,7 +876,6 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
else
BUG();
}
- return 1;
}
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
@@ -1011,9 +1002,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
unsigned long address;
struct mm_struct *mm;
int fault;
- int write = error_code & PF_WRITE;
- unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
- (write ? FAULT_FLAG_WRITE : 0);
+ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
tsk = current;
mm = tsk->mm;
@@ -1083,6 +1072,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
if (user_mode_vm(regs)) {
local_irq_enable();
error_code |= PF_USER;
+ flags |= FAULT_FLAG_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
local_irq_enable();
@@ -1109,6 +1099,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
return;
}
+ if (error_code & PF_WRITE)
+ flags |= FAULT_FLAG_WRITE;
+
/*
* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in
@@ -1187,9 +1180,17 @@ good_area:
*/
fault = handle_mm_fault(mm, vma, address, flags);
- if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
- if (mm_fault_error(regs, error_code, address, fault))
- return;
+ /*
+ * If we need to retry but a fatal signal is pending, handle the
+ * signal first. We do not need to release the mmap_sem because it
+ * would already be released in __lock_page_or_retry in mm/filemap.c.
+ */
+ if (unlikely((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)))
+ return;
+
+ if (unlikely(fault & VM_FAULT_ERROR)) {
+ mm_fault_error(regs, error_code, address, fault);
+ return;
}
/*
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 7e73e8c69096..9d980d88b747 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -59,6 +59,10 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
return NULL;
}
+int pmd_huge_support(void)
+{
+ return 0;
+}
#else
struct page *
@@ -77,6 +81,10 @@ int pud_huge(pud_t pud)
return !!(pud_val(pud) & _PAGE_PSE);
}
+int pmd_huge_support(void)
+{
+ return 1;
+}
#endif
/* x86_64 also uses this file */
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 2ec29ac78ae6..04664cdb7fda 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -78,8 +78,8 @@ __ref void *alloc_low_pages(unsigned int num)
return __va(pfn << PAGE_SHIFT);
}
-/* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */
-#define INIT_PGT_BUF_SIZE (5 * PAGE_SIZE)
+/* need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS */
+#define INIT_PGT_BUF_SIZE (6 * PAGE_SIZE)
RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
void __init early_alloc_pgt_buf(void)
{
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 0215e2c563ef..799580cabc78 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -487,7 +487,7 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
unsigned long offset;
resource_size_t last_addr;
unsigned int nrpages;
- enum fixed_addresses idx0, idx;
+ enum fixed_addresses idx;
int i, slot;
WARN_ON(system_state != SYSTEM_BOOTING);
@@ -540,8 +540,7 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
/*
* Ok, go for it..
*/
- idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
- idx = idx0;
+ idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
while (nrpages > 0) {
early_set_fixmap(idx, phys_addr, prot);
phys_addr += PAGE_SIZE;
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 62c29a5bfe26..25e7e1372bb2 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -112,11 +112,13 @@ static unsigned long mmap_legacy_base(void)
*/
void arch_pick_mmap_layout(struct mm_struct *mm)
{
+ mm->mmap_legacy_base = mmap_legacy_base();
+ mm->mmap_base = mmap_base();
+
if (mmap_is_legacy()) {
- mm->mmap_base = mmap_legacy_base();
+ mm->mmap_base = mm->mmap_legacy_base;
mm->get_unmapped_area = arch_get_unmapped_area;
} else {
- mm->mmap_base = mmap_base();
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
}
}
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index cdd0da9dd530..266ca912f62e 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -146,6 +146,7 @@ int __init
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
{
u64 start, end;
+ u32 hotpluggable;
int node, pxm;
if (srat_disabled())
@@ -154,7 +155,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
goto out_err_bad_srat;
if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
goto out_err;
- if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
+ hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE;
+ if (hotpluggable && !save_add_info())
goto out_err;
start = ma->base_address;
@@ -174,9 +176,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
node_set(node, numa_nodes_parsed);
- printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n",
- node, pxm,
- (unsigned long long) start, (unsigned long long) end - 1);
+ pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s\n",
+ node, pxm,
+ (unsigned long long) start, (unsigned long long) end - 1,
+ hotpluggable ? " hotplug" : "");
return 0;
out_err_bad_srat:
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 282375f13c7e..ae699b3bbac8 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -103,6 +103,7 @@ static void flush_tlb_func(void *info)
if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
return;
+ count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
if (f->flush_end == TLB_FLUSH_ALL)
local_flush_tlb();
@@ -130,6 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
info.flush_start = start;
info.flush_end = end;
+ count_vm_event(NR_TLB_REMOTE_FLUSH);
if (is_uv_system()) {
unsigned int cpu;
@@ -149,6 +151,7 @@ void flush_tlb_current_task(void)
preempt_disable();
+ count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
local_flush_tlb();
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
@@ -211,16 +214,19 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm;
/* tlb_flushall_shift is on balance point, details in commit log */
- if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift)
+ if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) {
+ count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
local_flush_tlb();
- else {
+ } else {
if (has_large_page(mm, start, end)) {
local_flush_tlb();
goto flush_all;
}
/* flush range by one by one 'invlpg' */
- for (addr = start; addr < end; addr += PAGE_SIZE)
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+ count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
__flush_tlb_single(addr);
+ }
if (cpumask_any_but(mm_cpumask(mm),
smp_processor_id()) < nr_cpu_ids)
@@ -256,6 +262,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
static void do_flush_tlb_all(void *info)
{
+ count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
__flush_tlb_all();
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
leave_mm(smp_processor_id());
@@ -263,6 +270,7 @@ static void do_flush_tlb_all(void *info)
void flush_tlb_all(void)
{
+ count_vm_event(NR_TLB_REMOTE_FLUSH);
on_each_cpu(do_flush_tlb_all, NULL, 1);
}
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 48768df2471a..6890d8498e0b 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -403,7 +403,7 @@ static void nmi_cpu_down(void *dummy)
nmi_cpu_shutdown(dummy);
}
-static int nmi_create_files(struct super_block *sb, struct dentry *root)
+static int nmi_create_files(struct dentry *root)
{
unsigned int i;
@@ -420,14 +420,14 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root)
continue;
snprintf(buf, sizeof(buf), "%d", i);
- dir = oprofilefs_mkdir(sb, root, buf);
- oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled);
- oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event);
- oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count);
- oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask);
- oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel);
- oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user);
- oprofilefs_create_ulong(sb, dir, "extra", &counter_config[i].extra);
+ dir = oprofilefs_mkdir(root, buf);
+ oprofilefs_create_ulong(dir, "enabled", &counter_config[i].enabled);
+ oprofilefs_create_ulong(dir, "event", &counter_config[i].event);
+ oprofilefs_create_ulong(dir, "count", &counter_config[i].count);
+ oprofilefs_create_ulong(dir, "unit_mask", &counter_config[i].unit_mask);
+ oprofilefs_create_ulong(dir, "kernel", &counter_config[i].kernel);
+ oprofilefs_create_ulong(dir, "user", &counter_config[i].user);
+ oprofilefs_create_ulong(dir, "extra", &counter_config[i].extra);
}
return 0;
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index b2b94438ff05..50d86c0e9ba4 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -454,16 +454,16 @@ static void init_ibs(void)
printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps);
}
-static int (*create_arch_files)(struct super_block *sb, struct dentry *root);
+static int (*create_arch_files)(struct dentry *root);
-static int setup_ibs_files(struct super_block *sb, struct dentry *root)
+static int setup_ibs_files(struct dentry *root)
{
struct dentry *dir;
int ret = 0;
/* architecture specific files */
if (create_arch_files)
- ret = create_arch_files(sb, root);
+ ret = create_arch_files(root);
if (ret)
return ret;
@@ -479,26 +479,26 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
ibs_config.max_cnt_op = 250000;
if (ibs_caps & IBS_CAPS_FETCHSAM) {
- dir = oprofilefs_mkdir(sb, root, "ibs_fetch");
- oprofilefs_create_ulong(sb, dir, "enable",
+ dir = oprofilefs_mkdir(root, "ibs_fetch");
+ oprofilefs_create_ulong(dir, "enable",
&ibs_config.fetch_enabled);
- oprofilefs_create_ulong(sb, dir, "max_count",
+ oprofilefs_create_ulong(dir, "max_count",
&ibs_config.max_cnt_fetch);
- oprofilefs_create_ulong(sb, dir, "rand_enable",
+ oprofilefs_create_ulong(dir, "rand_enable",
&ibs_config.rand_en);
}
if (ibs_caps & IBS_CAPS_OPSAM) {
- dir = oprofilefs_mkdir(sb, root, "ibs_op");
- oprofilefs_create_ulong(sb, dir, "enable",
+ dir = oprofilefs_mkdir(root, "ibs_op");
+ oprofilefs_create_ulong(dir, "enable",
&ibs_config.op_enabled);
- oprofilefs_create_ulong(sb, dir, "max_count",
+ oprofilefs_create_ulong(dir, "max_count",
&ibs_config.max_cnt_op);
if (ibs_caps & IBS_CAPS_OPCNT)
- oprofilefs_create_ulong(sb, dir, "dispatched_ops",
+ oprofilefs_create_ulong(dir, "dispatched_ops",
&ibs_config.dispatched_ops);
if (ibs_caps & IBS_CAPS_BRNTRGT)
- oprofilefs_create_ulong(sb, dir, "branch_target",
+ oprofilefs_create_ulong(dir, "branch_target",
&ibs_config.branch_target);
}
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index d641897a1f4e..b30e937689d6 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -568,13 +568,8 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
*/
if (bus) {
struct pci_bus *child;
- list_for_each_entry(child, &bus->children, node) {
- struct pci_dev *self = child->self;
- if (!self)
- continue;
-
- pcie_bus_configure_settings(child, self->pcie_mpss);
- }
+ list_for_each_entry(child, &bus->children, node)
+ pcie_bus_configure_settings(child);
}
if (bus && node != -1) {
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 94919e307f8e..db6b1ab43255 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -210,6 +210,8 @@ static void pcibios_allocate_bridge_resources(struct pci_dev *dev)
r = &dev->resource[idx];
if (!r->flags)
continue;
+ if (r->parent) /* Already allocated */
+ continue;
if (!r->start || pci_claim_resource(dev, idx) < 0) {
/*
* Something is wrong with the region.
@@ -318,6 +320,8 @@ static void pcibios_allocate_dev_rom_resource(struct pci_dev *dev)
r = &dev->resource[PCI_ROM_RESOURCE];
if (!r->flags || !r->start)
return;
+ if (r->parent) /* Already allocated */
+ return;
if (pci_claim_resource(dev, PCI_ROM_RESOURCE) < 0) {
r->end -= r->start;
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 082e88129712..5596c7bdd327 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -700,7 +700,7 @@ int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
if (!(pci_probe & PCI_PROBE_MMCONF) || pci_mmcfg_arch_init_failed)
return -ENODEV;
- if (start > end)
+ if (start > end || !addr)
return -EINVAL;
mutex_lock(&pci_mmcfg_lock);
@@ -716,11 +716,6 @@ int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
return -EEXIST;
}
- if (!addr) {
- mutex_unlock(&pci_mmcfg_lock);
- return -EINVAL;
- }
-
rc = -EBUSY;
cfg = pci_mmconfig_alloc(seg, start, end, addr);
if (cfg == NULL) {
diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c
index 6eb18c42a28a..903fded50786 100644
--- a/arch/x86/pci/mrst.c
+++ b/arch/x86/pci/mrst.c
@@ -23,11 +23,11 @@
#include <linux/ioport.h>
#include <linux/init.h>
#include <linux/dmi.h>
+#include <linux/acpi.h>
+#include <linux/io.h>
+#include <linux/smp.h>
-#include <asm/acpi.h>
#include <asm/segment.h>
-#include <asm/io.h>
-#include <asm/smp.h>
#include <asm/pci_x86.h>
#include <asm/hw_irq.h>
#include <asm/io_apic.h>
@@ -43,7 +43,7 @@
#define PCI_FIXED_BAR_4_SIZE 0x14
#define PCI_FIXED_BAR_5_SIZE 0x1c
-static int pci_soc_mode = 0;
+static int pci_soc_mode;
/**
* fixed_bar_cap - return the offset of the fixed BAR cap if found
@@ -141,7 +141,8 @@ static int pci_device_update_fixed(struct pci_bus *bus, unsigned int devfn,
*/
static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg)
{
- /* This is a workaround for A0 LNC bug where PCI status register does
+ /*
+ * This is a workaround for A0 LNC bug where PCI status register does
* not have new CAP bit set. can not be written by SW either.
*
* PCI header type in real LNC indicates a single function device, this
@@ -154,7 +155,7 @@ static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg)
|| devfn == PCI_DEVFN(0, 0)
|| devfn == PCI_DEVFN(3, 0)))
return 1;
- return 0; /* langwell on others */
+ return 0; /* Langwell on others */
}
static int pci_read(struct pci_bus *bus, unsigned int devfn, int where,
@@ -172,7 +173,8 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where,
{
int offset;
- /* On MRST, there is no PCI ROM BAR, this will cause a subsequent read
+ /*
+ * On MRST, there is no PCI ROM BAR, this will cause a subsequent read
* to ROM BAR return 0 then being ignored.
*/
if (where == PCI_ROM_ADDRESS)
@@ -210,7 +212,8 @@ static int mrst_pci_irq_enable(struct pci_dev *dev)
pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
- /* MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to
+ /*
+ * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to
* IOAPIC RTE entries, so we just enable RTE for the device.
*/
irq_attr.ioapic = mp_find_ioapic(dev->irq);
@@ -235,7 +238,7 @@ struct pci_ops pci_mrst_ops = {
*/
int __init pci_mrst_init(void)
{
- printk(KERN_INFO "Intel MID platform detected, using MID PCI ops\n");
+ pr_info("Intel MID platform detected, using MID PCI ops\n");
pci_mmcfg_late_init();
pcibios_enable_irq = mrst_pci_irq_enable;
pci_root_ops = pci_mrst_ops;
@@ -244,17 +247,21 @@ int __init pci_mrst_init(void)
return 1;
}
-/* Langwell devices are not true pci devices, they are not subject to 10 ms
- * d3 to d0 delay required by pci spec.
+/*
+ * Langwell devices are not true PCI devices; they are not subject to 10 ms
+ * d3 to d0 delay required by PCI spec.
*/
static void pci_d3delay_fixup(struct pci_dev *dev)
{
- /* PCI fixups are effectively decided compile time. If we have a dual
- SoC/non-SoC kernel we don't want to mangle d3 on non SoC devices */
- if (!pci_soc_mode)
- return;
- /* true pci devices in lincroft should allow type 1 access, the rest
- * are langwell fake pci devices.
+ /*
+ * PCI fixups are effectively decided compile time. If we have a dual
+ * SoC/non-SoC kernel we don't want to mangle d3 on non-SoC devices.
+ */
+ if (!pci_soc_mode)
+ return;
+ /*
+ * True PCI devices in Lincroft should allow type 1 access, the rest
+ * are Langwell fake PCI devices.
*/
if (type1_access_ok(dev->bus->number, dev->devfn, PCI_DEVICE_ID))
return;
diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index 47fe66fe61f1..3ca5957b7a34 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -20,7 +20,7 @@
#include <linux/intel_pmic_gpio.h>
#include <linux/spi/spi.h>
#include <linux/i2c.h>
-#include <linux/i2c/pca953x.h>
+#include <linux/platform_data/pca953x.h>
#include <linux/gpio_keys.h>
#include <linux/input.h>
#include <linux/platform_device.h>
diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk
index e6773dc8ac41..093a892026f9 100644
--- a/arch/x86/tools/gen-insn-attr-x86.awk
+++ b/arch/x86/tools/gen-insn-attr-x86.awk
@@ -68,7 +68,7 @@ BEGIN {
lprefix1_expr = "\\((66|!F3)\\)"
lprefix2_expr = "\\(F3\\)"
- lprefix3_expr = "\\((F2|!F3)\\)"
+ lprefix3_expr = "\\((F2|!F3|66\\&F2)\\)"
lprefix_expr = "\\((66|F2|F3)\\)"
max_lprefix = 4
@@ -83,6 +83,8 @@ BEGIN {
prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ"
prefix_num["REPNE"] = "INAT_PFX_REPNE"
prefix_num["REP/REPE"] = "INAT_PFX_REPE"
+ prefix_num["XACQUIRE"] = "INAT_PFX_REPNE"
+ prefix_num["XRELEASE"] = "INAT_PFX_REPE"
prefix_num["LOCK"] = "INAT_PFX_LOCK"
prefix_num["SEG=CS"] = "INAT_PFX_CS"
prefix_num["SEG=DS"] = "INAT_PFX_DS"
diff --git a/arch/x86/um/os-Linux/prctl.c b/arch/x86/um/os-Linux/prctl.c
index 9d34eddb517f..96eb2bd28832 100644
--- a/arch/x86/um/os-Linux/prctl.c
+++ b/arch/x86/um/os-Linux/prctl.c
@@ -4,7 +4,7 @@
*/
#include <sys/ptrace.h>
-#include <linux/ptrace.h>
+#include <asm/ptrace.h>
int os_arch_prctl(int pid, int code, unsigned long *addr)
{
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index c74436e687bf..72074d528400 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -85,15 +85,18 @@ static notrace cycle_t vread_pvclock(int *mode)
cycle_t ret;
u64 last;
u32 version;
- u32 migrate_count;
u8 flags;
unsigned cpu, cpu1;
/*
- * When looping to get a consistent (time-info, tsc) pair, we
- * also need to deal with the possibility we can switch vcpus,
- * so make sure we always re-fetch time-info for the current vcpu.
+ * Note: hypervisor must guarantee that:
+ * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
+ * 2. that per-CPU pvclock time info is updated if the
+ * underlying CPU changes.
+ * 3. that version is increased whenever underlying CPU
+ * changes.
+ *
*/
do {
cpu = __getcpu() & VGETCPU_CPU_MASK;
@@ -104,8 +107,6 @@ static notrace cycle_t vread_pvclock(int *mode)
pvti = get_pvti(cpu);
- migrate_count = pvti->migrate_count;
-
version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
/*
@@ -117,8 +118,7 @@ static notrace cycle_t vread_pvclock(int *mode)
cpu1 = __getcpu() & VGETCPU_CPU_MASK;
} while (unlikely(cpu != cpu1 ||
(pvti->pvti.version & 1) ||
- pvti->pvti.version != version ||
- pvti->migrate_count != migrate_count));
+ pvti->pvti.version != version));
if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
*mode = VCLOCK_NONE;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 193097ef3d7d..fa6ade76ef3f 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -427,8 +427,7 @@ static void __init xen_init_cpuid_mask(void)
if (!xen_initial_domain())
cpuid_leaf1_edx_mask &=
- ~((1 << X86_FEATURE_APIC) | /* disable local APIC */
- (1 << X86_FEATURE_ACPI)); /* disable ACPI */
+ ~((1 << X86_FEATURE_ACPI)); /* disable ACPI */
cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_X2APIC % 32));
@@ -735,8 +734,7 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
addr = (unsigned long)xen_int3;
else if (addr == (unsigned long)stack_segment)
addr = (unsigned long)xen_stack_segment;
- else if (addr == (unsigned long)double_fault ||
- addr == (unsigned long)nmi) {
+ else if (addr == (unsigned long)double_fault) {
/* Don't need to handle these */
return 0;
#ifdef CONFIG_X86_MCE
@@ -747,7 +745,12 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
*/
;
#endif
- } else {
+ } else if (addr == (unsigned long)nmi)
+ /*
+ * Use the native version as well.
+ */
+ ;
+ else {
/* Some other trap using IST? */
if (WARN_ON(val->ist != 0))
return 0;
@@ -1689,7 +1692,6 @@ static int xen_hvm_cpu_notify(struct notifier_block *self, unsigned long action,
case CPU_UP_PREPARE:
xen_vcpu_setup(cpu);
if (xen_have_vector_callback) {
- xen_init_lock_cpu(cpu);
if (xen_feature(XENFEAT_hvm_safe_pvclock))
xen_setup_timer(cpu);
}
@@ -1710,6 +1712,8 @@ static void __init xen_hvm_guest_init(void)
xen_hvm_init_shared_info();
+ xen_panic_handler_init();
+
if (xen_feature(XENFEAT_hvm_callback_vector))
xen_have_vector_callback = 1;
xen_hvm_smp_init();
@@ -1720,15 +1724,12 @@ static void __init xen_hvm_guest_init(void)
xen_hvm_init_mmu_ops();
}
-static bool __init xen_hvm_platform(void)
+static uint32_t __init xen_hvm_platform(void)
{
if (xen_pv_domain())
- return false;
-
- if (!xen_cpuid_base())
- return false;
+ return 0;
- return true;
+ return xen_cpuid_base();
}
bool xen_hvm_need_lapic(void)
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 01a4dc015ae1..0da7f863056f 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -47,23 +47,18 @@ static void xen_restore_fl(unsigned long flags)
/* convert from IF type flag */
flags = !(flags & X86_EFLAGS_IF);
- /* There's a one instruction preempt window here. We need to
- make sure we're don't switch CPUs between getting the vcpu
- pointer and updating the mask. */
+ /* See xen_irq_enable() for why preemption must be disabled. */
preempt_disable();
vcpu = this_cpu_read(xen_vcpu);
vcpu->evtchn_upcall_mask = flags;
- preempt_enable_no_resched();
-
- /* Doesn't matter if we get preempted here, because any
- pending event will get dealt with anyway. */
if (flags == 0) {
- preempt_check_resched();
barrier(); /* unmask then check (avoid races) */
if (unlikely(vcpu->evtchn_upcall_pending))
xen_force_evtchn_callback();
- }
+ preempt_enable();
+ } else
+ preempt_enable_no_resched();
}
PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
@@ -82,10 +77,12 @@ static void xen_irq_enable(void)
{
struct vcpu_info *vcpu;
- /* We don't need to worry about being preempted here, since
- either a) interrupts are disabled, so no preemption, or b)
- the caller is confused and is trying to re-enable interrupts
- on an indeterminate processor. */
+ /*
+ * We may be preempted as soon as vcpu->evtchn_upcall_mask is
+ * cleared, so disable preemption to ensure we check for
+ * events on the VCPU we are still running on.
+ */
+ preempt_disable();
vcpu = this_cpu_read(xen_vcpu);
vcpu->evtchn_upcall_mask = 0;
@@ -96,6 +93,8 @@ static void xen_irq_enable(void)
barrier(); /* unmask then check (avoid races) */
if (unlikely(vcpu->evtchn_upcall_pending))
xen_force_evtchn_callback();
+
+ preempt_enable();
}
PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable);
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 95fb2aa5927e..8b901e8d782d 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -161,6 +161,7 @@
#include <asm/xen/page.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
+#include <xen/balloon.h>
#include <xen/grant_table.h>
#include "multicalls.h"
@@ -967,7 +968,10 @@ int m2p_remove_override(struct page *page,
if (kmap_op != NULL) {
if (!PageHighMem(page)) {
struct multicall_space mcs;
- struct gnttab_unmap_grant_ref *unmap_op;
+ struct gnttab_unmap_and_replace *unmap_op;
+ struct page *scratch_page = get_balloon_scratch_page();
+ unsigned long scratch_page_address = (unsigned long)
+ __va(page_to_pfn(scratch_page) << PAGE_SHIFT);
/*
* It might be that we queued all the m2p grant table
@@ -986,25 +990,31 @@ int m2p_remove_override(struct page *page,
printk(KERN_WARNING "m2p_remove_override: "
"pfn %lx mfn %lx, failed to modify kernel mappings",
pfn, mfn);
+ put_balloon_scratch_page();
return -1;
}
- mcs = xen_mc_entry(
- sizeof(struct gnttab_unmap_grant_ref));
+ xen_mc_batch();
+
+ mcs = __xen_mc_entry(
+ sizeof(struct gnttab_unmap_and_replace));
unmap_op = mcs.args;
unmap_op->host_addr = kmap_op->host_addr;
+ unmap_op->new_addr = scratch_page_address;
unmap_op->handle = kmap_op->handle;
- unmap_op->dev_bus_addr = 0;
MULTI_grant_table_op(mcs.mc,
- GNTTABOP_unmap_grant_ref, unmap_op, 1);
+ GNTTABOP_unmap_and_replace, unmap_op, 1);
+
+ mcs = __xen_mc_entry(0);
+ MULTI_update_va_mapping(mcs.mc, scratch_page_address,
+ pfn_pte(page_to_pfn(scratch_page),
+ PAGE_KERNEL_RO), 0);
xen_mc_issue(PARAVIRT_LAZY_MMU);
- set_pte_at(&init_mm, address, ptep,
- pfn_pte(pfn, PAGE_KERNEL));
- __flush_tlb_single(address);
kmap_op->host_addr = 0;
+ put_balloon_scratch_page();
}
}
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 056d11faef21..09f3059cb00b 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -33,6 +33,9 @@
/* These are code, but not functions. Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
+#ifdef CONFIG_X86_64
+extern const char nmi[];
+#endif
extern void xen_sysenter_target(void);
extern void xen_syscall_target(void);
extern void xen_syscall32_target(void);
@@ -215,13 +218,19 @@ static void __init xen_set_identity_and_release_chunk(
unsigned long pfn;
/*
- * If the PFNs are currently mapped, the VA mapping also needs
- * to be updated to be 1:1.
+ * If the PFNs are currently mapped, clear the mappings
+ * (except for the ISA region which must be 1:1 mapped) to
+ * release the refcounts (in Xen) on the original frames.
*/
- for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
+ for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
+ pte_t pte = __pte_ma(0);
+
+ if (pfn < PFN_UP(ISA_END_ADDRESS))
+ pte = mfn_pte(pfn, PAGE_KERNEL_IO);
+
(void)HYPERVISOR_update_va_mapping(
- (unsigned long)__va(pfn << PAGE_SHIFT),
- mfn_pte(pfn, PAGE_KERNEL_IO), 0);
+ (unsigned long)__va(pfn << PAGE_SHIFT), pte, 0);
+ }
if (start_pfn < nr_pages)
*released += xen_release_chunk(
@@ -313,6 +322,17 @@ static void xen_align_and_add_e820_region(u64 start, u64 size, int type)
e820_add_region(start, end - start, type);
}
+void xen_ignore_unusable(struct e820entry *list, size_t map_size)
+{
+ struct e820entry *entry;
+ unsigned int i;
+
+ for (i = 0, entry = list; i < map_size; i++, entry++) {
+ if (entry->type == E820_UNUSABLE)
+ entry->type = E820_RAM;
+ }
+}
+
/**
* machine_specific_memory_setup - Hook for machine specific memory setup.
**/
@@ -353,6 +373,17 @@ char * __init xen_memory_setup(void)
}
BUG_ON(rc);
+ /*
+ * Xen won't allow a 1:1 mapping to be created to UNUSABLE
+ * regions, so if we're using the machine memory map leave the
+ * region as RAM as it is in the pseudo-physical map.
+ *
+ * UNUSABLE regions in domUs are not handled and will need
+ * a patch in the future.
+ */
+ if (xen_initial_domain())
+ xen_ignore_unusable(map, memmap.nr_entries);
+
/* Make sure the Xen-supplied memory map is well-ordered. */
sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);
@@ -525,7 +556,13 @@ void xen_enable_syscall(void)
}
#endif /* CONFIG_X86_64 */
}
-
+void __cpuinit xen_enable_nmi(void)
+{
+#ifdef CONFIG_X86_64
+ if (register_callback(CALLBACKTYPE_nmi, nmi))
+ BUG();
+#endif
+}
void __init xen_arch_setup(void)
{
xen_panic_handler_init();
@@ -543,7 +580,7 @@ void __init xen_arch_setup(void)
xen_enable_sysenter();
xen_enable_syscall();
-
+ xen_enable_nmi();
#ifdef CONFIG_ACPI
if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index ca92754eb846..d1e4777b4e75 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -273,12 +273,21 @@ static void __init xen_smp_prepare_boot_cpu(void)
BUG_ON(smp_processor_id() != 0);
native_smp_prepare_boot_cpu();
- /* We've switched to the "real" per-cpu gdt, so make sure the
- old memory can be recycled */
- make_lowmem_page_readwrite(xen_initial_gdt);
+ if (xen_pv_domain()) {
+ /* We've switched to the "real" per-cpu gdt, so make sure the
+ old memory can be recycled */
+ make_lowmem_page_readwrite(xen_initial_gdt);
- xen_filter_cpu_maps();
- xen_setup_vcpu_info_placement();
+ xen_filter_cpu_maps();
+ xen_setup_vcpu_info_placement();
+ }
+ /*
+ * The alternative logic (which patches the unlock/lock) runs before
+ * the smp bootup up code is activated. Hence we need to set this up
+ * the core kernel is being patched. Otherwise we will have only
+ * modules patched but not core code.
+ */
+ xen_init_spinlocks();
}
static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
@@ -572,6 +581,12 @@ static inline int xen_map_vector(int vector)
case IRQ_WORK_VECTOR:
xen_vector = XEN_IRQ_WORK_VECTOR;
break;
+#ifdef CONFIG_X86_64
+ case NMI_VECTOR:
+ case APIC_DM_NMI: /* Some use that instead of NMI_VECTOR */
+ xen_vector = XEN_NMI_VECTOR;
+ break;
+#endif
default:
xen_vector = -1;
printk(KERN_ERR "xen: vector 0x%x is not implemented\n",
@@ -680,7 +695,6 @@ void __init xen_smp_init(void)
{
smp_ops = xen_smp_ops;
xen_fill_possible_map();
- xen_init_spinlocks();
}
static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
@@ -694,8 +708,24 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
{
int rc;
- rc = native_cpu_up(cpu, tidle);
- WARN_ON (xen_smp_intr_init(cpu));
+ /*
+ * xen_smp_intr_init() needs to run before native_cpu_up()
+ * so that IPI vectors are set up on the booting CPU before
+ * it is marked online in native_cpu_up().
+ */
+ rc = xen_smp_intr_init(cpu);
+ WARN_ON(rc);
+ if (!rc)
+ rc = native_cpu_up(cpu, tidle);
+
+ /*
+ * We must initialize the slowpath CPU kicker _after_ the native
+ * path has executed. If we initialized it before none of the
+ * unlocker IPI kicks would reach the booting CPU as the booting
+ * CPU had not set itself 'online' in cpu_online_mask. That mask
+ * is checked when IPIs are sent (on HVM at least).
+ */
+ xen_init_lock_cpu(cpu);
return rc;
}
@@ -715,4 +745,5 @@ void __init xen_hvm_smp_init(void)
smp_ops.cpu_die = xen_hvm_cpu_die;
smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
+ smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu;
}
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index cf3caee356b3..253f63fceea1 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -17,45 +17,44 @@
#include "xen-ops.h"
#include "debugfs.h"
-#ifdef CONFIG_XEN_DEBUG_FS
-static struct xen_spinlock_stats
-{
- u64 taken;
- u32 taken_slow;
- u32 taken_slow_nested;
- u32 taken_slow_pickup;
- u32 taken_slow_spurious;
- u32 taken_slow_irqenable;
+enum xen_contention_stat {
+ TAKEN_SLOW,
+ TAKEN_SLOW_PICKUP,
+ TAKEN_SLOW_SPURIOUS,
+ RELEASED_SLOW,
+ RELEASED_SLOW_KICKED,
+ NR_CONTENTION_STATS
+};
- u64 released;
- u32 released_slow;
- u32 released_slow_kicked;
+#ifdef CONFIG_XEN_DEBUG_FS
#define HISTO_BUCKETS 30
- u32 histo_spin_total[HISTO_BUCKETS+1];
- u32 histo_spin_spinning[HISTO_BUCKETS+1];
+static struct xen_spinlock_stats
+{
+ u32 contention_stats[NR_CONTENTION_STATS];
u32 histo_spin_blocked[HISTO_BUCKETS+1];
-
- u64 time_total;
- u64 time_spinning;
u64 time_blocked;
} spinlock_stats;
static u8 zero_stats;
-static unsigned lock_timeout = 1 << 10;
-#define TIMEOUT lock_timeout
-
static inline void check_zero(void)
{
- if (unlikely(zero_stats)) {
- memset(&spinlock_stats, 0, sizeof(spinlock_stats));
- zero_stats = 0;
+ u8 ret;
+ u8 old = ACCESS_ONCE(zero_stats);
+ if (unlikely(old)) {
+ ret = cmpxchg(&zero_stats, old, 0);
+ /* This ensures only one fellow resets the stat */
+ if (ret == old)
+ memset(&spinlock_stats, 0, sizeof(spinlock_stats));
}
}
-#define ADD_STATS(elem, val) \
- do { check_zero(); spinlock_stats.elem += (val); } while(0)
+static inline void add_stats(enum xen_contention_stat var, u32 val)
+{
+ check_zero();
+ spinlock_stats.contention_stats[var] += val;
+}
static inline u64 spin_time_start(void)
{
@@ -74,22 +73,6 @@ static void __spin_time_accum(u64 delta, u32 *array)
array[HISTO_BUCKETS]++;
}
-static inline void spin_time_accum_spinning(u64 start)
-{
- u32 delta = xen_clocksource_read() - start;
-
- __spin_time_accum(delta, spinlock_stats.histo_spin_spinning);
- spinlock_stats.time_spinning += delta;
-}
-
-static inline void spin_time_accum_total(u64 start)
-{
- u32 delta = xen_clocksource_read() - start;
-
- __spin_time_accum(delta, spinlock_stats.histo_spin_total);
- spinlock_stats.time_total += delta;
-}
-
static inline void spin_time_accum_blocked(u64 start)
{
u32 delta = xen_clocksource_read() - start;
@@ -98,263 +81,138 @@ static inline void spin_time_accum_blocked(u64 start)
spinlock_stats.time_blocked += delta;
}
#else /* !CONFIG_XEN_DEBUG_FS */
-#define TIMEOUT (1 << 10)
-#define ADD_STATS(elem, val) do { (void)(val); } while(0)
+static inline void add_stats(enum xen_contention_stat var, u32 val)
+{
+}
static inline u64 spin_time_start(void)
{
return 0;
}
-static inline void spin_time_accum_total(u64 start)
-{
-}
-static inline void spin_time_accum_spinning(u64 start)
-{
-}
static inline void spin_time_accum_blocked(u64 start)
{
}
#endif /* CONFIG_XEN_DEBUG_FS */
-/*
- * Size struct xen_spinlock so it's the same as arch_spinlock_t.
- */
-#if NR_CPUS < 256
-typedef u8 xen_spinners_t;
-# define inc_spinners(xl) \
- asm(LOCK_PREFIX " incb %0" : "+m" ((xl)->spinners) : : "memory");
-# define dec_spinners(xl) \
- asm(LOCK_PREFIX " decb %0" : "+m" ((xl)->spinners) : : "memory");
-#else
-typedef u16 xen_spinners_t;
-# define inc_spinners(xl) \
- asm(LOCK_PREFIX " incw %0" : "+m" ((xl)->spinners) : : "memory");
-# define dec_spinners(xl) \
- asm(LOCK_PREFIX " decw %0" : "+m" ((xl)->spinners) : : "memory");
-#endif
-
-struct xen_spinlock {
- unsigned char lock; /* 0 -> free; 1 -> locked */
- xen_spinners_t spinners; /* count of waiting cpus */
+struct xen_lock_waiting {
+ struct arch_spinlock *lock;
+ __ticket_t want;
};
-static int xen_spin_is_locked(struct arch_spinlock *lock)
-{
- struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-
- return xl->lock != 0;
-}
-
-static int xen_spin_is_contended(struct arch_spinlock *lock)
-{
- struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-
- /* Not strictly true; this is only the count of contended
- lock-takers entering the slow path. */
- return xl->spinners != 0;
-}
-
-static int xen_spin_trylock(struct arch_spinlock *lock)
-{
- struct xen_spinlock *xl = (struct xen_spinlock *)lock;
- u8 old = 1;
-
- asm("xchgb %b0,%1"
- : "+q" (old), "+m" (xl->lock) : : "memory");
-
- return old == 0;
-}
-
-static DEFINE_PER_CPU(char *, irq_name);
static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
-static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
-
-/*
- * Mark a cpu as interested in a lock. Returns the CPU's previous
- * lock of interest, in case we got preempted by an interrupt.
- */
-static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
-{
- struct xen_spinlock *prev;
-
- prev = __this_cpu_read(lock_spinners);
- __this_cpu_write(lock_spinners, xl);
-
- wmb(); /* set lock of interest before count */
-
- inc_spinners(xl);
-
- return prev;
-}
-
-/*
- * Mark a cpu as no longer interested in a lock. Restores previous
- * lock of interest (NULL for none).
- */
-static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev)
-{
- dec_spinners(xl);
- wmb(); /* decrement count before restoring lock */
- __this_cpu_write(lock_spinners, prev);
-}
+static DEFINE_PER_CPU(char *, irq_name);
+static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting);
+static cpumask_t waiting_cpus;
-static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable)
+static bool xen_pvspin = true;
+static void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
{
- struct xen_spinlock *xl = (struct xen_spinlock *)lock;
- struct xen_spinlock *prev;
int irq = __this_cpu_read(lock_kicker_irq);
- int ret;
+ struct xen_lock_waiting *w = &__get_cpu_var(lock_waiting);
+ int cpu = smp_processor_id();
u64 start;
+ unsigned long flags;
/* If kicker interrupts not initialized yet, just spin */
if (irq == -1)
- return 0;
+ return;
start = spin_time_start();
- /* announce we're spinning */
- prev = spinning_lock(xl);
+ /*
+ * Make sure an interrupt handler can't upset things in a
+ * partially setup state.
+ */
+ local_irq_save(flags);
+ /*
+ * We don't really care if we're overwriting some other
+ * (lock,want) pair, as that would mean that we're currently
+ * in an interrupt context, and the outer context had
+ * interrupts enabled. That has already kicked the VCPU out
+ * of xen_poll_irq(), so it will just return spuriously and
+ * retry with newly setup (lock,want).
+ *
+ * The ordering protocol on this is that the "lock" pointer
+ * may only be set non-NULL if the "want" ticket is correct.
+ * If we're updating "want", we must first clear "lock".
+ */
+ w->lock = NULL;
+ smp_wmb();
+ w->want = want;
+ smp_wmb();
+ w->lock = lock;
- ADD_STATS(taken_slow, 1);
- ADD_STATS(taken_slow_nested, prev != NULL);
+ /* This uses set_bit, which atomic and therefore a barrier */
+ cpumask_set_cpu(cpu, &waiting_cpus);
+ add_stats(TAKEN_SLOW, 1);
- do {
- unsigned long flags;
+ /* clear pending */
+ xen_clear_irq_pending(irq);
- /* clear pending */
- xen_clear_irq_pending(irq);
+ /* Only check lock once pending cleared */
+ barrier();
- /* check again make sure it didn't become free while
- we weren't looking */
- ret = xen_spin_trylock(lock);
- if (ret) {
- ADD_STATS(taken_slow_pickup, 1);
+ /*
+ * Mark entry to slowpath before doing the pickup test to make
+ * sure we don't deadlock with an unlocker.
+ */
+ __ticket_enter_slowpath(lock);
- /*
- * If we interrupted another spinlock while it
- * was blocking, make sure it doesn't block
- * without rechecking the lock.
- */
- if (prev != NULL)
- xen_set_irq_pending(irq);
- goto out;
- }
+ /*
+ * check again make sure it didn't become free while
+ * we weren't looking
+ */
+ if (ACCESS_ONCE(lock->tickets.head) == want) {
+ add_stats(TAKEN_SLOW_PICKUP, 1);
+ goto out;
+ }
- flags = arch_local_save_flags();
- if (irq_enable) {
- ADD_STATS(taken_slow_irqenable, 1);
- raw_local_irq_enable();
- }
+ /* Allow interrupts while blocked */
+ local_irq_restore(flags);
- /*
- * Block until irq becomes pending. If we're
- * interrupted at this point (after the trylock but
- * before entering the block), then the nested lock
- * handler guarantees that the irq will be left
- * pending if there's any chance the lock became free;
- * xen_poll_irq() returns immediately if the irq is
- * pending.
- */
- xen_poll_irq(irq);
+ /*
+ * If an interrupt happens here, it will leave the wakeup irq
+ * pending, which will cause xen_poll_irq() to return
+ * immediately.
+ */
- raw_local_irq_restore(flags);
+ /* Block until irq becomes pending (or perhaps a spurious wakeup) */
+ xen_poll_irq(irq);
+ add_stats(TAKEN_SLOW_SPURIOUS, !xen_test_irq_pending(irq));
- ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
- } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
+ local_irq_save(flags);
kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
-
out:
- unspinning_lock(xl, prev);
- spin_time_accum_blocked(start);
+ cpumask_clear_cpu(cpu, &waiting_cpus);
+ w->lock = NULL;
- return ret;
-}
+ local_irq_restore(flags);
-static inline void __xen_spin_lock(struct arch_spinlock *lock, bool irq_enable)
-{
- struct xen_spinlock *xl = (struct xen_spinlock *)lock;
- unsigned timeout;
- u8 oldval;
- u64 start_spin;
-
- ADD_STATS(taken, 1);
-
- start_spin = spin_time_start();
-
- do {
- u64 start_spin_fast = spin_time_start();
-
- timeout = TIMEOUT;
-
- asm("1: xchgb %1,%0\n"
- " testb %1,%1\n"
- " jz 3f\n"
- "2: rep;nop\n"
- " cmpb $0,%0\n"
- " je 1b\n"
- " dec %2\n"
- " jnz 2b\n"
- "3:\n"
- : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
- : "1" (1)
- : "memory");
-
- spin_time_accum_spinning(start_spin_fast);
-
- } while (unlikely(oldval != 0 &&
- (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable))));
-
- spin_time_accum_total(start_spin);
-}
-
-static void xen_spin_lock(struct arch_spinlock *lock)
-{
- __xen_spin_lock(lock, false);
-}
-
-static void xen_spin_lock_flags(struct arch_spinlock *lock, unsigned long flags)
-{
- __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags));
+ spin_time_accum_blocked(start);
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_lock_spinning);
-static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
+static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next)
{
int cpu;
- ADD_STATS(released_slow, 1);
+ add_stats(RELEASED_SLOW, 1);
- for_each_online_cpu(cpu) {
- /* XXX should mix up next cpu selection */
- if (per_cpu(lock_spinners, cpu) == xl) {
- ADD_STATS(released_slow_kicked, 1);
+ for_each_cpu(cpu, &waiting_cpus) {
+ const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu);
+
+ /* Make sure we read lock before want */
+ if (ACCESS_ONCE(w->lock) == lock &&
+ ACCESS_ONCE(w->want) == next) {
+ add_stats(RELEASED_SLOW_KICKED, 1);
xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
+ break;
}
}
}
-static void xen_spin_unlock(struct arch_spinlock *lock)
-{
- struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-
- ADD_STATS(released, 1);
-
- smp_wmb(); /* make sure no writes get moved after unlock */
- xl->lock = 0; /* release lock */
-
- /*
- * Make sure unlock happens before checking for waiting
- * spinners. We need a strong barrier to enforce the
- * write-read ordering to different memory locations, as the
- * CPU makes no implied guarantees about their ordering.
- */
- mb();
-
- if (unlikely(xl->spinners))
- xen_spin_unlock_slow(xl);
-}
-
static irqreturn_t dummy_handler(int irq, void *dev_id)
{
BUG();
@@ -366,16 +224,12 @@ void xen_init_lock_cpu(int cpu)
int irq;
char *name;
+ if (!xen_pvspin)
+ return;
+
WARN(per_cpu(lock_kicker_irq, cpu) >= 0, "spinlock on CPU%d exists on IRQ%d!\n",
cpu, per_cpu(lock_kicker_irq, cpu));
- /*
- * See git commit f10cd522c5fbfec9ae3cc01967868c9c2401ed23
- * (xen: disable PV spinlocks on HVM)
- */
- if (xen_hvm_domain())
- return;
-
name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
cpu,
@@ -395,11 +249,7 @@ void xen_init_lock_cpu(int cpu)
void xen_uninit_lock_cpu(int cpu)
{
- /*
- * See git commit f10cd522c5fbfec9ae3cc01967868c9c2401ed23
- * (xen: disable PV spinlocks on HVM)
- */
- if (xen_hvm_domain())
+ if (!xen_pvspin)
return;
unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL);
@@ -408,25 +258,28 @@ void xen_uninit_lock_cpu(int cpu)
per_cpu(irq_name, cpu) = NULL;
}
+
void __init xen_init_spinlocks(void)
{
- /*
- * See git commit f10cd522c5fbfec9ae3cc01967868c9c2401ed23
- * (xen: disable PV spinlocks on HVM)
- */
- if (xen_hvm_domain())
+
+ if (!xen_pvspin) {
+ printk(KERN_DEBUG "xen: PV spinlocks disabled\n");
return;
+ }
- BUILD_BUG_ON(sizeof(struct xen_spinlock) > sizeof(arch_spinlock_t));
+ static_key_slow_inc(&paravirt_ticketlocks_enabled);
- pv_lock_ops.spin_is_locked = xen_spin_is_locked;
- pv_lock_ops.spin_is_contended = xen_spin_is_contended;
- pv_lock_ops.spin_lock = xen_spin_lock;
- pv_lock_ops.spin_lock_flags = xen_spin_lock_flags;
- pv_lock_ops.spin_trylock = xen_spin_trylock;
- pv_lock_ops.spin_unlock = xen_spin_unlock;
+ pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning);
+ pv_lock_ops.unlock_kick = xen_unlock_kick;
}
+static __init int xen_parse_nopvspin(char *arg)
+{
+ xen_pvspin = false;
+ return 0;
+}
+early_param("xen_nopvspin", xen_parse_nopvspin);
+
#ifdef CONFIG_XEN_DEBUG_FS
static struct dentry *d_spin_debug;
@@ -438,41 +291,28 @@ static int __init xen_spinlock_debugfs(void)
if (d_xen == NULL)
return -ENOMEM;
+ if (!xen_pvspin)
+ return 0;
+
d_spin_debug = debugfs_create_dir("spinlocks", d_xen);
debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
- debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout);
-
- debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken);
debugfs_create_u32("taken_slow", 0444, d_spin_debug,
- &spinlock_stats.taken_slow);
- debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug,
- &spinlock_stats.taken_slow_nested);
+ &spinlock_stats.contention_stats[TAKEN_SLOW]);
debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
- &spinlock_stats.taken_slow_pickup);
+ &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]);
debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug,
- &spinlock_stats.taken_slow_spurious);
- debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug,
- &spinlock_stats.taken_slow_irqenable);
+ &spinlock_stats.contention_stats[TAKEN_SLOW_SPURIOUS]);
- debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released);
debugfs_create_u32("released_slow", 0444, d_spin_debug,
- &spinlock_stats.released_slow);
+ &spinlock_stats.contention_stats[RELEASED_SLOW]);
debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
- &spinlock_stats.released_slow_kicked);
+ &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]);
- debugfs_create_u64("time_spinning", 0444, d_spin_debug,
- &spinlock_stats.time_spinning);
debugfs_create_u64("time_blocked", 0444, d_spin_debug,
&spinlock_stats.time_blocked);
- debugfs_create_u64("time_total", 0444, d_spin_debug,
- &spinlock_stats.time_total);
- debugfs_create_u32_array("histo_total", 0444, d_spin_debug,
- spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1);
- debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug,
- spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1);
debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);