Merge tag 'v4.0-rc5' into x86/fpu, to prevent conflicts

Signed-off-by: Ingo Molnar <mingo@kernel.org>
author: Ingo Molnar <mingo@kernel.org> 2015-03-23 10:13:36 +0100
committer: Ingo Molnar <mingo@kernel.org> 2015-03-23 10:13:36 +0100
commit: eda2360ad18b7cde87728fad85c6735a52c2576e (patch)
tree: 2bc28704b40686c0d3d0fd5aa7d000f9d9dd153f /arch/x86
parent: 1d23c4518b1f3a03c278f23333149245c178d2a6 (diff)
parent: bc465aa9d045feb0e13b4a8f32cc33c1943f62d6 (diff)
148 files changed, 4859 insertions, 2179 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5e28e2be3a41..b7d31ca55187 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -17,6 +17,7 @@ config X86_64
 	depends on 64BIT
 	select X86_DEV_DMA_OPS
 	select ARCH_USE_CMPXCHG_LOCKREF
+	select HAVE_LIVEPATCH
 
 ### Arch settings
 config X86
@@ -84,6 +85,7 @@ config X86
 	select HAVE_CMPXCHG_LOCAL
 	select HAVE_CMPXCHG_DOUBLE
 	select HAVE_ARCH_KMEMCHECK
+	select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
 	select HAVE_USER_RETURN_NOTIFIER
 	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
 	select HAVE_ARCH_JUMP_LABEL
@@ -486,6 +488,23 @@ config X86_INTEL_MID
 	  Intel MID platforms are based on an Intel processor and chipset which
 	  consume less power than most of the x86 derivatives.
 
+config X86_INTEL_QUARK
+	bool "Intel Quark platform support"
+	depends on X86_32
+	depends on X86_EXTENDED_PLATFORM
+	depends on X86_PLATFORM_DEVICES
+	depends on X86_TSC
+	depends on PCI
+	depends on PCI_GOANY
+	depends on X86_IO_APIC
+	select IOSF_MBI
+	select INTEL_IMR
+	select COMMON_CLK
+	---help---
+	  Select to include support for Quark X1000 SoC.
+	  Say Y here if you have a Quark based system such as the Arduino
+	  compatible Intel Galileo.
+
 config X86_INTEL_LPSS
 	bool "Intel Low Power Subsystem Support"
 	depends on ACPI
@@ -497,6 +516,17 @@ config X86_INTEL_LPSS
 	  things like clock tree (common clock framework) and pincontrol
 	  which are needed by the LPSS peripheral drivers.
 
+config X86_AMD_PLATFORM_DEVICE
+	bool "AMD ACPI2Platform devices support"
+	depends on ACPI
+	select COMMON_CLK
+	select PINCTRL
+	---help---
+	  Select to interpret AMD specific ACPI device to platform device
+	  such as I2C, UART, GPIO found on AMD Carrizo and later chipsets.
+	  I2C and UART depend on COMMON_CLK to set clock. GPIO driver is
+	  implemented under PINCTRL subsystem.
+
 config IOSF_MBI
 	tristate "Intel SoC IOSF Sideband support for SoC platforms"
 	depends on PCI
@@ -2017,6 +2047,8 @@ config CMDLINE_OVERRIDE
 	  This is used to work around broken boot loaders.  This should
 	  be set to 'N' under normal conditions.
 
+source "kernel/livepatch/Kconfig"
+
 endmenu
 
 config ARCH_ENABLE_MEMORY_HOTPLUG
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 61bd2ad94281..20028da8ae18 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -313,6 +313,19 @@ config DEBUG_NMI_SELFTEST
 
 	  If unsure, say N.
 
+config DEBUG_IMR_SELFTEST
+	bool "Isolated Memory Region self test"
+	default n
+	depends on INTEL_IMR
+	---help---
+	  This option enables automated sanity testing of the IMR code.
+	  Some simple tests are run to verify IMR bounds checking, alignment
+	  and overlapping. This option is really only useful if you are
+	  debugging an IMR memory map or are modifying the IMR code and want to
+	  test your changes.
+
+	  If unsure say N here.
+
 config X86_DEBUG_STATIC_CPU_HAS
 	bool "Debug alternatives"
 	depends on DEBUG_KERNEL
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 920e6160c535..5ba2d9ce82dc 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -148,6 +148,7 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI
 
 # does binutils support specific instructions?
 asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
+asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1)
 asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1)
 avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
 avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um
index 36b62bc52638..95eba554baf9 100644
--- a/arch/x86/Makefile.um
+++ b/arch/x86/Makefile.um
@@ -30,7 +30,7 @@ cflags-y += -ffreestanding
 # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use
 # a lot more stack due to the lack of sharing of stacklots.  Also, gcc
 # 4.3.0 needs -funit-at-a-time for extern inline functions.
-KBUILD_CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then \
+KBUILD_CFLAGS += $(shell if [ $(cc-version) -lt 0400 ] ; then \
 			echo $(call cc-option,-fno-unit-at-a-time); \
 			else echo $(call cc-option,-funit-at-a-time); fi ;)
 
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 3db07f30636f..57bbf2fb21f6 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -14,6 +14,8 @@
 # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
 # The number is the same as you would ordinarily press at bootup.
 
+KASAN_SANITIZE := n
+
 SVGA_MODE	:= -DSVGA_MODE=NORMAL_VGA
 
 targets		:= vmlinux.bin setup.bin setup.elf bzImage
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index ad754b4411f7..0a291cdfaf77 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -16,6 +16,8 @@
 #	(see scripts/Makefile.lib size_append)
 #	compressed vmlinux.bin.all + u32 size of vmlinux.bin.all
 
+KASAN_SANITIZE := n
+
 targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \
 	vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4
 
@@ -49,6 +51,7 @@ $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
 
 vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o \
 	$(objtree)/drivers/firmware/efi/libstub/lib.a
+vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o
 
 $(obj)/vmlinux: $(vmlinux-objs-y) FORCE
 	$(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 92b9a5f2aed6..ef17683484e9 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -13,8 +13,7 @@
 #include <asm/setup.h>
 #include <asm/desc.h>
 
-#undef memcpy			/* Use memcpy from misc.c */
-
+#include "../string.h"
 #include "eboot.h"
 
 static efi_system_table_t *sys_table;
diff --git a/arch/x86/boot/compressed/efi_stub_64.S b/arch/x86/boot/compressed/efi_stub_64.S
index 7ff3632806b1..99494dff2113 100644
--- a/arch/x86/boot/compressed/efi_stub_64.S
+++ b/arch/x86/boot/compressed/efi_stub_64.S
@@ -3,28 +3,3 @@
 #include <asm/processor-flags.h>
 
 #include "../../platform/efi/efi_stub_64.S"
-
-#ifdef CONFIG_EFI_MIXED
-	.code64
-	.text
-ENTRY(efi64_thunk)
-	push	%rbp
-	push	%rbx
-
-	subq	$16, %rsp
-	leaq	efi_exit32(%rip), %rax
-	movl	%eax, 8(%rsp)
-	leaq	efi_gdt64(%rip), %rax
-	movl	%eax, 4(%rsp)
-	movl	%eax, 2(%rax)		/* Fixup the gdt base address */
-	leaq	efi32_boot_gdt(%rip), %rax
-	movl	%eax, (%rsp)
-
-	call	__efi64_thunk
-
-	addq	$16, %rsp
-	pop	%rbx
-	pop	%rbp
-	ret
-ENDPROC(efi64_thunk)
-#endif /* CONFIG_EFI_MIXED */
diff --git a/arch/x86/boot/compressed/efi_thunk_64.S b/arch/x86/boot/compressed/efi_thunk_64.S
new file mode 100644
index 000000000000..630384a4c14a
--- /dev/null
+++ b/arch/x86/boot/compressed/efi_thunk_64.S
@@ -0,0 +1,196 @@
+/*
+ * Copyright (C) 2014, 2015 Intel Corporation; author Matt Fleming
+ *
+ * Early support for invoking 32-bit EFI services from a 64-bit kernel.
+ *
+ * Because this thunking occurs before ExitBootServices() we have to
+ * restore the firmware's 32-bit GDT before we make EFI serivce calls,
+ * since the firmware's 32-bit IDT is still currently installed and it
+ * needs to be able to service interrupts.
+ *
+ * On the plus side, we don't have to worry about mangling 64-bit
+ * addresses into 32-bits because we're executing with an identify
+ * mapped pagetable and haven't transitioned to 64-bit virtual addresses
+ * yet.
+ */
+
+#include <linux/linkage.h>
+#include <asm/msr.h>
+#include <asm/page_types.h>
+#include <asm/processor-flags.h>
+#include <asm/segment.h>
+
+	.code64
+	.text
+ENTRY(efi64_thunk)
+	push	%rbp
+	push	%rbx
+
+	subq	$8, %rsp
+	leaq	efi_exit32(%rip), %rax
+	movl	%eax, 4(%rsp)
+	leaq	efi_gdt64(%rip), %rax
+	movl	%eax, (%rsp)
+	movl	%eax, 2(%rax)		/* Fixup the gdt base address */
+
+	movl	%ds, %eax
+	push	%rax
+	movl	%es, %eax
+	push	%rax
+	movl	%ss, %eax
+	push	%rax
+
+	/*
+	 * Convert x86-64 ABI params to i386 ABI
+	 */
+	subq	$32, %rsp
+	movl	%esi, 0x0(%rsp)
+	movl	%edx, 0x4(%rsp)
+	movl	%ecx, 0x8(%rsp)
+	movq	%r8, %rsi
+	movl	%esi, 0xc(%rsp)
+	movq	%r9, %rsi
+	movl	%esi,  0x10(%rsp)
+
+	sgdt	save_gdt(%rip)
+
+	leaq	1f(%rip), %rbx
+	movq	%rbx, func_rt_ptr(%rip)
+
+	/*
+	 * Switch to gdt with 32-bit segments. This is the firmware GDT
+	 * that was installed when the kernel started executing. This
+	 * pointer was saved at the EFI stub entry point in head_64.S.
+	 */
+	leaq	efi32_boot_gdt(%rip), %rax
+	lgdt	(%rax)
+
+	pushq	$__KERNEL_CS
+	leaq	efi_enter32(%rip), %rax
+	pushq	%rax
+	lretq
+
+1:	addq	$32, %rsp
+
+	lgdt	save_gdt(%rip)
+
+	pop	%rbx
+	movl	%ebx, %ss
+	pop	%rbx
+	movl	%ebx, %es
+	pop	%rbx
+	movl	%ebx, %ds
+
+	/*
+	 * Convert 32-bit status code into 64-bit.
+	 */
+	test	%rax, %rax
+	jz	1f
+	movl	%eax, %ecx
+	andl	$0x0fffffff, %ecx
+	andl	$0xf0000000, %eax
+	shl	$32, %rax
+	or	%rcx, %rax
+1:
+	addq	$8, %rsp
+	pop	%rbx
+	pop	%rbp
+	ret
+ENDPROC(efi64_thunk)
+
+ENTRY(efi_exit32)
+	movq	func_rt_ptr(%rip), %rax
+	push	%rax
+	mov	%rdi, %rax
+	ret
+ENDPROC(efi_exit32)
+
+	.code32
+/*
+ * EFI service pointer must be in %edi.
+ *
+ * The stack should represent the 32-bit calling convention.
+ */
+ENTRY(efi_enter32)
+	movl	$__KERNEL_DS, %eax
+	movl	%eax, %ds
+	movl	%eax, %es
+	movl	%eax, %ss
+
+	/* Reload pgtables */
+	movl	%cr3, %eax
+	movl	%eax, %cr3
+
+	/* Disable paging */
+	movl	%cr0, %eax
+	btrl	$X86_CR0_PG_BIT, %eax
+	movl	%eax, %cr0
+
+	/* Disable long mode via EFER */
+	movl	$MSR_EFER, %ecx
+	rdmsr
+	btrl	$_EFER_LME, %eax
+	wrmsr
+
+	call	*%edi
+
+	/* We must preserve return value */
+	movl	%eax, %edi
+
+	/*
+	 * Some firmware will return with interrupts enabled. Be sure to
+	 * disable them before we switch GDTs.
+	 */
+	cli
+
+	movl	56(%esp), %eax
+	movl	%eax, 2(%eax)
+	lgdtl	(%eax)
+
+	movl	%cr4, %eax
+	btsl	$(X86_CR4_PAE_BIT), %eax
+	movl	%eax, %cr4
+
+	movl	%cr3, %eax
+	movl	%eax, %cr3
+
+	movl	$MSR_EFER, %ecx
+	rdmsr
+	btsl	$_EFER_LME, %eax
+	wrmsr
+
+	xorl	%eax, %eax
+	lldt	%ax
+
+	movl	60(%esp), %eax
+	pushl	$__KERNEL_CS
+	pushl	%eax
+
+	/* Enable paging */
+	movl	%cr0, %eax
+	btsl	$X86_CR0_PG_BIT, %eax
+	movl	%eax, %cr0
+	lret
+ENDPROC(efi_enter32)
+
+	.data
+	.balign	8
+	.global	efi32_boot_gdt
+efi32_boot_gdt:	.word	0
+		.quad	0
+
+save_gdt:	.word	0
+		.quad	0
+func_rt_ptr:	.quad	0
+
+	.global efi_gdt64
+efi_gdt64:
+	.word	efi_gdt64_end - efi_gdt64
+	.long	0			/* Filled out by user */
+	.word	0
+	.quad	0x0000000000000000	/* NULL descriptor */
+	.quad	0x00af9a000000ffff	/* __KERNEL_CS */
+	.quad	0x00cf92000000ffff	/* __KERNEL_DS */
+	.quad	0x0080890000000000	/* TS descriptor */
+	.quad   0x0000000000000000	/* TS continued */
+efi_gdt64_end:
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 24e3e569a13c..04477d68403f 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -7,6 +7,7 @@
  * we just keep it from happening
  */
 #undef CONFIG_PARAVIRT
+#undef CONFIG_KASAN
 #ifdef CONFIG_X86_32
 #define _ASM_X86_DESC_H 1
 #endif
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 477e9d75149b..6bd2c6c95373 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -32,12 +32,23 @@
 #include <linux/linkage.h>
 #include <asm/inst.h>
 
+/*
+ * The following macros are used to move an (un)aligned 16 byte value to/from
+ * an XMM register.  This can done for either FP or integer values, for FP use
+ * movaps (move aligned packed single) or integer use movdqa (move double quad
+ * aligned).  It doesn't make a performance difference which instruction is used
+ * since Nehalem (original Core i7) was released.  However, the movaps is a byte
+ * shorter, so that is the one we'll use for now. (same for unaligned).
+ */
+#define MOVADQ	movaps
+#define MOVUDQ	movups
+
 #ifdef __x86_64__
+
 .data
 .align 16
 .Lgf128mul_x_ble_mask:
 	.octa 0x00000000000000010000000000000087
-
 POLY:   .octa 0xC2000000000000000000000000000001
 TWOONE: .octa 0x00000001000000000000000000000001
 
@@ -89,6 +100,7 @@ enc:        .octa 0x2
 #define arg8 STACK_OFFSET+16(%r14)
 #define arg9 STACK_OFFSET+24(%r14)
 #define arg10 STACK_OFFSET+32(%r14)
+#define keysize 2*15*16(%arg1)
 #endif
 
 
@@ -213,10 +225,12 @@ enc:        .octa 0x2
 
 .macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+        MOVADQ     SHUF_MASK(%rip), %xmm14
 	mov	   arg7, %r10           # %r10 = AAD
 	mov	   arg8, %r12           # %r12 = aadLen
 	mov	   %r12, %r11
 	pxor	   %xmm\i, %xmm\i
+
 _get_AAD_loop\num_initial_blocks\operation:
 	movd	   (%r10), \TMP1
 	pslldq	   $12, \TMP1
@@ -225,16 +239,18 @@ _get_AAD_loop\num_initial_blocks\operation:
 	add	   $4, %r10
 	sub	   $4, %r12
 	jne	   _get_AAD_loop\num_initial_blocks\operation
+
 	cmp	   $16, %r11
 	je	   _get_AAD_loop2_done\num_initial_blocks\operation
+
 	mov	   $16, %r12
 _get_AAD_loop2\num_initial_blocks\operation:
 	psrldq	   $4, %xmm\i
 	sub	   $4, %r12
 	cmp	   %r11, %r12
 	jne	   _get_AAD_loop2\num_initial_blocks\operation
+
 _get_AAD_loop2_done\num_initial_blocks\operation:
-        movdqa     SHUF_MASK(%rip), %xmm14
 	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
 
 	xor	   %r11, %r11 # initialise the data pointer offset as zero
@@ -243,59 +259,34 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
 
 	mov	   %arg5, %rax                      # %rax = *Y0
 	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
-        movdqa     SHUF_MASK(%rip), %xmm14
 	PSHUFB_XMM   %xmm14, \XMM0
 
 .if (\i == 5) || (\i == 6) || (\i == 7)
+	MOVADQ		ONE(%RIP),\TMP1
+	MOVADQ		(%arg1),\TMP2
 .irpc index, \i_seq
-	paddd	   ONE(%rip), \XMM0                 # INCR Y0
+	paddd	   \TMP1, \XMM0                 # INCR Y0
 	movdqa	   \XMM0, %xmm\index
-        movdqa     SHUF_MASK(%rip), %xmm14
 	PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap
-
-.endr
-.irpc index, \i_seq
-	pxor	   16*0(%arg1), %xmm\index
-.endr
-.irpc index, \i_seq
-	movaps 0x10(%rdi), \TMP1
-	AESENC     \TMP1, %xmm\index          # Round 1
-.endr
-.irpc index, \i_seq
-	movaps 0x20(%arg1), \TMP1
-	AESENC     \TMP1, %xmm\index          # Round 2
-.endr
-.irpc index, \i_seq
-	movaps 0x30(%arg1), \TMP1
-	AESENC     \TMP1, %xmm\index          # Round 2
-.endr
-.irpc index, \i_seq
-	movaps 0x40(%arg1), \TMP1
-	AESENC     \TMP1, %xmm\index          # Round 2
-.endr
-.irpc index, \i_seq
-	movaps 0x50(%arg1), \TMP1
-	AESENC     \TMP1, %xmm\index          # Round 2
-.endr
-.irpc index, \i_seq
-	movaps 0x60(%arg1), \TMP1
-	AESENC     \TMP1, %xmm\index          # Round 2
+	pxor	   \TMP2, %xmm\index
 .endr
-.irpc index, \i_seq
-	movaps 0x70(%arg1), \TMP1
-	AESENC     \TMP1, %xmm\index          # Round 2
-.endr
-.irpc index, \i_seq
-	movaps 0x80(%arg1), \TMP1
-	AESENC     \TMP1, %xmm\index          # Round 2
-.endr
-.irpc index, \i_seq
-	movaps 0x90(%arg1), \TMP1
-	AESENC     \TMP1, %xmm\index          # Round 2
+	lea	0x10(%arg1),%r10
+	mov	keysize,%eax
+	shr	$2,%eax				# 128->4, 192->6, 256->8
+	add	$5,%eax			      # 128->9, 192->11, 256->13
+
+aes_loop_initial_dec\num_initial_blocks:
+	MOVADQ	(%r10),\TMP1
+.irpc	index, \i_seq
+	AESENC	\TMP1, %xmm\index
 .endr
+	add	$16,%r10
+	sub	$1,%eax
+	jnz	aes_loop_initial_dec\num_initial_blocks
+
+	MOVADQ	(%r10), \TMP1
 .irpc index, \i_seq
-	movaps 0xa0(%arg1), \TMP1
-	AESENCLAST \TMP1, %xmm\index         # Round 10
+	AESENCLAST \TMP1, %xmm\index         # Last Round
 .endr
 .irpc index, \i_seq
 	movdqu	   (%arg3 , %r11, 1), \TMP1
@@ -305,10 +296,8 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
 	add	   $16, %r11
 
 	movdqa     \TMP1, %xmm\index
-        movdqa     SHUF_MASK(%rip), %xmm14
 	PSHUFB_XMM	   %xmm14, %xmm\index
-
-		# prepare plaintext/ciphertext for GHASH computation
+                # prepare plaintext/ciphertext for GHASH computation
 .endr
 .endif
 	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
@@ -338,30 +327,28 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
 * Precomputations for HashKey parallel with encryption of first 4 blocks.
 * Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
 */
-	paddd	   ONE(%rip), \XMM0              # INCR Y0
-	movdqa	   \XMM0, \XMM1
-        movdqa     SHUF_MASK(%rip), %xmm14
+	MOVADQ	   ONE(%rip), \TMP1
+	paddd	   \TMP1, \XMM0              # INCR Y0
+	MOVADQ	   \XMM0, \XMM1
 	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
 
-	paddd	   ONE(%rip), \XMM0              # INCR Y0
-	movdqa	   \XMM0, \XMM2
-        movdqa     SHUF_MASK(%rip), %xmm14
+	paddd	   \TMP1, \XMM0              # INCR Y0
+	MOVADQ	   \XMM0, \XMM2
 	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
 
-	paddd	   ONE(%rip), \XMM0              # INCR Y0
-	movdqa	   \XMM0, \XMM3
-        movdqa     SHUF_MASK(%rip), %xmm14
+	paddd	   \TMP1, \XMM0              # INCR Y0
+	MOVADQ	   \XMM0, \XMM3
 	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
 
-	paddd	   ONE(%rip), \XMM0              # INCR Y0
-	movdqa	   \XMM0, \XMM4
-        movdqa     SHUF_MASK(%rip), %xmm14
+	paddd	   \TMP1, \XMM0              # INCR Y0
+	MOVADQ	   \XMM0, \XMM4
 	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
 
-	pxor	   16*0(%arg1), \XMM1
-	pxor	   16*0(%arg1), \XMM2
-	pxor	   16*0(%arg1), \XMM3
-	pxor	   16*0(%arg1), \XMM4
+	MOVADQ	   0(%arg1),\TMP1
+	pxor	   \TMP1, \XMM1
+	pxor	   \TMP1, \XMM2
+	pxor	   \TMP1, \XMM3
+	pxor	   \TMP1, \XMM4
 	movdqa	   \TMP3, \TMP5
 	pshufd	   $78, \TMP3, \TMP1
 	pxor	   \TMP3, \TMP1
@@ -399,7 +386,23 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
 	pshufd	   $78, \TMP5, \TMP1
 	pxor	   \TMP5, \TMP1
 	movdqa	   \TMP1, HashKey_4_k(%rsp)
-	movaps 0xa0(%arg1), \TMP2
+	lea	   0xa0(%arg1),%r10
+	mov	   keysize,%eax
+	shr	   $2,%eax			# 128->4, 192->6, 256->8
+	sub	   $4,%eax			# 128->0, 192->2, 256->4
+	jz	   aes_loop_pre_dec_done\num_initial_blocks
+
+aes_loop_pre_dec\num_initial_blocks:
+	MOVADQ	   (%r10),\TMP2
+.irpc	index, 1234
+	AESENC	   \TMP2, %xmm\index
+.endr
+	add	   $16,%r10
+	sub	   $1,%eax
+	jnz	   aes_loop_pre_dec\num_initial_blocks
+
+aes_loop_pre_dec_done\num_initial_blocks:
+	MOVADQ	   (%r10), \TMP2
 	AESENCLAST \TMP2, \XMM1
 	AESENCLAST \TMP2, \XMM2
 	AESENCLAST \TMP2, \XMM3
@@ -421,15 +424,11 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
 	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
 	movdqa     \TMP1, \XMM4
 	add	   $64, %r11
-        movdqa     SHUF_MASK(%rip), %xmm14
 	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
 	pxor	   \XMMDst, \XMM1
 # combine GHASHed value with the corresponding ciphertext
-        movdqa     SHUF_MASK(%rip), %xmm14
 	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
-        movdqa     SHUF_MASK(%rip), %xmm14
 	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
-        movdqa     SHUF_MASK(%rip), %xmm14
 	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
 
 _initial_blocks_done\num_initial_blocks\operation:
@@ -451,6 +450,7 @@ _initial_blocks_done\num_initial_blocks\operation:
 
 .macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+        MOVADQ     SHUF_MASK(%rip), %xmm14
 	mov	   arg7, %r10           # %r10 = AAD
 	mov	   arg8, %r12           # %r12 = aadLen
 	mov	   %r12, %r11
@@ -472,7 +472,6 @@ _get_AAD_loop2\num_initial_blocks\operation:
 	cmp	   %r11, %r12
 	jne	   _get_AAD_loop2\num_initial_blocks\operation
 _get_AAD_loop2_done\num_initial_blocks\operation:
-        movdqa     SHUF_MASK(%rip), %xmm14
 	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
 
 	xor	   %r11, %r11 # initialise the data pointer offset as zero
@@ -481,59 +480,35 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
 
 	mov	   %arg5, %rax                      # %rax = *Y0
 	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
-        movdqa     SHUF_MASK(%rip), %xmm14
 	PSHUFB_XMM   %xmm14, \XMM0
 
 .if (\i == 5) || (\i == 6) || (\i == 7)
-.irpc index, \i_seq
-	paddd	   ONE(%rip), \XMM0                 # INCR Y0
-	movdqa	   \XMM0, %xmm\index
-        movdqa     SHUF_MASK(%rip), %xmm14
-	PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap
 
-.endr
-.irpc index, \i_seq
-	pxor	   16*0(%arg1), %xmm\index
-.endr
-.irpc index, \i_seq
-	movaps 0x10(%rdi), \TMP1
-	AESENC     \TMP1, %xmm\index          # Round 1
-.endr
-.irpc index, \i_seq
-	movaps 0x20(%arg1), \TMP1
-	AESENC     \TMP1, %xmm\index          # Round 2
-.endr
+	MOVADQ		ONE(%RIP),\TMP1
+	MOVADQ		0(%arg1),\TMP2
 .irpc index, \i_seq
-	movaps 0x30(%arg1), \TMP1
-	AESENC     \TMP1, %xmm\index          # Round 2
+	paddd		\TMP1, \XMM0                 # INCR Y0
+	MOVADQ		\XMM0, %xmm\index
+	PSHUFB_XMM	%xmm14, %xmm\index      # perform a 16 byte swap
+	pxor		\TMP2, %xmm\index
 .endr
-.irpc index, \i_seq
-	movaps 0x40(%arg1), \TMP1
-	AESENC     \TMP1, %xmm\index          # Round 2
-.endr
-.irpc index, \i_seq
-	movaps 0x50(%arg1), \TMP1
-	AESENC     \TMP1, %xmm\index          # Round 2
-.endr
-.irpc index, \i_seq
-	movaps 0x60(%arg1), \TMP1
-	AESENC     \TMP1, %xmm\index          # Round 2
-.endr
-.irpc index, \i_seq
-	movaps 0x70(%arg1), \TMP1
-	AESENC     \TMP1, %xmm\index          # Round 2
-.endr
-.irpc index, \i_seq
-	movaps 0x80(%arg1), \TMP1
-	AESENC     \TMP1, %xmm\index          # Round 2
-.endr
-.irpc index, \i_seq
-	movaps 0x90(%arg1), \TMP1
-	AESENC     \TMP1, %xmm\index          # Round 2
+	lea	0x10(%arg1),%r10
+	mov	keysize,%eax
+	shr	$2,%eax				# 128->4, 192->6, 256->8
+	add	$5,%eax			      # 128->9, 192->11, 256->13
+
+aes_loop_initial_enc\num_initial_blocks:
+	MOVADQ	(%r10),\TMP1
+.irpc	index, \i_seq
+	AESENC	\TMP1, %xmm\index
 .endr
+	add	$16,%r10
+	sub	$1,%eax
+	jnz	aes_loop_initial_enc\num_initial_blocks
+
+	MOVADQ	(%r10), \TMP1
 .irpc index, \i_seq
-	movaps 0xa0(%arg1), \TMP1
-	AESENCLAST \TMP1, %xmm\index         # Round 10
+	AESENCLAST \TMP1, %xmm\index         # Last Round
 .endr
 .irpc index, \i_seq
 	movdqu	   (%arg3 , %r11, 1), \TMP1
@@ -541,8 +516,6 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
 	movdqu	   %xmm\index, (%arg2 , %r11, 1)
 	# write back plaintext/ciphertext for num_initial_blocks
 	add	   $16, %r11
-
-        movdqa     SHUF_MASK(%rip), %xmm14
 	PSHUFB_XMM	   %xmm14, %xmm\index
 
 		# prepare plaintext/ciphertext for GHASH computation
@@ -575,30 +548,28 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
 * Precomputations for HashKey parallel with encryption of first 4 blocks.
 * Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
 */
-	paddd	   ONE(%rip), \XMM0              # INCR Y0
-	movdqa	   \XMM0, \XMM1
-        movdqa     SHUF_MASK(%rip), %xmm14
+	MOVADQ	   ONE(%RIP),\TMP1
+	paddd	   \TMP1, \XMM0              # INCR Y0
+	MOVADQ	   \XMM0, \XMM1
 	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
 
-	paddd	   ONE(%rip), \XMM0              # INCR Y0
-	movdqa	   \XMM0, \XMM2
-        movdqa     SHUF_MASK(%rip), %xmm14
+	paddd	   \TMP1, \XMM0              # INCR Y0
+	MOVADQ	   \XMM0, \XMM2
 	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
 
-	paddd	   ONE(%rip), \XMM0              # INCR Y0
-	movdqa	   \XMM0, \XMM3
-        movdqa     SHUF_MASK(%rip), %xmm14
+	paddd	   \TMP1, \XMM0              # INCR Y0
+	MOVADQ	   \XMM0, \XMM3
 	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
 
-	paddd	   ONE(%rip), \XMM0              # INCR Y0
-	movdqa	   \XMM0, \XMM4
-        movdqa     SHUF_MASK(%rip), %xmm14
+	paddd	   \TMP1, \XMM0              # INCR Y0
+	MOVADQ	   \XMM0, \XMM4
 	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
 
-	pxor	   16*0(%arg1), \XMM1
-	pxor	   16*0(%arg1), \XMM2
-	pxor	   16*0(%arg1), \XMM3
-	pxor	   16*0(%arg1), \XMM4
+	MOVADQ	   0(%arg1),\TMP1
+	pxor	   \TMP1, \XMM1
+	pxor	   \TMP1, \XMM2
+	pxor	   \TMP1, \XMM3
+	pxor	   \TMP1, \XMM4
 	movdqa	   \TMP3, \TMP5
 	pshufd	   $78, \TMP3, \TMP1
 	pxor	   \TMP3, \TMP1
@@ -636,7 +607,23 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
 	pshufd	   $78, \TMP5, \TMP1
 	pxor	   \TMP5, \TMP1
 	movdqa	   \TMP1, HashKey_4_k(%rsp)
-	movaps 0xa0(%arg1), \TMP2
+	lea	   0xa0(%arg1),%r10
+	mov	   keysize,%eax
+	shr	   $2,%eax			# 128->4, 192->6, 256->8
+	sub	   $4,%eax			# 128->0, 192->2, 256->4
+	jz	   aes_loop_pre_enc_done\num_initial_blocks
+
+aes_loop_pre_enc\num_initial_blocks:
+	MOVADQ	   (%r10),\TMP2
+.irpc	index, 1234
+	AESENC	   \TMP2, %xmm\index
+.endr
+	add	   $16,%r10
+	sub	   $1,%eax
+	jnz	   aes_loop_pre_enc\num_initial_blocks
+
+aes_loop_pre_enc_done\num_initial_blocks:
+	MOVADQ	   (%r10), \TMP2
 	AESENCLAST \TMP2, \XMM1
 	AESENCLAST \TMP2, \XMM2
 	AESENCLAST \TMP2, \XMM3
@@ -655,15 +642,11 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
 	movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
 
 	add	   $64, %r11
-        movdqa     SHUF_MASK(%rip), %xmm14
 	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
 	pxor	   \XMMDst, \XMM1
 # combine GHASHed value with the corresponding ciphertext
-        movdqa     SHUF_MASK(%rip), %xmm14
 	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
-        movdqa     SHUF_MASK(%rip), %xmm14
 	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
-        movdqa     SHUF_MASK(%rip), %xmm14
 	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
 
 _initial_blocks_done\num_initial_blocks\operation:
@@ -794,7 +777,23 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
 	AESENC	  \TMP3, \XMM3
 	AESENC	  \TMP3, \XMM4
 	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
-	movaps 0xa0(%arg1), \TMP3
+	lea	  0xa0(%arg1),%r10
+	mov	  keysize,%eax
+	shr	  $2,%eax			# 128->4, 192->6, 256->8
+	sub	  $4,%eax			# 128->0, 192->2, 256->4
+	jz	  aes_loop_par_enc_done
+
+aes_loop_par_enc:
+	MOVADQ	  (%r10),\TMP3
+.irpc	index, 1234
+	AESENC	  \TMP3, %xmm\index
+.endr
+	add	  $16,%r10
+	sub	  $1,%eax
+	jnz	  aes_loop_par_enc
+
+aes_loop_par_enc_done:
+	MOVADQ	  (%r10), \TMP3
 	AESENCLAST \TMP3, \XMM1           # Round 10
 	AESENCLAST \TMP3, \XMM2
 	AESENCLAST \TMP3, \XMM3
@@ -986,8 +985,24 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
 	AESENC	  \TMP3, \XMM3
 	AESENC	  \TMP3, \XMM4
 	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
-	movaps 0xa0(%arg1), \TMP3
-	AESENCLAST \TMP3, \XMM1           # Round 10
+	lea	  0xa0(%arg1),%r10
+	mov	  keysize,%eax
+	shr	  $2,%eax		        # 128->4, 192->6, 256->8
+	sub	  $4,%eax			# 128->0, 192->2, 256->4
+	jz	  aes_loop_par_dec_done
+
+aes_loop_par_dec:
+	MOVADQ	  (%r10),\TMP3
+.irpc	index, 1234
+	AESENC	  \TMP3, %xmm\index
+.endr
+	add	  $16,%r10
+	sub	  $1,%eax
+	jnz	  aes_loop_par_dec
+
+aes_loop_par_dec_done:
+	MOVADQ	  (%r10), \TMP3
+	AESENCLAST \TMP3, \XMM1           # last round
 	AESENCLAST \TMP3, \XMM2
 	AESENCLAST \TMP3, \XMM3
 	AESENCLAST \TMP3, \XMM4
@@ -1155,33 +1170,29 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
 	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
 .endm
 
-/* Encryption of a single block done*/
-.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
 
-	pxor	(%arg1), \XMM0
-        movaps 16(%arg1), \TMP1
-	AESENC	\TMP1, \XMM0
-        movaps 32(%arg1), \TMP1
-	AESENC	\TMP1, \XMM0
-        movaps 48(%arg1), \TMP1
-	AESENC	\TMP1, \XMM0
-        movaps 64(%arg1), \TMP1
-	AESENC	\TMP1, \XMM0
-        movaps 80(%arg1), \TMP1
-	AESENC	\TMP1, \XMM0
-        movaps 96(%arg1), \TMP1
-	AESENC	\TMP1, \XMM0
-        movaps 112(%arg1), \TMP1
-	AESENC	\TMP1, \XMM0
-        movaps 128(%arg1), \TMP1
-	AESENC	\TMP1, \XMM0
-        movaps 144(%arg1), \TMP1
-	AESENC	\TMP1, \XMM0
-        movaps 160(%arg1), \TMP1
-	AESENCLAST	\TMP1, \XMM0
-.endm
+/* Encryption of a single block
+* uses eax & r10
+*/
 
+.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
 
+	pxor		(%arg1), \XMM0
+	mov		keysize,%eax
+	shr		$2,%eax			# 128->4, 192->6, 256->8
+	add		$5,%eax			# 128->9, 192->11, 256->13
+	lea		16(%arg1), %r10	  # get first expanded key address
+
+_esb_loop_\@:
+	MOVADQ		(%r10),\TMP1
+	AESENC		\TMP1,\XMM0
+	add		$16,%r10
+	sub		$1,%eax
+	jnz		_esb_loop_\@
+
+	MOVADQ		(%r10),\TMP1
+	AESENCLAST	\TMP1,\XMM0
+.endm
 /*****************************************************************************
 * void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
 *                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index ae855f4f64b7..54f60ab41c63 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -43,6 +43,7 @@
 #include <asm/crypto/glue_helper.h>
 #endif
 
+
 /* This data is stored at the end of the crypto_tfm struct.
  * It's a type of per "session" data storage location.
  * This needs to be 16 byte aligned.
@@ -182,7 +183,8 @@ static void aesni_gcm_enc_avx(void *ctx, u8 *out,
 			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
 			u8 *auth_tag, unsigned long auth_tag_len)
 {
-	if (plaintext_len < AVX_GEN2_OPTSIZE) {
+        struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
+	if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)){
 		aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad,
 				aad_len, auth_tag, auth_tag_len);
 	} else {
@@ -197,7 +199,8 @@ static void aesni_gcm_dec_avx(void *ctx, u8 *out,
 			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
 			u8 *auth_tag, unsigned long auth_tag_len)
 {
-	if (ciphertext_len < AVX_GEN2_OPTSIZE) {
+        struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
+	if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
 		aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey, aad,
 				aad_len, auth_tag, auth_tag_len);
 	} else {
@@ -231,7 +234,8 @@ static void aesni_gcm_enc_avx2(void *ctx, u8 *out,
 			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
 			u8 *auth_tag, unsigned long auth_tag_len)
 {
-	if (plaintext_len < AVX_GEN2_OPTSIZE) {
+       struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
+	if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
 		aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad,
 				aad_len, auth_tag, auth_tag_len);
 	} else if (plaintext_len < AVX_GEN4_OPTSIZE) {
@@ -250,7 +254,8 @@ static void aesni_gcm_dec_avx2(void *ctx, u8 *out,
 			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
 			u8 *auth_tag, unsigned long auth_tag_len)
 {
-	if (ciphertext_len < AVX_GEN2_OPTSIZE) {
+       struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
+	if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
 		aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey,
 				aad, aad_len, auth_tag, auth_tag_len);
 	} else if (ciphertext_len < AVX_GEN4_OPTSIZE) {
@@ -511,7 +516,7 @@ static int ctr_crypt(struct blkcipher_desc *desc,
 	kernel_fpu_begin();
 	while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
 		aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
-				  nbytes & AES_BLOCK_MASK, walk.iv);
+			              nbytes & AES_BLOCK_MASK, walk.iv);
 		nbytes &= AES_BLOCK_SIZE - 1;
 		err = blkcipher_walk_done(desc, &walk, nbytes);
 	}
@@ -902,7 +907,8 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
 	}
 	/*Account for 4 byte nonce at the end.*/
 	key_len -= 4;
-	if (key_len != AES_KEYSIZE_128) {
+	if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 &&
+	    key_len != AES_KEYSIZE_256) {
 		crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
 		return -EINVAL;
 	}
@@ -1013,6 +1019,7 @@ static int __driver_rfc4106_encrypt(struct aead_request *req)
 	__be32 counter = cpu_to_be32(1);
 	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
 	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+	u32 key_len = ctx->aes_key_expanded.key_length;
 	void *aes_ctx = &(ctx->aes_key_expanded);
 	unsigned long auth_tag_len = crypto_aead_authsize(tfm);
 	u8 iv_tab[16+AESNI_ALIGN];
@@ -1027,6 +1034,13 @@ static int __driver_rfc4106_encrypt(struct aead_request *req)
 	/* to 8 or 12 bytes */
 	if (unlikely(req->assoclen != 8 && req->assoclen != 12))
 		return -EINVAL;
+	if (unlikely(auth_tag_len != 8 && auth_tag_len != 12 && auth_tag_len != 16))
+	        return -EINVAL;
+	if (unlikely(key_len != AES_KEYSIZE_128 &&
+	             key_len != AES_KEYSIZE_192 &&
+	             key_len != AES_KEYSIZE_256))
+	        return -EINVAL;
+
 	/* IV below built */
 	for (i = 0; i < 4; i++)
 		*(iv+i) = ctx->nonce[i];
@@ -1091,6 +1105,7 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
 	int retval = 0;
 	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
 	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+	u32 key_len = ctx->aes_key_expanded.key_length;
 	void *aes_ctx = &(ctx->aes_key_expanded);
 	unsigned long auth_tag_len = crypto_aead_authsize(tfm);
 	u8 iv_and_authTag[32+AESNI_ALIGN];
@@ -1104,6 +1119,13 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
 	if (unlikely((req->cryptlen < auth_tag_len) ||
 		(req->assoclen != 8 && req->assoclen != 12)))
 		return -EINVAL;
+	if (unlikely(auth_tag_len != 8 && auth_tag_len != 12 && auth_tag_len != 16))
+	        return -EINVAL;
+	if (unlikely(key_len != AES_KEYSIZE_128 &&
+	             key_len != AES_KEYSIZE_192 &&
+	             key_len != AES_KEYSIZE_256))
+	        return -EINVAL;
+
 	/* Assuming we are supporting rfc4106 64-bit extended */
 	/* sequence numbers We need to have the AAD length */
 	/* equal to 8 or 12 bytes */
@@ -1133,7 +1155,7 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
 		src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC);
 		if (!src)
 			return -ENOMEM;
-		assoc = (src + req->cryptlen + auth_tag_len);
+		assoc = (src + req->cryptlen);
 		scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
 		scatterwalk_map_and_copy(assoc, req->assoc, 0,
 			req->assoclen, 0);
@@ -1158,7 +1180,7 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
 		scatterwalk_done(&src_sg_walk, 0, 0);
 		scatterwalk_done(&assoc_sg_walk, 0, 0);
 	} else {
-		scatterwalk_map_and_copy(dst, req->dst, 0, req->cryptlen, 1);
+		scatterwalk_map_and_copy(dst, req->dst, 0, tempCipherLen, 1);
 		kfree(src);
 	}
 	return retval;
diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c
index 38a14f818ef1..d6fc59aaaadf 100644
--- a/arch/x86/crypto/des3_ede_glue.c
+++ b/arch/x86/crypto/des3_ede_glue.c
@@ -504,6 +504,4 @@ MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("Triple DES EDE Cipher Algorithm, asm optimized");
 MODULE_ALIAS_CRYPTO("des3_ede");
 MODULE_ALIAS_CRYPTO("des3_ede-asm");
-MODULE_ALIAS_CRYPTO("des");
-MODULE_ALIAS_CRYPTO("des-asm");
 MODULE_AUTHOR("Jussi Kivilinna <jussi.kivilinna@iki.fi>");
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index f9e181aaba97..d0165c9a2932 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -169,7 +169,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
 	u32 tmp;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	get_user_try {
 		/*
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 92003f3c8a42..efc3b22d896e 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -213,7 +213,15 @@ void register_lapic_address(unsigned long address);
 extern void setup_boot_APIC_clock(void);
 extern void setup_secondary_APIC_clock(void);
 extern int APIC_init_uniprocessor(void);
+
+#ifdef CONFIG_X86_64
+static inline int apic_force_enable(unsigned long addr)
+{
+	return -1;
+}
+#else
 extern int apic_force_enable(unsigned long addr);
+#endif
 
 extern int apic_bsp_setup(bool upmode);
 extern void apic_ap_setup(void);
diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index 5fa1be21ac2a..810f20fd4e4e 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -397,7 +397,7 @@ static inline void drop_fpu(struct task_struct *tsk)
 	preempt_disable();
 	tsk->thread.fpu_counter = 0;
 	__drop_fpu(tsk);
-	clear_used_math();
+	clear_stopped_child_used_math(tsk);
 	preempt_enable();
 }
 
diff --git a/arch/x86/include/asm/imr.h b/arch/x86/include/asm/imr.h
new file mode 100644
index 000000000000..cd2ce4068441
--- /dev/null
+++ b/arch/x86/include/asm/imr.h
@@ -0,0 +1,60 @@
+/*
+ * imr.h: Isolated Memory Region API
+ *
+ * Copyright(c) 2013 Intel Corporation.
+ * Copyright(c) 2015 Bryan O'Donoghue <pure.logic@nexus-software.ie>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#ifndef _IMR_H
+#define _IMR_H
+
+#include <linux/types.h>
+
+/*
+ * IMR agent access mask bits
+ * See section 12.7.4.7 from quark-x1000-datasheet.pdf for register
+ * definitions.
+ */
+#define IMR_ESRAM_FLUSH		BIT(31)
+#define IMR_CPU_SNOOP		BIT(30)		/* Applicable only to write */
+#define IMR_RMU			BIT(29)
+#define IMR_VC1_SAI_ID3		BIT(15)
+#define IMR_VC1_SAI_ID2		BIT(14)
+#define IMR_VC1_SAI_ID1		BIT(13)
+#define IMR_VC1_SAI_ID0		BIT(12)
+#define IMR_VC0_SAI_ID3		BIT(11)
+#define IMR_VC0_SAI_ID2		BIT(10)
+#define IMR_VC0_SAI_ID1		BIT(9)
+#define IMR_VC0_SAI_ID0		BIT(8)
+#define IMR_CPU_0		BIT(1)		/* SMM mode */
+#define IMR_CPU			BIT(0)		/* Non SMM mode */
+#define IMR_ACCESS_NONE		0
+
+/*
+ * Read/Write access-all bits here include some reserved bits
+ * These are the values firmware uses and are accepted by hardware.
+ * The kernel defines read/write access-all in the same way as firmware
+ * in order to have a consistent and crisp definition across firmware,
+ * bootloader and kernel.
+ */
+#define IMR_READ_ACCESS_ALL	0xBFFFFFFF
+#define IMR_WRITE_ACCESS_ALL	0xFFFFFFFF
+
+/* Number of IMRs provided by Quark X1000 SoC */
+#define QUARK_X1000_IMR_MAX	0x08
+#define QUARK_X1000_IMR_REGBASE 0x40
+
+/* IMR alignment bits - only bits 31:10 are checked for IMR validity */
+#define IMR_ALIGN		0x400
+#define IMR_MASK		(IMR_ALIGN - 1)
+
+int imr_add_range(phys_addr_t base, size_t size,
+		  unsigned int rmask, unsigned int wmask, bool lock);
+
+int imr_remove_range(phys_addr_t base, size_t size);
+
+#endif /* _IMR_H */
diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h
index e34e097b6f9d..705d35708a50 100644
--- a/arch/x86/include/asm/intel-mid.h
+++ b/arch/x86/include/asm/intel-mid.h
@@ -136,9 +136,6 @@ extern enum intel_mid_timer_options intel_mid_timer_options;
 #define SFI_MTMR_MAX_NUM 8
 #define SFI_MRTC_MAX	8
 
-extern struct console early_mrst_console;
-extern void mrst_early_console_init(void);
-
 extern struct console early_hsu_console;
 extern void hsu_early_console_init(const char *);
 
diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h
new file mode 100644
index 000000000000..8b22422fbad8
--- /dev/null
+++ b/arch/x86/include/asm/kasan.h
@@ -0,0 +1,31 @@
+#ifndef _ASM_X86_KASAN_H
+#define _ASM_X86_KASAN_H
+
+/*
+ * Compiler uses shadow offset assuming that addresses start
+ * from 0. Kernel addresses don't start from 0, so shadow
+ * for kernel really starts from compiler's shadow offset +
+ * 'kernel address space start' >> KASAN_SHADOW_SCALE_SHIFT
+ */
+#define KASAN_SHADOW_START      (KASAN_SHADOW_OFFSET + \
+					(0xffff800000000000ULL >> 3))
+/* 47 bits for kernel address -> (47 - 3) bits for shadow */
+#define KASAN_SHADOW_END        (KASAN_SHADOW_START + (1ULL << (47 - 3)))
+
+#ifndef __ASSEMBLY__
+
+extern pte_t kasan_zero_pte[];
+extern pte_t kasan_zero_pmd[];
+extern pte_t kasan_zero_pud[];
+
+#ifdef CONFIG_KASAN
+void __init kasan_map_early_shadow(pgd_t *pgd);
+void __init kasan_init(void);
+#else
+static inline void kasan_map_early_shadow(pgd_t *pgd) { }
+static inline void kasan_init(void) { }
+#endif
+
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index eb181178fe0b..57a9d94fe160 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -208,6 +208,7 @@ struct x86_emulate_ops {
 
 	void (*get_cpuid)(struct x86_emulate_ctxt *ctxt,
 			  u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
+	void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
 };
 
 typedef u32 __attribute__((vector_size(16))) sse128_t;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d89c6b828c96..a236e39cc385 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -38,8 +38,6 @@
 #define KVM_PRIVATE_MEM_SLOTS 3
 #define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
 
-#define KVM_MMIO_SIZE 16
-
 #define KVM_PIO_PAGE_OFFSET 1
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2
 
@@ -51,7 +49,7 @@
 			  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
 
 #define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL
-#define CR3_PCID_INVD		 (1UL << 63)
+#define CR3_PCID_INVD		 BIT_64(63)
 #define CR4_RESERVED_BITS                                               \
 	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
 			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
@@ -160,6 +158,18 @@ enum {
 #define DR7_FIXED_1	0x00000400
 #define DR7_VOLATILE	0xffff2bff
 
+#define PFERR_PRESENT_BIT 0
+#define PFERR_WRITE_BIT 1
+#define PFERR_USER_BIT 2
+#define PFERR_RSVD_BIT 3
+#define PFERR_FETCH_BIT 4
+
+#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
+#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
+#define PFERR_USER_MASK (1U << PFERR_USER_BIT)
+#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
+#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
+
 /* apic attention bits */
 #define KVM_APIC_CHECK_VAPIC	0
 /*
@@ -615,6 +625,8 @@ struct kvm_arch {
 	#ifdef CONFIG_KVM_MMU_AUDIT
 	int audit_point;
 	#endif
+
+	bool boot_vcpu_runs_old_kvmclock;
 };
 
 struct kvm_vm_stat {
@@ -643,6 +655,7 @@ struct kvm_vcpu_stat {
 	u32 irq_window_exits;
 	u32 nmi_window_exits;
 	u32 halt_exits;
+	u32 halt_successful_poll;
 	u32 halt_wakeup;
 	u32 request_irq_exits;
 	u32 irq_exits;
@@ -787,6 +800,31 @@ struct kvm_x86_ops {
 	int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
 
 	void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
+
+	/*
+	 * Arch-specific dirty logging hooks. These hooks are only supposed to
+	 * be valid if the specific arch has hardware-accelerated dirty logging
+	 * mechanism. Currently only for PML on VMX.
+	 *
+	 *  - slot_enable_log_dirty:
+	 *	called when enabling log dirty mode for the slot.
+	 *  - slot_disable_log_dirty:
+	 *	called when disabling log dirty mode for the slot.
+	 *	also called when slot is created with log dirty disabled.
+	 *  - flush_log_dirty:
+	 *	called before reporting dirty_bitmap to userspace.
+	 *  - enable_log_dirty_pt_masked:
+	 *	called when reenabling log dirty for the GFNs in the mask after
+	 *	corresponding bits are cleared in slot->dirty_bitmap.
+	 */
+	void (*slot_enable_log_dirty)(struct kvm *kvm,
+				      struct kvm_memory_slot *slot);
+	void (*slot_disable_log_dirty)(struct kvm *kvm,
+				       struct kvm_memory_slot *slot);
+	void (*flush_log_dirty)(struct kvm *kvm);
+	void (*enable_log_dirty_pt_masked)(struct kvm *kvm,
+					   struct kvm_memory_slot *slot,
+					   gfn_t offset, unsigned long mask);
 };
 
 struct kvm_arch_async_pf {
@@ -819,10 +857,17 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 		u64 dirty_mask, u64 nx_mask, u64 x_mask);
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
-void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
-				     struct kvm_memory_slot *slot,
-				     gfn_t gfn_offset, unsigned long mask);
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+				      struct kvm_memory_slot *memslot);
+void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
+				   struct kvm_memory_slot *memslot);
+void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
+					struct kvm_memory_slot *memslot);
+void kvm_mmu_slot_set_dirty(struct kvm *kvm,
+			    struct kvm_memory_slot *memslot);
+void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
+				   struct kvm_memory_slot *slot,
+				   gfn_t gfn_offset, unsigned long mask);
 void kvm_mmu_zap_all(struct kvm *kvm);
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm);
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index 879fd7d33877..ef01fef3eebc 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -16,7 +16,6 @@
 #define LHCALL_SET_PTE		14
 #define LHCALL_SET_PGD		15
 #define LHCALL_LOAD_TLS		16
-#define LHCALL_NOTIFY		17
 #define LHCALL_LOAD_GDT_ENTRY	18
 #define LHCALL_SEND_INTERRUPTS	19
 
diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h
new file mode 100644
index 000000000000..a455a53d789a
--- /dev/null
+++ b/arch/x86/include/asm/livepatch.h
@@ -0,0 +1,46 @@
+/*
+ * livepatch.h - x86-specific Kernel Live Patching Core
+ *
+ * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
+ * Copyright (C) 2014 SUSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _ASM_X86_LIVEPATCH_H
+#define _ASM_X86_LIVEPATCH_H
+
+#include <linux/module.h>
+#include <linux/ftrace.h>
+
+#ifdef CONFIG_LIVEPATCH
+static inline int klp_check_compiler_support(void)
+{
+#ifndef CC_USING_FENTRY
+	return 1;
+#endif
+	return 0;
+}
+extern int klp_write_module_reloc(struct module *mod, unsigned long type,
+				  unsigned long loc, unsigned long value);
+
+static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip)
+{
+	regs->ip = ip;
+}
+#else
+#error Live patching support is disabled; check CONFIG_LIVEPATCH
+#endif
+
+#endif /* _ASM_X86_LIVEPATCH_H */
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 876e74e8eec7..09b9620a73b4 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -19,6 +19,8 @@ typedef struct {
 
 	struct mutex lock;
 	void __user *vdso;
+
+	atomic_t perf_rdpmc_allowed;	/* nonzero if rdpmc is allowed */
 } mm_context_t;
 
 #ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 4b75d591eb5e..883f6b933fa4 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -18,6 +18,21 @@ static inline void paravirt_activate_mm(struct mm_struct *prev,
 }
 #endif	/* !CONFIG_PARAVIRT */
 
+#ifdef CONFIG_PERF_EVENTS
+extern struct static_key rdpmc_always_available;
+
+static inline void load_mm_cr4(struct mm_struct *mm)
+{
+	if (static_key_true(&rdpmc_always_available) ||
+	    atomic_read(&mm->context.perf_rdpmc_allowed))
+		cr4_set_bits(X86_CR4_PCE);
+	else
+		cr4_clear_bits(X86_CR4_PCE);
+}
+#else
+static inline void load_mm_cr4(struct mm_struct *mm) {}
+#endif
+
 /*
  * Used for LDT copy/destruction.
  */
@@ -52,15 +67,20 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 		/* Stop flush ipis for the previous mm */
 		cpumask_clear_cpu(cpu, mm_cpumask(prev));
 
+		/* Load per-mm CR4 state */
+		load_mm_cr4(next);
+
 		/*
 		 * Load the LDT, if the LDT is different.
 		 *
-		 * It's possible leave_mm(prev) has been called.  If so,
-		 * then prev->context.ldt could be out of sync with the
-		 * LDT descriptor or the LDT register.  This can only happen
-		 * if prev->context.ldt is non-null, since we never free
-		 * an LDT.  But LDTs can't be shared across mms, so
-		 * prev->context.ldt won't be equal to next->context.ldt.
+		 * It's possible that prev->context.ldt doesn't match
+		 * the LDT register.  This can happen if leave_mm(prev)
+		 * was called and then modify_ldt changed
+		 * prev->context.ldt but suppressed an IPI to this CPU.
+		 * In this case, prev->context.ldt != NULL, because we
+		 * never free an LDT while the mm still exists.  That
+		 * means that next->context.ldt != prev->context.ldt,
+		 * because mms never share an LDT.
 		 */
 		if (unlikely(prev->context.ldt != next->context.ldt))
 			load_LDT_nolock(&next->context);
@@ -85,6 +105,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 			 */
 			load_cr3(next->pgd);
 			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+			load_mm_cr4(next);
 			load_LDT_nolock(&next->context);
 		}
 	}
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 75450b2c7be4..4edd53b79a81 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -1,17 +1,23 @@
 #ifndef _ASM_X86_PAGE_64_DEFS_H
 #define _ASM_X86_PAGE_64_DEFS_H
 
-#define THREAD_SIZE_ORDER	2
+#ifdef CONFIG_KASAN
+#define KASAN_STACK_ORDER 1
+#else
+#define KASAN_STACK_ORDER 0
+#endif
+
+#define THREAD_SIZE_ORDER	(2 + KASAN_STACK_ORDER)
 #define THREAD_SIZE  (PAGE_SIZE << THREAD_SIZE_ORDER)
 #define CURRENT_MASK (~(THREAD_SIZE - 1))
 
-#define EXCEPTION_STACK_ORDER 0
+#define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER)
 #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
 
 #define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
 #define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
 
-#define IRQ_STACK_ORDER 2
+#define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER)
 #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER)
 
 #define DOUBLEFAULT_STACK 1
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 32444ae939ca..965c47d254aa 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -80,16 +80,16 @@ static inline void write_cr3(unsigned long x)
 	PVOP_VCALL1(pv_mmu_ops.write_cr3, x);
 }
 
-static inline unsigned long read_cr4(void)
+static inline unsigned long __read_cr4(void)
 {
 	return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4);
 }
-static inline unsigned long read_cr4_safe(void)
+static inline unsigned long __read_cr4_safe(void)
 {
 	return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe);
 }
 
-static inline void write_cr4(unsigned long x)
+static inline void __write_cr4(unsigned long x)
 {
 	PVOP_VCALL1(pv_cpu_ops.write_cr4, x);
 }
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 206a87fdd22d..fd74a11959de 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -62,44 +62,8 @@ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshi
 	return ((value >> rightshift) & mask) << leftshift;
 }
 
-/*
- * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken,
- * split up the 29 bits of offset into this range.
- */
-#define PTE_FILE_MAX_BITS	29
-#define PTE_FILE_SHIFT1		(_PAGE_BIT_PRESENT + 1)
-#define PTE_FILE_SHIFT2		(_PAGE_BIT_FILE + 1)
-#define PTE_FILE_SHIFT3		(_PAGE_BIT_PROTNONE + 1)
-#define PTE_FILE_BITS1		(PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1)
-#define PTE_FILE_BITS2		(PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1)
-
-#define PTE_FILE_MASK1		((1U << PTE_FILE_BITS1) - 1)
-#define PTE_FILE_MASK2		((1U << PTE_FILE_BITS2) - 1)
-
-#define PTE_FILE_LSHIFT2	(PTE_FILE_BITS1)
-#define PTE_FILE_LSHIFT3	(PTE_FILE_BITS1 + PTE_FILE_BITS2)
-
-static __always_inline pgoff_t pte_to_pgoff(pte_t pte)
-{
-	return (pgoff_t)
-		(pte_bitop(pte.pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1,  0)		    +
-		 pte_bitop(pte.pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2,  PTE_FILE_LSHIFT2) +
-		 pte_bitop(pte.pte_low, PTE_FILE_SHIFT3,           -1UL,  PTE_FILE_LSHIFT3));
-}
-
-static __always_inline pte_t pgoff_to_pte(pgoff_t off)
-{
-	return (pte_t){
-		.pte_low =
-			pte_bitop(off,                0, PTE_FILE_MASK1,  PTE_FILE_SHIFT1) +
-			pte_bitop(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2,  PTE_FILE_SHIFT2) +
-			pte_bitop(off, PTE_FILE_LSHIFT3,           -1UL,  PTE_FILE_SHIFT3) +
-			_PAGE_FILE,
-	};
-}
-
 /* Encode and de-code a swap entry */
-#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
+#define SWP_TYPE_BITS 5
 #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
 
 #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 81bb91b49a88..cdaa58c9b39e 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -176,18 +176,6 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
 #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
 #endif
 
-/*
- * Bits 0, 6 and 7 are taken in the low part of the pte,
- * put the 32 bits of offset into the high part.
- *
- * For soft-dirty tracking 11 bit is taken from
- * the low part of pte as well.
- */
-#define pte_to_pgoff(pte) ((pte).pte_high)
-#define pgoff_to_pte(off)						\
-	((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
-#define PTE_FILE_MAX_BITS       32
-
 /* Encode and de-code a swap entry */
 #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
 #define __swp_type(x)			(((x).val) & 0x1f)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index e8a5454acc99..a0c35bf6cb92 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -115,11 +115,6 @@ static inline int pte_write(pte_t pte)
 	return pte_flags(pte) & _PAGE_RW;
 }
 
-static inline int pte_file(pte_t pte)
-{
-	return pte_flags(pte) & _PAGE_FILE;
-}
-
 static inline int pte_huge(pte_t pte)
 {
 	return pte_flags(pte) & _PAGE_PSE;
@@ -137,13 +132,7 @@ static inline int pte_exec(pte_t pte)
 
 static inline int pte_special(pte_t pte)
 {
-	/*
-	 * See CONFIG_NUMA_BALANCING pte_numa in include/asm-generic/pgtable.h.
-	 * On x86 we have _PAGE_BIT_NUMA == _PAGE_BIT_GLOBAL+1 ==
-	 * __PAGE_BIT_SOFTW1 == _PAGE_BIT_SPECIAL.
-	 */
-	return (pte_flags(pte) & _PAGE_SPECIAL) &&
-		(pte_flags(pte) & (_PAGE_PRESENT|_PAGE_PROTNONE));
+	return pte_flags(pte) & _PAGE_SPECIAL;
 }
 
 static inline unsigned long pte_pfn(pte_t pte)
@@ -305,7 +294,7 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd)
 
 static inline pmd_t pmd_mknotpresent(pmd_t pmd)
 {
-	return pmd_clear_flags(pmd, _PAGE_PRESENT);
+	return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE);
 }
 
 #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
@@ -329,21 +318,6 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
 	return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);
 }
 
-static inline pte_t pte_file_clear_soft_dirty(pte_t pte)
-{
-	return pte_clear_flags(pte, _PAGE_SOFT_DIRTY);
-}
-
-static inline pte_t pte_file_mksoft_dirty(pte_t pte)
-{
-	return pte_set_flags(pte, _PAGE_SOFT_DIRTY);
-}
-
-static inline int pte_file_soft_dirty(pte_t pte)
-{
-	return pte_flags(pte) & _PAGE_SOFT_DIRTY;
-}
-
 #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
 
 /*
@@ -463,13 +437,6 @@ static inline int pte_same(pte_t a, pte_t b)
 
 static inline int pte_present(pte_t a)
 {
-	return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE |
-			       _PAGE_NUMA);
-}
-
-#define pte_present_nonuma pte_present_nonuma
-static inline int pte_present_nonuma(pte_t a)
-{
 	return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
 }
 
@@ -479,7 +446,7 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
 	if (pte_flags(a) & _PAGE_PRESENT)
 		return true;
 
-	if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) &&
+	if ((pte_flags(a) & _PAGE_PROTNONE) &&
 			mm_tlb_flush_pending(mm))
 		return true;
 
@@ -499,10 +466,27 @@ static inline int pmd_present(pmd_t pmd)
 	 * the _PAGE_PSE flag will remain set at all times while the
 	 * _PAGE_PRESENT bit is clear).
 	 */
-	return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE |
-				 _PAGE_NUMA);
+	return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE);
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * These work without NUMA balancing but the kernel does not care. See the
+ * comment in include/asm-generic/pgtable.h
+ */
+static inline int pte_protnone(pte_t pte)
+{
+	return (pte_flags(pte) & (_PAGE_PROTNONE | _PAGE_PRESENT))
+		== _PAGE_PROTNONE;
+}
+
+static inline int pmd_protnone(pmd_t pmd)
+{
+	return (pmd_flags(pmd) & (_PAGE_PROTNONE | _PAGE_PRESENT))
+		== _PAGE_PROTNONE;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 static inline int pmd_none(pmd_t pmd)
 {
 	/* Only check low word on 32-bit platforms, since it might be
@@ -559,11 +543,6 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
 
 static inline int pmd_bad(pmd_t pmd)
 {
-#ifdef CONFIG_NUMA_BALANCING
-	/* pmd_numa check */
-	if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA)
-		return 0;
-#endif
 	return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
 }
 
@@ -882,19 +861,16 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
 #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
 static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
 {
-	VM_BUG_ON(pte_present_nonuma(pte));
 	return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);
 }
 
 static inline int pte_swp_soft_dirty(pte_t pte)
 {
-	VM_BUG_ON(pte_present_nonuma(pte));
 	return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;
 }
 
 static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
 {
-	VM_BUG_ON(pte_present_nonuma(pte));
 	return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
 }
 #endif
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 4572b2f30237..2ee781114d34 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -133,10 +133,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
 /* PUD - Level3 access */
 
 /* PMD  - Level 2 access */
-#define pte_to_pgoff(pte) ((pte_val((pte)) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
-#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) |	\
-					    _PAGE_FILE })
-#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
 
 /* PTE - Level 1 access. */
 
@@ -145,13 +141,8 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
 #define pte_unmap(pte) ((void)(pte))/* NOP */
 
 /* Encode and de-code a swap entry */
-#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
-#ifdef CONFIG_NUMA_BALANCING
-/* Automatic NUMA balancing needs to be distinguishable from swap entries */
-#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2)
-#else
+#define SWP_TYPE_BITS 5
 #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
-#endif
 
 #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
 
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 25bcd4a89517..8c7c10802e9c 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -4,7 +4,7 @@
 #include <linux/const.h>
 #include <asm/page_types.h>
 
-#define FIRST_USER_ADDRESS	0
+#define FIRST_USER_ADDRESS	0UL
 
 #define _PAGE_BIT_PRESENT	0	/* is present */
 #define _PAGE_BIT_RW		1	/* writeable */
@@ -27,19 +27,9 @@
 #define _PAGE_BIT_SOFT_DIRTY	_PAGE_BIT_SOFTW3 /* software dirty tracking */
 #define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
 
-/*
- * Swap offsets on configurations that allow automatic NUMA balancing use the
- * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from
- * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the
- * maximum possible swap space from 16TB to 8TB.
- */
-#define _PAGE_BIT_NUMA		(_PAGE_BIT_GLOBAL+1)
-
 /* If _PAGE_BIT_PRESENT is clear, we use these: */
 /* - if the user mapped it with PROT_NONE; pte_present gives true */
 #define _PAGE_BIT_PROTNONE	_PAGE_BIT_GLOBAL
-/* - set: nonlinear file mapping, saved PTE; unset:swap */
-#define _PAGE_BIT_FILE		_PAGE_BIT_DIRTY
 
 #define _PAGE_PRESENT	(_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
 #define _PAGE_RW	(_AT(pteval_t, 1) << _PAGE_BIT_RW)
@@ -78,21 +68,6 @@
 #endif
 
 /*
- * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page
- * that is not present. The hinting fault gathers numa placement statistics
- * (see pte_numa()). The bit is always zero when the PTE is not present.
- *
- * The bit picked must be always zero when the pmd is present and not
- * present, so that we don't lose information when we set it while
- * atomically clearing the present bit.
- */
-#ifdef CONFIG_NUMA_BALANCING
-#define _PAGE_NUMA	(_AT(pteval_t, 1) << _PAGE_BIT_NUMA)
-#else
-#define _PAGE_NUMA	(_AT(pteval_t, 0))
-#endif
-
-/*
  * Tracking soft dirty bit when a page goes to a swap is tricky.
  * We need a bit which can be stored in pte _and_ not conflict
  * with swap entry format. On x86 bits 6 and 7 are *not* involved
@@ -114,7 +89,6 @@
 #define _PAGE_NX	(_AT(pteval_t, 0))
 #endif
 
-#define _PAGE_FILE	(_AT(pteval_t, 1) << _PAGE_BIT_FILE)
 #define _PAGE_PROTNONE	(_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
 
 #define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |	\
@@ -125,8 +99,8 @@
 /* Set of bits not changed in pte_modify */
 #define _PAGE_CHG_MASK	(PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT |		\
 			 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY |	\
-			 _PAGE_SOFT_DIRTY | _PAGE_NUMA)
-#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA)
+			 _PAGE_SOFT_DIRTY)
+#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
 
 /*
  * The cache modes defined here are used to translate between pure SW usage
@@ -327,20 +301,6 @@ static inline pteval_t pte_flags(pte_t pte)
 	return native_pte_val(pte) & PTE_FLAGS_MASK;
 }
 
-#ifdef CONFIG_NUMA_BALANCING
-/* Set of bits that distinguishes present, prot_none and numa ptes */
-#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)
-static inline pteval_t ptenuma_flags(pte_t pte)
-{
-	return pte_flags(pte) & _PAGE_NUMA_MASK;
-}
-
-static inline pmdval_t pmdnuma_flags(pmd_t pmd)
-{
-	return pmd_flags(pmd) & _PAGE_NUMA_MASK;
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
 #define pgprot_val(x)	((x).pgprot)
 #define __pgprot(x)	((pgprot_t) { (x) } )
 
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a092a0cce0b7..ec1c93588cef 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -579,39 +579,6 @@ static inline void load_sp0(struct tss_struct *tss,
 #define set_iopl_mask native_set_iopl_mask
 #endif /* CONFIG_PARAVIRT */
 
-/*
- * Save the cr4 feature set we're using (ie
- * Pentium 4MB enable and PPro Global page
- * enable), so that any CPU's that boot up
- * after us can get the correct flags.
- */
-extern unsigned long mmu_cr4_features;
-extern u32 *trampoline_cr4_features;
-
-static inline void set_in_cr4(unsigned long mask)
-{
-	unsigned long cr4;
-
-	mmu_cr4_features |= mask;
-	if (trampoline_cr4_features)
-		*trampoline_cr4_features = mmu_cr4_features;
-	cr4 = read_cr4();
-	cr4 |= mask;
-	write_cr4(cr4);
-}
-
-static inline void clear_in_cr4(unsigned long mask)
-{
-	unsigned long cr4;
-
-	mmu_cr4_features &= ~mask;
-	if (trampoline_cr4_features)
-		*trampoline_cr4_features = mmu_cr4_features;
-	cr4 = read_cr4();
-	cr4 &= ~mask;
-	write_cr4(cr4);
-}
-
 typedef struct {
 	unsigned long		seg;
 } mm_segment_t;
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index e820c080a4e9..6a4b00fafb00 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -137,17 +137,17 @@ static inline void write_cr3(unsigned long x)
 	native_write_cr3(x);
 }
 
-static inline unsigned long read_cr4(void)
+static inline unsigned long __read_cr4(void)
 {
 	return native_read_cr4();
 }
 
-static inline unsigned long read_cr4_safe(void)
+static inline unsigned long __read_cr4_safe(void)
 {
 	return native_read_cr4_safe();
 }
 
-static inline void write_cr4(unsigned long x)
+static inline void __write_cr4(unsigned long x)
 {
 	native_write_cr4(x);
 }
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 625660f8a2fc..cf87de3fc390 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -46,7 +46,7 @@ static __always_inline bool static_key_false(struct static_key *key);
 
 static inline void __ticket_enter_slowpath(arch_spinlock_t *lock)
 {
-	set_bit(0, (volatile unsigned long *)&lock->tickets.tail);
+	set_bit(0, (volatile unsigned long *)&lock->tickets.head);
 }
 
 #else  /* !CONFIG_PARAVIRT_SPINLOCKS */
@@ -60,10 +60,30 @@ static inline void __ticket_unlock_kick(arch_spinlock_t *lock,
 }
 
 #endif /* CONFIG_PARAVIRT_SPINLOCKS */
+static inline int  __tickets_equal(__ticket_t one, __ticket_t two)
+{
+	return !((one ^ two) & ~TICKET_SLOWPATH_FLAG);
+}
+
+static inline void __ticket_check_and_clear_slowpath(arch_spinlock_t *lock,
+							__ticket_t head)
+{
+	if (head & TICKET_SLOWPATH_FLAG) {
+		arch_spinlock_t old, new;
+
+		old.tickets.head = head;
+		new.tickets.head = head & ~TICKET_SLOWPATH_FLAG;
+		old.tickets.tail = new.tickets.head + TICKET_LOCK_INC;
+		new.tickets.tail = old.tickets.tail;
+
+		/* try to clear slowpath flag when there are no contenders */
+		cmpxchg(&lock->head_tail, old.head_tail, new.head_tail);
+	}
+}
 
 static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 {
-	return lock.tickets.head == lock.tickets.tail;
+	return __tickets_equal(lock.tickets.head, lock.tickets.tail);
 }
 
 /*
@@ -87,18 +107,21 @@ static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
 	if (likely(inc.head == inc.tail))
 		goto out;
 
-	inc.tail &= ~TICKET_SLOWPATH_FLAG;
 	for (;;) {
 		unsigned count = SPIN_THRESHOLD;
 
 		do {
-			if (READ_ONCE(lock->tickets.head) == inc.tail)
-				goto out;
+			inc.head = READ_ONCE(lock->tickets.head);
+			if (__tickets_equal(inc.head, inc.tail))
+				goto clear_slowpath;
 			cpu_relax();
 		} while (--count);
 		__ticket_lock_spinning(lock, inc.tail);
 	}
-out:	barrier();	/* make sure nothing creeps before the lock is taken */
+clear_slowpath:
+	__ticket_check_and_clear_slowpath(lock, inc.head);
+out:
+	barrier();	/* make sure nothing creeps before the lock is taken */
 }
 
 static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
@@ -106,56 +129,30 @@ static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
 	arch_spinlock_t old, new;
 
 	old.tickets = READ_ONCE(lock->tickets);
-	if (old.tickets.head != (old.tickets.tail & ~TICKET_SLOWPATH_FLAG))
+	if (!__tickets_equal(old.tickets.head, old.tickets.tail))
 		return 0;
 
 	new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT);
+	new.head_tail &= ~TICKET_SLOWPATH_FLAG;
 
 	/* cmpxchg is a full barrier, so nothing can move before it */
 	return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
 }
 
-static inline void __ticket_unlock_slowpath(arch_spinlock_t *lock,
-					    arch_spinlock_t old)
-{
-	arch_spinlock_t new;
-
-	BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS);
-
-	/* Perform the unlock on the "before" copy */
-	old.tickets.head += TICKET_LOCK_INC;
-
-	/* Clear the slowpath flag */
-	new.head_tail = old.head_tail & ~(TICKET_SLOWPATH_FLAG << TICKET_SHIFT);
-
-	/*
-	 * If the lock is uncontended, clear the flag - use cmpxchg in
-	 * case it changes behind our back though.
-	 */
-	if (new.tickets.head != new.tickets.tail ||
-	    cmpxchg(&lock->head_tail, old.head_tail,
-					new.head_tail) != old.head_tail) {
-		/*
-		 * Lock still has someone queued for it, so wake up an
-		 * appropriate waiter.
-		 */
-		__ticket_unlock_kick(lock, old.tickets.head);
-	}
-}
-
 static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
 {
 	if (TICKET_SLOWPATH_FLAG &&
-	    static_key_false(&paravirt_ticketlocks_enabled)) {
-		arch_spinlock_t prev;
+		static_key_false(&paravirt_ticketlocks_enabled)) {
+		__ticket_t head;
 
-		prev = *lock;
-		add_smp(&lock->tickets.head, TICKET_LOCK_INC);
+		BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS);
 
-		/* add_smp() is a full mb() */
+		head = xadd(&lock->tickets.head, TICKET_LOCK_INC);
 
-		if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG))
-			__ticket_unlock_slowpath(lock, prev);
+		if (unlikely(head & TICKET_SLOWPATH_FLAG)) {
+			head &= ~TICKET_SLOWPATH_FLAG;
+			__ticket_unlock_kick(lock, (head + TICKET_LOCK_INC));
+		}
 	} else
 		__add(&lock->tickets.head, TICKET_LOCK_INC, UNLOCK_LOCK_PREFIX);
 }
@@ -164,14 +161,15 @@ static inline int arch_spin_is_locked(arch_spinlock_t *lock)
 {
 	struct __raw_tickets tmp = READ_ONCE(lock->tickets);
 
-	return tmp.tail != tmp.head;
+	return !__tickets_equal(tmp.tail, tmp.head);
 }
 
 static inline int arch_spin_is_contended(arch_spinlock_t *lock)
 {
 	struct __raw_tickets tmp = READ_ONCE(lock->tickets);
 
-	return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC;
+	tmp.head &= ~TICKET_SLOWPATH_FLAG;
+	return (tmp.tail - tmp.head) > TICKET_LOCK_INC;
 }
 #define arch_spin_is_contended	arch_spin_is_contended
 
@@ -183,16 +181,16 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
 
 static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
 {
-	__ticket_t head = ACCESS_ONCE(lock->tickets.head);
+	__ticket_t head = READ_ONCE(lock->tickets.head);
 
 	for (;;) {
-		struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
+		struct __raw_tickets tmp = READ_ONCE(lock->tickets);
 		/*
 		 * We need to check "unlocked" in a loop, tmp.head == head
 		 * can be false positive because of overflow.
 		 */
-		if (tmp.head == (tmp.tail & ~TICKET_SLOWPATH_FLAG) ||
-		    tmp.head != head)
+		if (__tickets_equal(tmp.head, tmp.tail) ||
+				!__tickets_equal(tmp.head, head))
 			break;
 
 		cpu_relax();
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 19e2c468fc2c..e4661196994e 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -27,11 +27,12 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t
    function. */
 
 #define __HAVE_ARCH_MEMCPY 1
+extern void *__memcpy(void *to, const void *from, size_t len);
+
 #ifndef CONFIG_KMEMCHECK
 #if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4
 extern void *memcpy(void *to, const void *from, size_t len);
 #else
-extern void *__memcpy(void *to, const void *from, size_t len);
 #define memcpy(dst, src, len)					\
 ({								\
 	size_t __len = (len);					\
@@ -53,9 +54,11 @@ extern void *__memcpy(void *to, const void *from, size_t len);
 
 #define __HAVE_ARCH_MEMSET
 void *memset(void *s, int c, size_t n);
+void *__memset(void *s, int c, size_t n);
 
 #define __HAVE_ARCH_MEMMOVE
 void *memmove(void *dest, const void *src, size_t count);
+void *__memmove(void *dest, const void *src, size_t count);
 
 int memcmp(const void *cs, const void *ct, size_t count);
 size_t strlen(const char *s);
@@ -63,6 +66,19 @@ char *strcpy(char *dest, const char *src);
 char *strcat(char *dest, const char *src);
 int strcmp(const char *cs, const char *ct);
 
+#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
+
+/*
+ * For files that not instrumented (e.g. mm/slub.c) we
+ * should use not instrumented version of mem* functions.
+ */
+
+#undef memcpy
+#define memcpy(dst, src, len) __memcpy(dst, src, len)
+#define memmove(dst, src, len) __memmove(dst, src, len)
+#define memset(s, c, n) __memset(s, c, n)
+#endif
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_STRING_64_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e82e95abc92b..1d4e4f279a32 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -31,7 +31,6 @@ struct thread_info {
 	__u32			cpu;		/* current CPU */
 	int			saved_preempt_count;
 	mm_segment_t		addr_limit;
-	struct restart_block    restart_block;
 	void __user		*sysenter_return;
 	unsigned int		sig_on_uaccess_error:1;
 	unsigned int		uaccess_err:1;	/* uaccess failed */
@@ -45,9 +44,6 @@ struct thread_info {
 	.cpu		= 0,			\
 	.saved_preempt_count = INIT_PREEMPT_COUNT,	\
 	.addr_limit	= KERNEL_DS,		\
-	.restart_block = {			\
-		.fn = do_no_restart_syscall,	\
-	},					\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 04905bfc508b..cd791948b286 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -15,6 +15,75 @@
 #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
 #endif
 
+struct tlb_state {
+#ifdef CONFIG_SMP
+	struct mm_struct *active_mm;
+	int state;
+#endif
+
+	/*
+	 * Access to this CR4 shadow and to H/W CR4 is protected by
+	 * disabling interrupts when modifying either one.
+	 */
+	unsigned long cr4;
+};
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
+
+/* Initialize cr4 shadow for this CPU. */
+static inline void cr4_init_shadow(void)
+{
+	this_cpu_write(cpu_tlbstate.cr4, __read_cr4());
+}
+
+/* Set in this cpu's CR4. */
+static inline void cr4_set_bits(unsigned long mask)
+{
+	unsigned long cr4;
+
+	cr4 = this_cpu_read(cpu_tlbstate.cr4);
+	if ((cr4 | mask) != cr4) {
+		cr4 |= mask;
+		this_cpu_write(cpu_tlbstate.cr4, cr4);
+		__write_cr4(cr4);
+	}
+}
+
+/* Clear in this cpu's CR4. */
+static inline void cr4_clear_bits(unsigned long mask)
+{
+	unsigned long cr4;
+
+	cr4 = this_cpu_read(cpu_tlbstate.cr4);
+	if ((cr4 & ~mask) != cr4) {
+		cr4 &= ~mask;
+		this_cpu_write(cpu_tlbstate.cr4, cr4);
+		__write_cr4(cr4);
+	}
+}
+
+/* Read the CR4 shadow. */
+static inline unsigned long cr4_read_shadow(void)
+{
+	return this_cpu_read(cpu_tlbstate.cr4);
+}
+
+/*
+ * Save some of cr4 feature set we're using (e.g.  Pentium 4MB
+ * enable and PPro Global page enable), so that any CPU's that boot
+ * up after us can get the correct flags.  This should only be used
+ * during boot on the boot cpu.
+ */
+extern unsigned long mmu_cr4_features;
+extern u32 *trampoline_cr4_features;
+
+static inline void cr4_set_bits_and_update_boot(unsigned long mask)
+{
+	mmu_cr4_features |= mask;
+	if (trampoline_cr4_features)
+		*trampoline_cr4_features = mmu_cr4_features;
+	cr4_set_bits(mask);
+}
+
 static inline void __native_flush_tlb(void)
 {
 	native_write_cr3(native_read_cr3());
@@ -24,7 +93,7 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
 {
 	unsigned long cr4;
 
-	cr4 = native_read_cr4();
+	cr4 = this_cpu_read(cpu_tlbstate.cr4);
 	/* clear PGE */
 	native_write_cr4(cr4 & ~X86_CR4_PGE);
 	/* write old PGE again and flush TLBs */
@@ -184,12 +253,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 #define TLBSTATE_OK	1
 #define TLBSTATE_LAZY	2
 
-struct tlb_state {
-	struct mm_struct *active_mm;
-	int state;
-};
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
-
 static inline void reset_lazy_tlbstate(void)
 {
 	this_cpu_write(cpu_tlbstate.state, 0);
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 0d592e0a5b84..ace9dec050b1 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -179,7 +179,7 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
 	asm volatile("call __get_user_%P3"				\
 		     : "=a" (__ret_gu), "=r" (__val_gu)			\
 		     : "0" (ptr), "i" (sizeof(*(ptr))));		\
-	(x) = (__typeof__(*(ptr))) __val_gu;				\
+	(x) = (__force __typeof__(*(ptr))) __val_gu;			\
 	__ret_gu;							\
 })
 
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index 5da71c27cc59..cce9ee68e335 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -19,6 +19,7 @@
 
 #include <asm/vmx.h>
 #include <asm/svm.h>
+#include <asm/tlbflush.h>
 
 /*
  * VMX functions:
@@ -40,12 +41,12 @@ static inline int cpu_has_vmx(void)
 static inline void cpu_vmxoff(void)
 {
 	asm volatile (ASM_VMX_VMXOFF : : : "cc");
-	write_cr4(read_cr4() & ~X86_CR4_VMXE);
+	cr4_clear_bits(X86_CR4_VMXE);
 }
 
 static inline int cpu_vmx_enabled(void)
 {
-	return read_cr4() & X86_CR4_VMXE;
+	return __read_cr4() & X86_CR4_VMXE;
 }
 
 /** Disable VMX if it is enabled on the current CPU
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 45afaee9555c..da772edd19ab 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -69,6 +69,7 @@
 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING	0x00000400
 #define SECONDARY_EXEC_ENABLE_INVPCID		0x00001000
 #define SECONDARY_EXEC_SHADOW_VMCS              0x00004000
+#define SECONDARY_EXEC_ENABLE_PML               0x00020000
 #define SECONDARY_EXEC_XSAVES			0x00100000
 
 
@@ -121,6 +122,7 @@ enum vmcs_field {
 	GUEST_LDTR_SELECTOR             = 0x0000080c,
 	GUEST_TR_SELECTOR               = 0x0000080e,
 	GUEST_INTR_STATUS               = 0x00000810,
+	GUEST_PML_INDEX			= 0x00000812,
 	HOST_ES_SELECTOR                = 0x00000c00,
 	HOST_CS_SELECTOR                = 0x00000c02,
 	HOST_SS_SELECTOR                = 0x00000c04,
@@ -140,6 +142,8 @@ enum vmcs_field {
 	VM_EXIT_MSR_LOAD_ADDR_HIGH      = 0x00002009,
 	VM_ENTRY_MSR_LOAD_ADDR          = 0x0000200a,
 	VM_ENTRY_MSR_LOAD_ADDR_HIGH     = 0x0000200b,
+	PML_ADDRESS			= 0x0000200e,
+	PML_ADDRESS_HIGH		= 0x0000200f,
 	TSC_OFFSET                      = 0x00002010,
 	TSC_OFFSET_HIGH                 = 0x00002011,
 	VIRTUAL_APIC_PAGE_ADDR          = 0x00002012,
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 5eea09915a15..358dcd338915 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -55,9 +55,8 @@ extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
 				   struct gnttab_map_grant_ref *kmap_ops,
 				   struct page **pages, unsigned int count);
 extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
-				     struct gnttab_map_grant_ref *kmap_ops,
+				     struct gnttab_unmap_grant_ref *kunmap_ops,
 				     struct page **pages, unsigned int count);
-extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
 
 /*
  * Helper functions to write or read unsigned long values to/from
@@ -154,21 +153,12 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
 		return mfn;
 
 	pfn = mfn_to_pfn_no_overrides(mfn);
-	if (__pfn_to_mfn(pfn) != mfn) {
-		/*
-		 * If this appears to be a foreign mfn (because the pfn
-		 * doesn't map back to the mfn), then check the local override
-		 * table to see if there's a better pfn to use.
-		 *
-		 * m2p_find_override_pfn returns ~0 if it doesn't find anything.
-		 */
-		pfn = m2p_find_override_pfn(mfn, ~0);
-	}
+	if (__pfn_to_mfn(pfn) != mfn)
+		pfn = ~0;
 
 	/*
-	 * pfn is ~0 if there are no entries in the m2p for mfn or if the
-	 * entry doesn't map back to the mfn and m2p_override doesn't have a
-	 * valid entry for it.
+	 * pfn is ~0 if there are no entries in the m2p for mfn or the
+	 * entry doesn't map back to the mfn.
 	 */
 	if (pfn == ~0 && __pfn_to_mfn(mfn) == IDENTITY_FRAME(mfn))
 		pfn = mfn;
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 5fa9770035dc..c9a6d68b8d62 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -82,18 +82,15 @@ static inline int xsave_state_booting(struct xsave_struct *fx, u64 mask)
 	if (boot_cpu_has(X86_FEATURE_XSAVES))
 		asm volatile("1:"XSAVES"\n\t"
 			"2:\n\t"
-			: : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+			     xstate_fault
+			: "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
 			:   "memory");
 	else
 		asm volatile("1:"XSAVE"\n\t"
 			"2:\n\t"
-			: : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+			     xstate_fault
+			: "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
 			:   "memory");
-
-	asm volatile(xstate_fault
-		     : "0" (0)
-		     : "memory");
-
 	return err;
 }
 
@@ -112,18 +109,15 @@ static inline int xrstor_state_booting(struct xsave_struct *fx, u64 mask)
 	if (boot_cpu_has(X86_FEATURE_XSAVES))
 		asm volatile("1:"XRSTORS"\n\t"
 			"2:\n\t"
-			: : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+			     xstate_fault
+			: "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
 			:   "memory");
 	else
 		asm volatile("1:"XRSTOR"\n\t"
 			"2:\n\t"
-			: : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+			     xstate_fault
+			: "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
 			:   "memory");
-
-	asm volatile(xstate_fault
-		     : "0" (0)
-		     : "memory");
-
 	return err;
 }
 
@@ -149,9 +143,9 @@ static inline int xsave_state(struct xsave_struct *fx, u64 mask)
 	 */
 	alternative_input_2(
 		"1:"XSAVE,
-		"1:"XSAVEOPT,
+		XSAVEOPT,
 		X86_FEATURE_XSAVEOPT,
-		"1:"XSAVES,
+		XSAVES,
 		X86_FEATURE_XSAVES,
 		[fx] "D" (fx), "a" (lmask), "d" (hmask) :
 		"memory");
@@ -178,7 +172,7 @@ static inline int xrstor_state(struct xsave_struct *fx, u64 mask)
 	 */
 	alternative_input(
 		"1: " XRSTOR,
-		"1: " XRSTORS,
+		XRSTORS,
 		X86_FEATURE_XSAVES,
 		"D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
 		: "memory");
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index 462efe746d77..90c458e66e13 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -187,6 +187,17 @@
 #define HV_X64_MSR_SINT14			0x4000009E
 #define HV_X64_MSR_SINT15			0x4000009F
 
+/*
+ * Synthetic Timer MSRs. Four timers per vcpu.
+ */
+#define HV_X64_MSR_STIMER0_CONFIG		0x400000B0
+#define HV_X64_MSR_STIMER0_COUNT		0x400000B1
+#define HV_X64_MSR_STIMER1_CONFIG		0x400000B2
+#define HV_X64_MSR_STIMER1_COUNT		0x400000B3
+#define HV_X64_MSR_STIMER2_CONFIG		0x400000B4
+#define HV_X64_MSR_STIMER2_COUNT		0x400000B5
+#define HV_X64_MSR_STIMER3_CONFIG		0x400000B6
+#define HV_X64_MSR_STIMER3_COUNT		0x400000B7
 
 #define HV_X64_MSR_HYPERCALL_ENABLE		0x00000001
 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT	12
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index d979e5abae55..3ce079136c11 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -152,6 +152,10 @@
 #define MSR_CC6_DEMOTION_POLICY_CONFIG	0x00000668
 #define MSR_MC6_DEMOTION_POLICY_CONFIG	0x00000669
 
+#define MSR_CORE_PERF_LIMIT_REASONS	0x00000690
+#define MSR_GFX_PERF_LIMIT_REASONS	0x000006B0
+#define MSR_RING_PERF_LIMIT_REASONS	0x000006B1
+
 /* Hardware P state interface */
 #define MSR_PPERF			0x0000064e
 #define MSR_PERF_LIMIT_REASONS		0x0000064f
@@ -360,8 +364,12 @@
 #define MSR_IA32_UCODE_WRITE		0x00000079
 #define MSR_IA32_UCODE_REV		0x0000008b
 
+#define MSR_IA32_SMM_MONITOR_CTL	0x0000009b
+#define MSR_IA32_SMBASE			0x0000009e
+
 #define MSR_IA32_PERF_STATUS		0x00000198
 #define MSR_IA32_PERF_CTL		0x00000199
+#define INTEL_PERF_CTL_MASK		0xffff
 #define MSR_AMD_PSTATE_DEF_BASE		0xc0010064
 #define MSR_AMD_PERF_STATUS		0xc0010063
 #define MSR_AMD_PERF_CTL		0xc0010062
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index b813bf9da1e2..c5f1a1deb91a 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -56,6 +56,7 @@
 #define EXIT_REASON_MSR_READ            31
 #define EXIT_REASON_MSR_WRITE           32
 #define EXIT_REASON_INVALID_STATE       33
+#define EXIT_REASON_MSR_LOAD_FAIL       34
 #define EXIT_REASON_MWAIT_INSTRUCTION   36
 #define EXIT_REASON_MONITOR_INSTRUCTION 39
 #define EXIT_REASON_PAUSE_INSTRUCTION   40
@@ -72,6 +73,7 @@
 #define EXIT_REASON_XSETBV              55
 #define EXIT_REASON_APIC_WRITE          56
 #define EXIT_REASON_INVPCID             58
+#define EXIT_REASON_PML_FULL            62
 #define EXIT_REASON_XSAVES              63
 #define EXIT_REASON_XRSTORS             64
 
@@ -116,10 +118,14 @@
 	{ EXIT_REASON_APIC_WRITE,            "APIC_WRITE" }, \
 	{ EXIT_REASON_EOI_INDUCED,           "EOI_INDUCED" }, \
 	{ EXIT_REASON_INVALID_STATE,         "INVALID_STATE" }, \
+	{ EXIT_REASON_MSR_LOAD_FAIL,         "MSR_LOAD_FAIL" }, \
 	{ EXIT_REASON_INVD,                  "INVD" }, \
 	{ EXIT_REASON_INVVPID,               "INVVPID" }, \
 	{ EXIT_REASON_INVPCID,               "INVPCID" }, \
 	{ EXIT_REASON_XSAVES,                "XSAVES" }, \
 	{ EXIT_REASON_XRSTORS,               "XRSTORS" }
 
+#define VMX_ABORT_SAVE_GUEST_MSR_FAIL        1
+#define VMX_ABORT_LOAD_HOST_MSR_FAIL         4
+
 #endif /* _UAPIVMX_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5d4502c8b983..cdb1b70ddad0 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -16,6 +16,10 @@ CFLAGS_REMOVE_ftrace.o = -pg
 CFLAGS_REMOVE_early_printk.o = -pg
 endif
 
+KASAN_SANITIZE_head$(BITS).o := n
+KASAN_SANITIZE_dumpstack.o := n
+KASAN_SANITIZE_dumpstack_$(BITS).o := n
+
 CFLAGS_irq.o := -I$(src)/../include/asm/trace
 
 obj-y			:= process_$(BITS).o signal.o entry_$(BITS).o
@@ -63,6 +67,7 @@ obj-$(CONFIG_X86_MPPARSE)	+= mpparse.o
 obj-y				+= apic/
 obj-$(CONFIG_X86_REBOOTFIXUPS)	+= reboot_fixups_32.o
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
+obj-$(CONFIG_LIVEPATCH)		+= livepatch.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
 obj-$(CONFIG_FTRACE_SYSCALLS)	+= ftrace.o
 obj-$(CONFIG_X86_TSC)		+= trace_clock.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index a18fff361c7f..803b684676ff 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -613,6 +613,11 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp)
 {
 	int rc, irq, trigger, polarity;
 
+	if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
+		*irqp = gsi;
+		return 0;
+	}
+
 	rc = acpi_get_override_irq(gsi, &trigger, &polarity);
 	if (rc == 0) {
 		trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE;
@@ -845,13 +850,7 @@ int acpi_ioapic_registered(acpi_handle handle, u32 gsi_base)
 
 static int __init acpi_parse_sbf(struct acpi_table_header *table)
 {
-	struct acpi_table_boot *sb;
-
-	sb = (struct acpi_table_boot *)table;
-	if (!sb) {
-		printk(KERN_WARNING PREFIX "Unable to map SBF\n");
-		return -ENODEV;
-	}
+	struct acpi_table_boot *sb = (struct acpi_table_boot *)table;
 
 	sbf_port = sb->cmos_index;	/* Save CMOS port */
 
@@ -865,13 +864,7 @@ static struct resource *hpet_res __initdata;
 
 static int __init acpi_parse_hpet(struct acpi_table_header *table)
 {
-	struct acpi_table_hpet *hpet_tbl;
-
-	hpet_tbl = (struct acpi_table_hpet *)table;
-	if (!hpet_tbl) {
-		printk(KERN_WARNING PREFIX "Unable to map HPET\n");
-		return -ENODEV;
-	}
+	struct acpi_table_hpet *hpet_tbl = (struct acpi_table_hpet *)table;
 
 	if (hpet_tbl->address.space_id != ACPI_SPACE_MEM) {
 		printk(KERN_WARNING PREFIX "HPET timers must be located in "
@@ -1345,6 +1338,26 @@ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
 }
 
 /*
+ * ACPI offers an alternative platform interface model that removes
+ * ACPI hardware requirements for platforms that do not implement
+ * the PC Architecture.
+ *
+ * We initialize the Hardware-reduced ACPI model here:
+ */
+static void __init acpi_reduced_hw_init(void)
+{
+	if (acpi_gbl_reduced_hardware) {
+		/*
+		 * Override x86_init functions and bypass legacy pic
+		 * in Hardware-reduced ACPI mode
+		 */
+		x86_init.timers.timer_init	= x86_init_noop;
+		x86_init.irqs.pre_vector_init	= x86_init_noop;
+		legacy_pic			= &null_legacy_pic;
+	}
+}
+
+/*
  * If your system is blacklisted here, but you find that acpi=force
  * works for you, please contact linux-acpi@vger.kernel.org
  */
@@ -1543,6 +1556,11 @@ int __init early_acpi_boot_init(void)
 	 */
 	early_acpi_process_madt();
 
+	/*
+	 * Hardware-reduced ACPI mode initialization:
+	 */
+	acpi_reduced_hw_init();
+
 	return 0;
 }
 
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 31368207837c..d1daead5fcdd 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -78,7 +78,7 @@ int x86_acpi_suspend_lowlevel(void)
 
 	header->pmode_cr0 = read_cr0();
 	if (__this_cpu_read(cpu_info.cpuid_level) >= 0) {
-		header->pmode_cr4 = read_cr4();
+		header->pmode_cr4 = __read_cr4();
 		header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4);
 	}
 	if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b665d241efad..ad3639ae1b9b 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1580,8 +1580,7 @@ static __init void try_to_enable_x2apic(int remap_mode)
 		 * under KVM
 		 */
 		if (max_physical_apicid > 255 ||
-		    (IS_ENABLED(CONFIG_HYPERVISOR_GUEST) &&
-		     !hypervisor_x2apic_available())) {
+		    !hypervisor_x2apic_available()) {
 			pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n");
 			x2apic_disable();
 			return;
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index c2fd21fed002..017149cded07 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -37,10 +37,12 @@ static const struct apic apic_numachip;
 static unsigned int get_apic_id(unsigned long x)
 {
 	unsigned long value;
-	unsigned int id;
+	unsigned int id = (x >> 24) & 0xff;
 
-	rdmsrl(MSR_FAM10H_NODE_ID, value);
-	id = ((x >> 24) & 0xffU) | ((value << 2) & 0xff00U);
+	if (static_cpu_has_safe(X86_FEATURE_NODEID_MSR)) {
+		rdmsrl(MSR_FAM10H_NODE_ID, value);
+		id |= (value << 2) & 0xff00;
+	}
 
 	return id;
 }
@@ -155,10 +157,18 @@ static int __init numachip_probe(void)
 
 static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
 {
-	if (c->phys_proc_id != node) {
-		c->phys_proc_id = node;
-		per_cpu(cpu_llc_id, smp_processor_id()) = node;
+	u64 val;
+	u32 nodes = 1;
+
+	this_cpu_write(cpu_llc_id, node);
+
+	/* Account for nodes per socket in multi-core-module processors */
+	if (static_cpu_has_safe(X86_FEATURE_NODEID_MSR)) {
+		rdmsrl(MSR_FAM10H_NODE_ID, val);
+		nodes = ((val >> 3) & 7) + 1;
 	}
+
+	c->phys_proc_id = node / nodes;
 }
 
 static int __init numachip_system_init(void)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b15bffcaba6d..2346c95c6ab1 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -19,6 +19,7 @@
 #include <asm/archrandom.h>
 #include <asm/hypervisor.h>
 #include <asm/processor.h>
+#include <asm/tlbflush.h>
 #include <asm/debugreg.h>
 #include <asm/sections.h>
 #include <asm/vsyscall.h>
@@ -278,7 +279,7 @@ __setup("nosmep", setup_disable_smep);
 static __always_inline void setup_smep(struct cpuinfo_x86 *c)
 {
 	if (cpu_has(c, X86_FEATURE_SMEP))
-		set_in_cr4(X86_CR4_SMEP);
+		cr4_set_bits(X86_CR4_SMEP);
 }
 
 static __init int setup_disable_smap(char *arg)
@@ -298,9 +299,9 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
 
 	if (cpu_has(c, X86_FEATURE_SMAP)) {
 #ifdef CONFIG_X86_SMAP
-		set_in_cr4(X86_CR4_SMAP);
+		cr4_set_bits(X86_CR4_SMAP);
 #else
-		clear_in_cr4(X86_CR4_SMAP);
+		cr4_clear_bits(X86_CR4_SMAP);
 #endif
 	}
 }
@@ -1295,6 +1296,12 @@ void cpu_init(void)
 	wait_for_master_cpu(cpu);
 
 	/*
+	 * Initialize the CR4 shadow before doing anything that could
+	 * try to read it.
+	 */
+	cr4_init_shadow();
+
+	/*
 	 * Load microcode on this cpu if a valid microcode is available.
 	 * This is early microcode loading procedure.
 	 */
@@ -1313,7 +1320,7 @@ void cpu_init(void)
 
 	pr_debug("Initializing CPU#%d\n", cpu);
 
-	clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+	cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
 
 	/*
 	 * Initialize the per-CPU GDT with the boot GDT,
@@ -1389,12 +1396,18 @@ void cpu_init(void)
 
 	wait_for_master_cpu(cpu);
 
+	/*
+	 * Initialize the CR4 shadow before doing anything that could
+	 * try to read it.
+	 */
+	cr4_init_shadow();
+
 	show_ucode_info_early();
 
 	printk(KERN_INFO "Initializing CPU#%d\n", cpu);
 
 	if (cpu_feature_enabled(X86_FEATURE_VME) || cpu_has_tsc || cpu_has_de)
-		clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+		cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
 
 	load_current_idt();
 	switch_to_new_gdt(cpu);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 94d7dcb12145..50163fa9034f 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -565,8 +565,8 @@ static const struct _tlb_table intel_tlb_table[] = {
 	{ 0xb2, TLB_INST_4K,		64,	" TLB_INST 4KByte pages, 4-way set associative" },
 	{ 0xb3, TLB_DATA_4K,		128,	" TLB_DATA 4 KByte pages, 4-way set associative" },
 	{ 0xb4, TLB_DATA_4K,		256,	" TLB_DATA 4 KByte pages, 4-way associative" },
-	{ 0xb5, TLB_INST_4K,		64,	" TLB_INST 4 KByte pages, 8-way set ssociative" },
-	{ 0xb6, TLB_INST_4K,		128,	" TLB_INST 4 KByte pages, 8-way set ssociative" },
+	{ 0xb5, TLB_INST_4K,		64,	" TLB_INST 4 KByte pages, 8-way set associative" },
+	{ 0xb6, TLB_INST_4K,		128,	" TLB_INST 4 KByte pages, 8-way set associative" },
 	{ 0xba, TLB_DATA_4K,		64,	" TLB_DATA 4 KByte pages, 4-way associative" },
 	{ 0xc0, TLB_DATA_4K_4M,		8,	" TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
 	{ 0xc1, STLB_4K_2M,		1024,	" STLB 4 KByte and 2 MByte pages, 8-way associative" },
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index c7035073dfc1..659643376dbf 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -952,20 +952,18 @@ static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf,
 static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
 					int type, char *buf)
 {
-	ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
-	int n = 0;
-
-	if (len > 1) {
-		const struct cpumask *mask;
-
-		mask = to_cpumask(this_leaf->shared_cpu_map);
-		n = type ?
-			cpulist_scnprintf(buf, len-2, mask) :
-			cpumask_scnprintf(buf, len-2, mask);
-		buf[n++] = '\n';
-		buf[n] = '\0';
-	}
-	return n;
+	const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
+	int ret;
+
+	if (type)
+		ret = scnprintf(buf, PAGE_SIZE - 1, "%*pbl",
+				cpumask_pr_args(mask));
+	else
+		ret = scnprintf(buf, PAGE_SIZE - 1, "%*pb",
+				cpumask_pr_args(mask));
+	buf[ret++] = '\n';
+	buf[ret] = '\0';
+	return ret;
 }
 
 static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf,
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index cdfed7953963..3c036cb4a370 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -44,6 +44,7 @@
 
 #include <asm/processor.h>
 #include <asm/traps.h>
+#include <asm/tlbflush.h>
 #include <asm/mce.h>
 #include <asm/msr.h>
 
@@ -151,14 +152,11 @@ static struct mce_log mcelog = {
 void mce_log(struct mce *mce)
 {
 	unsigned next, entry;
-	int ret = 0;
 
 	/* Emit the trace record: */
 	trace_mce_record(mce);
 
-	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
-	if (ret == NOTIFY_STOP)
-		return;
+	atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
 
 	mce->finished = 0;
 	wmb();
@@ -1452,7 +1450,7 @@ static void __mcheck_cpu_init_generic(void)
 	bitmap_fill(all_banks, MAX_NR_BANKS);
 	machine_check_poll(MCP_UC | m_fl, &all_banks);
 
-	set_in_cr4(X86_CR4_MCE);
+	cr4_set_bits(X86_CR4_MCE);
 
 	rdmsrl(MSR_IA32_MCG_CAP, cap);
 	if (cap & MCG_CTL_P)
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index ec2663a708e4..737b0ad4e61a 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -9,6 +9,7 @@
 
 #include <asm/processor.h>
 #include <asm/traps.h>
+#include <asm/tlbflush.h>
 #include <asm/mce.h>
 #include <asm/msr.h>
 
@@ -65,7 +66,7 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
 	       "Intel old style machine check architecture supported.\n");
 
 	/* Enable MCE: */
-	set_in_cr4(X86_CR4_MCE);
+	cr4_set_bits(X86_CR4_MCE);
 	printk(KERN_INFO
 	       "Intel old style machine check reporting enabled on CPU#%d.\n",
 	       smp_processor_id());
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index bd5d46a32210..44f138296fbe 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -8,6 +8,7 @@
 
 #include <asm/processor.h>
 #include <asm/traps.h>
+#include <asm/tlbflush.h>
 #include <asm/mce.h>
 #include <asm/msr.h>
 
@@ -36,7 +37,7 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c)
 	lo &= ~(1<<4);	/* Enable MCE */
 	wrmsr(MSR_IDT_FCR1, lo, hi);
 
-	set_in_cr4(X86_CR4_MCE);
+	cr4_set_bits(X86_CR4_MCE);
 
 	printk(KERN_INFO
 	       "Winchip machine check reporting enabled on CPU#0.\n");
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index c6826d1e8082..746e7fd08aad 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -196,6 +196,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 		struct microcode_header_intel mc_header;
 		unsigned int mc_size;
 
+		if (leftover < sizeof(mc_header)) {
+			pr_err("error! Truncated header in microcode data file\n");
+			break;
+		}
+
 		if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header)))
 			break;
 
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index ec9df6f9cd47..420eb933189c 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -321,7 +321,11 @@ get_matching_model_microcode(int cpu, unsigned long start,
 	unsigned int mc_saved_count = mc_saved_data->mc_saved_count;
 	int i;
 
-	while (leftover) {
+	while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) {
+
+		if (leftover < sizeof(mc_header))
+			break;
+
 		mc_header = (struct microcode_header_intel *)ucode_ptr;
 
 		mc_size = get_totalsize(mc_header);
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index 9e451b0876b5..f8c81ba0b465 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -138,8 +138,8 @@ static void prepare_set(void)
 
 	/*  Save value of CR4 and clear Page Global Enable (bit 7)  */
 	if (cpu_has_pge) {
-		cr4 = read_cr4();
-		write_cr4(cr4 & ~X86_CR4_PGE);
+		cr4 = __read_cr4();
+		__write_cr4(cr4 & ~X86_CR4_PGE);
 	}
 
 	/*
@@ -171,7 +171,7 @@ static void post_set(void)
 
 	/* Restore value of CR4 */
 	if (cpu_has_pge)
-		write_cr4(cr4);
+		__write_cr4(cr4);
 }
 
 static void cyrix_set_arr(unsigned int reg, unsigned long base,
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 0e25a1bc5ab5..7d74f7b3c6ba 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -678,8 +678,8 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
 
 	/* Save value of CR4 and clear Page Global Enable (bit 7) */
 	if (cpu_has_pge) {
-		cr4 = read_cr4();
-		write_cr4(cr4 & ~X86_CR4_PGE);
+		cr4 = __read_cr4();
+		__write_cr4(cr4 & ~X86_CR4_PGE);
 	}
 
 	/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
@@ -708,7 +708,7 @@ static void post_set(void) __releases(set_atomicity_lock)
 
 	/* Restore value of CR4 */
 	if (cpu_has_pge)
-		write_cr4(cr4);
+		__write_cr4(cr4);
 	raw_spin_unlock(&set_atomicity_lock);
 }
 
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 143e5f5dc855..b71a7f86d68a 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -31,6 +31,8 @@
 #include <asm/nmi.h>
 #include <asm/smp.h>
 #include <asm/alternative.h>
+#include <asm/mmu_context.h>
+#include <asm/tlbflush.h>
 #include <asm/timer.h>
 #include <asm/desc.h>
 #include <asm/ldt.h>
@@ -43,6 +45,8 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
 	.enabled = 1,
 };
 
+struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE;
+
 u64 __read_mostly hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -1327,8 +1331,6 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
 		break;
 
 	case CPU_STARTING:
-		if (x86_pmu.attr_rdpmc)
-			set_in_cr4(X86_CR4_PCE);
 		if (x86_pmu.cpu_starting)
 			x86_pmu.cpu_starting(cpu);
 		break;
@@ -1804,14 +1806,44 @@ static int x86_pmu_event_init(struct perf_event *event)
 			event->destroy(event);
 	}
 
+	if (ACCESS_ONCE(x86_pmu.attr_rdpmc))
+		event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;
+
 	return err;
 }
 
+static void refresh_pce(void *ignored)
+{
+	if (current->mm)
+		load_mm_cr4(current->mm);
+}
+
+static void x86_pmu_event_mapped(struct perf_event *event)
+{
+	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+		return;
+
+	if (atomic_inc_return(&current->mm->context.perf_rdpmc_allowed) == 1)
+		on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
+}
+
+static void x86_pmu_event_unmapped(struct perf_event *event)
+{
+	if (!current->mm)
+		return;
+
+	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+		return;
+
+	if (atomic_dec_and_test(&current->mm->context.perf_rdpmc_allowed))
+		on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
+}
+
 static int x86_pmu_event_idx(struct perf_event *event)
 {
 	int idx = event->hw.idx;
 
-	if (!x86_pmu.attr_rdpmc)
+	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
 		return 0;
 
 	if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) {
@@ -1829,16 +1861,6 @@ static ssize_t get_attr_rdpmc(struct device *cdev,
 	return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
 }
 
-static void change_rdpmc(void *info)
-{
-	bool enable = !!(unsigned long)info;
-
-	if (enable)
-		set_in_cr4(X86_CR4_PCE);
-	else
-		clear_in_cr4(X86_CR4_PCE);
-}
-
 static ssize_t set_attr_rdpmc(struct device *cdev,
 			      struct device_attribute *attr,
 			      const char *buf, size_t count)
@@ -1850,14 +1872,27 @@ static ssize_t set_attr_rdpmc(struct device *cdev,
 	if (ret)
 		return ret;
 
+	if (val > 2)
+		return -EINVAL;
+
 	if (x86_pmu.attr_rdpmc_broken)
 		return -ENOTSUPP;
 
-	if (!!val != !!x86_pmu.attr_rdpmc) {
-		x86_pmu.attr_rdpmc = !!val;
-		on_each_cpu(change_rdpmc, (void *)val, 1);
+	if ((val == 2) != (x86_pmu.attr_rdpmc == 2)) {
+		/*
+		 * Changing into or out of always available, aka
+		 * perf-event-bypassing mode.  This path is extremely slow,
+		 * but only root can trigger it, so it's okay.
+		 */
+		if (val == 2)
+			static_key_slow_inc(&rdpmc_always_available);
+		else
+			static_key_slow_dec(&rdpmc_always_available);
+		on_each_cpu(refresh_pce, NULL, 1);
 	}
 
+	x86_pmu.attr_rdpmc = val;
+
 	return count;
 }
 
@@ -1900,6 +1935,9 @@ static struct pmu pmu = {
 
 	.event_init		= x86_pmu_event_init,
 
+	.event_mapped		= x86_pmu_event_mapped,
+	.event_unmapped		= x86_pmu_event_unmapped,
+
 	.add			= x86_pmu_add,
 	.del			= x86_pmu_del,
 	.start			= x86_pmu_start,
@@ -1914,13 +1952,15 @@ static struct pmu pmu = {
 	.flush_branch_stack	= x86_pmu_flush_branch_stack,
 };
 
-void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
+void arch_perf_update_userpage(struct perf_event *event,
+			       struct perf_event_mmap_page *userpg, u64 now)
 {
 	struct cyc2ns_data *data;
 
 	userpg->cap_user_time = 0;
 	userpg->cap_user_time_zero = 0;
-	userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc;
+	userpg->cap_user_rdpmc =
+		!!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
 	userpg->pmc_width = x86_pmu.cntval_bits;
 
 	if (!sched_clock_stable())
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 4e6cdb0ddc70..df525d2be1e8 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -71,6 +71,8 @@ struct event_constraint {
 #define PERF_X86_EVENT_COMMITTED	0x8 /* event passed commit_txn */
 #define PERF_X86_EVENT_PEBS_LD_HSW	0x10 /* haswell style datala, load */
 #define PERF_X86_EVENT_PEBS_NA_HSW	0x20 /* haswell style datala, unknown */
+#define PERF_X86_EVENT_RDPMC_ALLOWED	0x40 /* grant rdpmc permission */
+
 
 struct amd_nb {
 	int nb_id;  /* NorthBridge id */
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index b74ebc7c4402..cf3df1d8d039 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -265,7 +265,10 @@ int __die(const char *str, struct pt_regs *regs, long err)
 	printk("SMP ");
 #endif
 #ifdef CONFIG_DEBUG_PAGEALLOC
-	printk("DEBUG_PAGEALLOC");
+	printk("DEBUG_PAGEALLOC ");
+#endif
+#ifdef CONFIG_KASAN
+	printk("KASAN");
 #endif
 	printk("\n");
 	if (notify_die(DIE_OOPS, str, regs, err,
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 01d1c187c9f9..a62536a1be88 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -19,6 +19,7 @@
 #include <linux/usb/ehci_def.h>
 #include <linux/efi.h>
 #include <asm/efi.h>
+#include <asm/pci_x86.h>
 
 /* Simple VGA output */
 #define VGABASE		(__ISA_IO_base + 0xb8000)
@@ -76,7 +77,7 @@ static struct console early_vga_console = {
 
 /* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
 
-static int early_serial_base = 0x3f8;  /* ttyS0 */
+static unsigned long early_serial_base = 0x3f8;  /* ttyS0 */
 
 #define XMTRDY          0x20
 
@@ -94,13 +95,40 @@ static int early_serial_base = 0x3f8;  /* ttyS0 */
 #define DLL             0       /*  Divisor Latch Low         */
 #define DLH             1       /*  Divisor latch High        */
 
+static void mem32_serial_out(unsigned long addr, int offset, int value)
+{
+	uint32_t *vaddr = (uint32_t *)addr;
+	/* shift implied by pointer type */
+	writel(value, vaddr + offset);
+}
+
+static unsigned int mem32_serial_in(unsigned long addr, int offset)
+{
+	uint32_t *vaddr = (uint32_t *)addr;
+	/* shift implied by pointer type */
+	return readl(vaddr + offset);
+}
+
+static unsigned int io_serial_in(unsigned long addr, int offset)
+{
+	return inb(addr + offset);
+}
+
+static void io_serial_out(unsigned long addr, int offset, int value)
+{
+	outb(value, addr + offset);
+}
+
+static unsigned int (*serial_in)(unsigned long addr, int offset) = io_serial_in;
+static void (*serial_out)(unsigned long addr, int offset, int value) = io_serial_out;
+
 static int early_serial_putc(unsigned char ch)
 {
 	unsigned timeout = 0xffff;
 
-	while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
+	while ((serial_in(early_serial_base, LSR) & XMTRDY) == 0 && --timeout)
 		cpu_relax();
-	outb(ch, early_serial_base + TXR);
+	serial_out(early_serial_base, TXR, ch);
 	return timeout ? 0 : -1;
 }
 
@@ -114,13 +142,28 @@ static void early_serial_write(struct console *con, const char *s, unsigned n)
 	}
 }
 
+static __init void early_serial_hw_init(unsigned divisor)
+{
+	unsigned char c;
+
+	serial_out(early_serial_base, LCR, 0x3);	/* 8n1 */
+	serial_out(early_serial_base, IER, 0);	/* no interrupt */
+	serial_out(early_serial_base, FCR, 0);	/* no fifo */
+	serial_out(early_serial_base, MCR, 0x3);	/* DTR + RTS */
+
+	c = serial_in(early_serial_base, LCR);
+	serial_out(early_serial_base, LCR, c | DLAB);
+	serial_out(early_serial_base, DLL, divisor & 0xff);
+	serial_out(early_serial_base, DLH, (divisor >> 8) & 0xff);
+	serial_out(early_serial_base, LCR, c & ~DLAB);
+}
+
 #define DEFAULT_BAUD 9600
 
 static __init void early_serial_init(char *s)
 {
-	unsigned char c;
 	unsigned divisor;
-	unsigned baud = DEFAULT_BAUD;
+	unsigned long baud = DEFAULT_BAUD;
 	char *e;
 
 	if (*s == ',')
@@ -145,24 +188,124 @@ static __init void early_serial_init(char *s)
 			s++;
 	}
 
-	outb(0x3, early_serial_base + LCR);	/* 8n1 */
-	outb(0, early_serial_base + IER);	/* no interrupt */
-	outb(0, early_serial_base + FCR);	/* no fifo */
-	outb(0x3, early_serial_base + MCR);	/* DTR + RTS */
+	if (*s) {
+		if (kstrtoul(s, 0, &baud) < 0 || baud == 0)
+			baud = DEFAULT_BAUD;
+	}
+
+	/* Convert from baud to divisor value */
+	divisor = 115200 / baud;
+
+	/* These will always be IO based ports */
+	serial_in = io_serial_in;
+	serial_out = io_serial_out;
+
+	/* Set up the HW */
+	early_serial_hw_init(divisor);
+}
+
+#ifdef CONFIG_PCI
+/*
+ * early_pci_serial_init()
+ *
+ * This function is invoked when the early_printk param starts with "pciserial"
+ * The rest of the param should be ",B:D.F,baud" where B, D & F describe the
+ * location of a PCI device that must be a UART device.
+ */
+static __init void early_pci_serial_init(char *s)
+{
+	unsigned divisor;
+	unsigned long baud = DEFAULT_BAUD;
+	u8 bus, slot, func;
+	uint32_t classcode, bar0;
+	uint16_t cmdreg;
+	char *e;
+
+
+	/*
+	 * First, part the param to get the BDF values
+	 */
+	if (*s == ',')
+		++s;
+
+	if (*s == 0)
+		return;
+
+	bus = (u8)simple_strtoul(s, &e, 16);
+	s = e;
+	if (*s != ':')
+		return;
+	++s;
+	slot = (u8)simple_strtoul(s, &e, 16);
+	s = e;
+	if (*s != '.')
+		return;
+	++s;
+	func = (u8)simple_strtoul(s, &e, 16);
+	s = e;
+
+	/* A baud might be following */
+	if (*s == ',')
+		s++;
+
+	/*
+	 * Second, find the device from the BDF
+	 */
+	cmdreg = read_pci_config(bus, slot, func, PCI_COMMAND);
+	classcode = read_pci_config(bus, slot, func, PCI_CLASS_REVISION);
+	bar0 = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
+
+	/*
+	 * Verify it is a UART type device
+	 */
+	if (((classcode >> 16 != PCI_CLASS_COMMUNICATION_MODEM) &&
+	     (classcode >> 16 != PCI_CLASS_COMMUNICATION_SERIAL)) ||
+	   (((classcode >> 8) & 0xff) != 0x02)) /* 16550 I/F at BAR0 */
+		return;
+
+	/*
+	 * Determine if it is IO or memory mapped
+	 */
+	if (bar0 & 0x01) {
+		/* it is IO mapped */
+		serial_in = io_serial_in;
+		serial_out = io_serial_out;
+		early_serial_base = bar0&0xfffffffc;
+		write_pci_config(bus, slot, func, PCI_COMMAND,
+						cmdreg|PCI_COMMAND_IO);
+	} else {
+		/* It is memory mapped - assume 32-bit alignment */
+		serial_in = mem32_serial_in;
+		serial_out = mem32_serial_out;
+		/* WARNING! assuming the address is always in the first 4G */
+		early_serial_base =
+			(unsigned long)early_ioremap(bar0 & 0xfffffff0, 0x10);
+		write_pci_config(bus, slot, func, PCI_COMMAND,
+						cmdreg|PCI_COMMAND_MEMORY);
+	}
 
+	/*
+	 * Lastly, initalize the hardware
+	 */
 	if (*s) {
-		baud = simple_strtoul(s, &e, 0);
-		if (baud == 0 || s == e)
+		if (strcmp(s, "nocfg") == 0)
+			/* Sometimes, we want to leave the UART alone
+			 * and assume the BIOS has set it up correctly.
+			 * "nocfg" tells us this is the case, and we
+			 * should do no more setup.
+			 */
+			return;
+		if (kstrtoul(s, 0, &baud) < 0 || baud == 0)
 			baud = DEFAULT_BAUD;
 	}
 
+	/* Convert from baud to divisor value */
 	divisor = 115200 / baud;
-	c = inb(early_serial_base + LCR);
-	outb(c | DLAB, early_serial_base + LCR);
-	outb(divisor & 0xff, early_serial_base + DLL);
-	outb((divisor >> 8) & 0xff, early_serial_base + DLH);
-	outb(c & ~DLAB, early_serial_base + LCR);
+
+	/* Set up the HW */
+	early_serial_hw_init(divisor);
 }
+#endif
 
 static struct console early_serial_console = {
 	.name =		"earlyser",
@@ -210,6 +353,13 @@ static int __init setup_early_printk(char *buf)
 			early_serial_init(buf + 4);
 			early_console_register(&early_serial_console, keep);
 		}
+#ifdef CONFIG_PCI
+		if (!strncmp(buf, "pciserial", 9)) {
+			early_pci_serial_init(buf + 9);
+			early_console_register(&early_serial_console, keep);
+			buf += 9; /* Keep from match the above "serial" */
+		}
+#endif
 		if (!strncmp(buf, "vga", 3) &&
 		    boot_params.screen_info.orig_video_isVGA == 1) {
 			max_xpos = boot_params.screen_info.orig_video_cols;
@@ -226,11 +376,6 @@ static int __init setup_early_printk(char *buf)
 			early_console_register(&xenboot_console, keep);
 #endif
 #ifdef CONFIG_EARLY_PRINTK_INTEL_MID
-		if (!strncmp(buf, "mrst", 4)) {
-			mrst_early_console_init();
-			early_console_register(&early_mrst_console, keep);
-		}
-
 		if (!strncmp(buf, "hsu", 3)) {
 			hsu_early_console_init(buf + 3);
 			early_console_register(&early_hsu_console, keep);
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 000d4199b03e..31e2d5bf3e38 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -982,6 +982,9 @@ ENTRY(xen_hypervisor_callback)
 ENTRY(xen_do_upcall)
 1:	mov %esp, %eax
 	call xen_evtchn_do_upcall
+#ifndef CONFIG_PREEMPT
+	call xen_maybe_preempt_hcall
+#endif
 	jmp  ret_from_intr
 	CFI_ENDPROC
 ENDPROC(xen_hypervisor_callback)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index db13655c3a2a..1d74d161687c 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -269,11 +269,14 @@ ENTRY(ret_from_fork)
 	testl $3, CS-ARGOFFSET(%rsp)		# from kernel_thread?
 	jz   1f
 
-	testl $_TIF_IA32, TI_flags(%rcx)	# 32-bit compat task needs IRET
-	jnz  int_ret_from_sys_call
-
-	RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
-	jmp ret_from_sys_call			# go to the SYSRET fastpath
+	/*
+	 * By the time we get here, we have no idea whether our pt_regs,
+	 * ti flags, and ti status came from the 64-bit SYSCALL fast path,
+	 * the slow path, or one of the ia32entry paths.
+	 * Use int_ret_from_sys_call to return, since it can safely handle
+	 * all of the above.
+	 */
+	jmp  int_ret_from_sys_call
 
 1:
 	subq $REST_SKIP, %rsp	# leave space for volatiles
@@ -1208,6 +1211,9 @@ ENTRY(xen_do_hypervisor_callback)   # do_hypervisor_callback(struct *pt_regs)
 	popq %rsp
 	CFI_DEF_CFA_REGISTER rsp
 	decl PER_CPU_VAR(irq_count)
+#ifndef CONFIG_PREEMPT
+	call xen_maybe_preempt_hcall
+#endif
 	jmp  error_exit
 	CFI_ENDPROC
 END(xen_do_hypervisor_callback)
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index d6c1b9836995..2911ef3a9f1c 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -31,6 +31,7 @@ static void __init i386_default_early_setup(void)
 
 asmlinkage __visible void __init i386_start_kernel(void)
 {
+	cr4_init_shadow();
 	sanitize_boot_params(&boot_params);
 
 	/* Call the subarch specific early setup function */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index eda1a865641e..c4f8d4659070 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -27,6 +27,7 @@
 #include <asm/bios_ebda.h>
 #include <asm/bootparam_utils.h>
 #include <asm/microcode.h>
+#include <asm/kasan.h>
 
 /*
  * Manage page tables very early on.
@@ -46,7 +47,7 @@ static void __init reset_early_page_tables(void)
 
 	next_early_pgt = 0;
 
-	write_cr3(__pa(early_level4_pgt));
+	write_cr3(__pa_nodebug(early_level4_pgt));
 }
 
 /* Create a new PMD entry */
@@ -59,7 +60,7 @@ int __init early_make_pgtable(unsigned long address)
 	pmdval_t pmd, *pmd_p;
 
 	/* Invalid address or early pgt is done ?  */
-	if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt))
+	if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt))
 		return -1;
 
 again:
@@ -155,9 +156,13 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
 				(__START_KERNEL & PGDIR_MASK)));
 	BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
 
+	cr4_init_shadow();
+
 	/* Kill off the identity-map trampoline */
 	reset_early_page_tables();
 
+	kasan_map_early_shadow(early_level4_pgt);
+
 	/* clear bss before set_intr_gate with early_idt_handler */
 	clear_bss();
 
@@ -179,6 +184,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
 	/* set init_level4_pgt kernel high mapping*/
 	init_level4_pgt[511] = early_level4_pgt[511];
 
+	kasan_map_early_shadow(init_level4_pgt);
+
 	x86_64_start_reservations(real_mode_data);
 }
 
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index a468c0a65c42..6fd514d9f69a 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -514,8 +514,38 @@ ENTRY(phys_base)
 	/* This must match the first entry in level2_kernel_pgt */
 	.quad   0x0000000000000000
 
+#ifdef CONFIG_KASAN
+#define FILL(VAL, COUNT)				\
+	.rept (COUNT) ;					\
+	.quad	(VAL) ;					\
+	.endr
+
+NEXT_PAGE(kasan_zero_pte)
+	FILL(kasan_zero_page - __START_KERNEL_map + _KERNPG_TABLE, 512)
+NEXT_PAGE(kasan_zero_pmd)
+	FILL(kasan_zero_pte - __START_KERNEL_map + _KERNPG_TABLE, 512)
+NEXT_PAGE(kasan_zero_pud)
+	FILL(kasan_zero_pmd - __START_KERNEL_map + _KERNPG_TABLE, 512)
+
+#undef FILL
+#endif
+
+
 #include "../../x86/xen/xen-head.S"
 	
 	__PAGE_ALIGNED_BSS
 NEXT_PAGE(empty_zero_page)
 	.skip PAGE_SIZE
+
+#ifdef CONFIG_KASAN
+/*
+ * This page used as early shadow. We don't use empty_zero_page
+ * at early stages, stack instrumentation could write some garbage
+ * to this page.
+ * Latter we reuse it as zero shadow for large ranges of memory
+ * that allowed to access, but not instrumented by kasan
+ * (vmalloc/vmemmap ...).
+ */
+NEXT_PAGE(kasan_zero_page)
+	.skip PAGE_SIZE
+#endif
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 319bcb9372fe..3acbff4716b0 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -168,7 +168,7 @@ static void _hpet_print_config(const char *function, int line)
 #define hpet_print_config()					\
 do {								\
 	if (hpet_verbose)					\
-		_hpet_print_config(__FUNCTION__, __LINE__);	\
+		_hpet_print_config(__func__, __LINE__);	\
 } while (0)
 
 /*
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 0f3de6674ae3..29e982ada854 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -13,6 +13,7 @@
 #include <asm/sigcontext.h>
 #include <asm/processor.h>
 #include <asm/math_emu.h>
+#include <asm/tlbflush.h>
 #include <asm/uaccess.h>
 #include <asm/ptrace.h>
 #include <asm/i387.h>
@@ -197,7 +198,7 @@ void fpu_init(void)
 	if (cpu_has_xmm)
 		cr4_mask |= X86_CR4_OSXMMEXCPT;
 	if (cr4_mask)
-		set_in_cr4(cr4_mask);
+		cr4_set_bits(cr4_mask);
 
 	cr0 = read_cr0();
 	cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 705ef8d48e2d..67b1cbe0093a 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -302,6 +302,9 @@ int check_irq_vectors_for_cpu_disable(void)
 		irq = __this_cpu_read(vector_irq[vector]);
 		if (irq >= 0) {
 			desc = irq_to_desc(irq);
+			if (!desc)
+				continue;
+
 			data = irq_desc_get_irq_data(desc);
 			cpumask_copy(&affinity_new, data->affinity);
 			cpu_clear(this_cpu, affinity_new);
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 98f654d466e5..4e3d5a9621fe 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -84,7 +84,7 @@ static volatile u32 twobyte_is_boostable[256 / 32] = {
 	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
 	/*      ----------------------------------------------          */
 	W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */
-	W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 10 */
+	W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1) , /* 10 */
 	W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */
 	W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
 	W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
@@ -223,27 +223,48 @@ static unsigned long
 __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
 {
 	struct kprobe *kp;
+	unsigned long faddr;
 
 	kp = get_kprobe((void *)addr);
-	/* There is no probe, return original address */
-	if (!kp)
+	faddr = ftrace_location(addr);
+	/*
+	 * Addresses inside the ftrace location are refused by
+	 * arch_check_ftrace_location(). Something went terribly wrong
+	 * if such an address is checked here.
+	 */
+	if (WARN_ON(faddr && faddr != addr))
+		return 0UL;
+	/*
+	 * Use the current code if it is not modified by Kprobe
+	 * and it cannot be modified by ftrace.
+	 */
+	if (!kp && !faddr)
 		return addr;
 
 	/*
-	 *  Basically, kp->ainsn.insn has an original instruction.
-	 *  However, RIP-relative instruction can not do single-stepping
-	 *  at different place, __copy_instruction() tweaks the displacement of
-	 *  that instruction. In that case, we can't recover the instruction
-	 *  from the kp->ainsn.insn.
+	 * Basically, kp->ainsn.insn has an original instruction.
+	 * However, RIP-relative instruction can not do single-stepping
+	 * at different place, __copy_instruction() tweaks the displacement of
+	 * that instruction. In that case, we can't recover the instruction
+	 * from the kp->ainsn.insn.
 	 *
-	 *  On the other hand, kp->opcode has a copy of the first byte of
-	 *  the probed instruction, which is overwritten by int3. And
-	 *  the instruction at kp->addr is not modified by kprobes except
-	 *  for the first byte, we can recover the original instruction
-	 *  from it and kp->opcode.
+	 * On the other hand, in case on normal Kprobe, kp->opcode has a copy
+	 * of the first byte of the probed instruction, which is overwritten
+	 * by int3. And the instruction at kp->addr is not modified by kprobes
+	 * except for the first byte, we can recover the original instruction
+	 * from it and kp->opcode.
+	 *
+	 * In case of Kprobes using ftrace, we do not have a copy of
+	 * the original instruction. In fact, the ftrace location might
+	 * be modified at anytime and even could be in an inconsistent state.
+	 * Fortunately, we know that the original code is the ideal 5-byte
+	 * long NOP.
 	 */
-	memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
-	buf[0] = kp->opcode;
+	memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+	if (faddr)
+		memcpy(buf, ideal_nops[NOP_ATOMIC5], 5);
+	else
+		buf[0] = kp->opcode;
 	return (unsigned long)buf;
 }
 
@@ -251,6 +272,7 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
  * Recover the probed instruction at addr for further analysis.
  * Caller must lock kprobes by kprobe_mutex, or disable preemption
  * for preventing to release referencing kprobes.
+ * Returns zero if the instruction can not get recovered.
  */
 unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
 {
@@ -285,6 +307,8 @@ static int can_probe(unsigned long paddr)
 		 * normally used, we just go through if there is no kprobe.
 		 */
 		__addr = recover_probed_instruction(buf, addr);
+		if (!__addr)
+			return 0;
 		kernel_insn_init(&insn, (void *)__addr, MAX_INSN_SIZE);
 		insn_get_length(&insn);
 
@@ -333,6 +357,8 @@ int __copy_instruction(u8 *dest, u8 *src)
 	unsigned long recovered_insn =
 		recover_probed_instruction(buf, (unsigned long)src);
 
+	if (!recovered_insn)
+		return 0;
 	kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
 	insn_get_length(&insn);
 	/* Another subsystem puts a breakpoint, failed to recover */
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 7c523bbf3dc8..7b3b9d15c47a 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -259,6 +259,8 @@ static int can_optimize(unsigned long paddr)
 			 */
 			return 0;
 		recovered_insn = recover_probed_instruction(buf, addr);
+		if (!recovered_insn)
+			return 0;
 		kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
 		insn_get_length(&insn);
 		/* Another subsystem puts a breakpoint */
@@ -322,7 +324,8 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
  * Target instructions MUST be relocatable (checked inside)
  * This is called when new aggr(opt)probe is allocated or reused.
  */
-int arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
+int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
+				  struct kprobe *__unused)
 {
 	u8 *buf;
 	int ret;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 94f643484300..e354cc6446ab 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -609,7 +609,7 @@ static inline void check_zero(void)
 	u8 ret;
 	u8 old;
 
-	old = ACCESS_ONCE(zero_stats);
+	old = READ_ONCE(zero_stats);
 	if (unlikely(old)) {
 		ret = cmpxchg(&zero_stats, old, 0);
 		/* This ensures only one fellow resets the stat */
@@ -727,6 +727,7 @@ __visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
 	int cpu;
 	u64 start;
 	unsigned long flags;
+	__ticket_t head;
 
 	if (in_nmi())
 		return;
@@ -768,11 +769,15 @@ __visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
 	 */
 	__ticket_enter_slowpath(lock);
 
+	/* make sure enter_slowpath, which is atomic does not cross the read */
+	smp_mb__after_atomic();
+
 	/*
 	 * check again make sure it didn't become free while
 	 * we weren't looking.
 	 */
-	if (ACCESS_ONCE(lock->tickets.head) == want) {
+	head = READ_ONCE(lock->tickets.head);
+	if (__tickets_equal(head, want)) {
 		add_stats(TAKEN_SLOW_PICKUP, 1);
 		goto out;
 	}
@@ -803,8 +808,8 @@ static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
 	add_stats(RELEASED_SLOW, 1);
 	for_each_cpu(cpu, &waiting_cpus) {
 		const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu);
-		if (ACCESS_ONCE(w->lock) == lock &&
-		    ACCESS_ONCE(w->want) == ticket) {
+		if (READ_ONCE(w->lock) == lock &&
+		    READ_ONCE(w->want) == ticket) {
 			add_stats(RELEASED_SLOW_KICKED, 1);
 			kvm_kick_cpu(cpu);
 			break;
diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c
new file mode 100644
index 000000000000..ff3c3101d003
--- /dev/null
+++ b/arch/x86/kernel/livepatch.c
@@ -0,0 +1,90 @@
+/*
+ * livepatch.c - x86-specific Kernel Live Patching Core
+ *
+ * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
+ * Copyright (C) 2014 SUSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+#include <asm/page_types.h>
+#include <asm/elf.h>
+#include <asm/livepatch.h>
+
+/**
+ * klp_write_module_reloc() - write a relocation in a module
+ * @mod:	module in which the section to be modified is found
+ * @type:	ELF relocation type (see asm/elf.h)
+ * @loc:	address that the relocation should be written to
+ * @value:	relocation value (sym address + addend)
+ *
+ * This function writes a relocation to the specified location for
+ * a particular module.
+ */
+int klp_write_module_reloc(struct module *mod, unsigned long type,
+			   unsigned long loc, unsigned long value)
+{
+	int ret, numpages, size = 4;
+	bool readonly;
+	unsigned long val;
+	unsigned long core = (unsigned long)mod->module_core;
+	unsigned long core_ro_size = mod->core_ro_size;
+	unsigned long core_size = mod->core_size;
+
+	switch (type) {
+	case R_X86_64_NONE:
+		return 0;
+	case R_X86_64_64:
+		val = value;
+		size = 8;
+		break;
+	case R_X86_64_32:
+		val = (u32)value;
+		break;
+	case R_X86_64_32S:
+		val = (s32)value;
+		break;
+	case R_X86_64_PC32:
+		val = (u32)(value - loc);
+		break;
+	default:
+		/* unsupported relocation type */
+		return -EINVAL;
+	}
+
+	if (loc < core || loc >= core + core_size)
+		/* loc does not point to any symbol inside the module */
+		return -EINVAL;
+
+	if (loc < core + core_ro_size)
+		readonly = true;
+	else
+		readonly = false;
+
+	/* determine if the relocation spans a page boundary */
+	numpages = ((loc & PAGE_MASK) == ((loc + size) & PAGE_MASK)) ? 1 : 2;
+
+	if (readonly)
+		set_memory_rw(loc & PAGE_MASK, numpages);
+
+	ret = probe_kernel_write((void *)loc, &val, size);
+
+	if (readonly)
+		set_memory_ro(loc & PAGE_MASK, numpages);
+
+	return ret;
+}
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index e69f9882bf95..d1ac80b72c72 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -24,6 +24,7 @@
 #include <linux/fs.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
+#include <linux/kasan.h>
 #include <linux/bug.h>
 #include <linux/mm.h>
 #include <linux/gfp.h>
@@ -83,13 +84,22 @@ static unsigned long int get_module_load_offset(void)
 
 void *module_alloc(unsigned long size)
 {
+	void *p;
+
 	if (PAGE_ALIGN(size) > MODULES_LEN)
 		return NULL;
-	return __vmalloc_node_range(size, 1,
+
+	p = __vmalloc_node_range(size, MODULE_ALIGN,
 				    MODULES_VADDR + get_module_load_offset(),
 				    MODULES_END, GFP_KERNEL | __GFP_HIGHMEM,
-				    PAGE_KERNEL_EXEC, NUMA_NO_NODE,
+				    PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
 				    __builtin_return_address(0));
+	if (p && (kasan_module_alloc(p, size) < 0)) {
+		vfree(p);
+		return NULL;
+	}
+
+	return p;
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 83480373a642..dcaf4b00d0b4 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@
 #include <asm/fpu-internal.h>
 #include <asm/debugreg.h>
 #include <asm/nmi.h>
+#include <asm/tlbflush.h>
 
 /*
  * per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -148,7 +149,7 @@ void flush_thread(void)
 
 static void hard_disable_TSC(void)
 {
-	write_cr4(read_cr4() | X86_CR4_TSD);
+	cr4_set_bits(X86_CR4_TSD);
 }
 
 void disable_TSC(void)
@@ -165,7 +166,7 @@ void disable_TSC(void)
 
 static void hard_enable_TSC(void)
 {
-	write_cr4(read_cr4() & ~X86_CR4_TSD);
+	cr4_clear_bits(X86_CR4_TSD);
 }
 
 static void enable_TSC(void)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 8f3ebfe710d0..603c4f99cb5a 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -101,7 +101,7 @@ void __show_regs(struct pt_regs *regs, int all)
 	cr0 = read_cr0();
 	cr2 = read_cr2();
 	cr3 = read_cr3();
-	cr4 = read_cr4_safe();
+	cr4 = __read_cr4_safe();
 	printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
 			cr0, cr2, cr3, cr4);
 
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 5a2c02913af3..67fcc43577d2 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -93,7 +93,7 @@ void __show_regs(struct pt_regs *regs, int all)
 	cr0 = read_cr0();
 	cr2 = read_cr2();
 	cr3 = read_cr3();
-	cr4 = read_cr4();
+	cr4 = __read_cr4();
 
 	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
 	       fs, fsindex, gs, gsindex, shadowgs);
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index fe3dbfe0c4a5..cd9685235df9 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -49,11 +49,11 @@ int mach_set_rtc_mmss(const struct timespec *now)
 		retval = set_rtc_time(&tm);
 		if (retval)
 			printk(KERN_ERR "%s: RTC write failed with error %d\n",
-			       __FUNCTION__, retval);
+			       __func__, retval);
 	} else {
 		printk(KERN_ERR
 		       "%s: Invalid RTC value: write of %lx to RTC failed\n",
-			__FUNCTION__, nowtime);
+			__func__, nowtime);
 		retval = -EINVAL;
 	}
 	return retval;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c4648adadd7d..0a2421cca01f 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -89,6 +89,7 @@
 #include <asm/cacheflush.h>
 #include <asm/processor.h>
 #include <asm/bugs.h>
+#include <asm/kasan.h>
 
 #include <asm/vsyscall.h>
 #include <asm/cpu.h>
@@ -1174,9 +1175,11 @@ void __init setup_arch(char **cmdline_p)
 
 	x86_init.paging.pagetable_init();
 
+	kasan_init();
+
 	if (boot_cpu_data.cpuid_level >= 0) {
 		/* A CPU has %cr4 if and only if it has CPUID */
-		mmu_cr4_features = read_cr4();
+		mmu_cr4_features = __read_cr4();
 		if (trampoline_cr4_features)
 			*trampoline_cr4_features = mmu_cr4_features;
 	}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 2a33c8f68319..e5042463c1bc 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -69,7 +69,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
 	unsigned int err = 0;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	get_user_try {
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 92b83e299ed3..7ee7369d5aec 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -384,7 +384,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 		goto exit;
 	conditional_sti(regs);
 
-	if (!user_mode(regs))
+	if (!user_mode_vm(regs))
 		die("bounds", regs, error_code);
 
 	if (!cpu_feature_enabled(X86_FEATURE_MPX)) {
@@ -637,7 +637,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
 	 * then it's very likely the result of an icebp/int01 trap.
 	 * User wants a sigtrap for that.
 	 */
-	if (!dr6 && user_mode(regs))
+	if (!dr6 && user_mode_vm(regs))
 		user_icebp = 1;
 
 	/* Catch kmemcheck conditions first of all! */
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 8b96a947021f..81f8adb0679e 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -66,27 +66,54 @@
  * Good-instruction tables for 32-bit apps.  This is non-const and volatile
  * to keep gcc from statically optimizing it out, as variable_test_bit makes
  * some versions of gcc to think only *(unsigned long*) is used.
+ *
+ * Opcodes we'll probably never support:
+ * 6c-6f - ins,outs. SEGVs if used in userspace
+ * e4-e7 - in,out imm. SEGVs if used in userspace
+ * ec-ef - in,out acc. SEGVs if used in userspace
+ * cc - int3. SIGTRAP if used in userspace
+ * ce - into. Not used in userspace - no kernel support to make it useful. SEGVs
+ *	(why we support bound (62) then? it's similar, and similarly unused...)
+ * f1 - int1. SIGTRAP if used in userspace
+ * f4 - hlt. SEGVs if used in userspace
+ * fa - cli. SEGVs if used in userspace
+ * fb - sti. SEGVs if used in userspace
+ *
+ * Opcodes which need some work to be supported:
+ * 07,17,1f - pop es/ss/ds
+ *	Normally not used in userspace, but would execute if used.
+ *	Can cause GP or stack exception if tries to load wrong segment descriptor.
+ *	We hesitate to run them under single step since kernel's handling
+ *	of userspace single-stepping (TF flag) is fragile.
+ *	We can easily refuse to support push es/cs/ss/ds (06/0e/16/1e)
+ *	on the same grounds that they are never used.
+ * cd - int N.
+ *	Used by userspace for "int 80" syscall entry. (Other "int N"
+ *	cause GP -> SEGV since their IDT gates don't allow calls from CPL 3).
+ *	Not supported since kernel's handling of userspace single-stepping
+ *	(TF flag) is fragile.
+ * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad
  */
 #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
 static volatile u32 good_insns_32[256 / 32] = {
 	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
 	/*      ----------------------------------------------         */
-	W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 00 */
+	W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 00 */
 	W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */
-	W(0x20, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* 20 */
-	W(0x30, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) , /* 30 */
+	W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
+	W(0x30, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */
 	W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
 	W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
-	W(0x60, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
+	W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
 	W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
 	W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
 	W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
 	W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
 	W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
 	W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
-	W(0xd0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+	W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
 	W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
-	W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1)   /* f0 */
+	W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1)   /* f0 */
 	/*      ----------------------------------------------         */
 	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
 };
@@ -94,27 +121,61 @@ static volatile u32 good_insns_32[256 / 32] = {
 #define good_insns_32	NULL
 #endif
 
-/* Good-instruction tables for 64-bit apps */
+/* Good-instruction tables for 64-bit apps.
+ *
+ * Genuinely invalid opcodes:
+ * 06,07 - formerly push/pop es
+ * 0e - formerly push cs
+ * 16,17 - formerly push/pop ss
+ * 1e,1f - formerly push/pop ds
+ * 27,2f,37,3f - formerly daa/das/aaa/aas
+ * 60,61 - formerly pusha/popa
+ * 62 - formerly bound. EVEX prefix for AVX512 (not yet supported)
+ * 82 - formerly redundant encoding of Group1
+ * 9a - formerly call seg:ofs
+ * ce - formerly into
+ * d4,d5 - formerly aam/aad
+ * d6 - formerly undocumented salc
+ * ea - formerly jmp seg:ofs
+ *
+ * Opcodes we'll probably never support:
+ * 6c-6f - ins,outs. SEGVs if used in userspace
+ * e4-e7 - in,out imm. SEGVs if used in userspace
+ * ec-ef - in,out acc. SEGVs if used in userspace
+ * cc - int3. SIGTRAP if used in userspace
+ * f1 - int1. SIGTRAP if used in userspace
+ * f4 - hlt. SEGVs if used in userspace
+ * fa - cli. SEGVs if used in userspace
+ * fb - sti. SEGVs if used in userspace
+ *
+ * Opcodes which need some work to be supported:
+ * cd - int N.
+ *	Used by userspace for "int 80" syscall entry. (Other "int N"
+ *	cause GP -> SEGV since their IDT gates don't allow calls from CPL 3).
+ *	Not supported since kernel's handling of userspace single-stepping
+ *	(TF flag) is fragile.
+ * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad
+ */
 #if defined(CONFIG_X86_64)
 static volatile u32 good_insns_64[256 / 32] = {
 	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
 	/*      ----------------------------------------------         */
-	W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */
+	W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* 00 */
 	W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */
-	W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 20 */
-	W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 30 */
-	W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
+	W(0x20, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 20 */
+	W(0x30, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 30 */
+	W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
 	W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
-	W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
+	W(0x60, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
 	W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
 	W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
-	W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+	W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1) , /* 90 */
 	W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
 	W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
-	W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
+	W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
 	W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
-	W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
-	W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1)   /* f0 */
+	W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0) | /* e0 */
+	W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1)   /* f0 */
 	/*      ----------------------------------------------         */
 	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
 };
@@ -122,49 +183,55 @@ static volatile u32 good_insns_64[256 / 32] = {
 #define good_insns_64	NULL
 #endif
 
-/* Using this for both 64-bit and 32-bit apps */
+/* Using this for both 64-bit and 32-bit apps.
+ * Opcodes we don't support:
+ * 0f 00 - SLDT/STR/LLDT/LTR/VERR/VERW/-/- group. System insns
+ * 0f 01 - SGDT/SIDT/LGDT/LIDT/SMSW/-/LMSW/INVLPG group.
+ *	Also encodes tons of other system insns if mod=11.
+ *	Some are in fact non-system: xend, xtest, rdtscp, maybe more
+ * 0f 05 - syscall
+ * 0f 06 - clts (CPL0 insn)
+ * 0f 07 - sysret
+ * 0f 08 - invd (CPL0 insn)
+ * 0f 09 - wbinvd (CPL0 insn)
+ * 0f 0b - ud2
+ * 0f 30 - wrmsr (CPL0 insn) (then why rdmsr is allowed, it's also CPL0 insn?)
+ * 0f 34 - sysenter
+ * 0f 35 - sysexit
+ * 0f 37 - getsec
+ * 0f 78 - vmread (Intel VMX. CPL0 insn)
+ * 0f 79 - vmwrite (Intel VMX. CPL0 insn)
+ *	Note: with prefixes, these two opcodes are
+ *	extrq/insertq/AVX512 convert vector ops.
+ * 0f ae - group15: [f]xsave,[f]xrstor,[v]{ld,st}mxcsr,clflush[opt],
+ *	{rd,wr}{fs,gs}base,{s,l,m}fence.
+ *	Why? They are all user-executable.
+ */
 static volatile u32 good_2byte_insns[256 / 32] = {
 	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
 	/*      ----------------------------------------------         */
-	W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */
-	W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
-	W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
-	W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
+	W(0x00, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1) | /* 00 */
+	W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
+	W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
+	W(0x30, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */
 	W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
 	W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
 	W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */
-	W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
+	W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* 70 */
 	W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
 	W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
-	W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
-	W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+	W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
+	W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
 	W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
-	W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+	W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
 	W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */
-	W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0)   /* f0 */
+	W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)   /* f0 */
 	/*      ----------------------------------------------         */
 	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
 };
 #undef W
 
 /*
- * opcodes we'll probably never support:
- *
- *  6c-6d, e4-e5, ec-ed - in
- *  6e-6f, e6-e7, ee-ef - out
- *  cc, cd - int3, int
- *  cf - iret
- *  d6 - illegal instruction
- *  f1 - int1/icebp
- *  f4 - hlt
- *  fa, fb - cli, sti
- *  0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2
- *
- * invalid opcodes in 64-bit mode:
- *
- *  06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5
- *  63 - we support this opcode in x86_64 but not in i386.
- *
  * opcodes we may need to refine support for:
  *
  *  0f - 2-byte instructions: For many of these instructions, the validity
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 040681928e9d..37d8fa4438f0 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -50,13 +50,19 @@ EXPORT_SYMBOL(csum_partial);
 #undef memset
 #undef memmove
 
+extern void *__memset(void *, int, __kernel_size_t);
+extern void *__memcpy(void *, const void *, __kernel_size_t);
+extern void *__memmove(void *, const void *, __kernel_size_t);
 extern void *memset(void *, int, __kernel_size_t);
 extern void *memcpy(void *, const void *, __kernel_size_t);
-extern void *__memcpy(void *, const void *, __kernel_size_t);
+extern void *memmove(void *, const void *, __kernel_size_t);
+
+EXPORT_SYMBOL(__memset);
+EXPORT_SYMBOL(__memcpy);
+EXPORT_SYMBOL(__memmove);
 
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memcpy);
-EXPORT_SYMBOL(__memcpy);
 EXPORT_SYMBOL(memmove);
 
 #ifndef CONFIG_DEBUG_VIRTUAL
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index de9dcf89a302..0bf82c5ac529 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -12,6 +12,7 @@
 #include <asm/i387.h>
 #include <asm/fpu-internal.h>
 #include <asm/sigframe.h>
+#include <asm/tlbflush.h>
 #include <asm/xcr.h>
 
 /*
@@ -378,7 +379,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
 		 * thread's fpu state, reconstruct fxstate from the fsave
 		 * header. Sanitize the copied state etc.
 		 */
-		struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave;
+		struct fpu *fpu = &tsk->thread.fpu;
 		struct user_i387_ia32_struct env;
 		int err = 0;
 
@@ -392,14 +393,15 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
 		 */
 		drop_fpu(tsk);
 
-		if (__copy_from_user(xsave, buf_fx, state_size) ||
+		if (__copy_from_user(&fpu->state->xsave, buf_fx, state_size) ||
 		    __copy_from_user(&env, buf, sizeof(env))) {
+			fpu_finit(fpu);
 			err = -1;
 		} else {
 			sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only);
-			set_used_math();
 		}
 
+		set_used_math();
 		if (use_eager_fpu()) {
 			preempt_disable();
 			math_state_restore();
@@ -453,7 +455,7 @@ static void prepare_fx_sw_frame(void)
  */
 static inline void xstate_enable(void)
 {
-	set_in_cr4(X86_CR4_OSXSAVE);
+	cr4_set_bits(X86_CR4_OSXSAVE);
 	xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
 }
 
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 7dc7ba577ecd..413a7bf9efbb 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -39,6 +39,7 @@ config KVM
 	select PERF_EVENTS
 	select HAVE_KVM_MSI
 	select HAVE_KVM_CPU_RELAX_INTERCEPT
+	select KVM_GENERIC_DIRTYLOG_READ_PROTECT
 	select KVM_VFIO
 	select SRCU
 	---help---
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index de12c1d379f1..106c01557f2b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -86,6 +86,7 @@
 #define DstAcc      (OpAcc << DstShift)
 #define DstDI       (OpDI << DstShift)
 #define DstMem64    (OpMem64 << DstShift)
+#define DstMem16    (OpMem16 << DstShift)
 #define DstImmUByte (OpImmUByte << DstShift)
 #define DstDX       (OpDX << DstShift)
 #define DstAccLo    (OpAccLo << DstShift)
@@ -124,6 +125,7 @@
 #define RMExt       (4<<15)     /* Opcode extension in ModRM r/m if mod == 3 */
 #define Escape      (5<<15)     /* Escape to coprocessor instruction */
 #define InstrDual   (6<<15)     /* Alternate instruction decoding of mod == 3 */
+#define ModeDual    (7<<15)     /* Different instruction for 32/64 bit */
 #define Sse         (1<<18)     /* SSE Vector instruction */
 /* Generic ModRM decode. */
 #define ModRM       (1<<19)
@@ -165,10 +167,10 @@
 #define NoMod	    ((u64)1 << 47)  /* Mod field is ignored */
 #define Intercept   ((u64)1 << 48)  /* Has valid intercept field */
 #define CheckPerm   ((u64)1 << 49)  /* Has valid check_perm field */
-#define NoBigReal   ((u64)1 << 50)  /* No big real mode */
 #define PrivUD      ((u64)1 << 51)  /* #UD instead of #GP on CPL > 0 */
 #define NearBranch  ((u64)1 << 52)  /* Near branches */
 #define No16	    ((u64)1 << 53)  /* No 16 bit operand */
+#define IncSP       ((u64)1 << 54)  /* SP is incremented before ModRM calc */
 
 #define DstXacc     (DstAccLo | SrcAccHi | SrcWrite)
 
@@ -213,6 +215,7 @@ struct opcode {
 		const struct gprefix *gprefix;
 		const struct escape *esc;
 		const struct instr_dual *idual;
+		const struct mode_dual *mdual;
 		void (*fastop)(struct fastop *fake);
 	} u;
 	int (*check_perm)(struct x86_emulate_ctxt *ctxt);
@@ -240,6 +243,11 @@ struct instr_dual {
 	struct opcode mod3;
 };
 
+struct mode_dual {
+	struct opcode mode32;
+	struct opcode mode64;
+};
+
 /* EFLAGS bit definitions. */
 #define EFLG_ID (1<<21)
 #define EFLG_VIP (1<<20)
@@ -262,6 +270,13 @@ struct instr_dual {
 #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
 #define EFLG_RESERVED_ONE_MASK 2
 
+enum x86_transfer_type {
+	X86_TRANSFER_NONE,
+	X86_TRANSFER_CALL_JMP,
+	X86_TRANSFER_RET,
+	X86_TRANSFER_TASK_SWITCH,
+};
+
 static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
 {
 	if (!(ctxt->regs_valid & (1 << nr))) {
@@ -669,9 +684,13 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
 		}
 		if (addr.ea > lim)
 			goto bad;
-		*max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea);
-		if (size > *max_size)
-			goto bad;
+		if (lim == 0xffffffff)
+			*max_size = ~0u;
+		else {
+			*max_size = (u64)lim + 1 - addr.ea;
+			if (size > *max_size)
+				goto bad;
+		}
 		la &= (u32)-1;
 		break;
 	}
@@ -722,19 +741,26 @@ static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst,
 			  const struct desc_struct *cs_desc)
 {
 	enum x86emul_mode mode = ctxt->mode;
+	int rc;
 
 #ifdef CONFIG_X86_64
-	if (ctxt->mode >= X86EMUL_MODE_PROT32 && cs_desc->l) {
-		u64 efer = 0;
+	if (ctxt->mode >= X86EMUL_MODE_PROT16) {
+		if (cs_desc->l) {
+			u64 efer = 0;
 
-		ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
-		if (efer & EFER_LMA)
-			mode = X86EMUL_MODE_PROT64;
+			ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+			if (efer & EFER_LMA)
+				mode = X86EMUL_MODE_PROT64;
+		} else
+			mode = X86EMUL_MODE_PROT32; /* temporary value */
 	}
 #endif
 	if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32)
 		mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
-	return assign_eip(ctxt, dst, mode);
+	rc = assign_eip(ctxt, dst, mode);
+	if (rc == X86EMUL_CONTINUE)
+		ctxt->mode = mode;
+	return rc;
 }
 
 static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
@@ -1057,8 +1083,6 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt)
 	asm volatile("fnstcw %0": "+m"(fcw));
 	ctxt->ops->put_fpu(ctxt);
 
-	/* force 2 byte destination */
-	ctxt->dst.bytes = 2;
 	ctxt->dst.val = fcw;
 
 	return X86EMUL_CONTINUE;
@@ -1075,8 +1099,6 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
 	asm volatile("fnstsw %0": "+m"(fsw));
 	ctxt->ops->put_fpu(ctxt);
 
-	/* force 2 byte destination */
-	ctxt->dst.bytes = 2;
 	ctxt->dst.val = fsw;
 
 	return X86EMUL_CONTINUE;
@@ -1223,6 +1245,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 			else {
 				modrm_ea += reg_read(ctxt, base_reg);
 				adjust_modrm_seg(ctxt, base_reg);
+				/* Increment ESP on POP [ESP] */
+				if ((ctxt->d & IncSP) &&
+				    base_reg == VCPU_REGS_RSP)
+					modrm_ea += ctxt->op_bytes;
 			}
 			if (index_reg != 4)
 				modrm_ea += reg_read(ctxt, index_reg) << scale;
@@ -1435,10 +1461,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
 		ops->get_gdt(ctxt, dt);
 }
 
-/* allowed just for 8 bytes segments */
-static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
-				   u16 selector, struct desc_struct *desc,
-				   ulong *desc_addr_p)
+static int get_descriptor_ptr(struct x86_emulate_ctxt *ctxt,
+			      u16 selector, ulong *desc_addr_p)
 {
 	struct desc_ptr dt;
 	u16 index = selector >> 3;
@@ -1449,8 +1473,34 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	if (dt.size < index * 8 + 7)
 		return emulate_gp(ctxt, selector & 0xfffc);
 
-	*desc_addr_p = addr = dt.address + index * 8;
-	return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
+	addr = dt.address + index * 8;
+
+#ifdef CONFIG_X86_64
+	if (addr >> 32 != 0) {
+		u64 efer = 0;
+
+		ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+		if (!(efer & EFER_LMA))
+			addr &= (u32)-1;
+	}
+#endif
+
+	*desc_addr_p = addr;
+	return X86EMUL_CONTINUE;
+}
+
+/* allowed just for 8 bytes segments */
+static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+				   u16 selector, struct desc_struct *desc,
+				   ulong *desc_addr_p)
+{
+	int rc;
+
+	rc = get_descriptor_ptr(ctxt, selector, desc_addr_p);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	return ctxt->ops->read_std(ctxt, *desc_addr_p, desc, sizeof(*desc),
 				   &ctxt->exception);
 }
 
@@ -1458,16 +1508,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 				    u16 selector, struct desc_struct *desc)
 {
-	struct desc_ptr dt;
-	u16 index = selector >> 3;
+	int rc;
 	ulong addr;
 
-	get_descriptor_table_ptr(ctxt, selector, &dt);
-
-	if (dt.size < index * 8 + 7)
-		return emulate_gp(ctxt, selector & 0xfffc);
+	rc = get_descriptor_ptr(ctxt, selector, &addr);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
 
-	addr = dt.address + index * 8;
 	return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc,
 				    &ctxt->exception);
 }
@@ -1475,7 +1522,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 /* Does not support long mode */
 static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 				     u16 selector, int seg, u8 cpl,
-				     bool in_task_switch,
+				     enum x86_transfer_type transfer,
 				     struct desc_struct *desc)
 {
 	struct desc_struct seg_desc, old_desc;
@@ -1529,11 +1576,15 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 		return ret;
 
 	err_code = selector & 0xfffc;
-	err_vec = in_task_switch ? TS_VECTOR : GP_VECTOR;
+	err_vec = (transfer == X86_TRANSFER_TASK_SWITCH) ? TS_VECTOR :
+							   GP_VECTOR;
 
 	/* can't load system descriptor into segment selector */
-	if (seg <= VCPU_SREG_GS && !seg_desc.s)
+	if (seg <= VCPU_SREG_GS && !seg_desc.s) {
+		if (transfer == X86_TRANSFER_CALL_JMP)
+			return X86EMUL_UNHANDLEABLE;
 		goto exception;
+	}
 
 	if (!seg_desc.p) {
 		err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
@@ -1605,10 +1656,13 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 
 	if (seg_desc.s) {
 		/* mark segment as accessed */
-		seg_desc.type |= 1;
-		ret = write_segment_descriptor(ctxt, selector, &seg_desc);
-		if (ret != X86EMUL_CONTINUE)
-			return ret;
+		if (!(seg_desc.type & 1)) {
+			seg_desc.type |= 1;
+			ret = write_segment_descriptor(ctxt, selector,
+						       &seg_desc);
+			if (ret != X86EMUL_CONTINUE)
+				return ret;
+		}
 	} else if (ctxt->mode == X86EMUL_MODE_PROT64) {
 		ret = ctxt->ops->read_std(ctxt, desc_addr+8, &base3,
 				sizeof(base3), &ctxt->exception);
@@ -1631,7 +1685,8 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 				   u16 selector, int seg)
 {
 	u8 cpl = ctxt->ops->cpl(ctxt);
-	return __load_segment_descriptor(ctxt, selector, seg, cpl, false, NULL);
+	return __load_segment_descriptor(ctxt, selector, seg, cpl,
+					 X86_TRANSFER_NONE, NULL);
 }
 
 static void write_register_operand(struct operand *op)
@@ -1828,12 +1883,14 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
 	unsigned long selector;
 	int rc;
 
-	rc = emulate_pop(ctxt, &selector, ctxt->op_bytes);
+	rc = emulate_pop(ctxt, &selector, 2);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
 	if (ctxt->modrm_reg == VCPU_SREG_SS)
 		ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
+	if (ctxt->op_bytes > 2)
+		rsp_increment(ctxt, ctxt->op_bytes - 2);
 
 	rc = load_segment_descriptor(ctxt, (u16)selector, seg);
 	return rc;
@@ -2007,6 +2064,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
 
 	ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
 	ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
+	ctxt->ops->set_nmi_mask(ctxt, false);
 
 	return rc;
 }
@@ -2041,7 +2099,8 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
 
 	memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
 
-	rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false,
+	rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl,
+				       X86_TRANSFER_CALL_JMP,
 				       &new_desc);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
@@ -2130,7 +2189,8 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
 	/* Outer-privilege level return is not implemented */
 	if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl)
 		return X86EMUL_UNHANDLEABLE;
-	rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl, false,
+	rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl,
+				       X86_TRANSFER_RET,
 				       &new_desc);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
@@ -2163,12 +2223,15 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
 	fastop(ctxt, em_cmp);
 
 	if (ctxt->eflags & EFLG_ZF) {
-		/* Success: write back to memory. */
+		/* Success: write back to memory; no update of EAX */
+		ctxt->src.type = OP_NONE;
 		ctxt->dst.val = ctxt->src.orig_val;
 	} else {
 		/* Failure: write the value we saw to EAX. */
-		ctxt->dst.type = OP_REG;
-		ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+		ctxt->src.type = OP_REG;
+		ctxt->src.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+		ctxt->src.val = ctxt->dst.orig_val;
+		/* Create write-cycle to dest by writing the same value */
 		ctxt->dst.val = ctxt->dst.orig_val;
 	}
 	return X86EMUL_CONTINUE;
@@ -2556,23 +2619,23 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
 	 * it is handled in a context of new task
 	 */
 	ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 
@@ -2694,31 +2757,31 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 	 * it is handled in a context of new task
 	 */
 	ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR,
-					cpl, true, NULL);
+					cpl, X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl,
-					true, NULL);
+					X86_TRANSFER_TASK_SWITCH, NULL);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 
@@ -2739,7 +2802,6 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 	ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
 			    &ctxt->exception);
 	if (ret != X86EMUL_CONTINUE)
-		/* FIXME: need to provide precise fault address */
 		return ret;
 
 	save_state_to_tss32(ctxt, &tss_seg);
@@ -2748,13 +2810,11 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 	ret = ops->write_std(ctxt, old_tss_base + eip_offset, &tss_seg.eip,
 			     ldt_sel_offset - eip_offset, &ctxt->exception);
 	if (ret != X86EMUL_CONTINUE)
-		/* FIXME: need to provide precise fault address */
 		return ret;
 
 	ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg,
 			    &ctxt->exception);
 	if (ret != X86EMUL_CONTINUE)
-		/* FIXME: need to provide precise fault address */
 		return ret;
 
 	if (old_tss_sel != 0xffff) {
@@ -2765,7 +2825,6 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 				     sizeof tss_seg.prev_task_link,
 				     &ctxt->exception);
 		if (ret != X86EMUL_CONTINUE)
-			/* FIXME: need to provide precise fault address */
 			return ret;
 	}
 
@@ -2999,15 +3058,16 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
 	struct desc_struct old_desc, new_desc;
 	const struct x86_emulate_ops *ops = ctxt->ops;
 	int cpl = ctxt->ops->cpl(ctxt);
+	enum x86emul_mode prev_mode = ctxt->mode;
 
 	old_eip = ctxt->_eip;
 	ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS);
 
 	memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
-	rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false,
-				       &new_desc);
+	rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl,
+				       X86_TRANSFER_CALL_JMP, &new_desc);
 	if (rc != X86EMUL_CONTINUE)
-		return X86EMUL_CONTINUE;
+		return rc;
 
 	rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc);
 	if (rc != X86EMUL_CONTINUE)
@@ -3022,11 +3082,14 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
 	rc = em_push(ctxt);
 	/* If we failed, we tainted the memory, but the very least we should
 	   restore cs */
-	if (rc != X86EMUL_CONTINUE)
+	if (rc != X86EMUL_CONTINUE) {
+		pr_warn_once("faulting far call emulation tainted memory\n");
 		goto fail;
+	}
 	return rc;
 fail:
 	ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS);
+	ctxt->mode = prev_mode;
 	return rc;
 
 }
@@ -3477,6 +3540,12 @@ static int em_clflush(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_movsxd(struct x86_emulate_ctxt *ctxt)
+{
+	ctxt->dst.val = (s32) ctxt->src.val;
+	return X86EMUL_CONTINUE;
+}
+
 static bool valid_cr(int nr)
 {
 	switch (nr) {
@@ -3676,6 +3745,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
 #define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
 #define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
 #define ID(_f, _i) { .flags = ((_f) | InstrDual | ModRM), .u.idual = (_i) }
+#define MD(_f, _m) { .flags = ((_f) | ModeDual), .u.mdual = (_m) }
 #define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) }
 #define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
 #define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) }
@@ -3738,7 +3808,7 @@ static const struct opcode group1[] = {
 };
 
 static const struct opcode group1A[] = {
-	I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N,
+	I(DstMem | SrcNone | Mov | Stack | IncSP, em_pop), N, N, N, N, N, N, N,
 };
 
 static const struct opcode group2[] = {
@@ -3854,7 +3924,7 @@ static const struct gprefix pfx_0f_e7 = {
 };
 
 static const struct escape escape_d9 = { {
-	N, N, N, N, N, N, N, I(DstMem, em_fnstcw),
+	N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstcw),
 }, {
 	/* 0xC0 - 0xC7 */
 	N, N, N, N, N, N, N, N,
@@ -3896,7 +3966,7 @@ static const struct escape escape_db = { {
 } };
 
 static const struct escape escape_dd = { {
-	N, N, N, N, N, N, N, I(DstMem, em_fnstsw),
+	N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstsw),
 }, {
 	/* 0xC0 - 0xC7 */
 	N, N, N, N, N, N, N, N,
@@ -3920,6 +3990,10 @@ static const struct instr_dual instr_dual_0f_c3 = {
 	I(DstMem | SrcReg | ModRM | No16 | Mov, em_mov), N
 };
 
+static const struct mode_dual mode_dual_63 = {
+	N, I(DstReg | SrcMem32 | ModRM | Mov, em_movsxd)
+};
+
 static const struct opcode opcode_table[256] = {
 	/* 0x00 - 0x07 */
 	F6ALU(Lock, em_add),
@@ -3954,7 +4028,7 @@ static const struct opcode opcode_table[256] = {
 	/* 0x60 - 0x67 */
 	I(ImplicitOps | Stack | No64, em_pusha),
 	I(ImplicitOps | Stack | No64, em_popa),
-	N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ ,
+	N, MD(ModRM, &mode_dual_63),
 	N, N, N, N,
 	/* 0x68 - 0x6F */
 	I(SrcImm | Mov | Stack, em_push),
@@ -4010,8 +4084,8 @@ static const struct opcode opcode_table[256] = {
 	G(ByteOp, group11), G(0, group11),
 	/* 0xC8 - 0xCF */
 	I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave),
-	I(ImplicitOps | Stack | SrcImmU16, em_ret_far_imm),
-	I(ImplicitOps | Stack, em_ret_far),
+	I(ImplicitOps | SrcImmU16, em_ret_far_imm),
+	I(ImplicitOps, em_ret_far),
 	D(ImplicitOps), DI(SrcImmByte, intn),
 	D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
 	/* 0xD0 - 0xD7 */
@@ -4108,7 +4182,7 @@ static const struct opcode twobyte_table[256] = {
 	F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
 	GD(0, &group15), F(DstReg | SrcMem | ModRM, em_imul),
 	/* 0xB0 - 0xB7 */
-	I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg),
+	I2bv(DstMem | SrcReg | ModRM | Lock | PageTable | SrcWrite, em_cmpxchg),
 	I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
 	F(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
 	I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
@@ -4174,6 +4248,8 @@ static const struct opcode opcode_map_0f_38[256] = {
 #undef I
 #undef GP
 #undef EXT
+#undef MD
+#undef ID
 
 #undef D2bv
 #undef D2bvIP
@@ -4563,6 +4639,12 @@ done_prefixes:
 			else
 				opcode = opcode.u.idual->mod012;
 			break;
+		case ModeDual:
+			if (ctxt->mode == X86EMUL_MODE_PROT64)
+				opcode = opcode.u.mdual->mode64;
+			else
+				opcode = opcode.u.mdual->mode32;
+			break;
 		default:
 			return EMULATION_FAILED;
 		}
@@ -4860,10 +4942,16 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 		/* optimisation - avoid slow emulated read if Mov */
 		rc = segmented_read(ctxt, ctxt->dst.addr.mem,
 				   &ctxt->dst.val, ctxt->dst.bytes);
-		if (rc != X86EMUL_CONTINUE)
+		if (rc != X86EMUL_CONTINUE) {
+			if (!(ctxt->d & NoWrite) &&
+			    rc == X86EMUL_PROPAGATE_FAULT &&
+			    ctxt->exception.vector == PF_VECTOR)
+				ctxt->exception.error_code |= PFERR_WRITE_MASK;
 			goto done;
+		}
 	}
-	ctxt->dst.orig_val = ctxt->dst.val;
+	/* Copy full 64-bit value for CMPXCHG8B.  */
+	ctxt->dst.orig_val64 = ctxt->dst.val64;
 
 special_insn:
 
@@ -4899,11 +4987,6 @@ special_insn:
 		goto threebyte_insn;
 
 	switch (ctxt->b) {
-	case 0x63:		/* movsxd */
-		if (ctxt->mode != X86EMUL_MODE_PROT64)
-			goto cannot_emulate;
-		ctxt->dst.val = (s32) ctxt->src.val;
-		break;
 	case 0x70 ... 0x7f: /* jcc (short) */
 		if (test_cc(ctxt->b, ctxt->eflags))
 			rc = jmp_rel(ctxt, ctxt->src.val);
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index cc31f7c06d3d..9541ba34126b 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -507,6 +507,7 @@ static int picdev_read(struct kvm_pic *s,
 		return -EOPNOTSUPP;
 
 	if (len != 1) {
+		memset(val, 0, len);
 		pr_pic_unimpl("non byte read\n");
 		return 0;
 	}
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index 3c9195535ffc..c2e36d934af4 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -98,7 +98,7 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
 }
 
 void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
-int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 		int short_hand, unsigned int dest, int dest_mode);
 int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
 void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c
index 17b73eeac8a4..7dbced309ddb 100644
--- a/arch/x86/kvm/iommu.c
+++ b/arch/x86/kvm/iommu.c
@@ -138,7 +138,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
 
 		gfn += page_size >> PAGE_SHIFT;
 
-
+		cond_resched();
 	}
 
 	return 0;
@@ -306,6 +306,8 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
 		kvm_unpin_pages(kvm, pfn, unmap_pages);
 
 		gfn += unmap_pages;
+
+		cond_resched();
 	}
 }
 
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index d52dcf0776ea..bd4e34de24c7 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -33,6 +33,7 @@
 #include <asm/page.h>
 #include <asm/current.h>
 #include <asm/apicdef.h>
+#include <asm/delay.h>
 #include <linux/atomic.h>
 #include <linux/jump_label.h>
 #include "kvm_cache_regs.h"
@@ -327,17 +328,24 @@ static u8 count_vectors(void *bitmap)
 	return count;
 }
 
-void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
+void __kvm_apic_update_irr(u32 *pir, void *regs)
 {
 	u32 i, pir_val;
-	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	for (i = 0; i <= 7; i++) {
 		pir_val = xchg(&pir[i], 0);
 		if (pir_val)
-			*((u32 *)(apic->regs + APIC_IRR + i * 0x10)) |= pir_val;
+			*((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val;
 	}
 }
+EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
+
+void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	__kvm_apic_update_irr(pir, apic->regs);
+}
 EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
 
 static inline void apic_set_irr(int vec, struct kvm_lapic *apic)
@@ -405,7 +413,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
 	 * because the processor can modify ISR under the hood.  Instead
 	 * just set SVI.
 	 */
-	if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+	if (unlikely(kvm_x86_ops->hwapic_isr_update))
 		kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec);
 	else {
 		++apic->isr_count;
@@ -453,7 +461,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
 	 * on the other hand isr_count and highest_isr_cache are unused
 	 * and must be left alone.
 	 */
-	if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+	if (unlikely(kvm_x86_ops->hwapic_isr_update))
 		kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
 					       apic_find_highest_isr(apic));
 	else {
@@ -580,55 +588,48 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
 	apic_update_ppr(apic);
 }
 
-static int kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest)
+static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest)
 {
 	return dest == (apic_x2apic_mode(apic) ?
 			X2APIC_BROADCAST : APIC_BROADCAST);
 }
 
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest)
+static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest)
 {
 	return kvm_apic_id(apic) == dest || kvm_apic_broadcast(apic, dest);
 }
 
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
+static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
 {
-	int result = 0;
 	u32 logical_id;
 
 	if (kvm_apic_broadcast(apic, mda))
-		return 1;
+		return true;
 
-	if (apic_x2apic_mode(apic)) {
-		logical_id = kvm_apic_get_reg(apic, APIC_LDR);
-		return logical_id & mda;
-	}
+	logical_id = kvm_apic_get_reg(apic, APIC_LDR);
 
-	logical_id = GET_APIC_LOGICAL_ID(kvm_apic_get_reg(apic, APIC_LDR));
+	if (apic_x2apic_mode(apic))
+		return ((logical_id >> 16) == (mda >> 16))
+		       && (logical_id & mda & 0xffff) != 0;
+
+	logical_id = GET_APIC_LOGICAL_ID(logical_id);
 
 	switch (kvm_apic_get_reg(apic, APIC_DFR)) {
 	case APIC_DFR_FLAT:
-		if (logical_id & mda)
-			result = 1;
-		break;
+		return (logical_id & mda) != 0;
 	case APIC_DFR_CLUSTER:
-		if (((logical_id >> 4) == (mda >> 0x4))
-		    && (logical_id & mda & 0xf))
-			result = 1;
-		break;
+		return ((logical_id >> 4) == (mda >> 4))
+		       && (logical_id & mda & 0xf) != 0;
 	default:
 		apic_debug("Bad DFR vcpu %d: %08x\n",
 			   apic->vcpu->vcpu_id, kvm_apic_get_reg(apic, APIC_DFR));
-		break;
+		return false;
 	}
-
-	return result;
 }
 
-int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 			   int short_hand, unsigned int dest, int dest_mode)
 {
-	int result = 0;
 	struct kvm_lapic *target = vcpu->arch.apic;
 
 	apic_debug("target %p, source %p, dest 0x%x, "
@@ -638,29 +639,21 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 	ASSERT(target);
 	switch (short_hand) {
 	case APIC_DEST_NOSHORT:
-		if (dest_mode == 0)
-			/* Physical mode. */
-			result = kvm_apic_match_physical_addr(target, dest);
+		if (dest_mode == APIC_DEST_PHYSICAL)
+			return kvm_apic_match_physical_addr(target, dest);
 		else
-			/* Logical mode. */
-			result = kvm_apic_match_logical_addr(target, dest);
-		break;
+			return kvm_apic_match_logical_addr(target, dest);
 	case APIC_DEST_SELF:
-		result = (target == source);
-		break;
+		return target == source;
 	case APIC_DEST_ALLINC:
-		result = 1;
-		break;
+		return true;
 	case APIC_DEST_ALLBUT:
-		result = (target != source);
-		break;
+		return target != source;
 	default:
 		apic_debug("kvm: apic: Bad dest shorthand value %x\n",
 			   short_hand);
-		break;
+		return false;
 	}
-
-	return result;
 }
 
 bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
@@ -693,7 +686,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
 
 	ret = true;
 
-	if (irq->dest_mode == 0) { /* physical mode */
+	if (irq->dest_mode == APIC_DEST_PHYSICAL) {
 		if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
 			goto out;
 
@@ -1076,25 +1069,72 @@ static void apic_timer_expired(struct kvm_lapic *apic)
 {
 	struct kvm_vcpu *vcpu = apic->vcpu;
 	wait_queue_head_t *q = &vcpu->wq;
+	struct kvm_timer *ktimer = &apic->lapic_timer;
 
-	/*
-	 * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
-	 * vcpu_enter_guest.
-	 */
 	if (atomic_read(&apic->lapic_timer.pending))
 		return;
 
 	atomic_inc(&apic->lapic_timer.pending);
-	/* FIXME: this code should not know anything about vcpus */
-	kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+	kvm_set_pending_timer(vcpu);
 
 	if (waitqueue_active(q))
 		wake_up_interruptible(q);
+
+	if (apic_lvtt_tscdeadline(apic))
+		ktimer->expired_tscdeadline = ktimer->tscdeadline;
+}
+
+/*
+ * On APICv, this test will cause a busy wait
+ * during a higher-priority task.
+ */
+
+static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	u32 reg = kvm_apic_get_reg(apic, APIC_LVTT);
+
+	if (kvm_apic_hw_enabled(apic)) {
+		int vec = reg & APIC_VECTOR_MASK;
+		void *bitmap = apic->regs + APIC_ISR;
+
+		if (kvm_x86_ops->deliver_posted_interrupt)
+			bitmap = apic->regs + APIC_IRR;
+
+		if (apic_test_vector(vec, bitmap))
+			return true;
+	}
+	return false;
+}
+
+void wait_lapic_expire(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	u64 guest_tsc, tsc_deadline;
+
+	if (!kvm_vcpu_has_lapic(vcpu))
+		return;
+
+	if (apic->lapic_timer.expired_tscdeadline == 0)
+		return;
+
+	if (!lapic_timer_int_injected(vcpu))
+		return;
+
+	tsc_deadline = apic->lapic_timer.expired_tscdeadline;
+	apic->lapic_timer.expired_tscdeadline = 0;
+	guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+	trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
+
+	/* __delay is delay_tsc whenever the hardware has TSC, thus always.  */
+	if (guest_tsc < tsc_deadline)
+		__delay(tsc_deadline - guest_tsc);
 }
 
 static void start_apic_timer(struct kvm_lapic *apic)
 {
 	ktime_t now;
+
 	atomic_set(&apic->lapic_timer.pending, 0);
 
 	if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
@@ -1140,6 +1180,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 		/* lapic timer in tsc deadline mode */
 		u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
 		u64 ns = 0;
+		ktime_t expire;
 		struct kvm_vcpu *vcpu = apic->vcpu;
 		unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
 		unsigned long flags;
@@ -1154,8 +1195,10 @@ static void start_apic_timer(struct kvm_lapic *apic)
 		if (likely(tscdeadline > guest_tsc)) {
 			ns = (tscdeadline - guest_tsc) * 1000000ULL;
 			do_div(ns, this_tsc_khz);
+			expire = ktime_add_ns(now, ns);
+			expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
 			hrtimer_start(&apic->lapic_timer.timer,
-				ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
+				      expire, HRTIMER_MODE_ABS);
 		} else
 			apic_timer_expired(apic);
 
@@ -1529,7 +1572,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 		apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
 	}
 	apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm);
-	apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm);
+	apic->isr_count = kvm_x86_ops->hwapic_isr_update ? 1 : 0;
 	apic->highest_isr_cache = -1;
 	update_divide_count(apic);
 	atomic_set(&apic->lapic_timer.pending, 0);
@@ -1739,13 +1782,15 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
 	update_divide_count(apic);
 	start_apic_timer(apic);
 	apic->irr_pending = true;
-	apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm) ?
+	apic->isr_count = kvm_x86_ops->hwapic_isr_update ?
 				1 : count_vectors(apic->regs + APIC_ISR);
 	apic->highest_isr_cache = -1;
 	if (kvm_x86_ops->hwapic_irr_update)
 		kvm_x86_ops->hwapic_irr_update(vcpu,
 				apic_find_highest_irr(apic));
-	kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic));
+	if (unlikely(kvm_x86_ops->hwapic_isr_update))
+		kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
+				apic_find_highest_isr(apic));
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	kvm_rtc_eoi_tracking_restore_one(vcpu);
 }
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index c674fce53cf9..0bc6c656625b 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -14,6 +14,7 @@ struct kvm_timer {
 	u32 timer_mode;
 	u32 timer_mode_mask;
 	u64 tscdeadline;
+	u64 expired_tscdeadline;
 	atomic_t pending;			/* accumulated triggered timers */
 };
 
@@ -56,9 +57,8 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
 void kvm_apic_set_version(struct kvm_vcpu *vcpu);
 
 void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr);
+void __kvm_apic_update_irr(u32 *pir, void *regs);
 void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest);
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda);
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
 		unsigned long *dest_map);
 int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
@@ -170,4 +170,6 @@ static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
 
 bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
 
+void wait_lapic_expire(struct kvm_vcpu *vcpu);
+
 #endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f83fc6c5e0ba..cee759299a35 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -63,30 +63,16 @@ enum {
 #undef MMU_DEBUG
 
 #ifdef MMU_DEBUG
+static bool dbg = 0;
+module_param(dbg, bool, 0644);
 
 #define pgprintk(x...) do { if (dbg) printk(x); } while (0)
 #define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
-
+#define MMU_WARN_ON(x) WARN_ON(x)
 #else
-
 #define pgprintk(x...) do { } while (0)
 #define rmap_printk(x...) do { } while (0)
-
-#endif
-
-#ifdef MMU_DEBUG
-static bool dbg = 0;
-module_param(dbg, bool, 0644);
-#endif
-
-#ifndef MMU_DEBUG
-#define ASSERT(x) do { } while (0)
-#else
-#define ASSERT(x)							\
-	if (!(x)) {							\
-		printk(KERN_WARNING "assertion failed %s:%d: %s\n",	\
-		       __FILE__, __LINE__, #x);				\
-	}
+#define MMU_WARN_ON(x) do { } while (0)
 #endif
 
 #define PTE_PREFETCH_NUM		8
@@ -546,6 +532,11 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
 	return (old_spte & bit_mask) && !(new_spte & bit_mask);
 }
 
+static bool spte_is_bit_changed(u64 old_spte, u64 new_spte, u64 bit_mask)
+{
+	return (old_spte & bit_mask) != (new_spte & bit_mask);
+}
+
 /* Rules for using mmu_spte_set:
  * Set the sptep from nonpresent to present.
  * Note: the sptep being assigned *must* be either not present
@@ -596,6 +587,14 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
 	if (!shadow_accessed_mask)
 		return ret;
 
+	/*
+	 * Flush TLB when accessed/dirty bits are changed in the page tables,
+	 * to guarantee consistency between TLB and page tables.
+	 */
+	if (spte_is_bit_changed(old_spte, new_spte,
+                                shadow_accessed_mask | shadow_dirty_mask))
+		ret = true;
+
 	if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
 	if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
@@ -1216,6 +1215,60 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
 	return flush;
 }
 
+static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep)
+{
+	u64 spte = *sptep;
+
+	rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
+
+	spte &= ~shadow_dirty_mask;
+
+	return mmu_spte_update(sptep, spte);
+}
+
+static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
+{
+	u64 *sptep;
+	struct rmap_iterator iter;
+	bool flush = false;
+
+	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
+		BUG_ON(!(*sptep & PT_PRESENT_MASK));
+
+		flush |= spte_clear_dirty(kvm, sptep);
+		sptep = rmap_get_next(&iter);
+	}
+
+	return flush;
+}
+
+static bool spte_set_dirty(struct kvm *kvm, u64 *sptep)
+{
+	u64 spte = *sptep;
+
+	rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
+
+	spte |= shadow_dirty_mask;
+
+	return mmu_spte_update(sptep, spte);
+}
+
+static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp)
+{
+	u64 *sptep;
+	struct rmap_iterator iter;
+	bool flush = false;
+
+	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
+		BUG_ON(!(*sptep & PT_PRESENT_MASK));
+
+		flush |= spte_set_dirty(kvm, sptep);
+		sptep = rmap_get_next(&iter);
+	}
+
+	return flush;
+}
+
 /**
  * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
  * @kvm: kvm instance
@@ -1226,7 +1279,7 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
  * Used when we do not need to care about huge page mappings: e.g. during dirty
  * logging we do not have any such mappings.
  */
-void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 				     struct kvm_memory_slot *slot,
 				     gfn_t gfn_offset, unsigned long mask)
 {
@@ -1242,6 +1295,53 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 	}
 }
 
+/**
+ * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages
+ * @kvm: kvm instance
+ * @slot: slot to clear D-bit
+ * @gfn_offset: start of the BITS_PER_LONG pages we care about
+ * @mask: indicates which pages we should clear D-bit
+ *
+ * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
+ */
+void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
+				     struct kvm_memory_slot *slot,
+				     gfn_t gfn_offset, unsigned long mask)
+{
+	unsigned long *rmapp;
+
+	while (mask) {
+		rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+				      PT_PAGE_TABLE_LEVEL, slot);
+		__rmap_clear_dirty(kvm, rmapp);
+
+		/* clear the first set bit */
+		mask &= mask - 1;
+	}
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
+
+/**
+ * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
+ * PT level pages.
+ *
+ * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
+ * enable dirty logging for them.
+ *
+ * Used when we do not need to care about huge page mappings: e.g. during dirty
+ * logging we do not have any such mappings.
+ */
+void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
+				struct kvm_memory_slot *slot,
+				gfn_t gfn_offset, unsigned long mask)
+{
+	if (kvm_x86_ops->enable_log_dirty_pt_masked)
+		kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
+				mask);
+	else
+		kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
+}
+
 static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
 {
 	struct kvm_memory_slot *slot;
@@ -1536,7 +1636,7 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
 
 static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
 {
-	ASSERT(is_empty_shadow_page(sp->spt));
+	MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
 	hlist_del(&sp->hash_link);
 	list_del(&sp->link);
 	free_page((unsigned long)sp->spt);
@@ -2501,8 +2601,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		}
 	}
 
-	if (pte_access & ACC_WRITE_MASK)
+	if (pte_access & ACC_WRITE_MASK) {
 		mark_page_dirty(vcpu->kvm, gfn);
+		spte |= shadow_dirty_mask;
+	}
 
 set_pte:
 	if (mmu_spte_update(sptep, spte))
@@ -2818,6 +2920,18 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	 */
 	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
 
+	/*
+	 * Theoretically we could also set dirty bit (and flush TLB) here in
+	 * order to eliminate unnecessary PML logging. See comments in
+	 * set_spte. But fast_page_fault is very unlikely to happen with PML
+	 * enabled, so we do not do this. This might result in the same GPA
+	 * to be logged in PML buffer again when the write really happens, and
+	 * eventually to be called by mark_page_dirty twice. But it's also no
+	 * harm. This also avoids the TLB flush needed after setting dirty bit
+	 * so non-PML cases won't be impacted.
+	 *
+	 * Compare with set_spte where instead shadow_dirty_mask is set.
+	 */
 	if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
 		mark_page_dirty(vcpu->kvm, gfn);
 
@@ -3041,7 +3155,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 		for (i = 0; i < 4; ++i) {
 			hpa_t root = vcpu->arch.mmu.pae_root[i];
 
-			ASSERT(!VALID_PAGE(root));
+			MMU_WARN_ON(VALID_PAGE(root));
 			spin_lock(&vcpu->kvm->mmu_lock);
 			make_mmu_pages_available(vcpu);
 			sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
@@ -3079,7 +3193,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
 
-		ASSERT(!VALID_PAGE(root));
+		MMU_WARN_ON(VALID_PAGE(root));
 
 		spin_lock(&vcpu->kvm->mmu_lock);
 		make_mmu_pages_available(vcpu);
@@ -3104,7 +3218,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 	for (i = 0; i < 4; ++i) {
 		hpa_t root = vcpu->arch.mmu.pae_root[i];
 
-		ASSERT(!VALID_PAGE(root));
+		MMU_WARN_ON(VALID_PAGE(root));
 		if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
 			pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
 			if (!is_present_gpte(pdptr)) {
@@ -3329,8 +3443,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 	if (r)
 		return r;
 
-	ASSERT(vcpu);
-	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+	MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
 	gfn = gva >> PAGE_SHIFT;
 
@@ -3396,8 +3509,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	int write = error_code & PFERR_WRITE_MASK;
 	bool map_writable;
 
-	ASSERT(vcpu);
-	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+	MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
 	if (unlikely(error_code & PFERR_RSVD_MASK)) {
 		r = handle_mmio_page_fault(vcpu, gpa, error_code, true);
@@ -3718,7 +3830,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
 	update_permission_bitmask(vcpu, context, false);
 	update_last_pte_bitmap(vcpu, context);
 
-	ASSERT(is_pae(vcpu));
+	MMU_WARN_ON(!is_pae(vcpu));
 	context->page_fault = paging64_page_fault;
 	context->gva_to_gpa = paging64_gva_to_gpa;
 	context->sync_page = paging64_sync_page;
@@ -3763,7 +3875,7 @@ static void paging32E_init_context(struct kvm_vcpu *vcpu,
 
 static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 {
-	struct kvm_mmu *context = vcpu->arch.walk_mmu;
+	struct kvm_mmu *context = &vcpu->arch.mmu;
 
 	context->base_role.word = 0;
 	context->page_fault = tdp_page_fault;
@@ -3803,11 +3915,12 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	update_last_pte_bitmap(vcpu, context);
 }
 
-void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
+void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
 {
 	bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
-	ASSERT(vcpu);
-	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+	struct kvm_mmu *context = &vcpu->arch.mmu;
+
+	MMU_WARN_ON(VALID_PAGE(context->root_hpa));
 
 	if (!is_paging(vcpu))
 		nonpaging_init_context(vcpu, context);
@@ -3818,19 +3931,19 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 	else
 		paging32_init_context(vcpu, context);
 
-	vcpu->arch.mmu.base_role.nxe = is_nx(vcpu);
-	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
-	vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
-	vcpu->arch.mmu.base_role.smep_andnot_wp
+	context->base_role.nxe = is_nx(vcpu);
+	context->base_role.cr4_pae = !!is_pae(vcpu);
+	context->base_role.cr0_wp  = is_write_protection(vcpu);
+	context->base_role.smep_andnot_wp
 		= smep && !is_write_protection(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
 
-void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
-		bool execonly)
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
 {
-	ASSERT(vcpu);
-	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+	struct kvm_mmu *context = &vcpu->arch.mmu;
+
+	MMU_WARN_ON(VALID_PAGE(context->root_hpa));
 
 	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
 
@@ -3851,11 +3964,13 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
 
 static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
 {
-	kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
-	vcpu->arch.walk_mmu->set_cr3           = kvm_x86_ops->set_cr3;
-	vcpu->arch.walk_mmu->get_cr3           = get_cr3;
-	vcpu->arch.walk_mmu->get_pdptr         = kvm_pdptr_read;
-	vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
+	struct kvm_mmu *context = &vcpu->arch.mmu;
+
+	kvm_init_shadow_mmu(vcpu);
+	context->set_cr3           = kvm_x86_ops->set_cr3;
+	context->get_cr3           = get_cr3;
+	context->get_pdptr         = kvm_pdptr_read;
+	context->inject_page_fault = kvm_inject_page_fault;
 }
 
 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
@@ -3900,17 +4015,15 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 static void init_kvm_mmu(struct kvm_vcpu *vcpu)
 {
 	if (mmu_is_nested(vcpu))
-		return init_kvm_nested_mmu(vcpu);
+		init_kvm_nested_mmu(vcpu);
 	else if (tdp_enabled)
-		return init_kvm_tdp_mmu(vcpu);
+		init_kvm_tdp_mmu(vcpu);
 	else
-		return init_kvm_softmmu(vcpu);
+		init_kvm_softmmu(vcpu);
 }
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
 {
-	ASSERT(vcpu);
-
 	kvm_mmu_unload(vcpu);
 	init_kvm_mmu(vcpu);
 }
@@ -4266,8 +4379,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
 	struct page *page;
 	int i;
 
-	ASSERT(vcpu);
-
 	/*
 	 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
 	 * Therefore we need to allocate shadow page tables in the first
@@ -4286,8 +4397,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
 
 int kvm_mmu_create(struct kvm_vcpu *vcpu)
 {
-	ASSERT(vcpu);
-
 	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 	vcpu->arch.mmu.translate_gpa = translate_gpa;
@@ -4298,19 +4407,18 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
 
 void kvm_mmu_setup(struct kvm_vcpu *vcpu)
 {
-	ASSERT(vcpu);
-	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+	MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
 	init_kvm_mmu(vcpu);
 }
 
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+				      struct kvm_memory_slot *memslot)
 {
-	struct kvm_memory_slot *memslot;
 	gfn_t last_gfn;
 	int i;
+	bool flush = false;
 
-	memslot = id_to_memslot(kvm->memslots, slot);
 	last_gfn = memslot->base_gfn + memslot->npages - 1;
 
 	spin_lock(&kvm->mmu_lock);
@@ -4325,7 +4433,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 
 		for (index = 0; index <= last_index; ++index, ++rmapp) {
 			if (*rmapp)
-				__rmap_write_protect(kvm, rmapp, false);
+				flush |= __rmap_write_protect(kvm, rmapp,
+						false);
 
 			if (need_resched() || spin_needbreak(&kvm->mmu_lock))
 				cond_resched_lock(&kvm->mmu_lock);
@@ -4352,8 +4461,124 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 	 * instead of PT_WRITABLE_MASK, that means it does not depend
 	 * on PT_WRITABLE_MASK anymore.
 	 */
-	kvm_flush_remote_tlbs(kvm);
+	if (flush)
+		kvm_flush_remote_tlbs(kvm);
+}
+
+void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
+				   struct kvm_memory_slot *memslot)
+{
+	gfn_t last_gfn;
+	unsigned long *rmapp;
+	unsigned long last_index, index;
+	bool flush = false;
+
+	last_gfn = memslot->base_gfn + memslot->npages - 1;
+
+	spin_lock(&kvm->mmu_lock);
+
+	rmapp = memslot->arch.rmap[PT_PAGE_TABLE_LEVEL - 1];
+	last_index = gfn_to_index(last_gfn, memslot->base_gfn,
+			PT_PAGE_TABLE_LEVEL);
+
+	for (index = 0; index <= last_index; ++index, ++rmapp) {
+		if (*rmapp)
+			flush |= __rmap_clear_dirty(kvm, rmapp);
+
+		if (need_resched() || spin_needbreak(&kvm->mmu_lock))
+			cond_resched_lock(&kvm->mmu_lock);
+	}
+
+	spin_unlock(&kvm->mmu_lock);
+
+	lockdep_assert_held(&kvm->slots_lock);
+
+	/*
+	 * It's also safe to flush TLBs out of mmu lock here as currently this
+	 * function is only used for dirty logging, in which case flushing TLB
+	 * out of mmu lock also guarantees no dirty pages will be lost in
+	 * dirty_bitmap.
+	 */
+	if (flush)
+		kvm_flush_remote_tlbs(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
+
+void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
+					struct kvm_memory_slot *memslot)
+{
+	gfn_t last_gfn;
+	int i;
+	bool flush = false;
+
+	last_gfn = memslot->base_gfn + memslot->npages - 1;
+
+	spin_lock(&kvm->mmu_lock);
+
+	for (i = PT_PAGE_TABLE_LEVEL + 1; /* skip rmap for 4K page */
+	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+		unsigned long *rmapp;
+		unsigned long last_index, index;
+
+		rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
+		last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
+
+		for (index = 0; index <= last_index; ++index, ++rmapp) {
+			if (*rmapp)
+				flush |= __rmap_write_protect(kvm, rmapp,
+						false);
+
+			if (need_resched() || spin_needbreak(&kvm->mmu_lock))
+				cond_resched_lock(&kvm->mmu_lock);
+		}
+	}
+	spin_unlock(&kvm->mmu_lock);
+
+	/* see kvm_mmu_slot_remove_write_access */
+	lockdep_assert_held(&kvm->slots_lock);
+
+	if (flush)
+		kvm_flush_remote_tlbs(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
+
+void kvm_mmu_slot_set_dirty(struct kvm *kvm,
+			    struct kvm_memory_slot *memslot)
+{
+	gfn_t last_gfn;
+	int i;
+	bool flush = false;
+
+	last_gfn = memslot->base_gfn + memslot->npages - 1;
+
+	spin_lock(&kvm->mmu_lock);
+
+	for (i = PT_PAGE_TABLE_LEVEL;
+	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+		unsigned long *rmapp;
+		unsigned long last_index, index;
+
+		rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
+		last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
+
+		for (index = 0; index <= last_index; ++index, ++rmapp) {
+			if (*rmapp)
+				flush |= __rmap_set_dirty(kvm, rmapp);
+
+			if (need_resched() || spin_needbreak(&kvm->mmu_lock))
+				cond_resched_lock(&kvm->mmu_lock);
+		}
+	}
+
+	spin_unlock(&kvm->mmu_lock);
+
+	lockdep_assert_held(&kvm->slots_lock);
+
+	/* see kvm_mmu_slot_leaf_clear_dirty */
+	if (flush)
+		kvm_flush_remote_tlbs(kvm);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
 
 #define BATCH_ZAP_PAGES	10
 static void kvm_zap_obsolete_pages(struct kvm *kvm)
@@ -4606,8 +4831,6 @@ EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
 
 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
 {
-	ASSERT(vcpu);
-
 	kvm_mmu_unload(vcpu);
 	free_mmu_pages(vcpu);
 	mmu_free_memory_caches(vcpu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index bde8ee725754..c7d65637c851 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -44,18 +44,6 @@
 #define PT_DIRECTORY_LEVEL 2
 #define PT_PAGE_TABLE_LEVEL 1
 
-#define PFERR_PRESENT_BIT 0
-#define PFERR_WRITE_BIT 1
-#define PFERR_USER_BIT 2
-#define PFERR_RSVD_BIT 3
-#define PFERR_FETCH_BIT 4
-
-#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
-#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
-#define PFERR_USER_MASK (1U << PFERR_USER_BIT)
-#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
-#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
-
 static inline u64 rsvd_bits(int s, int e)
 {
 	return ((1ULL << (e - s + 1)) - 1) << s;
@@ -81,9 +69,8 @@ enum {
 };
 
 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
-void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
-void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
-		bool execonly);
+void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly);
 void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 		bool ept);
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 41dd0387cccb..cc618c882f90 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1583,7 +1583,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 
 static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
-	unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
+	unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
 	unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
 
 	if (cr4 & X86_CR4_VMXE)
@@ -2003,8 +2003,8 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
 
 static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
 {
-	kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
-
+	WARN_ON(mmu_is_nested(vcpu));
+	kvm_init_shadow_mmu(vcpu);
 	vcpu->arch.mmu.set_cr3           = nested_svm_set_tdp_cr3;
 	vcpu->arch.mmu.get_cr3           = nested_svm_get_tdp_cr3;
 	vcpu->arch.mmu.get_pdptr         = nested_svm_get_tdp_pdptr;
@@ -3649,11 +3649,6 @@ static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
 	return;
 }
 
-static void svm_hwapic_isr_update(struct kvm *kvm, int isr)
-{
-	return;
-}
-
 static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu)
 {
 	return;
@@ -4403,7 +4398,6 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
 	.vm_has_apicv = svm_vm_has_apicv,
 	.load_eoi_exitmap = svm_load_eoi_exitmap,
-	.hwapic_isr_update = svm_hwapic_isr_update,
 	.sync_pir_to_irr = svm_sync_pir_to_irr,
 
 	.set_tss_addr = svm_set_tss_addr,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index c2a34bb5ad93..7c7bc8bef21f 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -848,6 +848,24 @@ TRACE_EVENT(kvm_track_tsc,
 
 #endif /* CONFIG_X86_64 */
 
+/*
+ * Tracepoint for PML full VMEXIT.
+ */
+TRACE_EVENT(kvm_pml_full,
+	TP_PROTO(unsigned int vcpu_id),
+	TP_ARGS(vcpu_id),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	vcpu_id			)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id		= vcpu_id;
+	),
+
+	TP_printk("vcpu %d: PML full", __entry->vcpu_id)
+);
+
 TRACE_EVENT(kvm_ple_window,
 	TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old),
 	TP_ARGS(grow, vcpu_id, new, old),
@@ -914,6 +932,26 @@ TRACE_EVENT(kvm_pvclock_update,
 		  __entry->flags)
 );
 
+TRACE_EVENT(kvm_wait_lapic_expire,
+	TP_PROTO(unsigned int vcpu_id, s64 delta),
+	TP_ARGS(vcpu_id, delta),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	vcpu_id		)
+		__field(	s64,		delta		)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id	   = vcpu_id;
+		__entry->delta             = delta;
+	),
+
+	TP_printk("vcpu %u: delta %lld (%s)",
+		  __entry->vcpu_id,
+		  __entry->delta,
+		  __entry->delta < 0 ? "early" : "late")
+);
+
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d4c58d884838..10a481b7674d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -45,6 +45,7 @@
 #include <asm/perf_event.h>
 #include <asm/debugreg.h>
 #include <asm/kexec.h>
+#include <asm/apic.h>
 
 #include "trace.h"
 
@@ -101,6 +102,9 @@ module_param(nested, bool, S_IRUGO);
 
 static u64 __read_mostly host_xss;
 
+static bool __read_mostly enable_pml = 1;
+module_param_named(pml, enable_pml, bool, S_IRUGO);
+
 #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
 #define KVM_VM_CR0_ALWAYS_ON						\
@@ -215,7 +219,12 @@ struct __packed vmcs12 {
 	u64 tsc_offset;
 	u64 virtual_apic_page_addr;
 	u64 apic_access_addr;
+	u64 posted_intr_desc_addr;
 	u64 ept_pointer;
+	u64 eoi_exit_bitmap0;
+	u64 eoi_exit_bitmap1;
+	u64 eoi_exit_bitmap2;
+	u64 eoi_exit_bitmap3;
 	u64 xss_exit_bitmap;
 	u64 guest_physical_address;
 	u64 vmcs_link_pointer;
@@ -330,6 +339,7 @@ struct __packed vmcs12 {
 	u32 vmx_preemption_timer_value;
 	u32 padding32[7]; /* room for future expansion */
 	u16 virtual_processor_id;
+	u16 posted_intr_nv;
 	u16 guest_es_selector;
 	u16 guest_cs_selector;
 	u16 guest_ss_selector;
@@ -338,6 +348,7 @@ struct __packed vmcs12 {
 	u16 guest_gs_selector;
 	u16 guest_ldtr_selector;
 	u16 guest_tr_selector;
+	u16 guest_intr_status;
 	u16 host_es_selector;
 	u16 host_cs_selector;
 	u16 host_ss_selector;
@@ -401,6 +412,10 @@ struct nested_vmx {
 	 */
 	struct page *apic_access_page;
 	struct page *virtual_apic_page;
+	struct page *pi_desc_page;
+	struct pi_desc *pi_desc;
+	bool pi_pending;
+	u16 posted_intr_nv;
 	u64 msr_ia32_feature_control;
 
 	struct hrtimer preemption_timer;
@@ -408,6 +423,23 @@ struct nested_vmx {
 
 	/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
 	u64 vmcs01_debugctl;
+
+	u32 nested_vmx_procbased_ctls_low;
+	u32 nested_vmx_procbased_ctls_high;
+	u32 nested_vmx_true_procbased_ctls_low;
+	u32 nested_vmx_secondary_ctls_low;
+	u32 nested_vmx_secondary_ctls_high;
+	u32 nested_vmx_pinbased_ctls_low;
+	u32 nested_vmx_pinbased_ctls_high;
+	u32 nested_vmx_exit_ctls_low;
+	u32 nested_vmx_exit_ctls_high;
+	u32 nested_vmx_true_exit_ctls_low;
+	u32 nested_vmx_entry_ctls_low;
+	u32 nested_vmx_entry_ctls_high;
+	u32 nested_vmx_true_entry_ctls_low;
+	u32 nested_vmx_misc_low;
+	u32 nested_vmx_misc_high;
+	u32 nested_vmx_ept_caps;
 };
 
 #define POSTED_INTR_ON  0
@@ -511,6 +543,10 @@ struct vcpu_vmx {
 	/* Dynamic PLE window. */
 	int ple_window;
 	bool ple_window_dirty;
+
+	/* Support for PML */
+#define PML_ENTITY_NUM		512
+	struct page *pml_pg;
 };
 
 enum segment_cache_field {
@@ -594,6 +630,7 @@ static int max_shadow_read_write_fields =
 
 static const unsigned short vmcs_field_to_offset_table[] = {
 	FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
+	FIELD(POSTED_INTR_NV, posted_intr_nv),
 	FIELD(GUEST_ES_SELECTOR, guest_es_selector),
 	FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
 	FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
@@ -602,6 +639,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
 	FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
 	FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
 	FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
+	FIELD(GUEST_INTR_STATUS, guest_intr_status),
 	FIELD(HOST_ES_SELECTOR, host_es_selector),
 	FIELD(HOST_CS_SELECTOR, host_cs_selector),
 	FIELD(HOST_SS_SELECTOR, host_ss_selector),
@@ -618,7 +656,12 @@ static const unsigned short vmcs_field_to_offset_table[] = {
 	FIELD64(TSC_OFFSET, tsc_offset),
 	FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
 	FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
+	FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
 	FIELD64(EPT_POINTER, ept_pointer),
+	FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
+	FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
+	FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
+	FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
 	FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
 	FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
 	FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
@@ -766,6 +809,7 @@ static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
 static bool vmx_mpx_supported(void);
 static bool vmx_xsaves_supported(void);
+static int vmx_vm_has_apicv(struct kvm *kvm);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
 			    struct kvm_segment *var, int seg);
@@ -793,6 +837,7 @@ static unsigned long *vmx_msr_bitmap_legacy;
 static unsigned long *vmx_msr_bitmap_longmode;
 static unsigned long *vmx_msr_bitmap_legacy_x2apic;
 static unsigned long *vmx_msr_bitmap_longmode_x2apic;
+static unsigned long *vmx_msr_bitmap_nested;
 static unsigned long *vmx_vmread_bitmap;
 static unsigned long *vmx_vmwrite_bitmap;
 
@@ -959,16 +1004,6 @@ static inline bool cpu_has_vmx_ept_execute_only(void)
 	return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
 }
 
-static inline bool cpu_has_vmx_eptp_uncacheable(void)
-{
-	return vmx_capability.ept & VMX_EPTP_UC_BIT;
-}
-
-static inline bool cpu_has_vmx_eptp_writeback(void)
-{
-	return vmx_capability.ept & VMX_EPTP_WB_BIT;
-}
-
 static inline bool cpu_has_vmx_ept_2m_page(void)
 {
 	return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
@@ -1073,6 +1108,11 @@ static inline bool cpu_has_vmx_shadow_vmcs(void)
 		SECONDARY_EXEC_SHADOW_VMCS;
 }
 
+static inline bool cpu_has_vmx_pml(void)
+{
+	return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
+}
+
 static inline bool report_flexpriority(void)
 {
 	return flexpriority_enabled;
@@ -1112,6 +1152,26 @@ static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
 		vmx_xsaves_supported();
 }
 
+static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
+{
+	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
+}
+
+static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
+{
+	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
+}
+
+static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
+{
+	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+}
+
+static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
+{
+	return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
+}
+
 static inline bool is_exception(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2108,7 +2168,10 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
 {
 	unsigned long *msr_bitmap;
 
-	if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) {
+	if (is_guest_mode(vcpu))
+		msr_bitmap = vmx_msr_bitmap_nested;
+	else if (irqchip_in_kernel(vcpu->kvm) &&
+		apic_x2apic_mode(vcpu->arch.apic)) {
 		if (is_long_mode(vcpu))
 			msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
 		else
@@ -2284,20 +2347,8 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
  * if the corresponding bit in the (32-bit) control field *must* be on, and a
  * bit in the high half is on if the corresponding bit in the control field
  * may be on. See also vmx_control_verify().
- * TODO: allow these variables to be modified (downgraded) by module options
- * or other means.
  */
-static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
-static u32 nested_vmx_true_procbased_ctls_low;
-static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
-static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
-static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
-static u32 nested_vmx_true_exit_ctls_low;
-static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
-static u32 nested_vmx_true_entry_ctls_low;
-static u32 nested_vmx_misc_low, nested_vmx_misc_high;
-static u32 nested_vmx_ept_caps;
-static __init void nested_vmx_setup_ctls_msrs(void)
+static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 {
 	/*
 	 * Note that as a general rule, the high half of the MSRs (bits in
@@ -2316,57 +2367,74 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 
 	/* pin-based controls */
 	rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
-	      nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high);
-	nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
-	nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
-		PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS;
-	nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
+		vmx->nested.nested_vmx_pinbased_ctls_low,
+		vmx->nested.nested_vmx_pinbased_ctls_high);
+	vmx->nested.nested_vmx_pinbased_ctls_low |=
+		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+	vmx->nested.nested_vmx_pinbased_ctls_high &=
+		PIN_BASED_EXT_INTR_MASK |
+		PIN_BASED_NMI_EXITING |
+		PIN_BASED_VIRTUAL_NMIS;
+	vmx->nested.nested_vmx_pinbased_ctls_high |=
+		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
 		PIN_BASED_VMX_PREEMPTION_TIMER;
+	if (vmx_vm_has_apicv(vmx->vcpu.kvm))
+		vmx->nested.nested_vmx_pinbased_ctls_high |=
+			PIN_BASED_POSTED_INTR;
 
 	/* exit controls */
 	rdmsr(MSR_IA32_VMX_EXIT_CTLS,
-		nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
-	nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+		vmx->nested.nested_vmx_exit_ctls_low,
+		vmx->nested.nested_vmx_exit_ctls_high);
+	vmx->nested.nested_vmx_exit_ctls_low =
+		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
 
-	nested_vmx_exit_ctls_high &=
+	vmx->nested.nested_vmx_exit_ctls_high &=
 #ifdef CONFIG_X86_64
 		VM_EXIT_HOST_ADDR_SPACE_SIZE |
 #endif
 		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
-	nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+	vmx->nested.nested_vmx_exit_ctls_high |=
+		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
 		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
 		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
 
 	if (vmx_mpx_supported())
-		nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
+		vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
 
 	/* We support free control of debug control saving. */
-	nested_vmx_true_exit_ctls_low = nested_vmx_exit_ctls_low &
+	vmx->nested.nested_vmx_true_exit_ctls_low =
+		vmx->nested.nested_vmx_exit_ctls_low &
 		~VM_EXIT_SAVE_DEBUG_CONTROLS;
 
 	/* entry controls */
 	rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
-		nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
-	nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
-	nested_vmx_entry_ctls_high &=
+		vmx->nested.nested_vmx_entry_ctls_low,
+		vmx->nested.nested_vmx_entry_ctls_high);
+	vmx->nested.nested_vmx_entry_ctls_low =
+		VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
+	vmx->nested.nested_vmx_entry_ctls_high &=
 #ifdef CONFIG_X86_64
 		VM_ENTRY_IA32E_MODE |
 #endif
 		VM_ENTRY_LOAD_IA32_PAT;
-	nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
-				       VM_ENTRY_LOAD_IA32_EFER);
+	vmx->nested.nested_vmx_entry_ctls_high |=
+		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
 	if (vmx_mpx_supported())
-		nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
+		vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
 
 	/* We support free control of debug control loading. */
-	nested_vmx_true_entry_ctls_low = nested_vmx_entry_ctls_low &
+	vmx->nested.nested_vmx_true_entry_ctls_low =
+		vmx->nested.nested_vmx_entry_ctls_low &
 		~VM_ENTRY_LOAD_DEBUG_CONTROLS;
 
 	/* cpu-based controls */
 	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
-		nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
-	nested_vmx_procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
-	nested_vmx_procbased_ctls_high &=
+		vmx->nested.nested_vmx_procbased_ctls_low,
+		vmx->nested.nested_vmx_procbased_ctls_high);
+	vmx->nested.nested_vmx_procbased_ctls_low =
+		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+	vmx->nested.nested_vmx_procbased_ctls_high &=
 		CPU_BASED_VIRTUAL_INTR_PENDING |
 		CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
 		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
@@ -2386,45 +2454,55 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 	 * can use it to avoid exits to L1 - even when L0 runs L2
 	 * without MSR bitmaps.
 	 */
-	nested_vmx_procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
+	vmx->nested.nested_vmx_procbased_ctls_high |=
+		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
 		CPU_BASED_USE_MSR_BITMAPS;
 
 	/* We support free control of CR3 access interception. */
-	nested_vmx_true_procbased_ctls_low = nested_vmx_procbased_ctls_low &
+	vmx->nested.nested_vmx_true_procbased_ctls_low =
+		vmx->nested.nested_vmx_procbased_ctls_low &
 		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
 
 	/* secondary cpu-based controls */
 	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
-		nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high);
-	nested_vmx_secondary_ctls_low = 0;
-	nested_vmx_secondary_ctls_high &=
+		vmx->nested.nested_vmx_secondary_ctls_low,
+		vmx->nested.nested_vmx_secondary_ctls_high);
+	vmx->nested.nested_vmx_secondary_ctls_low = 0;
+	vmx->nested.nested_vmx_secondary_ctls_high &=
 		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+		SECONDARY_EXEC_APIC_REGISTER_VIRT |
+		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
 		SECONDARY_EXEC_WBINVD_EXITING |
 		SECONDARY_EXEC_XSAVES;
 
 	if (enable_ept) {
 		/* nested EPT: emulate EPT also to L1 */
-		nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT |
+		vmx->nested.nested_vmx_secondary_ctls_high |=
+			SECONDARY_EXEC_ENABLE_EPT |
 			SECONDARY_EXEC_UNRESTRICTED_GUEST;
-		nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
+		vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
 			 VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
 			 VMX_EPT_INVEPT_BIT;
-		nested_vmx_ept_caps &= vmx_capability.ept;
+		vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
 		/*
 		 * For nested guests, we don't do anything specific
 		 * for single context invalidation. Hence, only advertise
 		 * support for global context invalidation.
 		 */
-		nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT;
+		vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT;
 	} else
-		nested_vmx_ept_caps = 0;
+		vmx->nested.nested_vmx_ept_caps = 0;
 
 	/* miscellaneous data */
-	rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
-	nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
-	nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
+	rdmsr(MSR_IA32_VMX_MISC,
+		vmx->nested.nested_vmx_misc_low,
+		vmx->nested.nested_vmx_misc_high);
+	vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
+	vmx->nested.nested_vmx_misc_low |=
+		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
 		VMX_MISC_ACTIVITY_HLT;
-	nested_vmx_misc_high = 0;
+	vmx->nested.nested_vmx_misc_high = 0;
 }
 
 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
@@ -2443,6 +2521,8 @@ static inline u64 vmx_control_msr(u32 low, u32 high)
 /* Returns 0 on success, non-0 otherwise. */
 static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 {
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
 	switch (msr_index) {
 	case MSR_IA32_VMX_BASIC:
 		/*
@@ -2457,36 +2537,44 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 		break;
 	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
 	case MSR_IA32_VMX_PINBASED_CTLS:
-		*pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low,
-					nested_vmx_pinbased_ctls_high);
+		*pdata = vmx_control_msr(
+			vmx->nested.nested_vmx_pinbased_ctls_low,
+			vmx->nested.nested_vmx_pinbased_ctls_high);
 		break;
 	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
-		*pdata = vmx_control_msr(nested_vmx_true_procbased_ctls_low,
-					nested_vmx_procbased_ctls_high);
+		*pdata = vmx_control_msr(
+			vmx->nested.nested_vmx_true_procbased_ctls_low,
+			vmx->nested.nested_vmx_procbased_ctls_high);
 		break;
 	case MSR_IA32_VMX_PROCBASED_CTLS:
-		*pdata = vmx_control_msr(nested_vmx_procbased_ctls_low,
-					nested_vmx_procbased_ctls_high);
+		*pdata = vmx_control_msr(
+			vmx->nested.nested_vmx_procbased_ctls_low,
+			vmx->nested.nested_vmx_procbased_ctls_high);
 		break;
 	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
-		*pdata = vmx_control_msr(nested_vmx_true_exit_ctls_low,
-					nested_vmx_exit_ctls_high);
+		*pdata = vmx_control_msr(
+			vmx->nested.nested_vmx_true_exit_ctls_low,
+			vmx->nested.nested_vmx_exit_ctls_high);
 		break;
 	case MSR_IA32_VMX_EXIT_CTLS:
-		*pdata = vmx_control_msr(nested_vmx_exit_ctls_low,
-					nested_vmx_exit_ctls_high);
+		*pdata = vmx_control_msr(
+			vmx->nested.nested_vmx_exit_ctls_low,
+			vmx->nested.nested_vmx_exit_ctls_high);
 		break;
 	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
-		*pdata = vmx_control_msr(nested_vmx_true_entry_ctls_low,
-					nested_vmx_entry_ctls_high);
+		*pdata = vmx_control_msr(
+			vmx->nested.nested_vmx_true_entry_ctls_low,
+			vmx->nested.nested_vmx_entry_ctls_high);
 		break;
 	case MSR_IA32_VMX_ENTRY_CTLS:
-		*pdata = vmx_control_msr(nested_vmx_entry_ctls_low,
-					nested_vmx_entry_ctls_high);
+		*pdata = vmx_control_msr(
+			vmx->nested.nested_vmx_entry_ctls_low,
+			vmx->nested.nested_vmx_entry_ctls_high);
 		break;
 	case MSR_IA32_VMX_MISC:
-		*pdata = vmx_control_msr(nested_vmx_misc_low,
-					 nested_vmx_misc_high);
+		*pdata = vmx_control_msr(
+			vmx->nested.nested_vmx_misc_low,
+			vmx->nested.nested_vmx_misc_high);
 		break;
 	/*
 	 * These MSRs specify bits which the guest must keep fixed (on or off)
@@ -2511,12 +2599,13 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 		*pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */
 		break;
 	case MSR_IA32_VMX_PROCBASED_CTLS2:
-		*pdata = vmx_control_msr(nested_vmx_secondary_ctls_low,
-					nested_vmx_secondary_ctls_high);
+		*pdata = vmx_control_msr(
+			vmx->nested.nested_vmx_secondary_ctls_low,
+			vmx->nested.nested_vmx_secondary_ctls_high);
 		break;
 	case MSR_IA32_VMX_EPT_VPID_CAP:
 		/* Currently, no nested vpid support */
-		*pdata = nested_vmx_ept_caps;
+		*pdata = vmx->nested.nested_vmx_ept_caps;
 		break;
 	default:
 		return 1;
@@ -2785,7 +2874,7 @@ static int hardware_enable(void)
 	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
 	u64 old, test_bits;
 
-	if (read_cr4() & X86_CR4_VMXE)
+	if (cr4_read_shadow() & X86_CR4_VMXE)
 		return -EBUSY;
 
 	INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
@@ -2812,7 +2901,7 @@ static int hardware_enable(void)
 		/* enable and lock */
 		wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
 	}
-	write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
+	cr4_set_bits(X86_CR4_VMXE);
 
 	if (vmm_exclusive) {
 		kvm_cpu_vmxon(phys_addr);
@@ -2849,7 +2938,7 @@ static void hardware_disable(void)
 		vmclear_local_loaded_vmcss();
 		kvm_cpu_vmxoff();
 	}
-	write_cr4(read_cr4() & ~X86_CR4_VMXE);
+	cr4_clear_bits(X86_CR4_VMXE);
 }
 
 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
@@ -2929,7 +3018,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 			SECONDARY_EXEC_APIC_REGISTER_VIRT |
 			SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
 			SECONDARY_EXEC_SHADOW_VMCS |
-			SECONDARY_EXEC_XSAVES;
+			SECONDARY_EXEC_XSAVES |
+			SECONDARY_EXEC_ENABLE_PML;
 		if (adjust_vmx_controls(min2, opt2,
 					MSR_IA32_VMX_PROCBASED_CTLS2,
 					&_cpu_based_2nd_exec_control) < 0)
@@ -4159,6 +4249,52 @@ static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
 	}
 }
 
+/*
+ * If a msr is allowed by L0, we should check whether it is allowed by L1.
+ * The corresponding bit will be cleared unless both of L0 and L1 allow it.
+ */
+static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
+					       unsigned long *msr_bitmap_nested,
+					       u32 msr, int type)
+{
+	int f = sizeof(unsigned long);
+
+	if (!cpu_has_vmx_msr_bitmap()) {
+		WARN_ON(1);
+		return;
+	}
+
+	/*
+	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+	 * have the write-low and read-high bitmap offsets the wrong way round.
+	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+	 */
+	if (msr <= 0x1fff) {
+		if (type & MSR_TYPE_R &&
+		   !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
+			/* read-low */
+			__clear_bit(msr, msr_bitmap_nested + 0x000 / f);
+
+		if (type & MSR_TYPE_W &&
+		   !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
+			/* write-low */
+			__clear_bit(msr, msr_bitmap_nested + 0x800 / f);
+
+	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+		msr &= 0x1fff;
+		if (type & MSR_TYPE_R &&
+		   !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
+			/* read-high */
+			__clear_bit(msr, msr_bitmap_nested + 0x400 / f);
+
+		if (type & MSR_TYPE_W &&
+		   !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
+			/* write-high */
+			__clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
+
+	}
+}
+
 static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
 {
 	if (!longmode_only)
@@ -4197,6 +4333,74 @@ static int vmx_vm_has_apicv(struct kvm *kvm)
 	return enable_apicv && irqchip_in_kernel(kvm);
 }
 
+static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int max_irr;
+	void *vapic_page;
+	u16 status;
+
+	if (vmx->nested.pi_desc &&
+	    vmx->nested.pi_pending) {
+		vmx->nested.pi_pending = false;
+		if (!pi_test_and_clear_on(vmx->nested.pi_desc))
+			return 0;
+
+		max_irr = find_last_bit(
+			(unsigned long *)vmx->nested.pi_desc->pir, 256);
+
+		if (max_irr == 256)
+			return 0;
+
+		vapic_page = kmap(vmx->nested.virtual_apic_page);
+		if (!vapic_page) {
+			WARN_ON(1);
+			return -ENOMEM;
+		}
+		__kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
+		kunmap(vmx->nested.virtual_apic_page);
+
+		status = vmcs_read16(GUEST_INTR_STATUS);
+		if ((u8)max_irr > ((u8)status & 0xff)) {
+			status &= ~0xff;
+			status |= (u8)max_irr;
+			vmcs_write16(GUEST_INTR_STATUS, status);
+		}
+	}
+	return 0;
+}
+
+static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_SMP
+	if (vcpu->mode == IN_GUEST_MODE) {
+		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
+				POSTED_INTR_VECTOR);
+		return true;
+	}
+#endif
+	return false;
+}
+
+static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
+						int vector)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (is_guest_mode(vcpu) &&
+	    vector == vmx->nested.posted_intr_nv) {
+		/* the PIR and ON have been set by L1. */
+		kvm_vcpu_trigger_posted_interrupt(vcpu);
+		/*
+		 * If a posted intr is not recognized by hardware,
+		 * we will accomplish it in the next vmentry.
+		 */
+		vmx->nested.pi_pending = true;
+		kvm_make_request(KVM_REQ_EVENT, vcpu);
+		return 0;
+	}
+	return -1;
+}
 /*
  * Send interrupt to vcpu via posted interrupt way.
  * 1. If target vcpu is running(non-root mode), send posted interrupt
@@ -4209,17 +4413,16 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	int r;
 
+	r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
+	if (!r)
+		return;
+
 	if (pi_test_and_set_pir(vector, &vmx->pi_desc))
 		return;
 
 	r = pi_test_and_set_on(&vmx->pi_desc);
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
-#ifdef CONFIG_SMP
-	if (!r && (vcpu->mode == IN_GUEST_MODE))
-		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
-				POSTED_INTR_VECTOR);
-	else
-#endif
+	if (r || !kvm_vcpu_trigger_posted_interrupt(vcpu))
 		kvm_vcpu_kick(vcpu);
 }
 
@@ -4255,7 +4458,7 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
 	vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
 
 	/* Save the most likely value for this task's CR4 in the VMCS. */
-	cr4 = read_cr4();
+	cr4 = cr4_read_shadow();
 	vmcs_writel(HOST_CR4, cr4);			/* 22.2.3, 22.2.5 */
 	vmx->host_state.vmcs_host_cr4 = cr4;
 
@@ -4360,6 +4563,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 	   a current VMCS12
 	*/
 	exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
+	/* PML is enabled/disabled in creating/destorying vcpu */
+	exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
 	return exec_control;
 }
 
@@ -4986,11 +5192,12 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
 	hypercall[2] = 0xc1;
 }
 
-static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val)
+static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
 {
 	unsigned long always_on = VMXON_CR0_ALWAYSON;
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
-	if (nested_vmx_secondary_ctls_high &
+	if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high &
 		SECONDARY_EXEC_UNRESTRICTED_GUEST &&
 	    nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
 		always_on &= ~(X86_CR0_PE | X86_CR0_PG);
@@ -5015,7 +5222,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
 		val = (val & ~vmcs12->cr0_guest_host_mask) |
 			(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
 
-		if (!nested_cr0_valid(vmcs12, val))
+		if (!nested_cr0_valid(vcpu, val))
 			return 1;
 
 		if (kvm_set_cr0(vcpu, val))
@@ -5817,13 +6024,21 @@ static __init int hardware_setup(void)
 				(unsigned long *)__get_free_page(GFP_KERNEL);
 	if (!vmx_msr_bitmap_longmode_x2apic)
 		goto out4;
+
+	if (nested) {
+		vmx_msr_bitmap_nested =
+			(unsigned long *)__get_free_page(GFP_KERNEL);
+		if (!vmx_msr_bitmap_nested)
+			goto out5;
+	}
+
 	vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
 	if (!vmx_vmread_bitmap)
-		goto out5;
+		goto out6;
 
 	vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
 	if (!vmx_vmwrite_bitmap)
-		goto out6;
+		goto out7;
 
 	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
 	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
@@ -5839,10 +6054,12 @@ static __init int hardware_setup(void)
 
 	memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
 	memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
+	if (nested)
+		memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
 
 	if (setup_vmcs_config(&vmcs_config) < 0) {
 		r = -EIO;
-		goto out7;
+		goto out8;
 	}
 
 	if (boot_cpu_has(X86_FEATURE_NX))
@@ -5868,16 +6085,16 @@ static __init int hardware_setup(void)
 	if (!cpu_has_vmx_unrestricted_guest())
 		enable_unrestricted_guest = 0;
 
-	if (!cpu_has_vmx_flexpriority()) {
+	if (!cpu_has_vmx_flexpriority())
 		flexpriority_enabled = 0;
 
-		/*
-		 * set_apic_access_page_addr() is used to reload apic access
-		 * page upon invalidation.  No need to do anything if the
-		 * processor does not have the APIC_ACCESS_ADDR VMCS field.
-		 */
+	/*
+	 * set_apic_access_page_addr() is used to reload apic access
+	 * page upon invalidation.  No need to do anything if not
+	 * using the APIC_ACCESS_ADDR VMCS field.
+	 */
+	if (!flexpriority_enabled)
 		kvm_x86_ops->set_apic_access_page_addr = NULL;
-	}
 
 	if (!cpu_has_vmx_tpr_shadow())
 		kvm_x86_ops->update_cr8_intercept = NULL;
@@ -5895,13 +6112,11 @@ static __init int hardware_setup(void)
 		kvm_x86_ops->update_cr8_intercept = NULL;
 	else {
 		kvm_x86_ops->hwapic_irr_update = NULL;
+		kvm_x86_ops->hwapic_isr_update = NULL;
 		kvm_x86_ops->deliver_posted_interrupt = NULL;
 		kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
 	}
 
-	if (nested)
-		nested_vmx_setup_ctls_msrs();
-
 	vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
 	vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
 	vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
@@ -5945,12 +6160,29 @@ static __init int hardware_setup(void)
 
 	update_ple_window_actual_max();
 
+	/*
+	 * Only enable PML when hardware supports PML feature, and both EPT
+	 * and EPT A/D bit features are enabled -- PML depends on them to work.
+	 */
+	if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
+		enable_pml = 0;
+
+	if (!enable_pml) {
+		kvm_x86_ops->slot_enable_log_dirty = NULL;
+		kvm_x86_ops->slot_disable_log_dirty = NULL;
+		kvm_x86_ops->flush_log_dirty = NULL;
+		kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
+	}
+
 	return alloc_kvm_area();
 
-out7:
+out8:
 	free_page((unsigned long)vmx_vmwrite_bitmap);
-out6:
+out7:
 	free_page((unsigned long)vmx_vmread_bitmap);
+out6:
+	if (nested)
+		free_page((unsigned long)vmx_msr_bitmap_nested);
 out5:
 	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
 out4:
@@ -5977,6 +6209,8 @@ static __exit void hardware_unsetup(void)
 	free_page((unsigned long)vmx_io_bitmap_a);
 	free_page((unsigned long)vmx_vmwrite_bitmap);
 	free_page((unsigned long)vmx_vmread_bitmap);
+	if (nested)
+		free_page((unsigned long)vmx_msr_bitmap_nested);
 
 	free_kvm_area();
 }
@@ -6143,6 +6377,13 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
 	 */
 }
 
+static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
+{
+	/* TODO: not to reset guest simply here. */
+	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+	pr_warn("kvm: nested vmx abort, indicator %d\n", indicator);
+}
+
 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
 {
 	struct vcpu_vmx *vmx =
@@ -6432,6 +6673,7 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
 		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
 		vmcs_write64(VMCS_LINK_POINTER, -1ull);
 	}
+	vmx->nested.posted_intr_nv = -1;
 	kunmap(vmx->nested.current_vmcs12_page);
 	nested_release_page(vmx->nested.current_vmcs12_page);
 	vmx->nested.current_vmptr = -1ull;
@@ -6460,6 +6702,12 @@ static void free_nested(struct vcpu_vmx *vmx)
 		nested_release_page(vmx->nested.virtual_apic_page);
 		vmx->nested.virtual_apic_page = NULL;
 	}
+	if (vmx->nested.pi_desc_page) {
+		kunmap(vmx->nested.pi_desc_page);
+		nested_release_page(vmx->nested.pi_desc_page);
+		vmx->nested.pi_desc_page = NULL;
+		vmx->nested.pi_desc = NULL;
+	}
 
 	nested_free_all_saved_vmcss(vmx);
 }
@@ -6893,6 +7141,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
 /* Emulate the INVEPT instruction */
 static int handle_invept(struct kvm_vcpu *vcpu)
 {
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 vmx_instruction_info, types;
 	unsigned long type;
 	gva_t gva;
@@ -6901,8 +7150,9 @@ static int handle_invept(struct kvm_vcpu *vcpu)
 		u64 eptp, gpa;
 	} operand;
 
-	if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
-	    !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
+	if (!(vmx->nested.nested_vmx_secondary_ctls_high &
+	      SECONDARY_EXEC_ENABLE_EPT) ||
+	    !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
 		kvm_queue_exception(vcpu, UD_VECTOR);
 		return 1;
 	}
@@ -6918,7 +7168,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
 	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
 	type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
 
-	types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
+	types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
 
 	if (!(types & (1UL << type))) {
 		nested_vmx_failValid(vcpu,
@@ -6960,6 +7210,31 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int handle_pml_full(struct kvm_vcpu *vcpu)
+{
+	unsigned long exit_qualification;
+
+	trace_kvm_pml_full(vcpu->vcpu_id);
+
+	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+	/*
+	 * PML buffer FULL happened while executing iret from NMI,
+	 * "blocked by NMI" bit has to be set before next VM entry.
+	 */
+	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+			cpu_has_virtual_nmis() &&
+			(exit_qualification & INTR_INFO_UNBLOCK_NMI))
+		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+				GUEST_INTR_STATE_NMI);
+
+	/*
+	 * PML buffer already flushed at beginning of VMEXIT. Nothing to do
+	 * here.., and there's no userspace involvement needed for PML.
+	 */
+	return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -7008,6 +7283,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_INVVPID]                 = handle_invvpid,
 	[EXIT_REASON_XSAVES]                  = handle_xsaves,
 	[EXIT_REASON_XRSTORS]                 = handle_xrstors,
+	[EXIT_REASON_PML_FULL]		      = handle_pml_full,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -7275,6 +7551,10 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 	case EXIT_REASON_APIC_ACCESS:
 		return nested_cpu_has2(vmcs12,
 			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+	case EXIT_REASON_APIC_WRITE:
+	case EXIT_REASON_EOI_INDUCED:
+		/* apic_write and eoi_induced should exit unconditionally. */
+		return 1;
 	case EXIT_REASON_EPT_VIOLATION:
 		/*
 		 * L0 always deals with the EPT violation. If nested EPT is
@@ -7314,6 +7594,89 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
 	*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
 }
 
+static int vmx_enable_pml(struct vcpu_vmx *vmx)
+{
+	struct page *pml_pg;
+	u32 exec_control;
+
+	pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
+	if (!pml_pg)
+		return -ENOMEM;
+
+	vmx->pml_pg = pml_pg;
+
+	vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
+	vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+
+	exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+	exec_control |= SECONDARY_EXEC_ENABLE_PML;
+	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+
+	return 0;
+}
+
+static void vmx_disable_pml(struct vcpu_vmx *vmx)
+{
+	u32 exec_control;
+
+	ASSERT(vmx->pml_pg);
+	__free_page(vmx->pml_pg);
+	vmx->pml_pg = NULL;
+
+	exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+	exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+}
+
+static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx)
+{
+	struct kvm *kvm = vmx->vcpu.kvm;
+	u64 *pml_buf;
+	u16 pml_idx;
+
+	pml_idx = vmcs_read16(GUEST_PML_INDEX);
+
+	/* Do nothing if PML buffer is empty */
+	if (pml_idx == (PML_ENTITY_NUM - 1))
+		return;
+
+	/* PML index always points to next available PML buffer entity */
+	if (pml_idx >= PML_ENTITY_NUM)
+		pml_idx = 0;
+	else
+		pml_idx++;
+
+	pml_buf = page_address(vmx->pml_pg);
+	for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
+		u64 gpa;
+
+		gpa = pml_buf[pml_idx];
+		WARN_ON(gpa & (PAGE_SIZE - 1));
+		mark_page_dirty(kvm, gpa >> PAGE_SHIFT);
+	}
+
+	/* reset PML index */
+	vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+}
+
+/*
+ * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
+ * Called before reporting dirty_bitmap to userspace.
+ */
+static void kvm_flush_pml_buffers(struct kvm *kvm)
+{
+	int i;
+	struct kvm_vcpu *vcpu;
+	/*
+	 * We only need to kick vcpu out of guest mode here, as PML buffer
+	 * is flushed at beginning of all VMEXITs, and it's obvious that only
+	 * vcpus running in guest are possible to have unflushed GPAs in PML
+	 * buffer.
+	 */
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		kvm_vcpu_kick(vcpu);
+}
+
 /*
  * The guest has exited.  See if we can fix it or if we need userspace
  * assistance.
@@ -7324,6 +7687,16 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	u32 exit_reason = vmx->exit_reason;
 	u32 vectoring_info = vmx->idt_vectoring_info;
 
+	/*
+	 * Flush logged GPAs PML buffer, this will make dirty_bitmap more
+	 * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
+	 * querying dirty_bitmap, we only need to kick all vcpus out of guest
+	 * mode as if vcpus is in root mode, the PML buffer must has been
+	 * flushed already.
+	 */
+	if (enable_pml)
+		vmx_flush_pml_buffer(vmx);
+
 	/* If guest state is invalid, start emulating */
 	if (vmx->emulation_required)
 		return handle_invalid_guest_state(vcpu);
@@ -7471,9 +7844,6 @@ static void vmx_hwapic_isr_update(struct kvm *kvm, int isr)
 	u16 status;
 	u8 old;
 
-	if (!vmx_vm_has_apicv(kvm))
-		return;
-
 	if (isr == -1)
 		isr = 0;
 
@@ -7784,7 +8154,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
 		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
 
-	cr4 = read_cr4();
+	cr4 = cr4_read_shadow();
 	if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
 		vmcs_writel(HOST_CR4, cr4);
 		vmx->host_state.vmcs_host_cr4 = cr4;
@@ -7973,6 +8343,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	if (enable_pml)
+		vmx_disable_pml(vmx);
 	free_vpid(vmx);
 	leave_guest_mode(vcpu);
 	vmx_load_vmcs01(vcpu);
@@ -8040,9 +8412,25 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 			goto free_vmcs;
 	}
 
+	if (nested)
+		nested_vmx_setup_ctls_msrs(vmx);
+
+	vmx->nested.posted_intr_nv = -1;
 	vmx->nested.current_vmptr = -1ull;
 	vmx->nested.current_vmcs12 = NULL;
 
+	/*
+	 * If PML is turned on, failure on enabling PML just results in failure
+	 * of creating the vcpu, therefore we can simplify PML logic (by
+	 * avoiding dealing with cases, such as enabling PML partially on vcpus
+	 * for the guest, etc.
+	 */
+	if (enable_pml) {
+		err = vmx_enable_pml(vmx);
+		if (err)
+			goto free_vmcs;
+	}
+
 	return &vmx->vcpu;
 
 free_vmcs:
@@ -8184,9 +8572,10 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
 
 static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
 {
-	kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu,
-			nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT);
-
+	WARN_ON(mmu_is_nested(vcpu));
+	kvm_init_shadow_ept_mmu(vcpu,
+			to_vmx(vcpu)->nested.nested_vmx_ept_caps &
+			VMX_EPT_EXECUTE_ONLY_BIT);
 	vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
 	vcpu->arch.mmu.get_cr3           = nested_ept_get_cr3;
 	vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
@@ -8199,6 +8588,18 @@ static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
 	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
 }
 
+static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
+					    u16 error_code)
+{
+	bool inequality, bit;
+
+	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
+	inequality =
+		(error_code & vmcs12->page_fault_error_code_mask) !=
+		 vmcs12->page_fault_error_code_match;
+	return inequality ^ bit;
+}
+
 static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
 		struct x86_exception *fault)
 {
@@ -8206,8 +8607,7 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
 
 	WARN_ON(!is_guest_mode(vcpu));
 
-	/* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
-	if (vmcs12->exception_bitmap & (1u << PF_VECTOR))
+	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code))
 		nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
 				  vmcs_read32(VM_EXIT_INTR_INFO),
 				  vmcs_readl(EXIT_QUALIFICATION));
@@ -8261,6 +8661,31 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
 			return false;
 	}
 
+	if (nested_cpu_has_posted_intr(vmcs12)) {
+		if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64))
+			return false;
+
+		if (vmx->nested.pi_desc_page) { /* shouldn't happen */
+			kunmap(vmx->nested.pi_desc_page);
+			nested_release_page(vmx->nested.pi_desc_page);
+		}
+		vmx->nested.pi_desc_page =
+			nested_get_page(vcpu, vmcs12->posted_intr_desc_addr);
+		if (!vmx->nested.pi_desc_page)
+			return false;
+
+		vmx->nested.pi_desc =
+			(struct pi_desc *)kmap(vmx->nested.pi_desc_page);
+		if (!vmx->nested.pi_desc) {
+			nested_release_page_clean(vmx->nested.pi_desc_page);
+			return false;
+		}
+		vmx->nested.pi_desc =
+			(struct pi_desc *)((void *)vmx->nested.pi_desc +
+			(unsigned long)(vmcs12->posted_intr_desc_addr &
+			(PAGE_SIZE - 1)));
+	}
+
 	return true;
 }
 
@@ -8286,6 +8711,310 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
 		      ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
 }
 
+static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
+						struct vmcs12 *vmcs12)
+{
+	int maxphyaddr;
+	u64 addr;
+
+	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
+		return 0;
+
+	if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) {
+		WARN_ON(1);
+		return -EINVAL;
+	}
+	maxphyaddr = cpuid_maxphyaddr(vcpu);
+
+	if (!PAGE_ALIGNED(vmcs12->msr_bitmap) ||
+	   ((addr + PAGE_SIZE) >> maxphyaddr))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Merge L0's and L1's MSR bitmap, return false to indicate that
+ * we do not use the hardware.
+ */
+static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
+					       struct vmcs12 *vmcs12)
+{
+	int msr;
+	struct page *page;
+	unsigned long *msr_bitmap;
+
+	if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
+		return false;
+
+	page = nested_get_page(vcpu, vmcs12->msr_bitmap);
+	if (!page) {
+		WARN_ON(1);
+		return false;
+	}
+	msr_bitmap = (unsigned long *)kmap(page);
+	if (!msr_bitmap) {
+		nested_release_page_clean(page);
+		WARN_ON(1);
+		return false;
+	}
+
+	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
+		if (nested_cpu_has_apic_reg_virt(vmcs12))
+			for (msr = 0x800; msr <= 0x8ff; msr++)
+				nested_vmx_disable_intercept_for_msr(
+					msr_bitmap,
+					vmx_msr_bitmap_nested,
+					msr, MSR_TYPE_R);
+		/* TPR is allowed */
+		nested_vmx_disable_intercept_for_msr(msr_bitmap,
+				vmx_msr_bitmap_nested,
+				APIC_BASE_MSR + (APIC_TASKPRI >> 4),
+				MSR_TYPE_R | MSR_TYPE_W);
+		if (nested_cpu_has_vid(vmcs12)) {
+			/* EOI and self-IPI are allowed */
+			nested_vmx_disable_intercept_for_msr(
+				msr_bitmap,
+				vmx_msr_bitmap_nested,
+				APIC_BASE_MSR + (APIC_EOI >> 4),
+				MSR_TYPE_W);
+			nested_vmx_disable_intercept_for_msr(
+				msr_bitmap,
+				vmx_msr_bitmap_nested,
+				APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
+				MSR_TYPE_W);
+		}
+	} else {
+		/*
+		 * Enable reading intercept of all the x2apic
+		 * MSRs. We should not rely on vmcs12 to do any
+		 * optimizations here, it may have been modified
+		 * by L1.
+		 */
+		for (msr = 0x800; msr <= 0x8ff; msr++)
+			__vmx_enable_intercept_for_msr(
+				vmx_msr_bitmap_nested,
+				msr,
+				MSR_TYPE_R);
+
+		__vmx_enable_intercept_for_msr(
+				vmx_msr_bitmap_nested,
+				APIC_BASE_MSR + (APIC_TASKPRI >> 4),
+				MSR_TYPE_W);
+		__vmx_enable_intercept_for_msr(
+				vmx_msr_bitmap_nested,
+				APIC_BASE_MSR + (APIC_EOI >> 4),
+				MSR_TYPE_W);
+		__vmx_enable_intercept_for_msr(
+				vmx_msr_bitmap_nested,
+				APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
+				MSR_TYPE_W);
+	}
+	kunmap(page);
+	nested_release_page_clean(page);
+
+	return true;
+}
+
+static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
+					   struct vmcs12 *vmcs12)
+{
+	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
+	    !nested_cpu_has_vid(vmcs12) &&
+	    !nested_cpu_has_posted_intr(vmcs12))
+		return 0;
+
+	/*
+	 * If virtualize x2apic mode is enabled,
+	 * virtualize apic access must be disabled.
+	 */
+	if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+	    nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+		return -EINVAL;
+
+	/*
+	 * If virtual interrupt delivery is enabled,
+	 * we must exit on external interrupts.
+	 */
+	if (nested_cpu_has_vid(vmcs12) &&
+	   !nested_exit_on_intr(vcpu))
+		return -EINVAL;
+
+	/*
+	 * bits 15:8 should be zero in posted_intr_nv,
+	 * the descriptor address has been already checked
+	 * in nested_get_vmcs12_pages.
+	 */
+	if (nested_cpu_has_posted_intr(vmcs12) &&
+	   (!nested_cpu_has_vid(vmcs12) ||
+	    !nested_exit_intr_ack_set(vcpu) ||
+	    vmcs12->posted_intr_nv & 0xff00))
+		return -EINVAL;
+
+	/* tpr shadow is needed by all apicv features. */
+	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
+				       unsigned long count_field,
+				       unsigned long addr_field,
+				       int maxphyaddr)
+{
+	u64 count, addr;
+
+	if (vmcs12_read_any(vcpu, count_field, &count) ||
+	    vmcs12_read_any(vcpu, addr_field, &addr)) {
+		WARN_ON(1);
+		return -EINVAL;
+	}
+	if (count == 0)
+		return 0;
+	if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
+	    (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
+		pr_warn_ratelimited(
+			"nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
+			addr_field, maxphyaddr, count, addr);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
+						struct vmcs12 *vmcs12)
+{
+	int maxphyaddr;
+
+	if (vmcs12->vm_exit_msr_load_count == 0 &&
+	    vmcs12->vm_exit_msr_store_count == 0 &&
+	    vmcs12->vm_entry_msr_load_count == 0)
+		return 0; /* Fast path */
+	maxphyaddr = cpuid_maxphyaddr(vcpu);
+	if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
+					VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) ||
+	    nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
+					VM_EXIT_MSR_STORE_ADDR, maxphyaddr) ||
+	    nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
+					VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr))
+		return -EINVAL;
+	return 0;
+}
+
+static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
+				       struct vmx_msr_entry *e)
+{
+	/* x2APIC MSR accesses are not allowed */
+	if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8)
+		return -EINVAL;
+	if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
+	    e->index == MSR_IA32_UCODE_REV)
+		return -EINVAL;
+	if (e->reserved != 0)
+		return -EINVAL;
+	return 0;
+}
+
+static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
+				     struct vmx_msr_entry *e)
+{
+	if (e->index == MSR_FS_BASE ||
+	    e->index == MSR_GS_BASE ||
+	    e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */
+	    nested_vmx_msr_check_common(vcpu, e))
+		return -EINVAL;
+	return 0;
+}
+
+static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
+				      struct vmx_msr_entry *e)
+{
+	if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */
+	    nested_vmx_msr_check_common(vcpu, e))
+		return -EINVAL;
+	return 0;
+}
+
+/*
+ * Load guest's/host's msr at nested entry/exit.
+ * return 0 for success, entry index for failure.
+ */
+static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
+{
+	u32 i;
+	struct vmx_msr_entry e;
+	struct msr_data msr;
+
+	msr.host_initiated = false;
+	for (i = 0; i < count; i++) {
+		if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e),
+				   &e, sizeof(e))) {
+			pr_warn_ratelimited(
+				"%s cannot read MSR entry (%u, 0x%08llx)\n",
+				__func__, i, gpa + i * sizeof(e));
+			goto fail;
+		}
+		if (nested_vmx_load_msr_check(vcpu, &e)) {
+			pr_warn_ratelimited(
+				"%s check failed (%u, 0x%x, 0x%x)\n",
+				__func__, i, e.index, e.reserved);
+			goto fail;
+		}
+		msr.index = e.index;
+		msr.data = e.value;
+		if (kvm_set_msr(vcpu, &msr)) {
+			pr_warn_ratelimited(
+				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
+				__func__, i, e.index, e.value);
+			goto fail;
+		}
+	}
+	return 0;
+fail:
+	return i + 1;
+}
+
+static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
+{
+	u32 i;
+	struct vmx_msr_entry e;
+
+	for (i = 0; i < count; i++) {
+		if (kvm_read_guest(vcpu->kvm,
+				   gpa + i * sizeof(e),
+				   &e, 2 * sizeof(u32))) {
+			pr_warn_ratelimited(
+				"%s cannot read MSR entry (%u, 0x%08llx)\n",
+				__func__, i, gpa + i * sizeof(e));
+			return -EINVAL;
+		}
+		if (nested_vmx_store_msr_check(vcpu, &e)) {
+			pr_warn_ratelimited(
+				"%s check failed (%u, 0x%x, 0x%x)\n",
+				__func__, i, e.index, e.reserved);
+			return -EINVAL;
+		}
+		if (kvm_get_msr(vcpu, e.index, &e.value)) {
+			pr_warn_ratelimited(
+				"%s cannot read MSR (%u, 0x%x)\n",
+				__func__, i, e.index);
+			return -EINVAL;
+		}
+		if (kvm_write_guest(vcpu->kvm,
+				    gpa + i * sizeof(e) +
+					offsetof(struct vmx_msr_entry, value),
+				    &e.value, sizeof(e.value))) {
+			pr_warn_ratelimited(
+				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
+				__func__, i, e.index, e.value);
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
 /*
  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -8365,8 +9094,23 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
 	exec_control = vmcs12->pin_based_vm_exec_control;
 	exec_control |= vmcs_config.pin_based_exec_ctrl;
-	exec_control &= ~(PIN_BASED_VMX_PREEMPTION_TIMER |
-                          PIN_BASED_POSTED_INTR);
+	exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+
+	if (nested_cpu_has_posted_intr(vmcs12)) {
+		/*
+		 * Note that we use L0's vector here and in
+		 * vmx_deliver_nested_posted_interrupt.
+		 */
+		vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
+		vmx->nested.pi_pending = false;
+		vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
+		vmcs_write64(POSTED_INTR_DESC_ADDR,
+			page_to_phys(vmx->nested.pi_desc_page) +
+			(unsigned long)(vmcs12->posted_intr_desc_addr &
+			(PAGE_SIZE - 1)));
+	} else
+		exec_control &= ~PIN_BASED_POSTED_INTR;
+
 	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
 
 	vmx->nested.preemption_timer_expired = false;
@@ -8423,12 +9167,26 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 			else
 				vmcs_write64(APIC_ACCESS_ADDR,
 				  page_to_phys(vmx->nested.apic_access_page));
-		} else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) {
+		} else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
+			    (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) {
 			exec_control |=
 				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 			kvm_vcpu_reload_apic_access_page(vcpu);
 		}
 
+		if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
+			vmcs_write64(EOI_EXIT_BITMAP0,
+				vmcs12->eoi_exit_bitmap0);
+			vmcs_write64(EOI_EXIT_BITMAP1,
+				vmcs12->eoi_exit_bitmap1);
+			vmcs_write64(EOI_EXIT_BITMAP2,
+				vmcs12->eoi_exit_bitmap2);
+			vmcs_write64(EOI_EXIT_BITMAP3,
+				vmcs12->eoi_exit_bitmap3);
+			vmcs_write16(GUEST_INTR_STATUS,
+				vmcs12->guest_intr_status);
+		}
+
 		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
 	}
 
@@ -8462,11 +9220,17 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 		vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
 	}
 
+	if (cpu_has_vmx_msr_bitmap() &&
+	    exec_control & CPU_BASED_USE_MSR_BITMAPS) {
+		nested_vmx_merge_msr_bitmap(vcpu, vmcs12);
+		/* MSR_BITMAP will be set by following vmx_set_efer. */
+	} else
+		exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
+
 	/*
-	 * Merging of IO and MSR bitmaps not currently supported.
+	 * Merging of IO bitmap not currently supported.
 	 * Rather, exit every time.
 	 */
-	exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
 	exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
 	exec_control |= CPU_BASED_UNCOND_IO_EXITING;
 
@@ -8582,6 +9346,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	int cpu;
 	struct loaded_vmcs *vmcs02;
 	bool ia32e;
+	u32 msr_entry_idx;
 
 	if (!nested_vmx_check_permission(vcpu) ||
 	    !nested_vmx_check_vmcs12(vcpu))
@@ -8616,41 +9381,42 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 		return 1;
 	}
 
-	if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) &&
-			!PAGE_ALIGNED(vmcs12->msr_bitmap)) {
+	if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
 		/*TODO: Also verify bits beyond physical address width are 0*/
 		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 		return 1;
 	}
 
-	if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
-		/*TODO: Also verify bits beyond physical address width are 0*/
+	if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) {
+		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+		return 1;
+	}
+
+	if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) {
 		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 		return 1;
 	}
 
-	if (vmcs12->vm_entry_msr_load_count > 0 ||
-	    vmcs12->vm_exit_msr_load_count > 0 ||
-	    vmcs12->vm_exit_msr_store_count > 0) {
-		pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n",
-				    __func__);
+	if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) {
 		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 		return 1;
 	}
 
 	if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
-				nested_vmx_true_procbased_ctls_low,
-				nested_vmx_procbased_ctls_high) ||
+				vmx->nested.nested_vmx_true_procbased_ctls_low,
+				vmx->nested.nested_vmx_procbased_ctls_high) ||
 	    !vmx_control_verify(vmcs12->secondary_vm_exec_control,
-	      nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) ||
+				vmx->nested.nested_vmx_secondary_ctls_low,
+				vmx->nested.nested_vmx_secondary_ctls_high) ||
 	    !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
-	      nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) ||
+				vmx->nested.nested_vmx_pinbased_ctls_low,
+				vmx->nested.nested_vmx_pinbased_ctls_high) ||
 	    !vmx_control_verify(vmcs12->vm_exit_controls,
-				nested_vmx_true_exit_ctls_low,
-				nested_vmx_exit_ctls_high) ||
+				vmx->nested.nested_vmx_true_exit_ctls_low,
+				vmx->nested.nested_vmx_exit_ctls_high) ||
 	    !vmx_control_verify(vmcs12->vm_entry_controls,
-				nested_vmx_true_entry_ctls_low,
-				nested_vmx_entry_ctls_high))
+				vmx->nested.nested_vmx_true_entry_ctls_low,
+				vmx->nested.nested_vmx_entry_ctls_high))
 	{
 		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 		return 1;
@@ -8663,7 +9429,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 		return 1;
 	}
 
-	if (!nested_cr0_valid(vmcs12, vmcs12->guest_cr0) ||
+	if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) ||
 	    ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
 		nested_vmx_entry_failure(vcpu, vmcs12,
 			EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
@@ -8739,10 +9505,21 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 
 	vmx_segment_cache_clear(vmx);
 
-	vmcs12->launch_state = 1;
-
 	prepare_vmcs02(vcpu, vmcs12);
 
+	msr_entry_idx = nested_vmx_load_msr(vcpu,
+					    vmcs12->vm_entry_msr_load_addr,
+					    vmcs12->vm_entry_msr_load_count);
+	if (msr_entry_idx) {
+		leave_guest_mode(vcpu);
+		vmx_load_vmcs01(vcpu);
+		nested_vmx_entry_failure(vcpu, vmcs12,
+				EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
+		return 1;
+	}
+
+	vmcs12->launch_state = 1;
+
 	if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
 		return kvm_emulate_halt(vcpu);
 
@@ -8869,9 +9646,10 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
 		if (vmx->nested.nested_run_pending)
 			return -EBUSY;
 		nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+		return 0;
 	}
 
-	return 0;
+	return vmx_complete_nested_posted_interrupt(vcpu);
 }
 
 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
@@ -8981,6 +9759,9 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 		vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
 	}
 
+	if (nested_cpu_has_vid(vmcs12))
+		vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
+
 	vmcs12->vm_entry_controls =
 		(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
 		(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
@@ -9172,6 +9953,13 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 
 	kvm_set_dr(vcpu, 7, 0x400);
 	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+	if (cpu_has_vmx_msr_bitmap())
+		vmx_set_msr_bitmap(vcpu);
+
+	if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
+				vmcs12->vm_exit_msr_load_count))
+		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
 }
 
 /*
@@ -9193,6 +9981,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 	prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
 		       exit_qualification);
 
+	if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
+				 vmcs12->vm_exit_msr_store_count))
+		nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
+
 	vmx_load_vmcs01(vcpu);
 
 	if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
@@ -9235,6 +10027,12 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 		nested_release_page(vmx->nested.virtual_apic_page);
 		vmx->nested.virtual_apic_page = NULL;
 	}
+	if (vmx->nested.pi_desc_page) {
+		kunmap(vmx->nested.pi_desc_page);
+		nested_release_page(vmx->nested.pi_desc_page);
+		vmx->nested.pi_desc_page = NULL;
+		vmx->nested.pi_desc = NULL;
+	}
 
 	/*
 	 * We are now running in L2, mmu_notifier will force to reload the
@@ -9301,6 +10099,31 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
 		shrink_ple_window(vcpu);
 }
 
+static void vmx_slot_enable_log_dirty(struct kvm *kvm,
+				     struct kvm_memory_slot *slot)
+{
+	kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
+	kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
+}
+
+static void vmx_slot_disable_log_dirty(struct kvm *kvm,
+				       struct kvm_memory_slot *slot)
+{
+	kvm_mmu_slot_set_dirty(kvm, slot);
+}
+
+static void vmx_flush_log_dirty(struct kvm *kvm)
+{
+	kvm_flush_pml_buffers(kvm);
+}
+
+static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
+					   struct kvm_memory_slot *memslot,
+					   gfn_t offset, unsigned long mask)
+{
+	kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,
@@ -9409,6 +10232,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.check_nested_events = vmx_check_nested_events,
 
 	.sched_in = vmx_sched_in,
+
+	.slot_enable_log_dirty = vmx_slot_enable_log_dirty,
+	.slot_disable_log_dirty = vmx_slot_disable_log_dirty,
+	.flush_log_dirty = vmx_flush_log_dirty,
+	.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
 };
 
 static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c259814200bd..32bf19ef3115 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -108,6 +108,10 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
 static u32 tsc_tolerance_ppm = 250;
 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
 
+/* lapic timer advance (tscdeadline mode only) in nanoseconds */
+unsigned int lapic_timer_advance_ns = 0;
+module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
+
 static bool backwards_tsc_observed = false;
 
 #define KVM_NR_SHARED_MSRS 16
@@ -141,6 +145,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "irq_window", VCPU_STAT(irq_window_exits) },
 	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
 	{ "halt_exits", VCPU_STAT(halt_exits) },
+	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
 	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
 	{ "hypercalls", VCPU_STAT(hypercalls) },
 	{ "request_irq", VCPU_STAT(request_irq_exits) },
@@ -492,7 +497,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 }
 EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
 
-int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
+static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
 			       void *data, int offset, int len, u32 access)
 {
 	return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
@@ -643,7 +648,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
 	}
 }
 
-int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 {
 	u64 xcr0 = xcr;
 	u64 old_xcr0 = vcpu->arch.xcr0;
@@ -1083,6 +1088,15 @@ static void update_pvclock_gtod(struct timekeeper *tk)
 }
 #endif
 
+void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
+	 * vcpu_enter_guest.  This function is only called from
+	 * the physical CPU that is running vcpu.
+	 */
+	kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+}
 
 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 {
@@ -1180,7 +1194,7 @@ static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
 #endif
 
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
-unsigned long max_tsc_khz;
+static unsigned long max_tsc_khz;
 
 static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
 {
@@ -1234,7 +1248,7 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
 	return tsc;
 }
 
-void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
+static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
 {
 #ifdef CONFIG_X86_64
 	bool vcpus_matched;
@@ -1529,7 +1543,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
 					&ka->master_cycle_now);
 
 	ka->use_master_clock = host_tsc_clocksource && vcpus_matched
-				&& !backwards_tsc_observed;
+				&& !backwards_tsc_observed
+				&& !ka->boot_vcpu_runs_old_kvmclock;
 
 	if (ka->use_master_clock)
 		atomic_set(&kvm_guest_has_master_clock, 1);
@@ -2161,8 +2176,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_KVM_SYSTEM_TIME_NEW:
 	case MSR_KVM_SYSTEM_TIME: {
 		u64 gpa_offset;
+		struct kvm_arch *ka = &vcpu->kvm->arch;
+
 		kvmclock_reset(vcpu);
 
+		if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
+			bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
+
+			if (ka->boot_vcpu_runs_old_kvmclock != tmp)
+				set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
+					&vcpu->requests);
+
+			ka->boot_vcpu_runs_old_kvmclock = tmp;
+		}
+
 		vcpu->arch.time = data;
 		kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
 
@@ -2324,6 +2351,7 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 {
 	return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
 }
+EXPORT_SYMBOL_GPL(kvm_get_msr);
 
 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 {
@@ -2716,7 +2744,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_USER_NMI:
 	case KVM_CAP_REINJECT_CONTROL:
 	case KVM_CAP_IRQ_INJECT_STATUS:
-	case KVM_CAP_IRQFD:
 	case KVM_CAP_IOEVENTFD:
 	case KVM_CAP_IOEVENTFD_NO_LENGTH:
 	case KVM_CAP_PIT2:
@@ -2738,6 +2765,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_READONLY_MEM:
 	case KVM_CAP_HYPERV_TIME:
 	case KVM_CAP_IOAPIC_POLARITY_IGNORED:
+	case KVM_CAP_TSC_DEADLINE_TIMER:
 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
 	case KVM_CAP_ASSIGN_DEV_IRQ:
 	case KVM_CAP_PCI_2_3:
@@ -2776,9 +2804,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_TSC_CONTROL:
 		r = kvm_has_tsc_control;
 		break;
-	case KVM_CAP_TSC_DEADLINE_TIMER:
-		r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER);
-		break;
 	default:
 		r = 0;
 		break;
@@ -3734,83 +3759,43 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
  * @kvm: kvm instance
  * @log: slot id and address to which we copy the log
  *
- * We need to keep it in mind that VCPU threads can write to the bitmap
- * concurrently.  So, to avoid losing data, we keep the following order for
- * each bit:
+ * Steps 1-4 below provide general overview of dirty page logging. See
+ * kvm_get_dirty_log_protect() function description for additional details.
+ *
+ * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
+ * always flush the TLB (step 4) even if previous step failed  and the dirty
+ * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
+ * does not preclude user space subsequent dirty log read. Flushing TLB ensures
+ * writes will be marked dirty for next log read.
  *
  *   1. Take a snapshot of the bit and clear it if needed.
  *   2. Write protect the corresponding page.
- *   3. Flush TLB's if needed.
- *   4. Copy the snapshot to the userspace.
- *
- * Between 2 and 3, the guest may write to the page using the remaining TLB
- * entry.  This is not a problem because the page will be reported dirty at
- * step 4 using the snapshot taken before and step 3 ensures that successive
- * writes will be logged for the next call.
+ *   3. Copy the snapshot to the userspace.
+ *   4. Flush TLB's if needed.
  */
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 {
-	int r;
-	struct kvm_memory_slot *memslot;
-	unsigned long n, i;
-	unsigned long *dirty_bitmap;
-	unsigned long *dirty_bitmap_buffer;
 	bool is_dirty = false;
+	int r;
 
 	mutex_lock(&kvm->slots_lock);
 
-	r = -EINVAL;
-	if (log->slot >= KVM_USER_MEM_SLOTS)
-		goto out;
-
-	memslot = id_to_memslot(kvm->memslots, log->slot);
-
-	dirty_bitmap = memslot->dirty_bitmap;
-	r = -ENOENT;
-	if (!dirty_bitmap)
-		goto out;
-
-	n = kvm_dirty_bitmap_bytes(memslot);
-
-	dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
-	memset(dirty_bitmap_buffer, 0, n);
-
-	spin_lock(&kvm->mmu_lock);
-
-	for (i = 0; i < n / sizeof(long); i++) {
-		unsigned long mask;
-		gfn_t offset;
-
-		if (!dirty_bitmap[i])
-			continue;
-
-		is_dirty = true;
-
-		mask = xchg(&dirty_bitmap[i], 0);
-		dirty_bitmap_buffer[i] = mask;
-
-		offset = i * BITS_PER_LONG;
-		kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
-	}
-
-	spin_unlock(&kvm->mmu_lock);
+	/*
+	 * Flush potentially hardware-cached dirty pages to dirty_bitmap.
+	 */
+	if (kvm_x86_ops->flush_log_dirty)
+		kvm_x86_ops->flush_log_dirty(kvm);
 
-	/* See the comments in kvm_mmu_slot_remove_write_access(). */
-	lockdep_assert_held(&kvm->slots_lock);
+	r = kvm_get_dirty_log_protect(kvm, log, &is_dirty);
 
 	/*
 	 * All the TLBs can be flushed out of mmu lock, see the comments in
 	 * kvm_mmu_slot_remove_write_access().
 	 */
+	lockdep_assert_held(&kvm->slots_lock);
 	if (is_dirty)
 		kvm_flush_remote_tlbs(kvm);
 
-	r = -EFAULT;
-	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
-		goto out;
-
-	r = 0;
-out:
 	mutex_unlock(&kvm->slots_lock);
 	return r;
 }
@@ -4516,6 +4501,8 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		addr += now;
+		if (ctxt->mode != X86EMUL_MODE_PROT64)
+			addr = (u32)addr;
 		val += now;
 		bytes -= now;
 	}
@@ -4984,6 +4971,11 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon
 	kvm_register_write(emul_to_vcpu(ctxt), reg, val);
 }
 
+static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
+{
+	kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked);
+}
+
 static const struct x86_emulate_ops emulate_ops = {
 	.read_gpr            = emulator_read_gpr,
 	.write_gpr           = emulator_write_gpr,
@@ -5019,6 +5011,7 @@ static const struct x86_emulate_ops emulate_ops = {
 	.put_fpu             = emulator_put_fpu,
 	.intercept           = emulator_intercept,
 	.get_cpuid           = emulator_get_cpuid,
+	.set_nmi_mask        = emulator_set_nmi_mask,
 };
 
 static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
@@ -6311,6 +6304,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	}
 
 	trace_kvm_entry(vcpu->vcpu_id);
+	wait_lapic_expire(vcpu);
 	kvm_x86_ops->run(vcpu);
 
 	/*
@@ -7041,15 +7035,13 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	return r;
 }
 
-int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 {
-	int r;
 	struct msr_data msr;
 	struct kvm *kvm = vcpu->kvm;
 
-	r = vcpu_load(vcpu);
-	if (r)
-		return r;
+	if (vcpu_load(vcpu))
+		return;
 	msr.data = 0x0;
 	msr.index = MSR_IA32_TSC;
 	msr.host_initiated = true;
@@ -7058,8 +7050,6 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 
 	schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
 					KVMCLOCK_SYNC_PERIOD);
-
-	return r;
 }
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -7549,12 +7539,62 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 	return 0;
 }
 
+static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
+				     struct kvm_memory_slot *new)
+{
+	/* Still write protect RO slot */
+	if (new->flags & KVM_MEM_READONLY) {
+		kvm_mmu_slot_remove_write_access(kvm, new);
+		return;
+	}
+
+	/*
+	 * Call kvm_x86_ops dirty logging hooks when they are valid.
+	 *
+	 * kvm_x86_ops->slot_disable_log_dirty is called when:
+	 *
+	 *  - KVM_MR_CREATE with dirty logging is disabled
+	 *  - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag
+	 *
+	 * The reason is, in case of PML, we need to set D-bit for any slots
+	 * with dirty logging disabled in order to eliminate unnecessary GPA
+	 * logging in PML buffer (and potential PML buffer full VMEXT). This
+	 * guarantees leaving PML enabled during guest's lifetime won't have
+	 * any additonal overhead from PML when guest is running with dirty
+	 * logging disabled for memory slots.
+	 *
+	 * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot
+	 * to dirty logging mode.
+	 *
+	 * If kvm_x86_ops dirty logging hooks are invalid, use write protect.
+	 *
+	 * In case of write protect:
+	 *
+	 * Write protect all pages for dirty logging.
+	 *
+	 * All the sptes including the large sptes which point to this
+	 * slot are set to readonly. We can not create any new large
+	 * spte on this slot until the end of the logging.
+	 *
+	 * See the comments in fast_page_fault().
+	 */
+	if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+		if (kvm_x86_ops->slot_enable_log_dirty)
+			kvm_x86_ops->slot_enable_log_dirty(kvm, new);
+		else
+			kvm_mmu_slot_remove_write_access(kvm, new);
+	} else {
+		if (kvm_x86_ops->slot_disable_log_dirty)
+			kvm_x86_ops->slot_disable_log_dirty(kvm, new);
+	}
+}
+
 void kvm_arch_commit_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem,
 				const struct kvm_memory_slot *old,
 				enum kvm_mr_change change)
 {
-
+	struct kvm_memory_slot *new;
 	int nr_mmu_pages = 0;
 
 	if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) {
@@ -7573,17 +7613,20 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 
 	if (nr_mmu_pages)
 		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
+
+	/* It's OK to get 'new' slot here as it has already been installed */
+	new = id_to_memslot(kvm->memslots, mem->slot);
+
 	/*
-	 * Write protect all pages for dirty logging.
+	 * Set up write protection and/or dirty logging for the new slot.
 	 *
-	 * All the sptes including the large sptes which point to this
-	 * slot are set to readonly. We can not create any new large
-	 * spte on this slot until the end of the logging.
-	 *
-	 * See the comments in fast_page_fault().
+	 * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have
+	 * been zapped so no dirty logging staff is needed for old slot. For
+	 * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
+	 * new and it's also covered when dealing with the new slot.
 	 */
-	if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
-		kvm_mmu_slot_remove_write_access(kvm, mem->slot);
+	if (change != KVM_MR_DELETE)
+		kvm_mmu_slot_apply_flags(kvm, new);
 }
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
@@ -7837,3 +7880,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index cc1d61af6140..f5fef1868096 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -147,6 +147,7 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
 
 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
+void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
 
 void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
@@ -170,5 +171,7 @@ extern u64 kvm_supported_xcr0(void);
 
 extern unsigned int min_timer_period_us;
 
+extern unsigned int lapic_timer_advance_ns;
+
 extern struct static_key kvm_no_apic_vcpu;
 #endif
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 4a0890f815c4..08f41caada45 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -1,6 +1,6 @@
 config LGUEST_GUEST
 	bool "Lguest guest support"
-	depends on X86_32 && PARAVIRT
+	depends on X86_32 && PARAVIRT && PCI
 	select TTY
 	select VIRTUALIZATION
 	select VIRTIO
@@ -8,7 +8,7 @@ config LGUEST_GUEST
 	help
 	  Lguest is a tiny in-kernel hypervisor.  Selecting this will
 	  allow your kernel to boot under lguest.  This option will increase
-	  your kernel size by about 6k.  If in doubt, say N.
+	  your kernel size by about 10k.  If in doubt, say N.
 
 	  If you say Y here, make sure you say Y (or M) to the virtio block
 	  and net drivers which lguest needs.
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index c1c1544b8485..ac4453d8520e 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -56,6 +56,9 @@
 #include <linux/virtio_console.h>
 #include <linux/pm.h>
 #include <linux/export.h>
+#include <linux/pci.h>
+#include <linux/virtio_pci.h>
+#include <asm/acpi.h>
 #include <asm/apic.h>
 #include <asm/lguest.h>
 #include <asm/paravirt.h>
@@ -71,6 +74,8 @@
 #include <asm/stackprotector.h>
 #include <asm/reboot.h>		/* for struct machine_ops */
 #include <asm/kvm_para.h>
+#include <asm/pci_x86.h>
+#include <asm/pci-direct.h>
 
 /*G:010
  * Welcome to the Guest!
@@ -831,6 +836,24 @@ static struct irq_chip lguest_irq_controller = {
 	.irq_unmask	= enable_lguest_irq,
 };
 
+static int lguest_enable_irq(struct pci_dev *dev)
+{
+	u8 line = 0;
+
+	/* We literally use the PCI interrupt line as the irq number. */
+	pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line);
+	irq_set_chip_and_handler_name(line, &lguest_irq_controller,
+				      handle_level_irq, "level");
+	dev->irq = line;
+	return 0;
+}
+
+/* We don't do hotplug PCI, so this shouldn't be called. */
+static void lguest_disable_irq(struct pci_dev *dev)
+{
+	WARN_ON(1);
+}
+
 /*
  * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
  * interrupt (except 128, which is used for system calls), and then tells the
@@ -1181,25 +1204,136 @@ static __init char *lguest_memory_setup(void)
 	return "LGUEST";
 }
 
+/* Offset within PCI config space of BAR access capability. */
+static int console_cfg_offset = 0;
+static int console_access_cap;
+
+/* Set up so that we access off in bar0 (on bus 0, device 1, function 0) */
+static void set_cfg_window(u32 cfg_offset, u32 off)
+{
+	write_pci_config_byte(0, 1, 0,
+			      cfg_offset + offsetof(struct virtio_pci_cap, bar),
+			      0);
+	write_pci_config(0, 1, 0,
+			 cfg_offset + offsetof(struct virtio_pci_cap, length),
+			 4);
+	write_pci_config(0, 1, 0,
+			 cfg_offset + offsetof(struct virtio_pci_cap, offset),
+			 off);
+}
+
+static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val)
+{
+	/*
+	 * We could set this up once, then leave it; nothing else in the *
+	 * kernel should touch these registers.  But if it went wrong, that
+	 * would be a horrible bug to find.
+	 */
+	set_cfg_window(cfg_offset, off);
+	write_pci_config(0, 1, 0,
+			 cfg_offset + sizeof(struct virtio_pci_cap), val);
+}
+
+static void probe_pci_console(void)
+{
+	u8 cap, common_cap = 0, device_cap = 0;
+	/* Offset within BAR0 */
+	u32 device_offset;
+	u32 device_len;
+
+	/* Avoid recursive printk into here. */
+	console_cfg_offset = -1;
+
+	if (!early_pci_allowed()) {
+		printk(KERN_ERR "lguest: early PCI access not allowed!\n");
+		return;
+	}
+
+	/* We expect a console PCI device at BUS0, slot 1. */
+	if (read_pci_config(0, 1, 0, 0) != 0x10431AF4) {
+		printk(KERN_ERR "lguest: PCI device is %#x!\n",
+		       read_pci_config(0, 1, 0, 0));
+		return;
+	}
+
+	/* Find the capabilities we need (must be in bar0) */
+	cap = read_pci_config_byte(0, 1, 0, PCI_CAPABILITY_LIST);
+	while (cap) {
+		u8 vndr = read_pci_config_byte(0, 1, 0, cap);
+		if (vndr == PCI_CAP_ID_VNDR) {
+			u8 type, bar;
+			u32 offset, length;
+
+			type = read_pci_config_byte(0, 1, 0,
+			    cap + offsetof(struct virtio_pci_cap, cfg_type));
+			bar = read_pci_config_byte(0, 1, 0,
+			    cap + offsetof(struct virtio_pci_cap, bar));
+			offset = read_pci_config(0, 1, 0,
+			    cap + offsetof(struct virtio_pci_cap, offset));
+			length = read_pci_config(0, 1, 0,
+			    cap + offsetof(struct virtio_pci_cap, length));
+
+			switch (type) {
+			case VIRTIO_PCI_CAP_DEVICE_CFG:
+				if (bar == 0) {
+					device_cap = cap;
+					device_offset = offset;
+					device_len = length;
+				}
+				break;
+			case VIRTIO_PCI_CAP_PCI_CFG:
+				console_access_cap = cap;
+				break;
+			}
+		}
+		cap = read_pci_config_byte(0, 1, 0, cap + PCI_CAP_LIST_NEXT);
+	}
+	if (!device_cap || !console_access_cap) {
+		printk(KERN_ERR "lguest: No caps (%u/%u/%u) in console!\n",
+		       common_cap, device_cap, console_access_cap);
+		return;
+	}
+
+	/*
+	 * Note that we can't check features, until we've set the DRIVER
+	 * status bit.  We don't want to do that until we have a real driver,
+	 * so we just check that the device-specific config has room for
+	 * emerg_wr.  If it doesn't support VIRTIO_CONSOLE_F_EMERG_WRITE
+	 * it should ignore the access.
+	 */
+	if (device_len < (offsetof(struct virtio_console_config, emerg_wr)
+			  + sizeof(u32))) {
+		printk(KERN_ERR "lguest: console missing emerg_wr field\n");
+		return;
+	}
+
+	console_cfg_offset = device_offset;
+	printk(KERN_INFO "lguest: Console via virtio-pci emerg_wr\n");
+}
+
 /*
  * We will eventually use the virtio console device to produce console output,
- * but before that is set up we use LHCALL_NOTIFY on normal memory to produce
- * console output.
+ * but before that is set up we use the virtio PCI console's backdoor mmio
+ * access and the "emergency" write facility (which is legal even before the
+ * device is configured).
  */
 static __init int early_put_chars(u32 vtermno, const char *buf, int count)
 {
-	char scratch[17];
-	unsigned int len = count;
+	/* If we couldn't find PCI console, forget it. */
+	if (console_cfg_offset < 0)
+		return count;
 
-	/* We use a nul-terminated string, so we make a copy.  Icky, huh? */
-	if (len > sizeof(scratch) - 1)
-		len = sizeof(scratch) - 1;
-	scratch[len] = '\0';
-	memcpy(scratch, buf, len);
-	hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0, 0);
+	if (unlikely(!console_cfg_offset)) {
+		probe_pci_console();
+		if (console_cfg_offset < 0)
+			return count;
+	}
 
-	/* This routine returns the number of bytes actually written. */
-	return len;
+	write_bar_via_cfg(console_access_cap,
+			  console_cfg_offset
+			  + offsetof(struct virtio_console_config, emerg_wr),
+			  buf[0]);
+	return 1;
 }
 
 /*
@@ -1400,14 +1534,6 @@ __init void lguest_init(void)
 	atomic_notifier_chain_register(&panic_notifier_list, &paniced);
 
 	/*
-	 * The IDE code spends about 3 seconds probing for disks: if we reserve
-	 * all the I/O ports up front it can't get them and so doesn't probe.
-	 * Other device drivers are similar (but less severe).  This cuts the
-	 * kernel boot time on my machine from 4.1 seconds to 0.45 seconds.
-	 */
-	paravirt_disable_iospace();
-
-	/*
 	 * This is messy CPU setup stuff which the native boot code does before
 	 * start_kernel, so we have to do, too:
 	 */
@@ -1436,6 +1562,13 @@ __init void lguest_init(void)
 	/* Register our very early console. */
 	virtio_cons_early_init(early_put_chars);
 
+	/* Don't let ACPI try to control our PCI interrupts. */
+	disable_acpi();
+
+	/* We control them ourselves, by overriding these two hooks. */
+	pcibios_enable_irq = lguest_enable_irq;
+	pcibios_disable_irq = lguest_disable_irq;
+
 	/*
 	 * Last of all, we set the power management poweroff hook to point to
 	 * the Guest routine to power off, and the reboot hook to our restart
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 56313a326188..89b53c9968e7 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -53,6 +53,8 @@
 .Lmemcpy_e_e:
 	.previous
 
+.weak memcpy
+
 ENTRY(__memcpy)
 ENTRY(memcpy)
 	CFI_STARTPROC
@@ -199,8 +201,8 @@ ENDPROC(__memcpy)
 	 * only outcome...
 	 */
 	.section .altinstructions, "a"
-	altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
+	altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
 			     .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
-	altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
+	altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
 			     .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
 	.previous
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 65268a6104f4..9c4b530575da 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -24,7 +24,10 @@
  * Output:
  * rax: dest
  */
+.weak memmove
+
 ENTRY(memmove)
+ENTRY(__memmove)
 	CFI_STARTPROC
 
 	/* Handle more 32 bytes in loop */
@@ -220,4 +223,5 @@ ENTRY(memmove)
 		.Lmemmove_end_forward-.Lmemmove_begin_forward,	\
 		.Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
 	.previous
+ENDPROC(__memmove)
 ENDPROC(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 2dcb3808cbda..6f44935c6a60 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -56,6 +56,8 @@
 .Lmemset_e_e:
 	.previous
 
+.weak memset
+
 ENTRY(memset)
 ENTRY(__memset)
 	CFI_STARTPROC
@@ -147,8 +149,8 @@ ENDPROC(__memset)
          * feature to implement the right patch order.
 	 */
 	.section .altinstructions,"a"
-	altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
-			     .Lfinal-memset,.Lmemset_e-.Lmemset_c
-	altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
-			     .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
+	altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
+			     .Lfinal-__memset,.Lmemset_e-.Lmemset_c
+	altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
+			     .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e
 	.previous
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index ecfdc46a024a..c4cc74006c61 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -20,6 +20,9 @@ obj-$(CONFIG_HIGHMEM)		+= highmem_32.o
 
 obj-$(CONFIG_KMEMCHECK)		+= kmemcheck/
 
+KASAN_SANITIZE_kasan_init_$(BITS).o := n
+obj-$(CONFIG_KASAN)		+= kasan_init_$(BITS).o
+
 obj-$(CONFIG_MMIOTRACE)		+= mmiotrace.o
 mmiotrace-y			:= kmmio.o pf_in.o mmio-mod.o
 obj-$(CONFIG_MMIOTRACE_TEST)	+= testmmiotrace.o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index e3ff27a5b634..ede025fb46f1 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -600,7 +600,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 			printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
 		if (pte && pte_present(*pte) && pte_exec(*pte) &&
 				(pgd_flags(*pgd) & _PAGE_USER) &&
-				(read_cr4() & X86_CR4_SMEP))
+				(__read_cr4() & X86_CR4_SMEP))
 			printk(smep_warning, from_kuid(&init_user_ns, current_uid()));
 	}
 
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index d7547824e763..81bf3d2af3eb 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -84,7 +84,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
 		struct page *page;
 
 		/* Similar to the PMD case, NUMA hinting must take slow path */
-		if (pte_numa(pte)) {
+		if (pte_protnone(pte)) {
 			pte_unmap(ptep);
 			return 0;
 		}
@@ -172,13 +172,13 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 		 */
 		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
 			return 0;
-		if (unlikely(pmd_large(pmd))) {
+		if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
 			/*
 			 * NUMA hinting faults need to be handled in the GUP
 			 * slowpath for accounting purposes and so that they
 			 * can be serialised against THP migration.
 			 */
-			if (pmd_numa(pmd))
+			if (pmd_protnone(pmd))
 				return 0;
 			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
 				return 0;
@@ -388,10 +388,9 @@ slow_irqon:
 		start += nr << PAGE_SHIFT;
 		pages += nr;
 
-		down_read(&mm->mmap_sem);
-		ret = get_user_pages(current, mm, start,
-			(end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
-		up_read(&mm->mmap_sem);
+		ret = get_user_pages_unlocked(current, mm, start,
+					      (end - start) >> PAGE_SHIFT,
+					      write, 0, pages);
 
 		/* Have to be a bit careful with return values */
 		if (nr > 0) {
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 8b977ebf9388..42982b26e32b 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -52,23 +52,17 @@ int pud_huge(pud_t pud)
 	return 0;
 }
 
-struct page *
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-		pmd_t *pmd, int write)
-{
-	return NULL;
-}
 #else
 
-struct page *
-follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
-{
-	return ERR_PTR(-EINVAL);
-}
-
+/*
+ * pmd_huge() returns 1 if @pmd is hugetlb related entry, that is normal
+ * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry.
+ * Otherwise, returns 0.
+ */
 int pmd_huge(pmd_t pmd)
 {
-	return !!(pmd_val(pmd) & _PAGE_PSE);
+	return !pmd_none(pmd) &&
+		(pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT;
 }
 
 int pud_huge(pud_t pud)
@@ -178,4 +172,15 @@ static __init int setup_hugepagesz(char *opt)
 	return 1;
 }
 __setup("hugepagesz=", setup_hugepagesz);
+
+#ifdef CONFIG_CMA
+static __init int gigantic_pages_init(void)
+{
+	/* With CMA we can allocate gigantic pages at runtime */
+	if (cpu_has_gbpages && !size_to_hstate(1UL << PUD_SHIFT))
+		hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
+	return 0;
+}
+arch_initcall(gigantic_pages_init);
+#endif
 #endif
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 079c3b6a3ff1..a110efca6d06 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -173,11 +173,11 @@ static void __init probe_page_size_mask(void)
 
 	/* Enable PSE if available */
 	if (cpu_has_pse)
-		set_in_cr4(X86_CR4_PSE);
+		cr4_set_bits_and_update_boot(X86_CR4_PSE);
 
 	/* Enable PGE if available */
 	if (cpu_has_pge) {
-		set_in_cr4(X86_CR4_PGE);
+		cr4_set_bits_and_update_boot(X86_CR4_PGE);
 		__supported_pte_mask |= _PAGE_GLOBAL;
 	}
 }
@@ -238,6 +238,31 @@ static void __init_refok adjust_range_page_size_mask(struct map_range *mr,
 	}
 }
 
+static const char *page_size_string(struct map_range *mr)
+{
+	static const char str_1g[] = "1G";
+	static const char str_2m[] = "2M";
+	static const char str_4m[] = "4M";
+	static const char str_4k[] = "4k";
+
+	if (mr->page_size_mask & (1<<PG_LEVEL_1G))
+		return str_1g;
+	/*
+	 * 32-bit without PAE has a 4M large page size.
+	 * PG_LEVEL_2M is misnamed, but we can at least
+	 * print out the right size in the string.
+	 */
+	if (IS_ENABLED(CONFIG_X86_32) &&
+	    !IS_ENABLED(CONFIG_X86_PAE) &&
+	    mr->page_size_mask & (1<<PG_LEVEL_2M))
+		return str_4m;
+
+	if (mr->page_size_mask & (1<<PG_LEVEL_2M))
+		return str_2m;
+
+	return str_4k;
+}
+
 static int __meminit split_mem_range(struct map_range *mr, int nr_range,
 				     unsigned long start,
 				     unsigned long end)
@@ -333,8 +358,7 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range,
 	for (i = 0; i < nr_range; i++)
 		printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n",
 				mr[i].start, mr[i].end - 1,
-			(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
-			 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
+				page_size_string(&mr[i]));
 
 	return nr_range;
 }
@@ -608,7 +632,7 @@ void __init init_mem_mapping(void)
  *
  *
  * On x86, access has to be given to the first megabyte of ram because that area
- * contains bios code and data regions used by X and dosemu and similar apps.
+ * contains BIOS code and data regions used by X and dosemu and similar apps.
  * Access has to be given to non-kernel-ram areas as well, these contain the PCI
  * mmio resources as well as potential bios/acpi data regions.
  */
@@ -713,6 +737,15 @@ void __init zone_sizes_init(void)
 	free_area_init_nodes(max_zone_pfns);
 }
 
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+#ifdef CONFIG_SMP
+	.active_mm = &init_mm,
+	.state = 0,
+#endif
+	.cr4 = ~0UL,	/* fail hard if we screw up cr4 shadow initialization */
+};
+EXPORT_SYMBOL_GPL(cpu_tlbstate);
+
 void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
 {
 	/* entry 0 MUST be WB (hardwired to speed up translations) */
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
new file mode 100644
index 000000000000..4860906c6b9f
--- /dev/null
+++ b/arch/x86/mm/kasan_init_64.c
@@ -0,0 +1,206 @@
+#include <linux/bootmem.h>
+#include <linux/kasan.h>
+#include <linux/kdebug.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+
+#include <asm/tlbflush.h>
+#include <asm/sections.h>
+
+extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern struct range pfn_mapped[E820_X_MAX];
+
+extern unsigned char kasan_zero_page[PAGE_SIZE];
+
+static int __init map_range(struct range *range)
+{
+	unsigned long start;
+	unsigned long end;
+
+	start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start));
+	end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end));
+
+	/*
+	 * end + 1 here is intentional. We check several shadow bytes in advance
+	 * to slightly speed up fastpath. In some rare cases we could cross
+	 * boundary of mapped shadow, so we just map some more here.
+	 */
+	return vmemmap_populate(start, end + 1, NUMA_NO_NODE);
+}
+
+static void __init clear_pgds(unsigned long start,
+			unsigned long end)
+{
+	for (; start < end; start += PGDIR_SIZE)
+		pgd_clear(pgd_offset_k(start));
+}
+
+void __init kasan_map_early_shadow(pgd_t *pgd)
+{
+	int i;
+	unsigned long start = KASAN_SHADOW_START;
+	unsigned long end = KASAN_SHADOW_END;
+
+	for (i = pgd_index(start); start < end; i++) {
+		pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud)
+				| _KERNPG_TABLE);
+		start += PGDIR_SIZE;
+	}
+}
+
+static int __init zero_pte_populate(pmd_t *pmd, unsigned long addr,
+				unsigned long end)
+{
+	pte_t *pte = pte_offset_kernel(pmd, addr);
+
+	while (addr + PAGE_SIZE <= end) {
+		WARN_ON(!pte_none(*pte));
+		set_pte(pte, __pte(__pa_nodebug(kasan_zero_page)
+					| __PAGE_KERNEL_RO));
+		addr += PAGE_SIZE;
+		pte = pte_offset_kernel(pmd, addr);
+	}
+	return 0;
+}
+
+static int __init zero_pmd_populate(pud_t *pud, unsigned long addr,
+				unsigned long end)
+{
+	int ret = 0;
+	pmd_t *pmd = pmd_offset(pud, addr);
+
+	while (IS_ALIGNED(addr, PMD_SIZE) && addr + PMD_SIZE <= end) {
+		WARN_ON(!pmd_none(*pmd));
+		set_pmd(pmd, __pmd(__pa_nodebug(kasan_zero_pte)
+					| __PAGE_KERNEL_RO));
+		addr += PMD_SIZE;
+		pmd = pmd_offset(pud, addr);
+	}
+	if (addr < end) {
+		if (pmd_none(*pmd)) {
+			void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
+			if (!p)
+				return -ENOMEM;
+			set_pmd(pmd, __pmd(__pa_nodebug(p) | _KERNPG_TABLE));
+		}
+		ret = zero_pte_populate(pmd, addr, end);
+	}
+	return ret;
+}
+
+
+static int __init zero_pud_populate(pgd_t *pgd, unsigned long addr,
+				unsigned long end)
+{
+	int ret = 0;
+	pud_t *pud = pud_offset(pgd, addr);
+
+	while (IS_ALIGNED(addr, PUD_SIZE) && addr + PUD_SIZE <= end) {
+		WARN_ON(!pud_none(*pud));
+		set_pud(pud, __pud(__pa_nodebug(kasan_zero_pmd)
+					| __PAGE_KERNEL_RO));
+		addr += PUD_SIZE;
+		pud = pud_offset(pgd, addr);
+	}
+
+	if (addr < end) {
+		if (pud_none(*pud)) {
+			void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
+			if (!p)
+				return -ENOMEM;
+			set_pud(pud, __pud(__pa_nodebug(p) | _KERNPG_TABLE));
+		}
+		ret = zero_pmd_populate(pud, addr, end);
+	}
+	return ret;
+}
+
+static int __init zero_pgd_populate(unsigned long addr, unsigned long end)
+{
+	int ret = 0;
+	pgd_t *pgd = pgd_offset_k(addr);
+
+	while (IS_ALIGNED(addr, PGDIR_SIZE) && addr + PGDIR_SIZE <= end) {
+		WARN_ON(!pgd_none(*pgd));
+		set_pgd(pgd, __pgd(__pa_nodebug(kasan_zero_pud)
+					| __PAGE_KERNEL_RO));
+		addr += PGDIR_SIZE;
+		pgd = pgd_offset_k(addr);
+	}
+
+	if (addr < end) {
+		if (pgd_none(*pgd)) {
+			void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
+			if (!p)
+				return -ENOMEM;
+			set_pgd(pgd, __pgd(__pa_nodebug(p) | _KERNPG_TABLE));
+		}
+		ret = zero_pud_populate(pgd, addr, end);
+	}
+	return ret;
+}
+
+
+static void __init populate_zero_shadow(const void *start, const void *end)
+{
+	if (zero_pgd_populate((unsigned long)start, (unsigned long)end))
+		panic("kasan: unable to map zero shadow!");
+}
+
+
+#ifdef CONFIG_KASAN_INLINE
+static int kasan_die_handler(struct notifier_block *self,
+			     unsigned long val,
+			     void *data)
+{
+	if (val == DIE_GPF) {
+		pr_emerg("CONFIG_KASAN_INLINE enabled");
+		pr_emerg("GPF could be caused by NULL-ptr deref or user memory access");
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block kasan_die_notifier = {
+	.notifier_call = kasan_die_handler,
+};
+#endif
+
+void __init kasan_init(void)
+{
+	int i;
+
+#ifdef CONFIG_KASAN_INLINE
+	register_die_notifier(&kasan_die_notifier);
+#endif
+
+	memcpy(early_level4_pgt, init_level4_pgt, sizeof(early_level4_pgt));
+	load_cr3(early_level4_pgt);
+
+	clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
+
+	populate_zero_shadow((void *)KASAN_SHADOW_START,
+			kasan_mem_to_shadow((void *)PAGE_OFFSET));
+
+	for (i = 0; i < E820_X_MAX; i++) {
+		if (pfn_mapped[i].end == 0)
+			break;
+
+		if (map_range(&pfn_mapped[i]))
+			panic("kasan: unable to allocate shadow!");
+	}
+	populate_zero_shadow(kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
+			kasan_mem_to_shadow((void *)__START_KERNEL_map));
+
+	vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext),
+			(unsigned long)kasan_mem_to_shadow(_end),
+			NUMA_NO_NODE);
+
+	populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
+			(void *)KASAN_SHADOW_END);
+
+	memset(kasan_zero_page, 0, PAGE_SIZE);
+
+	load_cr3(init_level4_pgt);
+	init_task.kasan_depth = 0;
+}
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 919b91205cd4..df4552bd239e 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -35,12 +35,12 @@ struct va_alignment __read_mostly va_align = {
 	.flags = -1,
 };
 
-static unsigned int stack_maxrandom_size(void)
+static unsigned long stack_maxrandom_size(void)
 {
-	unsigned int max = 0;
+	unsigned long max = 0;
 	if ((current->flags & PF_RANDOMIZE) &&
 		!(current->personality & ADDR_NO_RANDOMIZE)) {
-		max = ((-1U) & STACK_RND_MASK) << PAGE_SHIFT;
+		max = ((-1UL) & STACK_RND_MASK) << PAGE_SHIFT;
 	}
 
 	return max;
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 1a883705a12a..cd4785bbacb9 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -794,7 +794,6 @@ int early_cpu_to_node(int cpu)
 void debug_cpumask_set_cpu(int cpu, int node, bool enable)
 {
 	struct cpumask *mask;
-	char buf[64];
 
 	if (node == NUMA_NO_NODE) {
 		/* early_cpu_to_node() already emits a warning and trace */
@@ -812,10 +811,9 @@ void debug_cpumask_set_cpu(int cpu, int node, bool enable)
 	else
 		cpumask_clear_cpu(cpu, mask);
 
-	cpulist_scnprintf(buf, sizeof(buf), mask);
-	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
+	printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n",
 		enable ? "numa_add_cpu" : "numa_remove_cpu",
-		cpu, node, buf);
+		cpu, node, cpumask_pr_args(mask));
 	return;
 }
 
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 6fb6927f9e76..7b22adaad4f1 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -190,7 +190,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
 
 #endif	/* CONFIG_X86_PAE */
 
-static void free_pmds(pmd_t *pmds[])
+static void free_pmds(struct mm_struct *mm, pmd_t *pmds[])
 {
 	int i;
 
@@ -198,10 +198,11 @@ static void free_pmds(pmd_t *pmds[])
 		if (pmds[i]) {
 			pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
 			free_page((unsigned long)pmds[i]);
+			mm_dec_nr_pmds(mm);
 		}
 }
 
-static int preallocate_pmds(pmd_t *pmds[])
+static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
 {
 	int i;
 	bool failed = false;
@@ -215,11 +216,13 @@ static int preallocate_pmds(pmd_t *pmds[])
 			pmd = NULL;
 			failed = true;
 		}
+		if (pmd)
+			mm_inc_nr_pmds(mm);
 		pmds[i] = pmd;
 	}
 
 	if (failed) {
-		free_pmds(pmds);
+		free_pmds(mm, pmds);
 		return -ENOMEM;
 	}
 
@@ -246,6 +249,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
 
 			paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
 			pmd_free(mm, pmd);
+			mm_dec_nr_pmds(mm);
 		}
 	}
 }
@@ -283,7 +287,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 
 	mm->pgd = pgd;
 
-	if (preallocate_pmds(pmds) != 0)
+	if (preallocate_pmds(mm, pmds) != 0)
 		goto out_free_pgd;
 
 	if (paravirt_pgd_alloc(mm) != 0)
@@ -304,7 +308,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	return pgd;
 
 out_free_pmds:
-	free_pmds(pmds);
+	free_pmds(mm, pmds);
 out_free_pgd:
 	free_page((unsigned long)pgd);
 out:
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index ee61c36d64f8..3250f2371aea 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -14,9 +14,6 @@
 #include <asm/uv/uv.h>
 #include <linux/debugfs.h>
 
-DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
-			= { &init_mm, 0, };
-
 /*
  *	Smarter SMP flushing macros.
  *		c/o Linus Torvalds.
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index cfd1b132b8e3..e4695985f9de 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -10,9 +10,6 @@
 struct pci_root_info {
 	struct acpi_device *bridge;
 	char name[16];
-	unsigned int res_num;
-	struct resource *res;
-	resource_size_t *res_offset;
 	struct pci_sysdata sd;
 #ifdef	CONFIG_PCI_MMCONFIG
 	bool mcfg_added;
@@ -218,130 +215,41 @@ static void teardown_mcfg_map(struct pci_root_info *info)
 }
 #endif
 
-static acpi_status resource_to_addr(struct acpi_resource *resource,
-				    struct acpi_resource_address64 *addr)
-{
-	acpi_status status;
-	struct acpi_resource_memory24 *memory24;
-	struct acpi_resource_memory32 *memory32;
-	struct acpi_resource_fixed_memory32 *fixed_memory32;
-
-	memset(addr, 0, sizeof(*addr));
-	switch (resource->type) {
-	case ACPI_RESOURCE_TYPE_MEMORY24:
-		memory24 = &resource->data.memory24;
-		addr->resource_type = ACPI_MEMORY_RANGE;
-		addr->minimum = memory24->minimum;
-		addr->address_length = memory24->address_length;
-		addr->maximum = addr->minimum + addr->address_length - 1;
-		return AE_OK;
-	case ACPI_RESOURCE_TYPE_MEMORY32:
-		memory32 = &resource->data.memory32;
-		addr->resource_type = ACPI_MEMORY_RANGE;
-		addr->minimum = memory32->minimum;
-		addr->address_length = memory32->address_length;
-		addr->maximum = addr->minimum + addr->address_length - 1;
-		return AE_OK;
-	case ACPI_RESOURCE_TYPE_FIXED_MEMORY32:
-		fixed_memory32 = &resource->data.fixed_memory32;
-		addr->resource_type = ACPI_MEMORY_RANGE;
-		addr->minimum = fixed_memory32->address;
-		addr->address_length = fixed_memory32->address_length;
-		addr->maximum = addr->minimum + addr->address_length - 1;
-		return AE_OK;
-	case ACPI_RESOURCE_TYPE_ADDRESS16:
-	case ACPI_RESOURCE_TYPE_ADDRESS32:
-	case ACPI_RESOURCE_TYPE_ADDRESS64:
-		status = acpi_resource_to_address64(resource, addr);
-		if (ACPI_SUCCESS(status) &&
-		    (addr->resource_type == ACPI_MEMORY_RANGE ||
-		    addr->resource_type == ACPI_IO_RANGE) &&
-		    addr->address_length > 0) {
-			return AE_OK;
-		}
-		break;
-	}
-	return AE_ERROR;
-}
-
-static acpi_status count_resource(struct acpi_resource *acpi_res, void *data)
+static void validate_resources(struct device *dev, struct list_head *crs_res,
+			       unsigned long type)
 {
-	struct pci_root_info *info = data;
-	struct acpi_resource_address64 addr;
-	acpi_status status;
-
-	status = resource_to_addr(acpi_res, &addr);
-	if (ACPI_SUCCESS(status))
-		info->res_num++;
-	return AE_OK;
-}
-
-static acpi_status setup_resource(struct acpi_resource *acpi_res, void *data)
-{
-	struct pci_root_info *info = data;
-	struct resource *res;
-	struct acpi_resource_address64 addr;
-	acpi_status status;
-	unsigned long flags;
-	u64 start, orig_end, end;
-
-	status = resource_to_addr(acpi_res, &addr);
-	if (!ACPI_SUCCESS(status))
-		return AE_OK;
-
-	if (addr.resource_type == ACPI_MEMORY_RANGE) {
-		flags = IORESOURCE_MEM;
-		if (addr.info.mem.caching == ACPI_PREFETCHABLE_MEMORY)
-			flags |= IORESOURCE_PREFETCH;
-	} else if (addr.resource_type == ACPI_IO_RANGE) {
-		flags = IORESOURCE_IO;
-	} else
-		return AE_OK;
-
-	start = addr.minimum + addr.translation_offset;
-	orig_end = end = addr.maximum + addr.translation_offset;
-
-	/* Exclude non-addressable range or non-addressable portion of range */
-	end = min(end, (u64)iomem_resource.end);
-	if (end <= start) {
-		dev_info(&info->bridge->dev,
-			"host bridge window [%#llx-%#llx] "
-			"(ignored, not CPU addressable)\n", start, orig_end);
-		return AE_OK;
-	} else if (orig_end != end) {
-		dev_info(&info->bridge->dev,
-			"host bridge window [%#llx-%#llx] "
-			"([%#llx-%#llx] ignored, not CPU addressable)\n", 
-			start, orig_end, end + 1, orig_end);
-	}
+	LIST_HEAD(list);
+	struct resource *res1, *res2, *root = NULL;
+	struct resource_entry *tmp, *entry, *entry2;
 
-	res = &info->res[info->res_num];
-	res->name = info->name;
-	res->flags = flags;
-	res->start = start;
-	res->end = end;
-	info->res_offset[info->res_num] = addr.translation_offset;
-	info->res_num++;
+	BUG_ON((type & (IORESOURCE_MEM | IORESOURCE_IO)) == 0);
+	root = (type & IORESOURCE_MEM) ? &iomem_resource : &ioport_resource;
 
-	if (!pci_use_crs)
-		dev_printk(KERN_DEBUG, &info->bridge->dev,
-			   "host bridge window %pR (ignored)\n", res);
+	list_splice_init(crs_res, &list);
+	resource_list_for_each_entry_safe(entry, tmp, &list) {
+		bool free = false;
+		resource_size_t end;
 
-	return AE_OK;
-}
-
-static void coalesce_windows(struct pci_root_info *info, unsigned long type)
-{
-	int i, j;
-	struct resource *res1, *res2;
-
-	for (i = 0; i < info->res_num; i++) {
-		res1 = &info->res[i];
+		res1 = entry->res;
 		if (!(res1->flags & type))
-			continue;
+			goto next;
+
+		/* Exclude non-addressable range or non-addressable portion */
+		end = min(res1->end, root->end);
+		if (end <= res1->start) {
+			dev_info(dev, "host bridge window %pR (ignored, not CPU addressable)\n",
+				 res1);
+			free = true;
+			goto next;
+		} else if (res1->end != end) {
+			dev_info(dev, "host bridge window %pR ([%#llx-%#llx] ignored, not CPU addressable)\n",
+				 res1, (unsigned long long)end + 1,
+				 (unsigned long long)res1->end);
+			res1->end = end;
+		}
 
-		for (j = i + 1; j < info->res_num; j++) {
-			res2 = &info->res[j];
+		resource_list_for_each_entry(entry2, crs_res) {
+			res2 = entry2->res;
 			if (!(res2->flags & type))
 				continue;
 
@@ -353,118 +261,97 @@ static void coalesce_windows(struct pci_root_info *info, unsigned long type)
 			if (resource_overlaps(res1, res2)) {
 				res2->start = min(res1->start, res2->start);
 				res2->end = max(res1->end, res2->end);
-				dev_info(&info->bridge->dev,
-					 "host bridge window expanded to %pR; %pR ignored\n",
+				dev_info(dev, "host bridge window expanded to %pR; %pR ignored\n",
 					 res2, res1);
-				res1->flags = 0;
+				free = true;
+				goto next;
 			}
 		}
+
+next:
+		resource_list_del(entry);
+		if (free)
+			resource_list_free_entry(entry);
+		else
+			resource_list_add_tail(entry, crs_res);
 	}
 }
 
 static void add_resources(struct pci_root_info *info,
-			  struct list_head *resources)
+			  struct list_head *resources,
+			  struct list_head *crs_res)
 {
-	int i;
-	struct resource *res, *root, *conflict;
+	struct resource_entry *entry, *tmp;
+	struct resource *res, *conflict, *root = NULL;
 
-	coalesce_windows(info, IORESOURCE_MEM);
-	coalesce_windows(info, IORESOURCE_IO);
-
-	for (i = 0; i < info->res_num; i++) {
-		res = &info->res[i];
+	validate_resources(&info->bridge->dev, crs_res, IORESOURCE_MEM);
+	validate_resources(&info->bridge->dev, crs_res, IORESOURCE_IO);
 
+	resource_list_for_each_entry_safe(entry, tmp, crs_res) {
+		res = entry->res;
 		if (res->flags & IORESOURCE_MEM)
 			root = &iomem_resource;
 		else if (res->flags & IORESOURCE_IO)
 			root = &ioport_resource;
 		else
-			continue;
+			BUG_ON(res);
 
 		conflict = insert_resource_conflict(root, res);
-		if (conflict)
+		if (conflict) {
 			dev_info(&info->bridge->dev,
 				 "ignoring host bridge window %pR (conflicts with %s %pR)\n",
 				 res, conflict->name, conflict);
-		else
-			pci_add_resource_offset(resources, res,
-					info->res_offset[i]);
+			resource_list_destroy_entry(entry);
+		}
 	}
-}
 
-static void free_pci_root_info_res(struct pci_root_info *info)
-{
-	kfree(info->res);
-	info->res = NULL;
-	kfree(info->res_offset);
-	info->res_offset = NULL;
-	info->res_num = 0;
+	list_splice_tail(crs_res, resources);
 }
 
-static void __release_pci_root_info(struct pci_root_info *info)
+static void release_pci_root_info(struct pci_host_bridge *bridge)
 {
-	int i;
 	struct resource *res;
+	struct resource_entry *entry;
+	struct pci_root_info *info = bridge->release_data;
 
-	for (i = 0; i < info->res_num; i++) {
-		res = &info->res[i];
-
-		if (!res->parent)
-			continue;
-
-		if (!(res->flags & (IORESOURCE_MEM | IORESOURCE_IO)))
-			continue;
-
-		release_resource(res);
+	resource_list_for_each_entry(entry, &bridge->windows) {
+		res = entry->res;
+		if (res->parent &&
+		    (res->flags & (IORESOURCE_MEM | IORESOURCE_IO)))
+			release_resource(res);
 	}
 
-	free_pci_root_info_res(info);
-
 	teardown_mcfg_map(info);
-
 	kfree(info);
 }
 
-static void release_pci_root_info(struct pci_host_bridge *bridge)
-{
-	struct pci_root_info *info = bridge->release_data;
-
-	__release_pci_root_info(info);
-}
-
 static void probe_pci_root_info(struct pci_root_info *info,
 				struct acpi_device *device,
-				int busnum, int domain)
+				int busnum, int domain,
+				struct list_head *list)
 {
-	size_t size;
+	int ret;
+	struct resource_entry *entry, *tmp;
 
 	sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum);
 	info->bridge = device;
-
-	info->res_num = 0;
-	acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource,
-				info);
-	if (!info->res_num)
-		return;
-
-	size = sizeof(*info->res) * info->res_num;
-	info->res = kzalloc_node(size, GFP_KERNEL, info->sd.node);
-	if (!info->res) {
-		info->res_num = 0;
-		return;
-	}
-
-	size = sizeof(*info->res_offset) * info->res_num;
-	info->res_num = 0;
-	info->res_offset = kzalloc_node(size, GFP_KERNEL, info->sd.node);
-	if (!info->res_offset) {
-		kfree(info->res);
-		info->res = NULL;
-		return;
-	}
-
-	acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource,
-				info);
+	ret = acpi_dev_get_resources(device, list,
+				     acpi_dev_filter_resource_type_cb,
+				     (void *)(IORESOURCE_IO | IORESOURCE_MEM));
+	if (ret < 0)
+		dev_warn(&device->dev,
+			 "failed to parse _CRS method, error code %d\n", ret);
+	else if (ret == 0)
+		dev_dbg(&device->dev,
+			"no IO and memory resources present in _CRS\n");
+	else
+		resource_list_for_each_entry_safe(entry, tmp, list) {
+			if ((entry->res->flags & IORESOURCE_WINDOW) == 0 ||
+			    (entry->res->flags & IORESOURCE_DISABLED))
+				resource_list_destroy_entry(entry);
+			else
+				entry->res->name = info->name;
+		}
 }
 
 struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
@@ -473,6 +360,8 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
 	struct pci_root_info *info;
 	int domain = root->segment;
 	int busnum = root->secondary.start;
+	struct resource_entry *res_entry;
+	LIST_HEAD(crs_res);
 	LIST_HEAD(resources);
 	struct pci_bus *bus;
 	struct pci_sysdata *sd;
@@ -520,18 +409,22 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
 		memcpy(bus->sysdata, sd, sizeof(*sd));
 		kfree(info);
 	} else {
-		probe_pci_root_info(info, device, busnum, domain);
-
 		/* insert busn res at first */
 		pci_add_resource(&resources,  &root->secondary);
+
 		/*
 		 * _CRS with no apertures is normal, so only fall back to
 		 * defaults or native bridge info if we're ignoring _CRS.
 		 */
-		if (pci_use_crs)
-			add_resources(info, &resources);
-		else {
-			free_pci_root_info_res(info);
+		probe_pci_root_info(info, device, busnum, domain, &crs_res);
+		if (pci_use_crs) {
+			add_resources(info, &resources, &crs_res);
+		} else {
+			resource_list_for_each_entry(res_entry, &crs_res)
+				dev_printk(KERN_DEBUG, &device->dev,
+					   "host bridge window %pR (ignored)\n",
+					   res_entry->res);
+			resource_list_free(&crs_res);
 			x86_pci_root_bus_resources(busnum, &resources);
 		}
 
@@ -546,8 +439,9 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
 				to_pci_host_bridge(bus->bridge),
 				release_pci_root_info, info);
 		} else {
-			pci_free_resource_list(&resources);
-			__release_pci_root_info(info);
+			resource_list_free(&resources);
+			teardown_mcfg_map(info);
+			kfree(info);
 		}
 	}
 
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
index f3a2cfc14125..7bcf06a7cd12 100644
--- a/arch/x86/pci/bus_numa.c
+++ b/arch/x86/pci/bus_numa.c
@@ -31,7 +31,7 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources)
 {
 	struct pci_root_info *info = x86_find_pci_root_info(bus);
 	struct pci_root_res *root_res;
-	struct pci_host_bridge_window *window;
+	struct resource_entry *window;
 	bool found = false;
 
 	if (!info)
@@ -41,7 +41,7 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources)
 	       bus);
 
 	/* already added by acpi ? */
-	list_for_each_entry(window, resources, list)
+	resource_list_for_each_entry(window, resources)
 		if (window->res->flags & IORESOURCE_BUS) {
 			found = true;
 			break;
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 676e5e04e4d4..dd30b7e08bc2 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -397,12 +397,12 @@ static acpi_status check_mcfg_resource(struct acpi_resource *res, void *data)
 
 	status = acpi_resource_to_address64(res, &address);
 	if (ACPI_FAILURE(status) ||
-	   (address.address_length <= 0) ||
+	   (address.address.address_length <= 0) ||
 	   (address.resource_type != ACPI_MEMORY_RANGE))
 		return AE_OK;
 
-	if ((mcfg_res->start >= address.minimum) &&
-	    (mcfg_res->end < (address.minimum + address.address_length))) {
+	if ((mcfg_res->start >= address.address.minimum) &&
+	    (mcfg_res->end < (address.address.minimum + address.address.address_length))) {
 		mcfg_res->flags = 1;
 		return AE_CTRL_TERMINATE;
 	}
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 9098d880c476..d22f4b5bbc04 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -298,12 +298,16 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 			map_irq.entry_nr = nvec;
 		} else if (type == PCI_CAP_ID_MSIX) {
 			int pos;
+			unsigned long flags;
 			u32 table_offset, bir;
 
 			pos = dev->msix_cap;
 			pci_read_config_dword(dev, pos + PCI_MSIX_TABLE,
 					      &table_offset);
 			bir = (u8)(table_offset & PCI_MSIX_TABLE_BIR);
+			flags = pci_resource_flags(dev, bir);
+			if (!flags || (flags & IORESOURCE_UNSET))
+				return -EINVAL;
 
 			map_irq.table_base = pci_resource_start(dev, bir);
 			map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile
index 85afde1fa3e5..a62e0be3a2f1 100644
--- a/arch/x86/platform/Makefile
+++ b/arch/x86/platform/Makefile
@@ -5,6 +5,7 @@ obj-y	+= geode/
 obj-y	+= goldfish/
 obj-y	+= iris/
 obj-y	+= intel-mid/
+obj-y	+= intel-quark/
 obj-y	+= olpc/
 obj-y	+= scx200/
 obj-y	+= sfi/
diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S
index 5fcda7272550..86d0f9e08dd9 100644
--- a/arch/x86/platform/efi/efi_stub_64.S
+++ b/arch/x86/platform/efi/efi_stub_64.S
@@ -91,167 +91,6 @@ ENTRY(efi_call)
 	ret
 ENDPROC(efi_call)
 
-#ifdef CONFIG_EFI_MIXED
-
-/*
- * We run this function from the 1:1 mapping.
- *
- * This function must be invoked with a 1:1 mapped stack.
- */
-ENTRY(__efi64_thunk)
-	movl	%ds, %eax
-	push	%rax
-	movl	%es, %eax
-	push	%rax
-	movl	%ss, %eax
-	push	%rax
-
-	subq	$32, %rsp
-	movl	%esi, 0x0(%rsp)
-	movl	%edx, 0x4(%rsp)
-	movl	%ecx, 0x8(%rsp)
-	movq	%r8, %rsi
-	movl	%esi, 0xc(%rsp)
-	movq	%r9, %rsi
-	movl	%esi,  0x10(%rsp)
-
-	sgdt	save_gdt(%rip)
-
-	leaq	1f(%rip), %rbx
-	movq	%rbx, func_rt_ptr(%rip)
-
-	/* Switch to gdt with 32-bit segments */
-	movl	64(%rsp), %eax
-	lgdt	(%rax)
-
-	leaq	efi_enter32(%rip), %rax
-	pushq	$__KERNEL_CS
-	pushq	%rax
-	lretq
-
-1:	addq	$32, %rsp
-
-	lgdt	save_gdt(%rip)
-
-	pop	%rbx
-	movl	%ebx, %ss
-	pop	%rbx
-	movl	%ebx, %es
-	pop	%rbx
-	movl	%ebx, %ds
-
-	/*
-	 * Convert 32-bit status code into 64-bit.
-	 */
-	test	%rax, %rax
-	jz	1f
-	movl	%eax, %ecx
-	andl	$0x0fffffff, %ecx
-	andl	$0xf0000000, %eax
-	shl	$32, %rax
-	or	%rcx, %rax
-1:
-	ret
-ENDPROC(__efi64_thunk)
-
-ENTRY(efi_exit32)
-	movq	func_rt_ptr(%rip), %rax
-	push	%rax
-	mov	%rdi, %rax
-	ret
-ENDPROC(efi_exit32)
-
-	.code32
-/*
- * EFI service pointer must be in %edi.
- *
- * The stack should represent the 32-bit calling convention.
- */
-ENTRY(efi_enter32)
-	movl	$__KERNEL_DS, %eax
-	movl	%eax, %ds
-	movl	%eax, %es
-	movl	%eax, %ss
-
-	/* Reload pgtables */
-	movl	%cr3, %eax
-	movl	%eax, %cr3
-
-	/* Disable paging */
-	movl	%cr0, %eax
-	btrl	$X86_CR0_PG_BIT, %eax
-	movl	%eax, %cr0
-
-	/* Disable long mode via EFER */
-	movl	$MSR_EFER, %ecx
-	rdmsr
-	btrl	$_EFER_LME, %eax
-	wrmsr
-
-	call	*%edi
-
-	/* We must preserve return value */
-	movl	%eax, %edi
-
-	/*
-	 * Some firmware will return with interrupts enabled. Be sure to
-	 * disable them before we switch GDTs.
-	 */
-	cli
-
-	movl	68(%esp), %eax
-	movl	%eax, 2(%eax)
-	lgdtl	(%eax)
-
-	movl	%cr4, %eax
-	btsl	$(X86_CR4_PAE_BIT), %eax
-	movl	%eax, %cr4
-
-	movl	%cr3, %eax
-	movl	%eax, %cr3
-
-	movl	$MSR_EFER, %ecx
-	rdmsr
-	btsl	$_EFER_LME, %eax
-	wrmsr
-
-	xorl	%eax, %eax
-	lldt	%ax
-
-	movl	72(%esp), %eax
-	pushl	$__KERNEL_CS
-	pushl	%eax
-
-	/* Enable paging */
-	movl	%cr0, %eax
-	btsl	$X86_CR0_PG_BIT, %eax
-	movl	%eax, %cr0
-	lret
-ENDPROC(efi_enter32)
-
-	.data
-	.balign	8
-	.global	efi32_boot_gdt
-efi32_boot_gdt:	.word	0
-		.quad	0
-
-save_gdt:	.word	0
-		.quad	0
-func_rt_ptr:	.quad	0
-
-	.global efi_gdt64
-efi_gdt64:
-	.word	efi_gdt64_end - efi_gdt64
-	.long	0			/* Filled out by user */
-	.word	0
-	.quad	0x0000000000000000	/* NULL descriptor */
-	.quad	0x00af9a000000ffff	/* __KERNEL_CS */
-	.quad	0x00cf92000000ffff	/* __KERNEL_DS */
-	.quad	0x0080890000000000	/* TS descriptor */
-	.quad   0x0000000000000000	/* TS continued */
-efi_gdt64_end:
-#endif /* CONFIG_EFI_MIXED */
-
 	.data
 ENTRY(efi_scratch)
 	.fill 3,8,0
diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S
index 8806fa73e6e6..ff85d28c50f2 100644
--- a/arch/x86/platform/efi/efi_thunk_64.S
+++ b/arch/x86/platform/efi/efi_thunk_64.S
@@ -1,9 +1,26 @@
 /*
  * Copyright (C) 2014 Intel Corporation; author Matt Fleming
+ *
+ * Support for invoking 32-bit EFI runtime services from a 64-bit
+ * kernel.
+ *
+ * The below thunking functions are only used after ExitBootServices()
+ * has been called. This simplifies things considerably as compared with
+ * the early EFI thunking because we can leave all the kernel state
+ * intact (GDT, IDT, etc) and simply invoke the the 32-bit EFI runtime
+ * services from __KERNEL32_CS. This means we can continue to service
+ * interrupts across an EFI mixed mode call.
+ *
+ * We do however, need to handle the fact that we're running in a full
+ * 64-bit virtual address space. Things like the stack and instruction
+ * addresses need to be accessible by the 32-bit firmware, so we rely on
+ * using the identity mappings in the EFI page table to access the stack
+ * and kernel text (see efi_setup_page_tables()).
  */
 
 #include <linux/linkage.h>
 #include <asm/page_types.h>
+#include <asm/segment.h>
 
 	.text
 	.code64
@@ -33,14 +50,6 @@ ENTRY(efi64_thunk)
 	leaq	efi_exit32(%rip), %rbx
 	subq	%rax, %rbx
 	movl	%ebx, 8(%rsp)
-	leaq	efi_gdt64(%rip), %rbx
-	subq	%rax, %rbx
-	movl	%ebx, 2(%ebx)
-	movl	%ebx, 4(%rsp)
-	leaq	efi_gdt32(%rip), %rbx
-	subq	%rax, %rbx
-	movl	%ebx, 2(%ebx)
-	movl	%ebx, (%rsp)
 
 	leaq	__efi64_thunk(%rip), %rbx
 	subq	%rax, %rbx
@@ -52,14 +61,92 @@ ENTRY(efi64_thunk)
 	retq
 ENDPROC(efi64_thunk)
 
-	.data
-efi_gdt32:
-	.word 	efi_gdt32_end - efi_gdt32
-	.long	0			/* Filled out above */
-	.word	0
-	.quad	0x0000000000000000	/* NULL descriptor */
-	.quad	0x00cf9a000000ffff	/* __KERNEL_CS */
-	.quad	0x00cf93000000ffff	/* __KERNEL_DS */
-efi_gdt32_end:
+/*
+ * We run this function from the 1:1 mapping.
+ *
+ * This function must be invoked with a 1:1 mapped stack.
+ */
+ENTRY(__efi64_thunk)
+	movl	%ds, %eax
+	push	%rax
+	movl	%es, %eax
+	push	%rax
+	movl	%ss, %eax
+	push	%rax
+
+	subq	$32, %rsp
+	movl	%esi, 0x0(%rsp)
+	movl	%edx, 0x4(%rsp)
+	movl	%ecx, 0x8(%rsp)
+	movq	%r8, %rsi
+	movl	%esi, 0xc(%rsp)
+	movq	%r9, %rsi
+	movl	%esi,  0x10(%rsp)
+
+	leaq	1f(%rip), %rbx
+	movq	%rbx, func_rt_ptr(%rip)
+
+	/* Switch to 32-bit descriptor */
+	pushq	$__KERNEL32_CS
+	leaq	efi_enter32(%rip), %rax
+	pushq	%rax
+	lretq
+
+1:	addq	$32, %rsp
+
+	pop	%rbx
+	movl	%ebx, %ss
+	pop	%rbx
+	movl	%ebx, %es
+	pop	%rbx
+	movl	%ebx, %ds
 
+	/*
+	 * Convert 32-bit status code into 64-bit.
+	 */
+	test	%rax, %rax
+	jz	1f
+	movl	%eax, %ecx
+	andl	$0x0fffffff, %ecx
+	andl	$0xf0000000, %eax
+	shl	$32, %rax
+	or	%rcx, %rax
+1:
+	ret
+ENDPROC(__efi64_thunk)
+
+ENTRY(efi_exit32)
+	movq	func_rt_ptr(%rip), %rax
+	push	%rax
+	mov	%rdi, %rax
+	ret
+ENDPROC(efi_exit32)
+
+	.code32
+/*
+ * EFI service pointer must be in %edi.
+ *
+ * The stack should represent the 32-bit calling convention.
+ */
+ENTRY(efi_enter32)
+	movl	$__KERNEL_DS, %eax
+	movl	%eax, %ds
+	movl	%eax, %es
+	movl	%eax, %ss
+
+	call	*%edi
+
+	/* We must preserve return value */
+	movl	%eax, %edi
+
+	movl	72(%esp), %eax
+	pushl	$__KERNEL_CS
+	pushl	%eax
+
+	lret
+ENDPROC(efi_enter32)
+
+	.data
+	.balign	8
+func_rt_ptr:		.quad 0
 efi_saved_sp:		.quad 0
diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile
index af9307f2cc28..91ec9f8704bf 100644
--- a/arch/x86/platform/intel-mid/device_libs/Makefile
+++ b/arch/x86/platform/intel-mid/device_libs/Makefile
@@ -16,8 +16,6 @@ obj-$(subst m,y,$(CONFIG_INPUT_MPU3050)) += platform_mpu3050.o
 obj-$(subst m,y,$(CONFIG_INPUT_BMA150)) += platform_bma023.o
 obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_tca6416.o
 obj-$(subst m,y,$(CONFIG_DRM_MEDFIELD)) += platform_tc35876x.o
-# SPI Devices
-obj-$(subst m,y,$(CONFIG_SERIAL_MRST_MAX3110)) += platform_max3111.o
 # MISC Devices
 obj-$(subst m,y,$(CONFIG_KEYBOARD_GPIO)) += platform_gpio_keys.o
 obj-$(subst m,y,$(CONFIG_INTEL_MID_WATCHDOG)) += platform_wdt.o
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_max3111.c b/arch/x86/platform/intel-mid/device_libs/platform_max3111.c
deleted file mode 100644
index afd1df94e0e5..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_max3111.c
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * platform_max3111.c: max3111 platform data initilization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-
-#include <linux/gpio.h>
-#include <linux/spi/spi.h>
-#include <asm/intel-mid.h>
-
-static void __init *max3111_platform_data(void *info)
-{
-	struct spi_board_info *spi_info = info;
-	int intr = get_gpio_by_name("max3111_int");
-
-	spi_info->mode = SPI_MODE_0;
-	if (intr == -1)
-		return NULL;
-	spi_info->irq = intr + INTEL_MID_IRQ_OFFSET;
-	return NULL;
-}
-
-static const struct devs_id max3111_dev_id __initconst = {
-	.name = "spi_max3111",
-	.type = SFI_DEV_TYPE_SPI,
-	.get_platform_data = &max3111_platform_data,
-};
-
-sfi_device(max3111_dev_id);
diff --git a/arch/x86/platform/intel-mid/early_printk_intel_mid.c b/arch/x86/platform/intel-mid/early_printk_intel_mid.c
index e0bd082a80e0..4e720829ab90 100644
--- a/arch/x86/platform/intel-mid/early_printk_intel_mid.c
+++ b/arch/x86/platform/intel-mid/early_printk_intel_mid.c
@@ -10,15 +10,13 @@
  */
 
 /*
- * This file implements two early consoles named mrst and hsu.
- * mrst is based on Maxim3110 spi-uart device, it exists in both
- * Moorestown and Medfield platforms, while hsu is based on a High
- * Speed UART device which only exists in the Medfield platform
+ * This file implements early console named hsu.
+ * hsu is based on a High Speed UART device which only exists in the Medfield
+ * platform
  */
 
 #include <linux/serial_reg.h>
 #include <linux/serial_mfd.h>
-#include <linux/kmsg_dump.h>
 #include <linux/console.h>
 #include <linux/kernel.h>
 #include <linux/delay.h>
@@ -28,216 +26,6 @@
 #include <asm/pgtable.h>
 #include <asm/intel-mid.h>
 
-#define MRST_SPI_TIMEOUT		0x200000
-#define MRST_REGBASE_SPI0		0xff128000
-#define MRST_REGBASE_SPI1		0xff128400
-#define MRST_CLK_SPI0_REG		0xff11d86c
-
-/* Bit fields in CTRLR0 */
-#define SPI_DFS_OFFSET			0
-
-#define SPI_FRF_OFFSET			4
-#define SPI_FRF_SPI			0x0
-#define SPI_FRF_SSP			0x1
-#define SPI_FRF_MICROWIRE		0x2
-#define SPI_FRF_RESV			0x3
-
-#define SPI_MODE_OFFSET			6
-#define SPI_SCPH_OFFSET			6
-#define SPI_SCOL_OFFSET			7
-#define SPI_TMOD_OFFSET			8
-#define	SPI_TMOD_TR			0x0		/* xmit & recv */
-#define SPI_TMOD_TO			0x1		/* xmit only */
-#define SPI_TMOD_RO			0x2		/* recv only */
-#define SPI_TMOD_EPROMREAD		0x3		/* eeprom read mode */
-
-#define SPI_SLVOE_OFFSET		10
-#define SPI_SRL_OFFSET			11
-#define SPI_CFS_OFFSET			12
-
-/* Bit fields in SR, 7 bits */
-#define SR_MASK				0x7f		/* cover 7 bits */
-#define SR_BUSY				(1 << 0)
-#define SR_TF_NOT_FULL			(1 << 1)
-#define SR_TF_EMPT			(1 << 2)
-#define SR_RF_NOT_EMPT			(1 << 3)
-#define SR_RF_FULL			(1 << 4)
-#define SR_TX_ERR			(1 << 5)
-#define SR_DCOL				(1 << 6)
-
-struct dw_spi_reg {
-	u32	ctrl0;
-	u32	ctrl1;
-	u32	ssienr;
-	u32	mwcr;
-	u32	ser;
-	u32	baudr;
-	u32	txfltr;
-	u32	rxfltr;
-	u32	txflr;
-	u32	rxflr;
-	u32	sr;
-	u32	imr;
-	u32	isr;
-	u32	risr;
-	u32	txoicr;
-	u32	rxoicr;
-	u32	rxuicr;
-	u32	msticr;
-	u32	icr;
-	u32	dmacr;
-	u32	dmatdlr;
-	u32	dmardlr;
-	u32	idr;
-	u32	version;
-
-	/* Currently operates as 32 bits, though only the low 16 bits matter */
-	u32	dr;
-} __packed;
-
-#define dw_readl(dw, name)		__raw_readl(&(dw)->name)
-#define dw_writel(dw, name, val)	__raw_writel((val), &(dw)->name)
-
-/* Default use SPI0 register for mrst, we will detect Penwell and use SPI1 */
-static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0;
-
-static u32 *pclk_spi0;
-/* Always contains an accessible address, start with 0 */
-static struct dw_spi_reg *pspi;
-
-static struct kmsg_dumper dw_dumper;
-static int dumper_registered;
-
-static void dw_kmsg_dump(struct kmsg_dumper *dumper,
-			 enum kmsg_dump_reason reason)
-{
-	static char line[1024];
-	size_t len;
-
-	/* When run to this, we'd better re-init the HW */
-	mrst_early_console_init();
-
-	while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len))
-		early_mrst_console.write(&early_mrst_console, line, len);
-}
-
-/* Set the ratio rate to 115200, 8n1, IRQ disabled */
-static void max3110_write_config(void)
-{
-	u16 config;
-
-	config = 0xc001;
-	dw_writel(pspi, dr, config);
-}
-
-/* Translate char to a eligible word and send to max3110 */
-static void max3110_write_data(char c)
-{
-	u16 data;
-
-	data = 0x8000 | c;
-	dw_writel(pspi, dr, data);
-}
-
-void mrst_early_console_init(void)
-{
-	u32 ctrlr0 = 0;
-	u32 spi0_cdiv;
-	u32 freq; /* Freqency info only need be searched once */
-
-	/* Base clk is 100 MHz, the actual clk = 100M / (clk_divider + 1) */
-	pclk_spi0 = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
-							MRST_CLK_SPI0_REG);
-	spi0_cdiv = ((*pclk_spi0) & 0xe00) >> 9;
-	freq = 100000000 / (spi0_cdiv + 1);
-
-	if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_PENWELL)
-		mrst_spi_paddr = MRST_REGBASE_SPI1;
-
-	pspi = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
-						mrst_spi_paddr);
-
-	/* Disable SPI controller */
-	dw_writel(pspi, ssienr, 0);
-
-	/* Set control param, 8 bits, transmit only mode */
-	ctrlr0 = dw_readl(pspi, ctrl0);
-
-	ctrlr0 &= 0xfcc0;
-	ctrlr0 |= 0xf | (SPI_FRF_SPI << SPI_FRF_OFFSET)
-		      | (SPI_TMOD_TO << SPI_TMOD_OFFSET);
-	dw_writel(pspi, ctrl0, ctrlr0);
-
-	/*
-	 * Change the spi0 clk to comply with 115200 bps, use 100000 to
-	 * calculate the clk dividor to make the clock a little slower
-	 * than real baud rate.
-	 */
-	dw_writel(pspi, baudr, freq/100000);
-
-	/* Disable all INT for early phase */
-	dw_writel(pspi, imr, 0x0);
-
-	/* Set the cs to spi-uart */
-	dw_writel(pspi, ser, 0x2);
-
-	/* Enable the HW, the last step for HW init */
-	dw_writel(pspi, ssienr, 0x1);
-
-	/* Set the default configuration */
-	max3110_write_config();
-
-	/* Register the kmsg dumper */
-	if (!dumper_registered) {
-		dw_dumper.dump = dw_kmsg_dump;
-		kmsg_dump_register(&dw_dumper);
-		dumper_registered = 1;
-	}
-}
-
-/* Slave select should be called in the read/write function */
-static void early_mrst_spi_putc(char c)
-{
-	unsigned int timeout;
-	u32 sr;
-
-	timeout = MRST_SPI_TIMEOUT;
-	/* Early putc needs to make sure the TX FIFO is not full */
-	while (--timeout) {
-		sr = dw_readl(pspi, sr);
-		if (!(sr & SR_TF_NOT_FULL))
-			cpu_relax();
-		else
-			break;
-	}
-
-	if (!timeout)
-		pr_warn("MRST earlycon: timed out\n");
-	else
-		max3110_write_data(c);
-}
-
-/* Early SPI only uses polling mode */
-static void early_mrst_spi_write(struct console *con, const char *str,
-					unsigned n)
-{
-	int i;
-
-	for (i = 0; i < n && *str; i++) {
-		if (*str == '\n')
-			early_mrst_spi_putc('\r');
-		early_mrst_spi_putc(*str);
-		str++;
-	}
-}
-
-struct console early_mrst_console = {
-	.name =		"earlymrst",
-	.write =	early_mrst_spi_write,
-	.flags =	CON_PRINTBUFFER,
-	.index =	-1,
-};
-
 /*
  * Following is the early console based on Medfield HSU (High
  * Speed UART) device.
@@ -259,7 +47,7 @@ void hsu_early_console_init(const char *s)
 		port = clamp_val(port, 0, 2);
 
 	paddr = HSU_PORT_BASE + port * 0x80;
-	phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, paddr);
+	phsu = (void __iomem *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, paddr);
 
 	/* Disable FIFO */
 	writeb(0x0, phsu + UART_FCR);
diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c
index 1bbedc4b0f88..3005f0c89f2e 100644
--- a/arch/x86/platform/intel-mid/intel-mid.c
+++ b/arch/x86/platform/intel-mid/intel-mid.c
@@ -130,7 +130,7 @@ static void intel_mid_arch_setup(void)
 		intel_mid_ops = get_intel_mid_ops[__intel_mid_cpu_chip]();
 	else {
 		intel_mid_ops = get_intel_mid_ops[INTEL_MID_CPU_CHIP_PENWELL]();
-		pr_info("ARCH: Uknown SoC, assuming PENWELL!\n");
+		pr_info("ARCH: Unknown SoC, assuming PENWELL!\n");
 	}
 
 out:
diff --git a/arch/x86/platform/intel-mid/intel_mid_vrtc.c b/arch/x86/platform/intel-mid/intel_mid_vrtc.c
index 4762cff7facd..32947ba0f62d 100644
--- a/arch/x86/platform/intel-mid/intel_mid_vrtc.c
+++ b/arch/x86/platform/intel-mid/intel_mid_vrtc.c
@@ -110,7 +110,7 @@ int vrtc_set_mmss(const struct timespec *now)
 		spin_unlock_irqrestore(&rtc_lock, flags);
 	} else {
 		pr_err("%s: Invalid vRTC value: write of %lx to vRTC failed\n",
-			__FUNCTION__, now->tv_sec);
+			__func__, now->tv_sec);
 		retval = -EINVAL;
 	}
 	return retval;
diff --git a/arch/x86/platform/intel-quark/Makefile b/arch/x86/platform/intel-quark/Makefile
new file mode 100644
index 000000000000..9cc57ed36022
--- /dev/null
+++ b/arch/x86/platform/intel-quark/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_INTEL_IMR) += imr.o
+obj-$(CONFIG_DEBUG_IMR_SELFTEST) += imr_selftest.o
diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c
new file mode 100644
index 000000000000..0ee619f9fcb7
--- /dev/null
+++ b/arch/x86/platform/intel-quark/imr.c
@@ -0,0 +1,661 @@
+/**
+ * imr.c
+ *
+ * Copyright(c) 2013 Intel Corporation.
+ * Copyright(c) 2015 Bryan O'Donoghue <pure.logic@nexus-software.ie>
+ *
+ * IMR registers define an isolated region of memory that can
+ * be masked to prohibit certain system agents from accessing memory.
+ * When a device behind a masked port performs an access - snooped or
+ * not, an IMR may optionally prevent that transaction from changing
+ * the state of memory or from getting correct data in response to the
+ * operation.
+ *
+ * Write data will be dropped and reads will return 0xFFFFFFFF, the
+ * system will reset and system BIOS will print out an error message to
+ * inform the user that an IMR has been violated.
+ *
+ * This code is based on the Linux MTRR code and reference code from
+ * Intel's Quark BSP EFI, Linux and grub code.
+ *
+ * See quark-x1000-datasheet.pdf for register definitions.
+ * http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/quark-x1000-datasheet.pdf
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <asm-generic/sections.h>
+#include <asm/cpu_device_id.h>
+#include <asm/imr.h>
+#include <asm/iosf_mbi.h>
+#include <linux/debugfs.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+struct imr_device {
+	struct dentry	*file;
+	bool		init;
+	struct mutex	lock;
+	int		max_imr;
+	int		reg_base;
+};
+
+static struct imr_device imr_dev;
+
+/*
+ * IMR read/write mask control registers.
+ * See quark-x1000-datasheet.pdf sections 12.7.4.5 and 12.7.4.6 for
+ * bit definitions.
+ *
+ * addr_hi
+ * 31		Lock bit
+ * 30:24	Reserved
+ * 23:2		1 KiB aligned lo address
+ * 1:0		Reserved
+ *
+ * addr_hi
+ * 31:24	Reserved
+ * 23:2		1 KiB aligned hi address
+ * 1:0		Reserved
+ */
+#define IMR_LOCK	BIT(31)
+
+struct imr_regs {
+	u32 addr_lo;
+	u32 addr_hi;
+	u32 rmask;
+	u32 wmask;
+};
+
+#define IMR_NUM_REGS	(sizeof(struct imr_regs)/sizeof(u32))
+#define IMR_SHIFT	8
+#define imr_to_phys(x)	((x) << IMR_SHIFT)
+#define phys_to_imr(x)	((x) >> IMR_SHIFT)
+
+/**
+ * imr_is_enabled - true if an IMR is enabled false otherwise.
+ *
+ * Determines if an IMR is enabled based on address range and read/write
+ * mask. An IMR set with an address range set to zero and a read/write
+ * access mask set to all is considered to be disabled. An IMR in any
+ * other state - for example set to zero but without read/write access
+ * all is considered to be enabled. This definition of disabled is how
+ * firmware switches off an IMR and is maintained in kernel for
+ * consistency.
+ *
+ * @imr:	pointer to IMR descriptor.
+ * @return:	true if IMR enabled false if disabled.
+ */
+static inline int imr_is_enabled(struct imr_regs *imr)
+{
+	return !(imr->rmask == IMR_READ_ACCESS_ALL &&
+		 imr->wmask == IMR_WRITE_ACCESS_ALL &&
+		 imr_to_phys(imr->addr_lo) == 0 &&
+		 imr_to_phys(imr->addr_hi) == 0);
+}
+
+/**
+ * imr_read - read an IMR at a given index.
+ *
+ * Requires caller to hold imr mutex.
+ *
+ * @idev:	pointer to imr_device structure.
+ * @imr_id:	IMR entry to read.
+ * @imr:	IMR structure representing address and access masks.
+ * @return:	0 on success or error code passed from mbi_iosf on failure.
+ */
+static int imr_read(struct imr_device *idev, u32 imr_id, struct imr_regs *imr)
+{
+	u32 reg = imr_id * IMR_NUM_REGS + idev->reg_base;
+	int ret;
+
+	ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ,
+				reg++, &imr->addr_lo);
+	if (ret)
+		return ret;
+
+	ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ,
+				reg++, &imr->addr_hi);
+	if (ret)
+		return ret;
+
+	ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ,
+				reg++, &imr->rmask);
+	if (ret)
+		return ret;
+
+	return iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ,
+				reg++, &imr->wmask);
+}
+
+/**
+ * imr_write - write an IMR at a given index.
+ *
+ * Requires caller to hold imr mutex.
+ * Note lock bits need to be written independently of address bits.
+ *
+ * @idev:	pointer to imr_device structure.
+ * @imr_id:	IMR entry to write.
+ * @imr:	IMR structure representing address and access masks.
+ * @lock:	indicates if the IMR lock bit should be applied.
+ * @return:	0 on success or error code passed from mbi_iosf on failure.
+ */
+static int imr_write(struct imr_device *idev, u32 imr_id,
+		     struct imr_regs *imr, bool lock)
+{
+	unsigned long flags;
+	u32 reg = imr_id * IMR_NUM_REGS + idev->reg_base;
+	int ret;
+
+	local_irq_save(flags);
+
+	ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, reg++,
+				imr->addr_lo);
+	if (ret)
+		goto failed;
+
+	ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE,
+				reg++, imr->addr_hi);
+	if (ret)
+		goto failed;
+
+	ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE,
+				reg++, imr->rmask);
+	if (ret)
+		goto failed;
+
+	ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE,
+				reg++, imr->wmask);
+	if (ret)
+		goto failed;
+
+	/* Lock bit must be set separately to addr_lo address bits. */
+	if (lock) {
+		imr->addr_lo |= IMR_LOCK;
+		ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE,
+					reg - IMR_NUM_REGS, imr->addr_lo);
+		if (ret)
+			goto failed;
+	}
+
+	local_irq_restore(flags);
+	return 0;
+failed:
+	/*
+	 * If writing to the IOSF failed then we're in an unknown state,
+	 * likely a very bad state. An IMR in an invalid state will almost
+	 * certainly lead to a memory access violation.
+	 */
+	local_irq_restore(flags);
+	WARN(ret, "IOSF-MBI write fail range 0x%08x-0x%08x unreliable\n",
+	     imr_to_phys(imr->addr_lo), imr_to_phys(imr->addr_hi) + IMR_MASK);
+
+	return ret;
+}
+
+/**
+ * imr_dbgfs_state_show - print state of IMR registers.
+ *
+ * @s:		pointer to seq_file for output.
+ * @unused:	unused parameter.
+ * @return:	0 on success or error code passed from mbi_iosf on failure.
+ */
+static int imr_dbgfs_state_show(struct seq_file *s, void *unused)
+{
+	phys_addr_t base;
+	phys_addr_t end;
+	int i;
+	struct imr_device *idev = s->private;
+	struct imr_regs imr;
+	size_t size;
+	int ret = -ENODEV;
+
+	mutex_lock(&idev->lock);
+
+	for (i = 0; i < idev->max_imr; i++) {
+
+		ret = imr_read(idev, i, &imr);
+		if (ret)
+			break;
+
+		/*
+		 * Remember to add IMR_ALIGN bytes to size to indicate the
+		 * inherent IMR_ALIGN size bytes contained in the masked away
+		 * lower ten bits.
+		 */
+		if (imr_is_enabled(&imr)) {
+			base = imr_to_phys(imr.addr_lo);
+			end = imr_to_phys(imr.addr_hi) + IMR_MASK;
+		} else {
+			base = 0;
+			end = 0;
+		}
+		size = end - base;
+		seq_printf(s, "imr%02i: base=%pa, end=%pa, size=0x%08zx "
+			   "rmask=0x%08x, wmask=0x%08x, %s, %s\n", i,
+			   &base, &end, size, imr.rmask, imr.wmask,
+			   imr_is_enabled(&imr) ? "enabled " : "disabled",
+			   imr.addr_lo & IMR_LOCK ? "locked" : "unlocked");
+	}
+
+	mutex_unlock(&idev->lock);
+	return ret;
+}
+
+/**
+ * imr_state_open - debugfs open callback.
+ *
+ * @inode:	pointer to struct inode.
+ * @file:	pointer to struct file.
+ * @return:	result of single open.
+ */
+static int imr_state_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, imr_dbgfs_state_show, inode->i_private);
+}
+
+static const struct file_operations imr_state_ops = {
+	.open		= imr_state_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+/**
+ * imr_debugfs_register - register debugfs hooks.
+ *
+ * @idev:	pointer to imr_device structure.
+ * @return:	0 on success - errno on failure.
+ */
+static int imr_debugfs_register(struct imr_device *idev)
+{
+	idev->file = debugfs_create_file("imr_state", S_IFREG | S_IRUGO, NULL,
+					 idev, &imr_state_ops);
+	return PTR_ERR_OR_ZERO(idev->file);
+}
+
+/**
+ * imr_debugfs_unregister - unregister debugfs hooks.
+ *
+ * @idev:	pointer to imr_device structure.
+ * @return:
+ */
+static void imr_debugfs_unregister(struct imr_device *idev)
+{
+	debugfs_remove(idev->file);
+}
+
+/**
+ * imr_check_params - check passed address range IMR alignment and non-zero size
+ *
+ * @base:	base address of intended IMR.
+ * @size:	size of intended IMR.
+ * @return:	zero on valid range -EINVAL on unaligned base/size.
+ */
+static int imr_check_params(phys_addr_t base, size_t size)
+{
+	if ((base & IMR_MASK) || (size & IMR_MASK)) {
+		pr_err("base %pa size 0x%08zx must align to 1KiB\n",
+			&base, size);
+		return -EINVAL;
+	}
+	if (size == 0)
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
+ * imr_raw_size - account for the IMR_ALIGN bytes that addr_hi appends.
+ *
+ * IMR addr_hi has a built in offset of plus IMR_ALIGN (0x400) bytes from the
+ * value in the register. We need to subtract IMR_ALIGN bytes from input sizes
+ * as a result.
+ *
+ * @size:	input size bytes.
+ * @return:	reduced size.
+ */
+static inline size_t imr_raw_size(size_t size)
+{
+	return size - IMR_ALIGN;
+}
+
+/**
+ * imr_address_overlap - detects an address overlap.
+ *
+ * @addr:	address to check against an existing IMR.
+ * @imr:	imr being checked.
+ * @return:	true for overlap false for no overlap.
+ */
+static inline int imr_address_overlap(phys_addr_t addr, struct imr_regs *imr)
+{
+	return addr >= imr_to_phys(imr->addr_lo) && addr <= imr_to_phys(imr->addr_hi);
+}
+
+/**
+ * imr_add_range - add an Isolated Memory Region.
+ *
+ * @base:	physical base address of region aligned to 1KiB.
+ * @size:	physical size of region in bytes must be aligned to 1KiB.
+ * @read_mask:	read access mask.
+ * @write_mask:	write access mask.
+ * @lock:	indicates whether or not to permanently lock this region.
+ * @return:	zero on success or negative value indicating error.
+ */
+int imr_add_range(phys_addr_t base, size_t size,
+		  unsigned int rmask, unsigned int wmask, bool lock)
+{
+	phys_addr_t end;
+	unsigned int i;
+	struct imr_device *idev = &imr_dev;
+	struct imr_regs imr;
+	size_t raw_size;
+	int reg;
+	int ret;
+
+	if (WARN_ONCE(idev->init == false, "driver not initialized"))
+		return -ENODEV;
+
+	ret = imr_check_params(base, size);
+	if (ret)
+		return ret;
+
+	/* Tweak the size value. */
+	raw_size = imr_raw_size(size);
+	end = base + raw_size;
+
+	/*
+	 * Check for reserved IMR value common to firmware, kernel and grub
+	 * indicating a disabled IMR.
+	 */
+	imr.addr_lo = phys_to_imr(base);
+	imr.addr_hi = phys_to_imr(end);
+	imr.rmask = rmask;
+	imr.wmask = wmask;
+	if (!imr_is_enabled(&imr))
+		return -ENOTSUPP;
+
+	mutex_lock(&idev->lock);
+
+	/*
+	 * Find a free IMR while checking for an existing overlapping range.
+	 * Note there's no restriction in silicon to prevent IMR overlaps.
+	 * For the sake of simplicity and ease in defining/debugging an IMR
+	 * memory map we exclude IMR overlaps.
+	 */
+	reg = -1;
+	for (i = 0; i < idev->max_imr; i++) {
+		ret = imr_read(idev, i, &imr);
+		if (ret)
+			goto failed;
+
+		/* Find overlap @ base or end of requested range. */
+		ret = -EINVAL;
+		if (imr_is_enabled(&imr)) {
+			if (imr_address_overlap(base, &imr))
+				goto failed;
+			if (imr_address_overlap(end, &imr))
+				goto failed;
+		} else {
+			reg = i;
+		}
+	}
+
+	/* Error out if we have no free IMR entries. */
+	if (reg == -1) {
+		ret = -ENOMEM;
+		goto failed;
+	}
+
+	pr_debug("add %d phys %pa-%pa size %zx mask 0x%08x wmask 0x%08x\n",
+		 reg, &base, &end, raw_size, rmask, wmask);
+
+	/* Enable IMR at specified range and access mask. */
+	imr.addr_lo = phys_to_imr(base);
+	imr.addr_hi = phys_to_imr(end);
+	imr.rmask = rmask;
+	imr.wmask = wmask;
+
+	ret = imr_write(idev, reg, &imr, lock);
+	if (ret < 0) {
+		/*
+		 * In the highly unlikely event iosf_mbi_write failed
+		 * attempt to rollback the IMR setup skipping the trapping
+		 * of further IOSF write failures.
+		 */
+		imr.addr_lo = 0;
+		imr.addr_hi = 0;
+		imr.rmask = IMR_READ_ACCESS_ALL;
+		imr.wmask = IMR_WRITE_ACCESS_ALL;
+		imr_write(idev, reg, &imr, false);
+	}
+failed:
+	mutex_unlock(&idev->lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(imr_add_range);
+
+/**
+ * __imr_remove_range - delete an Isolated Memory Region.
+ *
+ * This function allows you to delete an IMR by its index specified by reg or
+ * by address range specified by base and size respectively. If you specify an
+ * index on its own the base and size parameters are ignored.
+ * imr_remove_range(0, base, size); delete IMR at index 0 base/size ignored.
+ * imr_remove_range(-1, base, size); delete IMR from base to base+size.
+ *
+ * @reg:	imr index to remove.
+ * @base:	physical base address of region aligned to 1 KiB.
+ * @size:	physical size of region in bytes aligned to 1 KiB.
+ * @return:	-EINVAL on invalid range or out or range id
+ *		-ENODEV if reg is valid but no IMR exists or is locked
+ *		0 on success.
+ */
+static int __imr_remove_range(int reg, phys_addr_t base, size_t size)
+{
+	phys_addr_t end;
+	bool found = false;
+	unsigned int i;
+	struct imr_device *idev = &imr_dev;
+	struct imr_regs imr;
+	size_t raw_size;
+	int ret = 0;
+
+	if (WARN_ONCE(idev->init == false, "driver not initialized"))
+		return -ENODEV;
+
+	/*
+	 * Validate address range if deleting by address, else we are
+	 * deleting by index where base and size will be ignored.
+	 */
+	if (reg == -1) {
+		ret = imr_check_params(base, size);
+		if (ret)
+			return ret;
+	}
+
+	/* Tweak the size value. */
+	raw_size = imr_raw_size(size);
+	end = base + raw_size;
+
+	mutex_lock(&idev->lock);
+
+	if (reg >= 0) {
+		/* If a specific IMR is given try to use it. */
+		ret = imr_read(idev, reg, &imr);
+		if (ret)
+			goto failed;
+
+		if (!imr_is_enabled(&imr) || imr.addr_lo & IMR_LOCK) {
+			ret = -ENODEV;
+			goto failed;
+		}
+		found = true;
+	} else {
+		/* Search for match based on address range. */
+		for (i = 0; i < idev->max_imr; i++) {
+			ret = imr_read(idev, i, &imr);
+			if (ret)
+				goto failed;
+
+			if (!imr_is_enabled(&imr) || imr.addr_lo & IMR_LOCK)
+				continue;
+
+			if ((imr_to_phys(imr.addr_lo) == base) &&
+			    (imr_to_phys(imr.addr_hi) == end)) {
+				found = true;
+				reg = i;
+				break;
+			}
+		}
+	}
+
+	if (!found) {
+		ret = -ENODEV;
+		goto failed;
+	}
+
+	pr_debug("remove %d phys %pa-%pa size %zx\n", reg, &base, &end, raw_size);
+
+	/* Tear down the IMR. */
+	imr.addr_lo = 0;
+	imr.addr_hi = 0;
+	imr.rmask = IMR_READ_ACCESS_ALL;
+	imr.wmask = IMR_WRITE_ACCESS_ALL;
+
+	ret = imr_write(idev, reg, &imr, false);
+
+failed:
+	mutex_unlock(&idev->lock);
+	return ret;
+}
+
+/**
+ * imr_remove_range - delete an Isolated Memory Region by address
+ *
+ * This function allows you to delete an IMR by an address range specified
+ * by base and size respectively.
+ * imr_remove_range(base, size); delete IMR from base to base+size.
+ *
+ * @base:	physical base address of region aligned to 1 KiB.
+ * @size:	physical size of region in bytes aligned to 1 KiB.
+ * @return:	-EINVAL on invalid range or out or range id
+ *		-ENODEV if reg is valid but no IMR exists or is locked
+ *		0 on success.
+ */
+int imr_remove_range(phys_addr_t base, size_t size)
+{
+	return __imr_remove_range(-1, base, size);
+}
+EXPORT_SYMBOL_GPL(imr_remove_range);
+
+/**
+ * imr_clear - delete an Isolated Memory Region by index
+ *
+ * This function allows you to delete an IMR by an address range specified
+ * by the index of the IMR. Useful for initial sanitization of the IMR
+ * address map.
+ * imr_ge(base, size); delete IMR from base to base+size.
+ *
+ * @reg:	imr index to remove.
+ * @return:	-EINVAL on invalid range or out or range id
+ *		-ENODEV if reg is valid but no IMR exists or is locked
+ *		0 on success.
+ */
+static inline int imr_clear(int reg)
+{
+	return __imr_remove_range(reg, 0, 0);
+}
+
+/**
+ * imr_fixup_memmap - Tear down IMRs used during bootup.
+ *
+ * BIOS and Grub both setup IMRs around compressed kernel, initrd memory
+ * that need to be removed before the kernel hands out one of the IMR
+ * encased addresses to a downstream DMA agent such as the SD or Ethernet.
+ * IMRs on Galileo are setup to immediately reset the system on violation.
+ * As a result if you're running a root filesystem from SD - you'll need
+ * the boot-time IMRs torn down or you'll find seemingly random resets when
+ * using your filesystem.
+ *
+ * @idev:	pointer to imr_device structure.
+ * @return:
+ */
+static void __init imr_fixup_memmap(struct imr_device *idev)
+{
+	phys_addr_t base = virt_to_phys(&_text);
+	size_t size = virt_to_phys(&__end_rodata) - base;
+	int i;
+	int ret;
+
+	/* Tear down all existing unlocked IMRs. */
+	for (i = 0; i < idev->max_imr; i++)
+		imr_clear(i);
+
+	/*
+	 * Setup a locked IMR around the physical extent of the kernel
+	 * from the beginning of the .text secton to the end of the
+	 * .rodata section as one physically contiguous block.
+	 */
+	ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, true);
+	if (ret < 0) {
+		pr_err("unable to setup IMR for kernel: (%p - %p)\n",
+			&_text, &__end_rodata);
+	} else {
+		pr_info("protecting kernel .text - .rodata: %zu KiB (%p - %p)\n",
+			size / 1024, &_text, &__end_rodata);
+	}
+
+}
+
+static const struct x86_cpu_id imr_ids[] __initconst = {
+	{ X86_VENDOR_INTEL, 5, 9 },	/* Intel Quark SoC X1000. */
+	{}
+};
+MODULE_DEVICE_TABLE(x86cpu, imr_ids);
+
+/**
+ * imr_init - entry point for IMR driver.
+ *
+ * return: -ENODEV for no IMR support 0 if good to go.
+ */
+static int __init imr_init(void)
+{
+	struct imr_device *idev = &imr_dev;
+	int ret;
+
+	if (!x86_match_cpu(imr_ids) || !iosf_mbi_available())
+		return -ENODEV;
+
+	idev->max_imr = QUARK_X1000_IMR_MAX;
+	idev->reg_base = QUARK_X1000_IMR_REGBASE;
+	idev->init = true;
+
+	mutex_init(&idev->lock);
+	ret = imr_debugfs_register(idev);
+	if (ret != 0)
+		pr_warn("debugfs register failed!\n");
+	imr_fixup_memmap(idev);
+	return 0;
+}
+
+/**
+ * imr_exit - exit point for IMR code.
+ *
+ * Deregisters debugfs, leave IMR state as-is.
+ *
+ * return:
+ */
+static void __exit imr_exit(void)
+{
+	imr_debugfs_unregister(&imr_dev);
+}
+
+module_init(imr_init);
+module_exit(imr_exit);
+
+MODULE_AUTHOR("Bryan O'Donoghue <pure.logic@nexus-software.ie>");
+MODULE_DESCRIPTION("Intel Isolated Memory Region driver");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c
new file mode 100644
index 000000000000..c9a0838890e2
--- /dev/null
+++ b/arch/x86/platform/intel-quark/imr_selftest.c
@@ -0,0 +1,129 @@
+/**
+ * imr_selftest.c
+ *
+ * Copyright(c) 2013 Intel Corporation.
+ * Copyright(c) 2015 Bryan O'Donoghue <pure.logic@nexus-software.ie>
+ *
+ * IMR self test. The purpose of this module is to run a set of tests on the
+ * IMR API to validate it's sanity. We check for overlapping, reserved
+ * addresses and setup/teardown sanity.
+ *
+ */
+
+#include <asm-generic/sections.h>
+#include <asm/imr.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#define SELFTEST KBUILD_MODNAME ": "
+/**
+ * imr_self_test_result - Print result string for self test.
+ *
+ * @res:	result code - true if test passed false otherwise.
+ * @fmt:	format string.
+ * ...		variadic argument list.
+ */
+static void __init imr_self_test_result(int res, const char *fmt, ...)
+{
+	va_list vlist;
+
+	/* Print pass/fail. */
+	if (res)
+		pr_info(SELFTEST "pass ");
+	else
+		pr_info(SELFTEST "fail ");
+
+	/* Print variable string. */
+	va_start(vlist, fmt);
+	vprintk(fmt, vlist);
+	va_end(vlist);
+
+	/* Optional warning. */
+	WARN(res == 0, "test failed");
+}
+#undef SELFTEST
+
+/**
+ * imr_self_test
+ *
+ * Verify IMR self_test with some simple tests to verify overlap,
+ * zero sized allocations and 1 KiB sized areas.
+ *
+ */
+static void __init imr_self_test(void)
+{
+	phys_addr_t base  = virt_to_phys(&_text);
+	size_t size = virt_to_phys(&__end_rodata) - base;
+	const char *fmt_over = "overlapped IMR @ (0x%08lx - 0x%08lx)\n";
+	int ret;
+
+	/* Test zero zero. */
+	ret = imr_add_range(0, 0, 0, 0, false);
+	imr_self_test_result(ret < 0, "zero sized IMR\n");
+
+	/* Test exact overlap. */
+	ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false);
+	imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size));
+
+	/* Test overlap with base inside of existing. */
+	base += size - IMR_ALIGN;
+	ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false);
+	imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size));
+
+	/* Test overlap with end inside of existing. */
+	base -= size + IMR_ALIGN * 2;
+	ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false);
+	imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size));
+
+	/* Test that a 1 KiB IMR @ zero with read/write all will bomb out. */
+	ret = imr_add_range(0, IMR_ALIGN, IMR_READ_ACCESS_ALL,
+			    IMR_WRITE_ACCESS_ALL, false);
+	imr_self_test_result(ret < 0, "1KiB IMR @ 0x00000000 - access-all\n");
+
+	/* Test that a 1 KiB IMR @ zero with CPU only will work. */
+	ret = imr_add_range(0, IMR_ALIGN, IMR_CPU, IMR_CPU, false);
+	imr_self_test_result(ret >= 0, "1KiB IMR @ 0x00000000 - cpu-access\n");
+	if (ret >= 0) {
+		ret = imr_remove_range(0, IMR_ALIGN);
+		imr_self_test_result(ret == 0, "teardown - cpu-access\n");
+	}
+
+	/* Test 2 KiB works. */
+	size = IMR_ALIGN * 2;
+	ret = imr_add_range(0, size, IMR_READ_ACCESS_ALL,
+			    IMR_WRITE_ACCESS_ALL, false);
+	imr_self_test_result(ret >= 0, "2KiB IMR @ 0x00000000\n");
+	if (ret >= 0) {
+		ret = imr_remove_range(0, size);
+		imr_self_test_result(ret == 0, "teardown 2KiB\n");
+	}
+}
+
+/**
+ * imr_self_test_init - entry point for IMR driver.
+ *
+ * return: -ENODEV for no IMR support 0 if good to go.
+ */
+static int __init imr_self_test_init(void)
+{
+	imr_self_test();
+	return 0;
+}
+
+/**
+ * imr_self_test_exit - exit point for IMR code.
+ *
+ * return:
+ */
+static void __exit imr_self_test_exit(void)
+{
+}
+
+module_init(imr_self_test_init);
+module_exit(imr_self_test_exit);
+
+MODULE_AUTHOR("Bryan O'Donoghue <pure.logic@nexus-software.ie>");
+MODULE_DESCRIPTION("Intel Isolated Memory Region self-test driver");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index c6b146e67116..7488cafab955 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -273,20 +273,6 @@ static inline void uv_clear_nmi(int cpu)
 	}
 }
 
-/* Print non-responding cpus */
-static void uv_nmi_nr_cpus_pr(char *fmt)
-{
-	static char cpu_list[1024];
-	int len = sizeof(cpu_list);
-	int c = cpumask_weight(uv_nmi_cpu_mask);
-	int n = cpulist_scnprintf(cpu_list, len, uv_nmi_cpu_mask);
-
-	if (n >= len-1)
-		strcpy(&cpu_list[len - 6], "...\n");
-
-	printk(fmt, c, cpu_list);
-}
-
 /* Ping non-responding cpus attemping to force them into the NMI handler */
 static void uv_nmi_nr_cpus_ping(void)
 {
@@ -371,16 +357,19 @@ static void uv_nmi_wait(int master)
 			break;
 
 		/* if not all made it in, send IPI NMI to them */
-		uv_nmi_nr_cpus_pr(KERN_ALERT
-			"UV: Sending NMI IPI to %d non-responding CPUs: %s\n");
+		pr_alert("UV: Sending NMI IPI to %d non-responding CPUs: %*pbl\n",
+			 cpumask_weight(uv_nmi_cpu_mask),
+			 cpumask_pr_args(uv_nmi_cpu_mask));
+
 		uv_nmi_nr_cpus_ping();
 
 		/* if all cpus are in, then done */
 		if (!uv_nmi_wait_cpus(0))
 			break;
 
-		uv_nmi_nr_cpus_pr(KERN_ALERT
-			"UV: %d CPUs not in NMI loop: %s\n");
+		pr_alert("UV: %d CPUs not in NMI loop: %*pbl\n",
+			 cpumask_weight(uv_nmi_cpu_mask),
+			 cpumask_pr_args(uv_nmi_cpu_mask));
 	} while (0);
 
 	pr_alert("UV: %d of %d CPUs in NMI\n",
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 6ec7910f59bf..3e32ed5648a0 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -105,11 +105,8 @@ static void __save_processor_state(struct saved_context *ctxt)
 	ctxt->cr0 = read_cr0();
 	ctxt->cr2 = read_cr2();
 	ctxt->cr3 = read_cr3();
-#ifdef CONFIG_X86_32
-	ctxt->cr4 = read_cr4_safe();
-#else
-/* CONFIG_X86_64 */
-	ctxt->cr4 = read_cr4();
+	ctxt->cr4 = __read_cr4_safe();
+#ifdef CONFIG_X86_64
 	ctxt->cr8 = read_cr8();
 #endif
 	ctxt->misc_enable_saved = !rdmsrl_safe(MSR_IA32_MISC_ENABLE,
@@ -175,12 +172,12 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
 	/* cr4 was introduced in the Pentium CPU */
 #ifdef CONFIG_X86_32
 	if (ctxt->cr4)
-		write_cr4(ctxt->cr4);
+		__write_cr4(ctxt->cr4);
 #else
 /* CONFIG X86_64 */
 	wrmsrl(MSR_EFER, ctxt->efer);
 	write_cr8(ctxt->cr8);
-	write_cr4(ctxt->cr4);
+	__write_cr4(ctxt->cr4);
 #endif
 	write_cr3(ctxt->cr3);
 	write_cr2(ctxt->cr2);
diff --git a/arch/x86/realmode/Makefile b/arch/x86/realmode/Makefile
index 94f7fbe97b08..e02c2c6c56a5 100644
--- a/arch/x86/realmode/Makefile
+++ b/arch/x86/realmode/Makefile
@@ -6,7 +6,7 @@
 # for more details.
 #
 #
-
+KASAN_SANITIZE := n
 subdir- := rm
 
 obj-y += init.o
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index bad628a620c4..0b7a63d98440 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -81,7 +81,7 @@ void __init setup_real_mode(void)
 
 	trampoline_header->start = (u64) secondary_startup_64;
 	trampoline_cr4_features = &trampoline_header->cr4;
-	*trampoline_cr4_features = read_cr4();
+	*trampoline_cr4_features = __read_cr4();
 
 	trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
 	trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd;
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 7c0d7be176a5..2730d775ef9a 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -6,6 +6,7 @@
 # for more details.
 #
 #
+KASAN_SANITIZE := n
 
 always := realmode.bin realmode.relocs
 
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 79d824551c1a..0c8c32bfd792 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -157,7 +157,7 @@ static int copy_sc_from_user(struct pt_regs *regs,
 	int err, pid;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	err = copy_from_user(&sc, from, sizeof(sc));
 	if (err)
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 09297c8e1fcd..7b9be9822724 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -3,6 +3,7 @@
 #
 
 KBUILD_CFLAGS += $(DISABLE_LTO)
+KASAN_SANITIZE := n
 
 VDSO64-$(CONFIG_X86_64)		:= y
 VDSOX32-$(CONFIG_X86_X32_ABI)	:= y
diff --git a/arch/x86/vdso/vdso32/sigreturn.S b/arch/x86/vdso/vdso32/sigreturn.S
index 31776d0efc8c..d7ec4e251c0a 100644
--- a/arch/x86/vdso/vdso32/sigreturn.S
+++ b/arch/x86/vdso/vdso32/sigreturn.S
@@ -17,6 +17,7 @@
 	.text
 	.globl __kernel_sigreturn
 	.type __kernel_sigreturn,@function
+	nop /* this guy is needed for .LSTARTFDEDLSI1 below (watch for HACK) */
 	ALIGN
 __kernel_sigreturn:
 .LSTART_sigreturn:
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 78a881b7fc41..5240f563076d 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1070,6 +1070,23 @@ static inline void xen_write_cr8(unsigned long val)
 	BUG_ON(val);
 }
 #endif
+
+static u64 xen_read_msr_safe(unsigned int msr, int *err)
+{
+	u64 val;
+
+	val = native_read_msr_safe(msr, err);
+	switch (msr) {
+	case MSR_IA32_APICBASE:
+#ifdef CONFIG_X86_X2APIC
+		if (!(cpuid_ecx(1) & (1 << (X86_FEATURE_X2APIC & 31))))
+#endif
+			val &= ~X2APIC_ENABLE;
+		break;
+	}
+	return val;
+}
+
 static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
 {
 	int ret;
@@ -1240,7 +1257,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
 
 	.wbinvd = native_wbinvd,
 
-	.read_msr = native_read_msr_safe,
+	.read_msr = xen_read_msr_safe,
 	.write_msr = xen_write_msr_safe,
 
 	.read_tsc = native_read_tsc,
@@ -1494,10 +1511,10 @@ static void xen_pvh_set_cr_flags(int cpu)
 	 * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu_init.
 	*/
 	if (cpu_has_pse)
-		set_in_cr4(X86_CR4_PSE);
+		cr4_set_bits_and_update_boot(X86_CR4_PSE);
 
 	if (cpu_has_pge)
-		set_in_cr4(X86_CR4_PGE);
+		cr4_set_bits_and_update_boot(X86_CR4_PGE);
 }
 
 /*
@@ -1741,6 +1758,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
 #ifdef CONFIG_X86_32
 	i386_start_kernel();
 #else
+	cr4_init_shadow(); /* 32b kernel does this in i386_start_kernel() */
 	x86_64_start_reservations((char *)__pa_symbol(&boot_params));
 #endif
 }
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 5c1f9ace7ae7..adca9e2b6553 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1489,7 +1489,7 @@ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
 	native_set_pte(ptep, pte);
 }
 
-static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
 {
 	struct mmuext_op op;
 	op.cmd = cmd;
@@ -1657,7 +1657,7 @@ void __init xen_reserve_top(void)
  * Like __va(), but returns address in the kernel mapping (which is
  * all we have until the physical memory mapping has been set up.
  */
-static void *__ka(phys_addr_t paddr)
+static void * __init __ka(phys_addr_t paddr)
 {
 #ifdef CONFIG_X86_64
 	return (void *)(paddr + __START_KERNEL_map);
@@ -1667,7 +1667,7 @@ static void *__ka(phys_addr_t paddr)
 }
 
 /* Convert a machine address to physical address */
-static unsigned long m2p(phys_addr_t maddr)
+static unsigned long __init m2p(phys_addr_t maddr)
 {
 	phys_addr_t paddr;
 
@@ -1678,13 +1678,14 @@ static unsigned long m2p(phys_addr_t maddr)
 }
 
 /* Convert a machine address to kernel virtual */
-static void *m2v(phys_addr_t maddr)
+static void * __init m2v(phys_addr_t maddr)
 {
 	return __ka(m2p(maddr));
 }
 
 /* Set the page permissions on an identity-mapped pages */
-static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags)
+static void __init set_page_prot_flags(void *addr, pgprot_t prot,
+				       unsigned long flags)
 {
 	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
 	pte_t pte = pfn_pte(pfn, prot);
@@ -1696,7 +1697,7 @@ static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags)
 	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))
 		BUG();
 }
-static void set_page_prot(void *addr, pgprot_t prot)
+static void __init set_page_prot(void *addr, pgprot_t prot)
 {
 	return set_page_prot_flags(addr, prot, UVMF_NONE);
 }
@@ -1733,10 +1734,8 @@ static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
 		for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
 			pte_t pte;
 
-#ifdef CONFIG_X86_32
 			if (pfn > max_pfn_mapped)
 				max_pfn_mapped = pfn;
-#endif
 
 			if (!pte_none(pte_page[pteidx]))
 				continue;
@@ -1769,7 +1768,7 @@ void __init xen_setup_machphys_mapping(void)
 }
 
 #ifdef CONFIG_X86_64
-static void convert_pfn_mfn(void *v)
+static void __init convert_pfn_mfn(void *v)
 {
 	pte_t *pte = v;
 	int i;
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 70fb5075c901..9f93af56a5fc 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -84,8 +84,6 @@
 
 #define PMDS_PER_MID_PAGE	(P2M_MID_PER_PAGE / PTRS_PER_PTE)
 
-static void __init m2p_override_init(void);
-
 unsigned long *xen_p2m_addr __read_mostly;
 EXPORT_SYMBOL_GPL(xen_p2m_addr);
 unsigned long xen_p2m_size __read_mostly;
@@ -402,8 +400,6 @@ void __init xen_vmalloc_p2m_tree(void)
 	xen_p2m_size = xen_max_p2m_pfn;
 
 	xen_inv_extra_mem();
-
-	m2p_override_init();
 }
 
 unsigned long get_phys_to_machine(unsigned long pfn)
@@ -554,7 +550,7 @@ static bool alloc_p2m(unsigned long pfn)
 		mid_mfn = NULL;
 	}
 
-	p2m_pfn = pte_pfn(ACCESS_ONCE(*ptep));
+	p2m_pfn = pte_pfn(READ_ONCE(*ptep));
 	if (p2m_pfn == PFN_DOWN(__pa(p2m_identity)) ||
 	    p2m_pfn == PFN_DOWN(__pa(p2m_missing))) {
 		/* p2m leaf page is missing */
@@ -567,7 +563,7 @@ static bool alloc_p2m(unsigned long pfn)
 		if (p2m_pfn == PFN_DOWN(__pa(p2m_missing)))
 			p2m_init(p2m);
 		else
-			p2m_init_identity(p2m, pfn);
+			p2m_init_identity(p2m, pfn & ~(P2M_PER_PAGE - 1));
 
 		spin_lock_irqsave(&p2m_update_lock, flags);
 
@@ -652,100 +648,21 @@ bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 	return true;
 }
 
-#define M2P_OVERRIDE_HASH_SHIFT	10
-#define M2P_OVERRIDE_HASH	(1 << M2P_OVERRIDE_HASH_SHIFT)
-
-static struct list_head *m2p_overrides;
-static DEFINE_SPINLOCK(m2p_override_lock);
-
-static void __init m2p_override_init(void)
-{
-	unsigned i;
-
-	m2p_overrides = alloc_bootmem_align(
-				sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
-				sizeof(unsigned long));
-
-	for (i = 0; i < M2P_OVERRIDE_HASH; i++)
-		INIT_LIST_HEAD(&m2p_overrides[i]);
-}
-
-static unsigned long mfn_hash(unsigned long mfn)
-{
-	return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
-}
-
-/* Add an MFN override for a particular page */
-static int m2p_add_override(unsigned long mfn, struct page *page,
-			    struct gnttab_map_grant_ref *kmap_op)
-{
-	unsigned long flags;
-	unsigned long pfn;
-	unsigned long uninitialized_var(address);
-	unsigned level;
-	pte_t *ptep = NULL;
-
-	pfn = page_to_pfn(page);
-	if (!PageHighMem(page)) {
-		address = (unsigned long)__va(pfn << PAGE_SHIFT);
-		ptep = lookup_address(address, &level);
-		if (WARN(ptep == NULL || level != PG_LEVEL_4K,
-			 "m2p_add_override: pfn %lx not mapped", pfn))
-			return -EINVAL;
-	}
-
-	if (kmap_op != NULL) {
-		if (!PageHighMem(page)) {
-			struct multicall_space mcs =
-				xen_mc_entry(sizeof(*kmap_op));
-
-			MULTI_grant_table_op(mcs.mc,
-					GNTTABOP_map_grant_ref, kmap_op, 1);
-
-			xen_mc_issue(PARAVIRT_LAZY_MMU);
-		}
-	}
-	spin_lock_irqsave(&m2p_override_lock, flags);
-	list_add(&page->lru,  &m2p_overrides[mfn_hash(mfn)]);
-	spin_unlock_irqrestore(&m2p_override_lock, flags);
-
-	/* p2m(m2p(mfn)) == mfn: the mfn is already present somewhere in
-	 * this domain. Set the FOREIGN_FRAME_BIT in the p2m for the other
-	 * pfn so that the following mfn_to_pfn(mfn) calls will return the
-	 * pfn from the m2p_override (the backend pfn) instead.
-	 * We need to do this because the pages shared by the frontend
-	 * (xen-blkfront) can be already locked (lock_page, called by
-	 * do_read_cache_page); when the userspace backend tries to use them
-	 * with direct_IO, mfn_to_pfn returns the pfn of the frontend, so
-	 * do_blockdev_direct_IO is going to try to lock the same pages
-	 * again resulting in a deadlock.
-	 * As a side effect get_user_pages_fast might not be safe on the
-	 * frontend pages while they are being shared with the backend,
-	 * because mfn_to_pfn (that ends up being called by GUPF) will
-	 * return the backend pfn rather than the frontend pfn. */
-	pfn = mfn_to_pfn_no_overrides(mfn);
-	if (__pfn_to_mfn(pfn) == mfn)
-		set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
-
-	return 0;
-}
-
 int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
 			    struct gnttab_map_grant_ref *kmap_ops,
 			    struct page **pages, unsigned int count)
 {
 	int i, ret = 0;
-	bool lazy = false;
 	pte_t *pte;
 
 	if (xen_feature(XENFEAT_auto_translated_physmap))
 		return 0;
 
-	if (kmap_ops &&
-	    !in_interrupt() &&
-	    paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
-		arch_enter_lazy_mmu_mode();
-		lazy = true;
+	if (kmap_ops) {
+		ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
+						kmap_ops, count);
+		if (ret)
+			goto out;
 	}
 
 	for (i = 0; i < count; i++) {
@@ -764,170 +681,28 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
 		}
 		pfn = page_to_pfn(pages[i]);
 
-		WARN_ON(PagePrivate(pages[i]));
-		SetPagePrivate(pages[i]);
-		set_page_private(pages[i], mfn);
-		pages[i]->index = pfn_to_mfn(pfn);
+		WARN(pfn_to_mfn(pfn) != INVALID_P2M_ENTRY, "page must be ballooned");
 
 		if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) {
 			ret = -ENOMEM;
 			goto out;
 		}
-
-		if (kmap_ops) {
-			ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]);
-			if (ret)
-				goto out;
-		}
 	}
 
 out:
-	if (lazy)
-		arch_leave_lazy_mmu_mode();
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping);
 
-static struct page *m2p_find_override(unsigned long mfn)
-{
-	unsigned long flags;
-	struct list_head *bucket;
-	struct page *p, *ret;
-
-	if (unlikely(!m2p_overrides))
-		return NULL;
-
-	ret = NULL;
-	bucket = &m2p_overrides[mfn_hash(mfn)];
-
-	spin_lock_irqsave(&m2p_override_lock, flags);
-
-	list_for_each_entry(p, bucket, lru) {
-		if (page_private(p) == mfn) {
-			ret = p;
-			break;
-		}
-	}
-
-	spin_unlock_irqrestore(&m2p_override_lock, flags);
-
-	return ret;
-}
-
-static int m2p_remove_override(struct page *page,
-			       struct gnttab_map_grant_ref *kmap_op,
-			       unsigned long mfn)
-{
-	unsigned long flags;
-	unsigned long pfn;
-	unsigned long uninitialized_var(address);
-	unsigned level;
-	pte_t *ptep = NULL;
-
-	pfn = page_to_pfn(page);
-
-	if (!PageHighMem(page)) {
-		address = (unsigned long)__va(pfn << PAGE_SHIFT);
-		ptep = lookup_address(address, &level);
-
-		if (WARN(ptep == NULL || level != PG_LEVEL_4K,
-			 "m2p_remove_override: pfn %lx not mapped", pfn))
-			return -EINVAL;
-	}
-
-	spin_lock_irqsave(&m2p_override_lock, flags);
-	list_del(&page->lru);
-	spin_unlock_irqrestore(&m2p_override_lock, flags);
-
-	if (kmap_op != NULL) {
-		if (!PageHighMem(page)) {
-			struct multicall_space mcs;
-			struct gnttab_unmap_and_replace *unmap_op;
-			struct page *scratch_page = get_balloon_scratch_page();
-			unsigned long scratch_page_address = (unsigned long)
-				__va(page_to_pfn(scratch_page) << PAGE_SHIFT);
-
-			/*
-			 * It might be that we queued all the m2p grant table
-			 * hypercalls in a multicall, then m2p_remove_override
-			 * get called before the multicall has actually been
-			 * issued. In this case handle is going to -1 because
-			 * it hasn't been modified yet.
-			 */
-			if (kmap_op->handle == -1)
-				xen_mc_flush();
-			/*
-			 * Now if kmap_op->handle is negative it means that the
-			 * hypercall actually returned an error.
-			 */
-			if (kmap_op->handle == GNTST_general_error) {
-				pr_warn("m2p_remove_override: pfn %lx mfn %lx, failed to modify kernel mappings",
-					pfn, mfn);
-				put_balloon_scratch_page();
-				return -1;
-			}
-
-			xen_mc_batch();
-
-			mcs = __xen_mc_entry(
-				sizeof(struct gnttab_unmap_and_replace));
-			unmap_op = mcs.args;
-			unmap_op->host_addr = kmap_op->host_addr;
-			unmap_op->new_addr = scratch_page_address;
-			unmap_op->handle = kmap_op->handle;
-
-			MULTI_grant_table_op(mcs.mc,
-				GNTTABOP_unmap_and_replace, unmap_op, 1);
-
-			mcs = __xen_mc_entry(0);
-			MULTI_update_va_mapping(mcs.mc, scratch_page_address,
-					pfn_pte(page_to_pfn(scratch_page),
-					PAGE_KERNEL_RO), 0);
-
-			xen_mc_issue(PARAVIRT_LAZY_MMU);
-
-			kmap_op->host_addr = 0;
-			put_balloon_scratch_page();
-		}
-	}
-
-	/* p2m(m2p(mfn)) == FOREIGN_FRAME(mfn): the mfn is already present
-	 * somewhere in this domain, even before being added to the
-	 * m2p_override (see comment above in m2p_add_override).
-	 * If there are no other entries in the m2p_override corresponding
-	 * to this mfn, then remove the FOREIGN_FRAME_BIT from the p2m for
-	 * the original pfn (the one shared by the frontend): the backend
-	 * cannot do any IO on this page anymore because it has been
-	 * unshared. Removing the FOREIGN_FRAME_BIT from the p2m entry of
-	 * the original pfn causes mfn_to_pfn(mfn) to return the frontend
-	 * pfn again. */
-	mfn &= ~FOREIGN_FRAME_BIT;
-	pfn = mfn_to_pfn_no_overrides(mfn);
-	if (__pfn_to_mfn(pfn) == FOREIGN_FRAME(mfn) &&
-			m2p_find_override(mfn) == NULL)
-		set_phys_to_machine(pfn, mfn);
-
-	return 0;
-}
-
 int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
-			      struct gnttab_map_grant_ref *kmap_ops,
+			      struct gnttab_unmap_grant_ref *kunmap_ops,
 			      struct page **pages, unsigned int count)
 {
 	int i, ret = 0;
-	bool lazy = false;
 
 	if (xen_feature(XENFEAT_auto_translated_physmap))
 		return 0;
 
-	if (kmap_ops &&
-	    !in_interrupt() &&
-	    paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
-		arch_enter_lazy_mmu_mode();
-		lazy = true;
-	}
-
 	for (i = 0; i < count; i++) {
 		unsigned long mfn = __pfn_to_mfn(page_to_pfn(pages[i]));
 		unsigned long pfn = page_to_pfn(pages[i]);
@@ -937,36 +712,16 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
 			goto out;
 		}
 
-		set_page_private(pages[i], INVALID_P2M_ENTRY);
-		WARN_ON(!PagePrivate(pages[i]));
-		ClearPagePrivate(pages[i]);
-		set_phys_to_machine(pfn, pages[i]->index);
-
-		if (kmap_ops)
-			ret = m2p_remove_override(pages[i], &kmap_ops[i], mfn);
-		if (ret)
-			goto out;
+		set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
 	}
-
+	if (kunmap_ops)
+		ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+						kunmap_ops, count);
 out:
-	if (lazy)
-		arch_leave_lazy_mmu_mode();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping);
 
-unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
-{
-	struct page *p = m2p_find_override(mfn);
-	unsigned long ret = pfn;
-
-	if (p)
-		ret = page_to_pfn(p);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
-
 #ifdef CONFIG_XEN_DEBUG_FS
 #include <linux/debugfs.h>
 #include "debugfs.h"
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 865e56cea7a0..55f388ef481a 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -32,16 +32,6 @@
 #include "p2m.h"
 #include "mmu.h"
 
-/* These are code, but not functions.  Defined in entry.S */
-extern const char xen_hypervisor_callback[];
-extern const char xen_failsafe_callback[];
-#ifdef CONFIG_X86_64
-extern asmlinkage void nmi(void);
-#endif
-extern void xen_sysenter_target(void);
-extern void xen_syscall_target(void);
-extern void xen_syscall32_target(void);
-
 /* Amount of extra memory space we add to the e820 ranges */
 struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
 
@@ -74,7 +64,7 @@ static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
  */
 #define EXTRA_MEM_RATIO		(10)
 
-static void __init xen_add_extra_mem(u64 start, u64 size)
+static void __init xen_add_extra_mem(phys_addr_t start, phys_addr_t size)
 {
 	int i;
 
@@ -97,10 +87,10 @@ static void __init xen_add_extra_mem(u64 start, u64 size)
 	memblock_reserve(start, size);
 }
 
-static void __init xen_del_extra_mem(u64 start, u64 size)
+static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size)
 {
 	int i;
-	u64 start_r, size_r;
+	phys_addr_t start_r, size_r;
 
 	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
 		start_r = xen_extra_mem[i].start;
@@ -267,7 +257,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
 static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn)
 {
 	struct mmu_update update = {
-		.ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
+		.ptr = ((uint64_t)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
 		.val = pfn
 	};
 
@@ -545,20 +535,21 @@ static unsigned long __init xen_get_max_pages(void)
 	return min(max_pages, MAX_DOMAIN_PAGES);
 }
 
-static void xen_align_and_add_e820_region(u64 start, u64 size, int type)
+static void __init xen_align_and_add_e820_region(phys_addr_t start,
+						 phys_addr_t size, int type)
 {
-	u64 end = start + size;
+	phys_addr_t end = start + size;
 
 	/* Align RAM regions to page boundaries. */
 	if (type == E820_RAM) {
 		start = PAGE_ALIGN(start);
-		end &= ~((u64)PAGE_SIZE - 1);
+		end &= ~((phys_addr_t)PAGE_SIZE - 1);
 	}
 
 	e820_add_region(start, end - start, type);
 }
 
-void xen_ignore_unusable(struct e820entry *list, size_t map_size)
+static void __init xen_ignore_unusable(struct e820entry *list, size_t map_size)
 {
 	struct e820entry *entry;
 	unsigned int i;
@@ -577,7 +568,7 @@ char * __init xen_memory_setup(void)
 	static struct e820entry map[E820MAX] __initdata;
 
 	unsigned long max_pfn = xen_start_info->nr_pages;
-	unsigned long long mem_end;
+	phys_addr_t mem_end;
 	int rc;
 	struct xen_memory_map memmap;
 	unsigned long max_pages;
@@ -652,16 +643,16 @@ char * __init xen_memory_setup(void)
 			  extra_pages);
 	i = 0;
 	while (i < memmap.nr_entries) {
-		u64 addr = map[i].addr;
-		u64 size = map[i].size;
+		phys_addr_t addr = map[i].addr;
+		phys_addr_t size = map[i].size;
 		u32 type = map[i].type;
 
 		if (type == E820_RAM) {
 			if (addr < mem_end) {
 				size = min(size, mem_end - addr);
 			} else if (extra_pages) {
-				size = min(size, (u64)extra_pages * PAGE_SIZE);
-				extra_pages -= size / PAGE_SIZE;
+				size = min(size, PFN_PHYS(extra_pages));
+				extra_pages -= PFN_DOWN(size);
 				xen_add_extra_mem(addr, size);
 				xen_max_p2m_pfn = PFN_DOWN(addr + size);
 			} else
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 4c071aeb8417..08e8489c47f1 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -507,7 +507,7 @@ static int xen_cpu_disable(void)
 static void xen_cpu_die(unsigned int cpu)
 {
 	while (xen_pv_domain() && HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
-		current->state = TASK_UNINTERRUPTIBLE;
+		__set_current_state(TASK_UNINTERRUPTIBLE);
 		schedule_timeout(HZ/10);
 	}
 
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 23b45eb9a89c..956374c1edbc 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -41,7 +41,7 @@ static u8 zero_stats;
 static inline void check_zero(void)
 {
 	u8 ret;
-	u8 old = ACCESS_ONCE(zero_stats);
+	u8 old = READ_ONCE(zero_stats);
 	if (unlikely(old)) {
 		ret = cmpxchg(&zero_stats, old, 0);
 		/* This ensures only one fellow resets the stat */
@@ -112,6 +112,7 @@ __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
 	struct xen_lock_waiting *w = this_cpu_ptr(&lock_waiting);
 	int cpu = smp_processor_id();
 	u64 start;
+	__ticket_t head;
 	unsigned long flags;
 
 	/* If kicker interrupts not initialized yet, just spin */
@@ -159,11 +160,15 @@ __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
 	 */
 	__ticket_enter_slowpath(lock);
 
+	/* make sure enter_slowpath, which is atomic does not cross the read */
+	smp_mb__after_atomic();
+
 	/*
 	 * check again make sure it didn't become free while
 	 * we weren't looking
 	 */
-	if (ACCESS_ONCE(lock->tickets.head) == want) {
+	head = READ_ONCE(lock->tickets.head);
+	if (__tickets_equal(head, want)) {
 		add_stats(TAKEN_SLOW_PICKUP, 1);
 		goto out;
 	}
@@ -204,8 +209,8 @@ static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next)
 		const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu);
 
 		/* Make sure we read lock before want */
-		if (ACCESS_ONCE(w->lock) == lock &&
-		    ACCESS_ONCE(w->want) == next) {
+		if (READ_ONCE(w->lock) == lock &&
+		    READ_ONCE(w->want) == next) {
 			add_stats(RELEASED_SLOW_KICKED, 1);
 			xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
 			break;
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 69087341d9ae..55da33b1d51c 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -479,6 +479,10 @@ static void __init xen_time_init(void)
 	int cpu = smp_processor_id();
 	struct timespec tp;
 
+	/* As Dom0 is never moved, no penalty on using TSC there */
+	if (xen_initial_domain())
+		xen_clocksource.rating = 275;
+
 	clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);
 
 	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 5686bd9d58cc..9e195c683549 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -10,6 +10,12 @@
 extern const char xen_hypervisor_callback[];
 extern const char xen_failsafe_callback[];
 
+void xen_sysenter_target(void);
+#ifdef CONFIG_X86_64
+void xen_syscall_target(void);
+void xen_syscall32_target(void);
+#endif
+
 extern void *xen_initial_gdt;
 
 struct trap_info;
author	Ingo Molnar <mingo@kernel.org>	2015-03-23 10:13:36 +0100
committer	Ingo Molnar <mingo@kernel.org>	2015-03-23 10:13:36 +0100
commit	eda2360ad18b7cde87728fad85c6735a52c2576e (patch)
tree	2bc28704b40686c0d3d0fd5aa7d000f9d9dd153f /arch/x86
parent	1d23c4518b1f3a03c278f23333149245c178d2a6 (diff)
parent	bc465aa9d045feb0e13b4a8f32cc33c1943f62d6 (diff)