summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-02-13 09:55:09 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2015-02-13 09:55:09 -0800
commitb9085bcbf5f43adf60533f9b635b2e7faeed0fe9 (patch)
treee397abf5682a45c096e75b3d0fa99c8e228425fc /arch
parentc7d7b98671552abade78834c522b7308bda73c0d (diff)
parent6557bada461afeaa920a189fae2cff7c8fdce39f (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM update from Paolo Bonzini: "Fairly small update, but there are some interesting new features. Common: Optional support for adding a small amount of polling on each HLT instruction executed in the guest (or equivalent for other architectures). This can improve latency up to 50% on some scenarios (e.g. O_DSYNC writes or TCP_RR netperf tests). This also has to be enabled manually for now, but the plan is to auto-tune this in the future. ARM/ARM64: The highlights are support for GICv3 emulation and dirty page tracking s390: Several optimizations and bugfixes. Also a first: a feature exposed by KVM (UUID and long guest name in /proc/sysinfo) before it is available in IBM's hypervisor! :) MIPS: Bugfixes. x86: Support for PML (page modification logging, a new feature in Broadwell Xeons that speeds up dirty page tracking), nested virtualization improvements (nested APICv---a nice optimization), usual round of emulation fixes. There is also a new option to reduce latency of the TSC deadline timer in the guest; this needs to be tuned manually. Some commits are common between this pull and Catalin's; I see you have already included his tree. Powerpc: Nothing yet. The KVM/PPC changes will come in through the PPC maintainers, because I haven't received them yet and I might end up being offline for some part of next week" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (130 commits) KVM: ia64: drop kvm.h from installed user headers KVM: x86: fix build with !CONFIG_SMP KVM: x86: emulate: correct page fault error code for NoWrite instructions KVM: Disable compat ioctl for s390 KVM: s390: add cpu model support KVM: s390: use facilities and cpu_id per KVM KVM: s390/CPACF: Choose crypto control block format s390/kernel: Update /proc/sysinfo file with Extended Name and UUID KVM: s390: reenable LPP facility KVM: s390: floating irqs: fix user triggerable endless loop kvm: add halt_poll_ns module parameter kvm: remove KVM_MMIO_SIZE KVM: MIPS: Don't leak FPU/DSP to guest KVM: MIPS: Disable HTW while in guest KVM: nVMX: Enable nested posted interrupt processing KVM: nVMX: Enable nested virtual interrupt delivery KVM: nVMX: Enable nested apic register virtualization KVM: nVMX: Make nested control MSRs per-cpu KVM: nVMX: Enable nested virtualize x2apic mode KVM: nVMX: Prepare for using hardware MSR bitmap ...
Diffstat (limited to 'arch')
-rw-r--r--arch/arm/include/asm/kvm_asm.h1
-rw-r--r--arch/arm/include/asm/kvm_emulate.h5
-rw-r--r--arch/arm/include/asm/kvm_host.h6
-rw-r--r--arch/arm/include/asm/kvm_mmio.h1
-rw-r--r--arch/arm/include/asm/kvm_mmu.h21
-rw-r--r--arch/arm/include/asm/pgtable-3level.h1
-rw-r--r--arch/arm/include/uapi/asm/kvm.h2
-rw-r--r--arch/arm/kvm/Kconfig2
-rw-r--r--arch/arm/kvm/Makefile1
-rw-r--r--arch/arm/kvm/arm.c58
-rw-r--r--arch/arm/kvm/handle_exit.c8
-rw-r--r--arch/arm/kvm/interrupts.S11
-rw-r--r--arch/arm/kvm/mmu.c271
-rw-r--r--arch/arm/kvm/psci.c17
-rw-r--r--arch/arm/kvm/trace.h11
-rw-r--r--arch/arm64/include/asm/esr.h1
-rw-r--r--arch/arm64/include/asm/kvm_asm.h1
-rw-r--r--arch/arm64/include/asm/kvm_emulate.h10
-rw-r--r--arch/arm64/include/asm/kvm_host.h7
-rw-r--r--arch/arm64/include/asm/kvm_mmio.h1
-rw-r--r--arch/arm64/include/asm/kvm_mmu.h21
-rw-r--r--arch/arm64/include/asm/pgtable-hwdef.h1
-rw-r--r--arch/arm64/include/uapi/asm/kvm.h9
-rw-r--r--arch/arm64/kernel/asm-offsets.c1
-rw-r--r--arch/arm64/kvm/Kconfig2
-rw-r--r--arch/arm64/kvm/Makefile2
-rw-r--r--arch/arm64/kvm/handle_exit.c13
-rw-r--r--arch/arm64/kvm/hyp.S22
-rw-r--r--arch/arm64/kvm/sys_regs.c40
-rw-r--r--arch/arm64/kvm/trace.h55
-rw-r--r--arch/arm64/kvm/vgic-v3-switch.S14
-rw-r--r--arch/ia64/include/uapi/asm/Kbuild1
-rw-r--r--arch/mips/include/asm/kvm_host.h1
-rw-r--r--arch/mips/kvm/locore.S2
-rw-r--r--arch/mips/kvm/mips.c23
-rw-r--r--arch/powerpc/include/asm/kvm_host.h1
-rw-r--r--arch/powerpc/kvm/book3s.c1
-rw-r--r--arch/powerpc/kvm/booke.c1
-rw-r--r--arch/powerpc/kvm/powerpc.c3
-rw-r--r--arch/s390/include/asm/kvm_host.h56
-rw-r--r--arch/s390/include/asm/sclp.h4
-rw-r--r--arch/s390/include/asm/sysinfo.h10
-rw-r--r--arch/s390/include/uapi/asm/kvm.h37
-rw-r--r--arch/s390/kernel/sysinfo.c29
-rw-r--r--arch/s390/kvm/gaccess.c4
-rw-r--r--arch/s390/kvm/intercept.c41
-rw-r--r--arch/s390/kvm/interrupt.c191
-rw-r--r--arch/s390/kvm/kvm-s390.c596
-rw-r--r--arch/s390/kvm/kvm-s390.h19
-rw-r--r--arch/s390/kvm/priv.c13
-rw-r--r--arch/s390/kvm/sigp.c160
-rw-r--r--arch/s390/kvm/trace-s390.h14
-rw-r--r--arch/x86/include/asm/kvm_emulate.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h59
-rw-r--r--arch/x86/include/asm/vmx.h4
-rw-r--r--arch/x86/include/uapi/asm/msr-index.h3
-rw-r--r--arch/x86/include/uapi/asm/vmx.h6
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/emulate.c230
-rw-r--r--arch/x86/kvm/ioapic.h2
-rw-r--r--arch/x86/kvm/iommu.c4
-rw-r--r--arch/x86/kvm/lapic.c147
-rw-r--r--arch/x86/kvm/lapic.h6
-rw-r--r--arch/x86/kvm/mmu.c351
-rw-r--r--arch/x86/kvm/mmu.h17
-rw-r--r--arch/x86/kvm/svm.c4
-rw-r--r--arch/x86/kvm/trace.h38
-rw-r--r--arch/x86/kvm/vmx.c1086
-rw-r--r--arch/x86/kvm/x86.c209
-rw-r--r--arch/x86/kvm/x86.h3
70 files changed, 3276 insertions, 718 deletions
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
index 3a67bec72d0c..25410b2d8bc1 100644
--- a/arch/arm/include/asm/kvm_asm.h
+++ b/arch/arm/include/asm/kvm_asm.h
@@ -96,6 +96,7 @@ extern char __kvm_hyp_code_end[];
extern void __kvm_flush_vm_context(void);
extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa);
+extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
#endif
diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
index 7b0152321b20..a9c80a2ea1a7 100644
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -23,6 +23,7 @@
#include <asm/kvm_asm.h>
#include <asm/kvm_mmio.h>
#include <asm/kvm_arm.h>
+#include <asm/cputype.h>
unsigned long *vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num);
unsigned long *vcpu_spsr(struct kvm_vcpu *vcpu);
@@ -177,9 +178,9 @@ static inline u32 kvm_vcpu_hvc_get_imm(struct kvm_vcpu *vcpu)
return kvm_vcpu_get_hsr(vcpu) & HSR_HVC_IMM_MASK;
}
-static inline unsigned long kvm_vcpu_get_mpidr(struct kvm_vcpu *vcpu)
+static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.cp15[c0_MPIDR];
+ return vcpu->arch.cp15[c0_MPIDR] & MPIDR_HWID_BITMASK;
}
static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu)
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 04b4ea0b550a..41008cd7c53f 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -68,6 +68,7 @@ struct kvm_arch {
/* Interrupt controller */
struct vgic_dist vgic;
+ int max_vcpus;
};
#define KVM_NR_MEM_OBJS 40
@@ -144,6 +145,7 @@ struct kvm_vm_stat {
};
struct kvm_vcpu_stat {
+ u32 halt_successful_poll;
u32 halt_wakeup;
};
@@ -231,6 +233,10 @@ static inline void vgic_arch_setup(const struct vgic_params *vgic)
int kvm_perf_init(void);
int kvm_perf_teardown(void);
+void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
+
+struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
+
static inline void kvm_arch_hardware_disable(void) {}
static inline void kvm_arch_hardware_unsetup(void) {}
static inline void kvm_arch_sync_events(struct kvm *kvm) {}
diff --git a/arch/arm/include/asm/kvm_mmio.h b/arch/arm/include/asm/kvm_mmio.h
index adcc0d7d3175..3f83db2f6cf0 100644
--- a/arch/arm/include/asm/kvm_mmio.h
+++ b/arch/arm/include/asm/kvm_mmio.h
@@ -37,6 +37,7 @@ struct kvm_exit_mmio {
u8 data[8];
u32 len;
bool is_write;
+ void *private;
};
static inline void kvm_prepare_mmio(struct kvm_run *run,
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 1bca8f8af442..37ca2a4c6f09 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -115,6 +115,27 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
pmd_val(*pmd) |= L_PMD_S2_RDWR;
}
+static inline void kvm_set_s2pte_readonly(pte_t *pte)
+{
+ pte_val(*pte) = (pte_val(*pte) & ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY;
+}
+
+static inline bool kvm_s2pte_readonly(pte_t *pte)
+{
+ return (pte_val(*pte) & L_PTE_S2_RDWR) == L_PTE_S2_RDONLY;
+}
+
+static inline void kvm_set_s2pmd_readonly(pmd_t *pmd)
+{
+ pmd_val(*pmd) = (pmd_val(*pmd) & ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY;
+}
+
+static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
+{
+ return (pmd_val(*pmd) & L_PMD_S2_RDWR) == L_PMD_S2_RDONLY;
+}
+
+
/* Open coded p*d_addr_end that can deal with 64bit addresses */
#define kvm_pgd_addr_end(addr, end) \
({ u64 __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index 423a5ac09d3a..a745a2a53853 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -129,6 +129,7 @@
#define L_PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[1] */
#define L_PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */
+#define L_PMD_S2_RDONLY (_AT(pmdval_t, 1) << 6) /* HAP[1] */
#define L_PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */
/*
diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
index 09ee408c1a67..0db25bc32864 100644
--- a/arch/arm/include/uapi/asm/kvm.h
+++ b/arch/arm/include/uapi/asm/kvm.h
@@ -175,6 +175,8 @@ struct kvm_arch_memory_slot {
#define KVM_DEV_ARM_VGIC_OFFSET_SHIFT 0
#define KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT)
#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3
+#define KVM_DEV_ARM_VGIC_GRP_CTRL 4
+#define KVM_DEV_ARM_VGIC_CTRL_INIT 0
/* KVM_IRQ_LINE irq field index values */
#define KVM_ARM_IRQ_TYPE_SHIFT 24
diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
index 3afee5f40f4f..338ace78ed18 100644
--- a/arch/arm/kvm/Kconfig
+++ b/arch/arm/kvm/Kconfig
@@ -21,8 +21,10 @@ config KVM
select PREEMPT_NOTIFIERS
select ANON_INODES
select HAVE_KVM_CPU_RELAX_INTERCEPT
+ select HAVE_KVM_ARCH_TLB_FLUSH_ALL
select KVM_MMIO
select KVM_ARM_HOST
+ select KVM_GENERIC_DIRTYLOG_READ_PROTECT
select SRCU
depends on ARM_VIRT_EXT && ARM_LPAE
---help---
diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile
index f7057ed045b6..443b8bea43e9 100644
--- a/arch/arm/kvm/Makefile
+++ b/arch/arm/kvm/Makefile
@@ -22,4 +22,5 @@ obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o
obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic.o
obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2.o
+obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2-emul.o
obj-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 0b0d58a905c4..07e7eb1d7ab6 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -132,6 +132,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
/* Mark the initial VMID generation invalid */
kvm->arch.vmid_gen = 0;
+ /* The maximum number of VCPUs is limited by the host's GIC model */
+ kvm->arch.max_vcpus = kvm_vgic_get_max_vcpus();
+
return ret;
out_free_stage2_pgd:
kvm_free_stage2_pgd(kvm);
@@ -218,6 +221,11 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
goto out;
}
+ if (id >= kvm->arch.max_vcpus) {
+ err = -EINVAL;
+ goto out;
+ }
+
vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
if (!vcpu) {
err = -ENOMEM;
@@ -241,9 +249,8 @@ out:
return ERR_PTR(err);
}
-int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
- return 0;
}
void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
@@ -777,9 +784,39 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
}
+/**
+ * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
+ * @kvm: kvm instance
+ * @log: slot id and address to which we copy the log
+ *
+ * Steps 1-4 below provide general overview of dirty page logging. See
+ * kvm_get_dirty_log_protect() function description for additional details.
+ *
+ * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
+ * always flush the TLB (step 4) even if previous step failed and the dirty
+ * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
+ * does not preclude user space subsequent dirty log read. Flushing TLB ensures
+ * writes will be marked dirty for next log read.
+ *
+ * 1. Take a snapshot of the bit and clear it if needed.
+ * 2. Write protect the corresponding page.
+ * 3. Copy the snapshot to the userspace.
+ * 4. Flush TLB's if needed.
+ */
int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
{
- return -EINVAL;
+ bool is_dirty = false;
+ int r;
+
+ mutex_lock(&kvm->slots_lock);
+
+ r = kvm_get_dirty_log_protect(kvm, log, &is_dirty);
+
+ if (is_dirty)
+ kvm_flush_remote_tlbs(kvm);
+
+ mutex_unlock(&kvm->slots_lock);
+ return r;
}
static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
@@ -811,7 +848,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
switch (ioctl) {
case KVM_CREATE_IRQCHIP: {
if (vgic_present)
- return kvm_vgic_create(kvm);
+ return kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
else
return -ENXIO;
}
@@ -1035,6 +1072,19 @@ static void check_kvm_target_cpu(void *ret)
*(int *)ret = kvm_target_cpu();
}
+struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
+{
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ mpidr &= MPIDR_HWID_BITMASK;
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu))
+ return vcpu;
+ }
+ return NULL;
+}
+
/**
* Initialize Hyp-mode and memory mappings on all CPUs.
*/
diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c
index a96a8043277c..95f12b2ccdcb 100644
--- a/arch/arm/kvm/handle_exit.c
+++ b/arch/arm/kvm/handle_exit.c
@@ -87,11 +87,13 @@ static int handle_dabt_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run)
*/
static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run)
{
- trace_kvm_wfi(*vcpu_pc(vcpu));
- if (kvm_vcpu_get_hsr(vcpu) & HSR_WFI_IS_WFE)
+ if (kvm_vcpu_get_hsr(vcpu) & HSR_WFI_IS_WFE) {
+ trace_kvm_wfx(*vcpu_pc(vcpu), true);
kvm_vcpu_on_spin(vcpu);
- else
+ } else {
+ trace_kvm_wfx(*vcpu_pc(vcpu), false);
kvm_vcpu_block(vcpu);
+ }
kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
index 01dcb0e752d9..79caf79b304a 100644
--- a/arch/arm/kvm/interrupts.S
+++ b/arch/arm/kvm/interrupts.S
@@ -66,6 +66,17 @@ ENTRY(__kvm_tlb_flush_vmid_ipa)
bx lr
ENDPROC(__kvm_tlb_flush_vmid_ipa)
+/**
+ * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID TLBs
+ *
+ * Reuses __kvm_tlb_flush_vmid_ipa() for ARMv7, without passing address
+ * parameter
+ */
+
+ENTRY(__kvm_tlb_flush_vmid)
+ b __kvm_tlb_flush_vmid_ipa
+ENDPROC(__kvm_tlb_flush_vmid)
+
/********************************************************************
* Flush TLBs and instruction caches of all CPUs inside the inner-shareable
* domain, for all VMIDs
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 136662547ca6..3e6859bc3e11 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -45,6 +45,26 @@ static phys_addr_t hyp_idmap_vector;
#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
#define kvm_pmd_huge(_x) (pmd_huge(_x) || pmd_trans_huge(_x))
+#define kvm_pud_huge(_x) pud_huge(_x)
+
+#define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0)
+#define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1)
+
+static bool memslot_is_logging(struct kvm_memory_slot *memslot)
+{
+ return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
+}
+
+/**
+ * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
+ * @kvm: pointer to kvm structure.
+ *
+ * Interface to HYP function to flush all VM TLB entries
+ */
+void kvm_flush_remote_tlbs(struct kvm *kvm)
+{
+ kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
+}
static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
{
@@ -78,6 +98,25 @@ static void kvm_flush_dcache_pud(pud_t pud)
__kvm_flush_dcache_pud(pud);
}
+/**
+ * stage2_dissolve_pmd() - clear and flush huge PMD entry
+ * @kvm: pointer to kvm structure.
+ * @addr: IPA
+ * @pmd: pmd pointer for IPA
+ *
+ * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
+ * pages in the range dirty.
+ */
+static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
+{
+ if (!kvm_pmd_huge(*pmd))
+ return;
+
+ pmd_clear(pmd);
+ kvm_tlb_flush_vmid_ipa(kvm, addr);
+ put_page(virt_to_page(pmd));
+}
+
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
int min, int max)
{
@@ -819,10 +858,15 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
}
static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
- phys_addr_t addr, const pte_t *new_pte, bool iomap)
+ phys_addr_t addr, const pte_t *new_pte,
+ unsigned long flags)
{
pmd_t *pmd;
pte_t *pte, old_pte;
+ bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
+ bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
+
+ VM_BUG_ON(logging_active && !cache);
/* Create stage-2 page table mapping - Levels 0 and 1 */
pmd = stage2_get_pmd(kvm, cache, addr);
@@ -834,6 +878,13 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
return 0;
}
+ /*
+ * While dirty page logging - dissolve huge PMD, then continue on to
+ * allocate page.
+ */
+ if (logging_active)
+ stage2_dissolve_pmd(kvm, addr, pmd);
+
/* Create stage-2 page mappings - Level 2 */
if (pmd_none(*pmd)) {
if (!cache)
@@ -890,7 +941,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
if (ret)
goto out;
spin_lock(&kvm->mmu_lock);
- ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
+ ret = stage2_set_pte(kvm, &cache, addr, &pte,
+ KVM_S2PTE_FLAG_IS_IOMAP);
spin_unlock(&kvm->mmu_lock);
if (ret)
goto out;
@@ -957,6 +1009,165 @@ static bool kvm_is_device_pfn(unsigned long pfn)
return !pfn_valid(pfn);
}
+/**
+ * stage2_wp_ptes - write protect PMD range
+ * @pmd: pointer to pmd entry
+ * @addr: range start address
+ * @end: range end address
+ */
+static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
+{
+ pte_t *pte;
+
+ pte = pte_offset_kernel(pmd, addr);
+ do {
+ if (!pte_none(*pte)) {
+ if (!kvm_s2pte_readonly(pte))
+ kvm_set_s2pte_readonly(pte);
+ }
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+}
+
+/**
+ * stage2_wp_pmds - write protect PUD range
+ * @pud: pointer to pud entry
+ * @addr: range start address
+ * @end: range end address
+ */
+static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
+{
+ pmd_t *pmd;
+ phys_addr_t next;
+
+ pmd = pmd_offset(pud, addr);
+
+ do {
+ next = kvm_pmd_addr_end(addr, end);
+ if (!pmd_none(*pmd)) {
+ if (kvm_pmd_huge(*pmd)) {
+ if (!kvm_s2pmd_readonly(pmd))
+ kvm_set_s2pmd_readonly(pmd);
+ } else {
+ stage2_wp_ptes(pmd, addr, next);
+ }
+ }
+ } while (pmd++, addr = next, addr != end);
+}
+
+/**
+ * stage2_wp_puds - write protect PGD range
+ * @pgd: pointer to pgd entry
+ * @addr: range start address
+ * @end: range end address
+ *
+ * Process PUD entries, for a huge PUD we cause a panic.
+ */
+static void stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
+{
+ pud_t *pud;
+ phys_addr_t next;
+
+ pud = pud_offset(pgd, addr);
+ do {
+ next = kvm_pud_addr_end(addr, end);
+ if (!pud_none(*pud)) {
+ /* TODO:PUD not supported, revisit later if supported */
+ BUG_ON(kvm_pud_huge(*pud));
+ stage2_wp_pmds(pud, addr, next);
+ }
+ } while (pud++, addr = next, addr != end);
+}
+
+/**
+ * stage2_wp_range() - write protect stage2 memory region range
+ * @kvm: The KVM pointer
+ * @addr: Start address of range
+ * @end: End address of range
+ */
+static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
+{
+ pgd_t *pgd;
+ phys_addr_t next;
+
+ pgd = kvm->arch.pgd + pgd_index(addr);
+ do {
+ /*
+ * Release kvm_mmu_lock periodically if the memory region is
+ * large. Otherwise, we may see kernel panics with
+ * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
+ * CONFIG_LOCKDEP. Additionally, holding the lock too long
+ * will also starve other vCPUs.
+ */
+ if (need_resched() || spin_needbreak(&kvm->mmu_lock))
+ cond_resched_lock(&kvm->mmu_lock);
+
+ next = kvm_pgd_addr_end(addr, end);
+ if (pgd_present(*pgd))
+ stage2_wp_puds(pgd, addr, next);
+ } while (pgd++, addr = next, addr != end);
+}
+
+/**
+ * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
+ * @kvm: The KVM pointer
+ * @slot: The memory slot to write protect
+ *
+ * Called to start logging dirty pages after memory region
+ * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
+ * all present PMD and PTEs are write protected in the memory region.
+ * Afterwards read of dirty page log can be called.
+ *
+ * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
+ * serializing operations for VM memory regions.
+ */
+void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
+{
+ struct kvm_memory_slot *memslot = id_to_memslot(kvm->memslots, slot);
+ phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
+ phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
+
+ spin_lock(&kvm->mmu_lock);
+ stage2_wp_range(kvm, start, end);
+ spin_unlock(&kvm->mmu_lock);
+ kvm_flush_remote_tlbs(kvm);
+}
+
+/**
+ * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
+ * @kvm: The KVM pointer
+ * @slot: The memory slot associated with mask
+ * @gfn_offset: The gfn offset in memory slot
+ * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory
+ * slot to be write protected
+ *
+ * Walks bits set in mask write protects the associated pte's. Caller must
+ * acquire kvm_mmu_lock.
+ */
+static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask)
+{
+ phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
+ phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
+ phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
+
+ stage2_wp_range(kvm, start, end);
+}
+
+/*
+ * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
+ * dirty pages.
+ *
+ * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
+ * enable dirty logging for them.
+ */
+void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask)
+{
+ kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
+}
+
static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t pfn,
unsigned long size, bool uncached)
{
@@ -977,6 +1188,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
pfn_t pfn;
pgprot_t mem_type = PAGE_S2;
bool fault_ipa_uncached;
+ bool logging_active = memslot_is_logging(memslot);
+ unsigned long flags = 0;
write_fault = kvm_is_write_fault(vcpu);
if (fault_status == FSC_PERM && !write_fault) {
@@ -993,7 +1206,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
}
- if (is_vm_hugetlb_page(vma)) {
+ if (is_vm_hugetlb_page(vma) && !logging_active) {
hugetlb = true;
gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
} else {
@@ -1034,12 +1247,30 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (is_error_pfn(pfn))
return -EFAULT;
- if (kvm_is_device_pfn(pfn))
+ if (kvm_is_device_pfn(pfn)) {
mem_type = PAGE_S2_DEVICE;
+ flags |= KVM_S2PTE_FLAG_IS_IOMAP;
+ } else if (logging_active) {
+ /*
+ * Faults on pages in a memslot with logging enabled
+ * should not be mapped with huge pages (it introduces churn
+ * and performance degradation), so force a pte mapping.
+ */
+ force_pte = true;
+ flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
+
+ /*
+ * Only actually map the page as writable if this was a write
+ * fault.
+ */
+ if (!write_fault)
+ writable = false;
+ }
spin_lock(&kvm->mmu_lock);
if (mmu_notifier_retry(kvm, mmu_seq))
goto out_unlock;
+
if (!hugetlb && !force_pte)
hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
@@ -1056,16 +1287,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
} else {
pte_t new_pte = pfn_pte(pfn, mem_type);
+
if (writable) {
kvm_set_s2pte_writable(&new_pte);
kvm_set_pfn_dirty(pfn);
+ mark_page_dirty(kvm, gfn);
}
coherent_cache_guest_page(vcpu, pfn, PAGE_SIZE, fault_ipa_uncached);
- ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
- pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
+ ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
}
-
out_unlock:
spin_unlock(&kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
@@ -1215,7 +1446,14 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
{
pte_t *pte = (pte_t *)data;
- stage2_set_pte(kvm, NULL, gpa, pte, false);
+ /*
+ * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
+ * flag clear because MMU notifiers will have unmapped a huge PMD before
+ * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
+ * therefore stage2_set_pte() never needs to clear out a huge PMD
+ * through this calling path.
+ */
+ stage2_set_pte(kvm, NULL, gpa, pte, 0);
}
@@ -1348,6 +1586,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
const struct kvm_memory_slot *old,
enum kvm_mr_change change)
{
+ /*
+ * At this point memslot has been committed and there is an
+ * allocated dirty_bitmap[], dirty pages will be be tracked while the
+ * memory slot is write protected.
+ */
+ if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES)
+ kvm_mmu_wp_memory_region(kvm, mem->slot);
}
int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -1360,7 +1605,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
bool writable = !(mem->flags & KVM_MEM_READONLY);
int ret = 0;
- if (change != KVM_MR_CREATE && change != KVM_MR_MOVE)
+ if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
+ change != KVM_MR_FLAGS_ONLY)
return 0;
/*
@@ -1411,6 +1657,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
phys_addr_t pa = (vma->vm_pgoff << PAGE_SHIFT) +
vm_start - vma->vm_start;
+ /* IO region dirty page logging not allowed */
+ if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES)
+ return -EINVAL;
+
ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
vm_end - vm_start,
writable);
@@ -1420,6 +1670,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
hva = vm_end;
} while (hva < reg_end);
+ if (change == KVM_MR_FLAGS_ONLY)
+ return ret;
+
spin_lock(&kvm->mmu_lock);
if (ret)
unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c
index 58cb3248d277..02fa8eff6ae1 100644
--- a/arch/arm/kvm/psci.c
+++ b/arch/arm/kvm/psci.c
@@ -22,6 +22,7 @@
#include <asm/cputype.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_psci.h>
+#include <asm/kvm_host.h>
/*
* This is an implementation of the Power State Coordination Interface
@@ -66,25 +67,17 @@ static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu)
static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
{
struct kvm *kvm = source_vcpu->kvm;
- struct kvm_vcpu *vcpu = NULL, *tmp;
+ struct kvm_vcpu *vcpu = NULL;
wait_queue_head_t *wq;
unsigned long cpu_id;
unsigned long context_id;
- unsigned long mpidr;
phys_addr_t target_pc;
- int i;
- cpu_id = *vcpu_reg(source_vcpu, 1);
+ cpu_id = *vcpu_reg(source_vcpu, 1) & MPIDR_HWID_BITMASK;
if (vcpu_mode_is_32bit(source_vcpu))
cpu_id &= ~((u32) 0);
- kvm_for_each_vcpu(i, tmp, kvm) {
- mpidr = kvm_vcpu_get_mpidr(tmp);
- if ((mpidr & MPIDR_HWID_BITMASK) == (cpu_id & MPIDR_HWID_BITMASK)) {
- vcpu = tmp;
- break;
- }
- }
+ vcpu = kvm_mpidr_to_vcpu(kvm, cpu_id);
/*
* Make sure the caller requested a valid CPU and that the CPU is
@@ -155,7 +148,7 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
* then ON else OFF
*/
kvm_for_each_vcpu(i, tmp, kvm) {
- mpidr = kvm_vcpu_get_mpidr(tmp);
+ mpidr = kvm_vcpu_get_mpidr_aff(tmp);
if (((mpidr & target_affinity_mask) == target_affinity) &&
!tmp->arch.pause) {
return PSCI_0_2_AFFINITY_LEVEL_ON;
diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h
index b6a6e7102201..881874b1a036 100644
--- a/arch/arm/kvm/trace.h
+++ b/arch/arm/kvm/trace.h
@@ -140,19 +140,22 @@ TRACE_EVENT(kvm_emulate_cp15_imp,
__entry->CRm, __entry->Op2)
);
-TRACE_EVENT(kvm_wfi,
- TP_PROTO(unsigned long vcpu_pc),
- TP_ARGS(vcpu_pc),
+TRACE_EVENT(kvm_wfx,
+ TP_PROTO(unsigned long vcpu_pc, bool is_wfe),
+ TP_ARGS(vcpu_pc, is_wfe),
TP_STRUCT__entry(
__field( unsigned long, vcpu_pc )
+ __field( bool, is_wfe )
),
TP_fast_assign(
__entry->vcpu_pc = vcpu_pc;
+ __entry->is_wfe = is_wfe;
),
- TP_printk("guest executed wfi at: 0x%08lx", __entry->vcpu_pc)
+ TP_printk("guest executed wf%c at: 0x%08lx",
+ __entry->is_wfe ? 'e' : 'i', __entry->vcpu_pc)
);
TRACE_EVENT(kvm_unmap_hva,
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index 62167090937d..92bbae381598 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -96,6 +96,7 @@
#define ESR_ELx_COND_SHIFT (20)
#define ESR_ELx_COND_MASK (UL(0xF) << ESR_ELx_COND_SHIFT)
#define ESR_ELx_WFx_ISS_WFE (UL(1) << 0)
+#define ESR_ELx_xVC_IMM_MASK ((1UL << 16) - 1)
#ifndef __ASSEMBLY__
#include <asm/types.h>
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 483842180f8f..4f7310fa77f0 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -126,6 +126,7 @@ extern char __kvm_hyp_vector[];
extern void __kvm_flush_vm_context(void);
extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa);
+extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 0163b5775ca5..17e92f05b1fe 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -29,6 +29,7 @@
#include <asm/kvm_asm.h>
#include <asm/kvm_mmio.h>
#include <asm/ptrace.h>
+#include <asm/cputype.h>
unsigned long *vcpu_reg32(const struct kvm_vcpu *vcpu, u8 reg_num);
unsigned long *vcpu_spsr32(const struct kvm_vcpu *vcpu);
@@ -140,6 +141,11 @@ static inline phys_addr_t kvm_vcpu_get_fault_ipa(const struct kvm_vcpu *vcpu)
return ((phys_addr_t)vcpu->arch.fault.hpfar_el2 & HPFAR_MASK) << 8;
}
+static inline u32 kvm_vcpu_hvc_get_imm(const struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_get_hsr(vcpu) & ESR_ELx_xVC_IMM_MASK;
+}
+
static inline bool kvm_vcpu_dabt_isvalid(const struct kvm_vcpu *vcpu)
{
return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_ISV);
@@ -201,9 +207,9 @@ static inline u8 kvm_vcpu_trap_get_fault_type(const struct kvm_vcpu *vcpu)
return kvm_vcpu_get_hsr(vcpu) & ESR_ELx_FSC_TYPE;
}
-static inline unsigned long kvm_vcpu_get_mpidr(struct kvm_vcpu *vcpu)
+static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu)
{
- return vcpu_sys_reg(vcpu, MPIDR_EL1);
+ return vcpu_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK;
}
static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index acd101a9014d..8ac3c70fe3c6 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -59,6 +59,9 @@ struct kvm_arch {
/* VTTBR value associated with above pgd and vmid */
u64 vttbr;
+ /* The maximum number of vCPUs depends on the used GIC model */
+ int max_vcpus;
+
/* Interrupt controller */
struct vgic_dist vgic;
@@ -159,6 +162,7 @@ struct kvm_vm_stat {
};
struct kvm_vcpu_stat {
+ u32 halt_successful_poll;
u32 halt_wakeup;
};
@@ -196,6 +200,7 @@ struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);
u64 kvm_call_hyp(void *hypfn, ...);
void force_vm_exit(const cpumask_t *mask);
+void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
int exception_index);
@@ -203,6 +208,8 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
int kvm_perf_init(void);
int kvm_perf_teardown(void);
+struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
+
static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
phys_addr_t pgd_ptr,
unsigned long hyp_stack_ptr,
diff --git a/arch/arm64/include/asm/kvm_mmio.h b/arch/arm64/include/asm/kvm_mmio.h
index fc2f689c0694..9f52beb7cb13 100644
--- a/arch/arm64/include/asm/kvm_mmio.h
+++ b/arch/arm64/include/asm/kvm_mmio.h
@@ -40,6 +40,7 @@ struct kvm_exit_mmio {
u8 data[8];
u32 len;
bool is_write;
+ void *private;
};
static inline void kvm_prepare_mmio(struct kvm_run *run,
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index adcf49547301..6458b5373142 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -118,6 +118,27 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
pmd_val(*pmd) |= PMD_S2_RDWR;
}
+static inline void kvm_set_s2pte_readonly(pte_t *pte)
+{
+ pte_val(*pte) = (pte_val(*pte) & ~PTE_S2_RDWR) | PTE_S2_RDONLY;
+}
+
+static inline bool kvm_s2pte_readonly(pte_t *pte)
+{
+ return (pte_val(*pte) & PTE_S2_RDWR) == PTE_S2_RDONLY;
+}
+
+static inline void kvm_set_s2pmd_readonly(pmd_t *pmd)
+{
+ pmd_val(*pmd) = (pmd_val(*pmd) & ~PMD_S2_RDWR) | PMD_S2_RDONLY;
+}
+
+static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
+{
+ return (pmd_val(*pmd) & PMD_S2_RDWR) == PMD_S2_RDONLY;
+}
+
+
#define kvm_pgd_addr_end(addr, end) pgd_addr_end(addr, end)
#define kvm_pud_addr_end(addr, end) pud_addr_end(addr, end)
#define kvm_pmd_addr_end(addr, end) pmd_addr_end(addr, end)
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index 88174e0bfafe..5f930cc9ea83 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -119,6 +119,7 @@
#define PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[2:1] */
#define PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */
+#define PMD_S2_RDONLY (_AT(pmdval_t, 1) << 6) /* HAP[2:1] */
#define PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */
/*
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 8e38878c87c6..3ef77a466018 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -78,6 +78,13 @@ struct kvm_regs {
#define KVM_VGIC_V2_DIST_SIZE 0x1000
#define KVM_VGIC_V2_CPU_SIZE 0x2000
+/* Supported VGICv3 address types */
+#define KVM_VGIC_V3_ADDR_TYPE_DIST 2
+#define KVM_VGIC_V3_ADDR_TYPE_REDIST 3
+
+#define KVM_VGIC_V3_DIST_SIZE SZ_64K
+#define KVM_VGIC_V3_REDIST_SIZE (2 * SZ_64K)
+
#define KVM_ARM_VCPU_POWER_OFF 0 /* CPU is started in OFF state */
#define KVM_ARM_VCPU_EL1_32BIT 1 /* CPU running a 32bit VM */
#define KVM_ARM_VCPU_PSCI_0_2 2 /* CPU uses PSCI v0.2 */
@@ -161,6 +168,8 @@ struct kvm_arch_memory_slot {
#define KVM_DEV_ARM_VGIC_OFFSET_SHIFT 0
#define KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT)
#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3
+#define KVM_DEV_ARM_VGIC_GRP_CTRL 4
+#define KVM_DEV_ARM_VGIC_CTRL_INIT 0
/* KVM_IRQ_LINE irq field index values */
#define KVM_ARM_IRQ_TYPE_SHIFT 24
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index a2ae19403abb..f7fa65d4c352 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -140,6 +140,7 @@ int main(void)
DEFINE(VGIC_V2_CPU_ELRSR, offsetof(struct vgic_cpu, vgic_v2.vgic_elrsr));
DEFINE(VGIC_V2_CPU_APR, offsetof(struct vgic_cpu, vgic_v2.vgic_apr));
DEFINE(VGIC_V2_CPU_LR, offsetof(struct vgic_cpu, vgic_v2.vgic_lr));
+ DEFINE(VGIC_V3_CPU_SRE, offsetof(struct vgic_cpu, vgic_v3.vgic_sre));
DEFINE(VGIC_V3_CPU_HCR, offsetof(struct vgic_cpu, vgic_v3.vgic_hcr));
DEFINE(VGIC_V3_CPU_VMCR, offsetof(struct vgic_cpu, vgic_v3.vgic_vmcr));
DEFINE(VGIC_V3_CPU_MISR, offsetof(struct vgic_cpu, vgic_v3.vgic_misr));
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index b334084d3675..f5590c81d95f 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -22,10 +22,12 @@ config KVM
select PREEMPT_NOTIFIERS
select ANON_INODES
select HAVE_KVM_CPU_RELAX_INTERCEPT
+ select HAVE_KVM_ARCH_TLB_FLUSH_ALL
select KVM_MMIO
select KVM_ARM_HOST
select KVM_ARM_VGIC
select KVM_ARM_TIMER
+ select KVM_GENERIC_DIRTYLOG_READ_PROTECT
select SRCU
---help---
Support hosting virtualized guest machines.
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 32a096174b94..4e6e09ee4033 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -21,7 +21,9 @@ kvm-$(CONFIG_KVM_ARM_HOST) += guest.o reset.o sys_regs.o sys_regs_generic_v8.o
kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic.o
kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2.o
+kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2-emul.o
kvm-$(CONFIG_KVM_ARM_VGIC) += vgic-v2-switch.o
kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v3.o
+kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v3-emul.o
kvm-$(CONFIG_KVM_ARM_VGIC) += vgic-v3-switch.o
kvm-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index 29b184a8f3f8..524fa25671fc 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -28,12 +28,18 @@
#include <asm/kvm_mmu.h>
#include <asm/kvm_psci.h>
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
typedef int (*exit_handle_fn)(struct kvm_vcpu *, struct kvm_run *);
static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run)
{
int ret;
+ trace_kvm_hvc_arm64(*vcpu_pc(vcpu), *vcpu_reg(vcpu, 0),
+ kvm_vcpu_hvc_get_imm(vcpu));
+
ret = kvm_psci_call(vcpu);
if (ret < 0) {
kvm_inject_undefined(vcpu);
@@ -63,10 +69,13 @@ static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run)
*/
static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run)
{
- if (kvm_vcpu_get_hsr(vcpu) & ESR_ELx_WFx_ISS_WFE)
+ if (kvm_vcpu_get_hsr(vcpu) & ESR_ELx_WFx_ISS_WFE) {
+ trace_kvm_wfx_arm64(*vcpu_pc(vcpu), true);
kvm_vcpu_on_spin(vcpu);
- else
+ } else {
+ trace_kvm_wfx_arm64(*vcpu_pc(vcpu), false);
kvm_vcpu_block(vcpu);
+ }
kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
index 9bff671cc561..5befd010e232 100644
--- a/arch/arm64/kvm/hyp.S
+++ b/arch/arm64/kvm/hyp.S
@@ -1032,6 +1032,28 @@ ENTRY(__kvm_tlb_flush_vmid_ipa)
ret
ENDPROC(__kvm_tlb_flush_vmid_ipa)
+/**
+ * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID TLBs
+ * @struct kvm *kvm - pointer to kvm structure
+ *
+ * Invalidates all Stage 1 and 2 TLB entries for current VMID.
+ */
+ENTRY(__kvm_tlb_flush_vmid)
+ dsb ishst
+
+ kern_hyp_va x0
+ ldr x2, [x0, #KVM_VTTBR]
+ msr vttbr_el2, x2
+ isb
+
+ tlbi vmalls12e1is
+ dsb ish
+ isb
+
+ msr vttbr_el2, xzr
+ ret
+ENDPROC(__kvm_tlb_flush_vmid)
+
ENTRY(__kvm_flush_vm_context)
dsb ishst
tlbi alle1is
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index b96afdf6cee4..c370b4014799 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -113,6 +113,27 @@ static bool access_vm_reg(struct kvm_vcpu *vcpu,
return true;
}
+/*
+ * Trap handler for the GICv3 SGI generation system register.
+ * Forward the request to the VGIC emulation.
+ * The cp15_64 code makes sure this automatically works
+ * for both AArch64 and AArch32 accesses.
+ */
+static bool access_gic_sgi(struct kvm_vcpu *vcpu,
+ const struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u64 val;
+
+ if (!p->is_write)
+ return read_from_write_only(vcpu, p);
+
+ val = *vcpu_reg(vcpu, p->Rt);
+ vgic_v3_dispatch_sgi(vcpu, val);
+
+ return true;
+}
+
static bool trap_raz_wi(struct kvm_vcpu *vcpu,
const struct sys_reg_params *p,
const struct sys_reg_desc *r)
@@ -200,10 +221,19 @@ static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
+ u64 mpidr;
+
/*
- * Simply map the vcpu_id into the Aff0 field of the MPIDR.
+ * Map the vcpu_id into the first three affinity level fields of
+ * the MPIDR. We limit the number of VCPUs in level 0 due to a
+ * limitation to 16 CPUs in that level in the ICC_SGIxR registers
+ * of the GICv3 to be able to address each CPU directly when
+ * sending IPIs.
*/
- vcpu_sys_reg(vcpu, MPIDR_EL1) = (1UL << 31) | (vcpu->vcpu_id & 0xff);
+ mpidr = (vcpu->vcpu_id & 0x0f) << MPIDR_LEVEL_SHIFT(0);
+ mpidr |= ((vcpu->vcpu_id >> 4) & 0xff) << MPIDR_LEVEL_SHIFT(1);
+ mpidr |= ((vcpu->vcpu_id >> 12) & 0xff) << MPIDR_LEVEL_SHIFT(2);
+ vcpu_sys_reg(vcpu, MPIDR_EL1) = (1ULL << 31) | mpidr;
}
/* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */
@@ -373,6 +403,9 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b0000), Op2(0b000),
NULL, reset_val, VBAR_EL1, 0 },
+ /* ICC_SGI1R_EL1 */
+ { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b1011), Op2(0b101),
+ access_gic_sgi },
/* ICC_SRE_EL1 */
{ Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b1100), Op2(0b101),
trap_raz_wi },
@@ -605,6 +638,8 @@ static const struct sys_reg_desc cp14_64_regs[] = {
* register).
*/
static const struct sys_reg_desc cp15_regs[] = {
+ { Op1( 0), CRn( 0), CRm(12), Op2( 0), access_gic_sgi },
+
{ Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_vm_reg, NULL, c1_SCTLR },
{ Op1( 0), CRn( 2), CRm( 0), Op2( 0), access_vm_reg, NULL, c2_TTBR0 },
{ Op1( 0), CRn( 2), CRm( 0), Op2( 1), access_vm_reg, NULL, c2_TTBR1 },
@@ -652,6 +687,7 @@ static const struct sys_reg_desc cp15_regs[] = {
static const struct sys_reg_desc cp15_64_regs[] = {
{ Op1( 0), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR0 },
+ { Op1( 0), CRn( 0), CRm(12), Op2( 0), access_gic_sgi },
{ Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR1 },
};
diff --git a/arch/arm64/kvm/trace.h b/arch/arm64/kvm/trace.h
new file mode 100644
index 000000000000..157416e963f2
--- /dev/null
+++ b/arch/arm64/kvm/trace.h
@@ -0,0 +1,55 @@
+#if !defined(_TRACE_ARM64_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_ARM64_KVM_H
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+
+TRACE_EVENT(kvm_wfx_arm64,
+ TP_PROTO(unsigned long vcpu_pc, bool is_wfe),
+ TP_ARGS(vcpu_pc, is_wfe),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, vcpu_pc)
+ __field(bool, is_wfe)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_pc = vcpu_pc;
+ __entry->is_wfe = is_wfe;
+ ),
+
+ TP_printk("guest executed wf%c at: 0x%08lx",
+ __entry->is_wfe ? 'e' : 'i', __entry->vcpu_pc)
+);
+
+TRACE_EVENT(kvm_hvc_arm64,
+ TP_PROTO(unsigned long vcpu_pc, unsigned long r0, unsigned long imm),
+ TP_ARGS(vcpu_pc, r0, imm),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, vcpu_pc)
+ __field(unsigned long, r0)
+ __field(unsigned long, imm)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_pc = vcpu_pc;
+ __entry->r0 = r0;
+ __entry->imm = imm;
+ ),
+
+ TP_printk("HVC at 0x%08lx (r0: 0x%08lx, imm: 0x%lx)",
+ __entry->vcpu_pc, __entry->r0, __entry->imm)
+);
+
+#endif /* _TRACE_ARM64_KVM_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/arm64/kvm/vgic-v3-switch.S b/arch/arm64/kvm/vgic-v3-switch.S
index d16046999e06..617a012a0107 100644
--- a/arch/arm64/kvm/vgic-v3-switch.S
+++ b/arch/arm64/kvm/vgic-v3-switch.S
@@ -148,17 +148,18 @@
* x0: Register pointing to VCPU struct
*/
.macro restore_vgic_v3_state
- // Disable SRE_EL1 access. Necessary, otherwise
- // ICH_VMCR_EL2.VFIQEn becomes one, and FIQ happens...
- msr_s ICC_SRE_EL1, xzr
- isb
-
// Compute the address of struct vgic_cpu
add x3, x0, #VCPU_VGIC_CPU
// Restore all interesting registers
ldr w4, [x3, #VGIC_V3_CPU_HCR]
ldr w5, [x3, #VGIC_V3_CPU_VMCR]
+ ldr w25, [x3, #VGIC_V3_CPU_SRE]
+
+ msr_s ICC_SRE_EL1, x25
+
+ // make sure SRE is valid before writing the other registers
+ isb
msr_s ICH_HCR_EL2, x4
msr_s ICH_VMCR_EL2, x5
@@ -244,9 +245,12 @@
dsb sy
// Prevent the guest from touching the GIC system registers
+ // if SRE isn't enabled for GICv3 emulation
+ cbnz x25, 1f
mrs_s x5, ICC_SRE_EL2
and x5, x5, #~ICC_SRE_EL2_ENABLE
msr_s ICC_SRE_EL2, x5
+1:
.endm
ENTRY(__save_vgic_v3_state)
diff --git a/arch/ia64/include/uapi/asm/Kbuild b/arch/ia64/include/uapi/asm/Kbuild
index 1b3f5eb5fcdb..891002bbb995 100644
--- a/arch/ia64/include/uapi/asm/Kbuild
+++ b/arch/ia64/include/uapi/asm/Kbuild
@@ -18,7 +18,6 @@ header-y += intrinsics.h
header-y += ioctl.h
header-y += ioctls.h
header-y += ipcbuf.h
-header-y += kvm.h
header-y += kvm_para.h
header-y += mman.h
header-y += msgbuf.h
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index f2c249796ea8..ac4fc716062b 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -120,6 +120,7 @@ struct kvm_vcpu_stat {
u32 resvd_inst_exits;
u32 break_inst_exits;
u32 flush_dcache_exits;
+ u32 halt_successful_poll;
u32 halt_wakeup;
};
diff --git a/arch/mips/kvm/locore.S b/arch/mips/kvm/locore.S
index d7279c03c517..4a68b176d6e4 100644
--- a/arch/mips/kvm/locore.S
+++ b/arch/mips/kvm/locore.S
@@ -434,7 +434,7 @@ __kvm_mips_return_to_guest:
/* Setup status register for running guest in UM */
.set at
or v1, v1, (ST0_EXL | KSU_USER | ST0_IE)
- and v1, v1, ~ST0_CU0
+ and v1, v1, ~(ST0_CU0 | ST0_MX)
.set noat
mtc0 v1, CP0_STATUS
ehb
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index e3b21e51ff7e..c9eccf5df912 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -15,9 +15,11 @@
#include <linux/vmalloc.h>
#include <linux/fs.h>
#include <linux/bootmem.h>
+#include <asm/fpu.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
#include <linux/kvm_host.h>
@@ -47,6 +49,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "resvd_inst", VCPU_STAT(resvd_inst_exits), KVM_STAT_VCPU },
{ "break_inst", VCPU_STAT(break_inst_exits), KVM_STAT_VCPU },
{ "flush_dcache", VCPU_STAT(flush_dcache_exits), KVM_STAT_VCPU },
+ { "halt_successful_poll", VCPU_STAT(halt_successful_poll), KVM_STAT_VCPU },
{ "halt_wakeup", VCPU_STAT(halt_wakeup), KVM_STAT_VCPU },
{NULL}
};
@@ -378,6 +381,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
vcpu->mmio_needed = 0;
}
+ lose_fpu(1);
+
local_irq_disable();
/* Check if we have any exceptions/interrupts pending */
kvm_mips_deliver_interrupts(vcpu,
@@ -385,8 +390,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
kvm_guest_enter();
+ /* Disable hardware page table walking while in guest */
+ htw_stop();
+
r = __kvm_mips_vcpu_run(run, vcpu);
+ /* Re-enable HTW before enabling interrupts */
+ htw_start();
+
kvm_guest_exit();
local_irq_enable();
@@ -832,9 +843,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
return -ENOIOCTLCMD;
}
-int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
- return 0;
}
int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
@@ -980,9 +990,6 @@ static void kvm_mips_set_c0_status(void)
{
uint32_t status = read_c0_status();
- if (cpu_has_fpu)
- status |= (ST0_CU1);
-
if (cpu_has_dsp)
status |= (ST0_MX);
@@ -1002,6 +1009,9 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
+ /* re-enable HTW before enabling interrupts */
+ htw_start();
+
/* Set a default exit reason */
run->exit_reason = KVM_EXIT_UNKNOWN;
run->ready_for_interrupt_injection = 1;
@@ -1136,6 +1146,9 @@ skip_emul:
}
}
+ /* Disable HTW before returning to guest or host */
+ htw_stop();
+
return ret;
}
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 7efd666a3fa7..8ef05121d3cd 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -107,6 +107,7 @@ struct kvm_vcpu_stat {
u32 emulated_inst_exits;
u32 dec_exits;
u32 ext_intr_exits;
+ u32 halt_successful_poll;
u32 halt_wakeup;
u32 dbell_exits;
u32 gdbell_exits;
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 888bf466d8c6..cfbcdc654201 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -52,6 +52,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "dec", VCPU_STAT(dec_exits) },
{ "ext_intr", VCPU_STAT(ext_intr_exits) },
{ "queue_intr", VCPU_STAT(queue_intr) },
+ { "halt_successful_poll", VCPU_STAT(halt_successful_poll), },
{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
{ "pf_storage", VCPU_STAT(pf_storage) },
{ "sp_storage", VCPU_STAT(sp_storage) },
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 9b55dec2d6cc..6c1316a15a27 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -62,6 +62,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "inst_emu", VCPU_STAT(emulated_inst_exits) },
{ "dec", VCPU_STAT(dec_exits) },
{ "ext_intr", VCPU_STAT(ext_intr_exits) },
+ { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
{ "doorbell", VCPU_STAT(dbell_exits) },
{ "guest doorbell", VCPU_STAT(gdbell_exits) },
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index c45eaab752b0..27c0face86f4 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -623,9 +623,8 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
return vcpu;
}
-int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
- return 0;
}
void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 9cba74d5d853..d84559e31f32 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -35,11 +35,13 @@
#define KVM_NR_IRQCHIPS 1
#define KVM_IRQCHIP_NUM_PINS 4096
-#define SIGP_CTRL_C 0x00800000
+#define SIGP_CTRL_C 0x80
+#define SIGP_CTRL_SCN_MASK 0x3f
struct sca_entry {
- atomic_t ctrl;
- __u32 reserved;
+ __u8 reserved0;
+ __u8 sigp_ctrl;
+ __u16 reserved[3];
__u64 sda;
__u64 reserved2[2];
} __attribute__((packed));
@@ -87,7 +89,8 @@ struct kvm_s390_sie_block {
atomic_t cpuflags; /* 0x0000 */
__u32 : 1; /* 0x0004 */
__u32 prefix : 18;
- __u32 : 13;
+ __u32 : 1;
+ __u32 ibc : 12;
__u8 reserved08[4]; /* 0x0008 */
#define PROG_IN_SIE (1<<0)
__u32 prog0c; /* 0x000c */
@@ -132,7 +135,9 @@ struct kvm_s390_sie_block {
__u8 reserved60; /* 0x0060 */
__u8 ecb; /* 0x0061 */
__u8 ecb2; /* 0x0062 */
- __u8 reserved63[1]; /* 0x0063 */
+#define ECB3_AES 0x04
+#define ECB3_DEA 0x08
+ __u8 ecb3; /* 0x0063 */
__u32 scaol; /* 0x0064 */
__u8 reserved68[4]; /* 0x0068 */
__u32 todpr; /* 0x006c */
@@ -159,6 +164,7 @@ struct kvm_s390_sie_block {
__u64 tecmc; /* 0x00e8 */
__u8 reservedf0[12]; /* 0x00f0 */
#define CRYCB_FORMAT1 0x00000001
+#define CRYCB_FORMAT2 0x00000003
__u32 crycbd; /* 0x00fc */
__u64 gcr[16]; /* 0x0100 */
__u64 gbea; /* 0x0180 */
@@ -192,6 +198,7 @@ struct kvm_vcpu_stat {
u32 exit_stop_request;
u32 exit_validity;
u32 exit_instruction;
+ u32 halt_successful_poll;
u32 halt_wakeup;
u32 instruction_lctl;
u32 instruction_lctlg;
@@ -378,14 +385,11 @@ struct kvm_s390_interrupt_info {
struct kvm_s390_emerg_info emerg;
struct kvm_s390_extcall_info extcall;
struct kvm_s390_prefix_info prefix;
+ struct kvm_s390_stop_info stop;
struct kvm_s390_mchk_info mchk;
};
};
-/* for local_interrupt.action_flags */
-#define ACTION_STORE_ON_STOP (1<<0)
-#define ACTION_STOP_ON_STOP (1<<1)
-
struct kvm_s390_irq_payload {
struct kvm_s390_io_info io;
struct kvm_s390_ext_info ext;
@@ -393,6 +397,7 @@ struct kvm_s390_irq_payload {
struct kvm_s390_emerg_info emerg;
struct kvm_s390_extcall_info extcall;
struct kvm_s390_prefix_info prefix;
+ struct kvm_s390_stop_info stop;
struct kvm_s390_mchk_info mchk;
};
@@ -401,7 +406,6 @@ struct kvm_s390_local_interrupt {
struct kvm_s390_float_interrupt *float_int;
wait_queue_head_t *wq;
atomic_t *cpuflags;
- unsigned int action_bits;
DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS);
struct kvm_s390_irq_payload irq;
unsigned long pending_irqs;
@@ -470,7 +474,6 @@ struct kvm_vcpu_arch {
};
struct gmap *gmap;
struct kvm_guestdbg_info_arch guestdbg;
-#define KVM_S390_PFAULT_TOKEN_INVALID (-1UL)
unsigned long pfault_token;
unsigned long pfault_select;
unsigned long pfault_compare;
@@ -504,13 +507,39 @@ struct s390_io_adapter {
#define MAX_S390_IO_ADAPTERS ((MAX_ISC + 1) * 8)
#define MAX_S390_ADAPTER_MAPS 256
+/* maximum size of facilities and facility mask is 2k bytes */
+#define S390_ARCH_FAC_LIST_SIZE_BYTE (1<<11)
+#define S390_ARCH_FAC_LIST_SIZE_U64 \
+ (S390_ARCH_FAC_LIST_SIZE_BYTE / sizeof(u64))
+#define S390_ARCH_FAC_MASK_SIZE_BYTE S390_ARCH_FAC_LIST_SIZE_BYTE
+#define S390_ARCH_FAC_MASK_SIZE_U64 \
+ (S390_ARCH_FAC_MASK_SIZE_BYTE / sizeof(u64))
+
+struct s390_model_fac {
+ /* facilities used in SIE context */
+ __u64 sie[S390_ARCH_FAC_LIST_SIZE_U64];
+ /* subset enabled by kvm */
+ __u64 kvm[S390_ARCH_FAC_LIST_SIZE_U64];
+};
+
+struct kvm_s390_cpu_model {
+ struct s390_model_fac *fac;
+ struct cpuid cpu_id;
+ unsigned short ibc;
+};
+
struct kvm_s390_crypto {
struct kvm_s390_crypto_cb *crycb;
__u32 crycbd;
+ __u8 aes_kw;
+ __u8 dea_kw;
};
struct kvm_s390_crypto_cb {
- __u8 reserved00[128]; /* 0x0000 */
+ __u8 reserved00[72]; /* 0x0000 */
+ __u8 dea_wrapping_key_mask[24]; /* 0x0048 */
+ __u8 aes_wrapping_key_mask[32]; /* 0x0060 */
+ __u8 reserved80[128]; /* 0x0080 */
};
struct kvm_arch{
@@ -523,12 +552,15 @@ struct kvm_arch{
int use_irqchip;
int use_cmma;
int user_cpu_state_ctrl;
+ int user_sigp;
struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS];
wait_queue_head_t ipte_wq;
int ipte_lock_count;
struct mutex ipte_mutex;
spinlock_t start_stop_lock;
+ struct kvm_s390_cpu_model model;
struct kvm_s390_crypto crypto;
+ u64 epoch;
};
#define KVM_HVA_ERR_BAD (-1UL)
diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index edb453cfc2c6..f1096bab5199 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -31,7 +31,8 @@ struct sclp_cpu_entry {
u8 reserved0[2];
u8 : 3;
u8 siif : 1;
- u8 : 4;
+ u8 sigpif : 1;
+ u8 : 3;
u8 reserved2[10];
u8 type;
u8 reserved1;
@@ -69,6 +70,7 @@ int memcpy_hsa(void *dest, unsigned long src, size_t count, int mode);
unsigned long sclp_get_hsa_size(void);
void sclp_early_detect(void);
int sclp_has_siif(void);
+int sclp_has_sigpif(void);
unsigned int sclp_get_ibc(void);
long _sclp_print_early(const char *);
diff --git a/arch/s390/include/asm/sysinfo.h b/arch/s390/include/asm/sysinfo.h
index 73f12d21af4d..f7054a892d9e 100644
--- a/arch/s390/include/asm/sysinfo.h
+++ b/arch/s390/include/asm/sysinfo.h
@@ -15,6 +15,7 @@
#define __ASM_S390_SYSINFO_H
#include <asm/bitsperlong.h>
+#include <linux/uuid.h>
struct sysinfo_1_1_1 {
unsigned char p:1;
@@ -116,10 +117,13 @@ struct sysinfo_3_2_2 {
char name[8];
unsigned int caf;
char cpi[16];
- char reserved_1[24];
-
+ char reserved_1[3];
+ char ext_name_encoding;
+ unsigned int reserved_2;
+ uuid_be uuid;
} vm[8];
- char reserved_544[3552];
+ char reserved_3[1504];
+ char ext_names[8][256];
};
extern int topology_max_mnest;
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index 48eda3ab4944..9c77e60b9a26 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -57,10 +57,44 @@ struct kvm_s390_io_adapter_req {
/* kvm attr_group on vm fd */
#define KVM_S390_VM_MEM_CTRL 0
+#define KVM_S390_VM_TOD 1
+#define KVM_S390_VM_CRYPTO 2
+#define KVM_S390_VM_CPU_MODEL 3
/* kvm attributes for mem_ctrl */
#define KVM_S390_VM_MEM_ENABLE_CMMA 0
#define KVM_S390_VM_MEM_CLR_CMMA 1
+#define KVM_S390_VM_MEM_LIMIT_SIZE 2
+
+/* kvm attributes for KVM_S390_VM_TOD */
+#define KVM_S390_VM_TOD_LOW 0
+#define KVM_S390_VM_TOD_HIGH 1
+
+/* kvm attributes for KVM_S390_VM_CPU_MODEL */
+/* processor related attributes are r/w */
+#define KVM_S390_VM_CPU_PROCESSOR 0
+struct kvm_s390_vm_cpu_processor {
+ __u64 cpuid;
+ __u16 ibc;
+ __u8 pad[6];
+ __u64 fac_list[256];
+};
+
+/* machine related attributes are r/o */
+#define KVM_S390_VM_CPU_MACHINE 1
+struct kvm_s390_vm_cpu_machine {
+ __u64 cpuid;
+ __u32 ibc;
+ __u8 pad[4];
+ __u64 fac_mask[256];
+ __u64 fac_list[256];
+};
+
+/* kvm attributes for crypto */
+#define KVM_S390_VM_CRYPTO_ENABLE_AES_KW 0
+#define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW 1
+#define KVM_S390_VM_CRYPTO_DISABLE_AES_KW 2
+#define KVM_S390_VM_CRYPTO_DISABLE_DEA_KW 3
/* for KVM_GET_REGS and KVM_SET_REGS */
struct kvm_regs {
@@ -107,6 +141,9 @@ struct kvm_guest_debug_arch {
struct kvm_hw_breakpoint __user *hw_bp;
};
+/* for KVM_SYNC_PFAULT and KVM_REG_S390_PFTOKEN */
+#define KVM_S390_PFAULT_TOKEN_INVALID 0xffffffffffffffffULL
+
#define KVM_SYNC_PREFIX (1UL << 0)
#define KVM_SYNC_GPRS (1UL << 1)
#define KVM_SYNC_ACRS (1UL << 2)
diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c
index 85565f1ff474..99babea026ca 100644
--- a/arch/s390/kernel/sysinfo.c
+++ b/arch/s390/kernel/sysinfo.c
@@ -204,6 +204,33 @@ static void stsi_2_2_2(struct seq_file *m, struct sysinfo_2_2_2 *info)
}
}
+static void print_ext_name(struct seq_file *m, int lvl,
+ struct sysinfo_3_2_2 *info)
+{
+ if (info->vm[lvl].ext_name_encoding == 0)
+ return;
+ if (info->ext_names[lvl][0] == 0)
+ return;
+ switch (info->vm[lvl].ext_name_encoding) {
+ case 1: /* EBCDIC */
+ EBCASC(info->ext_names[lvl], sizeof(info->ext_names[lvl]));
+ break;
+ case 2: /* UTF-8 */
+ break;
+ default:
+ return;
+ }
+ seq_printf(m, "VM%02d Extended Name: %-.256s\n", lvl,
+ info->ext_names[lvl]);
+}
+
+static void print_uuid(struct seq_file *m, int i, struct sysinfo_3_2_2 *info)
+{
+ if (!memcmp(&info->vm[i].uuid, &NULL_UUID_BE, sizeof(uuid_be)))
+ return;
+ seq_printf(m, "VM%02d UUID: %pUb\n", i, &info->vm[i].uuid);
+}
+
static void stsi_3_2_2(struct seq_file *m, struct sysinfo_3_2_2 *info)
{
int i;
@@ -221,6 +248,8 @@ static void stsi_3_2_2(struct seq_file *m, struct sysinfo_3_2_2 *info)
seq_printf(m, "VM%02d CPUs Configured: %d\n", i, info->vm[i].cpus_configured);
seq_printf(m, "VM%02d CPUs Standby: %d\n", i, info->vm[i].cpus_standby);
seq_printf(m, "VM%02d CPUs Reserved: %d\n", i, info->vm[i].cpus_reserved);
+ print_ext_name(m, i, info);
+ print_uuid(m, i, info);
}
}
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 8a1be9017730..267523cac6de 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -357,8 +357,8 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
union asce asce;
ctlreg0.val = vcpu->arch.sie_block->gcr[0];
- edat1 = ctlreg0.edat && test_vfacility(8);
- edat2 = edat1 && test_vfacility(78);
+ edat1 = ctlreg0.edat && test_kvm_facility(vcpu->kvm, 8);
+ edat2 = edat1 && test_kvm_facility(vcpu->kvm, 78);
asce.val = get_vcpu_asce(vcpu);
if (asce.r)
goto real_address;
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 81c77ab8102e..bebd2157edd0 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -68,18 +68,27 @@ static int handle_noop(struct kvm_vcpu *vcpu)
static int handle_stop(struct kvm_vcpu *vcpu)
{
+ struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
int rc = 0;
- unsigned int action_bits;
+ uint8_t flags, stop_pending;
vcpu->stat.exit_stop_request++;
- trace_kvm_s390_stop_request(vcpu->arch.local_int.action_bits);
- action_bits = vcpu->arch.local_int.action_bits;
+ /* delay the stop if any non-stop irq is pending */
+ if (kvm_s390_vcpu_has_irq(vcpu, 1))
+ return 0;
+
+ /* avoid races with the injection/SIGP STOP code */
+ spin_lock(&li->lock);
+ flags = li->irq.stop.flags;
+ stop_pending = kvm_s390_is_stop_irq_pending(vcpu);
+ spin_unlock(&li->lock);
- if (!(action_bits & ACTION_STOP_ON_STOP))
+ trace_kvm_s390_stop_request(stop_pending, flags);
+ if (!stop_pending)
return 0;
- if (action_bits & ACTION_STORE_ON_STOP) {
+ if (flags & KVM_S390_STOP_FLAG_STORE_STATUS) {
rc = kvm_s390_vcpu_store_status(vcpu,
KVM_S390_STORE_STATUS_NOADDR);
if (rc)
@@ -279,11 +288,13 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu)
irq.type = KVM_S390_INT_CPU_TIMER;
break;
case EXT_IRQ_EXTERNAL_CALL:
- if (kvm_s390_si_ext_call_pending(vcpu))
- return 0;
irq.type = KVM_S390_INT_EXTERNAL_CALL;
irq.u.extcall.code = vcpu->arch.sie_block->extcpuaddr;
- break;
+ rc = kvm_s390_inject_vcpu(vcpu, &irq);
+ /* ignore if another external call is already pending */
+ if (rc == -EBUSY)
+ return 0;
+ return rc;
default:
return -EOPNOTSUPP;
}
@@ -307,17 +318,19 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu)
kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
/* Make sure that the source is paged-in */
- srcaddr = kvm_s390_real_to_abs(vcpu, vcpu->run->s.regs.gprs[reg2]);
- if (kvm_is_error_gpa(vcpu->kvm, srcaddr))
- return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+ rc = guest_translate_address(vcpu, vcpu->run->s.regs.gprs[reg2],
+ &srcaddr, 0);
+ if (rc)
+ return kvm_s390_inject_prog_cond(vcpu, rc);
rc = kvm_arch_fault_in_page(vcpu, srcaddr, 0);
if (rc != 0)
return rc;
/* Make sure that the destination is paged-in */
- dstaddr = kvm_s390_real_to_abs(vcpu, vcpu->run->s.regs.gprs[reg1]);
- if (kvm_is_error_gpa(vcpu->kvm, dstaddr))
- return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+ rc = guest_translate_address(vcpu, vcpu->run->s.regs.gprs[reg1],
+ &dstaddr, 1);
+ if (rc)
+ return kvm_s390_inject_prog_cond(vcpu, rc);
rc = kvm_arch_fault_in_page(vcpu, dstaddr, 1);
if (rc != 0)
return rc;
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index f00f31e66cd8..073b5f387d1d 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -19,6 +19,7 @@
#include <linux/bitmap.h>
#include <asm/asm-offsets.h>
#include <asm/uaccess.h>
+#include <asm/sclp.h>
#include "kvm-s390.h"
#include "gaccess.h"
#include "trace-s390.h"
@@ -159,6 +160,12 @@ static unsigned long deliverable_local_irqs(struct kvm_vcpu *vcpu)
if (psw_mchk_disabled(vcpu))
active_mask &= ~IRQ_PEND_MCHK_MASK;
+ /*
+ * STOP irqs will never be actively delivered. They are triggered via
+ * intercept requests and cleared when the stop intercept is performed.
+ */
+ __clear_bit(IRQ_PEND_SIGP_STOP, &active_mask);
+
return active_mask;
}
@@ -186,9 +193,6 @@ static void __reset_intercept_indicators(struct kvm_vcpu *vcpu)
LCTL_CR10 | LCTL_CR11);
vcpu->arch.sie_block->ictl |= (ICTL_STCTL | ICTL_PINT);
}
-
- if (vcpu->arch.local_int.action_bits & ACTION_STOP_ON_STOP)
- atomic_set_mask(CPUSTAT_STOP_INT, &vcpu->arch.sie_block->cpuflags);
}
static void __set_cpuflag(struct kvm_vcpu *vcpu, u32 flag)
@@ -216,11 +220,18 @@ static void set_intercept_indicators_mchk(struct kvm_vcpu *vcpu)
vcpu->arch.sie_block->lctl |= LCTL_CR14;
}
+static void set_intercept_indicators_stop(struct kvm_vcpu *vcpu)
+{
+ if (kvm_s390_is_stop_irq_pending(vcpu))
+ __set_cpuflag(vcpu, CPUSTAT_STOP_INT);
+}
+
/* Set interception request for non-deliverable local interrupts */
static void set_intercept_indicators_local(struct kvm_vcpu *vcpu)
{
set_intercept_indicators_ext(vcpu);
set_intercept_indicators_mchk(vcpu);
+ set_intercept_indicators_stop(vcpu);
}
static void __set_intercept_indicator(struct kvm_vcpu *vcpu,
@@ -392,18 +403,6 @@ static int __must_check __deliver_restart(struct kvm_vcpu *vcpu)
return rc ? -EFAULT : 0;
}
-static int __must_check __deliver_stop(struct kvm_vcpu *vcpu)
-{
- VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu stop");
- vcpu->stat.deliver_stop_signal++;
- trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_SIGP_STOP,
- 0, 0);
-
- __set_cpuflag(vcpu, CPUSTAT_STOP_INT);
- clear_bit(IRQ_PEND_SIGP_STOP, &vcpu->arch.local_int.pending_irqs);
- return 0;
-}
-
static int __must_check __deliver_set_prefix(struct kvm_vcpu *vcpu)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@ -705,7 +704,6 @@ static const deliver_irq_t deliver_irq_funcs[] = {
[IRQ_PEND_EXT_CLOCK_COMP] = __deliver_ckc,
[IRQ_PEND_EXT_CPU_TIMER] = __deliver_cpu_timer,
[IRQ_PEND_RESTART] = __deliver_restart,
- [IRQ_PEND_SIGP_STOP] = __deliver_stop,
[IRQ_PEND_SET_PREFIX] = __deliver_set_prefix,
[IRQ_PEND_PFAULT_INIT] = __deliver_pfault_init,
};
@@ -738,21 +736,20 @@ static int __must_check __deliver_floating_interrupt(struct kvm_vcpu *vcpu,
return rc;
}
-/* Check whether SIGP interpretation facility has an external call pending */
-int kvm_s390_si_ext_call_pending(struct kvm_vcpu *vcpu)
+/* Check whether an external call is pending (deliverable or not) */
+int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu)
{
- atomic_t *sigp_ctrl = &vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].ctrl;
+ struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
+ uint8_t sigp_ctrl = vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sigp_ctrl;
- if (!psw_extint_disabled(vcpu) &&
- (vcpu->arch.sie_block->gcr[0] & 0x2000ul) &&
- (atomic_read(sigp_ctrl) & SIGP_CTRL_C) &&
- (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_ECALL_PEND))
- return 1;
+ if (!sclp_has_sigpif())
+ return test_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs);
- return 0;
+ return (sigp_ctrl & SIGP_CTRL_C) &&
+ (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_ECALL_PEND);
}
-int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
+int kvm_s390_vcpu_has_irq(struct kvm_vcpu *vcpu, int exclude_stop)
{
struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int;
struct kvm_s390_interrupt_info *inti;
@@ -773,7 +770,13 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
if (!rc && kvm_cpu_has_pending_timer(vcpu))
rc = 1;
- if (!rc && kvm_s390_si_ext_call_pending(vcpu))
+ /* external call pending and deliverable */
+ if (!rc && kvm_s390_ext_call_pending(vcpu) &&
+ !psw_extint_disabled(vcpu) &&
+ (vcpu->arch.sie_block->gcr[0] & 0x2000ul))
+ rc = 1;
+
+ if (!rc && !exclude_stop && kvm_s390_is_stop_irq_pending(vcpu))
rc = 1;
return rc;
@@ -804,14 +807,20 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
return -EOPNOTSUPP; /* disabled wait */
}
- __set_cpu_idle(vcpu);
if (!ckc_interrupts_enabled(vcpu)) {
VCPU_EVENT(vcpu, 3, "%s", "enabled wait w/o timer");
+ __set_cpu_idle(vcpu);
goto no_timer;
}
now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch;
sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
+
+ /* underflow */
+ if (vcpu->arch.sie_block->ckc < now)
+ return 0;
+
+ __set_cpu_idle(vcpu);
hrtimer_start(&vcpu->arch.ckc_timer, ktime_set (0, sltime) , HRTIMER_MODE_REL);
VCPU_EVENT(vcpu, 5, "enabled wait via clock comparator: %llx ns", sltime);
no_timer:
@@ -820,7 +829,7 @@ no_timer:
__unset_cpu_idle(vcpu);
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
- hrtimer_try_to_cancel(&vcpu->arch.ckc_timer);
+ hrtimer_cancel(&vcpu->arch.ckc_timer);
return 0;
}
@@ -840,10 +849,20 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer)
{
struct kvm_vcpu *vcpu;
+ u64 now, sltime;
vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer);
- kvm_s390_vcpu_wakeup(vcpu);
+ now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch;
+ sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
+ /*
+ * If the monotonic clock runs faster than the tod clock we might be
+ * woken up too early and have to go back to sleep to avoid deadlocks.
+ */
+ if (vcpu->arch.sie_block->ckc > now &&
+ hrtimer_forward_now(timer, ns_to_ktime(sltime)))
+ return HRTIMER_RESTART;
+ kvm_s390_vcpu_wakeup(vcpu);
return HRTIMER_NORESTART;
}
@@ -859,8 +878,7 @@ void kvm_s390_clear_local_irqs(struct kvm_vcpu *vcpu)
/* clear pending external calls set by sigp interpretation facility */
atomic_clear_mask(CPUSTAT_ECALL_PEND, li->cpuflags);
- atomic_clear_mask(SIGP_CTRL_C,
- &vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].ctrl);
+ vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sigp_ctrl = 0;
}
int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
@@ -984,18 +1002,43 @@ static int __inject_pfault_init(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
return 0;
}
-int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
+static int __inject_extcall_sigpif(struct kvm_vcpu *vcpu, uint16_t src_id)
+{
+ unsigned char new_val, old_val;
+ uint8_t *sigp_ctrl = &vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sigp_ctrl;
+
+ new_val = SIGP_CTRL_C | (src_id & SIGP_CTRL_SCN_MASK);
+ old_val = *sigp_ctrl & ~SIGP_CTRL_C;
+ if (cmpxchg(sigp_ctrl, old_val, new_val) != old_val) {
+ /* another external call is pending */
+ return -EBUSY;
+ }
+ atomic_set_mask(CPUSTAT_ECALL_PEND, &vcpu->arch.sie_block->cpuflags);
+ return 0;
+}
+
+static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
struct kvm_s390_extcall_info *extcall = &li->irq.extcall;
+ uint16_t src_id = irq->u.extcall.code;
VCPU_EVENT(vcpu, 3, "inject: external call source-cpu:%u",
- irq->u.extcall.code);
+ src_id);
trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_EXTERNAL_CALL,
- irq->u.extcall.code, 0, 2);
+ src_id, 0, 2);
+
+ /* sending vcpu invalid */
+ if (src_id >= KVM_MAX_VCPUS ||
+ kvm_get_vcpu(vcpu->kvm, src_id) == NULL)
+ return -EINVAL;
+ if (sclp_has_sigpif())
+ return __inject_extcall_sigpif(vcpu, src_id);
+
+ if (!test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs))
+ return -EBUSY;
*extcall = irq->u.extcall;
- set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs);
atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
return 0;
}
@@ -1006,23 +1049,41 @@ static int __inject_set_prefix(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
struct kvm_s390_prefix_info *prefix = &li->irq.prefix;
VCPU_EVENT(vcpu, 3, "inject: set prefix to %x (from user)",
- prefix->address);
+ irq->u.prefix.address);
trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_SIGP_SET_PREFIX,
- prefix->address, 0, 2);
+ irq->u.prefix.address, 0, 2);
+
+ if (!is_vcpu_stopped(vcpu))
+ return -EBUSY;
*prefix = irq->u.prefix;
set_bit(IRQ_PEND_SET_PREFIX, &li->pending_irqs);
return 0;
}
+#define KVM_S390_STOP_SUPP_FLAGS (KVM_S390_STOP_FLAG_STORE_STATUS)
static int __inject_sigp_stop(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
+ struct kvm_s390_stop_info *stop = &li->irq.stop;
+ int rc = 0;
trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_SIGP_STOP, 0, 0, 2);
- li->action_bits |= ACTION_STOP_ON_STOP;
- set_bit(IRQ_PEND_SIGP_STOP, &li->pending_irqs);
+ if (irq->u.stop.flags & ~KVM_S390_STOP_SUPP_FLAGS)
+ return -EINVAL;
+
+ if (is_vcpu_stopped(vcpu)) {
+ if (irq->u.stop.flags & KVM_S390_STOP_FLAG_STORE_STATUS)
+ rc = kvm_s390_store_status_unloaded(vcpu,
+ KVM_S390_STORE_STATUS_NOADDR);
+ return rc;
+ }
+
+ if (test_and_set_bit(IRQ_PEND_SIGP_STOP, &li->pending_irqs))
+ return -EBUSY;
+ stop->flags = irq->u.stop.flags;
+ __set_cpuflag(vcpu, CPUSTAT_STOP_INT);
return 0;
}
@@ -1042,14 +1103,13 @@ static int __inject_sigp_emergency(struct kvm_vcpu *vcpu,
struct kvm_s390_irq *irq)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
- struct kvm_s390_emerg_info *emerg = &li->irq.emerg;
VCPU_EVENT(vcpu, 3, "inject: emergency %u\n",
irq->u.emerg.code);
trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_EMERGENCY,
- emerg->code, 0, 2);
+ irq->u.emerg.code, 0, 2);
- set_bit(emerg->code, li->sigp_emerg_pending);
+ set_bit(irq->u.emerg.code, li->sigp_emerg_pending);
set_bit(IRQ_PEND_EXT_EMERGENCY, &li->pending_irqs);
atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
return 0;
@@ -1061,9 +1121,9 @@ static int __inject_mchk(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
struct kvm_s390_mchk_info *mchk = &li->irq.mchk;
VCPU_EVENT(vcpu, 5, "inject: machine check parm64:%llx",
- mchk->mcic);
+ irq->u.mchk.mcic);
trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_MCHK, 0,
- mchk->mcic, 2);
+ irq->u.mchk.mcic, 2);
/*
* Because repressible machine checks can be indicated along with
@@ -1121,7 +1181,6 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
if ((!schid && !cr6) || (schid && cr6))
return NULL;
- mutex_lock(&kvm->lock);
fi = &kvm->arch.float_int;
spin_lock(&fi->lock);
inti = NULL;
@@ -1149,7 +1208,6 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
if (list_empty(&fi->list))
atomic_set(&fi->active, 0);
spin_unlock(&fi->lock);
- mutex_unlock(&kvm->lock);
return inti;
}
@@ -1162,7 +1220,6 @@ static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
int sigcpu;
int rc = 0;
- mutex_lock(&kvm->lock);
fi = &kvm->arch.float_int;
spin_lock(&fi->lock);
if (fi->irq_count >= KVM_S390_MAX_FLOAT_IRQS) {
@@ -1187,6 +1244,8 @@ static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
list_add_tail(&inti->list, &iter->list);
}
atomic_set(&fi->active, 1);
+ if (atomic_read(&kvm->online_vcpus) == 0)
+ goto unlock_fi;
sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
if (sigcpu == KVM_MAX_VCPUS) {
do {
@@ -1213,7 +1272,6 @@ static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
kvm_s390_vcpu_wakeup(kvm_get_vcpu(kvm, sigcpu));
unlock_fi:
spin_unlock(&fi->lock);
- mutex_unlock(&kvm->lock);
return rc;
}
@@ -1221,6 +1279,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
struct kvm_s390_interrupt *s390int)
{
struct kvm_s390_interrupt_info *inti;
+ int rc;
inti = kzalloc(sizeof(*inti), GFP_KERNEL);
if (!inti)
@@ -1239,7 +1298,6 @@ int kvm_s390_inject_vm(struct kvm *kvm,
inti->ext.ext_params = s390int->parm;
break;
case KVM_S390_INT_PFAULT_DONE:
- inti->type = s390int->type;
inti->ext.ext_params2 = s390int->parm64;
break;
case KVM_S390_MCHK:
@@ -1268,7 +1326,10 @@ int kvm_s390_inject_vm(struct kvm *kvm,
trace_kvm_s390_inject_vm(s390int->type, s390int->parm, s390int->parm64,
2);
- return __inject_vm(kvm, inti);
+ rc = __inject_vm(kvm, inti);
+ if (rc)
+ kfree(inti);
+ return rc;
}
void kvm_s390_reinject_io_int(struct kvm *kvm,
@@ -1290,13 +1351,16 @@ int s390int_to_s390irq(struct kvm_s390_interrupt *s390int,
case KVM_S390_SIGP_SET_PREFIX:
irq->u.prefix.address = s390int->parm;
break;
+ case KVM_S390_SIGP_STOP:
+ irq->u.stop.flags = s390int->parm;
+ break;
case KVM_S390_INT_EXTERNAL_CALL:
- if (irq->u.extcall.code & 0xffff0000)
+ if (s390int->parm & 0xffff0000)
return -EINVAL;
irq->u.extcall.code = s390int->parm;
break;
case KVM_S390_INT_EMERGENCY:
- if (irq->u.emerg.code & 0xffff0000)
+ if (s390int->parm & 0xffff0000)
return -EINVAL;
irq->u.emerg.code = s390int->parm;
break;
@@ -1307,6 +1371,23 @@ int s390int_to_s390irq(struct kvm_s390_interrupt *s390int,
return 0;
}
+int kvm_s390_is_stop_irq_pending(struct kvm_vcpu *vcpu)
+{
+ struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
+
+ return test_bit(IRQ_PEND_SIGP_STOP, &li->pending_irqs);
+}
+
+void kvm_s390_clear_stop_irq(struct kvm_vcpu *vcpu)
+{
+ struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
+
+ spin_lock(&li->lock);
+ li->irq.stop.flags = 0;
+ clear_bit(IRQ_PEND_SIGP_STOP, &li->pending_irqs);
+ spin_unlock(&li->lock);
+}
+
int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@ -1363,7 +1444,6 @@ void kvm_s390_clear_float_irqs(struct kvm *kvm)
struct kvm_s390_float_interrupt *fi;
struct kvm_s390_interrupt_info *n, *inti = NULL;
- mutex_lock(&kvm->lock);
fi = &kvm->arch.float_int;
spin_lock(&fi->lock);
list_for_each_entry_safe(inti, n, &fi->list, list) {
@@ -1373,7 +1453,6 @@ void kvm_s390_clear_float_irqs(struct kvm *kvm)
fi->irq_count = 0;
atomic_set(&fi->active, 0);
spin_unlock(&fi->lock);
- mutex_unlock(&kvm->lock);
}
static inline int copy_irq_to_user(struct kvm_s390_interrupt_info *inti,
@@ -1413,7 +1492,6 @@ static int get_all_floating_irqs(struct kvm *kvm, __u8 *buf, __u64 len)
int ret = 0;
int n = 0;
- mutex_lock(&kvm->lock);
fi = &kvm->arch.float_int;
spin_lock(&fi->lock);
@@ -1432,7 +1510,6 @@ static int get_all_floating_irqs(struct kvm *kvm, __u8 *buf, __u64 len)
}
spin_unlock(&fi->lock);
- mutex_unlock(&kvm->lock);
return ret < 0 ? ret : n;
}
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 3e09801e3104..0c3623927563 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -22,6 +22,7 @@
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/module.h>
+#include <linux/random.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <asm/asm-offsets.h>
@@ -29,7 +30,6 @@
#include <asm/pgtable.h>
#include <asm/nmi.h>
#include <asm/switch_to.h>
-#include <asm/facility.h>
#include <asm/sclp.h>
#include "kvm-s390.h"
#include "gaccess.h"
@@ -50,6 +50,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "exit_instruction", VCPU_STAT(exit_instruction) },
{ "exit_program_interruption", VCPU_STAT(exit_program_interruption) },
{ "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) },
+ { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
{ "instruction_lctlg", VCPU_STAT(instruction_lctlg) },
{ "instruction_lctl", VCPU_STAT(instruction_lctl) },
@@ -98,15 +99,20 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ NULL }
};
-unsigned long *vfacilities;
-static struct gmap_notifier gmap_notifier;
+/* upper facilities limit for kvm */
+unsigned long kvm_s390_fac_list_mask[] = {
+ 0xff82fffbf4fc2000UL,
+ 0x005c000000000000UL,
+};
-/* test availability of vfacility */
-int test_vfacility(unsigned long nr)
+unsigned long kvm_s390_fac_list_mask_size(void)
{
- return __test_facility(nr, (void *) vfacilities);
+ BUILD_BUG_ON(ARRAY_SIZE(kvm_s390_fac_list_mask) > S390_ARCH_FAC_MASK_SIZE_U64);
+ return ARRAY_SIZE(kvm_s390_fac_list_mask);
}
+static struct gmap_notifier gmap_notifier;
+
/* Section: not file related */
int kvm_arch_hardware_enable(void)
{
@@ -166,6 +172,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_IRQCHIP:
case KVM_CAP_VM_ATTRIBUTES:
case KVM_CAP_MP_STATE:
+ case KVM_CAP_S390_USER_SIGP:
r = 1;
break;
case KVM_CAP_NR_VCPUS:
@@ -254,6 +261,10 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
kvm->arch.use_irqchip = 1;
r = 0;
break;
+ case KVM_CAP_S390_USER_SIGP:
+ kvm->arch.user_sigp = 1;
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
@@ -261,7 +272,24 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
return r;
}
-static int kvm_s390_mem_control(struct kvm *kvm, struct kvm_device_attr *attr)
+static int kvm_s390_get_mem_control(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ int ret;
+
+ switch (attr->attr) {
+ case KVM_S390_VM_MEM_LIMIT_SIZE:
+ ret = 0;
+ if (put_user(kvm->arch.gmap->asce_end, (u64 __user *)attr->addr))
+ ret = -EFAULT;
+ break;
+ default:
+ ret = -ENXIO;
+ break;
+ }
+ return ret;
+}
+
+static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *attr)
{
int ret;
unsigned int idx;
@@ -283,6 +311,36 @@ static int kvm_s390_mem_control(struct kvm *kvm, struct kvm_device_attr *attr)
mutex_unlock(&kvm->lock);
ret = 0;
break;
+ case KVM_S390_VM_MEM_LIMIT_SIZE: {
+ unsigned long new_limit;
+
+ if (kvm_is_ucontrol(kvm))
+ return -EINVAL;
+
+ if (get_user(new_limit, (u64 __user *)attr->addr))
+ return -EFAULT;
+
+ if (new_limit > kvm->arch.gmap->asce_end)
+ return -E2BIG;
+
+ ret = -EBUSY;
+ mutex_lock(&kvm->lock);
+ if (atomic_read(&kvm->online_vcpus) == 0) {
+ /* gmap_alloc will round the limit up */
+ struct gmap *new = gmap_alloc(current->mm, new_limit);
+
+ if (!new) {
+ ret = -ENOMEM;
+ } else {
+ gmap_free(kvm->arch.gmap);
+ new->private = kvm;
+ kvm->arch.gmap = new;
+ ret = 0;
+ }
+ }
+ mutex_unlock(&kvm->lock);
+ break;
+ }
default:
ret = -ENXIO;
break;
@@ -290,13 +348,276 @@ static int kvm_s390_mem_control(struct kvm *kvm, struct kvm_device_attr *attr)
return ret;
}
+static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu);
+
+static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ if (!test_kvm_facility(kvm, 76))
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+ switch (attr->attr) {
+ case KVM_S390_VM_CRYPTO_ENABLE_AES_KW:
+ get_random_bytes(
+ kvm->arch.crypto.crycb->aes_wrapping_key_mask,
+ sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask));
+ kvm->arch.crypto.aes_kw = 1;
+ break;
+ case KVM_S390_VM_CRYPTO_ENABLE_DEA_KW:
+ get_random_bytes(
+ kvm->arch.crypto.crycb->dea_wrapping_key_mask,
+ sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask));
+ kvm->arch.crypto.dea_kw = 1;
+ break;
+ case KVM_S390_VM_CRYPTO_DISABLE_AES_KW:
+ kvm->arch.crypto.aes_kw = 0;
+ memset(kvm->arch.crypto.crycb->aes_wrapping_key_mask, 0,
+ sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask));
+ break;
+ case KVM_S390_VM_CRYPTO_DISABLE_DEA_KW:
+ kvm->arch.crypto.dea_kw = 0;
+ memset(kvm->arch.crypto.crycb->dea_wrapping_key_mask, 0,
+ sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask));
+ break;
+ default:
+ mutex_unlock(&kvm->lock);
+ return -ENXIO;
+ }
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ kvm_s390_vcpu_crypto_setup(vcpu);
+ exit_sie(vcpu);
+ }
+ mutex_unlock(&kvm->lock);
+ return 0;
+}
+
+static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ u8 gtod_high;
+
+ if (copy_from_user(&gtod_high, (void __user *)attr->addr,
+ sizeof(gtod_high)))
+ return -EFAULT;
+
+ if (gtod_high != 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ struct kvm_vcpu *cur_vcpu;
+ unsigned int vcpu_idx;
+ u64 host_tod, gtod;
+ int r;
+
+ if (copy_from_user(&gtod, (void __user *)attr->addr, sizeof(gtod)))
+ return -EFAULT;
+
+ r = store_tod_clock(&host_tod);
+ if (r)
+ return r;
+
+ mutex_lock(&kvm->lock);
+ kvm->arch.epoch = gtod - host_tod;
+ kvm_for_each_vcpu(vcpu_idx, cur_vcpu, kvm) {
+ cur_vcpu->arch.sie_block->epoch = kvm->arch.epoch;
+ exit_sie(cur_vcpu);
+ }
+ mutex_unlock(&kvm->lock);
+ return 0;
+}
+
+static int kvm_s390_set_tod(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ int ret;
+
+ if (attr->flags)
+ return -EINVAL;
+
+ switch (attr->attr) {
+ case KVM_S390_VM_TOD_HIGH:
+ ret = kvm_s390_set_tod_high(kvm, attr);
+ break;
+ case KVM_S390_VM_TOD_LOW:
+ ret = kvm_s390_set_tod_low(kvm, attr);
+ break;
+ default:
+ ret = -ENXIO;
+ break;
+ }
+ return ret;
+}
+
+static int kvm_s390_get_tod_high(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ u8 gtod_high = 0;
+
+ if (copy_to_user((void __user *)attr->addr, &gtod_high,
+ sizeof(gtod_high)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int kvm_s390_get_tod_low(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ u64 host_tod, gtod;
+ int r;
+
+ r = store_tod_clock(&host_tod);
+ if (r)
+ return r;
+
+ gtod = host_tod + kvm->arch.epoch;
+ if (copy_to_user((void __user *)attr->addr, &gtod, sizeof(gtod)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int kvm_s390_get_tod(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ int ret;
+
+ if (attr->flags)
+ return -EINVAL;
+
+ switch (attr->attr) {
+ case KVM_S390_VM_TOD_HIGH:
+ ret = kvm_s390_get_tod_high(kvm, attr);
+ break;
+ case KVM_S390_VM_TOD_LOW:
+ ret = kvm_s390_get_tod_low(kvm, attr);
+ break;
+ default:
+ ret = -ENXIO;
+ break;
+ }
+ return ret;
+}
+
+static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ struct kvm_s390_vm_cpu_processor *proc;
+ int ret = 0;
+
+ mutex_lock(&kvm->lock);
+ if (atomic_read(&kvm->online_vcpus)) {
+ ret = -EBUSY;
+ goto out;
+ }
+ proc = kzalloc(sizeof(*proc), GFP_KERNEL);
+ if (!proc) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ if (!copy_from_user(proc, (void __user *)attr->addr,
+ sizeof(*proc))) {
+ memcpy(&kvm->arch.model.cpu_id, &proc->cpuid,
+ sizeof(struct cpuid));
+ kvm->arch.model.ibc = proc->ibc;
+ memcpy(kvm->arch.model.fac->kvm, proc->fac_list,
+ S390_ARCH_FAC_LIST_SIZE_BYTE);
+ } else
+ ret = -EFAULT;
+ kfree(proc);
+out:
+ mutex_unlock(&kvm->lock);
+ return ret;
+}
+
+static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ int ret = -ENXIO;
+
+ switch (attr->attr) {
+ case KVM_S390_VM_CPU_PROCESSOR:
+ ret = kvm_s390_set_processor(kvm, attr);
+ break;
+ }
+ return ret;
+}
+
+static int kvm_s390_get_processor(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ struct kvm_s390_vm_cpu_processor *proc;
+ int ret = 0;
+
+ proc = kzalloc(sizeof(*proc), GFP_KERNEL);
+ if (!proc) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ memcpy(&proc->cpuid, &kvm->arch.model.cpu_id, sizeof(struct cpuid));
+ proc->ibc = kvm->arch.model.ibc;
+ memcpy(&proc->fac_list, kvm->arch.model.fac->kvm, S390_ARCH_FAC_LIST_SIZE_BYTE);
+ if (copy_to_user((void __user *)attr->addr, proc, sizeof(*proc)))
+ ret = -EFAULT;
+ kfree(proc);
+out:
+ return ret;
+}
+
+static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ struct kvm_s390_vm_cpu_machine *mach;
+ int ret = 0;
+
+ mach = kzalloc(sizeof(*mach), GFP_KERNEL);
+ if (!mach) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ get_cpu_id((struct cpuid *) &mach->cpuid);
+ mach->ibc = sclp_get_ibc();
+ memcpy(&mach->fac_mask, kvm_s390_fac_list_mask,
+ kvm_s390_fac_list_mask_size() * sizeof(u64));
+ memcpy((unsigned long *)&mach->fac_list, S390_lowcore.stfle_fac_list,
+ S390_ARCH_FAC_LIST_SIZE_U64);
+ if (copy_to_user((void __user *)attr->addr, mach, sizeof(*mach)))
+ ret = -EFAULT;
+ kfree(mach);
+out:
+ return ret;
+}
+
+static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ int ret = -ENXIO;
+
+ switch (attr->attr) {
+ case KVM_S390_VM_CPU_PROCESSOR:
+ ret = kvm_s390_get_processor(kvm, attr);
+ break;
+ case KVM_S390_VM_CPU_MACHINE:
+ ret = kvm_s390_get_machine(kvm, attr);
+ break;
+ }
+ return ret;
+}
+
static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
{
int ret;
switch (attr->group) {
case KVM_S390_VM_MEM_CTRL:
- ret = kvm_s390_mem_control(kvm, attr);
+ ret = kvm_s390_set_mem_control(kvm, attr);
+ break;
+ case KVM_S390_VM_TOD:
+ ret = kvm_s390_set_tod(kvm, attr);
+ break;
+ case KVM_S390_VM_CPU_MODEL:
+ ret = kvm_s390_set_cpu_model(kvm, attr);
+ break;
+ case KVM_S390_VM_CRYPTO:
+ ret = kvm_s390_vm_set_crypto(kvm, attr);
break;
default:
ret = -ENXIO;
@@ -308,7 +629,24 @@ static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr)
{
- return -ENXIO;
+ int ret;
+
+ switch (attr->group) {
+ case KVM_S390_VM_MEM_CTRL:
+ ret = kvm_s390_get_mem_control(kvm, attr);
+ break;
+ case KVM_S390_VM_TOD:
+ ret = kvm_s390_get_tod(kvm, attr);
+ break;
+ case KVM_S390_VM_CPU_MODEL:
+ ret = kvm_s390_get_cpu_model(kvm, attr);
+ break;
+ default:
+ ret = -ENXIO;
+ break;
+ }
+
+ return ret;
}
static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
@@ -320,6 +658,42 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
switch (attr->attr) {
case KVM_S390_VM_MEM_ENABLE_CMMA:
case KVM_S390_VM_MEM_CLR_CMMA:
+ case KVM_S390_VM_MEM_LIMIT_SIZE:
+ ret = 0;
+ break;
+ default:
+ ret = -ENXIO;
+ break;
+ }
+ break;
+ case KVM_S390_VM_TOD:
+ switch (attr->attr) {
+ case KVM_S390_VM_TOD_LOW:
+ case KVM_S390_VM_TOD_HIGH:
+ ret = 0;
+ break;
+ default:
+ ret = -ENXIO;
+ break;
+ }
+ break;
+ case KVM_S390_VM_CPU_MODEL:
+ switch (attr->attr) {
+ case KVM_S390_VM_CPU_PROCESSOR:
+ case KVM_S390_VM_CPU_MACHINE:
+ ret = 0;
+ break;
+ default:
+ ret = -ENXIO;
+ break;
+ }
+ break;
+ case KVM_S390_VM_CRYPTO:
+ switch (attr->attr) {
+ case KVM_S390_VM_CRYPTO_ENABLE_AES_KW:
+ case KVM_S390_VM_CRYPTO_ENABLE_DEA_KW:
+ case KVM_S390_VM_CRYPTO_DISABLE_AES_KW:
+ case KVM_S390_VM_CRYPTO_DISABLE_DEA_KW:
ret = 0;
break;
default:
@@ -401,9 +775,61 @@ long kvm_arch_vm_ioctl(struct file *filp,
return r;
}
+static int kvm_s390_query_ap_config(u8 *config)
+{
+ u32 fcn_code = 0x04000000UL;
+ u32 cc;
+
+ asm volatile(
+ "lgr 0,%1\n"
+ "lgr 2,%2\n"
+ ".long 0xb2af0000\n" /* PQAP(QCI) */
+ "ipm %0\n"
+ "srl %0,28\n"
+ : "=r" (cc)
+ : "r" (fcn_code), "r" (config)
+ : "cc", "0", "2", "memory"
+ );
+
+ return cc;
+}
+
+static int kvm_s390_apxa_installed(void)
+{
+ u8 config[128];
+ int cc;
+
+ if (test_facility(2) && test_facility(12)) {
+ cc = kvm_s390_query_ap_config(config);
+
+ if (cc)
+ pr_err("PQAP(QCI) failed with cc=%d", cc);
+ else
+ return config[0] & 0x40;
+ }
+
+ return 0;
+}
+
+static void kvm_s390_set_crycb_format(struct kvm *kvm)
+{
+ kvm->arch.crypto.crycbd = (__u32)(unsigned long) kvm->arch.crypto.crycb;
+
+ if (kvm_s390_apxa_installed())
+ kvm->arch.crypto.crycbd |= CRYCB_FORMAT2;
+ else
+ kvm->arch.crypto.crycbd |= CRYCB_FORMAT1;
+}
+
+static void kvm_s390_get_cpu_id(struct cpuid *cpu_id)
+{
+ get_cpu_id(cpu_id);
+ cpu_id->version = 0xff;
+}
+
static int kvm_s390_crypto_init(struct kvm *kvm)
{
- if (!test_vfacility(76))
+ if (!test_kvm_facility(kvm, 76))
return 0;
kvm->arch.crypto.crycb = kzalloc(sizeof(*kvm->arch.crypto.crycb),
@@ -411,15 +837,18 @@ static int kvm_s390_crypto_init(struct kvm *kvm)
if (!kvm->arch.crypto.crycb)
return -ENOMEM;
- kvm->arch.crypto.crycbd = (__u32) (unsigned long) kvm->arch.crypto.crycb |
- CRYCB_FORMAT1;
+ kvm_s390_set_crycb_format(kvm);
+
+ /* Disable AES/DEA protected key functions by default */
+ kvm->arch.crypto.aes_kw = 0;
+ kvm->arch.crypto.dea_kw = 0;
return 0;
}
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
- int rc;
+ int i, rc;
char debug_name[16];
static unsigned long sca_offset;
@@ -454,6 +883,46 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (!kvm->arch.dbf)
goto out_nodbf;
+ /*
+ * The architectural maximum amount of facilities is 16 kbit. To store
+ * this amount, 2 kbyte of memory is required. Thus we need a full
+ * page to hold the active copy (arch.model.fac->sie) and the current
+ * facilities set (arch.model.fac->kvm). Its address size has to be
+ * 31 bits and word aligned.
+ */
+ kvm->arch.model.fac =
+ (struct s390_model_fac *) get_zeroed_page(GFP_KERNEL | GFP_DMA);
+ if (!kvm->arch.model.fac)
+ goto out_nofac;
+
+ memcpy(kvm->arch.model.fac->kvm, S390_lowcore.stfle_fac_list,
+ S390_ARCH_FAC_LIST_SIZE_U64);
+
+ /*
+ * If this KVM host runs *not* in a LPAR, relax the facility bits
+ * of the kvm facility mask by all missing facilities. This will allow
+ * to determine the right CPU model by means of the remaining facilities.
+ * Live guest migration must prohibit the migration of KVMs running in
+ * a LPAR to non LPAR hosts.
+ */
+ if (!MACHINE_IS_LPAR)
+ for (i = 0; i < kvm_s390_fac_list_mask_size(); i++)
+ kvm_s390_fac_list_mask[i] &= kvm->arch.model.fac->kvm[i];
+
+ /*
+ * Apply the kvm facility mask to limit the kvm supported/tolerated
+ * facility list.
+ */
+ for (i = 0; i < S390_ARCH_FAC_LIST_SIZE_U64; i++) {
+ if (i < kvm_s390_fac_list_mask_size())
+ kvm->arch.model.fac->kvm[i] &= kvm_s390_fac_list_mask[i];
+ else
+ kvm->arch.model.fac->kvm[i] = 0UL;
+ }
+
+ kvm_s390_get_cpu_id(&kvm->arch.model.cpu_id);
+ kvm->arch.model.ibc = sclp_get_ibc() & 0x0fff;
+
if (kvm_s390_crypto_init(kvm) < 0)
goto out_crypto;
@@ -477,6 +946,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm->arch.css_support = 0;
kvm->arch.use_irqchip = 0;
+ kvm->arch.epoch = 0;
spin_lock_init(&kvm->arch.start_stop_lock);
@@ -484,6 +954,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
out_nogmap:
kfree(kvm->arch.crypto.crycb);
out_crypto:
+ free_page((unsigned long)kvm->arch.model.fac);
+out_nofac:
debug_unregister(kvm->arch.dbf);
out_nodbf:
free_page((unsigned long)(kvm->arch.sca));
@@ -536,6 +1008,7 @@ static void kvm_free_vcpus(struct kvm *kvm)
void kvm_arch_destroy_vm(struct kvm *kvm)
{
kvm_free_vcpus(kvm);
+ free_page((unsigned long)kvm->arch.model.fac);
free_page((unsigned long)(kvm->arch.sca));
debug_unregister(kvm->arch.dbf);
kfree(kvm->arch.crypto.crycb);
@@ -546,25 +1019,30 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
}
/* Section: vcpu related */
+static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.gmap = gmap_alloc(current->mm, -1UL);
+ if (!vcpu->arch.gmap)
+ return -ENOMEM;
+ vcpu->arch.gmap->private = vcpu->kvm;
+
+ return 0;
+}
+
int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
{
vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
kvm_clear_async_pf_completion_queue(vcpu);
- if (kvm_is_ucontrol(vcpu->kvm)) {
- vcpu->arch.gmap = gmap_alloc(current->mm, -1UL);
- if (!vcpu->arch.gmap)
- return -ENOMEM;
- vcpu->arch.gmap->private = vcpu->kvm;
- return 0;
- }
-
- vcpu->arch.gmap = vcpu->kvm->arch.gmap;
vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX |
KVM_SYNC_GPRS |
KVM_SYNC_ACRS |
KVM_SYNC_CRS |
KVM_SYNC_ARCH0 |
KVM_SYNC_PFAULT;
+
+ if (kvm_is_ucontrol(vcpu->kvm))
+ return __kvm_ucontrol_vcpu_init(vcpu);
+
return 0;
}
@@ -615,16 +1093,27 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu)
kvm_s390_clear_local_irqs(vcpu);
}
-int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
- return 0;
+ mutex_lock(&vcpu->kvm->lock);
+ vcpu->arch.sie_block->epoch = vcpu->kvm->arch.epoch;
+ mutex_unlock(&vcpu->kvm->lock);
+ if (!kvm_is_ucontrol(vcpu->kvm))
+ vcpu->arch.gmap = vcpu->kvm->arch.gmap;
}
static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu)
{
- if (!test_vfacility(76))
+ if (!test_kvm_facility(vcpu->kvm, 76))
return;
+ vcpu->arch.sie_block->ecb3 &= ~(ECB3_AES | ECB3_DEA);
+
+ if (vcpu->kvm->arch.crypto.aes_kw)
+ vcpu->arch.sie_block->ecb3 |= ECB3_AES;
+ if (vcpu->kvm->arch.crypto.dea_kw)
+ vcpu->arch.sie_block->ecb3 |= ECB3_DEA;
+
vcpu->arch.sie_block->crycbd = vcpu->kvm->arch.crypto.crycbd;
}
@@ -654,14 +1143,15 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
CPUSTAT_STOPPED |
CPUSTAT_GED);
vcpu->arch.sie_block->ecb = 6;
- if (test_vfacility(50) && test_vfacility(73))
+ if (test_kvm_facility(vcpu->kvm, 50) && test_kvm_facility(vcpu->kvm, 73))
vcpu->arch.sie_block->ecb |= 0x10;
vcpu->arch.sie_block->ecb2 = 8;
- vcpu->arch.sie_block->eca = 0xD1002000U;
+ vcpu->arch.sie_block->eca = 0xC1002000U;
if (sclp_has_siif())
vcpu->arch.sie_block->eca |= 1;
- vcpu->arch.sie_block->fac = (int) (long) vfacilities;
+ if (sclp_has_sigpif())
+ vcpu->arch.sie_block->eca |= 0x10000000U;
vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE |
ICTL_TPROT;
@@ -670,10 +1160,15 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
if (rc)
return rc;
}
- hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+ hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
vcpu->arch.ckc_timer.function = kvm_s390_idle_wakeup;
- get_cpu_id(&vcpu->arch.cpu_id);
- vcpu->arch.cpu_id.version = 0xff;
+
+ mutex_lock(&vcpu->kvm->lock);
+ vcpu->arch.cpu_id = vcpu->kvm->arch.model.cpu_id;
+ memcpy(vcpu->kvm->arch.model.fac->sie, vcpu->kvm->arch.model.fac->kvm,
+ S390_ARCH_FAC_LIST_SIZE_BYTE);
+ vcpu->arch.sie_block->ibc = vcpu->kvm->arch.model.ibc;
+ mutex_unlock(&vcpu->kvm->lock);
kvm_s390_vcpu_crypto_setup(vcpu);
@@ -717,6 +1212,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca;
set_bit(63 - id, (unsigned long *) &kvm->arch.sca->mcn);
}
+ vcpu->arch.sie_block->fac = (int) (long) kvm->arch.model.fac->sie;
spin_lock_init(&vcpu->arch.local_int.lock);
vcpu->arch.local_int.float_int = &kvm->arch.float_int;
@@ -741,7 +1237,7 @@ out:
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
- return kvm_cpu_has_interrupt(vcpu);
+ return kvm_s390_vcpu_has_irq(vcpu, 0);
}
void s390_vcpu_block(struct kvm_vcpu *vcpu)
@@ -869,6 +1365,8 @@ static int kvm_arch_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu,
case KVM_REG_S390_PFTOKEN:
r = get_user(vcpu->arch.pfault_token,
(u64 __user *)reg->addr);
+ if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID)
+ kvm_clear_async_pf_completion_queue(vcpu);
break;
case KVM_REG_S390_PFCOMPARE:
r = get_user(vcpu->arch.pfault_compare,
@@ -1176,7 +1674,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu)
return 0;
if (psw_extint_disabled(vcpu))
return 0;
- if (kvm_cpu_has_interrupt(vcpu))
+ if (kvm_s390_vcpu_has_irq(vcpu, 0))
return 0;
if (!(vcpu->arch.sie_block->gcr[0] & 0x200ul))
return 0;
@@ -1341,6 +1839,8 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu->arch.pfault_token = kvm_run->s.regs.pft;
vcpu->arch.pfault_select = kvm_run->s.regs.pfs;
vcpu->arch.pfault_compare = kvm_run->s.regs.pfc;
+ if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID)
+ kvm_clear_async_pf_completion_queue(vcpu);
}
kvm_run->kvm_dirty_regs = 0;
}
@@ -1559,15 +2059,10 @@ void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu)
spin_lock(&vcpu->kvm->arch.start_stop_lock);
online_vcpus = atomic_read(&vcpu->kvm->online_vcpus);
- /* Need to lock access to action_bits to avoid a SIGP race condition */
- spin_lock(&vcpu->arch.local_int.lock);
- atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
-
/* SIGP STOP and SIGP STOP AND STORE STATUS has been fully processed */
- vcpu->arch.local_int.action_bits &=
- ~(ACTION_STOP_ON_STOP | ACTION_STORE_ON_STOP);
- spin_unlock(&vcpu->arch.local_int.lock);
+ kvm_s390_clear_stop_irq(vcpu);
+ atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
__disable_ibs_on_vcpu(vcpu);
for (i = 0; i < online_vcpus; i++) {
@@ -1783,30 +2278,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
static int __init kvm_s390_init(void)
{
- int ret;
- ret = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
- if (ret)
- return ret;
-
- /*
- * guests can ask for up to 255+1 double words, we need a full page
- * to hold the maximum amount of facilities. On the other hand, we
- * only set facilities that are known to work in KVM.
- */
- vfacilities = (unsigned long *) get_zeroed_page(GFP_KERNEL|GFP_DMA);
- if (!vfacilities) {
- kvm_exit();
- return -ENOMEM;
- }
- memcpy(vfacilities, S390_lowcore.stfle_fac_list, 16);
- vfacilities[0] &= 0xff82fffbf47c2000UL;
- vfacilities[1] &= 0x005c000000000000UL;
- return 0;
+ return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
}
static void __exit kvm_s390_exit(void)
{
- free_page((unsigned long) vfacilities);
kvm_exit();
}
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index a8f3d9b71c11..985c2114d7ef 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -18,12 +18,10 @@
#include <linux/hrtimer.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
+#include <asm/facility.h>
typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu);
-/* declare vfacilities extern */
-extern unsigned long *vfacilities;
-
/* Transactional Memory Execution related macros */
#define IS_TE_ENABLED(vcpu) ((vcpu->arch.sie_block->ecb & 0x10))
#define TDB_FORMAT1 1
@@ -127,6 +125,12 @@ static inline void kvm_s390_set_psw_cc(struct kvm_vcpu *vcpu, unsigned long cc)
vcpu->arch.sie_block->gpsw.mask |= cc << 44;
}
+/* test availability of facility in a kvm intance */
+static inline int test_kvm_facility(struct kvm *kvm, unsigned long nr)
+{
+ return __test_facility(nr, kvm->arch.model.fac->kvm);
+}
+
/* are cpu states controlled by user space */
static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm)
{
@@ -183,7 +187,8 @@ int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu);
void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu);
/* is cmma enabled */
bool kvm_s390_cmma_enabled(struct kvm *kvm);
-int test_vfacility(unsigned long nr);
+unsigned long kvm_s390_fac_list_mask_size(void);
+extern unsigned long kvm_s390_fac_list_mask[];
/* implemented in diag.c */
int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
@@ -228,11 +233,13 @@ int s390int_to_s390irq(struct kvm_s390_interrupt *s390int,
struct kvm_s390_irq *s390irq);
/* implemented in interrupt.c */
-int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_s390_vcpu_has_irq(struct kvm_vcpu *vcpu, int exclude_stop);
int psw_extint_disabled(struct kvm_vcpu *vcpu);
void kvm_s390_destroy_adapters(struct kvm *kvm);
-int kvm_s390_si_ext_call_pending(struct kvm_vcpu *vcpu);
+int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu);
extern struct kvm_device_ops kvm_flic_ops;
+int kvm_s390_is_stop_irq_pending(struct kvm_vcpu *vcpu);
+void kvm_s390_clear_stop_irq(struct kvm_vcpu *vcpu);
/* implemented in guestdbg.c */
void kvm_s390_backup_guest_per_regs(struct kvm_vcpu *vcpu);
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 1be578d64dfc..bdd9b5b17e03 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -337,19 +337,24 @@ static int handle_io_inst(struct kvm_vcpu *vcpu)
static int handle_stfl(struct kvm_vcpu *vcpu)
{
int rc;
+ unsigned int fac;
vcpu->stat.instruction_stfl++;
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+ /*
+ * We need to shift the lower 32 facility bits (bit 0-31) from a u64
+ * into a u32 memory representation. They will remain bits 0-31.
+ */
+ fac = *vcpu->kvm->arch.model.fac->sie >> 32;
rc = write_guest_lc(vcpu, offsetof(struct _lowcore, stfl_fac_list),
- vfacilities, 4);
+ &fac, sizeof(fac));
if (rc)
return rc;
- VCPU_EVENT(vcpu, 5, "store facility list value %x",
- *(unsigned int *) vfacilities);
- trace_kvm_s390_handle_stfl(vcpu, *(unsigned int *) vfacilities);
+ VCPU_EVENT(vcpu, 5, "store facility list value %x", fac);
+ trace_kvm_s390_handle_stfl(vcpu, fac);
return 0;
}
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 6651f9f73973..23b1e86b2122 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -26,15 +26,17 @@ static int __sigp_sense(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu,
struct kvm_s390_local_interrupt *li;
int cpuflags;
int rc;
+ int ext_call_pending;
li = &dst_vcpu->arch.local_int;
cpuflags = atomic_read(li->cpuflags);
- if (!(cpuflags & (CPUSTAT_ECALL_PEND | CPUSTAT_STOPPED)))
+ ext_call_pending = kvm_s390_ext_call_pending(dst_vcpu);
+ if (!(cpuflags & CPUSTAT_STOPPED) && !ext_call_pending)
rc = SIGP_CC_ORDER_CODE_ACCEPTED;
else {
*reg &= 0xffffffff00000000UL;
- if (cpuflags & CPUSTAT_ECALL_PEND)
+ if (ext_call_pending)
*reg |= SIGP_STATUS_EXT_CALL_PENDING;
if (cpuflags & CPUSTAT_STOPPED)
*reg |= SIGP_STATUS_STOPPED;
@@ -96,7 +98,7 @@ static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu,
}
static int __sigp_external_call(struct kvm_vcpu *vcpu,
- struct kvm_vcpu *dst_vcpu)
+ struct kvm_vcpu *dst_vcpu, u64 *reg)
{
struct kvm_s390_irq irq = {
.type = KVM_S390_INT_EXTERNAL_CALL,
@@ -105,45 +107,31 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu,
int rc;
rc = kvm_s390_inject_vcpu(dst_vcpu, &irq);
- if (!rc)
+ if (rc == -EBUSY) {
+ *reg &= 0xffffffff00000000UL;
+ *reg |= SIGP_STATUS_EXT_CALL_PENDING;
+ return SIGP_CC_STATUS_STORED;
+ } else if (rc == 0) {
VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x",
dst_vcpu->vcpu_id);
-
- return rc ? rc : SIGP_CC_ORDER_CODE_ACCEPTED;
-}
-
-static int __inject_sigp_stop(struct kvm_vcpu *dst_vcpu, int action)
-{
- struct kvm_s390_local_interrupt *li = &dst_vcpu->arch.local_int;
- int rc = SIGP_CC_ORDER_CODE_ACCEPTED;
-
- spin_lock(&li->lock);
- if (li->action_bits & ACTION_STOP_ON_STOP) {
- /* another SIGP STOP is pending */
- rc = SIGP_CC_BUSY;
- goto out;
}
- if ((atomic_read(li->cpuflags) & CPUSTAT_STOPPED)) {
- if ((action & ACTION_STORE_ON_STOP) != 0)
- rc = -ESHUTDOWN;
- goto out;
- }
- set_bit(IRQ_PEND_SIGP_STOP, &li->pending_irqs);
- li->action_bits |= action;
- atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags);
- kvm_s390_vcpu_wakeup(dst_vcpu);
-out:
- spin_unlock(&li->lock);
- return rc;
+ return rc ? rc : SIGP_CC_ORDER_CODE_ACCEPTED;
}
static int __sigp_stop(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu)
{
+ struct kvm_s390_irq irq = {
+ .type = KVM_S390_SIGP_STOP,
+ };
int rc;
- rc = __inject_sigp_stop(dst_vcpu, ACTION_STOP_ON_STOP);
- VCPU_EVENT(vcpu, 4, "sent sigp stop to cpu %x", dst_vcpu->vcpu_id);
+ rc = kvm_s390_inject_vcpu(dst_vcpu, &irq);
+ if (rc == -EBUSY)
+ rc = SIGP_CC_BUSY;
+ else if (rc == 0)
+ VCPU_EVENT(vcpu, 4, "sent sigp stop to cpu %x",
+ dst_vcpu->vcpu_id);
return rc;
}
@@ -151,20 +139,18 @@ static int __sigp_stop(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu)
static int __sigp_stop_and_store_status(struct kvm_vcpu *vcpu,
struct kvm_vcpu *dst_vcpu, u64 *reg)
{
+ struct kvm_s390_irq irq = {
+ .type = KVM_S390_SIGP_STOP,
+ .u.stop.flags = KVM_S390_STOP_FLAG_STORE_STATUS,
+ };
int rc;
- rc = __inject_sigp_stop(dst_vcpu, ACTION_STOP_ON_STOP |
- ACTION_STORE_ON_STOP);
- VCPU_EVENT(vcpu, 4, "sent sigp stop and store status to cpu %x",
- dst_vcpu->vcpu_id);
-
- if (rc == -ESHUTDOWN) {
- /* If the CPU has already been stopped, we still have
- * to save the status when doing stop-and-store. This
- * has to be done after unlocking all spinlocks. */
- rc = kvm_s390_store_status_unloaded(dst_vcpu,
- KVM_S390_STORE_STATUS_NOADDR);
- }
+ rc = kvm_s390_inject_vcpu(dst_vcpu, &irq);
+ if (rc == -EBUSY)
+ rc = SIGP_CC_BUSY;
+ else if (rc == 0)
+ VCPU_EVENT(vcpu, 4, "sent sigp stop and store status to cpu %x",
+ dst_vcpu->vcpu_id);
return rc;
}
@@ -197,41 +183,33 @@ static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter)
static int __sigp_set_prefix(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu,
u32 address, u64 *reg)
{
- struct kvm_s390_local_interrupt *li;
+ struct kvm_s390_irq irq = {
+ .type = KVM_S390_SIGP_SET_PREFIX,
+ .u.prefix.address = address & 0x7fffe000u,
+ };
int rc;
- li = &dst_vcpu->arch.local_int;
-
/*
* Make sure the new value is valid memory. We only need to check the
* first page, since address is 8k aligned and memory pieces are always
* at least 1MB aligned and have at least a size of 1MB.
*/
- address &= 0x7fffe000u;
- if (kvm_is_error_gpa(vcpu->kvm, address)) {
+ if (kvm_is_error_gpa(vcpu->kvm, irq.u.prefix.address)) {
*reg &= 0xffffffff00000000UL;
*reg |= SIGP_STATUS_INVALID_PARAMETER;
return SIGP_CC_STATUS_STORED;
}
- spin_lock(&li->lock);
- /* cpu must be in stopped state */
- if (!(atomic_read(li->cpuflags) & CPUSTAT_STOPPED)) {
+ rc = kvm_s390_inject_vcpu(dst_vcpu, &irq);
+ if (rc == -EBUSY) {
*reg &= 0xffffffff00000000UL;
*reg |= SIGP_STATUS_INCORRECT_STATE;
- rc = SIGP_CC_STATUS_STORED;
- goto out_li;
+ return SIGP_CC_STATUS_STORED;
+ } else if (rc == 0) {
+ VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x",
+ dst_vcpu->vcpu_id, irq.u.prefix.address);
}
- li->irq.prefix.address = address;
- set_bit(IRQ_PEND_SET_PREFIX, &li->pending_irqs);
- kvm_s390_vcpu_wakeup(dst_vcpu);
- rc = SIGP_CC_ORDER_CODE_ACCEPTED;
-
- VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x", dst_vcpu->vcpu_id,
- address);
-out_li:
- spin_unlock(&li->lock);
return rc;
}
@@ -242,9 +220,7 @@ static int __sigp_store_status_at_addr(struct kvm_vcpu *vcpu,
int flags;
int rc;
- spin_lock(&dst_vcpu->arch.local_int.lock);
flags = atomic_read(dst_vcpu->arch.local_int.cpuflags);
- spin_unlock(&dst_vcpu->arch.local_int.lock);
if (!(flags & CPUSTAT_STOPPED)) {
*reg &= 0xffffffff00000000UL;
*reg |= SIGP_STATUS_INCORRECT_STATE;
@@ -291,8 +267,9 @@ static int __prepare_sigp_re_start(struct kvm_vcpu *vcpu,
/* handle (RE)START in user space */
int rc = -EOPNOTSUPP;
+ /* make sure we don't race with STOP irq injection */
spin_lock(&li->lock);
- if (li->action_bits & ACTION_STOP_ON_STOP)
+ if (kvm_s390_is_stop_irq_pending(dst_vcpu))
rc = SIGP_CC_BUSY;
spin_unlock(&li->lock);
@@ -333,7 +310,7 @@ static int handle_sigp_dst(struct kvm_vcpu *vcpu, u8 order_code,
break;
case SIGP_EXTERNAL_CALL:
vcpu->stat.instruction_sigp_external_call++;
- rc = __sigp_external_call(vcpu, dst_vcpu);
+ rc = __sigp_external_call(vcpu, dst_vcpu, status_reg);
break;
case SIGP_EMERGENCY_SIGNAL:
vcpu->stat.instruction_sigp_emergency++;
@@ -394,6 +371,53 @@ static int handle_sigp_dst(struct kvm_vcpu *vcpu, u8 order_code,
return rc;
}
+static int handle_sigp_order_in_user_space(struct kvm_vcpu *vcpu, u8 order_code)
+{
+ if (!vcpu->kvm->arch.user_sigp)
+ return 0;
+
+ switch (order_code) {
+ case SIGP_SENSE:
+ case SIGP_EXTERNAL_CALL:
+ case SIGP_EMERGENCY_SIGNAL:
+ case SIGP_COND_EMERGENCY_SIGNAL:
+ case SIGP_SENSE_RUNNING:
+ return 0;
+ /* update counters as we're directly dropping to user space */
+ case SIGP_STOP:
+ vcpu->stat.instruction_sigp_stop++;
+ break;
+ case SIGP_STOP_AND_STORE_STATUS:
+ vcpu->stat.instruction_sigp_stop_store_status++;
+ break;
+ case SIGP_STORE_STATUS_AT_ADDRESS:
+ vcpu->stat.instruction_sigp_store_status++;
+ break;
+ case SIGP_SET_PREFIX:
+ vcpu->stat.instruction_sigp_prefix++;
+ break;
+ case SIGP_START:
+ vcpu->stat.instruction_sigp_start++;
+ break;
+ case SIGP_RESTART:
+ vcpu->stat.instruction_sigp_restart++;
+ break;
+ case SIGP_INITIAL_CPU_RESET:
+ vcpu->stat.instruction_sigp_init_cpu_reset++;
+ break;
+ case SIGP_CPU_RESET:
+ vcpu->stat.instruction_sigp_cpu_reset++;
+ break;
+ default:
+ vcpu->stat.instruction_sigp_unknown++;
+ }
+
+ VCPU_EVENT(vcpu, 4, "sigp order %u: completely handled in user space",
+ order_code);
+
+ return 1;
+}
+
int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
{
int r1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
@@ -408,6 +432,8 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
order_code = kvm_s390_get_base_disp_rs(vcpu);
+ if (handle_sigp_order_in_user_space(vcpu, order_code))
+ return -EOPNOTSUPP;
if (r1 % 2)
parameter = vcpu->run->s.regs.gprs[r1];
diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h
index 647e9d6a4818..653a7ec09ef5 100644
--- a/arch/s390/kvm/trace-s390.h
+++ b/arch/s390/kvm/trace-s390.h
@@ -209,19 +209,21 @@ TRACE_EVENT(kvm_s390_request_resets,
* Trace point for a vcpu's stop requests.
*/
TRACE_EVENT(kvm_s390_stop_request,
- TP_PROTO(unsigned int action_bits),
- TP_ARGS(action_bits),
+ TP_PROTO(unsigned char stop_irq, unsigned char flags),
+ TP_ARGS(stop_irq, flags),
TP_STRUCT__entry(
- __field(unsigned int, action_bits)
+ __field(unsigned char, stop_irq)
+ __field(unsigned char, flags)
),
TP_fast_assign(
- __entry->action_bits = action_bits;
+ __entry->stop_irq = stop_irq;
+ __entry->flags = flags;
),
- TP_printk("stop request, action_bits = %08x",
- __entry->action_bits)
+ TP_printk("stop request, stop irq = %u, flags = %08x",
+ __entry->stop_irq, __entry->flags)
);
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index eb181178fe0b..57a9d94fe160 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -208,6 +208,7 @@ struct x86_emulate_ops {
void (*get_cpuid)(struct x86_emulate_ctxt *ctxt,
u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
+ void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
};
typedef u32 __attribute__((vector_size(16))) sse128_t;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d89c6b828c96..a236e39cc385 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -38,8 +38,6 @@
#define KVM_PRIVATE_MEM_SLOTS 3
#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
-#define KVM_MMIO_SIZE 16
-
#define KVM_PIO_PAGE_OFFSET 1
#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
@@ -51,7 +49,7 @@
| X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
#define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL
-#define CR3_PCID_INVD (1UL << 63)
+#define CR3_PCID_INVD BIT_64(63)
#define CR4_RESERVED_BITS \
(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
| X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
@@ -160,6 +158,18 @@ enum {
#define DR7_FIXED_1 0x00000400
#define DR7_VOLATILE 0xffff2bff
+#define PFERR_PRESENT_BIT 0
+#define PFERR_WRITE_BIT 1
+#define PFERR_USER_BIT 2
+#define PFERR_RSVD_BIT 3
+#define PFERR_FETCH_BIT 4
+
+#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
+#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
+#define PFERR_USER_MASK (1U << PFERR_USER_BIT)
+#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
+#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
+
/* apic attention bits */
#define KVM_APIC_CHECK_VAPIC 0
/*
@@ -615,6 +625,8 @@ struct kvm_arch {
#ifdef CONFIG_KVM_MMU_AUDIT
int audit_point;
#endif
+
+ bool boot_vcpu_runs_old_kvmclock;
};
struct kvm_vm_stat {
@@ -643,6 +655,7 @@ struct kvm_vcpu_stat {
u32 irq_window_exits;
u32 nmi_window_exits;
u32 halt_exits;
+ u32 halt_successful_poll;
u32 halt_wakeup;
u32 request_irq_exits;
u32 irq_exits;
@@ -787,6 +800,31 @@ struct kvm_x86_ops {
int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
+
+ /*
+ * Arch-specific dirty logging hooks. These hooks are only supposed to
+ * be valid if the specific arch has hardware-accelerated dirty logging
+ * mechanism. Currently only for PML on VMX.
+ *
+ * - slot_enable_log_dirty:
+ * called when enabling log dirty mode for the slot.
+ * - slot_disable_log_dirty:
+ * called when disabling log dirty mode for the slot.
+ * also called when slot is created with log dirty disabled.
+ * - flush_log_dirty:
+ * called before reporting dirty_bitmap to userspace.
+ * - enable_log_dirty_pt_masked:
+ * called when reenabling log dirty for the GFNs in the mask after
+ * corresponding bits are cleared in slot->dirty_bitmap.
+ */
+ void (*slot_enable_log_dirty)(struct kvm *kvm,
+ struct kvm_memory_slot *slot);
+ void (*slot_disable_log_dirty)(struct kvm *kvm,
+ struct kvm_memory_slot *slot);
+ void (*flush_log_dirty)(struct kvm *kvm);
+ void (*enable_log_dirty_pt_masked)(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t offset, unsigned long mask);
};
struct kvm_arch_async_pf {
@@ -819,10 +857,17 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask);
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
-void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- gfn_t gfn_offset, unsigned long mask);
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+ struct kvm_memory_slot *memslot);
+void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
+ struct kvm_memory_slot *memslot);
+void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
+ struct kvm_memory_slot *memslot);
+void kvm_mmu_slot_set_dirty(struct kvm *kvm,
+ struct kvm_memory_slot *memslot);
+void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask);
void kvm_mmu_zap_all(struct kvm *kvm);
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm);
unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 45afaee9555c..da772edd19ab 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -69,6 +69,7 @@
#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000
+#define SECONDARY_EXEC_ENABLE_PML 0x00020000
#define SECONDARY_EXEC_XSAVES 0x00100000
@@ -121,6 +122,7 @@ enum vmcs_field {
GUEST_LDTR_SELECTOR = 0x0000080c,
GUEST_TR_SELECTOR = 0x0000080e,
GUEST_INTR_STATUS = 0x00000810,
+ GUEST_PML_INDEX = 0x00000812,
HOST_ES_SELECTOR = 0x00000c00,
HOST_CS_SELECTOR = 0x00000c02,
HOST_SS_SELECTOR = 0x00000c04,
@@ -140,6 +142,8 @@ enum vmcs_field {
VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009,
VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a,
VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b,
+ PML_ADDRESS = 0x0000200e,
+ PML_ADDRESS_HIGH = 0x0000200f,
TSC_OFFSET = 0x00002010,
TSC_OFFSET_HIGH = 0x00002011,
VIRTUAL_APIC_PAGE_ADDR = 0x00002012,
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index 536240fa9a95..3ce079136c11 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -364,6 +364,9 @@
#define MSR_IA32_UCODE_WRITE 0x00000079
#define MSR_IA32_UCODE_REV 0x0000008b
+#define MSR_IA32_SMM_MONITOR_CTL 0x0000009b
+#define MSR_IA32_SMBASE 0x0000009e
+
#define MSR_IA32_PERF_STATUS 0x00000198
#define MSR_IA32_PERF_CTL 0x00000199
#define INTEL_PERF_CTL_MASK 0xffff
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index b813bf9da1e2..c5f1a1deb91a 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -56,6 +56,7 @@
#define EXIT_REASON_MSR_READ 31
#define EXIT_REASON_MSR_WRITE 32
#define EXIT_REASON_INVALID_STATE 33
+#define EXIT_REASON_MSR_LOAD_FAIL 34
#define EXIT_REASON_MWAIT_INSTRUCTION 36
#define EXIT_REASON_MONITOR_INSTRUCTION 39
#define EXIT_REASON_PAUSE_INSTRUCTION 40
@@ -72,6 +73,7 @@
#define EXIT_REASON_XSETBV 55
#define EXIT_REASON_APIC_WRITE 56
#define EXIT_REASON_INVPCID 58
+#define EXIT_REASON_PML_FULL 62
#define EXIT_REASON_XSAVES 63
#define EXIT_REASON_XRSTORS 64
@@ -116,10 +118,14 @@
{ EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \
{ EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \
{ EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
+ { EXIT_REASON_MSR_LOAD_FAIL, "MSR_LOAD_FAIL" }, \
{ EXIT_REASON_INVD, "INVD" }, \
{ EXIT_REASON_INVVPID, "INVVPID" }, \
{ EXIT_REASON_INVPCID, "INVPCID" }, \
{ EXIT_REASON_XSAVES, "XSAVES" }, \
{ EXIT_REASON_XRSTORS, "XRSTORS" }
+#define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1
+#define VMX_ABORT_LOAD_HOST_MSR_FAIL 4
+
#endif /* _UAPIVMX_H */
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 7dc7ba577ecd..413a7bf9efbb 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -39,6 +39,7 @@ config KVM
select PERF_EVENTS
select HAVE_KVM_MSI
select HAVE_KVM_CPU_RELAX_INTERCEPT
+ select KVM_GENERIC_DIRTYLOG_READ_PROTECT
select KVM_VFIO
select SRCU
---help---
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index de12c1d379f1..e0b794a84c35 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -86,6 +86,7 @@
#define DstAcc (OpAcc << DstShift)
#define DstDI (OpDI << DstShift)
#define DstMem64 (OpMem64 << DstShift)
+#define DstMem16 (OpMem16 << DstShift)
#define DstImmUByte (OpImmUByte << DstShift)
#define DstDX (OpDX << DstShift)
#define DstAccLo (OpAccLo << DstShift)
@@ -124,6 +125,7 @@
#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */
#define Escape (5<<15) /* Escape to coprocessor instruction */
#define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */
+#define ModeDual (7<<15) /* Different instruction for 32/64 bit */
#define Sse (1<<18) /* SSE Vector instruction */
/* Generic ModRM decode. */
#define ModRM (1<<19)
@@ -165,10 +167,10 @@
#define NoMod ((u64)1 << 47) /* Mod field is ignored */
#define Intercept ((u64)1 << 48) /* Has valid intercept field */
#define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */
-#define NoBigReal ((u64)1 << 50) /* No big real mode */
#define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */
#define NearBranch ((u64)1 << 52) /* Near branches */
#define No16 ((u64)1 << 53) /* No 16 bit operand */
+#define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */
#define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
@@ -213,6 +215,7 @@ struct opcode {
const struct gprefix *gprefix;
const struct escape *esc;
const struct instr_dual *idual;
+ const struct mode_dual *mdual;
void (*fastop)(struct fastop *fake);
} u;
int (*check_perm)(struct x86_emulate_ctxt *ctxt);
@@ -240,6 +243,11 @@ struct instr_dual {
struct opcode mod3;
};
+struct mode_dual {
+ struct opcode mode32;
+ struct opcode mode64;
+};
+
/* EFLAGS bit definitions. */
#define EFLG_ID (1<<21)
#define EFLG_VIP (1<<20)
@@ -262,6 +270,13 @@ struct instr_dual {
#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
#define EFLG_RESERVED_ONE_MASK 2
+enum x86_transfer_type {
+ X86_TRANSFER_NONE,
+ X86_TRANSFER_CALL_JMP,
+ X86_TRANSFER_RET,
+ X86_TRANSFER_TASK_SWITCH,
+};
+
static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
{
if (!(ctxt->regs_valid & (1 << nr))) {
@@ -669,9 +684,13 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
}
if (addr.ea > lim)
goto bad;
- *max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea);
- if (size > *max_size)
- goto bad;
+ if (lim == 0xffffffff)
+ *max_size = ~0u;
+ else {
+ *max_size = (u64)lim + 1 - addr.ea;
+ if (size > *max_size)
+ goto bad;
+ }
la &= (u32)-1;
break;
}
@@ -722,19 +741,26 @@ static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst,
const struct desc_struct *cs_desc)
{
enum x86emul_mode mode = ctxt->mode;
+ int rc;
#ifdef CONFIG_X86_64
- if (ctxt->mode >= X86EMUL_MODE_PROT32 && cs_desc->l) {
- u64 efer = 0;
+ if (ctxt->mode >= X86EMUL_MODE_PROT16) {
+ if (cs_desc->l) {
+ u64 efer = 0;
- ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
- if (efer & EFER_LMA)
- mode = X86EMUL_MODE_PROT64;
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+ if (efer & EFER_LMA)
+ mode = X86EMUL_MODE_PROT64;
+ } else
+ mode = X86EMUL_MODE_PROT32; /* temporary value */
}
#endif
if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32)
mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
- return assign_eip(ctxt, dst, mode);
+ rc = assign_eip(ctxt, dst, mode);
+ if (rc == X86EMUL_CONTINUE)
+ ctxt->mode = mode;
+ return rc;
}
static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
@@ -1057,8 +1083,6 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt)
asm volatile("fnstcw %0": "+m"(fcw));
ctxt->ops->put_fpu(ctxt);
- /* force 2 byte destination */
- ctxt->dst.bytes = 2;
ctxt->dst.val = fcw;
return X86EMUL_CONTINUE;
@@ -1075,8 +1099,6 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
asm volatile("fnstsw %0": "+m"(fsw));
ctxt->ops->put_fpu(ctxt);
- /* force 2 byte destination */
- ctxt->dst.bytes = 2;
ctxt->dst.val = fsw;
return X86EMUL_CONTINUE;
@@ -1223,6 +1245,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
else {
modrm_ea += reg_read(ctxt, base_reg);
adjust_modrm_seg(ctxt, base_reg);
+ /* Increment ESP on POP [ESP] */
+ if ((ctxt->d & IncSP) &&
+ base_reg == VCPU_REGS_RSP)
+ modrm_ea += ctxt->op_bytes;
}
if (index_reg != 4)
modrm_ea += reg_read(ctxt, index_reg) << scale;
@@ -1435,10 +1461,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
ops->get_gdt(ctxt, dt);
}
-/* allowed just for 8 bytes segments */
-static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
- u16 selector, struct desc_struct *desc,
- ulong *desc_addr_p)
+static int get_descriptor_ptr(struct x86_emulate_ctxt *ctxt,
+ u16 selector, ulong *desc_addr_p)
{
struct desc_ptr dt;
u16 index = selector >> 3;
@@ -1449,8 +1473,34 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
if (dt.size < index * 8 + 7)
return emulate_gp(ctxt, selector & 0xfffc);
- *desc_addr_p = addr = dt.address + index * 8;
- return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
+ addr = dt.address + index * 8;
+
+#ifdef CONFIG_X86_64
+ if (addr >> 32 != 0) {
+ u64 efer = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+ if (!(efer & EFER_LMA))
+ addr &= (u32)-1;
+ }
+#endif
+
+ *desc_addr_p = addr;
+ return X86EMUL_CONTINUE;
+}
+
+/* allowed just for 8 bytes segments */
+static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 selector, struct desc_struct *desc,
+ ulong *desc_addr_p)
+{
+ int rc;
+
+ rc = get_descriptor_ptr(ctxt, selector, desc_addr_p);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return ctxt->ops->read_std(ctxt, *desc_addr_p, desc, sizeof(*desc),
&ctxt->exception);
}
@@ -1458,16 +1508,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
u16 selector, struct desc_struct *desc)
{
- struct desc_ptr dt;
- u16 index = selector >> 3;
+ int rc;
ulong addr;
- get_descriptor_table_ptr(ctxt, selector, &dt);
-
- if (dt.size < index * 8 + 7)
- return emulate_gp(ctxt, selector & 0xfffc);
+ rc = get_descriptor_ptr(ctxt, selector, &addr);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
- addr = dt.address + index * 8;
return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc,
&ctxt->exception);
}
@@ -1475,7 +1522,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
/* Does not support long mode */
static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
u16 selector, int seg, u8 cpl,
- bool in_task_switch,
+ enum x86_transfer_type transfer,
struct desc_struct *desc)
{
struct desc_struct seg_desc, old_desc;
@@ -1529,11 +1576,15 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
return ret;
err_code = selector & 0xfffc;
- err_vec = in_task_switch ? TS_VECTOR : GP_VECTOR;
+ err_vec = (transfer == X86_TRANSFER_TASK_SWITCH) ? TS_VECTOR :
+ GP_VECTOR;
/* can't load system descriptor into segment selector */
- if (seg <= VCPU_SREG_GS && !seg_desc.s)
+ if (seg <= VCPU_SREG_GS && !seg_desc.s) {
+ if (transfer == X86_TRANSFER_CALL_JMP)
+ return X86EMUL_UNHANDLEABLE;
goto exception;
+ }
if (!seg_desc.p) {
err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
@@ -1605,10 +1656,13 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
if (seg_desc.s) {
/* mark segment as accessed */
- seg_desc.type |= 1;
- ret = write_segment_descriptor(ctxt, selector, &seg_desc);
- if (ret != X86EMUL_CONTINUE)
- return ret;
+ if (!(seg_desc.type & 1)) {
+ seg_desc.type |= 1;
+ ret = write_segment_descriptor(ctxt, selector,
+ &seg_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ }
} else if (ctxt->mode == X86EMUL_MODE_PROT64) {
ret = ctxt->ops->read_std(ctxt, desc_addr+8, &base3,
sizeof(base3), &ctxt->exception);
@@ -1631,7 +1685,8 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
u16 selector, int seg)
{
u8 cpl = ctxt->ops->cpl(ctxt);
- return __load_segment_descriptor(ctxt, selector, seg, cpl, false, NULL);
+ return __load_segment_descriptor(ctxt, selector, seg, cpl,
+ X86_TRANSFER_NONE, NULL);
}
static void write_register_operand(struct operand *op)
@@ -1828,12 +1883,14 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
unsigned long selector;
int rc;
- rc = emulate_pop(ctxt, &selector, ctxt->op_bytes);
+ rc = emulate_pop(ctxt, &selector, 2);
if (rc != X86EMUL_CONTINUE)
return rc;
if (ctxt->modrm_reg == VCPU_SREG_SS)
ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
+ if (ctxt->op_bytes > 2)
+ rsp_increment(ctxt, ctxt->op_bytes - 2);
rc = load_segment_descriptor(ctxt, (u16)selector, seg);
return rc;
@@ -2007,6 +2064,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
+ ctxt->ops->set_nmi_mask(ctxt, false);
return rc;
}
@@ -2041,7 +2099,8 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
- rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false,
+ rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_CALL_JMP,
&new_desc);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -2130,7 +2189,8 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
/* Outer-privilege level return is not implemented */
if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl)
return X86EMUL_UNHANDLEABLE;
- rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl, false,
+ rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_RET,
&new_desc);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -2163,12 +2223,15 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
fastop(ctxt, em_cmp);
if (ctxt->eflags & EFLG_ZF) {
- /* Success: write back to memory. */
+ /* Success: write back to memory; no update of EAX */
+ ctxt->src.type = OP_NONE;
ctxt->dst.val = ctxt->src.orig_val;
} else {
/* Failure: write the value we saw to EAX. */
- ctxt->dst.type = OP_REG;
- ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+ ctxt->src.type = OP_REG;
+ ctxt->src.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+ ctxt->src.val = ctxt->dst.orig_val;
+ /* Create write-cycle to dest by writing the same value */
ctxt->dst.val = ctxt->dst.orig_val;
}
return X86EMUL_CONTINUE;
@@ -2556,23 +2619,23 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
* it is handled in a context of new task
*/
ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
@@ -2694,31 +2757,31 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
* it is handled in a context of new task
*/
ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR,
- cpl, true, NULL);
+ cpl, X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
@@ -2739,7 +2802,6 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
return ret;
save_state_to_tss32(ctxt, &tss_seg);
@@ -2748,13 +2810,11 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
ret = ops->write_std(ctxt, old_tss_base + eip_offset, &tss_seg.eip,
ldt_sel_offset - eip_offset, &ctxt->exception);
if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
return ret;
ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
return ret;
if (old_tss_sel != 0xffff) {
@@ -2765,7 +2825,6 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
sizeof tss_seg.prev_task_link,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
return ret;
}
@@ -2999,15 +3058,16 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
struct desc_struct old_desc, new_desc;
const struct x86_emulate_ops *ops = ctxt->ops;
int cpl = ctxt->ops->cpl(ctxt);
+ enum x86emul_mode prev_mode = ctxt->mode;
old_eip = ctxt->_eip;
ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS);
memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
- rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false,
- &new_desc);
+ rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_CALL_JMP, &new_desc);
if (rc != X86EMUL_CONTINUE)
- return X86EMUL_CONTINUE;
+ return rc;
rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc);
if (rc != X86EMUL_CONTINUE)
@@ -3022,11 +3082,14 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
rc = em_push(ctxt);
/* If we failed, we tainted the memory, but the very least we should
restore cs */
- if (rc != X86EMUL_CONTINUE)
+ if (rc != X86EMUL_CONTINUE) {
+ pr_warn_once("faulting far call emulation tainted memory\n");
goto fail;
+ }
return rc;
fail:
ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS);
+ ctxt->mode = prev_mode;
return rc;
}
@@ -3477,6 +3540,12 @@ static int em_clflush(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
+static int em_movsxd(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.val = (s32) ctxt->src.val;
+ return X86EMUL_CONTINUE;
+}
+
static bool valid_cr(int nr)
{
switch (nr) {
@@ -3676,6 +3745,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
#define ID(_f, _i) { .flags = ((_f) | InstrDual | ModRM), .u.idual = (_i) }
+#define MD(_f, _m) { .flags = ((_f) | ModeDual), .u.mdual = (_m) }
#define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) }
#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
#define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) }
@@ -3738,7 +3808,7 @@ static const struct opcode group1[] = {
};
static const struct opcode group1A[] = {
- I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N,
+ I(DstMem | SrcNone | Mov | Stack | IncSP, em_pop), N, N, N, N, N, N, N,
};
static const struct opcode group2[] = {
@@ -3854,7 +3924,7 @@ static const struct gprefix pfx_0f_e7 = {
};
static const struct escape escape_d9 = { {
- N, N, N, N, N, N, N, I(DstMem, em_fnstcw),
+ N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstcw),
}, {
/* 0xC0 - 0xC7 */
N, N, N, N, N, N, N, N,
@@ -3896,7 +3966,7 @@ static const struct escape escape_db = { {
} };
static const struct escape escape_dd = { {
- N, N, N, N, N, N, N, I(DstMem, em_fnstsw),
+ N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstsw),
}, {
/* 0xC0 - 0xC7 */
N, N, N, N, N, N, N, N,
@@ -3920,6 +3990,10 @@ static const struct instr_dual instr_dual_0f_c3 = {
I(DstMem | SrcReg | ModRM | No16 | Mov, em_mov), N
};
+static const struct mode_dual mode_dual_63 = {
+ N, I(DstReg | SrcMem32 | ModRM | Mov, em_movsxd)
+};
+
static const struct opcode opcode_table[256] = {
/* 0x00 - 0x07 */
F6ALU(Lock, em_add),
@@ -3954,7 +4028,7 @@ static const struct opcode opcode_table[256] = {
/* 0x60 - 0x67 */
I(ImplicitOps | Stack | No64, em_pusha),
I(ImplicitOps | Stack | No64, em_popa),
- N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ ,
+ N, MD(ModRM, &mode_dual_63),
N, N, N, N,
/* 0x68 - 0x6F */
I(SrcImm | Mov | Stack, em_push),
@@ -4010,8 +4084,8 @@ static const struct opcode opcode_table[256] = {
G(ByteOp, group11), G(0, group11),
/* 0xC8 - 0xCF */
I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave),
- I(ImplicitOps | Stack | SrcImmU16, em_ret_far_imm),
- I(ImplicitOps | Stack, em_ret_far),
+ I(ImplicitOps | SrcImmU16, em_ret_far_imm),
+ I(ImplicitOps, em_ret_far),
D(ImplicitOps), DI(SrcImmByte, intn),
D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
/* 0xD0 - 0xD7 */
@@ -4108,7 +4182,7 @@ static const struct opcode twobyte_table[256] = {
F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
GD(0, &group15), F(DstReg | SrcMem | ModRM, em_imul),
/* 0xB0 - 0xB7 */
- I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg),
+ I2bv(DstMem | SrcReg | ModRM | Lock | PageTable | SrcWrite, em_cmpxchg),
I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
F(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
@@ -4174,6 +4248,8 @@ static const struct opcode opcode_map_0f_38[256] = {
#undef I
#undef GP
#undef EXT
+#undef MD
+#undef ID
#undef D2bv
#undef D2bvIP
@@ -4563,6 +4639,12 @@ done_prefixes:
else
opcode = opcode.u.idual->mod012;
break;
+ case ModeDual:
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ opcode = opcode.u.mdual->mode64;
+ else
+ opcode = opcode.u.mdual->mode32;
+ break;
default:
return EMULATION_FAILED;
}
@@ -4860,8 +4942,13 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
/* optimisation - avoid slow emulated read if Mov */
rc = segmented_read(ctxt, ctxt->dst.addr.mem,
&ctxt->dst.val, ctxt->dst.bytes);
- if (rc != X86EMUL_CONTINUE)
+ if (rc != X86EMUL_CONTINUE) {
+ if (!(ctxt->d & NoWrite) &&
+ rc == X86EMUL_PROPAGATE_FAULT &&
+ ctxt->exception.vector == PF_VECTOR)
+ ctxt->exception.error_code |= PFERR_WRITE_MASK;
goto done;
+ }
}
ctxt->dst.orig_val = ctxt->dst.val;
@@ -4899,11 +4986,6 @@ special_insn:
goto threebyte_insn;
switch (ctxt->b) {
- case 0x63: /* movsxd */
- if (ctxt->mode != X86EMUL_MODE_PROT64)
- goto cannot_emulate;
- ctxt->dst.val = (s32) ctxt->src.val;
- break;
case 0x70 ... 0x7f: /* jcc (short) */
if (test_cc(ctxt->b, ctxt->eflags))
rc = jmp_rel(ctxt, ctxt->src.val);
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index 3c9195535ffc..c2e36d934af4 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -98,7 +98,7 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
}
void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
-int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int short_hand, unsigned int dest, int dest_mode);
int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c
index 17b73eeac8a4..7dbced309ddb 100644
--- a/arch/x86/kvm/iommu.c
+++ b/arch/x86/kvm/iommu.c
@@ -138,7 +138,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
gfn += page_size >> PAGE_SHIFT;
-
+ cond_resched();
}
return 0;
@@ -306,6 +306,8 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
kvm_unpin_pages(kvm, pfn, unmap_pages);
gfn += unmap_pages;
+
+ cond_resched();
}
}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index d52dcf0776ea..e55b5fc344eb 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -33,6 +33,7 @@
#include <asm/page.h>
#include <asm/current.h>
#include <asm/apicdef.h>
+#include <asm/delay.h>
#include <linux/atomic.h>
#include <linux/jump_label.h>
#include "kvm_cache_regs.h"
@@ -327,17 +328,24 @@ static u8 count_vectors(void *bitmap)
return count;
}
-void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
+void __kvm_apic_update_irr(u32 *pir, void *regs)
{
u32 i, pir_val;
- struct kvm_lapic *apic = vcpu->arch.apic;
for (i = 0; i <= 7; i++) {
pir_val = xchg(&pir[i], 0);
if (pir_val)
- *((u32 *)(apic->regs + APIC_IRR + i * 0x10)) |= pir_val;
+ *((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val;
}
}
+EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
+
+void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ __kvm_apic_update_irr(pir, apic->regs);
+}
EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
static inline void apic_set_irr(int vec, struct kvm_lapic *apic)
@@ -405,7 +413,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
* because the processor can modify ISR under the hood. Instead
* just set SVI.
*/
- if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+ if (unlikely(kvm_x86_ops->hwapic_isr_update))
kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec);
else {
++apic->isr_count;
@@ -453,7 +461,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
* on the other hand isr_count and highest_isr_cache are unused
* and must be left alone.
*/
- if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+ if (unlikely(kvm_x86_ops->hwapic_isr_update))
kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
apic_find_highest_isr(apic));
else {
@@ -580,55 +588,48 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
apic_update_ppr(apic);
}
-static int kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest)
+static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest)
{
return dest == (apic_x2apic_mode(apic) ?
X2APIC_BROADCAST : APIC_BROADCAST);
}
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest)
+static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest)
{
return kvm_apic_id(apic) == dest || kvm_apic_broadcast(apic, dest);
}
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
+static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
{
- int result = 0;
u32 logical_id;
if (kvm_apic_broadcast(apic, mda))
- return 1;
+ return true;
- if (apic_x2apic_mode(apic)) {
- logical_id = kvm_apic_get_reg(apic, APIC_LDR);
- return logical_id & mda;
- }
+ logical_id = kvm_apic_get_reg(apic, APIC_LDR);
- logical_id = GET_APIC_LOGICAL_ID(kvm_apic_get_reg(apic, APIC_LDR));
+ if (apic_x2apic_mode(apic))
+ return ((logical_id >> 16) == (mda >> 16))
+ && (logical_id & mda & 0xffff) != 0;
+
+ logical_id = GET_APIC_LOGICAL_ID(logical_id);
switch (kvm_apic_get_reg(apic, APIC_DFR)) {
case APIC_DFR_FLAT:
- if (logical_id & mda)
- result = 1;
- break;
+ return (logical_id & mda) != 0;
case APIC_DFR_CLUSTER:
- if (((logical_id >> 4) == (mda >> 0x4))
- && (logical_id & mda & 0xf))
- result = 1;
- break;
+ return ((logical_id >> 4) == (mda >> 4))
+ && (logical_id & mda & 0xf) != 0;
default:
apic_debug("Bad DFR vcpu %d: %08x\n",
apic->vcpu->vcpu_id, kvm_apic_get_reg(apic, APIC_DFR));
- break;
+ return false;
}
-
- return result;
}
-int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int short_hand, unsigned int dest, int dest_mode)
{
- int result = 0;
struct kvm_lapic *target = vcpu->arch.apic;
apic_debug("target %p, source %p, dest 0x%x, "
@@ -638,29 +639,21 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
ASSERT(target);
switch (short_hand) {
case APIC_DEST_NOSHORT:
- if (dest_mode == 0)
- /* Physical mode. */
- result = kvm_apic_match_physical_addr(target, dest);
+ if (dest_mode == APIC_DEST_PHYSICAL)
+ return kvm_apic_match_physical_addr(target, dest);
else
- /* Logical mode. */
- result = kvm_apic_match_logical_addr(target, dest);
- break;
+ return kvm_apic_match_logical_addr(target, dest);
case APIC_DEST_SELF:
- result = (target == source);
- break;
+ return target == source;
case APIC_DEST_ALLINC:
- result = 1;
- break;
+ return true;
case APIC_DEST_ALLBUT:
- result = (target != source);
- break;
+ return target != source;
default:
apic_debug("kvm: apic: Bad dest shorthand value %x\n",
short_hand);
- break;
+ return false;
}
-
- return result;
}
bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
@@ -693,7 +686,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
ret = true;
- if (irq->dest_mode == 0) { /* physical mode */
+ if (irq->dest_mode == APIC_DEST_PHYSICAL) {
if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
goto out;
@@ -1076,25 +1069,72 @@ static void apic_timer_expired(struct kvm_lapic *apic)
{
struct kvm_vcpu *vcpu = apic->vcpu;
wait_queue_head_t *q = &vcpu->wq;
+ struct kvm_timer *ktimer = &apic->lapic_timer;
- /*
- * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
- * vcpu_enter_guest.
- */
if (atomic_read(&apic->lapic_timer.pending))
return;
atomic_inc(&apic->lapic_timer.pending);
- /* FIXME: this code should not know anything about vcpus */
- kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+ kvm_set_pending_timer(vcpu);
if (waitqueue_active(q))
wake_up_interruptible(q);
+
+ if (apic_lvtt_tscdeadline(apic))
+ ktimer->expired_tscdeadline = ktimer->tscdeadline;
+}
+
+/*
+ * On APICv, this test will cause a busy wait
+ * during a higher-priority task.
+ */
+
+static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg = kvm_apic_get_reg(apic, APIC_LVTT);
+
+ if (kvm_apic_hw_enabled(apic)) {
+ int vec = reg & APIC_VECTOR_MASK;
+ void *bitmap = apic->regs + APIC_ISR;
+
+ if (kvm_x86_ops->deliver_posted_interrupt)
+ bitmap = apic->regs + APIC_IRR;
+
+ if (apic_test_vector(vec, bitmap))
+ return true;
+ }
+ return false;
+}
+
+void wait_lapic_expire(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u64 guest_tsc, tsc_deadline;
+
+ if (!kvm_vcpu_has_lapic(vcpu))
+ return;
+
+ if (apic->lapic_timer.expired_tscdeadline == 0)
+ return;
+
+ if (!lapic_timer_int_injected(vcpu))
+ return;
+
+ tsc_deadline = apic->lapic_timer.expired_tscdeadline;
+ apic->lapic_timer.expired_tscdeadline = 0;
+ guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+ trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
+
+ /* __delay is delay_tsc whenever the hardware has TSC, thus always. */
+ if (guest_tsc < tsc_deadline)
+ __delay(tsc_deadline - guest_tsc);
}
static void start_apic_timer(struct kvm_lapic *apic)
{
ktime_t now;
+
atomic_set(&apic->lapic_timer.pending, 0);
if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
@@ -1140,6 +1180,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
/* lapic timer in tsc deadline mode */
u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
u64 ns = 0;
+ ktime_t expire;
struct kvm_vcpu *vcpu = apic->vcpu;
unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
unsigned long flags;
@@ -1154,8 +1195,10 @@ static void start_apic_timer(struct kvm_lapic *apic)
if (likely(tscdeadline > guest_tsc)) {
ns = (tscdeadline - guest_tsc) * 1000000ULL;
do_div(ns, this_tsc_khz);
+ expire = ktime_add_ns(now, ns);
+ expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
hrtimer_start(&apic->lapic_timer.timer,
- ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
+ expire, HRTIMER_MODE_ABS);
} else
apic_timer_expired(apic);
@@ -1745,7 +1788,9 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
if (kvm_x86_ops->hwapic_irr_update)
kvm_x86_ops->hwapic_irr_update(vcpu,
apic_find_highest_irr(apic));
- kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic));
+ if (unlikely(kvm_x86_ops->hwapic_isr_update))
+ kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
+ apic_find_highest_isr(apic));
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_rtc_eoi_tracking_restore_one(vcpu);
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index c674fce53cf9..0bc6c656625b 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -14,6 +14,7 @@ struct kvm_timer {
u32 timer_mode;
u32 timer_mode_mask;
u64 tscdeadline;
+ u64 expired_tscdeadline;
atomic_t pending; /* accumulated triggered timers */
};
@@ -56,9 +57,8 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
void kvm_apic_set_version(struct kvm_vcpu *vcpu);
void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr);
+void __kvm_apic_update_irr(u32 *pir, void *regs);
void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest);
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda);
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
unsigned long *dest_map);
int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
@@ -170,4 +170,6 @@ static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
+void wait_lapic_expire(struct kvm_vcpu *vcpu);
+
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f83fc6c5e0ba..cee759299a35 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -63,30 +63,16 @@ enum {
#undef MMU_DEBUG
#ifdef MMU_DEBUG
+static bool dbg = 0;
+module_param(dbg, bool, 0644);
#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
-
+#define MMU_WARN_ON(x) WARN_ON(x)
#else
-
#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)
-
-#endif
-
-#ifdef MMU_DEBUG
-static bool dbg = 0;
-module_param(dbg, bool, 0644);
-#endif
-
-#ifndef MMU_DEBUG
-#define ASSERT(x) do { } while (0)
-#else
-#define ASSERT(x) \
- if (!(x)) { \
- printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
- __FILE__, __LINE__, #x); \
- }
+#define MMU_WARN_ON(x) do { } while (0)
#endif
#define PTE_PREFETCH_NUM 8
@@ -546,6 +532,11 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
return (old_spte & bit_mask) && !(new_spte & bit_mask);
}
+static bool spte_is_bit_changed(u64 old_spte, u64 new_spte, u64 bit_mask)
+{
+ return (old_spte & bit_mask) != (new_spte & bit_mask);
+}
+
/* Rules for using mmu_spte_set:
* Set the sptep from nonpresent to present.
* Note: the sptep being assigned *must* be either not present
@@ -596,6 +587,14 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
if (!shadow_accessed_mask)
return ret;
+ /*
+ * Flush TLB when accessed/dirty bits are changed in the page tables,
+ * to guarantee consistency between TLB and page tables.
+ */
+ if (spte_is_bit_changed(old_spte, new_spte,
+ shadow_accessed_mask | shadow_dirty_mask))
+ ret = true;
+
if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
kvm_set_pfn_accessed(spte_to_pfn(old_spte));
if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
@@ -1216,6 +1215,60 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
return flush;
}
+static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep)
+{
+ u64 spte = *sptep;
+
+ rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
+
+ spte &= ~shadow_dirty_mask;
+
+ return mmu_spte_update(sptep, spte);
+}
+
+static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ bool flush = false;
+
+ for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
+ BUG_ON(!(*sptep & PT_PRESENT_MASK));
+
+ flush |= spte_clear_dirty(kvm, sptep);
+ sptep = rmap_get_next(&iter);
+ }
+
+ return flush;
+}
+
+static bool spte_set_dirty(struct kvm *kvm, u64 *sptep)
+{
+ u64 spte = *sptep;
+
+ rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
+
+ spte |= shadow_dirty_mask;
+
+ return mmu_spte_update(sptep, spte);
+}
+
+static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ bool flush = false;
+
+ for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
+ BUG_ON(!(*sptep & PT_PRESENT_MASK));
+
+ flush |= spte_set_dirty(kvm, sptep);
+ sptep = rmap_get_next(&iter);
+ }
+
+ return flush;
+}
+
/**
* kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
* @kvm: kvm instance
@@ -1226,7 +1279,7 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
* Used when we do not need to care about huge page mappings: e.g. during dirty
* logging we do not have any such mappings.
*/
-void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
{
@@ -1242,6 +1295,53 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
}
}
+/**
+ * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages
+ * @kvm: kvm instance
+ * @slot: slot to clear D-bit
+ * @gfn_offset: start of the BITS_PER_LONG pages we care about
+ * @mask: indicates which pages we should clear D-bit
+ *
+ * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
+ */
+void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask)
+{
+ unsigned long *rmapp;
+
+ while (mask) {
+ rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+ PT_PAGE_TABLE_LEVEL, slot);
+ __rmap_clear_dirty(kvm, rmapp);
+
+ /* clear the first set bit */
+ mask &= mask - 1;
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
+
+/**
+ * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
+ * PT level pages.
+ *
+ * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
+ * enable dirty logging for them.
+ *
+ * Used when we do not need to care about huge page mappings: e.g. during dirty
+ * logging we do not have any such mappings.
+ */
+void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask)
+{
+ if (kvm_x86_ops->enable_log_dirty_pt_masked)
+ kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
+ mask);
+ else
+ kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
+}
+
static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
{
struct kvm_memory_slot *slot;
@@ -1536,7 +1636,7 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
{
- ASSERT(is_empty_shadow_page(sp->spt));
+ MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
hlist_del(&sp->hash_link);
list_del(&sp->link);
free_page((unsigned long)sp->spt);
@@ -2501,8 +2601,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
}
}
- if (pte_access & ACC_WRITE_MASK)
+ if (pte_access & ACC_WRITE_MASK) {
mark_page_dirty(vcpu->kvm, gfn);
+ spte |= shadow_dirty_mask;
+ }
set_pte:
if (mmu_spte_update(sptep, spte))
@@ -2818,6 +2920,18 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
*/
gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+ /*
+ * Theoretically we could also set dirty bit (and flush TLB) here in
+ * order to eliminate unnecessary PML logging. See comments in
+ * set_spte. But fast_page_fault is very unlikely to happen with PML
+ * enabled, so we do not do this. This might result in the same GPA
+ * to be logged in PML buffer again when the write really happens, and
+ * eventually to be called by mark_page_dirty twice. But it's also no
+ * harm. This also avoids the TLB flush needed after setting dirty bit
+ * so non-PML cases won't be impacted.
+ *
+ * Compare with set_spte where instead shadow_dirty_mask is set.
+ */
if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
mark_page_dirty(vcpu->kvm, gfn);
@@ -3041,7 +3155,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu.pae_root[i];
- ASSERT(!VALID_PAGE(root));
+ MMU_WARN_ON(VALID_PAGE(root));
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
@@ -3079,7 +3193,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
- ASSERT(!VALID_PAGE(root));
+ MMU_WARN_ON(VALID_PAGE(root));
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
@@ -3104,7 +3218,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu.pae_root[i];
- ASSERT(!VALID_PAGE(root));
+ MMU_WARN_ON(VALID_PAGE(root));
if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
if (!is_present_gpte(pdptr)) {
@@ -3329,8 +3443,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
if (r)
return r;
- ASSERT(vcpu);
- ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
gfn = gva >> PAGE_SHIFT;
@@ -3396,8 +3509,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
int write = error_code & PFERR_WRITE_MASK;
bool map_writable;
- ASSERT(vcpu);
- ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
if (unlikely(error_code & PFERR_RSVD_MASK)) {
r = handle_mmio_page_fault(vcpu, gpa, error_code, true);
@@ -3718,7 +3830,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
update_permission_bitmask(vcpu, context, false);
update_last_pte_bitmap(vcpu, context);
- ASSERT(is_pae(vcpu));
+ MMU_WARN_ON(!is_pae(vcpu));
context->page_fault = paging64_page_fault;
context->gva_to_gpa = paging64_gva_to_gpa;
context->sync_page = paging64_sync_page;
@@ -3763,7 +3875,7 @@ static void paging32E_init_context(struct kvm_vcpu *vcpu,
static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
- struct kvm_mmu *context = vcpu->arch.walk_mmu;
+ struct kvm_mmu *context = &vcpu->arch.mmu;
context->base_role.word = 0;
context->page_fault = tdp_page_fault;
@@ -3803,11 +3915,12 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
update_last_pte_bitmap(vcpu, context);
}
-void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
+void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
{
bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
- ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+
+ MMU_WARN_ON(VALID_PAGE(context->root_hpa));
if (!is_paging(vcpu))
nonpaging_init_context(vcpu, context);
@@ -3818,19 +3931,19 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
else
paging32_init_context(vcpu, context);
- vcpu->arch.mmu.base_role.nxe = is_nx(vcpu);
- vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
- vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
- vcpu->arch.mmu.base_role.smep_andnot_wp
+ context->base_role.nxe = is_nx(vcpu);
+ context->base_role.cr4_pae = !!is_pae(vcpu);
+ context->base_role.cr0_wp = is_write_protection(vcpu);
+ context->base_role.smep_andnot_wp
= smep && !is_write_protection(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
-void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
- bool execonly)
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
{
- ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+
+ MMU_WARN_ON(VALID_PAGE(context->root_hpa));
context->shadow_root_level = kvm_x86_ops->get_tdp_level();
@@ -3851,11 +3964,13 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
{
- kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
- vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3;
- vcpu->arch.walk_mmu->get_cr3 = get_cr3;
- vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read;
- vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+
+ kvm_init_shadow_mmu(vcpu);
+ context->set_cr3 = kvm_x86_ops->set_cr3;
+ context->get_cr3 = get_cr3;
+ context->get_pdptr = kvm_pdptr_read;
+ context->inject_page_fault = kvm_inject_page_fault;
}
static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
@@ -3900,17 +4015,15 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
static void init_kvm_mmu(struct kvm_vcpu *vcpu)
{
if (mmu_is_nested(vcpu))
- return init_kvm_nested_mmu(vcpu);
+ init_kvm_nested_mmu(vcpu);
else if (tdp_enabled)
- return init_kvm_tdp_mmu(vcpu);
+ init_kvm_tdp_mmu(vcpu);
else
- return init_kvm_softmmu(vcpu);
+ init_kvm_softmmu(vcpu);
}
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
- ASSERT(vcpu);
-
kvm_mmu_unload(vcpu);
init_kvm_mmu(vcpu);
}
@@ -4266,8 +4379,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
struct page *page;
int i;
- ASSERT(vcpu);
-
/*
* When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
* Therefore we need to allocate shadow page tables in the first
@@ -4286,8 +4397,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
- ASSERT(vcpu);
-
vcpu->arch.walk_mmu = &vcpu->arch.mmu;
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
vcpu->arch.mmu.translate_gpa = translate_gpa;
@@ -4298,19 +4407,18 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
void kvm_mmu_setup(struct kvm_vcpu *vcpu)
{
- ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
init_kvm_mmu(vcpu);
}
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+ struct kvm_memory_slot *memslot)
{
- struct kvm_memory_slot *memslot;
gfn_t last_gfn;
int i;
+ bool flush = false;
- memslot = id_to_memslot(kvm->memslots, slot);
last_gfn = memslot->base_gfn + memslot->npages - 1;
spin_lock(&kvm->mmu_lock);
@@ -4325,7 +4433,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
for (index = 0; index <= last_index; ++index, ++rmapp) {
if (*rmapp)
- __rmap_write_protect(kvm, rmapp, false);
+ flush |= __rmap_write_protect(kvm, rmapp,
+ false);
if (need_resched() || spin_needbreak(&kvm->mmu_lock))
cond_resched_lock(&kvm->mmu_lock);
@@ -4352,8 +4461,124 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
* instead of PT_WRITABLE_MASK, that means it does not depend
* on PT_WRITABLE_MASK anymore.
*/
- kvm_flush_remote_tlbs(kvm);
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+}
+
+void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
+ struct kvm_memory_slot *memslot)
+{
+ gfn_t last_gfn;
+ unsigned long *rmapp;
+ unsigned long last_index, index;
+ bool flush = false;
+
+ last_gfn = memslot->base_gfn + memslot->npages - 1;
+
+ spin_lock(&kvm->mmu_lock);
+
+ rmapp = memslot->arch.rmap[PT_PAGE_TABLE_LEVEL - 1];
+ last_index = gfn_to_index(last_gfn, memslot->base_gfn,
+ PT_PAGE_TABLE_LEVEL);
+
+ for (index = 0; index <= last_index; ++index, ++rmapp) {
+ if (*rmapp)
+ flush |= __rmap_clear_dirty(kvm, rmapp);
+
+ if (need_resched() || spin_needbreak(&kvm->mmu_lock))
+ cond_resched_lock(&kvm->mmu_lock);
+ }
+
+ spin_unlock(&kvm->mmu_lock);
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+ /*
+ * It's also safe to flush TLBs out of mmu lock here as currently this
+ * function is only used for dirty logging, in which case flushing TLB
+ * out of mmu lock also guarantees no dirty pages will be lost in
+ * dirty_bitmap.
+ */
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
+
+void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
+ struct kvm_memory_slot *memslot)
+{
+ gfn_t last_gfn;
+ int i;
+ bool flush = false;
+
+ last_gfn = memslot->base_gfn + memslot->npages - 1;
+
+ spin_lock(&kvm->mmu_lock);
+
+ for (i = PT_PAGE_TABLE_LEVEL + 1; /* skip rmap for 4K page */
+ i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+ unsigned long *rmapp;
+ unsigned long last_index, index;
+
+ rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
+ last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
+
+ for (index = 0; index <= last_index; ++index, ++rmapp) {
+ if (*rmapp)
+ flush |= __rmap_write_protect(kvm, rmapp,
+ false);
+
+ if (need_resched() || spin_needbreak(&kvm->mmu_lock))
+ cond_resched_lock(&kvm->mmu_lock);
+ }
+ }
+ spin_unlock(&kvm->mmu_lock);
+
+ /* see kvm_mmu_slot_remove_write_access */
+ lockdep_assert_held(&kvm->slots_lock);
+
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
+
+void kvm_mmu_slot_set_dirty(struct kvm *kvm,
+ struct kvm_memory_slot *memslot)
+{
+ gfn_t last_gfn;
+ int i;
+ bool flush = false;
+
+ last_gfn = memslot->base_gfn + memslot->npages - 1;
+
+ spin_lock(&kvm->mmu_lock);
+
+ for (i = PT_PAGE_TABLE_LEVEL;
+ i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+ unsigned long *rmapp;
+ unsigned long last_index, index;
+
+ rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
+ last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
+
+ for (index = 0; index <= last_index; ++index, ++rmapp) {
+ if (*rmapp)
+ flush |= __rmap_set_dirty(kvm, rmapp);
+
+ if (need_resched() || spin_needbreak(&kvm->mmu_lock))
+ cond_resched_lock(&kvm->mmu_lock);
+ }
+ }
+
+ spin_unlock(&kvm->mmu_lock);
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+ /* see kvm_mmu_slot_leaf_clear_dirty */
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
}
+EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
#define BATCH_ZAP_PAGES 10
static void kvm_zap_obsolete_pages(struct kvm *kvm)
@@ -4606,8 +4831,6 @@ EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
- ASSERT(vcpu);
-
kvm_mmu_unload(vcpu);
free_mmu_pages(vcpu);
mmu_free_memory_caches(vcpu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index bde8ee725754..c7d65637c851 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -44,18 +44,6 @@
#define PT_DIRECTORY_LEVEL 2
#define PT_PAGE_TABLE_LEVEL 1
-#define PFERR_PRESENT_BIT 0
-#define PFERR_WRITE_BIT 1
-#define PFERR_USER_BIT 2
-#define PFERR_RSVD_BIT 3
-#define PFERR_FETCH_BIT 4
-
-#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
-#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
-#define PFERR_USER_MASK (1U << PFERR_USER_BIT)
-#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
-#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
-
static inline u64 rsvd_bits(int s, int e)
{
return ((1ULL << (e - s + 1)) - 1) << s;
@@ -81,9 +69,8 @@ enum {
};
int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
-void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
-void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
- bool execonly);
+void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly);
void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
bool ept);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 41dd0387cccb..a17d848c6d42 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2003,8 +2003,8 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
{
- kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
-
+ WARN_ON(mmu_is_nested(vcpu));
+ kvm_init_shadow_mmu(vcpu);
vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3;
vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr;
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index c2a34bb5ad93..7c7bc8bef21f 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -848,6 +848,24 @@ TRACE_EVENT(kvm_track_tsc,
#endif /* CONFIG_X86_64 */
+/*
+ * Tracepoint for PML full VMEXIT.
+ */
+TRACE_EVENT(kvm_pml_full,
+ TP_PROTO(unsigned int vcpu_id),
+ TP_ARGS(vcpu_id),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ ),
+
+ TP_printk("vcpu %d: PML full", __entry->vcpu_id)
+);
+
TRACE_EVENT(kvm_ple_window,
TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old),
TP_ARGS(grow, vcpu_id, new, old),
@@ -914,6 +932,26 @@ TRACE_EVENT(kvm_pvclock_update,
__entry->flags)
);
+TRACE_EVENT(kvm_wait_lapic_expire,
+ TP_PROTO(unsigned int vcpu_id, s64 delta),
+ TP_ARGS(vcpu_id, delta),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( s64, delta )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->delta = delta;
+ ),
+
+ TP_printk("vcpu %u: delta %lld (%s)",
+ __entry->vcpu_id,
+ __entry->delta,
+ __entry->delta < 0 ? "early" : "late")
+);
+
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d4c58d884838..3f73bfad0349 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -45,6 +45,7 @@
#include <asm/perf_event.h>
#include <asm/debugreg.h>
#include <asm/kexec.h>
+#include <asm/apic.h>
#include "trace.h"
@@ -101,6 +102,9 @@ module_param(nested, bool, S_IRUGO);
static u64 __read_mostly host_xss;
+static bool __read_mostly enable_pml = 1;
+module_param_named(pml, enable_pml, bool, S_IRUGO);
+
#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
#define KVM_VM_CR0_ALWAYS_ON \
@@ -215,7 +219,12 @@ struct __packed vmcs12 {
u64 tsc_offset;
u64 virtual_apic_page_addr;
u64 apic_access_addr;
+ u64 posted_intr_desc_addr;
u64 ept_pointer;
+ u64 eoi_exit_bitmap0;
+ u64 eoi_exit_bitmap1;
+ u64 eoi_exit_bitmap2;
+ u64 eoi_exit_bitmap3;
u64 xss_exit_bitmap;
u64 guest_physical_address;
u64 vmcs_link_pointer;
@@ -330,6 +339,7 @@ struct __packed vmcs12 {
u32 vmx_preemption_timer_value;
u32 padding32[7]; /* room for future expansion */
u16 virtual_processor_id;
+ u16 posted_intr_nv;
u16 guest_es_selector;
u16 guest_cs_selector;
u16 guest_ss_selector;
@@ -338,6 +348,7 @@ struct __packed vmcs12 {
u16 guest_gs_selector;
u16 guest_ldtr_selector;
u16 guest_tr_selector;
+ u16 guest_intr_status;
u16 host_es_selector;
u16 host_cs_selector;
u16 host_ss_selector;
@@ -401,6 +412,10 @@ struct nested_vmx {
*/
struct page *apic_access_page;
struct page *virtual_apic_page;
+ struct page *pi_desc_page;
+ struct pi_desc *pi_desc;
+ bool pi_pending;
+ u16 posted_intr_nv;
u64 msr_ia32_feature_control;
struct hrtimer preemption_timer;
@@ -408,6 +423,23 @@ struct nested_vmx {
/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
u64 vmcs01_debugctl;
+
+ u32 nested_vmx_procbased_ctls_low;
+ u32 nested_vmx_procbased_ctls_high;
+ u32 nested_vmx_true_procbased_ctls_low;
+ u32 nested_vmx_secondary_ctls_low;
+ u32 nested_vmx_secondary_ctls_high;
+ u32 nested_vmx_pinbased_ctls_low;
+ u32 nested_vmx_pinbased_ctls_high;
+ u32 nested_vmx_exit_ctls_low;
+ u32 nested_vmx_exit_ctls_high;
+ u32 nested_vmx_true_exit_ctls_low;
+ u32 nested_vmx_entry_ctls_low;
+ u32 nested_vmx_entry_ctls_high;
+ u32 nested_vmx_true_entry_ctls_low;
+ u32 nested_vmx_misc_low;
+ u32 nested_vmx_misc_high;
+ u32 nested_vmx_ept_caps;
};
#define POSTED_INTR_ON 0
@@ -511,6 +543,10 @@ struct vcpu_vmx {
/* Dynamic PLE window. */
int ple_window;
bool ple_window_dirty;
+
+ /* Support for PML */
+#define PML_ENTITY_NUM 512
+ struct page *pml_pg;
};
enum segment_cache_field {
@@ -594,6 +630,7 @@ static int max_shadow_read_write_fields =
static const unsigned short vmcs_field_to_offset_table[] = {
FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
+ FIELD(POSTED_INTR_NV, posted_intr_nv),
FIELD(GUEST_ES_SELECTOR, guest_es_selector),
FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
@@ -602,6 +639,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
+ FIELD(GUEST_INTR_STATUS, guest_intr_status),
FIELD(HOST_ES_SELECTOR, host_es_selector),
FIELD(HOST_CS_SELECTOR, host_cs_selector),
FIELD(HOST_SS_SELECTOR, host_ss_selector),
@@ -618,7 +656,12 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD64(TSC_OFFSET, tsc_offset),
FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
+ FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
FIELD64(EPT_POINTER, ept_pointer),
+ FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
+ FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
+ FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
+ FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
@@ -766,6 +809,7 @@ static void kvm_cpu_vmxon(u64 addr);
static void kvm_cpu_vmxoff(void);
static bool vmx_mpx_supported(void);
static bool vmx_xsaves_supported(void);
+static int vmx_vm_has_apicv(struct kvm *kvm);
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -793,6 +837,7 @@ static unsigned long *vmx_msr_bitmap_legacy;
static unsigned long *vmx_msr_bitmap_longmode;
static unsigned long *vmx_msr_bitmap_legacy_x2apic;
static unsigned long *vmx_msr_bitmap_longmode_x2apic;
+static unsigned long *vmx_msr_bitmap_nested;
static unsigned long *vmx_vmread_bitmap;
static unsigned long *vmx_vmwrite_bitmap;
@@ -959,16 +1004,6 @@ static inline bool cpu_has_vmx_ept_execute_only(void)
return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
}
-static inline bool cpu_has_vmx_eptp_uncacheable(void)
-{
- return vmx_capability.ept & VMX_EPTP_UC_BIT;
-}
-
-static inline bool cpu_has_vmx_eptp_writeback(void)
-{
- return vmx_capability.ept & VMX_EPTP_WB_BIT;
-}
-
static inline bool cpu_has_vmx_ept_2m_page(void)
{
return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
@@ -1073,6 +1108,11 @@ static inline bool cpu_has_vmx_shadow_vmcs(void)
SECONDARY_EXEC_SHADOW_VMCS;
}
+static inline bool cpu_has_vmx_pml(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
+}
+
static inline bool report_flexpriority(void)
{
return flexpriority_enabled;
@@ -1112,6 +1152,26 @@ static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
vmx_xsaves_supported();
}
+static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
+}
+
+static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
+}
+
+static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+}
+
+static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
+{
+ return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
+}
+
static inline bool is_exception(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2284,20 +2344,8 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
* if the corresponding bit in the (32-bit) control field *must* be on, and a
* bit in the high half is on if the corresponding bit in the control field
* may be on. See also vmx_control_verify().
- * TODO: allow these variables to be modified (downgraded) by module options
- * or other means.
*/
-static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
-static u32 nested_vmx_true_procbased_ctls_low;
-static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
-static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
-static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
-static u32 nested_vmx_true_exit_ctls_low;
-static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
-static u32 nested_vmx_true_entry_ctls_low;
-static u32 nested_vmx_misc_low, nested_vmx_misc_high;
-static u32 nested_vmx_ept_caps;
-static __init void nested_vmx_setup_ctls_msrs(void)
+static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
{
/*
* Note that as a general rule, the high half of the MSRs (bits in
@@ -2316,57 +2364,74 @@ static __init void nested_vmx_setup_ctls_msrs(void)
/* pin-based controls */
rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
- nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high);
- nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
- nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
- PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS;
- nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
+ vmx->nested.nested_vmx_pinbased_ctls_low,
+ vmx->nested.nested_vmx_pinbased_ctls_high);
+ vmx->nested.nested_vmx_pinbased_ctls_low |=
+ PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+ vmx->nested.nested_vmx_pinbased_ctls_high &=
+ PIN_BASED_EXT_INTR_MASK |
+ PIN_BASED_NMI_EXITING |
+ PIN_BASED_VIRTUAL_NMIS;
+ vmx->nested.nested_vmx_pinbased_ctls_high |=
+ PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
PIN_BASED_VMX_PREEMPTION_TIMER;
+ if (vmx_vm_has_apicv(vmx->vcpu.kvm))
+ vmx->nested.nested_vmx_pinbased_ctls_high |=
+ PIN_BASED_POSTED_INTR;
/* exit controls */
rdmsr(MSR_IA32_VMX_EXIT_CTLS,
- nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
- nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+ vmx->nested.nested_vmx_exit_ctls_low,
+ vmx->nested.nested_vmx_exit_ctls_high);
+ vmx->nested.nested_vmx_exit_ctls_low =
+ VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
- nested_vmx_exit_ctls_high &=
+ vmx->nested.nested_vmx_exit_ctls_high &=
#ifdef CONFIG_X86_64
VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
- nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+ vmx->nested.nested_vmx_exit_ctls_high |=
+ VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
if (vmx_mpx_supported())
- nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
+ vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
/* We support free control of debug control saving. */
- nested_vmx_true_exit_ctls_low = nested_vmx_exit_ctls_low &
+ vmx->nested.nested_vmx_true_exit_ctls_low =
+ vmx->nested.nested_vmx_exit_ctls_low &
~VM_EXIT_SAVE_DEBUG_CONTROLS;
/* entry controls */
rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
- nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
- nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
- nested_vmx_entry_ctls_high &=
+ vmx->nested.nested_vmx_entry_ctls_low,
+ vmx->nested.nested_vmx_entry_ctls_high);
+ vmx->nested.nested_vmx_entry_ctls_low =
+ VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
+ vmx->nested.nested_vmx_entry_ctls_high &=
#ifdef CONFIG_X86_64
VM_ENTRY_IA32E_MODE |
#endif
VM_ENTRY_LOAD_IA32_PAT;
- nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
- VM_ENTRY_LOAD_IA32_EFER);
+ vmx->nested.nested_vmx_entry_ctls_high |=
+ (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
if (vmx_mpx_supported())
- nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
+ vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
/* We support free control of debug control loading. */
- nested_vmx_true_entry_ctls_low = nested_vmx_entry_ctls_low &
+ vmx->nested.nested_vmx_true_entry_ctls_low =
+ vmx->nested.nested_vmx_entry_ctls_low &
~VM_ENTRY_LOAD_DEBUG_CONTROLS;
/* cpu-based controls */
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
- nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
- nested_vmx_procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
- nested_vmx_procbased_ctls_high &=
+ vmx->nested.nested_vmx_procbased_ctls_low,
+ vmx->nested.nested_vmx_procbased_ctls_high);
+ vmx->nested.nested_vmx_procbased_ctls_low =
+ CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+ vmx->nested.nested_vmx_procbased_ctls_high &=
CPU_BASED_VIRTUAL_INTR_PENDING |
CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
@@ -2386,45 +2451,55 @@ static __init void nested_vmx_setup_ctls_msrs(void)
* can use it to avoid exits to L1 - even when L0 runs L2
* without MSR bitmaps.
*/
- nested_vmx_procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
+ vmx->nested.nested_vmx_procbased_ctls_high |=
+ CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
CPU_BASED_USE_MSR_BITMAPS;
/* We support free control of CR3 access interception. */
- nested_vmx_true_procbased_ctls_low = nested_vmx_procbased_ctls_low &
+ vmx->nested.nested_vmx_true_procbased_ctls_low =
+ vmx->nested.nested_vmx_procbased_ctls_low &
~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
/* secondary cpu-based controls */
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
- nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high);
- nested_vmx_secondary_ctls_low = 0;
- nested_vmx_secondary_ctls_high &=
+ vmx->nested.nested_vmx_secondary_ctls_low,
+ vmx->nested.nested_vmx_secondary_ctls_high);
+ vmx->nested.nested_vmx_secondary_ctls_low = 0;
+ vmx->nested.nested_vmx_secondary_ctls_high &=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_XSAVES;
if (enable_ept) {
/* nested EPT: emulate EPT also to L1 */
- nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT |
+ vmx->nested.nested_vmx_secondary_ctls_high |=
+ SECONDARY_EXEC_ENABLE_EPT |
SECONDARY_EXEC_UNRESTRICTED_GUEST;
- nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
+ vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
VMX_EPT_INVEPT_BIT;
- nested_vmx_ept_caps &= vmx_capability.ept;
+ vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
/*
* For nested guests, we don't do anything specific
* for single context invalidation. Hence, only advertise
* support for global context invalidation.
*/
- nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT;
+ vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT;
} else
- nested_vmx_ept_caps = 0;
+ vmx->nested.nested_vmx_ept_caps = 0;
/* miscellaneous data */
- rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
- nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
- nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
+ rdmsr(MSR_IA32_VMX_MISC,
+ vmx->nested.nested_vmx_misc_low,
+ vmx->nested.nested_vmx_misc_high);
+ vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
+ vmx->nested.nested_vmx_misc_low |=
+ VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
VMX_MISC_ACTIVITY_HLT;
- nested_vmx_misc_high = 0;
+ vmx->nested.nested_vmx_misc_high = 0;
}
static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
@@ -2443,6 +2518,8 @@ static inline u64 vmx_control_msr(u32 low, u32 high)
/* Returns 0 on success, non-0 otherwise. */
static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
switch (msr_index) {
case MSR_IA32_VMX_BASIC:
/*
@@ -2457,36 +2534,44 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
break;
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
case MSR_IA32_VMX_PINBASED_CTLS:
- *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low,
- nested_vmx_pinbased_ctls_high);
+ *pdata = vmx_control_msr(
+ vmx->nested.nested_vmx_pinbased_ctls_low,
+ vmx->nested.nested_vmx_pinbased_ctls_high);
break;
case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
- *pdata = vmx_control_msr(nested_vmx_true_procbased_ctls_low,
- nested_vmx_procbased_ctls_high);
+ *pdata = vmx_control_msr(
+ vmx->nested.nested_vmx_true_procbased_ctls_low,
+ vmx->nested.nested_vmx_procbased_ctls_high);
break;
case MSR_IA32_VMX_PROCBASED_CTLS:
- *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low,
- nested_vmx_procbased_ctls_high);
+ *pdata = vmx_control_msr(
+ vmx->nested.nested_vmx_procbased_ctls_low,
+ vmx->nested.nested_vmx_procbased_ctls_high);
break;
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
- *pdata = vmx_control_msr(nested_vmx_true_exit_ctls_low,
- nested_vmx_exit_ctls_high);
+ *pdata = vmx_control_msr(
+ vmx->nested.nested_vmx_true_exit_ctls_low,
+ vmx->nested.nested_vmx_exit_ctls_high);
break;
case MSR_IA32_VMX_EXIT_CTLS:
- *pdata = vmx_control_msr(nested_vmx_exit_ctls_low,
- nested_vmx_exit_ctls_high);
+ *pdata = vmx_control_msr(
+ vmx->nested.nested_vmx_exit_ctls_low,
+ vmx->nested.nested_vmx_exit_ctls_high);
break;
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
- *pdata = vmx_control_msr(nested_vmx_true_entry_ctls_low,
- nested_vmx_entry_ctls_high);
+ *pdata = vmx_control_msr(
+ vmx->nested.nested_vmx_true_entry_ctls_low,
+ vmx->nested.nested_vmx_entry_ctls_high);
break;
case MSR_IA32_VMX_ENTRY_CTLS:
- *pdata = vmx_control_msr(nested_vmx_entry_ctls_low,
- nested_vmx_entry_ctls_high);
+ *pdata = vmx_control_msr(
+ vmx->nested.nested_vmx_entry_ctls_low,
+ vmx->nested.nested_vmx_entry_ctls_high);
break;
case MSR_IA32_VMX_MISC:
- *pdata = vmx_control_msr(nested_vmx_misc_low,
- nested_vmx_misc_high);
+ *pdata = vmx_control_msr(
+ vmx->nested.nested_vmx_misc_low,
+ vmx->nested.nested_vmx_misc_high);
break;
/*
* These MSRs specify bits which the guest must keep fixed (on or off)
@@ -2511,12 +2596,13 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
*pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */
break;
case MSR_IA32_VMX_PROCBASED_CTLS2:
- *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low,
- nested_vmx_secondary_ctls_high);
+ *pdata = vmx_control_msr(
+ vmx->nested.nested_vmx_secondary_ctls_low,
+ vmx->nested.nested_vmx_secondary_ctls_high);
break;
case MSR_IA32_VMX_EPT_VPID_CAP:
/* Currently, no nested vpid support */
- *pdata = nested_vmx_ept_caps;
+ *pdata = vmx->nested.nested_vmx_ept_caps;
break;
default:
return 1;
@@ -2929,7 +3015,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_SHADOW_VMCS |
- SECONDARY_EXEC_XSAVES;
+ SECONDARY_EXEC_XSAVES |
+ SECONDARY_EXEC_ENABLE_PML;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -4159,6 +4246,52 @@ static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
}
}
+/*
+ * If a msr is allowed by L0, we should check whether it is allowed by L1.
+ * The corresponding bit will be cleared unless both of L0 and L1 allow it.
+ */
+static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
+ unsigned long *msr_bitmap_nested,
+ u32 msr, int type)
+{
+ int f = sizeof(unsigned long);
+
+ if (!cpu_has_vmx_msr_bitmap()) {
+ WARN_ON(1);
+ return;
+ }
+
+ /*
+ * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+ * have the write-low and read-high bitmap offsets the wrong way round.
+ * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+ */
+ if (msr <= 0x1fff) {
+ if (type & MSR_TYPE_R &&
+ !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
+ /* read-low */
+ __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
+
+ if (type & MSR_TYPE_W &&
+ !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
+ /* write-low */
+ __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
+
+ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+ msr &= 0x1fff;
+ if (type & MSR_TYPE_R &&
+ !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
+ /* read-high */
+ __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
+
+ if (type & MSR_TYPE_W &&
+ !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
+ /* write-high */
+ __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
+
+ }
+}
+
static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
{
if (!longmode_only)
@@ -4197,6 +4330,64 @@ static int vmx_vm_has_apicv(struct kvm *kvm)
return enable_apicv && irqchip_in_kernel(kvm);
}
+static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int max_irr;
+ void *vapic_page;
+ u16 status;
+
+ if (vmx->nested.pi_desc &&
+ vmx->nested.pi_pending) {
+ vmx->nested.pi_pending = false;
+ if (!pi_test_and_clear_on(vmx->nested.pi_desc))
+ return 0;
+
+ max_irr = find_last_bit(
+ (unsigned long *)vmx->nested.pi_desc->pir, 256);
+
+ if (max_irr == 256)
+ return 0;
+
+ vapic_page = kmap(vmx->nested.virtual_apic_page);
+ if (!vapic_page) {
+ WARN_ON(1);
+ return -ENOMEM;
+ }
+ __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
+ kunmap(vmx->nested.virtual_apic_page);
+
+ status = vmcs_read16(GUEST_INTR_STATUS);
+ if ((u8)max_irr > ((u8)status & 0xff)) {
+ status &= ~0xff;
+ status |= (u8)max_irr;
+ vmcs_write16(GUEST_INTR_STATUS, status);
+ }
+ }
+ return 0;
+}
+
+static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
+ int vector)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (is_guest_mode(vcpu) &&
+ vector == vmx->nested.posted_intr_nv) {
+ /* the PIR and ON have been set by L1. */
+ if (vcpu->mode == IN_GUEST_MODE)
+ apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
+ POSTED_INTR_VECTOR);
+ /*
+ * If a posted intr is not recognized by hardware,
+ * we will accomplish it in the next vmentry.
+ */
+ vmx->nested.pi_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return 0;
+ }
+ return -1;
+}
/*
* Send interrupt to vcpu via posted interrupt way.
* 1. If target vcpu is running(non-root mode), send posted interrupt
@@ -4209,6 +4400,10 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
struct vcpu_vmx *vmx = to_vmx(vcpu);
int r;
+ r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
+ if (!r)
+ return;
+
if (pi_test_and_set_pir(vector, &vmx->pi_desc))
return;
@@ -4360,6 +4555,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
a current VMCS12
*/
exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
+ /* PML is enabled/disabled in creating/destorying vcpu */
+ exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
return exec_control;
}
@@ -4986,11 +5184,12 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[2] = 0xc1;
}
-static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val)
+static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
unsigned long always_on = VMXON_CR0_ALWAYSON;
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- if (nested_vmx_secondary_ctls_high &
+ if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high &
SECONDARY_EXEC_UNRESTRICTED_GUEST &&
nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
always_on &= ~(X86_CR0_PE | X86_CR0_PG);
@@ -5015,7 +5214,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
val = (val & ~vmcs12->cr0_guest_host_mask) |
(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
- if (!nested_cr0_valid(vmcs12, val))
+ if (!nested_cr0_valid(vcpu, val))
return 1;
if (kvm_set_cr0(vcpu, val))
@@ -5817,13 +6016,21 @@ static __init int hardware_setup(void)
(unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_msr_bitmap_longmode_x2apic)
goto out4;
+
+ if (nested) {
+ vmx_msr_bitmap_nested =
+ (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_msr_bitmap_nested)
+ goto out5;
+ }
+
vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_vmread_bitmap)
- goto out5;
+ goto out6;
vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_vmwrite_bitmap)
- goto out6;
+ goto out7;
memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
@@ -5839,10 +6046,12 @@ static __init int hardware_setup(void)
memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
+ if (nested)
+ memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
if (setup_vmcs_config(&vmcs_config) < 0) {
r = -EIO;
- goto out7;
+ goto out8;
}
if (boot_cpu_has(X86_FEATURE_NX))
@@ -5868,16 +6077,16 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_unrestricted_guest())
enable_unrestricted_guest = 0;
- if (!cpu_has_vmx_flexpriority()) {
+ if (!cpu_has_vmx_flexpriority())
flexpriority_enabled = 0;
- /*
- * set_apic_access_page_addr() is used to reload apic access
- * page upon invalidation. No need to do anything if the
- * processor does not have the APIC_ACCESS_ADDR VMCS field.
- */
+ /*
+ * set_apic_access_page_addr() is used to reload apic access
+ * page upon invalidation. No need to do anything if not
+ * using the APIC_ACCESS_ADDR VMCS field.
+ */
+ if (!flexpriority_enabled)
kvm_x86_ops->set_apic_access_page_addr = NULL;
- }
if (!cpu_has_vmx_tpr_shadow())
kvm_x86_ops->update_cr8_intercept = NULL;
@@ -5895,13 +6104,11 @@ static __init int hardware_setup(void)
kvm_x86_ops->update_cr8_intercept = NULL;
else {
kvm_x86_ops->hwapic_irr_update = NULL;
+ kvm_x86_ops->hwapic_isr_update = NULL;
kvm_x86_ops->deliver_posted_interrupt = NULL;
kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
}
- if (nested)
- nested_vmx_setup_ctls_msrs();
-
vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
@@ -5945,12 +6152,29 @@ static __init int hardware_setup(void)
update_ple_window_actual_max();
+ /*
+ * Only enable PML when hardware supports PML feature, and both EPT
+ * and EPT A/D bit features are enabled -- PML depends on them to work.
+ */
+ if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
+ enable_pml = 0;
+
+ if (!enable_pml) {
+ kvm_x86_ops->slot_enable_log_dirty = NULL;
+ kvm_x86_ops->slot_disable_log_dirty = NULL;
+ kvm_x86_ops->flush_log_dirty = NULL;
+ kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
+ }
+
return alloc_kvm_area();
-out7:
+out8:
free_page((unsigned long)vmx_vmwrite_bitmap);
-out6:
+out7:
free_page((unsigned long)vmx_vmread_bitmap);
+out6:
+ if (nested)
+ free_page((unsigned long)vmx_msr_bitmap_nested);
out5:
free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
out4:
@@ -5977,6 +6201,8 @@ static __exit void hardware_unsetup(void)
free_page((unsigned long)vmx_io_bitmap_a);
free_page((unsigned long)vmx_vmwrite_bitmap);
free_page((unsigned long)vmx_vmread_bitmap);
+ if (nested)
+ free_page((unsigned long)vmx_msr_bitmap_nested);
free_kvm_area();
}
@@ -6143,6 +6369,13 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
*/
}
+static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
+{
+ /* TODO: not to reset guest simply here. */
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ pr_warn("kvm: nested vmx abort, indicator %d\n", indicator);
+}
+
static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
{
struct vcpu_vmx *vmx =
@@ -6432,6 +6665,7 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
vmcs_write64(VMCS_LINK_POINTER, -1ull);
}
+ vmx->nested.posted_intr_nv = -1;
kunmap(vmx->nested.current_vmcs12_page);
nested_release_page(vmx->nested.current_vmcs12_page);
vmx->nested.current_vmptr = -1ull;
@@ -6460,6 +6694,12 @@ static void free_nested(struct vcpu_vmx *vmx)
nested_release_page(vmx->nested.virtual_apic_page);
vmx->nested.virtual_apic_page = NULL;
}
+ if (vmx->nested.pi_desc_page) {
+ kunmap(vmx->nested.pi_desc_page);
+ nested_release_page(vmx->nested.pi_desc_page);
+ vmx->nested.pi_desc_page = NULL;
+ vmx->nested.pi_desc = NULL;
+ }
nested_free_all_saved_vmcss(vmx);
}
@@ -6893,6 +7133,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
/* Emulate the INVEPT instruction */
static int handle_invept(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 vmx_instruction_info, types;
unsigned long type;
gva_t gva;
@@ -6901,8 +7142,9 @@ static int handle_invept(struct kvm_vcpu *vcpu)
u64 eptp, gpa;
} operand;
- if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
- !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
+ if (!(vmx->nested.nested_vmx_secondary_ctls_high &
+ SECONDARY_EXEC_ENABLE_EPT) ||
+ !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
@@ -6918,7 +7160,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
- types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
+ types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
if (!(types & (1UL << type))) {
nested_vmx_failValid(vcpu,
@@ -6960,6 +7202,31 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_pml_full(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification;
+
+ trace_kvm_pml_full(vcpu->vcpu_id);
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ /*
+ * PML buffer FULL happened while executing iret from NMI,
+ * "blocked by NMI" bit has to be set before next VM entry.
+ */
+ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ cpu_has_virtual_nmis() &&
+ (exit_qualification & INTR_INFO_UNBLOCK_NMI))
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+
+ /*
+ * PML buffer already flushed at beginning of VMEXIT. Nothing to do
+ * here.., and there's no userspace involvement needed for PML.
+ */
+ return 1;
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -7008,6 +7275,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_INVVPID] = handle_invvpid,
[EXIT_REASON_XSAVES] = handle_xsaves,
[EXIT_REASON_XRSTORS] = handle_xrstors,
+ [EXIT_REASON_PML_FULL] = handle_pml_full,
};
static const int kvm_vmx_max_exit_handlers =
@@ -7275,6 +7543,10 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
case EXIT_REASON_APIC_ACCESS:
return nested_cpu_has2(vmcs12,
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+ case EXIT_REASON_APIC_WRITE:
+ case EXIT_REASON_EOI_INDUCED:
+ /* apic_write and eoi_induced should exit unconditionally. */
+ return 1;
case EXIT_REASON_EPT_VIOLATION:
/*
* L0 always deals with the EPT violation. If nested EPT is
@@ -7314,6 +7586,89 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
}
+static int vmx_enable_pml(struct vcpu_vmx *vmx)
+{
+ struct page *pml_pg;
+ u32 exec_control;
+
+ pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!pml_pg)
+ return -ENOMEM;
+
+ vmx->pml_pg = pml_pg;
+
+ vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
+ vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+
+ exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ exec_control |= SECONDARY_EXEC_ENABLE_PML;
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+
+ return 0;
+}
+
+static void vmx_disable_pml(struct vcpu_vmx *vmx)
+{
+ u32 exec_control;
+
+ ASSERT(vmx->pml_pg);
+ __free_page(vmx->pml_pg);
+ vmx->pml_pg = NULL;
+
+ exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+}
+
+static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx)
+{
+ struct kvm *kvm = vmx->vcpu.kvm;
+ u64 *pml_buf;
+ u16 pml_idx;
+
+ pml_idx = vmcs_read16(GUEST_PML_INDEX);
+
+ /* Do nothing if PML buffer is empty */
+ if (pml_idx == (PML_ENTITY_NUM - 1))
+ return;
+
+ /* PML index always points to next available PML buffer entity */
+ if (pml_idx >= PML_ENTITY_NUM)
+ pml_idx = 0;
+ else
+ pml_idx++;
+
+ pml_buf = page_address(vmx->pml_pg);
+ for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
+ u64 gpa;
+
+ gpa = pml_buf[pml_idx];
+ WARN_ON(gpa & (PAGE_SIZE - 1));
+ mark_page_dirty(kvm, gpa >> PAGE_SHIFT);
+ }
+
+ /* reset PML index */
+ vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+}
+
+/*
+ * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
+ * Called before reporting dirty_bitmap to userspace.
+ */
+static void kvm_flush_pml_buffers(struct kvm *kvm)
+{
+ int i;
+ struct kvm_vcpu *vcpu;
+ /*
+ * We only need to kick vcpu out of guest mode here, as PML buffer
+ * is flushed at beginning of all VMEXITs, and it's obvious that only
+ * vcpus running in guest are possible to have unflushed GPAs in PML
+ * buffer.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_vcpu_kick(vcpu);
+}
+
/*
* The guest has exited. See if we can fix it or if we need userspace
* assistance.
@@ -7324,6 +7679,16 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
u32 exit_reason = vmx->exit_reason;
u32 vectoring_info = vmx->idt_vectoring_info;
+ /*
+ * Flush logged GPAs PML buffer, this will make dirty_bitmap more
+ * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
+ * querying dirty_bitmap, we only need to kick all vcpus out of guest
+ * mode as if vcpus is in root mode, the PML buffer must has been
+ * flushed already.
+ */
+ if (enable_pml)
+ vmx_flush_pml_buffer(vmx);
+
/* If guest state is invalid, start emulating */
if (vmx->emulation_required)
return handle_invalid_guest_state(vcpu);
@@ -7471,9 +7836,6 @@ static void vmx_hwapic_isr_update(struct kvm *kvm, int isr)
u16 status;
u8 old;
- if (!vmx_vm_has_apicv(kvm))
- return;
-
if (isr == -1)
isr = 0;
@@ -7973,6 +8335,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (enable_pml)
+ vmx_disable_pml(vmx);
free_vpid(vmx);
leave_guest_mode(vcpu);
vmx_load_vmcs01(vcpu);
@@ -8040,9 +8404,25 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
goto free_vmcs;
}
+ if (nested)
+ nested_vmx_setup_ctls_msrs(vmx);
+
+ vmx->nested.posted_intr_nv = -1;
vmx->nested.current_vmptr = -1ull;
vmx->nested.current_vmcs12 = NULL;
+ /*
+ * If PML is turned on, failure on enabling PML just results in failure
+ * of creating the vcpu, therefore we can simplify PML logic (by
+ * avoiding dealing with cases, such as enabling PML partially on vcpus
+ * for the guest, etc.
+ */
+ if (enable_pml) {
+ err = vmx_enable_pml(vmx);
+ if (err)
+ goto free_vmcs;
+ }
+
return &vmx->vcpu;
free_vmcs:
@@ -8184,9 +8564,10 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
- kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu,
- nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT);
-
+ WARN_ON(mmu_is_nested(vcpu));
+ kvm_init_shadow_ept_mmu(vcpu,
+ to_vmx(vcpu)->nested.nested_vmx_ept_caps &
+ VMX_EPT_EXECUTE_ONLY_BIT);
vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3;
vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
@@ -8199,6 +8580,18 @@ static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
vcpu->arch.walk_mmu = &vcpu->arch.mmu;
}
+static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
+ u16 error_code)
+{
+ bool inequality, bit;
+
+ bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
+ inequality =
+ (error_code & vmcs12->page_fault_error_code_mask) !=
+ vmcs12->page_fault_error_code_match;
+ return inequality ^ bit;
+}
+
static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
struct x86_exception *fault)
{
@@ -8206,8 +8599,7 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
WARN_ON(!is_guest_mode(vcpu));
- /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
- if (vmcs12->exception_bitmap & (1u << PF_VECTOR))
+ if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code))
nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
vmcs_read32(VM_EXIT_INTR_INFO),
vmcs_readl(EXIT_QUALIFICATION));
@@ -8261,6 +8653,31 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
return false;
}
+ if (nested_cpu_has_posted_intr(vmcs12)) {
+ if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64))
+ return false;
+
+ if (vmx->nested.pi_desc_page) { /* shouldn't happen */
+ kunmap(vmx->nested.pi_desc_page);
+ nested_release_page(vmx->nested.pi_desc_page);
+ }
+ vmx->nested.pi_desc_page =
+ nested_get_page(vcpu, vmcs12->posted_intr_desc_addr);
+ if (!vmx->nested.pi_desc_page)
+ return false;
+
+ vmx->nested.pi_desc =
+ (struct pi_desc *)kmap(vmx->nested.pi_desc_page);
+ if (!vmx->nested.pi_desc) {
+ nested_release_page_clean(vmx->nested.pi_desc_page);
+ return false;
+ }
+ vmx->nested.pi_desc =
+ (struct pi_desc *)((void *)vmx->nested.pi_desc +
+ (unsigned long)(vmcs12->posted_intr_desc_addr &
+ (PAGE_SIZE - 1)));
+ }
+
return true;
}
@@ -8286,6 +8703,310 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
}
+static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ int maxphyaddr;
+ u64 addr;
+
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
+ return 0;
+
+ if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+ maxphyaddr = cpuid_maxphyaddr(vcpu);
+
+ if (!PAGE_ALIGNED(vmcs12->msr_bitmap) ||
+ ((addr + PAGE_SIZE) >> maxphyaddr))
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * Merge L0's and L1's MSR bitmap, return false to indicate that
+ * we do not use the hardware.
+ */
+static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ int msr;
+ struct page *page;
+ unsigned long *msr_bitmap;
+
+ if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
+ return false;
+
+ page = nested_get_page(vcpu, vmcs12->msr_bitmap);
+ if (!page) {
+ WARN_ON(1);
+ return false;
+ }
+ msr_bitmap = (unsigned long *)kmap(page);
+ if (!msr_bitmap) {
+ nested_release_page_clean(page);
+ WARN_ON(1);
+ return false;
+ }
+
+ if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
+ if (nested_cpu_has_apic_reg_virt(vmcs12))
+ for (msr = 0x800; msr <= 0x8ff; msr++)
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap,
+ vmx_msr_bitmap_nested,
+ msr, MSR_TYPE_R);
+ /* TPR is allowed */
+ nested_vmx_disable_intercept_for_msr(msr_bitmap,
+ vmx_msr_bitmap_nested,
+ APIC_BASE_MSR + (APIC_TASKPRI >> 4),
+ MSR_TYPE_R | MSR_TYPE_W);
+ if (nested_cpu_has_vid(vmcs12)) {
+ /* EOI and self-IPI are allowed */
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap,
+ vmx_msr_bitmap_nested,
+ APIC_BASE_MSR + (APIC_EOI >> 4),
+ MSR_TYPE_W);
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap,
+ vmx_msr_bitmap_nested,
+ APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
+ MSR_TYPE_W);
+ }
+ } else {
+ /*
+ * Enable reading intercept of all the x2apic
+ * MSRs. We should not rely on vmcs12 to do any
+ * optimizations here, it may have been modified
+ * by L1.
+ */
+ for (msr = 0x800; msr <= 0x8ff; msr++)
+ __vmx_enable_intercept_for_msr(
+ vmx_msr_bitmap_nested,
+ msr,
+ MSR_TYPE_R);
+
+ __vmx_enable_intercept_for_msr(
+ vmx_msr_bitmap_nested,
+ APIC_BASE_MSR + (APIC_TASKPRI >> 4),
+ MSR_TYPE_W);
+ __vmx_enable_intercept_for_msr(
+ vmx_msr_bitmap_nested,
+ APIC_BASE_MSR + (APIC_EOI >> 4),
+ MSR_TYPE_W);
+ __vmx_enable_intercept_for_msr(
+ vmx_msr_bitmap_nested,
+ APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
+ MSR_TYPE_W);
+ }
+ kunmap(page);
+ nested_release_page_clean(page);
+
+ return true;
+}
+
+static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+ !nested_cpu_has_apic_reg_virt(vmcs12) &&
+ !nested_cpu_has_vid(vmcs12) &&
+ !nested_cpu_has_posted_intr(vmcs12))
+ return 0;
+
+ /*
+ * If virtualize x2apic mode is enabled,
+ * virtualize apic access must be disabled.
+ */
+ if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+ nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+ return -EINVAL;
+
+ /*
+ * If virtual interrupt delivery is enabled,
+ * we must exit on external interrupts.
+ */
+ if (nested_cpu_has_vid(vmcs12) &&
+ !nested_exit_on_intr(vcpu))
+ return -EINVAL;
+
+ /*
+ * bits 15:8 should be zero in posted_intr_nv,
+ * the descriptor address has been already checked
+ * in nested_get_vmcs12_pages.
+ */
+ if (nested_cpu_has_posted_intr(vmcs12) &&
+ (!nested_cpu_has_vid(vmcs12) ||
+ !nested_exit_intr_ack_set(vcpu) ||
+ vmcs12->posted_intr_nv & 0xff00))
+ return -EINVAL;
+
+ /* tpr shadow is needed by all apicv features. */
+ if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
+ unsigned long count_field,
+ unsigned long addr_field,
+ int maxphyaddr)
+{
+ u64 count, addr;
+
+ if (vmcs12_read_any(vcpu, count_field, &count) ||
+ vmcs12_read_any(vcpu, addr_field, &addr)) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+ if (count == 0)
+ return 0;
+ if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
+ (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
+ pr_warn_ratelimited(
+ "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
+ addr_field, maxphyaddr, count, addr);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ int maxphyaddr;
+
+ if (vmcs12->vm_exit_msr_load_count == 0 &&
+ vmcs12->vm_exit_msr_store_count == 0 &&
+ vmcs12->vm_entry_msr_load_count == 0)
+ return 0; /* Fast path */
+ maxphyaddr = cpuid_maxphyaddr(vcpu);
+ if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
+ VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) ||
+ nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
+ VM_EXIT_MSR_STORE_ADDR, maxphyaddr) ||
+ nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
+ VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr))
+ return -EINVAL;
+ return 0;
+}
+
+static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
+ struct vmx_msr_entry *e)
+{
+ /* x2APIC MSR accesses are not allowed */
+ if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8)
+ return -EINVAL;
+ if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
+ e->index == MSR_IA32_UCODE_REV)
+ return -EINVAL;
+ if (e->reserved != 0)
+ return -EINVAL;
+ return 0;
+}
+
+static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
+ struct vmx_msr_entry *e)
+{
+ if (e->index == MSR_FS_BASE ||
+ e->index == MSR_GS_BASE ||
+ e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */
+ nested_vmx_msr_check_common(vcpu, e))
+ return -EINVAL;
+ return 0;
+}
+
+static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
+ struct vmx_msr_entry *e)
+{
+ if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */
+ nested_vmx_msr_check_common(vcpu, e))
+ return -EINVAL;
+ return 0;
+}
+
+/*
+ * Load guest's/host's msr at nested entry/exit.
+ * return 0 for success, entry index for failure.
+ */
+static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
+{
+ u32 i;
+ struct vmx_msr_entry e;
+ struct msr_data msr;
+
+ msr.host_initiated = false;
+ for (i = 0; i < count; i++) {
+ if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e),
+ &e, sizeof(e))) {
+ pr_warn_ratelimited(
+ "%s cannot read MSR entry (%u, 0x%08llx)\n",
+ __func__, i, gpa + i * sizeof(e));
+ goto fail;
+ }
+ if (nested_vmx_load_msr_check(vcpu, &e)) {
+ pr_warn_ratelimited(
+ "%s check failed (%u, 0x%x, 0x%x)\n",
+ __func__, i, e.index, e.reserved);
+ goto fail;
+ }
+ msr.index = e.index;
+ msr.data = e.value;
+ if (kvm_set_msr(vcpu, &msr)) {
+ pr_warn_ratelimited(
+ "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
+ __func__, i, e.index, e.value);
+ goto fail;
+ }
+ }
+ return 0;
+fail:
+ return i + 1;
+}
+
+static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
+{
+ u32 i;
+ struct vmx_msr_entry e;
+
+ for (i = 0; i < count; i++) {
+ if (kvm_read_guest(vcpu->kvm,
+ gpa + i * sizeof(e),
+ &e, 2 * sizeof(u32))) {
+ pr_warn_ratelimited(
+ "%s cannot read MSR entry (%u, 0x%08llx)\n",
+ __func__, i, gpa + i * sizeof(e));
+ return -EINVAL;
+ }
+ if (nested_vmx_store_msr_check(vcpu, &e)) {
+ pr_warn_ratelimited(
+ "%s check failed (%u, 0x%x, 0x%x)\n",
+ __func__, i, e.index, e.reserved);
+ return -EINVAL;
+ }
+ if (kvm_get_msr(vcpu, e.index, &e.value)) {
+ pr_warn_ratelimited(
+ "%s cannot read MSR (%u, 0x%x)\n",
+ __func__, i, e.index);
+ return -EINVAL;
+ }
+ if (kvm_write_guest(vcpu->kvm,
+ gpa + i * sizeof(e) +
+ offsetof(struct vmx_msr_entry, value),
+ &e.value, sizeof(e.value))) {
+ pr_warn_ratelimited(
+ "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
+ __func__, i, e.index, e.value);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
/*
* prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
* L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -8365,8 +9086,23 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
exec_control = vmcs12->pin_based_vm_exec_control;
exec_control |= vmcs_config.pin_based_exec_ctrl;
- exec_control &= ~(PIN_BASED_VMX_PREEMPTION_TIMER |
- PIN_BASED_POSTED_INTR);
+ exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+
+ if (nested_cpu_has_posted_intr(vmcs12)) {
+ /*
+ * Note that we use L0's vector here and in
+ * vmx_deliver_nested_posted_interrupt.
+ */
+ vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
+ vmx->nested.pi_pending = false;
+ vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
+ vmcs_write64(POSTED_INTR_DESC_ADDR,
+ page_to_phys(vmx->nested.pi_desc_page) +
+ (unsigned long)(vmcs12->posted_intr_desc_addr &
+ (PAGE_SIZE - 1)));
+ } else
+ exec_control &= ~PIN_BASED_POSTED_INTR;
+
vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
vmx->nested.preemption_timer_expired = false;
@@ -8423,12 +9159,26 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
else
vmcs_write64(APIC_ACCESS_ADDR,
page_to_phys(vmx->nested.apic_access_page));
- } else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) {
+ } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
+ (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) {
exec_control |=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
kvm_vcpu_reload_apic_access_page(vcpu);
}
+ if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
+ vmcs_write64(EOI_EXIT_BITMAP0,
+ vmcs12->eoi_exit_bitmap0);
+ vmcs_write64(EOI_EXIT_BITMAP1,
+ vmcs12->eoi_exit_bitmap1);
+ vmcs_write64(EOI_EXIT_BITMAP2,
+ vmcs12->eoi_exit_bitmap2);
+ vmcs_write64(EOI_EXIT_BITMAP3,
+ vmcs12->eoi_exit_bitmap3);
+ vmcs_write16(GUEST_INTR_STATUS,
+ vmcs12->guest_intr_status);
+ }
+
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
}
@@ -8462,11 +9212,17 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
}
+ if (cpu_has_vmx_msr_bitmap() &&
+ exec_control & CPU_BASED_USE_MSR_BITMAPS &&
+ nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) {
+ vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_nested));
+ } else
+ exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
+
/*
- * Merging of IO and MSR bitmaps not currently supported.
+ * Merging of IO bitmap not currently supported.
* Rather, exit every time.
*/
- exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
exec_control |= CPU_BASED_UNCOND_IO_EXITING;
@@ -8582,6 +9338,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
int cpu;
struct loaded_vmcs *vmcs02;
bool ia32e;
+ u32 msr_entry_idx;
if (!nested_vmx_check_permission(vcpu) ||
!nested_vmx_check_vmcs12(vcpu))
@@ -8616,41 +9373,42 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
return 1;
}
- if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) &&
- !PAGE_ALIGNED(vmcs12->msr_bitmap)) {
+ if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
/*TODO: Also verify bits beyond physical address width are 0*/
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
return 1;
}
- if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
- /*TODO: Also verify bits beyond physical address width are 0*/
+ if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) {
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
return 1;
}
- if (vmcs12->vm_entry_msr_load_count > 0 ||
- vmcs12->vm_exit_msr_load_count > 0 ||
- vmcs12->vm_exit_msr_store_count > 0) {
- pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n",
- __func__);
+ if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) {
+ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ return 1;
+ }
+
+ if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) {
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
return 1;
}
if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
- nested_vmx_true_procbased_ctls_low,
- nested_vmx_procbased_ctls_high) ||
+ vmx->nested.nested_vmx_true_procbased_ctls_low,
+ vmx->nested.nested_vmx_procbased_ctls_high) ||
!vmx_control_verify(vmcs12->secondary_vm_exec_control,
- nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) ||
+ vmx->nested.nested_vmx_secondary_ctls_low,
+ vmx->nested.nested_vmx_secondary_ctls_high) ||
!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
- nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) ||
+ vmx->nested.nested_vmx_pinbased_ctls_low,
+ vmx->nested.nested_vmx_pinbased_ctls_high) ||
!vmx_control_verify(vmcs12->vm_exit_controls,
- nested_vmx_true_exit_ctls_low,
- nested_vmx_exit_ctls_high) ||
+ vmx->nested.nested_vmx_true_exit_ctls_low,
+ vmx->nested.nested_vmx_exit_ctls_high) ||
!vmx_control_verify(vmcs12->vm_entry_controls,
- nested_vmx_true_entry_ctls_low,
- nested_vmx_entry_ctls_high))
+ vmx->nested.nested_vmx_true_entry_ctls_low,
+ vmx->nested.nested_vmx_entry_ctls_high))
{
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
return 1;
@@ -8663,7 +9421,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
return 1;
}
- if (!nested_cr0_valid(vmcs12, vmcs12->guest_cr0) ||
+ if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) ||
((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
nested_vmx_entry_failure(vcpu, vmcs12,
EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
@@ -8739,10 +9497,21 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
vmx_segment_cache_clear(vmx);
- vmcs12->launch_state = 1;
-
prepare_vmcs02(vcpu, vmcs12);
+ msr_entry_idx = nested_vmx_load_msr(vcpu,
+ vmcs12->vm_entry_msr_load_addr,
+ vmcs12->vm_entry_msr_load_count);
+ if (msr_entry_idx) {
+ leave_guest_mode(vcpu);
+ vmx_load_vmcs01(vcpu);
+ nested_vmx_entry_failure(vcpu, vmcs12,
+ EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
+ return 1;
+ }
+
+ vmcs12->launch_state = 1;
+
if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
return kvm_emulate_halt(vcpu);
@@ -8869,9 +9638,10 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
if (vmx->nested.nested_run_pending)
return -EBUSY;
nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+ return 0;
}
- return 0;
+ return vmx_complete_nested_posted_interrupt(vcpu);
}
static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
@@ -8981,6 +9751,9 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
}
+ if (nested_cpu_has_vid(vmcs12))
+ vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
+
vmcs12->vm_entry_controls =
(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
@@ -9172,6 +9945,13 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
kvm_set_dr(vcpu, 7, 0x400);
vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+ if (cpu_has_vmx_msr_bitmap())
+ vmx_set_msr_bitmap(vcpu);
+
+ if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
+ vmcs12->vm_exit_msr_load_count))
+ nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
}
/*
@@ -9193,6 +9973,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
exit_qualification);
+ if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
+ vmcs12->vm_exit_msr_store_count))
+ nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
+
vmx_load_vmcs01(vcpu);
if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
@@ -9235,6 +10019,12 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
nested_release_page(vmx->nested.virtual_apic_page);
vmx->nested.virtual_apic_page = NULL;
}
+ if (vmx->nested.pi_desc_page) {
+ kunmap(vmx->nested.pi_desc_page);
+ nested_release_page(vmx->nested.pi_desc_page);
+ vmx->nested.pi_desc_page = NULL;
+ vmx->nested.pi_desc = NULL;
+ }
/*
* We are now running in L2, mmu_notifier will force to reload the
@@ -9301,6 +10091,31 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
shrink_ple_window(vcpu);
}
+static void vmx_slot_enable_log_dirty(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
+ kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
+}
+
+static void vmx_slot_disable_log_dirty(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ kvm_mmu_slot_set_dirty(kvm, slot);
+}
+
+static void vmx_flush_log_dirty(struct kvm *kvm)
+{
+ kvm_flush_pml_buffers(kvm);
+}
+
+static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
+ gfn_t offset, unsigned long mask)
+{
+ kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
+}
+
static struct kvm_x86_ops vmx_x86_ops = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -9409,6 +10224,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
.check_nested_events = vmx_check_nested_events,
.sched_in = vmx_sched_in,
+
+ .slot_enable_log_dirty = vmx_slot_enable_log_dirty,
+ .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
+ .flush_log_dirty = vmx_flush_log_dirty,
+ .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
};
static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c259814200bd..bd7a70be41b3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -108,6 +108,10 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
static u32 tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
+/* lapic timer advance (tscdeadline mode only) in nanoseconds */
+unsigned int lapic_timer_advance_ns = 0;
+module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
+
static bool backwards_tsc_observed = false;
#define KVM_NR_SHARED_MSRS 16
@@ -141,6 +145,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "irq_window", VCPU_STAT(irq_window_exits) },
{ "nmi_window", VCPU_STAT(nmi_window_exits) },
{ "halt_exits", VCPU_STAT(halt_exits) },
+ { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
{ "hypercalls", VCPU_STAT(hypercalls) },
{ "request_irq", VCPU_STAT(request_irq_exits) },
@@ -492,7 +497,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
-int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
+static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
void *data, int offset, int len, u32 access)
{
return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
@@ -643,7 +648,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
}
}
-int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
u64 xcr0 = xcr;
u64 old_xcr0 = vcpu->arch.xcr0;
@@ -1083,6 +1088,15 @@ static void update_pvclock_gtod(struct timekeeper *tk)
}
#endif
+void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
+ * vcpu_enter_guest. This function is only called from
+ * the physical CPU that is running vcpu.
+ */
+ kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+}
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
@@ -1180,7 +1194,7 @@ static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
#endif
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
-unsigned long max_tsc_khz;
+static unsigned long max_tsc_khz;
static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
{
@@ -1234,7 +1248,7 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
return tsc;
}
-void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
+static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
bool vcpus_matched;
@@ -1529,7 +1543,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
&ka->master_cycle_now);
ka->use_master_clock = host_tsc_clocksource && vcpus_matched
- && !backwards_tsc_observed;
+ && !backwards_tsc_observed
+ && !ka->boot_vcpu_runs_old_kvmclock;
if (ka->use_master_clock)
atomic_set(&kvm_guest_has_master_clock, 1);
@@ -2161,8 +2176,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_KVM_SYSTEM_TIME_NEW:
case MSR_KVM_SYSTEM_TIME: {
u64 gpa_offset;
+ struct kvm_arch *ka = &vcpu->kvm->arch;
+
kvmclock_reset(vcpu);
+ if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
+ bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
+
+ if (ka->boot_vcpu_runs_old_kvmclock != tmp)
+ set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
+ &vcpu->requests);
+
+ ka->boot_vcpu_runs_old_kvmclock = tmp;
+ }
+
vcpu->arch.time = data;
kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
@@ -2324,6 +2351,7 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
}
+EXPORT_SYMBOL_GPL(kvm_get_msr);
static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
@@ -2738,6 +2766,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_READONLY_MEM:
case KVM_CAP_HYPERV_TIME:
case KVM_CAP_IOAPIC_POLARITY_IGNORED:
+ case KVM_CAP_TSC_DEADLINE_TIMER:
#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
case KVM_CAP_ASSIGN_DEV_IRQ:
case KVM_CAP_PCI_2_3:
@@ -2776,9 +2805,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_TSC_CONTROL:
r = kvm_has_tsc_control;
break;
- case KVM_CAP_TSC_DEADLINE_TIMER:
- r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER);
- break;
default:
r = 0;
break;
@@ -3734,83 +3760,43 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
* @kvm: kvm instance
* @log: slot id and address to which we copy the log
*
- * We need to keep it in mind that VCPU threads can write to the bitmap
- * concurrently. So, to avoid losing data, we keep the following order for
- * each bit:
+ * Steps 1-4 below provide general overview of dirty page logging. See
+ * kvm_get_dirty_log_protect() function description for additional details.
+ *
+ * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
+ * always flush the TLB (step 4) even if previous step failed and the dirty
+ * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
+ * does not preclude user space subsequent dirty log read. Flushing TLB ensures
+ * writes will be marked dirty for next log read.
*
* 1. Take a snapshot of the bit and clear it if needed.
* 2. Write protect the corresponding page.
- * 3. Flush TLB's if needed.
- * 4. Copy the snapshot to the userspace.
- *
- * Between 2 and 3, the guest may write to the page using the remaining TLB
- * entry. This is not a problem because the page will be reported dirty at
- * step 4 using the snapshot taken before and step 3 ensures that successive
- * writes will be logged for the next call.
+ * 3. Copy the snapshot to the userspace.
+ * 4. Flush TLB's if needed.
*/
int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
{
- int r;
- struct kvm_memory_slot *memslot;
- unsigned long n, i;
- unsigned long *dirty_bitmap;
- unsigned long *dirty_bitmap_buffer;
bool is_dirty = false;
+ int r;
mutex_lock(&kvm->slots_lock);
- r = -EINVAL;
- if (log->slot >= KVM_USER_MEM_SLOTS)
- goto out;
-
- memslot = id_to_memslot(kvm->memslots, log->slot);
-
- dirty_bitmap = memslot->dirty_bitmap;
- r = -ENOENT;
- if (!dirty_bitmap)
- goto out;
-
- n = kvm_dirty_bitmap_bytes(memslot);
-
- dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
- memset(dirty_bitmap_buffer, 0, n);
-
- spin_lock(&kvm->mmu_lock);
-
- for (i = 0; i < n / sizeof(long); i++) {
- unsigned long mask;
- gfn_t offset;
-
- if (!dirty_bitmap[i])
- continue;
-
- is_dirty = true;
-
- mask = xchg(&dirty_bitmap[i], 0);
- dirty_bitmap_buffer[i] = mask;
-
- offset = i * BITS_PER_LONG;
- kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
- }
-
- spin_unlock(&kvm->mmu_lock);
+ /*
+ * Flush potentially hardware-cached dirty pages to dirty_bitmap.
+ */
+ if (kvm_x86_ops->flush_log_dirty)
+ kvm_x86_ops->flush_log_dirty(kvm);
- /* See the comments in kvm_mmu_slot_remove_write_access(). */
- lockdep_assert_held(&kvm->slots_lock);
+ r = kvm_get_dirty_log_protect(kvm, log, &is_dirty);
/*
* All the TLBs can be flushed out of mmu lock, see the comments in
* kvm_mmu_slot_remove_write_access().
*/
+ lockdep_assert_held(&kvm->slots_lock);
if (is_dirty)
kvm_flush_remote_tlbs(kvm);
- r = -EFAULT;
- if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
- goto out;
-
- r = 0;
-out:
mutex_unlock(&kvm->slots_lock);
return r;
}
@@ -4516,6 +4502,8 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
if (rc != X86EMUL_CONTINUE)
return rc;
addr += now;
+ if (ctxt->mode != X86EMUL_MODE_PROT64)
+ addr = (u32)addr;
val += now;
bytes -= now;
}
@@ -4984,6 +4972,11 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon
kvm_register_write(emul_to_vcpu(ctxt), reg, val);
}
+static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
+{
+ kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked);
+}
+
static const struct x86_emulate_ops emulate_ops = {
.read_gpr = emulator_read_gpr,
.write_gpr = emulator_write_gpr,
@@ -5019,6 +5012,7 @@ static const struct x86_emulate_ops emulate_ops = {
.put_fpu = emulator_put_fpu,
.intercept = emulator_intercept,
.get_cpuid = emulator_get_cpuid,
+ .set_nmi_mask = emulator_set_nmi_mask,
};
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
@@ -6311,6 +6305,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
trace_kvm_entry(vcpu->vcpu_id);
+ wait_lapic_expire(vcpu);
kvm_x86_ops->run(vcpu);
/*
@@ -7041,15 +7036,13 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
return r;
}
-int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
- int r;
struct msr_data msr;
struct kvm *kvm = vcpu->kvm;
- r = vcpu_load(vcpu);
- if (r)
- return r;
+ if (vcpu_load(vcpu))
+ return;
msr.data = 0x0;
msr.index = MSR_IA32_TSC;
msr.host_initiated = true;
@@ -7058,8 +7051,6 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
KVMCLOCK_SYNC_PERIOD);
-
- return r;
}
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -7549,12 +7540,62 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
return 0;
}
+static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
+ struct kvm_memory_slot *new)
+{
+ /* Still write protect RO slot */
+ if (new->flags & KVM_MEM_READONLY) {
+ kvm_mmu_slot_remove_write_access(kvm, new);
+ return;
+ }
+
+ /*
+ * Call kvm_x86_ops dirty logging hooks when they are valid.
+ *
+ * kvm_x86_ops->slot_disable_log_dirty is called when:
+ *
+ * - KVM_MR_CREATE with dirty logging is disabled
+ * - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag
+ *
+ * The reason is, in case of PML, we need to set D-bit for any slots
+ * with dirty logging disabled in order to eliminate unnecessary GPA
+ * logging in PML buffer (and potential PML buffer full VMEXT). This
+ * guarantees leaving PML enabled during guest's lifetime won't have
+ * any additonal overhead from PML when guest is running with dirty
+ * logging disabled for memory slots.
+ *
+ * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot
+ * to dirty logging mode.
+ *
+ * If kvm_x86_ops dirty logging hooks are invalid, use write protect.
+ *
+ * In case of write protect:
+ *
+ * Write protect all pages for dirty logging.
+ *
+ * All the sptes including the large sptes which point to this
+ * slot are set to readonly. We can not create any new large
+ * spte on this slot until the end of the logging.
+ *
+ * See the comments in fast_page_fault().
+ */
+ if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+ if (kvm_x86_ops->slot_enable_log_dirty)
+ kvm_x86_ops->slot_enable_log_dirty(kvm, new);
+ else
+ kvm_mmu_slot_remove_write_access(kvm, new);
+ } else {
+ if (kvm_x86_ops->slot_disable_log_dirty)
+ kvm_x86_ops->slot_disable_log_dirty(kvm, new);
+ }
+}
+
void kvm_arch_commit_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
const struct kvm_memory_slot *old,
enum kvm_mr_change change)
{
-
+ struct kvm_memory_slot *new;
int nr_mmu_pages = 0;
if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) {
@@ -7573,17 +7614,20 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
if (nr_mmu_pages)
kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
+
+ /* It's OK to get 'new' slot here as it has already been installed */
+ new = id_to_memslot(kvm->memslots, mem->slot);
+
/*
- * Write protect all pages for dirty logging.
+ * Set up write protection and/or dirty logging for the new slot.
*
- * All the sptes including the large sptes which point to this
- * slot are set to readonly. We can not create any new large
- * spte on this slot until the end of the logging.
- *
- * See the comments in fast_page_fault().
+ * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have
+ * been zapped so no dirty logging staff is needed for old slot. For
+ * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
+ * new and it's also covered when dealing with the new slot.
*/
- if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
- kvm_mmu_slot_remove_write_access(kvm, mem->slot);
+ if (change != KVM_MR_DELETE)
+ kvm_mmu_slot_apply_flags(kvm, new);
}
void kvm_arch_flush_shadow_all(struct kvm *kvm)
@@ -7837,3 +7881,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index cc1d61af6140..f5fef1868096 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -147,6 +147,7 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
+void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
@@ -170,5 +171,7 @@ extern u64 kvm_supported_xcr0(void);
extern unsigned int min_timer_period_us;
+extern unsigned int lapic_timer_advance_ns;
+
extern struct static_key kvm_no_apic_vcpu;
#endif