diff options
Diffstat (limited to 'arch')
40 files changed, 873 insertions, 522 deletions
diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index a4217c1a5d01..2769360f195c 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -266,8 +266,10 @@ struct kvm_vcpu_events { #define KVM_DEV_ARM_ITS_CTRL_RESET 4 /* KVM_IRQ_LINE irq field index values */ +#define KVM_ARM_IRQ_VCPU2_SHIFT 28 +#define KVM_ARM_IRQ_VCPU2_MASK 0xf #define KVM_ARM_IRQ_TYPE_SHIFT 24 -#define KVM_ARM_IRQ_TYPE_MASK 0xff +#define KVM_ARM_IRQ_TYPE_MASK 0xf #define KVM_ARM_IRQ_VCPU_SHIFT 16 #define KVM_ARM_IRQ_VCPU_MASK 0xff #define KVM_ARM_IRQ_NUM_SHIFT 0 diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 92d2e9f28f28..9a21b84536f2 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -77,7 +77,7 @@ }) #define PAGE_S2 __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(NORMAL) | PTE_S2_RDONLY | PAGE_S2_XN) -#define PAGE_S2_DEVICE __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(DEVICE_nGnRE) | PTE_S2_RDONLY | PAGE_S2_XN) +#define PAGE_S2_DEVICE __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_S2_XN) #define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) #define PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 9a507716ae2f..67c21f9bdbad 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -325,8 +325,10 @@ struct kvm_vcpu_events { #define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1 /* KVM_IRQ_LINE irq field index values */ +#define KVM_ARM_IRQ_VCPU2_SHIFT 28 +#define KVM_ARM_IRQ_VCPU2_MASK 0xf #define KVM_ARM_IRQ_TYPE_SHIFT 24 -#define KVM_ARM_IRQ_TYPE_MASK 0xff +#define KVM_ARM_IRQ_TYPE_MASK 0xf #define KVM_ARM_IRQ_VCPU_SHIFT 16 #define KVM_ARM_IRQ_VCPU_MASK 0xff #define KVM_ARM_IRQ_NUM_SHIFT 0 diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c index d49a14497715..c466060b76d6 100644 --- a/arch/arm64/kvm/hyp/tlb.c +++ b/arch/arm64/kvm/hyp/tlb.c @@ -193,6 +193,18 @@ void __hyp_text __kvm_flush_vm_context(void) { dsb(ishst); __tlbi(alle1is); - asm volatile("ic ialluis" : : ); + + /* + * VIPT and PIPT caches are not affected by VMID, so no maintenance + * is necessary across a VMID rollover. + * + * VPIPT caches constrain lookup and maintenance to the active VMID, + * so we need to invalidate lines with a stale VMID to avoid an ABA + * race after multiple rollovers. + * + */ + if (icache_is_vpipt()) + asm volatile("ic ialluis"); + dsb(ish); } diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index e6e5f59aaa97..6fb5fb4779e0 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -232,11 +232,25 @@ struct revmap_entry { }; /* - * We use the top bit of each memslot->arch.rmap entry as a lock bit, - * and bit 32 as a present flag. The bottom 32 bits are the - * index in the guest HPT of a HPTE that points to the page. + * The rmap array of size number of guest pages is allocated for each memslot. + * This array is used to store usage specific information about the guest page. + * Below are the encodings of the various possible usage types. */ -#define KVMPPC_RMAP_LOCK_BIT 63 +/* Free bits which can be used to define a new usage */ +#define KVMPPC_RMAP_TYPE_MASK 0xff00000000000000 +#define KVMPPC_RMAP_NESTED 0xc000000000000000 /* Nested rmap array */ +#define KVMPPC_RMAP_HPT 0x0100000000000000 /* HPT guest */ + +/* + * rmap usage definition for a hash page table (hpt) guest: + * 0x0000080000000000 Lock bit + * 0x0000018000000000 RC bits + * 0x0000000100000000 Present bit + * 0x00000000ffffffff HPT index bits + * The bottom 32 bits are the index in the guest HPT of a HPTE that points to + * the page. + */ +#define KVMPPC_RMAP_LOCK_BIT 43 #define KVMPPC_RMAP_RC_SHIFT 32 #define KVMPPC_RMAP_REFERENCED (HPTE_R_R << KVMPPC_RMAP_RC_SHIFT) #define KVMPPC_RMAP_PRESENT 0x100000000ul diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2484e6a8f5ca..8e8514efb124 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -598,6 +598,7 @@ extern int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val); extern int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val); +extern bool kvmppc_xive_native_supported(void); #else static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index e4016985764e..818989e11678 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -46,7 +46,15 @@ struct xive_irq_data { /* Setup/used by frontend */ int target; + /* + * saved_p means that there is a queue entry for this interrupt + * in some CPU's queue (not including guest vcpu queues), even + * if P is not set in the source ESB. + * stale_p means that there is no queue entry for this interrupt + * in some CPU's queue, even if P is set in the source ESB. + */ bool saved_p; + bool stale_p; }; #define XIVE_IRQ_FLAG_STORE_EOI 0x01 #define XIVE_IRQ_FLAG_LSI 0x02 @@ -127,6 +135,7 @@ extern int xive_native_get_queue_state(u32 vp_id, uint32_t prio, u32 *qtoggle, extern int xive_native_set_queue_state(u32 vp_id, uint32_t prio, u32 qtoggle, u32 qindex); extern int xive_native_get_vp_state(u32 vp_id, u64 *out_state); +extern bool xive_native_has_queue_state_support(void); #else diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 9524d92bc45d..d7fcdfa7fee4 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -1083,9 +1083,11 @@ static int kvmppc_book3s_init(void) if (xics_on_xive()) { kvmppc_xive_init_module(); kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS); - kvmppc_xive_native_init_module(); - kvm_register_device_ops(&kvm_xive_native_ops, - KVM_DEV_TYPE_XIVE); + if (kvmppc_xive_native_supported()) { + kvmppc_xive_native_init_module(); + kvm_register_device_ops(&kvm_xive_native_ops, + KVM_DEV_TYPE_XIVE); + } } else #endif kvm_register_device_ops(&kvm_xics_ops, KVM_DEV_TYPE_XICS); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index cde3f5a4b3e4..f8975c620f41 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1678,7 +1678,14 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, *val = get_reg_val(id, vcpu->arch.pspb); break; case KVM_REG_PPC_DPDES: - *val = get_reg_val(id, vcpu->arch.vcore->dpdes); + /* + * On POWER9, where we are emulating msgsndp etc., + * we return 1 bit for each vcpu, which can come from + * either vcore->dpdes or doorbell_request. + * On POWER8, doorbell_request is 0. + */ + *val = get_reg_val(id, vcpu->arch.vcore->dpdes | + vcpu->arch.doorbell_request); break; case KVM_REG_PPC_VTB: *val = get_reg_val(id, vcpu->arch.vcore->vtb); @@ -2860,7 +2867,7 @@ static void collect_piggybacks(struct core_info *cip, int target_threads) if (!spin_trylock(&pvc->lock)) continue; prepare_threads(pvc); - if (!pvc->n_runnable) { + if (!pvc->n_runnable || !pvc->kvm->arch.mmu_ready) { list_del_init(&pvc->preempt_list); if (pvc->runner == NULL) { pvc->vcore_state = VCORE_INACTIVE; @@ -2881,15 +2888,20 @@ static void collect_piggybacks(struct core_info *cip, int target_threads) spin_unlock(&lp->lock); } -static bool recheck_signals(struct core_info *cip) +static bool recheck_signals_and_mmu(struct core_info *cip) { int sub, i; struct kvm_vcpu *vcpu; + struct kvmppc_vcore *vc; - for (sub = 0; sub < cip->n_subcores; ++sub) - for_each_runnable_thread(i, vcpu, cip->vc[sub]) + for (sub = 0; sub < cip->n_subcores; ++sub) { + vc = cip->vc[sub]; + if (!vc->kvm->arch.mmu_ready) + return true; + for_each_runnable_thread(i, vcpu, vc) if (signal_pending(vcpu->arch.run_task)) return true; + } return false; } @@ -3119,7 +3131,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) local_irq_disable(); hard_irq_disable(); if (lazy_irq_pending() || need_resched() || - recheck_signals(&core_info) || !vc->kvm->arch.mmu_ready) { + recheck_signals_and_mmu(&core_info)) { local_irq_enable(); vc->vcore_state = VCORE_INACTIVE; /* Unlock all except the primary vcore */ diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 63e0ce91e29d..7186c65c61c9 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -99,7 +99,7 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, } else { rev->forw = rev->back = pte_index; *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | - pte_index | KVMPPC_RMAP_PRESENT; + pte_index | KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_HPT; } unlock_rmap(rmap); } diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 337e64468d78..07181d0dfcb7 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -942,6 +942,8 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) ld r11, VCPU_XIVE_SAVED_STATE(r4) li r9, TM_QW1_OS lwz r8, VCPU_XIVE_CAM_WORD(r4) + cmpwi r8, 0 + beq no_xive li r7, TM_QW1_OS + TM_WORD2 mfmsr r0 andi. r0, r0, MSR_DR /* in real mode? */ @@ -2831,29 +2833,39 @@ kvm_cede_prodded: kvm_cede_exit: ld r9, HSTATE_KVM_VCPU(r13) #ifdef CONFIG_KVM_XICS - /* Abort if we still have a pending escalation */ + /* are we using XIVE with single escalation? */ + ld r10, VCPU_XIVE_ESC_VADDR(r9) + cmpdi r10, 0 + beq 3f + li r6, XIVE_ESB_SET_PQ_00 + /* + * If we still have a pending escalation, abort the cede, + * and we must set PQ to 10 rather than 00 so that we don't + * potentially end up with two entries for the escalation + * interrupt in the XIVE interrupt queue. In that case + * we also don't want to set xive_esc_on to 1 here in + * case we race with xive_esc_irq(). + */ lbz r5, VCPU_XIVE_ESC_ON(r9) cmpwi r5, 0 - beq 1f + beq 4f li r0, 0 stb r0, VCPU_CEDED(r9) -1: /* Enable XIVE escalation */ - li r5, XIVE_ESB_SET_PQ_00 + li r6, XIVE_ESB_SET_PQ_10 + b 5f +4: li r0, 1 + stb r0, VCPU_XIVE_ESC_ON(r9) + /* make sure store to xive_esc_on is seen before xive_esc_irq runs */ + sync +5: /* Enable XIVE escalation */ mfmsr r0 andi. r0, r0, MSR_DR /* in real mode? */ beq 1f - ld r10, VCPU_XIVE_ESC_VADDR(r9) - cmpdi r10, 0 - beq 3f - ldx r0, r10, r5 + ldx r0, r10, r6 b 2f 1: ld r10, VCPU_XIVE_ESC_RADDR(r9) - cmpdi r10, 0 - beq 3f - ldcix r0, r10, r5 + ldcix r0, r10, r6 2: sync - li r0, 1 - stb r0, VCPU_XIVE_ESC_ON(r9) #endif /* CONFIG_KVM_XICS */ 3: b guest_exit_cont diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index e3ba67095895..591bfb4bfd0f 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -67,8 +67,14 @@ void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt; u64 pq; - if (!tima) + /* + * Nothing to do if the platform doesn't have a XIVE + * or this vCPU doesn't have its own XIVE context + * (e.g. because it's not using an in-kernel interrupt controller). + */ + if (!tima || !vcpu->arch.xive_cam_word) return; + eieio(); __raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS); __raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2); @@ -160,6 +166,9 @@ static irqreturn_t xive_esc_irq(int irq, void *data) */ vcpu->arch.xive_esc_on = false; + /* This orders xive_esc_on = false vs. subsequent stale_p = true */ + smp_wmb(); /* goes with smp_mb() in cleanup_single_escalation */ + return IRQ_HANDLED; } @@ -1113,6 +1122,31 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu) vcpu->arch.xive_esc_raddr = 0; } +/* + * In single escalation mode, the escalation interrupt is marked so + * that EOI doesn't re-enable it, but just sets the stale_p flag to + * indicate that the P bit has already been dealt with. However, the + * assembly code that enters the guest sets PQ to 00 without clearing + * stale_p (because it has no easy way to address it). Hence we have + * to adjust stale_p before shutting down the interrupt. + */ +void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, + struct kvmppc_xive_vcpu *xc, int irq) +{ + struct irq_data *d = irq_get_irq_data(irq); + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + + /* + * This slightly odd sequence gives the right result + * (i.e. stale_p set if xive_esc_on is false) even if + * we race with xive_esc_irq() and xive_irq_eoi(). + */ + xd->stale_p = false; + smp_mb(); /* paired with smb_wmb in xive_esc_irq */ + if (!vcpu->arch.xive_esc_on) + xd->stale_p = true; +} + void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) { struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; @@ -1134,20 +1168,28 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) /* Mask the VP IPI */ xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_01); - /* Disable the VP */ - xive_native_disable_vp(xc->vp_id); - - /* Free the queues & associated interrupts */ + /* Free escalations */ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { - struct xive_q *q = &xc->queues[i]; - - /* Free the escalation irq */ if (xc->esc_virq[i]) { + if (xc->xive->single_escalation) + xive_cleanup_single_escalation(vcpu, xc, + xc->esc_virq[i]); free_irq(xc->esc_virq[i], vcpu); irq_dispose_mapping(xc->esc_virq[i]); kfree(xc->esc_virq_names[i]); } - /* Free the queue */ + } + + /* Disable the VP */ + xive_native_disable_vp(xc->vp_id); + + /* Clear the cam word so guest entry won't try to push context */ + vcpu->arch.xive_cam_word = 0; + + /* Free the queues */ + for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { + struct xive_q *q = &xc->queues[i]; + xive_native_disable_queue(xc->vp_id, q, i); if (q->qpage) { free_pages((unsigned long)q->qpage, diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h index 50494d0ee375..955b820ffd6d 100644 --- a/arch/powerpc/kvm/book3s_xive.h +++ b/arch/powerpc/kvm/book3s_xive.h @@ -282,6 +282,8 @@ int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio); int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio, bool single_escalation); struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type); +void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, + struct kvmppc_xive_vcpu *xc, int irq); #endif /* CONFIG_KVM_XICS */ #endif /* _KVM_PPC_BOOK3S_XICS_H */ diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index a998823f68a3..248c1ea9e788 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -67,20 +67,28 @@ void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) xc->valid = false; kvmppc_xive_disable_vcpu_interrupts(vcpu); - /* Disable the VP */ - xive_native_disable_vp(xc->vp_id); - - /* Free the queues & associated interrupts */ + /* Free escalations */ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { /* Free the escalation irq */ if (xc->esc_virq[i]) { + if (xc->xive->single_escalation) + xive_cleanup_single_escalation(vcpu, xc, + xc->esc_virq[i]); free_irq(xc->esc_virq[i], vcpu); irq_dispose_mapping(xc->esc_virq[i]); kfree(xc->esc_virq_names[i]); xc->esc_virq[i] = 0; } + } + + /* Disable the VP */ + xive_native_disable_vp(xc->vp_id); + + /* Clear the cam word so guest entry won't try to push context */ + vcpu->arch.xive_cam_word = 0; - /* Free the queue */ + /* Free the queues */ + for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { kvmppc_xive_native_cleanup_queue(vcpu, i); } @@ -1171,6 +1179,11 @@ int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val) return 0; } +bool kvmppc_xive_native_supported(void) +{ + return xive_native_has_queue_state_support(); +} + static int xive_native_debug_show(struct seq_file *m, void *private) { struct kvmppc_xive *xive = m->private; diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index b5a848a55504..00649ca5fa9a 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -440,6 +440,9 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_e500(struct kvm *kvm, struct kvm_vcpu *vcpu; int err; + BUILD_BUG_ON_MSG(offsetof(struct kvmppc_vcpu_e500, vcpu) != 0, + "struct kvm_vcpu must be at offset 0 for arch usercopy region"); + vcpu_e500 = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); if (!vcpu_e500) { err = -ENOMEM; diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index bb4d09c1ad56..6fca38ca791f 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -271,6 +271,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) */ if (inst == KVMPPC_INST_SW_BREAKPOINT) { run->exit_reason = KVM_EXIT_DEBUG; + run->debug.arch.status = 0; run->debug.arch.address = kvmppc_get_pc(vcpu); emulated = EMULATE_EXIT_USER; advance = 0; diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c index 9208c82ed08d..2e496eb86e94 100644 --- a/arch/powerpc/kvm/emulate_loadstore.c +++ b/arch/powerpc/kvm/emulate_loadstore.c @@ -89,12 +89,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) rs = get_rs(inst); rt = get_rt(inst); - /* - * if mmio_vsx_tx_sx_enabled == 0, copy data between - * VSR[0..31] and memory - * if mmio_vsx_tx_sx_enabled == 1, copy data between - * VSR[32..63] and memory - */ vcpu->arch.mmio_vsx_copy_nums = 0; vcpu->arch.mmio_vsx_offset = 0; vcpu->arch.mmio_copy_type = KVMPPC_VSX_COPY_NONE; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 3e566c2e6066..3a77bb643452 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -561,7 +561,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) * a POWER9 processor) and the PowerNV platform, as * nested is not yet supported. */ - r = xive_enabled() && !!cpu_has_feature(CPU_FTR_HVMODE); + r = xive_enabled() && !!cpu_has_feature(CPU_FTR_HVMODE) && + kvmppc_xive_native_supported(); break; #endif diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 1cdb39575eae..be86fce1a84e 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -135,7 +135,7 @@ static u32 xive_read_eq(struct xive_q *q, bool just_peek) static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek) { u32 irq = 0; - u8 prio; + u8 prio = 0; /* Find highest pending priority */ while (xc->pending_prio != 0) { @@ -148,8 +148,19 @@ static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek) irq = xive_read_eq(&xc->queue[prio], just_peek); /* Found something ? That's it */ - if (irq) - break; + if (irq) { + if (just_peek || irq_to_desc(irq)) + break; + /* + * We should never get here; if we do then we must + * have failed to synchronize the interrupt properly + * when shutting it down. + */ + pr_crit("xive: got interrupt %d without descriptor, dropping\n", + irq); + WARN_ON(1); + continue; + } /* Clear pending bits */ xc->pending_prio &= ~(1 << prio); @@ -307,6 +318,7 @@ static void xive_do_queue_eoi(struct xive_cpu *xc) */ static void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd) { + xd->stale_p = false; /* If the XIVE supports the new "store EOI facility, use it */ if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI) xive_esb_write(xd, XIVE_ESB_STORE_EOI, 0); @@ -350,7 +362,7 @@ static void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd) } } -/* irq_chip eoi callback */ +/* irq_chip eoi callback, called with irq descriptor lock held */ static void xive_irq_eoi(struct irq_data *d) { struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); @@ -366,6 +378,8 @@ static void xive_irq_eoi(struct irq_data *d) if (!irqd_irq_disabled(d) && !irqd_is_forwarded_to_vcpu(d) && !(xd->flags & XIVE_IRQ_NO_EOI)) xive_do_source_eoi(irqd_to_hwirq(d), xd); + else + xd->stale_p = true; /* * Clear saved_p to indicate that it's no longer occupying @@ -397,11 +411,16 @@ static void xive_do_source_set_mask(struct xive_irq_data *xd, */ if (mask) { val = xive_esb_read(xd, XIVE_ESB_SET_PQ_01); - xd->saved_p = !!(val & XIVE_ESB_VAL_P); - } else if (xd->saved_p) + if (!xd->stale_p && !!(val & XIVE_ESB_VAL_P)) + xd->saved_p = true; + xd->stale_p = false; + } else if (xd->saved_p) { xive_esb_read(xd, XIVE_ESB_SET_PQ_10); - else + xd->saved_p = false; + } else { xive_esb_read(xd, XIVE_ESB_SET_PQ_00); + xd->stale_p = false; + } } /* @@ -541,6 +560,8 @@ static unsigned int xive_irq_startup(struct irq_data *d) unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); int target, rc; + xd->saved_p = false; + xd->stale_p = false; pr_devel("xive_irq_startup: irq %d [0x%x] data @%p\n", d->irq, hw_irq, d); @@ -587,6 +608,7 @@ static unsigned int xive_irq_startup(struct irq_data *d) return 0; } +/* called with irq descriptor lock held */ static void xive_irq_shutdown(struct irq_data *d) { struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); @@ -602,16 +624,6 @@ static void xive_irq_shutdown(struct irq_data *d) xive_do_source_set_mask(xd, true); /* - * The above may have set saved_p. We clear it otherwise it - * will prevent re-enabling later on. It is ok to forget the - * fact that the interrupt might be in a queue because we are - * accounting that already in xive_dec_target_count() and will - * be re-routing it to a new queue with proper accounting when - * it's started up again - */ - xd->saved_p = false; - - /* * Mask the interrupt in HW in the IVT/EAS and set the number * to be the "bad" IRQ number */ @@ -797,6 +809,10 @@ static int xive_irq_retrigger(struct irq_data *d) return 1; } +/* + * Caller holds the irq descriptor lock, so this won't be called + * concurrently with xive_get_irqchip_state on the same interrupt. + */ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) { struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); @@ -820,6 +836,10 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) /* Set it to PQ=10 state to prevent further sends */ pq = xive_esb_read(xd, XIVE_ESB_SET_PQ_10); + if (!xd->stale_p) { + xd->saved_p = !!(pq & XIVE_ESB_VAL_P); + xd->stale_p = !xd->saved_p; + } /* No target ? nothing to do */ if (xd->target == XIVE_INVALID_TARGET) { @@ -827,7 +847,7 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) * An untargetted interrupt should have been * also masked at the source */ - WARN_ON(pq & 2); + WARN_ON(xd->saved_p); return 0; } @@ -847,9 +867,8 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) * This saved_p is cleared by the host EOI, when we know * for sure the queue slot is no longer in use. */ - if (pq & 2) { - pq = xive_esb_read(xd, XIVE_ESB_SET_PQ_11); - xd->saved_p = true; + if (xd->saved_p) { + xive_esb_read(xd, XIVE_ESB_SET_PQ_11); /* * Sync the XIVE source HW to ensure the interrupt @@ -862,8 +881,7 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) */ if (xive_ops->sync_source) xive_ops->sync_source(hw_irq); - } else - xd->saved_p = false; + } } else { irqd_clr_forwarded_to_vcpu(d); @@ -914,6 +932,23 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) return 0; } +/* Called with irq descriptor lock held. */ +static int xive_get_irqchip_state(struct irq_data *data, + enum irqchip_irq_state which, bool *state) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(data); + + switch (which) { + case IRQCHIP_STATE_ACTIVE: + *state = !xd->stale_p && + (xd->saved_p || + !!(xive_esb_read(xd, XIVE_ESB_GET) & XIVE_ESB_VAL_P)); + return 0; + default: + return -EINVAL; + } +} + static struct irq_chip xive_irq_chip = { .name = "XIVE-IRQ", .irq_startup = xive_irq_startup, @@ -925,6 +960,7 @@ static struct irq_chip xive_irq_chip = { .irq_set_type = xive_irq_set_type, .irq_retrigger = xive_irq_retrigger, .irq_set_vcpu_affinity = xive_irq_set_vcpu_affinity, + .irq_get_irqchip_state = xive_get_irqchip_state, }; bool is_xive_irq(struct irq_chip *chip) @@ -1338,6 +1374,11 @@ static void xive_flush_cpu_queue(unsigned int cpu, struct xive_cpu *xc) xd = irq_desc_get_handler_data(desc); /* + * Clear saved_p to indicate that it's no longer pending + */ + xd->saved_p = false; + + /* * For LSIs, we EOI, this will cause a resend if it's * still asserted. Otherwise do an MSI retrigger. */ diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 2f26b74f6cfa..37987c815913 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -800,6 +800,13 @@ int xive_native_set_queue_state(u32 vp_id, u32 prio, u32 qtoggle, u32 qindex) } EXPORT_SYMBOL_GPL(xive_native_set_queue_state); +bool xive_native_has_queue_state_support(void) +{ + return opal_check_token(OPAL_XIVE_GET_QUEUE_STATE) && + opal_check_token(OPAL_XIVE_SET_QUEUE_STATE); +} +EXPORT_SYMBOL_GPL(xive_native_has_queue_state_support); + int xive_native_get_vp_state(u32 vp_id, u64 *out_state) { __be64 state; diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 47104e5b47fd..436ec7636927 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -231,6 +231,12 @@ struct kvm_guest_debug_arch { #define KVM_SYNC_GSCB (1UL << 9) #define KVM_SYNC_BPBC (1UL << 10) #define KVM_SYNC_ETOKEN (1UL << 11) + +#define KVM_SYNC_S390_VALID_FIELDS \ + (KVM_SYNC_PREFIX | KVM_SYNC_GPRS | KVM_SYNC_ACRS | KVM_SYNC_CRS | \ + KVM_SYNC_ARCH0 | KVM_SYNC_PFAULT | KVM_SYNC_VRS | KVM_SYNC_RICCB | \ + KVM_SYNC_FPRS | KVM_SYNC_GSCB | KVM_SYNC_BPBC | KVM_SYNC_ETOKEN) + /* length and alignment of the sdnx as a power of two */ #define SDNXC 8 #define SDNXL (1UL << SDNXC) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 39cff07bf2eb..f6db0f1bc867 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -4000,6 +4000,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (kvm_run->immediate_exit) return -EINTR; + if (kvm_run->kvm_valid_regs & ~KVM_SYNC_S390_VALID_FIELDS || + kvm_run->kvm_dirty_regs & ~KVM_SYNC_S390_VALID_FIELDS) + return -EINVAL; + vcpu_load(vcpu); if (guestdbg_exit_pending(vcpu)) { @@ -4257,7 +4261,7 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu, const u64 supported_flags = KVM_S390_MEMOP_F_INJECT_EXCEPTION | KVM_S390_MEMOP_F_CHECK_ONLY; - if (mop->flags & ~supported_flags) + if (mop->flags & ~supported_flags || mop->ar >= NUM_ACRS || !mop->size) return -EINVAL; if (mop->size > MEM_OP_MAX_SIZE) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index feab24cac610..77cf6c11f66b 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -229,7 +229,7 @@ struct x86_emulate_ops { int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, const char *smstate); void (*post_leave_smm)(struct x86_emulate_ctxt *ctxt); - + int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); }; typedef u32 __attribute__((vector_size(16))) sse128_t; @@ -429,6 +429,7 @@ enum x86_intercept { x86_intercept_ins, x86_intercept_out, x86_intercept_outs, + x86_intercept_xsetbv, nr_x86_intercepts }; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index bdc16b0aa7c6..a3a3ec73fa2f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -718,7 +718,7 @@ struct kvm_vcpu_arch { /* Cache MMIO info */ u64 mmio_gva; - unsigned access; + unsigned mmio_access; gfn_t mmio_gfn; u64 mmio_gen; @@ -1072,7 +1072,7 @@ struct kvm_x86_ops { void (*run)(struct kvm_vcpu *vcpu); int (*handle_exit)(struct kvm_vcpu *vcpu); - void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); + int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu); void (*patch_hypercall)(struct kvm_vcpu *vcpu, @@ -1211,6 +1211,8 @@ struct kvm_x86_ops { uint16_t (*nested_get_evmcs_version)(struct kvm_vcpu *vcpu); bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu); + + bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); }; struct kvm_arch_async_pf { @@ -1328,8 +1330,10 @@ int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, void kvm_enable_efer_bits(u64); bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); -int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); -int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); +int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data); +int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data); +int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu); +int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu); struct x86_emulate_ctxt; @@ -1583,6 +1587,13 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq); +static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq) +{ + /* We can only post Fixed and LowPrio IRQs */ + return (irq->delivery_mode == dest_Fixed || + irq->delivery_mode == dest_LowestPrio); +} + static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) { if (kvm_x86_ops->vcpu_blocking) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index a39136b0d509..b15e6465870f 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -562,6 +562,20 @@ enum vm_instruction_error_number { VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID = 28, }; +/* + * VM-instruction errors that can be encountered on VM-Enter, used to trace + * nested VM-Enter failures reported by hardware. Errors unique to VM-Enter + * from a SMI Transfer Monitor are not included as things have gone seriously + * sideways if we get one of those... + */ +#define VMX_VMENTER_INSTRUCTION_ERRORS \ + { VMXERR_VMLAUNCH_NONCLEAR_VMCS, "VMLAUNCH_NONCLEAR_VMCS" }, \ + { VMXERR_VMRESUME_NONLAUNCHED_VMCS, "VMRESUME_NONLAUNCHED_VMCS" }, \ + { VMXERR_VMRESUME_AFTER_VMXOFF, "VMRESUME_AFTER_VMXOFF" }, \ + { VMXERR_ENTRY_INVALID_CONTROL_FIELD, "VMENTRY_INVALID_CONTROL_FIELD" }, \ + { VMXERR_ENTRY_INVALID_HOST_STATE_FIELD, "VMENTRY_INVALID_HOST_STATE_FIELD" }, \ + { VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS, "VMENTRY_EVENTS_BLOCKED_BY_MOV_SS" } + enum vmx_l1d_flush_state { VMENTER_L1D_FLUSH_AUTO, VMENTER_L1D_FLUSH_NEVER, diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index f0b0c90dd398..f01950aa7fae 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -31,6 +31,7 @@ #define EXIT_REASON_EXCEPTION_NMI 0 #define EXIT_REASON_EXTERNAL_INTERRUPT 1 #define EXIT_REASON_TRIPLE_FAULT 2 +#define EXIT_REASON_INIT_SIGNAL 3 #define EXIT_REASON_PENDING_INTERRUPT 7 #define EXIT_REASON_NMI_WINDOW 8 @@ -90,6 +91,7 @@ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \ { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \ + { EXIT_REASON_INIT_SIGNAL, "INIT_SIGNAL" }, \ { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \ { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \ { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index b2f56602af65..e820568ed4d5 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -502,16 +502,6 @@ static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) __send_ipi_mask(local_mask, vector); } -static void kvm_send_ipi_allbutself(int vector) -{ - kvm_send_ipi_mask_allbutself(cpu_online_mask, vector); -} - -static void kvm_send_ipi_all(int vector) -{ - __send_ipi_mask(cpu_online_mask, vector); -} - /* * Set the IPI entry points */ @@ -519,8 +509,6 @@ static void kvm_setup_pv_ipi(void) { apic->send_IPI_mask = kvm_send_ipi_mask; apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself; - apic->send_IPI_allbutself = kvm_send_ipi_allbutself; - apic->send_IPI_all = kvm_send_ipi_all; pr_info("KVM setup pv IPIs\n"); } diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 22c2720cd948..dd5985eb61b4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -392,6 +392,12 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) entry->edx &= kvm_cpuid_7_0_edx_x86_features; cpuid_mask(&entry->edx, CPUID_7_EDX); + if (boot_cpu_has(X86_FEATURE_IBPB) && boot_cpu_has(X86_FEATURE_IBRS)) + entry->edx |= F(SPEC_CTRL); + if (boot_cpu_has(X86_FEATURE_STIBP)) + entry->edx |= F(INTEL_STIBP); + if (boot_cpu_has(X86_FEATURE_SSBD)) + entry->edx |= F(SPEC_CTRL_SSBD); /* * We emulate ARCH_CAPABILITIES in software even * if the host doesn't support it. @@ -729,18 +735,23 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, g_phys_as = phys_as; entry->eax = g_phys_as | (virt_as << 8); entry->edx = 0; + entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; + cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); /* - * IBRS, IBPB and VIRT_SSBD aren't necessarily present in - * hardware cpuid + * AMD has separate bits for each SPEC_CTRL bit. + * arch/x86/kernel/cpu/bugs.c is kind enough to + * record that in cpufeatures so use them. */ - if (boot_cpu_has(X86_FEATURE_AMD_IBPB)) + if (boot_cpu_has(X86_FEATURE_IBPB)) entry->ebx |= F(AMD_IBPB); - if (boot_cpu_has(X86_FEATURE_AMD_IBRS)) + if (boot_cpu_has(X86_FEATURE_IBRS)) entry->ebx |= F(AMD_IBRS); - if (boot_cpu_has(X86_FEATURE_VIRT_SSBD)) - entry->ebx |= F(VIRT_SSBD); - entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; - cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); + if (boot_cpu_has(X86_FEATURE_STIBP)) + entry->ebx |= F(AMD_STIBP); + if (boot_cpu_has(X86_FEATURE_SSBD)) + entry->ebx |= F(AMD_SSBD); + if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + entry->ebx |= F(AMD_SSB_NO); /* * The preference is to use SPEC CTRL MSR instead of the * VIRT_SPEC MSR. diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 718f7d9afedc..698efb8c3897 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4156,6 +4156,20 @@ out: return rc; } +static int em_xsetbv(struct x86_emulate_ctxt *ctxt) +{ + u32 eax, ecx, edx; + + eax = reg_read(ctxt, VCPU_REGS_RAX); + edx = reg_read(ctxt, VCPU_REGS_RDX); + ecx = reg_read(ctxt, VCPU_REGS_RCX); + + if (ctxt->ops->set_xcr(ctxt, ecx, ((u64)edx << 32) | eax)) + return emulate_gp(ctxt, 0); + + return X86EMUL_CONTINUE; +} + static bool valid_cr(int nr) { switch (nr) { @@ -4409,6 +4423,12 @@ static const struct opcode group7_rm1[] = { N, N, N, N, N, N, }; +static const struct opcode group7_rm2[] = { + N, + II(ImplicitOps | Priv, em_xsetbv, xsetbv), + N, N, N, N, N, N, +}; + static const struct opcode group7_rm3[] = { DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa), II(SrcNone | Prot | EmulateOnUD, em_hypercall, vmmcall), @@ -4498,7 +4518,8 @@ static const struct group_dual group7 = { { }, { EXT(0, group7_rm0), EXT(0, group7_rm1), - N, EXT(0, group7_rm3), + EXT(0, group7_rm2), + EXT(0, group7_rm3), II(SrcNone | DstMem | Mov, em_smsw, smsw), N, II(SrcMem16 | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7), @@ -5144,7 +5165,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) else { rc = __do_insn_fetch_bytes(ctxt, 1); if (rc != X86EMUL_CONTINUE) - return rc; + goto done; } switch (mode) { @@ -5395,6 +5416,8 @@ done_prefixes: ctxt->memopp->addr.mem.ea + ctxt->_eip); done: + if (rc == X86EMUL_PROPAGATE_FAULT) + ctxt->have_exception = true; return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK; } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2a4f278f3b56..8675458c2205 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1198,10 +1198,8 @@ void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector) } EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated); -static void apic_send_ipi(struct kvm_lapic *apic) +static void apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) { - u32 icr_low = kvm_lapic_get_reg(apic, APIC_ICR); - u32 icr_high = kvm_lapic_get_reg(apic, APIC_ICR2); struct kvm_lapic_irq irq; irq.vector = icr_low & APIC_VECTOR_MASK; @@ -1914,8 +1912,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) } case APIC_ICR: /* No delay here, so we always clear the pending bit */ - kvm_lapic_set_reg(apic, APIC_ICR, val & ~(1 << 12)); - apic_send_ipi(apic); + val &= ~(1 << 12); + apic_send_ipi(apic, val, kvm_lapic_get_reg(apic, APIC_ICR2)); + kvm_lapic_set_reg(apic, APIC_ICR, val); break; case APIC_ICR2: @@ -2707,11 +2706,14 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) return; /* - * INITs are latched while in SMM. Because an SMM CPU cannot - * be in KVM_MP_STATE_INIT_RECEIVED state, just eat SIPIs - * and delay processing of INIT until the next RSM. + * INITs are latched while CPU is in specific states + * (SMM, VMX non-root mode, SVM with GIF=0). + * Because a CPU cannot be in these states immediately + * after it has processed an INIT signal (and thus in + * KVM_MP_STATE_INIT_RECEIVED state), just eat SIPIs + * and leave the INIT pending. */ - if (is_smm(vcpu)) { + if (is_smm(vcpu) || kvm_x86_ops->apic_init_signal_blocked(vcpu)) { WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED); if (test_bit(KVM_APIC_SIPI, &apic->pending_events)) clear_bit(KVM_APIC_SIPI, &apic->pending_events); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a63964e7cec7..a10af9c87f8a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -214,6 +214,7 @@ static u64 __read_mostly shadow_accessed_mask; static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; static u64 __read_mostly shadow_mmio_value; +static u64 __read_mostly shadow_mmio_access_mask; static u64 __read_mostly shadow_present_mask; static u64 __read_mostly shadow_me_mask; @@ -299,14 +300,21 @@ static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, kvm_flush_remote_tlbs_with_range(kvm, &range); } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value) +void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask) { + BUG_ON((u64)(unsigned)access_mask != access_mask); BUG_ON((mmio_mask & mmio_value) != mmio_value); shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK; shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; + shadow_mmio_access_mask = access_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +static bool is_mmio_spte(u64 spte) +{ + return (spte & shadow_mmio_mask) == shadow_mmio_value; +} + static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) { return sp->role.ad_disabled; @@ -314,19 +322,19 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) static inline bool spte_ad_enabled(u64 spte) { - MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + MMU_WARN_ON(is_mmio_spte(spte)); return !(spte & shadow_acc_track_value); } static inline u64 spte_shadow_accessed_mask(u64 spte) { - MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + MMU_WARN_ON(is_mmio_spte(spte)); return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; } static inline u64 spte_shadow_dirty_mask(u64 spte) { - MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + MMU_WARN_ON(is_mmio_spte(spte)); return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; } @@ -389,7 +397,7 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, u64 mask = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; - access &= ACC_WRITE_MASK | ACC_USER_MASK; + access &= shadow_mmio_access_mask; mask |= shadow_mmio_value | access; mask |= gpa | shadow_nonpresent_or_rsvd_mask; mask |= (gpa & shadow_nonpresent_or_rsvd_mask) @@ -401,11 +409,6 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, mmu_spte_set(sptep, mask); } -static bool is_mmio_spte(u64 spte) -{ - return (spte & shadow_mmio_mask) == shadow_mmio_value; -} - static gfn_t get_mmio_spte_gfn(u64 spte) { u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask; @@ -418,8 +421,7 @@ static gfn_t get_mmio_spte_gfn(u64 spte) static unsigned get_mmio_spte_access(u64 spte) { - u64 mask = generation_mmio_spte_mask(MMIO_SPTE_GEN_MASK) | shadow_mmio_mask; - return (spte & ~mask) & ~PAGE_MASK; + return spte & shadow_mmio_access_mask; } static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, @@ -3302,7 +3304,8 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, } if (unlikely(is_noslot_pfn(pfn))) - vcpu_cache_mmio_info(vcpu, gva, gfn, access); + vcpu_cache_mmio_info(vcpu, gva, gfn, + access & shadow_mmio_access_mask); return false; } @@ -5611,13 +5614,13 @@ slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, PT_PAGE_TABLE_LEVEL, lock_flush_tlb); } -static void free_mmu_pages(struct kvm_vcpu *vcpu) +static void free_mmu_pages(struct kvm_mmu *mmu) { - free_page((unsigned long)vcpu->arch.mmu->pae_root); - free_page((unsigned long)vcpu->arch.mmu->lm_root); + free_page((unsigned long)mmu->pae_root); + free_page((unsigned long)mmu->lm_root); } -static int alloc_mmu_pages(struct kvm_vcpu *vcpu) +static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { struct page *page; int i; @@ -5638,9 +5641,9 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) if (!page) return -ENOMEM; - vcpu->arch.mmu->pae_root = page_address(page); + mmu->pae_root = page_address(page); for (i = 0; i < 4; ++i) - vcpu->arch.mmu->pae_root[i] = INVALID_PAGE; + mmu->pae_root[i] = INVALID_PAGE; return 0; } @@ -5648,6 +5651,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) int kvm_mmu_create(struct kvm_vcpu *vcpu) { uint i; + int ret; vcpu->arch.mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; @@ -5665,7 +5669,19 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; - return alloc_mmu_pages(vcpu); + + ret = alloc_mmu_pages(vcpu, &vcpu->arch.guest_mmu); + if (ret) + return ret; + + ret = alloc_mmu_pages(vcpu, &vcpu->arch.root_mmu); + if (ret) + goto fail_allocate_root; + + return ret; + fail_allocate_root: + free_mmu_pages(&vcpu->arch.guest_mmu); + return ret; } @@ -6094,7 +6110,7 @@ static void kvm_set_mmio_spte_mask(void) if (IS_ENABLED(CONFIG_X86_64) && shadow_phys_bits == 52) mask &= ~1ull; - kvm_mmu_set_mmio_spte_mask(mask, mask); + kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK); } int kvm_mmu_module_init(void) @@ -6168,7 +6184,8 @@ unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm) void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); - free_mmu_pages(vcpu); + free_mmu_pages(&vcpu->arch.root_mmu); + free_mmu_pages(&vcpu->arch.guest_mmu); mmu_free_memory_caches(vcpu); } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 54c2a377795b..11f8ec89433b 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -51,7 +51,7 @@ static inline u64 rsvd_bits(int s, int e) return ((1ULL << (e - s + 1)) - 1) << s; } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value); +void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask); void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e0368076a1ef..04fe21849b6e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -68,10 +68,8 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); #define SEG_TYPE_LDT 2 #define SEG_TYPE_BUSY_TSS16 3 -#define SVM_FEATURE_NPT (1 << 0) #define SVM_FEATURE_LBRV (1 << 1) #define SVM_FEATURE_SVML (1 << 2) -#define SVM_FEATURE_NRIP (1 << 3) #define SVM_FEATURE_TSC_RATE (1 << 4) #define SVM_FEATURE_VMCB_CLEAN (1 << 5) #define SVM_FEATURE_FLUSH_ASID (1 << 6) @@ -770,7 +768,7 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) } -static void skip_emulated_instruction(struct kvm_vcpu *vcpu) +static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -779,18 +777,17 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) svm->next_rip = svm->vmcb->control.next_rip; } - if (!svm->next_rip) { - if (kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) != - EMULATE_DONE) - printk(KERN_DEBUG "%s: NOP\n", __func__); - return; - } + if (!svm->next_rip) + return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP); + if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n", __func__, kvm_rip_read(vcpu), svm->next_rip); kvm_rip_write(vcpu, svm->next_rip); svm_set_interrupt_shadow(vcpu, 0); + + return EMULATE_DONE; } static void svm_queue_exception(struct kvm_vcpu *vcpu) @@ -821,7 +818,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu) * raises a fault that is not intercepted. Still better than * failing in all cases. */ - skip_emulated_instruction(&svm->vcpu); + (void)skip_emulated_instruction(&svm->vcpu); rip = kvm_rip_read(&svm->vcpu); svm->int3_rip = rip + svm->vmcb->save.cs.base; svm->int3_injected = rip - old_rip; @@ -1269,11 +1266,11 @@ static void grow_ple_window(struct kvm_vcpu *vcpu) pause_filter_count_grow, pause_filter_count_max); - if (control->pause_filter_count != old) + if (control->pause_filter_count != old) { mark_dirty(svm->vmcb, VMCB_INTERCEPTS); - - trace_kvm_ple_window_grow(vcpu->vcpu_id, - control->pause_filter_count, old); + trace_kvm_ple_window_update(vcpu->vcpu_id, + control->pause_filter_count, old); + } } static void shrink_ple_window(struct kvm_vcpu *vcpu) @@ -1287,11 +1284,11 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) pause_filter_count, pause_filter_count_shrink, pause_filter_count); - if (control->pause_filter_count != old) + if (control->pause_filter_count != old) { mark_dirty(svm->vmcb, VMCB_INTERCEPTS); - - trace_kvm_ple_window_shrink(vcpu->vcpu_id, - control->pause_filter_count, old); + trace_kvm_ple_window_update(vcpu->vcpu_id, + control->pause_filter_count, old); + } } static __init int svm_hardware_setup(void) @@ -2136,6 +2133,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) struct page *nested_msrpm_pages; int err; + BUILD_BUG_ON_MSG(offsetof(struct vcpu_svm, vcpu) != 0, + "struct kvm_vcpu must be at offset 0 for arch usercopy region"); + svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); if (!svm) { err = -ENOMEM; @@ -2903,13 +2903,11 @@ static int nop_on_interception(struct vcpu_svm *svm) static int halt_interception(struct vcpu_svm *svm) { - svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; return kvm_emulate_halt(&svm->vcpu); } static int vmmcall_interception(struct vcpu_svm *svm) { - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; return kvm_emulate_hypercall(&svm->vcpu); } @@ -3588,9 +3586,9 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, mark_all_dirty(svm->vmcb); } -static bool nested_svm_vmrun(struct vcpu_svm *svm) +static int nested_svm_vmrun(struct vcpu_svm *svm) { - int rc; + int ret; struct vmcb *nested_vmcb; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; @@ -3599,13 +3597,16 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) vmcb_gpa = svm->vmcb->save.rax; - rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map); - if (rc) { - if (rc == -EINVAL) - kvm_inject_gp(&svm->vcpu, 0); - return false; + ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map); + if (ret == -EINVAL) { + kvm_inject_gp(&svm->vcpu, 0); + return 1; + } else if (ret) { + return kvm_skip_emulated_instruction(&svm->vcpu); } + ret = kvm_skip_emulated_instruction(&svm->vcpu); + nested_vmcb = map.hva; if (!nested_vmcb_checks(nested_vmcb)) { @@ -3616,7 +3617,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) kvm_vcpu_unmap(&svm->vcpu, &map, true); - return false; + return ret; } trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, @@ -3660,7 +3661,16 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, &map); - return true; + if (!nested_svm_vmrun_msrpm(svm)) { + svm->vmcb->control.exit_code = SVM_EXIT_ERR; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; + + nested_svm_vmexit(svm); + } + + return ret; } static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) @@ -3697,7 +3707,6 @@ static int vmload_interception(struct vcpu_svm *svm) nested_vmcb = map.hva; - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; ret = kvm_skip_emulated_instruction(&svm->vcpu); nested_svm_vmloadsave(nested_vmcb, svm->vmcb); @@ -3724,7 +3733,6 @@ static int vmsave_interception(struct vcpu_svm *svm) nested_vmcb = map.hva; - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; ret = kvm_skip_emulated_instruction(&svm->vcpu); nested_svm_vmloadsave(svm->vmcb, nested_vmcb); @@ -3738,27 +3746,7 @@ static int vmrun_interception(struct vcpu_svm *svm) if (nested_svm_check_permissions(svm)) return 1; - /* Save rip after vmrun instruction */ - kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3); - - if (!nested_svm_vmrun(svm)) - return 1; - - if (!nested_svm_vmrun_msrpm(svm)) - goto failed; - - return 1; - -failed: - - svm->vmcb->control.exit_code = SVM_EXIT_ERR; - svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; - - nested_svm_vmexit(svm); - - return 1; + return nested_svm_vmrun(svm); } static int stgi_interception(struct vcpu_svm *svm) @@ -3775,7 +3763,6 @@ static int stgi_interception(struct vcpu_svm *svm) if (vgif_enabled(svm)) clr_intercept(svm, INTERCEPT_STGI); - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; ret = kvm_skip_emulated_instruction(&svm->vcpu); kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); @@ -3791,7 +3778,6 @@ static int clgi_interception(struct vcpu_svm *svm) if (nested_svm_check_permissions(svm)) return 1; - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; ret = kvm_skip_emulated_instruction(&svm->vcpu); disable_gif(svm); @@ -3816,7 +3802,6 @@ static int invlpga_interception(struct vcpu_svm *svm) /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ kvm_mmu_invlpg(vcpu, kvm_rax_read(&svm->vcpu)); - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; return kvm_skip_emulated_instruction(&svm->vcpu); } @@ -3839,7 +3824,6 @@ static int xsetbv_interception(struct vcpu_svm *svm) u32 index = kvm_rcx_read(&svm->vcpu); if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) { - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; return kvm_skip_emulated_instruction(&svm->vcpu); } @@ -3898,25 +3882,29 @@ static int task_switch_interception(struct vcpu_svm *svm) if (reason != TASK_SWITCH_GATE || int_type == SVM_EXITINTINFO_TYPE_SOFT || (int_type == SVM_EXITINTINFO_TYPE_EXEPT && - (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) - skip_emulated_instruction(&svm->vcpu); + (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) { + if (skip_emulated_instruction(&svm->vcpu) != EMULATE_DONE) + goto fail; + } if (int_type != SVM_EXITINTINFO_TYPE_SOFT) int_vec = -1; if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason, - has_error_code, error_code) == EMULATE_FAIL) { - svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - svm->vcpu.run->internal.ndata = 0; - return 0; - } + has_error_code, error_code) == EMULATE_FAIL) + goto fail; + return 1; + +fail: + svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + svm->vcpu.run->internal.ndata = 0; + return 0; } static int cpuid_interception(struct vcpu_svm *svm) { - svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; return kvm_emulate_cpuid(&svm->vcpu); } @@ -4232,23 +4220,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static int rdmsr_interception(struct vcpu_svm *svm) { - u32 ecx = kvm_rcx_read(&svm->vcpu); - struct msr_data msr_info; - - msr_info.index = ecx; - msr_info.host_initiated = false; - if (svm_get_msr(&svm->vcpu, &msr_info)) { - trace_kvm_msr_read_ex(ecx); - kvm_inject_gp(&svm->vcpu, 0); - return 1; - } else { - trace_kvm_msr_read(ecx, msr_info.data); - - kvm_rax_write(&svm->vcpu, msr_info.data & 0xffffffff); - kvm_rdx_write(&svm->vcpu, msr_info.data >> 32); - svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - return kvm_skip_emulated_instruction(&svm->vcpu); - } + return kvm_emulate_rdmsr(&svm->vcpu); } static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) @@ -4438,23 +4410,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) static int wrmsr_interception(struct vcpu_svm *svm) { - struct msr_data msr; - u32 ecx = kvm_rcx_read(&svm->vcpu); - u64 data = kvm_read_edx_eax(&svm->vcpu); - - msr.data = data; - msr.index = ecx; - msr.host_initiated = false; - - svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - if (kvm_set_msr(&svm->vcpu, &msr)) { - trace_kvm_msr_write_ex(ecx, data); - kvm_inject_gp(&svm->vcpu, 0); - return 1; - } else { - trace_kvm_msr_write(ecx, data); - return kvm_skip_emulated_instruction(&svm->vcpu); - } + return kvm_emulate_wrmsr(&svm->vcpu); } static int msr_interception(struct vcpu_svm *svm) @@ -5025,9 +4981,14 @@ static int handle_exit(struct kvm_vcpu *vcpu) if (exit_code >= ARRAY_SIZE(svm_exit_handlers) || !svm_exit_handlers[exit_code]) { - WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code); - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; + vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code); + dump_vmcb(vcpu); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = + KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; + vcpu->run->internal.ndata = 1; + vcpu->run->internal.data[0] = exit_code; + return 0; } return svm_exit_handlers[exit_code](svm); @@ -5274,7 +5235,8 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, kvm_set_msi_irq(kvm, e, &irq); - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || + !kvm_irq_is_postable(&irq)) { pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n", __func__, irq.vector); return -1; @@ -5328,6 +5290,7 @@ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, * 1. When cannot target interrupt to a specific vcpu. * 2. Unsetting posted interrupt. * 3. APIC virtialization is disabled for the vcpu. + * 4. IRQ has incompatible delivery mode (SMI, INIT, etc) */ if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set && kvm_vcpu_apicv_active(&svm->vcpu)) { @@ -5933,6 +5896,8 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC); } +#define F(x) bit(X86_FEATURE_##x) + static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) { switch (func) { @@ -5944,6 +5909,11 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) if (nested) entry->ecx |= (1 << 2); /* Set SVM bit */ break; + case 0x80000008: + if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || + boot_cpu_has(X86_FEATURE_AMD_SSBD)) + entry->ebx |= F(VIRT_SSBD); + break; case 0x8000000A: entry->eax = 1; /* SVM revision 1 */ entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper @@ -5954,11 +5924,11 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) /* Support next_rip if host supports it */ if (boot_cpu_has(X86_FEATURE_NRIPS)) - entry->edx |= SVM_FEATURE_NRIP; + entry->edx |= F(NRIPS); /* Support NPT for the guest if enabled */ if (npt_enabled) - entry->edx |= SVM_FEATURE_NPT; + entry->edx |= F(NPT); break; case 0x8000001F: @@ -6067,6 +6037,7 @@ static const struct __x86_intercept { [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO), [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO), [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO), + [x86_intercept_xsetbv] = PRE_EX(SVM_EXIT_XSETBV), }; #undef PRE_EX @@ -7193,6 +7164,21 @@ static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) return false; } +static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * TODO: Last condition latch INIT signals on vCPU when + * vCPU is in guest-mode and vmcb12 defines intercept on INIT. + * To properly emulate the INIT intercept, SVM should implement + * kvm_x86_ops->check_nested_events() and call nested_svm_vmexit() + * there if an INIT signal is pending. + */ + return !gif_set(svm) || + (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT)); +} + static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -7329,6 +7315,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .nested_get_evmcs_version = NULL, .need_emulation_on_page_fault = svm_need_emulation_on_page_fault, + + .apic_init_signal_blocked = svm_apic_init_signal_blocked, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index b5c831e79094..7c741a0c5f80 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -232,17 +232,20 @@ TRACE_EVENT(kvm_exit, __field( u32, isa ) __field( u64, info1 ) __field( u64, info2 ) + __field( unsigned int, vcpu_id ) ), TP_fast_assign( __entry->exit_reason = exit_reason; __entry->guest_rip = kvm_rip_read(vcpu); __entry->isa = isa; + __entry->vcpu_id = vcpu->vcpu_id; kvm_x86_ops->get_exit_info(vcpu, &__entry->info1, &__entry->info2); ), - TP_printk("reason %s rip 0x%lx info %llx %llx", + TP_printk("vcpu %u reason %s rip 0x%lx info %llx %llx", + __entry->vcpu_id, (__entry->isa == KVM_ISA_VMX) ? __print_symbolic(__entry->exit_reason, VMX_EXIT_REASONS) : __print_symbolic(__entry->exit_reason, SVM_EXIT_REASONS), @@ -887,36 +890,27 @@ TRACE_EVENT(kvm_pml_full, TP_printk("vcpu %d: PML full", __entry->vcpu_id) ); -TRACE_EVENT(kvm_ple_window, - TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old), - TP_ARGS(grow, vcpu_id, new, old), +TRACE_EVENT(kvm_ple_window_update, + TP_PROTO(unsigned int vcpu_id, unsigned int new, unsigned int old), + TP_ARGS(vcpu_id, new, old), TP_STRUCT__entry( - __field( bool, grow ) __field( unsigned int, vcpu_id ) - __field( int, new ) - __field( int, old ) + __field( unsigned int, new ) + __field( unsigned int, old ) ), TP_fast_assign( - __entry->grow = grow; __entry->vcpu_id = vcpu_id; __entry->new = new; __entry->old = old; ), - TP_printk("vcpu %u: ple_window %d (%s %d)", - __entry->vcpu_id, - __entry->new, - __entry->grow ? "grow" : "shrink", - __entry->old) + TP_printk("vcpu %u old %u new %u (%s)", + __entry->vcpu_id, __entry->old, __entry->new, + __entry->old < __entry->new ? "growed" : "shrinked") ); -#define trace_kvm_ple_window_grow(vcpu_id, new, old) \ - trace_kvm_ple_window(true, vcpu_id, new, old) -#define trace_kvm_ple_window_shrink(vcpu_id, new, old) \ - trace_kvm_ple_window(false, vcpu_id, new, old) - TRACE_EVENT(kvm_pvclock_update, TP_PROTO(unsigned int vcpu_id, struct pvclock_vcpu_time_info *pvclock), TP_ARGS(vcpu_id, pvclock), @@ -1320,7 +1314,7 @@ TRACE_EVENT(kvm_avic_incomplete_ipi, __entry->index = index; ), - TP_printk("vcpu=%u, icrh:icrl=%#010x:%08x, id=%u, index=%u\n", + TP_printk("vcpu=%u, icrh:icrl=%#010x:%08x, id=%u, index=%u", __entry->vcpu, __entry->icrh, __entry->icrl, __entry->id, __entry->index) ); @@ -1345,7 +1339,7 @@ TRACE_EVENT(kvm_avic_unaccelerated_access, __entry->vec = vec; ), - TP_printk("vcpu=%u, offset=%#x(%s), %s, %s, vec=%#x\n", + TP_printk("vcpu=%u, offset=%#x(%s), %s, %s, vec=%#x", __entry->vcpu, __entry->offset, __print_symbolic(__entry->offset, kvm_trace_symbol_apic), @@ -1462,6 +1456,46 @@ TRACE_EVENT(kvm_hv_send_ipi_ex, __entry->vector, __entry->format, __entry->valid_bank_mask) ); + +TRACE_EVENT(kvm_pv_tlb_flush, + TP_PROTO(unsigned int vcpu_id, bool need_flush_tlb), + TP_ARGS(vcpu_id, need_flush_tlb), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( bool, need_flush_tlb ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->need_flush_tlb = need_flush_tlb; + ), + + TP_printk("vcpu %u need_flush_tlb %s", __entry->vcpu_id, + __entry->need_flush_tlb ? "true" : "false") +); + +/* + * Tracepoint for failed nested VMX VM-Enter. + */ +TRACE_EVENT(kvm_nested_vmenter_failed, + TP_PROTO(const char *msg, u32 err), + TP_ARGS(msg, err), + + TP_STRUCT__entry( + __field(const char *, msg) + __field(u32, err) + ), + + TP_fast_assign( + __entry->msg = msg; + __entry->err = err; + ), + + TP_printk("%s%s", __entry->msg, !__entry->err ? "" : + __print_symbolic(__entry->err, VMX_VMENTER_INSTRUCTION_ERRORS)) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index a3cba321b5c5..1a10cd351940 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -19,6 +19,14 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); static bool __read_mostly nested_early_check = 0; module_param(nested_early_check, bool, S_IRUGO); +#define CC(consistency_check) \ +({ \ + bool failed = (consistency_check); \ + if (failed) \ + trace_kvm_nested_vmenter_failed(#consistency_check, 0); \ + failed; \ +}) + /* * Hyper-V requires all of these, so mark them as supported even though * they are just treated the same as all-context. @@ -430,8 +438,8 @@ static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) return 0; - if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) || - !page_address_valid(vcpu, vmcs12->io_bitmap_b)) + if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) || + CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b))) return -EINVAL; return 0; @@ -443,7 +451,7 @@ static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) return 0; - if (!page_address_valid(vcpu, vmcs12->msr_bitmap)) + if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap))) return -EINVAL; return 0; @@ -455,7 +463,7 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) return 0; - if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)) + if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))) return -EINVAL; return 0; @@ -688,7 +696,7 @@ static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && - !page_address_valid(vcpu, vmcs12->apic_access_addr)) + CC(!page_address_valid(vcpu, vmcs12->apic_access_addr))) return -EINVAL; else return 0; @@ -707,16 +715,15 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, * If virtualize x2apic mode is enabled, * virtualize apic access must be disabled. */ - if (nested_cpu_has_virt_x2apic_mode(vmcs12) && - nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))) return -EINVAL; /* * If virtual interrupt delivery is enabled, * we must exit on external interrupts. */ - if (nested_cpu_has_vid(vmcs12) && - !nested_exit_on_intr(vcpu)) + if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu))) return -EINVAL; /* @@ -727,15 +734,15 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, * bits 5:0 of posted_intr_desc_addr should be zero. */ if (nested_cpu_has_posted_intr(vmcs12) && - (!nested_cpu_has_vid(vmcs12) || - !nested_exit_intr_ack_set(vcpu) || - (vmcs12->posted_intr_nv & 0xff00) || - (vmcs12->posted_intr_desc_addr & 0x3f) || - (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))) + (CC(!nested_cpu_has_vid(vmcs12)) || + CC(!nested_exit_intr_ack_set(vcpu)) || + CC((vmcs12->posted_intr_nv & 0xff00)) || + CC((vmcs12->posted_intr_desc_addr & 0x3f)) || + CC((vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu))))) return -EINVAL; /* tpr shadow is needed by all apicv features. */ - if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) + if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))) return -EINVAL; return 0; @@ -759,10 +766,12 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_load_count, - vmcs12->vm_exit_msr_load_addr) || - nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_store_count, - vmcs12->vm_exit_msr_store_addr)) + if (CC(nested_vmx_check_msr_switch(vcpu, + vmcs12->vm_exit_msr_load_count, + vmcs12->vm_exit_msr_load_addr)) || + CC(nested_vmx_check_msr_switch(vcpu, + vmcs12->vm_exit_msr_store_count, + vmcs12->vm_exit_msr_store_addr))) return -EINVAL; return 0; @@ -771,8 +780,9 @@ static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_entry_msr_load_count, - vmcs12->vm_entry_msr_load_addr)) + if (CC(nested_vmx_check_msr_switch(vcpu, + vmcs12->vm_entry_msr_load_count, + vmcs12->vm_entry_msr_load_addr))) return -EINVAL; return 0; @@ -784,8 +794,8 @@ static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, if (!nested_cpu_has_pml(vmcs12)) return 0; - if (!nested_cpu_has_ept(vmcs12) || - !page_address_valid(vcpu, vmcs12->pml_address)) + if (CC(!nested_cpu_has_ept(vmcs12)) || + CC(!page_address_valid(vcpu, vmcs12->pml_address))) return -EINVAL; return 0; @@ -794,8 +804,8 @@ static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && - !nested_cpu_has_ept(vmcs12)) + if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && + !nested_cpu_has_ept(vmcs12))) return -EINVAL; return 0; } @@ -803,8 +813,8 @@ static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && - !nested_cpu_has_ept(vmcs12)) + if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && + !nested_cpu_has_ept(vmcs12))) return -EINVAL; return 0; } @@ -815,8 +825,8 @@ static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, if (!nested_cpu_has_shadow_vmcs(vmcs12)) return 0; - if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) || - !page_address_valid(vcpu, vmcs12->vmwrite_bitmap)) + if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) || + CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap))) return -EINVAL; return 0; @@ -826,12 +836,12 @@ static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, struct vmx_msr_entry *e) { /* x2APIC MSR accesses are not allowed */ - if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8) + if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)) return -EINVAL; - if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */ - e->index == MSR_IA32_UCODE_REV) + if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */ + CC(e->index == MSR_IA32_UCODE_REV)) return -EINVAL; - if (e->reserved != 0) + if (CC(e->reserved != 0)) return -EINVAL; return 0; } @@ -839,9 +849,9 @@ static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, struct vmx_msr_entry *e) { - if (e->index == MSR_FS_BASE || - e->index == MSR_GS_BASE || - e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */ + if (CC(e->index == MSR_FS_BASE) || + CC(e->index == MSR_GS_BASE) || + CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */ nested_vmx_msr_check_common(vcpu, e)) return -EINVAL; return 0; @@ -850,7 +860,7 @@ static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, struct vmx_msr_entry *e) { - if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */ + if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */ nested_vmx_msr_check_common(vcpu, e)) return -EINVAL; return 0; @@ -864,9 +874,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) { u32 i; struct vmx_msr_entry e; - struct msr_data msr; - msr.host_initiated = false; for (i = 0; i < count; i++) { if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), &e, sizeof(e))) { @@ -881,9 +889,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) __func__, i, e.index, e.reserved); goto fail; } - msr.index = e.index; - msr.data = e.value; - if (kvm_set_msr(vcpu, &msr)) { + if (kvm_set_msr(vcpu, e.index, e.value)) { pr_debug_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", __func__, i, e.index, e.value); @@ -897,11 +903,11 @@ fail: static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) { + u64 data; u32 i; struct vmx_msr_entry e; for (i = 0; i < count; i++) { - struct msr_data msr_info; if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), &e, 2 * sizeof(u32))) { @@ -916,9 +922,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) __func__, i, e.index, e.reserved); return -EINVAL; } - msr_info.host_initiated = false; - msr_info.index = e.index; - if (kvm_get_msr(vcpu, &msr_info)) { + if (kvm_get_msr(vcpu, e.index, &data)) { pr_debug_ratelimited( "%s cannot read MSR (%u, 0x%x)\n", __func__, i, e.index); @@ -927,10 +931,10 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) if (kvm_vcpu_write_guest(vcpu, gpa + i * sizeof(e) + offsetof(struct vmx_msr_entry, value), - &msr_info.data, sizeof(msr_info.data))) { + &data, sizeof(data))) { pr_debug_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", - __func__, i, e.index, msr_info.data); + __func__, i, e.index, data); return -EINVAL; } } @@ -955,7 +959,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne u32 *entry_failure_code) { if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) { - if (!nested_cr3_valid(vcpu, cr3)) { + if (CC(!nested_cr3_valid(vcpu, cr3))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; } @@ -965,7 +969,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne * must not be dereferenced. */ if (is_pae_paging(vcpu) && !nested_ept) { - if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) { + if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) { *entry_failure_code = ENTRY_FAIL_PDPTE; return -EINVAL; } @@ -2411,12 +2415,12 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) { - if (!nested_cpu_has_nmi_exiting(vmcs12) && - nested_cpu_has_virtual_nmis(vmcs12)) + if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && + nested_cpu_has_virtual_nmis(vmcs12))) return -EINVAL; - if (!nested_cpu_has_virtual_nmis(vmcs12) && - nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING)) + if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && + nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING))) return -EINVAL; return 0; @@ -2430,11 +2434,11 @@ static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) /* Check for memory type validity */ switch (address & VMX_EPTP_MT_MASK) { case VMX_EPTP_MT_UC: - if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)) + if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) return false; break; case VMX_EPTP_MT_WB: - if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)) + if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) return false; break; default: @@ -2442,16 +2446,16 @@ static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) } /* only 4 levels page-walk length are valid */ - if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4) + if (CC((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4)) return false; /* Reserved bits should not be set */ - if (address >> maxphyaddr || ((address >> 7) & 0x1f)) + if (CC(address >> maxphyaddr || ((address >> 7) & 0x1f))) return false; /* AD, if set, should be supported */ if (address & VMX_EPTP_AD_ENABLE_BIT) { - if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)) + if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) return false; } @@ -2466,21 +2470,21 @@ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!vmx_control_verify(vmcs12->pin_based_vm_exec_control, - vmx->nested.msrs.pinbased_ctls_low, - vmx->nested.msrs.pinbased_ctls_high) || - !vmx_control_verify(vmcs12->cpu_based_vm_exec_control, - vmx->nested.msrs.procbased_ctls_low, - vmx->nested.msrs.procbased_ctls_high)) + if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, + vmx->nested.msrs.pinbased_ctls_low, + vmx->nested.msrs.pinbased_ctls_high)) || + CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, + vmx->nested.msrs.procbased_ctls_low, + vmx->nested.msrs.procbased_ctls_high))) return -EINVAL; if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && - !vmx_control_verify(vmcs12->secondary_vm_exec_control, - vmx->nested.msrs.secondary_ctls_low, - vmx->nested.msrs.secondary_ctls_high)) + CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, + vmx->nested.msrs.secondary_ctls_low, + vmx->nested.msrs.secondary_ctls_high))) return -EINVAL; - if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu) || + if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || @@ -2491,7 +2495,7 @@ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || - (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) + CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) return -EINVAL; if (!nested_cpu_has_preemption_timer(vmcs12) && @@ -2499,17 +2503,17 @@ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, return -EINVAL; if (nested_cpu_has_ept(vmcs12) && - !valid_ept_address(vcpu, vmcs12->ept_pointer)) + CC(!valid_ept_address(vcpu, vmcs12->ept_pointer))) return -EINVAL; if (nested_cpu_has_vmfunc(vmcs12)) { - if (vmcs12->vm_function_control & - ~vmx->nested.msrs.vmfunc_controls) + if (CC(vmcs12->vm_function_control & + ~vmx->nested.msrs.vmfunc_controls)) return -EINVAL; if (nested_cpu_has_eptp_switching(vmcs12)) { - if (!nested_cpu_has_ept(vmcs12) || - !page_address_valid(vcpu, vmcs12->eptp_list_address)) + if (CC(!nested_cpu_has_ept(vmcs12)) || + CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) return -EINVAL; } } @@ -2525,10 +2529,10 @@ static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!vmx_control_verify(vmcs12->vm_exit_controls, - vmx->nested.msrs.exit_ctls_low, - vmx->nested.msrs.exit_ctls_high) || - nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)) + if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, + vmx->nested.msrs.exit_ctls_low, + vmx->nested.msrs.exit_ctls_high)) || + CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) return -EINVAL; return 0; @@ -2542,9 +2546,9 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!vmx_control_verify(vmcs12->vm_entry_controls, - vmx->nested.msrs.entry_ctls_low, - vmx->nested.msrs.entry_ctls_high)) + if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, + vmx->nested.msrs.entry_ctls_low, + vmx->nested.msrs.entry_ctls_high))) return -EINVAL; /* @@ -2564,31 +2568,31 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; /* VM-entry interruption-info field: interruption type */ - if (intr_type == INTR_TYPE_RESERVED || - (intr_type == INTR_TYPE_OTHER_EVENT && - !nested_cpu_supports_monitor_trap_flag(vcpu))) + if (CC(intr_type == INTR_TYPE_RESERVED) || + CC(intr_type == INTR_TYPE_OTHER_EVENT && + !nested_cpu_supports_monitor_trap_flag(vcpu))) return -EINVAL; /* VM-entry interruption-info field: vector */ - if ((intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || - (intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || - (intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) + if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || + CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || + CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) return -EINVAL; /* VM-entry interruption-info field: deliver error code */ should_have_error_code = intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && x86_exception_has_error_code(vector); - if (has_error_code != should_have_error_code) + if (CC(has_error_code != should_have_error_code)) return -EINVAL; /* VM-entry exception error code */ - if (has_error_code && - vmcs12->vm_entry_exception_error_code & GENMASK(31, 15)) + if (CC(has_error_code && + vmcs12->vm_entry_exception_error_code & GENMASK(31, 15))) return -EINVAL; /* VM-entry interruption-info field: reserved bits */ - if (intr_info & INTR_INFO_RESVD_BITS_MASK) + if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) return -EINVAL; /* VM-entry instruction length */ @@ -2596,9 +2600,9 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, case INTR_TYPE_SOFT_EXCEPTION: case INTR_TYPE_SOFT_INTR: case INTR_TYPE_PRIV_SW_EXCEPTION: - if ((vmcs12->vm_entry_instruction_len > 15) || - (vmcs12->vm_entry_instruction_len == 0 && - !nested_cpu_has_zero_length_injection(vcpu))) + if (CC(vmcs12->vm_entry_instruction_len > 15) || + CC(vmcs12->vm_entry_instruction_len == 0 && + CC(!nested_cpu_has_zero_length_injection(vcpu)))) return -EINVAL; } } @@ -2625,40 +2629,40 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, { bool ia32e; - if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || - !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || - !nested_cr3_valid(vcpu, vmcs12->host_cr3)) + if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || + CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || + CC(!nested_cr3_valid(vcpu, vmcs12->host_cr3))) return -EINVAL; - if (is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu) || - is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)) + if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || + CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) return -EINVAL; if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && - !kvm_pat_valid(vmcs12->host_ia32_pat)) + CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) return -EINVAL; ia32e = (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; - if (vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || - vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || - vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || - vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || - vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || - vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || - vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || - vmcs12->host_cs_selector == 0 || - vmcs12->host_tr_selector == 0 || - (vmcs12->host_ss_selector == 0 && !ia32e)) + if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || + CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || + CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || + CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || + CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || + CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || + CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || + CC(vmcs12->host_cs_selector == 0) || + CC(vmcs12->host_tr_selector == 0) || + CC(vmcs12->host_ss_selector == 0 && !ia32e)) return -EINVAL; #ifdef CONFIG_X86_64 - if (is_noncanonical_address(vmcs12->host_fs_base, vcpu) || - is_noncanonical_address(vmcs12->host_gs_base, vcpu) || - is_noncanonical_address(vmcs12->host_gdtr_base, vcpu) || - is_noncanonical_address(vmcs12->host_idtr_base, vcpu) || - is_noncanonical_address(vmcs12->host_tr_base, vcpu)) + if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || + CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || + CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || + CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || + CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu))) return -EINVAL; #endif @@ -2669,9 +2673,9 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, * the host address-space size VM-exit control. */ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { - if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || - ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || - ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) + if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || + CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || + CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) return -EINVAL; } @@ -2688,16 +2692,16 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, if (vmcs12->vmcs_link_pointer == -1ull) return 0; - if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)) + if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) return -EINVAL; - if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)) + if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))) return -EINVAL; shadow = map.hva; - if (shadow->hdr.revision_id != VMCS12_REVISION || - shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)) + if (CC(shadow->hdr.revision_id != VMCS12_REVISION) || + CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) r = -EINVAL; kvm_vcpu_unmap(vcpu, &map, false); @@ -2709,8 +2713,8 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, */ static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) { - if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && - vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) + if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && + vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)) return -EINVAL; return 0; @@ -2724,12 +2728,12 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, *exit_qual = ENTRY_FAIL_DEFAULT; - if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) || - !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) + if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || + CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) return -EINVAL; if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && - !kvm_pat_valid(vmcs12->guest_ia32_pat)) + CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) return -EINVAL; if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { @@ -2749,16 +2753,16 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, if (to_vmx(vcpu)->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; - if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) || - ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || - ((vmcs12->guest_cr0 & X86_CR0_PG) && - ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) + if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || + CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || + CC(((vmcs12->guest_cr0 & X86_CR0_PG) && + ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) return -EINVAL; } if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && - (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) || - (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) + (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || + CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) return -EINVAL; if (nested_check_guest_non_reg_state(vmcs12)) @@ -2841,9 +2845,13 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); if (vm_fail) { + u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); + preempt_enable(); - WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != - VMXERR_ENTRY_INVALID_CONTROL_FIELD); + + trace_kvm_nested_vmenter_failed( + "early hardware check VM-instruction error: ", error); + WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } @@ -3401,6 +3409,15 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) unsigned long exit_qual; bool block_nested_events = vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); + struct kvm_lapic *apic = vcpu->arch.apic; + + if (lapic_in_kernel(vcpu) && + test_bit(KVM_APIC_INIT, &apic->pending_events)) { + if (block_nested_events) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); + return 0; + } if (vcpu->arch.exception.pending && nested_vmx_check_exception(vcpu, &exit_qual)) { @@ -3889,7 +3906,6 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_msr_entry g, h; - struct msr_data msr; gpa_t gpa; u32 i, j; @@ -3949,7 +3965,6 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) * from the guest value. The intent is to stuff host state as * silently as possible, not to fully process the exit load list. */ - msr.host_initiated = false; for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { @@ -3979,9 +3994,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) goto vmabort; } - msr.index = h.index; - msr.data = h.value; - if (kvm_set_msr(vcpu, &msr)) { + if (kvm_set_msr(vcpu, h.index, h.value)) { pr_debug_ratelimited( "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", __func__, j, h.index, h.value); @@ -4466,7 +4479,12 @@ static int handle_vmoff(struct kvm_vcpu *vcpu) { if (!nested_vmx_check_permission(vcpu)) return 1; + free_nested(vcpu); + + /* Process a latched INIT during time CPU was in VMX operation */ + kvm_make_request(KVM_REQ_EVENT, vcpu); + return nested_vmx_succeed(vcpu); } @@ -5261,8 +5279,9 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) return false; if (unlikely(vmx->fail)) { - pr_info_ratelimited("%s failed vm entry %x\n", __func__, - vmcs_read32(VM_INSTRUCTION_ERROR)); + trace_kvm_nested_vmenter_failed( + "hardware VM-instruction error: ", + vmcs_read32(VM_INSTRUCTION_ERROR)); return true; } diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 4010d519eb8c..751a384c2eb0 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -94,7 +94,7 @@ ENDPROC(vmx_vmexit) /** * __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode - * @vmx: struct vcpu_vmx * + * @vmx: struct vcpu_vmx * (forwarded to vmx_update_host_rsp) * @regs: unsigned long * (to guest registers) * @launched: %true if the VMCS has been launched * @@ -151,7 +151,7 @@ ENTRY(__vmx_vcpu_run) mov VCPU_R14(%_ASM_AX), %r14 mov VCPU_R15(%_ASM_AX), %r15 #endif - /* Load guest RAX. This kills the vmx_vcpu pointer! */ + /* Load guest RAX. This kills the @regs pointer! */ mov VCPU_RAX(%_ASM_AX), %_ASM_AX /* Enter guest mode */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c030c96fc81a..4a99be1fae4e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1472,8 +1472,11 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) return 0; } - -static void skip_emulated_instruction(struct kvm_vcpu *vcpu) +/* + * Returns an int to be compatible with SVM implementation (which can fail). + * Do not use directly, use skip_emulated_instruction() instead. + */ +static int __skip_emulated_instruction(struct kvm_vcpu *vcpu) { unsigned long rip; @@ -1483,6 +1486,13 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) /* skipping an emulated instruction also counts */ vmx_set_interrupt_shadow(vcpu, 0); + + return EMULATE_DONE; +} + +static inline void skip_emulated_instruction(struct kvm_vcpu *vcpu) +{ + (void)__skip_emulated_instruction(vcpu); } static void vmx_clear_hlt(struct kvm_vcpu *vcpu) @@ -4026,7 +4036,7 @@ static void ept_set_mmio_spte_mask(void) * of an EPT paging-structure entry is 110b (write/execute). */ kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK, - VMX_EPT_MISCONFIG_WX_VALUE); + VMX_EPT_MISCONFIG_WX_VALUE, 0); } #define VMX_XSS_EXIT_BITMAP 0 @@ -4152,6 +4162,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.microcode_version = 0x100000000ULL; vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); + vmx->hv_deadline_tsc = -1; kvm_set_cr8(vcpu, 0); if (!init_event) { @@ -4856,41 +4867,12 @@ static int handle_cpuid(struct kvm_vcpu *vcpu) static int handle_rdmsr(struct kvm_vcpu *vcpu) { - u32 ecx = kvm_rcx_read(vcpu); - struct msr_data msr_info; - - msr_info.index = ecx; - msr_info.host_initiated = false; - if (vmx_get_msr(vcpu, &msr_info)) { - trace_kvm_msr_read_ex(ecx); - kvm_inject_gp(vcpu, 0); - return 1; - } - - trace_kvm_msr_read(ecx, msr_info.data); - - kvm_rax_write(vcpu, msr_info.data & -1u); - kvm_rdx_write(vcpu, (msr_info.data >> 32) & -1u); - return kvm_skip_emulated_instruction(vcpu); + return kvm_emulate_rdmsr(vcpu); } static int handle_wrmsr(struct kvm_vcpu *vcpu) { - struct msr_data msr; - u32 ecx = kvm_rcx_read(vcpu); - u64 data = kvm_read_edx_eax(vcpu); - - msr.data = data; - msr.index = ecx; - msr.host_initiated = false; - if (kvm_set_msr(vcpu, &msr) != 0) { - trace_kvm_msr_write_ex(ecx, data); - kvm_inject_gp(vcpu, 0); - return 1; - } - - trace_kvm_msr_write(ecx, data); - return kvm_skip_emulated_instruction(vcpu); + return kvm_emulate_wrmsr(vcpu); } static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) @@ -5227,31 +5209,33 @@ emulation_error: static void grow_ple_window(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - int old = vmx->ple_window; + unsigned int old = vmx->ple_window; vmx->ple_window = __grow_ple_window(old, ple_window, ple_window_grow, ple_window_max); - if (vmx->ple_window != old) + if (vmx->ple_window != old) { vmx->ple_window_dirty = true; - - trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old); + trace_kvm_ple_window_update(vcpu->vcpu_id, + vmx->ple_window, old); + } } static void shrink_ple_window(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - int old = vmx->ple_window; + unsigned int old = vmx->ple_window; vmx->ple_window = __shrink_ple_window(old, ple_window, ple_window_shrink, ple_window); - if (vmx->ple_window != old) + if (vmx->ple_window != old) { vmx->ple_window_dirty = true; - - trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old); + trace_kvm_ple_window_update(vcpu->vcpu_id, + vmx->ple_window, old); + } } /* @@ -5887,8 +5871,13 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) else { vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", exit_reason); - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; + dump_vmcs(); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = + KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; + vcpu->run->internal.ndata = 1; + vcpu->run->internal.data[0] = exit_reason; + return 0; } } @@ -6615,6 +6604,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) unsigned long *msr_bitmap; int cpu; + BUILD_BUG_ON_MSG(offsetof(struct vcpu_vmx, vcpu) != 0, + "struct kvm_vcpu must be at offset 0 for arch usercopy region"); + vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); if (!vmx) return ERR_PTR(-ENOMEM); @@ -7369,10 +7361,14 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, * irqbalance to make the interrupts single-CPU. * * We will support full lowest-priority interrupt later. + * + * In addition, we can only inject generic interrupts using + * the PI mechanism, refuse to route others through it. */ kvm_set_msi_irq(kvm, e, &irq); - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || + !kvm_irq_is_postable(&irq)) { /* * Make sure the IRTE is in remapped mode if * we don't handle it in posted mode. @@ -7474,6 +7470,11 @@ static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) return false; } +static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.vmxon; +} + static __init int hardware_setup(void) { unsigned long host_bndcfgs; @@ -7705,7 +7706,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .run = vmx_vcpu_run, .handle_exit = vmx_handle_exit, - .skip_emulated_instruction = skip_emulated_instruction, + .skip_emulated_instruction = __skip_emulated_instruction, .set_interrupt_shadow = vmx_set_interrupt_shadow, .get_interrupt_shadow = vmx_get_interrupt_shadow, .patch_hypercall = vmx_patch_hypercall, @@ -7799,6 +7800,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .nested_enable_evmcs = NULL, .nested_get_evmcs_version = NULL, .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault, + .apic_init_signal_blocked = vmx_apic_init_signal_blocked, }; static void vmx_cleanup_l1d_flush(void) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 82d0bc3a4d52..64d5a4890aa9 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -253,7 +253,7 @@ struct vcpu_vmx { struct nested_vmx nested; /* Dynamic PLE window. */ - int ple_window; + unsigned int ple_window; bool ple_window_dirty; bool req_immediate_exit; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 91602d310a3f..dfd641243568 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -674,8 +674,14 @@ static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, data, offset, len, access); } +static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) +{ + return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) | + rsvd_bits(1, 2); +} + /* - * Load the pae pdptrs. Return true is they are all valid. + * Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise. */ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) { @@ -694,8 +700,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) } for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { if ((pdpte[i] & PT_PRESENT_MASK) && - (pdpte[i] & - vcpu->arch.mmu->guest_rsvd_check.rsvd_bits_mask[0][2])) { + (pdpte[i] & pdptr_rsvd_bits(vcpu))) { ret = 0; goto out; } @@ -1254,6 +1259,13 @@ static u64 kvm_get_arch_capabilities(void) if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER) data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH; + if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + data |= ARCH_CAP_RDCL_NO; + if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + data |= ARCH_CAP_SSB_NO; + if (!boot_cpu_has_bug(X86_BUG_MDS)) + data |= ARCH_CAP_MDS_NO; + return data; } @@ -1351,19 +1363,23 @@ void kvm_enable_efer_bits(u64 mask) EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); /* - * Writes msr value into into the appropriate "register". + * Write @data into the MSR specified by @index. Select MSR specific fault + * checks are bypassed if @host_initiated is %true. * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) +static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, + bool host_initiated) { - switch (msr->index) { + struct msr_data msr; + + switch (index) { case MSR_FS_BASE: case MSR_GS_BASE: case MSR_KERNEL_GS_BASE: case MSR_CSTAR: case MSR_LSTAR: - if (is_noncanonical_address(msr->data, vcpu)) + if (is_noncanonical_address(data, vcpu)) return 1; break; case MSR_IA32_SYSENTER_EIP: @@ -1380,38 +1396,95 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * value, and that something deterministic happens if the guest * invokes 64-bit SYSENTER. */ - msr->data = get_canonical(msr->data, vcpu_virt_addr_bits(vcpu)); + data = get_canonical(data, vcpu_virt_addr_bits(vcpu)); } - return kvm_x86_ops->set_msr(vcpu, msr); + + msr.data = data; + msr.index = index; + msr.host_initiated = host_initiated; + + return kvm_x86_ops->set_msr(vcpu, &msr); } -EXPORT_SYMBOL_GPL(kvm_set_msr); /* - * Adapt set_msr() to msr_io()'s calling convention + * Read the MSR specified by @index into @data. Select MSR specific fault + * checks are bypassed if @host_initiated is %true. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. */ -static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated) { struct msr_data msr; - int r; + int ret; msr.index = index; - msr.host_initiated = true; - r = kvm_get_msr(vcpu, &msr); - if (r) - return r; + msr.host_initiated = host_initiated; - *data = msr.data; - return 0; + ret = kvm_x86_ops->get_msr(vcpu, &msr); + if (!ret) + *data = msr.data; + return ret; } -static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) { - struct msr_data msr; + return __kvm_get_msr(vcpu, index, data, false); +} +EXPORT_SYMBOL_GPL(kvm_get_msr); - msr.data = *data; - msr.index = index; - msr.host_initiated = true; - return kvm_set_msr(vcpu, &msr); +int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) +{ + return __kvm_set_msr(vcpu, index, data, false); +} +EXPORT_SYMBOL_GPL(kvm_set_msr); + +int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) +{ + u32 ecx = kvm_rcx_read(vcpu); + u64 data; + + if (kvm_get_msr(vcpu, ecx, &data)) { + trace_kvm_msr_read_ex(ecx); + kvm_inject_gp(vcpu, 0); + return 1; + } + + trace_kvm_msr_read(ecx, data); + + kvm_rax_write(vcpu, data & -1u); + kvm_rdx_write(vcpu, (data >> 32) & -1u); + return kvm_skip_emulated_instruction(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr); + +int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) +{ + u32 ecx = kvm_rcx_read(vcpu); + u64 data = kvm_read_edx_eax(vcpu); + + if (kvm_set_msr(vcpu, ecx, data)) { + trace_kvm_msr_write_ex(ecx, data); + kvm_inject_gp(vcpu, 0); + return 1; + } + + trace_kvm_msr_write(ecx, data); + return kvm_skip_emulated_instruction(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); + +/* + * Adapt set_msr() to msr_io()'s calling convention + */ +static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +{ + return __kvm_get_msr(vcpu, index, data, true); +} + +static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +{ + return __kvm_set_msr(vcpu, index, *data, true); } #ifdef CONFIG_X86_64 @@ -2452,6 +2525,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu) * Doing a TLB flush here, on the guest's behalf, can avoid * expensive IPIs. */ + trace_kvm_pv_tlb_flush(vcpu->vcpu_id, + vcpu->arch.st.steal.preempted & KVM_VCPU_FLUSH_TLB); if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB) kvm_vcpu_flush_tlb(vcpu, false); @@ -2748,18 +2823,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } EXPORT_SYMBOL_GPL(kvm_set_msr_common); - -/* - * Reads an msr value (of 'msr_index') into 'pdata'. - * Returns 0 on success, non-0 otherwise. - * Assumes vcpu_load() was already called. - */ -int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) -{ - return kvm_x86_ops->get_msr(vcpu, msr); -} -EXPORT_SYMBOL_GPL(kvm_get_msr); - static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { u64 data; @@ -3506,8 +3569,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; - if (kvm_x86_ops->setup_mce) - kvm_x86_ops->setup_mce(vcpu); + kvm_x86_ops->setup_mce(vcpu); out: return r; } @@ -5377,7 +5439,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, */ if (vcpu_match_mmio_gva(vcpu, gva) && !permission_fault(vcpu, vcpu->arch.walk_mmu, - vcpu->arch.access, 0, access)) { + vcpu->arch.mmio_access, 0, access)) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); @@ -5971,28 +6033,13 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata) { - struct msr_data msr; - int r; - - msr.index = msr_index; - msr.host_initiated = false; - r = kvm_get_msr(emul_to_vcpu(ctxt), &msr); - if (r) - return r; - - *pdata = msr.data; - return 0; + return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); } static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data) { - struct msr_data msr; - - msr.data = data; - msr.index = msr_index; - msr.host_initiated = false; - return kvm_set_msr(emul_to_vcpu(ctxt), &msr); + return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); } static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt) @@ -6075,6 +6122,11 @@ static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt) kvm_smm_changed(emul_to_vcpu(ctxt)); } +static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr) +{ + return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr); +} + static const struct x86_emulate_ops emulate_ops = { .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, @@ -6116,6 +6168,7 @@ static const struct x86_emulate_ops emulate_ops = { .set_hflags = emulator_set_hflags, .pre_leave_smm = emulator_pre_leave_smm, .post_leave_smm = emulator_post_leave_smm, + .set_xcr = emulator_set_xcr, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) @@ -6390,9 +6443,11 @@ static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r) int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) { unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); - int r = EMULATE_DONE; + int r; - kvm_x86_ops->skip_emulated_instruction(vcpu); + r = kvm_x86_ops->skip_emulated_instruction(vcpu); + if (unlikely(r != EMULATE_DONE)) + return 0; /* * rflags is the old, "raw" value of the flags. The new value has @@ -6528,8 +6583,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, emulation_type)) return EMULATE_DONE; - if (ctxt->have_exception && inject_emulated_exception(vcpu)) + if (ctxt->have_exception) { + /* + * #UD should result in just EMULATION_FAILED, and trap-like + * exception should not be encountered during decode. + */ + WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR || + exception_type(ctxt->exception.vector) == EXCPT_TRAP); + inject_emulated_exception(vcpu); return EMULATE_DONE; + } if (emulation_type & EMULTYPE_SKIP) return EMULATE_FAIL; return handle_emulation_failure(vcpu, emulation_type); @@ -6544,6 +6607,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, kvm_rip_write(vcpu, ctxt->_eip); if (ctxt->eflags & X86_EFLAGS_RF) kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF); + kvm_x86_ops->set_interrupt_shadow(vcpu, 0); return EMULATE_DONE; } @@ -9322,10 +9386,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_page_track_init(kvm); kvm_mmu_init_vm(kvm); - if (kvm_x86_ops->vm_init) - return kvm_x86_ops->vm_init(kvm); - - return 0; + return kvm_x86_ops->vm_init(kvm); } static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) @@ -10017,7 +10078,7 @@ EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); bool kvm_arch_has_irq_bypass(void) { - return kvm_x86_ops->update_pi_irte != NULL; + return true; } int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, @@ -10057,9 +10118,6 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set) { - if (!kvm_x86_ops->update_pi_irte) - return -EINVAL; - return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set); } @@ -10086,11 +10144,12 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter_failed); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 6594020c0691..b5274e2a53cf 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -196,7 +196,7 @@ static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, * actually a nGPA. */ vcpu->arch.mmio_gva = mmu_is_nested(vcpu) ? 0 : gva & PAGE_MASK; - vcpu->arch.access = access; + vcpu->arch.mmio_access = access; vcpu->arch.mmio_gfn = gfn; vcpu->arch.mmio_gen = gen; } |