diff options
35 files changed, 490 insertions, 228 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index bb8cfddbb22d..a4267104db50 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -3268,6 +3268,7 @@ number. :Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device, KVM_CAP_VCPU_ATTRIBUTES for vcpu device + KVM_CAP_SYS_ATTRIBUTES for system (/dev/kvm) device (no set) :Type: device ioctl, vm ioctl, vcpu ioctl :Parameters: struct kvm_device_attr :Returns: 0 on success, -1 on error @@ -3302,7 +3303,8 @@ transferred is defined by the particular attribute. ------------------------ :Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device, - KVM_CAP_VCPU_ATTRIBUTES for vcpu device + KVM_CAP_VCPU_ATTRIBUTES for vcpu device + KVM_CAP_SYS_ATTRIBUTES for system (/dev/kvm) device :Type: device ioctl, vm ioctl, vcpu ioctl :Parameters: struct kvm_device_attr :Returns: 0 on success, -1 on error diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c index 0418399e0a20..c5d009715402 100644 --- a/arch/arm64/kvm/hyp/exception.c +++ b/arch/arm64/kvm/hyp/exception.c @@ -38,7 +38,10 @@ static inline void __vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, u64 val) { - write_sysreg_el1(val, SYS_SPSR); + if (has_vhe()) + write_sysreg_el1(val, SYS_SPSR); + else + __vcpu_sys_reg(vcpu, SPSR_EL1) = val; } static void __vcpu_write_spsr_abt(struct kvm_vcpu *vcpu, u64 val) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 844a6f003fd5..2cb3867eb7c2 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -983,13 +983,9 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, */ stage2_put_pte(ptep, mmu, addr, level, mm_ops); - if (need_flush) { - kvm_pte_t *pte_follow = kvm_pte_follow(pte, mm_ops); - - dcache_clean_inval_poc((unsigned long)pte_follow, - (unsigned long)pte_follow + - kvm_granule_size(level)); - } + if (need_flush && mm_ops->dcache_clean_inval_poc) + mm_ops->dcache_clean_inval_poc(kvm_pte_follow(pte, mm_ops), + kvm_granule_size(level)); if (childp) mm_ops->put_page(childp); @@ -1151,15 +1147,13 @@ static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct kvm_pgtable *pgt = arg; struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; kvm_pte_t pte = *ptep; - kvm_pte_t *pte_follow; if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pgt, pte)) return 0; - pte_follow = kvm_pte_follow(pte, mm_ops); - dcache_clean_inval_poc((unsigned long)pte_follow, - (unsigned long)pte_follow + - kvm_granule_size(level)); + if (mm_ops->dcache_clean_inval_poc) + mm_ops->dcache_clean_inval_poc(kvm_pte_follow(pte, mm_ops), + kvm_granule_size(level)); return 0; } diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 20db2f281cf2..4fb419f7b8b6 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -983,6 +983,9 @@ static void __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT; /* IDbits */ val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT; + /* SEIS */ + if (kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) + val |= BIT(ICC_CTLR_EL1_SEIS_SHIFT); /* A3V */ val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT; /* EOImode */ diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index a33d4366b326..b549af8b1dc2 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -609,6 +609,18 @@ static int __init early_gicv4_enable(char *buf) } early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable); +static const struct midr_range broken_seis[] = { + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM), + {}, +}; + +static bool vgic_v3_broken_seis(void) +{ + return ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) && + is_midr_in_range_list(read_cpuid_id(), broken_seis)); +} + /** * vgic_v3_probe - probe for a VGICv3 compatible interrupt controller * @info: pointer to the GIC description @@ -676,9 +688,10 @@ int vgic_v3_probe(const struct gic_kvm_info *info) group1_trap = true; } - if (kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) { - kvm_info("GICv3 with locally generated SEI\n"); + if (vgic_v3_broken_seis()) { + kvm_info("GICv3 with broken locally generated SEI\n"); + kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_SEIS_MASK; group0_trap = true; group1_trap = true; if (ich_vtr_el2 & ICH_VTR_TDS_MASK) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1384517d7709..6e7c545bc7ee 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1483,7 +1483,8 @@ struct kvm_x86_ops { int (*get_msr_feature)(struct kvm_msr_entry *entry); - bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, void *insn, int insn_len); + bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len); bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu); @@ -1496,6 +1497,7 @@ struct kvm_x86_ops { }; struct kvm_x86_nested_ops { + void (*leave_nested)(struct kvm_vcpu *vcpu); int (*check_events)(struct kvm_vcpu *vcpu); bool (*hv_timer_pending)(struct kvm_vcpu *vcpu); void (*triple_fault)(struct kvm_vcpu *vcpu); @@ -1861,7 +1863,6 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); -void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, unsigned long ipi_bitmap_high, u32 min, diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 2da3316bb559..bf6e96011dfe 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -452,6 +452,9 @@ struct kvm_sync_regs { #define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001 +/* attributes for system fd (group 0) */ +#define KVM_X86_XCOMP_GUEST_SUPP 0 + struct kvm_vmx_nested_state_data { __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 3902c28fb6cb..28be02adc669 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -133,6 +133,7 @@ static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 orig = &vcpu->arch.cpuid_entries[i]; if (e2[i].function != orig->function || e2[i].index != orig->index || + e2[i].flags != orig->flags || e2[i].eax != orig->eax || e2[i].ebx != orig->ebx || e2[i].ecx != orig->ecx || e2[i].edx != orig->edx) return -EINVAL; @@ -196,10 +197,26 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) vcpu->arch.pv_cpuid.features = best->eax; } +/* + * Calculate guest's supported XCR0 taking into account guest CPUID data and + * supported_xcr0 (comprised of host configuration and KVM_SUPPORTED_XCR0). + */ +static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent) +{ + struct kvm_cpuid_entry2 *best; + + best = cpuid_entry2_find(entries, nent, 0xd, 0); + if (!best) + return 0; + + return (best->eax | ((u64)best->edx << 32)) & supported_xcr0; +} + static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries, int nent) { struct kvm_cpuid_entry2 *best; + u64 guest_supported_xcr0 = cpuid_get_supported_xcr0(entries, nent); best = cpuid_entry2_find(entries, nent, 1, 0); if (best) { @@ -238,6 +255,21 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT); } + + /* + * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate + * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's + * requested XCR0 value. The enclave's XFRM must be a subset of XCRO + * at the time of EENTER, thus adjust the allowed XFRM by the guest's + * supported XCR0. Similar to XCR0 handling, FP and SSE are forced to + * '1' even on CPUs that don't support XSAVE. + */ + best = cpuid_entry2_find(entries, nent, 0x12, 0x1); + if (best) { + best->ecx &= guest_supported_xcr0 & 0xffffffff; + best->edx &= guest_supported_xcr0 >> 32; + best->ecx |= XFEATURE_MASK_FPSSE; + } } void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) @@ -261,27 +293,8 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_apic_set_version(vcpu); } - best = kvm_find_cpuid_entry(vcpu, 0xD, 0); - if (!best) - vcpu->arch.guest_supported_xcr0 = 0; - else - vcpu->arch.guest_supported_xcr0 = - (best->eax | ((u64)best->edx << 32)) & supported_xcr0; - - /* - * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate - * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's - * requested XCR0 value. The enclave's XFRM must be a subset of XCRO - * at the time of EENTER, thus adjust the allowed XFRM by the guest's - * supported XCR0. Similar to XCR0 handling, FP and SSE are forced to - * '1' even on CPUs that don't support XSAVE. - */ - best = kvm_find_cpuid_entry(vcpu, 0x12, 0x1); - if (best) { - best->ecx &= vcpu->arch.guest_supported_xcr0 & 0xffffffff; - best->edx &= vcpu->arch.guest_supported_xcr0 >> 32; - best->ecx |= XFEATURE_MASK_FPSSE; - } + vcpu->arch.guest_supported_xcr0 = + cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent); kvm_update_pv_runtime(vcpu); @@ -346,8 +359,14 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, * KVM_SET_CPUID{,2} again. To support this legacy behavior, check * whether the supplied CPUID data is equal to what's already set. */ - if (vcpu->arch.last_vmentry_cpu != -1) - return kvm_cpuid_check_equal(vcpu, e2, nent); + if (vcpu->arch.last_vmentry_cpu != -1) { + r = kvm_cpuid_check_equal(vcpu, e2, nent); + if (r) + return r; + + kvfree(e2); + return 0; + } r = kvm_check_cpuid(vcpu, e2, nent); if (r) @@ -887,13 +906,14 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) } break; case 0xd: { - u64 guest_perm = xstate_get_guest_group_perm(); + u64 permitted_xcr0 = supported_xcr0 & xstate_get_guest_group_perm(); + u64 permitted_xss = supported_xss; - entry->eax &= supported_xcr0 & guest_perm; - entry->ebx = xstate_required_size(supported_xcr0, false); + entry->eax &= permitted_xcr0; + entry->ebx = xstate_required_size(permitted_xcr0, false); entry->ecx = entry->ebx; - entry->edx &= (supported_xcr0 & guest_perm) >> 32; - if (!supported_xcr0) + entry->edx &= permitted_xcr0 >> 32; + if (!permitted_xcr0) break; entry = do_host_cpuid(array, function, 1); @@ -902,20 +922,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) cpuid_entry_override(entry, CPUID_D_1_EAX); if (entry->eax & (F(XSAVES)|F(XSAVEC))) - entry->ebx = xstate_required_size(supported_xcr0 | supported_xss, + entry->ebx = xstate_required_size(permitted_xcr0 | permitted_xss, true); else { - WARN_ON_ONCE(supported_xss != 0); + WARN_ON_ONCE(permitted_xss != 0); entry->ebx = 0; } - entry->ecx &= supported_xss; - entry->edx &= supported_xss >> 32; + entry->ecx &= permitted_xss; + entry->edx &= permitted_xss >> 32; for (i = 2; i < 64; ++i) { bool s_state; - if (supported_xcr0 & BIT_ULL(i)) + if (permitted_xcr0 & BIT_ULL(i)) s_state = false; - else if (supported_xss & BIT_ULL(i)) + else if (permitted_xss & BIT_ULL(i)) s_state = true; else continue; @@ -929,7 +949,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) * invalid sub-leafs. Only valid sub-leafs should * reach this point, and they should have a non-zero * save state size. Furthermore, check whether the - * processor agrees with supported_xcr0/supported_xss + * processor agrees with permitted_xcr0/permitted_xss * on whether this is an XCR0- or IA32_XSS-managed area. */ if (WARN_ON_ONCE(!entry->eax || (entry->ecx & 0x1) != s_state)) { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index baca9fa37a91..4662469240bc 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2629,7 +2629,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) kvm_apic_set_version(vcpu); apic_update_ppr(apic); - hrtimer_cancel(&apic->lapic_timer.timer); + cancel_apic_timer(apic); apic->lapic_timer.expired_tscdeadline = 0; apic_update_lvtt(apic); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index cf206855ebf0..1218b5a342fc 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -983,9 +983,9 @@ void svm_free_nested(struct vcpu_svm *svm) /* * Forcibly leave nested mode in order to be able to reset the VCPU later on. */ -void svm_leave_nested(struct vcpu_svm *svm) +void svm_leave_nested(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = &svm->vcpu; + struct vcpu_svm *svm = to_svm(vcpu); if (is_guest_mode(vcpu)) { svm->nested.nested_run_pending = 0; @@ -1411,7 +1411,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) { - svm_leave_nested(svm); + svm_leave_nested(vcpu); svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); return 0; } @@ -1478,7 +1478,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, */ if (is_guest_mode(vcpu)) - svm_leave_nested(svm); + svm_leave_nested(vcpu); else svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save; @@ -1532,6 +1532,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) } struct kvm_x86_nested_ops svm_nested_ops = { + .leave_nested = svm_leave_nested, .check_events = svm_check_nested_events, .triple_fault = nested_svm_triple_fault, .get_nested_state_pages = svm_get_nested_state_pages, diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 6a22798eaaee..17b53457d866 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2100,8 +2100,13 @@ void __init sev_hardware_setup(void) if (!sev_enabled || !npt_enabled) goto out; - /* Does the CPU support SEV? */ - if (!boot_cpu_has(X86_FEATURE_SEV)) + /* + * SEV must obviously be supported in hardware. Sanity check that the + * CPU supports decode assists, which is mandatory for SEV guests to + * support instruction emulation. + */ + if (!boot_cpu_has(X86_FEATURE_SEV) || + WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS))) goto out; /* Retrieve SEV CPUID information */ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 2c99b18d76c0..6d97629655e3 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -290,7 +290,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) { if (!(efer & EFER_SVME)) { - svm_leave_nested(svm); + svm_leave_nested(vcpu); svm_set_gif(svm, true); /* #GP intercept is still needed for vmware backdoor */ if (!enable_vmware_backdoor) @@ -312,7 +312,11 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) return ret; } - if (svm_gp_erratum_intercept) + /* + * Never intercept #GP for SEV guests, KVM can't + * decrypt guest memory to workaround the erratum. + */ + if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm)) set_exception_intercept(svm, GP_VECTOR); } } @@ -1010,9 +1014,10 @@ static void init_vmcb(struct kvm_vcpu *vcpu) * Guest access to VMware backdoor ports could legitimately * trigger #GP because of TSS I/O permission bitmap. * We intercept those #GP and allow access to them anyway - * as VMware does. + * as VMware does. Don't intercept #GP for SEV guests as KVM can't + * decrypt guest memory to decode the faulting instruction. */ - if (enable_vmware_backdoor) + if (enable_vmware_backdoor && !sev_guest(vcpu->kvm)) set_exception_intercept(svm, GP_VECTOR); svm_set_intercept(svm, INTERCEPT_INTR); @@ -2091,10 +2096,6 @@ static int gp_interception(struct kvm_vcpu *vcpu) if (error_code) goto reinject; - /* All SVM instructions expect page aligned RAX */ - if (svm->vmcb->save.rax & ~PAGE_MASK) - goto reinject; - /* Decode the instruction for usage later */ if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK) goto reinject; @@ -2112,8 +2113,13 @@ static int gp_interception(struct kvm_vcpu *vcpu) if (!is_guest_mode(vcpu)) return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE); - } else + } else { + /* All SVM instructions expect page aligned RAX */ + if (svm->vmcb->save.rax & ~PAGE_MASK) + goto reinject; + return emulate_svm_instr(vcpu, opcode); + } reinject: kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); @@ -4252,79 +4258,140 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu) } } -static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len) +static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) { bool smep, smap, is_user; unsigned long cr4; + u64 error_code; + + /* Emulation is always possible when KVM has access to all guest state. */ + if (!sev_guest(vcpu->kvm)) + return true; + + /* #UD and #GP should never be intercepted for SEV guests. */ + WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD | + EMULTYPE_TRAP_UD_FORCED | + EMULTYPE_VMWARE_GP)); /* - * When the guest is an SEV-ES guest, emulation is not possible. + * Emulation is impossible for SEV-ES guests as KVM doesn't have access + * to guest register state. */ if (sev_es_guest(vcpu->kvm)) return false; /* + * Emulation is possible if the instruction is already decoded, e.g. + * when completing I/O after returning from userspace. + */ + if (emul_type & EMULTYPE_NO_DECODE) + return true; + + /* + * Emulation is possible for SEV guests if and only if a prefilled + * buffer containing the bytes of the intercepted instruction is + * available. SEV guest memory is encrypted with a guest specific key + * and cannot be decrypted by KVM, i.e. KVM would read cyphertext and + * decode garbage. + * + * Inject #UD if KVM reached this point without an instruction buffer. + * In practice, this path should never be hit by a well-behaved guest, + * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path + * is still theoretically reachable, e.g. via unaccelerated fault-like + * AVIC access, and needs to be handled by KVM to avoid putting the + * guest into an infinite loop. Injecting #UD is somewhat arbitrary, + * but its the least awful option given lack of insight into the guest. + */ + if (unlikely(!insn)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return false; + } + + /* + * Emulate for SEV guests if the insn buffer is not empty. The buffer + * will be empty if the DecodeAssist microcode cannot fetch bytes for + * the faulting instruction because the code fetch itself faulted, e.g. + * the guest attempted to fetch from emulated MMIO or a guest page + * table used to translate CS:RIP resides in emulated MMIO. + */ + if (likely(insn_len)) + return true; + + /* * Detect and workaround Errata 1096 Fam_17h_00_0Fh. * * Errata: - * When CPU raise #NPF on guest data access and vCPU CR4.SMAP=1, it is - * possible that CPU microcode implementing DecodeAssist will fail - * to read bytes of instruction which caused #NPF. In this case, - * GuestIntrBytes field of the VMCB on a VMEXIT will incorrectly - * return 0 instead of the correct guest instruction bytes. - * - * This happens because CPU microcode reading instruction bytes - * uses a special opcode which attempts to read data using CPL=0 - * privileges. The microcode reads CS:RIP and if it hits a SMAP - * fault, it gives up and returns no instruction bytes. + * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is + * possible that CPU microcode implementing DecodeAssist will fail to + * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly + * be '0'. This happens because microcode reads CS:RIP using a _data_ + * loap uop with CPL=0 privileges. If the load hits a SMAP #PF, ucode + * gives up and does not fill the instruction bytes buffer. * - * Detection: - * We reach here in case CPU supports DecodeAssist, raised #NPF and - * returned 0 in GuestIntrBytes field of the VMCB. - * First, errata can only be triggered in case vCPU CR4.SMAP=1. - * Second, if vCPU CR4.SMEP=1, errata could only be triggered - * in case vCPU CPL==3 (Because otherwise guest would have triggered - * a SMEP fault instead of #NPF). - * Otherwise, vCPU CR4.SMEP=0, errata could be triggered by any vCPU CPL. - * As most guests enable SMAP if they have also enabled SMEP, use above - * logic in order to attempt minimize false-positive of detecting errata - * while still preserving all cases semantic correctness. + * As above, KVM reaches this point iff the VM is an SEV guest, the CPU + * supports DecodeAssist, a #NPF was raised, KVM's page fault handler + * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the + * GuestIntrBytes field of the VMCB. * - * Workaround: - * To determine what instruction the guest was executing, the hypervisor - * will have to decode the instruction at the instruction pointer. + * This does _not_ mean that the erratum has been encountered, as the + * DecodeAssist will also fail if the load for CS:RIP hits a legitimate + * #PF, e.g. if the guest attempt to execute from emulated MMIO and + * encountered a reserved/not-present #PF. * - * In non SEV guest, hypervisor will be able to read the guest - * memory to decode the instruction pointer when insn_len is zero - * so we return true to indicate that decoding is possible. + * To hit the erratum, the following conditions must be true: + * 1. CR4.SMAP=1 (obviously). + * 2. CR4.SMEP=0 || CPL=3. If SMEP=1 and CPL<3, the erratum cannot + * have been hit as the guest would have encountered a SMEP + * violation #PF, not a #NPF. + * 3. The #NPF is not due to a code fetch, in which case failure to + * retrieve the instruction bytes is legitimate (see abvoe). * - * But in the SEV guest, the guest memory is encrypted with the - * guest specific key and hypervisor will not be able to decode the - * instruction pointer so we will not able to workaround it. Lets - * print the error and request to kill the guest. + * In addition, don't apply the erratum workaround if the #NPF occurred + * while translating guest page tables (see below). */ - if (likely(!insn || insn_len)) - return true; - - /* - * If RIP is invalid, go ahead with emulation which will cause an - * internal error exit. - */ - if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT)) - return true; + error_code = to_svm(vcpu)->vmcb->control.exit_info_1; + if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK)) + goto resume_guest; cr4 = kvm_read_cr4(vcpu); smep = cr4 & X86_CR4_SMEP; smap = cr4 & X86_CR4_SMAP; is_user = svm_get_cpl(vcpu) == 3; if (smap && (!smep || is_user)) { - if (!sev_guest(vcpu->kvm)) - return true; - pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n"); - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + + /* + * If the fault occurred in userspace, arbitrarily inject #GP + * to avoid killing the guest and to hopefully avoid confusing + * the guest kernel too much, e.g. injecting #PF would not be + * coherent with respect to the guest's page tables. Request + * triple fault if the fault occurred in the kernel as there's + * no fault that KVM can inject without confusing the guest. + * In practice, the triple fault is moot as no sane SEV kernel + * will execute from user memory while also running with SMAP=1. + */ + if (is_user) + kvm_inject_gp(vcpu, 0); + else + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); } +resume_guest: + /* + * If the erratum was not hit, simply resume the guest and let it fault + * again. While awful, e.g. the vCPU may get stuck in an infinite loop + * if the fault is at CPL=0, it's the lesser of all evils. Exiting to + * userspace will kill the guest, and letting the emulator read garbage + * will yield random behavior and potentially corrupt the guest. + * + * Simply resuming the guest is technically not a violation of the SEV + * architecture. AMD's APM states that all code fetches and page table + * accesses for SEV guest are encrypted, regardless of the C-Bit. The + * APM also states that encrypted accesses to MMIO are "ignored", but + * doesn't explicitly define "ignored", i.e. doing nothing and letting + * the guest spin is technically "ignoring" the access. + */ return false; } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 47ef8f4a9358..73525353e424 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -304,11 +304,6 @@ static inline void vmcb_mark_all_clean(struct vmcb *vmcb) & ~VMCB_ALWAYS_DIRTY_MASK; } -static inline bool vmcb_is_clean(struct vmcb *vmcb, int bit) -{ - return (vmcb->control.clean & (1 << bit)); -} - static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit) { vmcb->control.clean &= ~(1 << bit); @@ -525,7 +520,7 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm) int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun); -void svm_leave_nested(struct vcpu_svm *svm); +void svm_leave_nested(struct kvm_vcpu *vcpu); void svm_free_nested(struct vcpu_svm *svm); int svm_allocate_nested(struct vcpu_svm *svm); int nested_svm_vmrun(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index c53b8bf8d013..489ca56212c6 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -46,6 +46,9 @@ static inline void svm_hv_init_vmcb(struct vmcb *vmcb) if (npt_enabled && ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) hve->hv_enlightenments_control.enlightened_npt_tlb = 1; + + if (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP) + hve->hv_enlightenments_control.msr_bitmap = 1; } static inline void svm_hv_hardware_setup(void) @@ -83,14 +86,7 @@ static inline void svm_hv_vmcb_dirty_nested_enlightenments( struct hv_enlightenments *hve = (struct hv_enlightenments *)vmcb->control.reserved_sw; - /* - * vmcb can be NULL if called during early vcpu init. - * And its okay not to mark vmcb dirty during vcpu init - * as we mark it dirty unconditionally towards end of vcpu - * init phase. - */ - if (vmcb_is_clean(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS) && - hve->hv_enlightenments_control.msr_bitmap) + if (hve->hv_enlightenments_control.msr_bitmap) vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS); } diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 959b59d13b5a..3f430e218375 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -54,7 +54,6 @@ struct nested_vmx_msrs { struct vmcs_config { int size; - int order; u32 basic_cap; u32 revision_id; u32 pin_based_exec_ctrl; diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index ba6f99f584ac..87e3dc10edf4 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -12,8 +12,6 @@ DEFINE_STATIC_KEY_FALSE(enable_evmcs); -#if IS_ENABLED(CONFIG_HYPERV) - #define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) #define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ {EVMCS1_OFFSET(name), clean_field} @@ -296,6 +294,7 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = { }; const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1); +#if IS_ENABLED(CONFIG_HYPERV) __init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) { vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL; @@ -362,6 +361,7 @@ void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata) case MSR_IA32_VMX_PROCBASED_CTLS2: ctl_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC; break; + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_PINBASED_CTLS: ctl_high &= ~EVMCS1_UNSUPPORTED_PINCTRL; break; diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 16731d2cf231..8d70f9aea94b 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -59,12 +59,12 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs); SECONDARY_EXEC_SHADOW_VMCS | \ SECONDARY_EXEC_TSC_SCALING | \ SECONDARY_EXEC_PAUSE_LOOP_EXITING) -#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) +#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL \ + (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \ + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) #define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) #define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING) -#if IS_ENABLED(CONFIG_HYPERV) - struct evmcs_field { u16 offset; u16 clean_field; @@ -73,26 +73,56 @@ struct evmcs_field { extern const struct evmcs_field vmcs_field_to_evmcs_1[]; extern const unsigned int nr_evmcs_1_fields; -static __always_inline int get_evmcs_offset(unsigned long field, - u16 *clean_field) +static __always_inline int evmcs_field_offset(unsigned long field, + u16 *clean_field) { unsigned int index = ROL16(field, 6); const struct evmcs_field *evmcs_field; - if (unlikely(index >= nr_evmcs_1_fields)) { - WARN_ONCE(1, "KVM: accessing unsupported EVMCS field %lx\n", - field); + if (unlikely(index >= nr_evmcs_1_fields)) return -ENOENT; - } evmcs_field = &vmcs_field_to_evmcs_1[index]; + /* + * Use offset=0 to detect holes in eVMCS. This offset belongs to + * 'revision_id' but this field has no encoding and is supposed to + * be accessed directly. + */ + if (unlikely(!evmcs_field->offset)) + return -ENOENT; + if (clean_field) *clean_field = evmcs_field->clean_field; return evmcs_field->offset; } +static inline u64 evmcs_read_any(struct hv_enlightened_vmcs *evmcs, + unsigned long field, u16 offset) +{ + /* + * vmcs12_read_any() doesn't care whether the supplied structure + * is 'struct vmcs12' or 'struct hv_enlightened_vmcs' as it takes + * the exact offset of the required field, use it for convenience + * here. + */ + return vmcs12_read_any((void *)evmcs, field, offset); +} + +#if IS_ENABLED(CONFIG_HYPERV) + +static __always_inline int get_evmcs_offset(unsigned long field, + u16 *clean_field) +{ + int offset = evmcs_field_offset(field, clean_field); + + WARN_ONCE(offset < 0, "KVM: accessing unsupported EVMCS field %lx\n", + field); + + return offset; +} + static __always_inline void evmcs_write64(unsigned long field, u64 value) { u16 clean_field; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index f235f77cbc03..ba34e94049c7 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -7,6 +7,7 @@ #include <asm/mmu_context.h> #include "cpuid.h" +#include "evmcs.h" #include "hyperv.h" #include "mmu.h" #include "nested.h" @@ -4851,18 +4852,20 @@ static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; /* - * We should allocate a shadow vmcs for vmcs01 only when L1 - * executes VMXON and free it when L1 executes VMXOFF. - * As it is invalid to execute VMXON twice, we shouldn't reach - * here when vmcs01 already have an allocated shadow vmcs. + * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it + * when L1 executes VMXOFF or the vCPU is forced out of nested + * operation. VMXON faults if the CPU is already post-VMXON, so it + * should be impossible to already have an allocated shadow VMCS. KVM + * doesn't support virtualization of VMCS shadowing, so vmcs01 should + * always be the loaded VMCS. */ - WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs); + if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs)) + return loaded_vmcs->shadow_vmcs; + + loaded_vmcs->shadow_vmcs = alloc_vmcs(true); + if (loaded_vmcs->shadow_vmcs) + vmcs_clear(loaded_vmcs->shadow_vmcs); - if (!loaded_vmcs->shadow_vmcs) { - loaded_vmcs->shadow_vmcs = alloc_vmcs(true); - if (loaded_vmcs->shadow_vmcs) - vmcs_clear(loaded_vmcs->shadow_vmcs); - } return loaded_vmcs->shadow_vmcs; } @@ -5099,27 +5102,49 @@ static int handle_vmread(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; - /* - * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, - * any VMREAD sets the ALU flags for VMfailInvalid. - */ - if (vmx->nested.current_vmptr == INVALID_GPA || - (is_guest_mode(vcpu) && - get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) - return nested_vmx_failInvalid(vcpu); - /* Decode instruction info and find the field to read */ field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); - offset = vmcs_field_to_offset(field); - if (offset < 0) - return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); + if (!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { + /* + * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, + * any VMREAD sets the ALU flags for VMfailInvalid. + */ + if (vmx->nested.current_vmptr == INVALID_GPA || + (is_guest_mode(vcpu) && + get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) + return nested_vmx_failInvalid(vcpu); - if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) - copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); + offset = get_vmcs12_field_offset(field); + if (offset < 0) + return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); + + if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) + copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); - /* Read the field, zero-extended to a u64 value */ - value = vmcs12_read_any(vmcs12, field, offset); + /* Read the field, zero-extended to a u64 value */ + value = vmcs12_read_any(vmcs12, field, offset); + } else { + /* + * Hyper-V TLFS (as of 6.0b) explicitly states, that while an + * enlightened VMCS is active VMREAD/VMWRITE instructions are + * unsupported. Unfortunately, certain versions of Windows 11 + * don't comply with this requirement which is not enforced in + * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a + * workaround, as misbehaving guests will panic on VM-Fail. + * Note, enlightened VMCS is incompatible with shadow VMCS so + * all VMREADs from L2 should go to L1. + */ + if (WARN_ON_ONCE(is_guest_mode(vcpu))) + return nested_vmx_failInvalid(vcpu); + + offset = evmcs_field_offset(field, NULL); + if (offset < 0) + return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); + + /* Read the field, zero-extended to a u64 value */ + value = evmcs_read_any(vmx->nested.hv_evmcs, field, offset); + } /* * Now copy part of this value to register or memory, as requested. @@ -5214,7 +5239,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); - offset = vmcs_field_to_offset(field); + offset = get_vmcs12_field_offset(field); if (offset < 0) return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); @@ -6462,7 +6487,7 @@ static u64 nested_vmx_calc_vmcs_enum_msr(void) max_idx = 0; for (i = 0; i < nr_vmcs12_fields; i++) { /* The vmcs12 table is very, very sparsely populated. */ - if (!vmcs_field_to_offset_table[i]) + if (!vmcs12_field_offsets[i]) continue; idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i)); @@ -6771,6 +6796,7 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) } struct kvm_x86_nested_ops vmx_nested_ops = { + .leave_nested = vmx_leave_nested, .check_events = vmx_check_nested_events, .hv_timer_pending = nested_vmx_preemption_timer_pending, .triple_fault = nested_vmx_triple_fault, diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index cab6ba7a5005..2251b60920f8 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -8,7 +8,7 @@ FIELD(number, name), \ [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32) -const unsigned short vmcs_field_to_offset_table[] = { +const unsigned short vmcs12_field_offsets[] = { FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), FIELD(POSTED_INTR_NV, posted_intr_nv), FIELD(GUEST_ES_SELECTOR, guest_es_selector), @@ -151,4 +151,4 @@ const unsigned short vmcs_field_to_offset_table[] = { FIELD(HOST_RSP, host_rsp), FIELD(HOST_RIP, host_rip), }; -const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs_field_to_offset_table); +const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs12_field_offsets); diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 2a45f026ee11..746129ddd5ae 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -361,10 +361,10 @@ static inline void vmx_check_vmcs12_offsets(void) CHECK_OFFSET(guest_pml_index, 996); } -extern const unsigned short vmcs_field_to_offset_table[]; +extern const unsigned short vmcs12_field_offsets[]; extern const unsigned int nr_vmcs12_fields; -static inline short vmcs_field_to_offset(unsigned long field) +static inline short get_vmcs12_field_offset(unsigned long field) { unsigned short offset; unsigned int index; @@ -377,7 +377,7 @@ static inline short vmcs_field_to_offset(unsigned long field) return -ENOENT; index = array_index_nospec(index, nr_vmcs12_fields); - offset = vmcs_field_to_offset_table[index]; + offset = vmcs12_field_offsets[index]; if (offset == 0) return -ENOENT; return offset; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4ac676066d60..aca3ae2a02f3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1487,11 +1487,12 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) return 0; } -static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len) +static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) { /* * Emulation of instructions in SGX enclaves is impossible as RIP does - * not point tthe failing instruction, and even if it did, the code + * not point at the failing instruction, and even if it did, the code * stream is inaccessible. Inject #UD instead of exiting to userspace * so that guest userspace can't DoS the guest simply by triggering * emulation (enclaves are CPL3 only). @@ -2603,7 +2604,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, return -EIO; vmcs_conf->size = vmx_msr_high & 0x1fff; - vmcs_conf->order = get_order(vmcs_conf->size); vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; vmcs_conf->revision_id = vmx_msr_low; @@ -2628,7 +2628,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) struct page *pages; struct vmcs *vmcs; - pages = __alloc_pages_node(node, flags, vmcs_config.order); + pages = __alloc_pages_node(node, flags, 0); if (!pages) return NULL; vmcs = page_address(pages); @@ -2647,7 +2647,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) void free_vmcs(struct vmcs *vmcs) { - free_pages((unsigned long)vmcs, vmcs_config.order); + free_page((unsigned long)vmcs); } /* @@ -4094,10 +4094,14 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) vmcs_write32(HOST_IA32_SYSENTER_CS, low32); /* - * If 32-bit syscall is enabled, vmx_vcpu_load_vcms rewrites - * HOST_IA32_SYSENTER_ESP. + * SYSENTER is used for 32-bit system calls on either 32-bit or + * 64-bit kernels. It is always zero If neither is allowed, otherwise + * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may + * have already done so!). */ - vmcs_writel(HOST_IA32_SYSENTER_ESP, 0); + if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32)) + vmcs_writel(HOST_IA32_SYSENTER_ESP, 0); + rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ @@ -4901,8 +4905,33 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) dr6 = vmx_get_exit_qual(vcpu); if (!(vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { + /* + * If the #DB was due to ICEBP, a.k.a. INT1, skip the + * instruction. ICEBP generates a trap-like #DB, but + * despite its interception control being tied to #DB, + * is an instruction intercept, i.e. the VM-Exit occurs + * on the ICEBP itself. Note, skipping ICEBP also + * clears STI and MOVSS blocking. + * + * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS + * if single-step is enabled in RFLAGS and STI or MOVSS + * blocking is active, as the CPU doesn't set the bit + * on VM-Exit due to #DB interception. VM-Entry has a + * consistency check that a single-step #DB is pending + * in this scenario as the previous instruction cannot + * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV + * don't modify RFLAGS), therefore the one instruction + * delay when activating single-step breakpoints must + * have already expired. Note, the CPU sets/clears BS + * as appropriate for all other VM-Exits types. + */ if (is_icebp(intr_info)) WARN_ON(!skip_emulated_instruction(vcpu)); + else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) && + (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS))) + vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, + vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS); kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); return 1; @@ -5397,7 +5426,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) { gpa_t gpa; - if (!vmx_can_emulate_instruction(vcpu, NULL, 0)) + if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0)) return 1; /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9e43d756312f..74b53a16f38a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3535,6 +3535,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & ~supported_xss) return 1; vcpu->arch.ia32_xss = data; + kvm_update_cpuid_runtime(vcpu); break; case MSR_SMI_COUNT: if (!msr_info->host_initiated) @@ -4229,6 +4230,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SREGS2: case KVM_CAP_EXIT_ON_EMULATION_FAILURE: case KVM_CAP_VCPU_ATTRIBUTES: + case KVM_CAP_SYS_ATTRIBUTES: r = 1; break; case KVM_CAP_EXIT_HYPERCALL: @@ -4331,7 +4333,49 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; } return r; +} + +static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr) +{ + void __user *uaddr = (void __user*)(unsigned long)attr->addr; + if ((u64)(unsigned long)uaddr != attr->addr) + return ERR_PTR(-EFAULT); + return uaddr; +} + +static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr) +{ + u64 __user *uaddr = kvm_get_attr_addr(attr); + + if (attr->group) + return -ENXIO; + + if (IS_ERR(uaddr)) + return PTR_ERR(uaddr); + + switch (attr->attr) { + case KVM_X86_XCOMP_GUEST_SUPP: + if (put_user(supported_xcr0, uaddr)) + return -EFAULT; + return 0; + default: + return -ENXIO; + break; + } +} + +static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr) +{ + if (attr->group) + return -ENXIO; + + switch (attr->attr) { + case KVM_X86_XCOMP_GUEST_SUPP: + return 0; + default: + return -ENXIO; + } } long kvm_arch_dev_ioctl(struct file *filp, @@ -4422,6 +4466,22 @@ long kvm_arch_dev_ioctl(struct file *filp, case KVM_GET_SUPPORTED_HV_CPUID: r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp); break; + case KVM_GET_DEVICE_ATTR: { + struct kvm_device_attr attr; + r = -EFAULT; + if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) + break; + r = kvm_x86_dev_get_attr(&attr); + break; + } + case KVM_HAS_DEVICE_ATTR: { + struct kvm_device_attr attr; + r = -EFAULT; + if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) + break; + r = kvm_x86_dev_has_attr(&attr); + break; + } default: r = -EINVAL; break; @@ -4860,8 +4920,10 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.apic->sipi_vector = events->sipi_vector; if (events->flags & KVM_VCPUEVENT_VALID_SMM) { - if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) + if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) { + kvm_x86_ops.nested_ops->leave_nested(vcpu); kvm_smm_changed(vcpu, events->smi.smm); + } vcpu->arch.smi_pending = events->smi.pending; @@ -5022,11 +5084,11 @@ static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu, static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { - u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr; + u64 __user *uaddr = kvm_get_attr_addr(attr); int r; - if ((u64)(unsigned long)uaddr != attr->addr) - return -EFAULT; + if (IS_ERR(uaddr)) + return PTR_ERR(uaddr); switch (attr->attr) { case KVM_VCPU_TSC_OFFSET: @@ -5045,12 +5107,12 @@ static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu, static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { - u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr; + u64 __user *uaddr = kvm_get_attr_addr(attr); struct kvm *kvm = vcpu->kvm; int r; - if ((u64)(unsigned long)uaddr != attr->addr) - return -EFAULT; + if (IS_ERR(uaddr)) + return PTR_ERR(uaddr); switch (attr->attr) { case KVM_VCPU_TSC_OFFSET: { @@ -6810,6 +6872,13 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val, } EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); +static int kvm_can_emulate_insn(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) +{ + return static_call(kvm_x86_can_emulate_instruction)(vcpu, emul_type, + insn, insn_len); +} + int handle_ud(struct kvm_vcpu *vcpu) { static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX }; @@ -6817,7 +6886,7 @@ int handle_ud(struct kvm_vcpu *vcpu) char sig[5]; /* ud2; .ascii "kvm" */ struct x86_exception e; - if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, NULL, 0))) + if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0))) return 1; if (force_emulation_prefix && @@ -8193,7 +8262,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, bool writeback = true; bool write_fault_to_spt; - if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, insn, insn_len))) + if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len))) return 1; vcpu->arch.l1tf_flush_l1d = true; @@ -9706,7 +9775,7 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); } -void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) +static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) { if (!lapic_in_kernel(vcpu)) return; @@ -11209,7 +11278,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.msr_misc_features_enables = 0; - vcpu->arch.xcr0 = XFEATURE_MASK_FP; + __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP); + __kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true); } /* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */ @@ -11226,8 +11296,6 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1, 0); kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600); - vcpu->arch.ia32_xss = 0; - static_call(kvm_x86_vcpu_reset)(vcpu, init_event); kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 0e3f7d6e9fd7..bad57535fad0 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -316,10 +316,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) "\tnotq %0\n" "\t" LOCK_PREFIX "andq %0, %2\n" "2:\n" - "\t.section .fixup,\"ax\"\n" - "3:\tjmp\t2b\n" - "\t.previous\n" - _ASM_EXTABLE_UA(1b, 3b) + _ASM_EXTABLE_UA(1b, 2b) : "=r" (evtchn_pending_sel), "+m" (vi->evtchn_pending_sel), "+m" (v->arch.xen.evtchn_pending_sel) @@ -335,10 +332,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) "\tnotl %0\n" "\t" LOCK_PREFIX "andl %0, %2\n" "2:\n" - "\t.section .fixup,\"ax\"\n" - "3:\tjmp\t2b\n" - "\t.previous\n" - _ASM_EXTABLE_UA(1b, 3b) + _ASM_EXTABLE_UA(1b, 2b) : "=r" (evtchn_pending_sel32), "+m" (vi->evtchn_pending_sel), "+m" (v->arch.xen.evtchn_pending_sel) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 9563d294f181..b46bcdb0cab1 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1133,6 +1133,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206 #define KVM_CAP_VM_GPA_BITS 207 #define KVM_CAP_XSAVE2 208 +#define KVM_CAP_SYS_ATTRIBUTES 209 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index 2da3316bb559..bf6e96011dfe 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -452,6 +452,9 @@ struct kvm_sync_regs { #define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001 +/* attributes for system fd (group 0) */ +#define KVM_X86_XCOMP_GUEST_SUPP 0 + struct kvm_vmx_nested_state_data { __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 9563d294f181..b46bcdb0cab1 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1133,6 +1133,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206 #define KVM_CAP_VM_GPA_BITS 207 #define KVM_CAP_XSAVE2 208 +#define KVM_CAP_SYS_ATTRIBUTES 209 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 81ebf99d6ff0..0e4926bc9a58 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -85,6 +85,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_pi_mmio_test TEST_GEN_PROGS_x86_64 += x86_64/sev_migrate_tests TEST_GEN_PROGS_x86_64 += x86_64/amx_test +TEST_GEN_PROGS_x86_64 += access_tracking_perf_test TEST_GEN_PROGS_x86_64 += demand_paging_test TEST_GEN_PROGS_x86_64 += dirty_log_test TEST_GEN_PROGS_x86_64 += dirty_log_perf_test diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 66775de26952..4ed6aa049a91 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -345,7 +345,6 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, * guest_code - The vCPU's entry point */ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code); -void vm_xsave_req_perm(void); bool vm_is_unrestricted_guest(struct kvm_vm *vm); diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 423d8a61bd2e..8a470da7b71a 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -458,6 +458,7 @@ uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void); void vcpu_set_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid); struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid); +void vm_xsave_req_perm(int bit); enum x86_page_size { X86_PAGE_SIZE_4K = 0, diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 8c53f96ab7fe..d8cf851ab119 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -393,13 +393,6 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, struct kvm_vm *vm; int i; -#ifdef __x86_64__ - /* - * Permission needs to be requested before KVM_SET_CPUID2. - */ - vm_xsave_req_perm(); -#endif - /* Force slot0 memory size not small than DEFAULT_GUEST_PHY_PAGES */ if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES) slot0_mem_pages = DEFAULT_GUEST_PHY_PAGES; diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 5f9d7e91dc69..9f000dfb5594 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -665,16 +665,31 @@ static bool is_xfd_supported(void) return !!(eax & CPUID_XFD_BIT); } -void vm_xsave_req_perm(void) +void vm_xsave_req_perm(int bit) { - unsigned long bitmask; + int kvm_fd; + u64 bitmask; long rc; + struct kvm_device_attr attr = { + .group = 0, + .attr = KVM_X86_XCOMP_GUEST_SUPP, + .addr = (unsigned long) &bitmask + }; + + kvm_fd = open_kvm_dev_path_or_exit(); + rc = ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr); + close(kvm_fd); + if (rc == -1 && (errno == ENXIO || errno == EINVAL)) + exit(KSFT_SKIP); + TEST_ASSERT(rc == 0, "KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) error: %ld", rc); + if (!(bitmask & (1ULL << bit))) + exit(KSFT_SKIP); if (!is_xfd_supported()) - return; + exit(KSFT_SKIP); + + rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit); - rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, - XSTATE_XTILE_DATA_BIT); /* * The older kernel version(<5.15) can't support * ARCH_REQ_XCOMP_GUEST_PERM and directly return. @@ -684,7 +699,7 @@ void vm_xsave_req_perm(void) rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &bitmask); TEST_ASSERT(rc == 0, "prctl(ARCH_GET_XCOMP_GUEST_PERM) error: %ld", rc); - TEST_ASSERT(bitmask & XFEATURE_XTILE_MASK, + TEST_ASSERT(bitmask & (1ULL << bit), "prctl(ARCH_REQ_XCOMP_GUEST_PERM) failure bitmask=0x%lx", bitmask); } diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c index 523c1e99ed64..52a3ef6629e8 100644 --- a/tools/testing/selftests/kvm/x86_64/amx_test.c +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -329,6 +329,8 @@ int main(int argc, char *argv[]) u32 amx_offset; int stage, ret; + vm_xsave_req_perm(XSTATE_XTILE_DATA_BIT); + /* Create VM */ vm = vm_create_default(VCPU_ID, 0, guest_code); diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c index 2da8eb8e2d96..a626d40fdb48 100644 --- a/tools/testing/selftests/kvm/x86_64/smm_test.c +++ b/tools/testing/selftests/kvm/x86_64/smm_test.c @@ -105,7 +105,6 @@ static void guest_code(void *arg) if (cpu_has_svm()) { run_guest(svm->vmcb, svm->vmcb_gpa); - svm->vmcb->save.rip += 3; run_guest(svm->vmcb, svm->vmcb_gpa); } else { vmlaunch(); diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 2ad013b8bde9..59b1dd4a549e 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -463,8 +463,8 @@ bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin) idx = srcu_read_lock(&kvm->irq_srcu); gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); if (gsi != -1) - hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, - link) + hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list, + link, srcu_read_lock_held(&kvm->irq_srcu)) if (kian->gsi == gsi) { srcu_read_unlock(&kvm->irq_srcu, idx); return true; @@ -480,8 +480,8 @@ void kvm_notify_acked_gsi(struct kvm *kvm, int gsi) { struct kvm_irq_ack_notifier *kian; - hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, - link) + hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list, + link, srcu_read_lock_held(&kvm->irq_srcu)) if (kian->gsi == gsi) kian->irq_acked(kian); } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9a20f2299386..58d31da8a2f7 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2248,7 +2248,6 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn return NULL; } -EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot); bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) { @@ -2463,9 +2462,8 @@ static int kvm_try_get_pfn(kvm_pfn_t pfn) } static int hva_to_pfn_remapped(struct vm_area_struct *vma, - unsigned long addr, bool *async, - bool write_fault, bool *writable, - kvm_pfn_t *p_pfn) + unsigned long addr, bool write_fault, + bool *writable, kvm_pfn_t *p_pfn) { kvm_pfn_t pfn; pte_t *ptep; @@ -2575,7 +2573,7 @@ retry: if (vma == NULL) pfn = KVM_PFN_ERR_FAULT; else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { - r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn); + r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn); if (r == -EAGAIN) goto retry; if (r < 0) |