From 514946d1436341bae57f647ee633cef5edb19ee2 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Wed, 26 Jul 2023 16:59:43 +0300 Subject: KVM: x86: VMX: __kvm_apic_update_irr must update the IRR atomically If APICv is inhibited, then IPIs from peer vCPUs are done by atomically setting bits in IRR. This means, that when __kvm_apic_update_irr copies PIR to IRR, it has to modify IRR atomically as well. Signed-off-by: Maxim Levitsky Message-Id: <20230726135945.260841-2-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 113ca9661ab2..b17b37e4d4fc 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -637,16 +637,22 @@ bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr) *max_irr = -1; for (i = vec = 0; i <= 7; i++, vec += 32) { + u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10); + + irr_val = *p_irr; pir_val = READ_ONCE(pir[i]); - irr_val = *((u32 *)(regs + APIC_IRR + i * 0x10)); + if (pir_val) { + pir_val = xchg(&pir[i], 0); + prev_irr_val = irr_val; - irr_val |= xchg(&pir[i], 0); - *((u32 *)(regs + APIC_IRR + i * 0x10)) = irr_val; - if (prev_irr_val != irr_val) { - max_updated_irr = - __fls(irr_val ^ prev_irr_val) + vec; - } + do { + irr_val = prev_irr_val | pir_val; + } while (prev_irr_val != irr_val && + !try_cmpxchg(p_irr, &prev_irr_val, irr_val)); + + if (prev_irr_val != irr_val) + max_updated_irr = __fls(irr_val ^ prev_irr_val) + vec; } if (irr_val) *max_irr = __fls(irr_val) + vec; -- cgit v1.2.3-58-ga151 From cff540ebef38303a4cabf4b8bb49317be01b4b0f Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Wed, 26 Jul 2023 16:59:44 +0300 Subject: KVM: x86: VMX: set irr_pending in kvm_apic_update_irr When the APICv is inhibited, the irr_pending optimization is used. Therefore, when kvm_apic_update_irr sets bits in the IRR, it must set irr_pending to true as well. Signed-off-by: Maxim Levitsky Message-Id: <20230726135945.260841-3-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index b17b37e4d4fc..a983a16163b1 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -666,8 +666,11 @@ EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr) { struct kvm_lapic *apic = vcpu->arch.apic; + bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr); - return __kvm_apic_update_irr(pir, apic->regs, max_irr); + if (unlikely(!apic->apicv_active && irr_updated)) + apic->irr_pending = true; + return irr_updated; } EXPORT_SYMBOL_GPL(kvm_apic_update_irr); -- cgit v1.2.3-58-ga151 From bf672720e83cf04c007aa11c242229e70985135b Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Wed, 26 Jul 2023 16:59:45 +0300 Subject: KVM: x86: check the kvm_cpu_get_interrupt result before using it The code was blindly assuming that kvm_cpu_get_interrupt never returns -1 when there is a pending interrupt. While this should be true, a bug in KVM can still cause this. If -1 is returned, the code before this patch was converting it to 0xFF, and 0xFF interrupt was injected to the guest, which results in an issue which was hard to debug. Add WARN_ON_ONCE to catch this case and skip the injection if this happens again. Signed-off-by: Maxim Levitsky Message-Id: <20230726135945.260841-4-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a6b9bea62fb8..00b87fcf6da4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10203,9 +10203,13 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu, if (r < 0) goto out; if (r) { - kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); - static_call(kvm_x86_inject_irq)(vcpu, false); - WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0); + int irq = kvm_cpu_get_interrupt(vcpu); + + if (!WARN_ON_ONCE(irq == -1)) { + kvm_queue_interrupt(vcpu, irq, false); + static_call(kvm_x86_inject_irq)(vcpu, false); + WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0); + } } if (kvm_cpu_has_injectable_intr(vcpu)) static_call(kvm_x86_enable_irq_window)(vcpu); -- cgit v1.2.3-58-ga151 From fd1815ea709e414f83a06e4cf13ade4a49dd0fda Mon Sep 17 00:00:00 2001 From: Peng Hao Date: Fri, 28 Jul 2023 14:49:48 +0800 Subject: KVM: X86: Use GFP_KERNEL_ACCOUNT for pid_table in ipiv The pid_table of ipiv is the persistent memory allocated by per-vcpu, which should be counted into the memory cgroup. Signed-off-by: Peng Hao Message-Id: Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0ecf4be2c6af..3c5a13f054fc 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4651,7 +4651,8 @@ static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) if (kvm_vmx->pid_table) return 0; - pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, vmx_get_pid_table_order(kvm)); + pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, + vmx_get_pid_table_order(kvm)); if (!pages) return -ENOMEM; -- cgit v1.2.3-58-ga151 From 5e1fe4a21c0c2a69419d97d62d3213e8f843920d Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 24 Jul 2023 19:12:36 +0800 Subject: KVM: x86/irq: Conditionally register IRQ bypass consumer again As was attempted commit 14717e203186 ("kvm: Conditionally register IRQ bypass consumer"): "if we don't support a mechanism for bypassing IRQs, don't register as a consumer. Initially this applied to AMD processors, but when AVIC support was implemented for assigned devices, kvm_arch_has_irq_bypass() was always returning true. We can still skip registering the consumer where enable_apicv or posted-interrupts capability is unsupported or globally disabled. This eliminates meaningless dev_info()s when the connect fails between producer and consumer", such as on Linux hosts where enable_apicv or posted-interrupts capability is unsupported or globally disabled. Cc: Alex Williamson Reported-by: Yong He Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217379 Signed-off-by: Like Xu Message-Id: <20230724111236.76570-1-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 00b87fcf6da4..43e0c51da65e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13189,7 +13189,7 @@ EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); bool kvm_arch_has_irq_bypass(void) { - return true; + return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP); } int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, -- cgit v1.2.3-58-ga151 From c20d403fd04c20959b7e6669868372f433947e5f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Jul 2023 16:56:36 -0700 Subject: KVM: VMX: Make VMREAD error path play nice with noinstr Mark vmread_error_trampoline() as noinstr, and add a second trampoline for the CONFIG_CC_HAS_ASM_GOTO_OUTPUT=n case to enable instrumentation when handling VM-Fail on VMREAD. VMREAD is used in various noinstr flows, e.g. immediately after VM-Exit, and objtool rightly complains that the call to the error trampoline leaves a no-instrumentation section without annotating that it's safe to do so. vmlinux.o: warning: objtool: vmx_vcpu_enter_exit+0xc9: call to vmread_error_trampoline() leaves .noinstr.text section Note, strictly speaking, enabling instrumentation in the VM-Fail path isn't exactly safe, but if VMREAD fails the kernel/system is likely hosed anyways, and logging that there is a fatal error is more important than *maybe* encountering slightly unsafe instrumentation. Reported-by: Su Hui Signed-off-by: Sean Christopherson Message-Id: <20230721235637.2345403-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmenter.S | 8 ++++---- arch/x86/kvm/vmx/vmx.c | 18 ++++++++++++++---- arch/x86/kvm/vmx/vmx_ops.h | 9 ++++++++- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 07e927d4d099..be275a0410a8 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -303,10 +303,8 @@ SYM_FUNC_START(vmx_do_nmi_irqoff) VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx SYM_FUNC_END(vmx_do_nmi_irqoff) - -.section .text, "ax" - #ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT + /** * vmread_error_trampoline - Trampoline from inline asm to vmread_error() * @field: VMCS field encoding that failed @@ -335,7 +333,7 @@ SYM_FUNC_START(vmread_error_trampoline) mov 3*WORD_SIZE(%_ASM_BP), %_ASM_ARG2 mov 2*WORD_SIZE(%_ASM_BP), %_ASM_ARG1 - call vmread_error + call vmread_error_trampoline2 /* Zero out @fault, which will be popped into the result register. */ _ASM_MOV $0, 3*WORD_SIZE(%_ASM_BP) @@ -357,6 +355,8 @@ SYM_FUNC_START(vmread_error_trampoline) SYM_FUNC_END(vmread_error_trampoline) #endif +.section .text, "ax" + SYM_FUNC_START(vmx_do_interrupt_irqoff) VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1 SYM_FUNC_END(vmx_do_interrupt_irqoff) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3c5a13f054fc..0dd4612b0ca2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -441,13 +441,23 @@ do { \ pr_warn_ratelimited(fmt); \ } while (0) -void vmread_error(unsigned long field, bool fault) +noinline void vmread_error(unsigned long field) { - if (fault) + vmx_insn_failed("vmread failed: field=%lx\n", field); +} + +#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT +noinstr void vmread_error_trampoline2(unsigned long field, bool fault) +{ + if (fault) { kvm_spurious_fault(); - else - vmx_insn_failed("vmread failed: field=%lx\n", field); + } else { + instrumentation_begin(); + vmread_error(field); + instrumentation_end(); + } } +#endif noinline void vmwrite_error(unsigned long field, unsigned long value) { diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index ce47dc265f89..5fa74779a37a 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -10,7 +10,7 @@ #include "vmcs.h" #include "../x86.h" -void vmread_error(unsigned long field, bool fault); +void vmread_error(unsigned long field); void vmwrite_error(unsigned long field, unsigned long value); void vmclear_error(struct vmcs *vmcs, u64 phys_addr); void vmptrld_error(struct vmcs *vmcs, u64 phys_addr); @@ -31,6 +31,13 @@ void invept_error(unsigned long ext, u64 eptp, gpa_t gpa); * void vmread_error_trampoline(unsigned long field, bool fault); */ extern unsigned long vmread_error_trampoline; + +/* + * The second VMREAD error trampoline, called from the assembly trampoline, + * exists primarily to enable instrumentation for the VM-Fail path. + */ +void vmread_error_trampoline2(unsigned long field, bool fault); + #endif static __always_inline void vmcs_check16(unsigned long field) -- cgit v1.2.3-58-ga151 From a062dad7ec4b91473e2e82c4e03cbee953189a87 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Jul 2023 16:56:37 -0700 Subject: KVM: VMX: Use vmread_error() to report VM-Fail in "goto" path Use vmread_error() to report VM-Fail on VMREAD for the "asm goto" case, now that trampoline case has yet another wrapper around vmread_error() to play nice with instrumentation. Signed-off-by: Sean Christopherson Message-Id: <20230721235637.2345403-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx_ops.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 5fa74779a37a..33af7b4c6eb4 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -108,8 +108,7 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) do_fail: instrumentation_begin(); - WARN_ONCE(1, KBUILD_MODNAME ": vmread failed: field=%lx\n", field); - pr_warn_ratelimited(KBUILD_MODNAME ": vmread failed: field=%lx\n", field); + vmread_error(field); instrumentation_end(); return 0; -- cgit v1.2.3-58-ga151 From 3f2739bd1e0b7e9669333852b4e618294d5a1e54 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Jul 2023 15:43:36 -0700 Subject: KVM: x86: Acquire SRCU read lock when handling fastpath MSR writes Temporarily acquire kvm->srcu for read when potentially emulating WRMSR in the VM-Exit fastpath handler, as several of the common helpers used during emulation expect the caller to provide SRCU protection. E.g. if the guest is counting instructions retired, KVM will query the PMU event filter when stepping over the WRMSR. dump_stack+0x85/0xdf lockdep_rcu_suspicious+0x109/0x120 pmc_event_is_allowed+0x165/0x170 kvm_pmu_trigger_event+0xa5/0x190 handle_fastpath_set_msr_irqoff+0xca/0x1e0 svm_vcpu_run+0x5c3/0x7b0 [kvm_amd] vcpu_enter_guest+0x2108/0x2580 Alternatively, check_pmu_event_filter() could acquire kvm->srcu, but this isn't the first bug of this nature, e.g. see commit 5c30e8101e8d ("KVM: SVM: Skip WRMSR fastpath on VM-Exit if next RIP isn't valid"). Providing protection for the entirety of WRMSR emulation will allow reverting the aforementioned commit, and will avoid having to play whack-a-mole when new uses of SRCU-protected structures are inevitably added in common emulation helpers. Fixes: dfdeda67ea2d ("KVM: x86/pmu: Prevent the PMU from counting disallowed events") Reported-by: Greg Thelen Reported-by: Aaron Lewis Signed-off-by: Sean Christopherson Message-Id: <20230721224337.2335137-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 43e0c51da65e..27080bd986b5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2172,6 +2172,8 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) u64 data; fastpath_t ret = EXIT_FASTPATH_NONE; + kvm_vcpu_srcu_read_lock(vcpu); + switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): data = kvm_read_edx_eax(vcpu); @@ -2194,6 +2196,8 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) if (ret != EXIT_FASTPATH_NONE) trace_kvm_msr_write(msr, data); + kvm_vcpu_srcu_read_unlock(vcpu); + return ret; } EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); -- cgit v1.2.3-58-ga151 From b439eb8ab578557263815ba8581d02c1b730e348 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Jul 2023 15:43:37 -0700 Subject: Revert "KVM: SVM: Skip WRMSR fastpath on VM-Exit if next RIP isn't valid" Now that handle_fastpath_set_msr_irqoff() acquires kvm->srcu, i.e. allows dereferencing memslots during WRMSR emulation, drop the requirement that "next RIP" is valid. In hindsight, acquiring kvm->srcu would have been a better fix than avoiding the pastpath, but at the time it was thought that accessing SRCU-protected data in the fastpath was a one-off edge case. This reverts commit 5c30e8101e8d5d020b1d7119117889756a6ed713. Signed-off-by: Sean Christopherson Message-Id: <20230721224337.2335137-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d381ad424554..cea08e5fa69b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3986,14 +3986,8 @@ static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { - struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; - - /* - * Note, the next RIP must be provided as SRCU isn't held, i.e. KVM - * can't read guest memory (dereference memslots) to decode the WRMSR. - */ - if (control->exit_code == SVM_EXIT_MSR && control->exit_info_1 && - nrips && control->next_rip) + if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && + to_svm(vcpu)->vmcb->control.exit_info_1) return handle_fastpath_set_msr_irqoff(vcpu); return EXIT_FASTPATH_NONE; -- cgit v1.2.3-58-ga151 From 3bcbc20942db5d738221cca31a928efc09827069 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Jul 2023 15:33:52 -0700 Subject: selftests/rseq: Play nice with binaries statically linked against glibc 2.35+ To allow running rseq and KVM's rseq selftests as statically linked binaries, initialize the various "trampoline" pointers to point directly at the expect glibc symbols, and skip the dlysm() lookups if the rseq size is non-zero, i.e. the binary is statically linked *and* the libc registered its own rseq. Define weak versions of the symbols so as not to break linking against libc versions that don't support rseq in any capacity. The KVM selftests in particular are often statically linked so that they can be run on targets with very limited runtime environments, i.e. test machines. Fixes: 233e667e1ae3 ("selftests/rseq: Uplift rseq selftests for compatibility with glibc-2.35") Cc: Aaron Lewis Cc: kvm@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20230721223352.2333911-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/rseq/rseq.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c index 4e4aa006004c..a723da253244 100644 --- a/tools/testing/selftests/rseq/rseq.c +++ b/tools/testing/selftests/rseq/rseq.c @@ -34,9 +34,17 @@ #include "../kselftest.h" #include "rseq.h" -static const ptrdiff_t *libc_rseq_offset_p; -static const unsigned int *libc_rseq_size_p; -static const unsigned int *libc_rseq_flags_p; +/* + * Define weak versions to play nice with binaries that are statically linked + * against a libc that doesn't support registering its own rseq. + */ +__weak ptrdiff_t __rseq_offset; +__weak unsigned int __rseq_size; +__weak unsigned int __rseq_flags; + +static const ptrdiff_t *libc_rseq_offset_p = &__rseq_offset; +static const unsigned int *libc_rseq_size_p = &__rseq_size; +static const unsigned int *libc_rseq_flags_p = &__rseq_flags; /* Offset from the thread pointer to the rseq area. */ ptrdiff_t rseq_offset; @@ -155,9 +163,17 @@ unsigned int get_rseq_feature_size(void) static __attribute__((constructor)) void rseq_init(void) { - libc_rseq_offset_p = dlsym(RTLD_NEXT, "__rseq_offset"); - libc_rseq_size_p = dlsym(RTLD_NEXT, "__rseq_size"); - libc_rseq_flags_p = dlsym(RTLD_NEXT, "__rseq_flags"); + /* + * If the libc's registered rseq size isn't already valid, it may be + * because the binary is dynamically linked and not necessarily due to + * libc not having registered a restartable sequence. Try to find the + * symbols if that's the case. + */ + if (!*libc_rseq_size_p) { + libc_rseq_offset_p = dlsym(RTLD_NEXT, "__rseq_offset"); + libc_rseq_size_p = dlsym(RTLD_NEXT, "__rseq_size"); + libc_rseq_flags_p = dlsym(RTLD_NEXT, "__rseq_flags"); + } if (libc_rseq_size_p && libc_rseq_offset_p && libc_rseq_flags_p && *libc_rseq_size_p != 0) { /* rseq registration owned by glibc */ -- cgit v1.2.3-58-ga151 From eed3013faa401aae662398709410a59bb0646e32 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 11 Jul 2023 16:01:25 -0700 Subject: KVM: Grab a reference to KVM for VM and vCPU stats file descriptors Grab a reference to KVM prior to installing VM and vCPU stats file descriptors to ensure the underlying VM and vCPU objects are not freed until the last reference to any and all stats fds are dropped. Note, the stats paths manually invoke fd_install() and so don't need to grab a reference before creating the file. Fixes: ce55c049459c ("KVM: stats: Support binary stats retrieval for a VCPU") Fixes: fcfe1baeddbf ("KVM: stats: Support binary stats retrieval for a VM") Reported-by: Zheng Zhang Closes: https://lore.kernel.org/all/CAC_GQSr3xzZaeZt85k_RCBd5kfiOve8qXo7a81Cq53LuVQ5r=Q@mail.gmail.com Cc: stable@vger.kernel.org Cc: Kees Cook Signed-off-by: Sean Christopherson Reviewed-by: Kees Cook Message-Id: <20230711230131.648752-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index dfbaafbe3a00..5bbb5612b207 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4035,8 +4035,17 @@ static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer, sizeof(vcpu->stat), user_buffer, size, offset); } +static int kvm_vcpu_stats_release(struct inode *inode, struct file *file) +{ + struct kvm_vcpu *vcpu = file->private_data; + + kvm_put_kvm(vcpu->kvm); + return 0; +} + static const struct file_operations kvm_vcpu_stats_fops = { .read = kvm_vcpu_stats_read, + .release = kvm_vcpu_stats_release, .llseek = noop_llseek, }; @@ -4057,6 +4066,9 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu) put_unused_fd(fd); return PTR_ERR(file); } + + kvm_get_kvm(vcpu->kvm); + file->f_mode |= FMODE_PREAD; fd_install(fd, file); @@ -4701,8 +4713,17 @@ static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer, sizeof(kvm->stat), user_buffer, size, offset); } +static int kvm_vm_stats_release(struct inode *inode, struct file *file) +{ + struct kvm *kvm = file->private_data; + + kvm_put_kvm(kvm); + return 0; +} + static const struct file_operations kvm_vm_stats_fops = { .read = kvm_vm_stats_read, + .release = kvm_vm_stats_release, .llseek = noop_llseek, }; @@ -4721,6 +4742,9 @@ static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm) put_unused_fd(fd); return PTR_ERR(file); } + + kvm_get_kvm(kvm); + file->f_mode |= FMODE_PREAD; fd_install(fd, file); -- cgit v1.2.3-58-ga151 From 625646aede90554fed8d46fd0e081238e071ac5e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 11 Jul 2023 16:01:26 -0700 Subject: KVM: selftests: Use pread() to read binary stats header Use pread() with an explicit offset when reading the header and the header name for a binary stats fd so that the common helper and the binary stats test don't subtly rely on the file effectively being untouched, e.g. to allow multiple reads of the header, name, etc. Signed-off-by: Sean Christopherson Message-Id: <20230711230131.648752-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/kvm_util_base.h | 6 ++++-- tools/testing/selftests/kvm/kvm_binary_stats_test.c | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 07732a157ccd..eb1ff597bcca 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -362,8 +362,10 @@ static inline void read_stats_header(int stats_fd, struct kvm_stats_header *head { ssize_t ret; - ret = read(stats_fd, header, sizeof(*header)); - TEST_ASSERT(ret == sizeof(*header), "Read stats header"); + ret = pread(stats_fd, header, sizeof(*header), 0); + TEST_ASSERT(ret == sizeof(*header), + "Failed to read '%lu' header bytes, ret = '%ld'", + sizeof(*header), ret); } struct kvm_stats_desc *read_stats_descriptors(int stats_fd, diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index a7001e29dc06..eae99d0e8377 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -43,8 +43,10 @@ static void stats_test(int stats_fd) id = malloc(header.name_size); TEST_ASSERT(id, "Allocate memory for id string"); - ret = read(stats_fd, id, header.name_size); - TEST_ASSERT(ret == header.name_size, "Read id string"); + ret = pread(stats_fd, id, header.name_size, sizeof(header)); + TEST_ASSERT(ret == header.name_size, + "Expected header size '%u', read '%lu' bytes", + header.name_size, ret); /* Check id string, that should start with "kvm" */ TEST_ASSERT(!strncmp(id, "kvm", 3) && strlen(id) < header.name_size, -- cgit v1.2.3-58-ga151 From 87d53582bc8be30a11654a45fac0a257ed830ea8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 11 Jul 2023 16:01:27 -0700 Subject: KVM: selftests: Clean up stats fd in common stats_test() helper Move the stats fd cleanup code into stats_test() and drop the superfluous vm_stats_test() and vcpu_stats_test() helpers in order to decouple creation of the stats file from consuming/testing the file (deduping code is a bonus). This will make it easier to test various edge cases related to stats, e.g. that userspace can dup() a stats fd, that userspace can have multiple stats files for a singleVM/vCPU, etc. Signed-off-by: Sean Christopherson Message-Id: <20230711230131.648752-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/kvm_binary_stats_test.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index eae99d0e8377..f02663711c90 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -167,23 +167,7 @@ static void stats_test(int stats_fd) free(stats_data); free(stats_desc); free(id); -} - - -static void vm_stats_test(struct kvm_vm *vm) -{ - int stats_fd = vm_get_stats_fd(vm); - - stats_test(stats_fd); - close(stats_fd); - TEST_ASSERT(fcntl(stats_fd, F_GETFD) == -1, "Stats fd not freed"); -} - -static void vcpu_stats_test(struct kvm_vcpu *vcpu) -{ - int stats_fd = vcpu_get_stats_fd(vcpu); - stats_test(stats_fd); close(stats_fd); TEST_ASSERT(fcntl(stats_fd, F_GETFD) == -1, "Stats fd not freed"); } @@ -241,9 +225,11 @@ int main(int argc, char *argv[]) /* Check stats read for every VM and VCPU */ for (i = 0; i < max_vm; ++i) { - vm_stats_test(vms[i]); + stats_test(vm_get_stats_fd(vms[i])); + for (j = 0; j < max_vcpu; ++j) - vcpu_stats_test(vcpus[i * max_vcpu + j]); + stats_test(vcpu_get_stats_fd(vcpus[i * max_vcpu + j])); + ksft_test_result_pass("vm%i\n", i); } -- cgit v1.2.3-58-ga151 From 33b02704071b0972a62ec32516847f91cab6cb8f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 11 Jul 2023 16:01:28 -0700 Subject: KVM: selftests: Explicitly free vcpus array in binary stats test Explicitly free the all-encompassing vcpus array in the binary stats test so that the test is consistent with respect to freeing all dynamically allocated resources (versus letting them be freed on exit). Signed-off-by: Sean Christopherson Message-Id: <20230711230131.648752-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/kvm_binary_stats_test.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index f02663711c90..874fa5092551 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -236,6 +236,7 @@ int main(int argc, char *argv[]) for (i = 0; i < max_vm; ++i) kvm_vm_free(vms[i]); free(vms); + free(vcpus); ksft_finished(); /* Print results and exit() accordingly */ } -- cgit v1.2.3-58-ga151 From 47d1be8a78fb587fe5a8f0873244476700171403 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 11 Jul 2023 16:01:29 -0700 Subject: KVM: selftests: Verify userspace can create "redundant" binary stats files Verify that KVM doesn't artificially limit KVM_GET_STATS_FD to a single file per VM/vCPU. There's no known use case for getting multiple stats fds, but it should work, and more importantly creating multiple files will make it easier to test that KVM correct manages VM refcounts for stats files. Signed-off-by: Sean Christopherson Message-Id: <20230711230131.648752-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/kvm_binary_stats_test.c | 25 ++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index 874fa5092551..653f10d8fb7c 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -185,6 +185,7 @@ static void stats_test(int stats_fd) int main(int argc, char *argv[]) { + int vm_stats_fds, *vcpu_stats_fds; int i, j; struct kvm_vcpu **vcpus; struct kvm_vm **vms; @@ -217,18 +218,37 @@ int main(int argc, char *argv[]) vcpus = malloc(sizeof(struct kvm_vcpu *) * max_vm * max_vcpu); TEST_ASSERT(vcpus, "Allocate memory for storing vCPU pointers"); + /* + * Not per-VM as the array is populated, used, and invalidated within a + * single for-loop iteration. + */ + vcpu_stats_fds = calloc(max_vm, sizeof(*vcpu_stats_fds)); + TEST_ASSERT(vcpu_stats_fds, "Allocate memory for VM stats fds"); + for (i = 0; i < max_vm; ++i) { vms[i] = vm_create_barebones(); for (j = 0; j < max_vcpu; ++j) vcpus[i * max_vcpu + j] = __vm_vcpu_add(vms[i], j); } - /* Check stats read for every VM and VCPU */ + /* + * Check stats read for every VM and vCPU, with a variety of testcases. + * Note, stats_test() closes the passed in stats fd. + */ for (i = 0; i < max_vm; ++i) { + vm_stats_fds = vm_get_stats_fd(vms[i]); + + /* Verify userspace can instantiate multiple stats files. */ stats_test(vm_get_stats_fd(vms[i])); - for (j = 0; j < max_vcpu; ++j) + for (j = 0; j < max_vcpu; ++j) { + vcpu_stats_fds[j] = vcpu_get_stats_fd(vcpus[i * max_vcpu + j]); stats_test(vcpu_get_stats_fd(vcpus[i * max_vcpu + j])); + } + + stats_test(vm_stats_fds); + for (j = 0; j < max_vcpu; ++j) + stats_test(vcpu_stats_fds[j]); ksft_test_result_pass("vm%i\n", i); } @@ -237,6 +257,7 @@ int main(int argc, char *argv[]) kvm_vm_free(vms[i]); free(vms); free(vcpus); + free(vcpu_stats_fds); ksft_finished(); /* Print results and exit() accordingly */ } -- cgit v1.2.3-58-ga151 From 65f1f57f35e5e833c879a7afb9c862c603695917 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 11 Jul 2023 16:01:30 -0700 Subject: KVM: selftests: Verify stats fd can be dup()'d and read Expand the binary stats test to verify that a stats fd can be dup()'d and read, to (very) roughly simulate userspace passing around the file. Adding the dup() test is primarily an intermediate step towards verifying that userspace can read VM/vCPU stats before _and_ after userspace closes its copy of the VM fd; the dup() test itself is only mildly interesting. Signed-off-by: Sean Christopherson Message-Id: <20230711230131.648752-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/kvm_binary_stats_test.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index 653f10d8fb7c..5317e27b77d0 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -232,17 +232,23 @@ int main(int argc, char *argv[]) } /* - * Check stats read for every VM and vCPU, with a variety of testcases. + * Check stats read for every VM and vCPU, with a variety of flavors. * Note, stats_test() closes the passed in stats fd. */ for (i = 0; i < max_vm; ++i) { + /* + * Verify that creating multiple userspace references to a + * single stats file works and doesn't cause explosions. + */ vm_stats_fds = vm_get_stats_fd(vms[i]); + stats_test(dup(vm_stats_fds)); /* Verify userspace can instantiate multiple stats files. */ stats_test(vm_get_stats_fd(vms[i])); for (j = 0; j < max_vcpu; ++j) { vcpu_stats_fds[j] = vcpu_get_stats_fd(vcpus[i * max_vcpu + j]); + stats_test(dup(vcpu_stats_fds[j])); stats_test(vcpu_get_stats_fd(vcpus[i * max_vcpu + j])); } -- cgit v1.2.3-58-ga151 From 211c0189ea18648a0cf23dea9f4ed745bc9252f6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 11 Jul 2023 16:01:31 -0700 Subject: KVM: selftests: Verify stats fd is usable after VM fd has been closed Verify that VM and vCPU binary stats files are usable even after userspace has put its last direct reference to the VM. This is a regression test for a UAF bug where KVM didn't gift the stats files a reference to the VM. Signed-off-by: Sean Christopherson Message-Id: <20230711230131.648752-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/kvm_binary_stats_test.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index 5317e27b77d0..698c1cfa3111 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -252,6 +252,14 @@ int main(int argc, char *argv[]) stats_test(vcpu_get_stats_fd(vcpus[i * max_vcpu + j])); } + /* + * Close the VM fd and redo the stats tests. KVM should gift a + * reference (to the VM) to each stats fd, i.e. stats should + * still be accessible even after userspace has put its last + * _direct_ reference to the VM. + */ + kvm_vm_free(vms[i]); + stats_test(vm_stats_fds); for (j = 0; j < max_vcpu; ++j) stats_test(vcpu_stats_fds[j]); @@ -259,8 +267,6 @@ int main(int argc, char *argv[]) ksft_test_result_pass("vm%i\n", i); } - for (i = 0; i < max_vm; ++i) - kvm_vm_free(vms[i]); free(vms); free(vcpus); free(vcpu_stats_fds); -- cgit v1.2.3-58-ga151 From 880218361c10d6dd7f99423d621112b8770fc32f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 26 Jul 2023 13:29:20 -0700 Subject: Revert "debugfs, coccinelle: check for obsolete DEFINE_SIMPLE_ATTRIBUTE() usage" Remove coccinelle's recommendation to use DEFINE_DEBUGFS_ATTRIBUTE() instead of DEFINE_SIMPLE_ATTRIBUTE(). Regardless of whether or not the "significant overhead" incurred by debugfs_create_file() is actually meaningful, warnings from the script have led to a rash of low-quality patches that have sowed confusion and consumed maintainer time for little to no benefit. There have been no less than four attempts to "fix" KVM, and a quick search on lore shows that KVM is not alone. This reverts commit 5103068eaca290f890a30aae70085fac44cecaf6. Link: https://lore.kernel.org/all/87tu2nbnz3.fsf@mpe.ellerman.id.au Link: https://lore.kernel.org/all/c0b98151-16b6-6d8f-1765-0f7d46682d60@redhat.com Link: https://lkml.kernel.org/r/20230706072954.4881-1-duminjie%40vivo.com Link: https://lore.kernel.org/all/Y2FsbufV00jbyF0B@google.com Link: https://lore.kernel.org/all/Y2ENJJ1YiSg5oHiy@orome Link: https://lore.kernel.org/all/7560b350e7b23786ce712118a9a504356ff1cca4.camel@kernel.org Suggested-by: Paolo Bonzini Acked-by: Greg Kroah-Hartman Signed-off-by: Sean Christopherson Message-Id: <20230726202920.507756-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- .../api/debugfs/debugfs_simple_attr.cocci | 68 ---------------------- 1 file changed, 68 deletions(-) delete mode 100644 scripts/coccinelle/api/debugfs/debugfs_simple_attr.cocci diff --git a/scripts/coccinelle/api/debugfs/debugfs_simple_attr.cocci b/scripts/coccinelle/api/debugfs/debugfs_simple_attr.cocci deleted file mode 100644 index 7c312310547c..000000000000 --- a/scripts/coccinelle/api/debugfs/debugfs_simple_attr.cocci +++ /dev/null @@ -1,68 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/// Use DEFINE_DEBUGFS_ATTRIBUTE rather than DEFINE_SIMPLE_ATTRIBUTE -/// for debugfs files. -/// -//# Rationale: DEFINE_SIMPLE_ATTRIBUTE + debugfs_create_file() -//# imposes some significant overhead as compared to -//# DEFINE_DEBUGFS_ATTRIBUTE + debugfs_create_file_unsafe(). -// -// Copyright (C): 2016 Nicolai Stange -// Options: --no-includes -// - -virtual context -virtual patch -virtual org -virtual report - -@dsa@ -declarer name DEFINE_SIMPLE_ATTRIBUTE; -identifier dsa_fops; -expression dsa_get, dsa_set, dsa_fmt; -position p; -@@ -DEFINE_SIMPLE_ATTRIBUTE@p(dsa_fops, dsa_get, dsa_set, dsa_fmt); - -@dcf@ -expression name, mode, parent, data; -identifier dsa.dsa_fops; -@@ -debugfs_create_file(name, mode, parent, data, &dsa_fops) - - -@context_dsa depends on context && dcf@ -declarer name DEFINE_DEBUGFS_ATTRIBUTE; -identifier dsa.dsa_fops; -expression dsa.dsa_get, dsa.dsa_set, dsa.dsa_fmt; -@@ -* DEFINE_SIMPLE_ATTRIBUTE(dsa_fops, dsa_get, dsa_set, dsa_fmt); - - -@patch_dcf depends on patch expression@ -expression name, mode, parent, data; -identifier dsa.dsa_fops; -@@ -- debugfs_create_file(name, mode, parent, data, &dsa_fops) -+ debugfs_create_file_unsafe(name, mode, parent, data, &dsa_fops) - -@patch_dsa depends on patch_dcf && patch@ -identifier dsa.dsa_fops; -expression dsa.dsa_get, dsa.dsa_set, dsa.dsa_fmt; -@@ -- DEFINE_SIMPLE_ATTRIBUTE(dsa_fops, dsa_get, dsa_set, dsa_fmt); -+ DEFINE_DEBUGFS_ATTRIBUTE(dsa_fops, dsa_get, dsa_set, dsa_fmt); - - -@script:python depends on org && dcf@ -fops << dsa.dsa_fops; -p << dsa.p; -@@ -msg="%s should be defined with DEFINE_DEBUGFS_ATTRIBUTE" % (fops) -coccilib.org.print_todo(p[0], msg) - -@script:python depends on report && dcf@ -fops << dsa.dsa_fops; -p << dsa.p; -@@ -msg="WARNING: %s should be defined with DEFINE_DEBUGFS_ATTRIBUTE" % (fops) -coccilib.report.print_report(p[0], msg) -- cgit v1.2.3-58-ga151 From 26a0652cb453c72f6aab0974bc4939e9b14f886b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jun 2023 13:30:35 -0700 Subject: KVM: x86: Disallow KVM_SET_SREGS{2} if incoming CR0 is invalid Reject KVM_SET_SREGS{2} with -EINVAL if the incoming CR0 is invalid, e.g. due to setting bits 63:32, illegal combinations, or to a value that isn't allowed in VMX (non-)root mode. The VMX checks in particular are "fun" as failure to disallow Real Mode for an L2 that is configured with unrestricted guest disabled, when KVM itself has unrestricted guest enabled, will result in KVM forcing VM86 mode to virtual Real Mode for L2, but then fail to unwind the related metadata when synthesizing a nested VM-Exit back to L1 (which has unrestricted guest enabled). Opportunistically fix a benign typo in the prototype for is_valid_cr4(). Cc: stable@vger.kernel.org Reported-by: syzbot+5feef0b9ee9c8e9e5689@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/000000000000f316b705fdf6e2b4@google.com Signed-off-by: Sean Christopherson Message-Id: <20230613203037.1968489-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 1 + arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/svm/svm.c | 6 ++++++ arch/x86/kvm/vmx/vmx.c | 28 +++++++++++++++++++++------- arch/x86/kvm/x86.c | 34 ++++++++++++++++++++++------------ 5 files changed, 52 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 13bc212cd4bc..e3054e3e46d5 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -37,6 +37,7 @@ KVM_X86_OP(get_segment) KVM_X86_OP(get_cpl) KVM_X86_OP(set_segment) KVM_X86_OP(get_cs_db_l_bits) +KVM_X86_OP(is_valid_cr0) KVM_X86_OP(set_cr0) KVM_X86_OP_OPTIONAL(post_set_cr3) KVM_X86_OP(is_valid_cr4) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 28bd38303d70..3bc146dfd38d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1566,9 +1566,10 @@ struct kvm_x86_ops { void (*set_segment)(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); + bool (*is_valid_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); void (*post_set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); - bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr0); + bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index cea08e5fa69b..956726d867aa 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1786,6 +1786,11 @@ static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) } } +static bool svm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + return true; +} + void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4809,6 +4814,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_segment = svm_set_segment, .get_cpl = svm_get_cpl, .get_cs_db_l_bits = svm_get_cs_db_l_bits, + .is_valid_cr0 = svm_is_valid_cr0, .set_cr0 = svm_set_cr0, .post_set_cr3 = sev_post_set_cr3, .is_valid_cr4 = svm_is_valid_cr4, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0dd4612b0ca2..3d011d62d969 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3047,6 +3047,15 @@ static void enter_rmode(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); + /* + * KVM should never use VM86 to virtualize Real Mode when L2 is active, + * as using VM86 is unnecessary if unrestricted guest is enabled, and + * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0 + * should VM-Fail and KVM should reject userspace attempts to stuff + * CR0.PG=0 when L2 is active. + */ + WARN_ON_ONCE(is_guest_mode(vcpu)); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); @@ -3236,6 +3245,17 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ CPU_BASED_CR3_STORE_EXITING) +static bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + if (is_guest_mode(vcpu)) + return nested_guest_cr0_valid(vcpu, cr0); + + if (to_vmx(vcpu)->nested.vmxon) + return nested_host_cr0_valid(vcpu, cr0); + + return true; +} + void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5375,18 +5395,11 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) val = (val & ~vmcs12->cr0_guest_host_mask) | (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); - if (!nested_guest_cr0_valid(vcpu, val)) - return 1; - if (kvm_set_cr0(vcpu, val)) return 1; vmcs_writel(CR0_READ_SHADOW, orig_val); return 0; } else { - if (to_vmx(vcpu)->nested.vmxon && - !nested_host_cr0_valid(vcpu, val)) - return 1; - return kvm_set_cr0(vcpu, val); } } @@ -8214,6 +8227,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .set_segment = vmx_set_segment, .get_cpl = vmx_get_cpl, .get_cs_db_l_bits = vmx_get_cs_db_l_bits, + .is_valid_cr0 = vmx_is_valid_cr0, .set_cr0 = vmx_set_cr0, .is_valid_cr4 = vmx_is_valid_cr4, .set_cr4 = vmx_set_cr4, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 27080bd986b5..278dbd37dab2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -906,6 +906,22 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) } EXPORT_SYMBOL_GPL(load_pdptrs); +static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ +#ifdef CONFIG_X86_64 + if (cr0 & 0xffffffff00000000UL) + return false; +#endif + + if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) + return false; + + if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) + return false; + + return static_call(kvm_x86_is_valid_cr0)(vcpu, cr0); +} + void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0) { /* @@ -952,20 +968,13 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { unsigned long old_cr0 = kvm_read_cr0(vcpu); - cr0 |= X86_CR0_ET; - -#ifdef CONFIG_X86_64 - if (cr0 & 0xffffffff00000000UL) + if (!kvm_is_valid_cr0(vcpu, cr0)) return 1; -#endif - - cr0 &= ~CR0_RESERVED_BITS; - if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) - return 1; + cr0 |= X86_CR0_ET; - if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) - return 1; + /* Write to CR0 reserved bits are ignored, even on Intel. */ + cr0 &= ~CR0_RESERVED_BITS; #ifdef CONFIG_X86_64 if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) && @@ -11468,7 +11477,8 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) return false; } - return kvm_is_valid_cr4(vcpu, sregs->cr4); + return kvm_is_valid_cr4(vcpu, sregs->cr4) && + kvm_is_valid_cr0(vcpu, sregs->cr0); } static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, -- cgit v1.2.3-58-ga151 From c4abd7352023aa96114915a0bb2b88016a425cda Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jun 2023 13:30:36 -0700 Subject: KVM: VMX: Don't fudge CR0 and CR4 for restricted L2 guest Stuff CR0 and/or CR4 to be compliant with a restricted guest if and only if KVM itself is not configured to utilize unrestricted guests, i.e. don't stuff CR0/CR4 for a restricted L2 that is running as the guest of an unrestricted L1. Any attempt to VM-Enter a restricted guest with invalid CR0/CR4 values should fail, i.e. in a nested scenario, KVM (as L0) should never observe a restricted L2 with incompatible CR0/CR4, since nested VM-Enter from L1 should have failed. And if KVM does observe an active, restricted L2 with incompatible state, e.g. due to a KVM bug, fudging CR0/CR4 instead of letting VM-Enter fail does more harm than good, as KVM will often neglect to undo the side effects, e.g. won't clear rmode.vm86_active on nested VM-Exit, and thus the damage can easily spill over to L1. On the other hand, letting VM-Enter fail due to bad guest state is more likely to contain the damage to L2 as KVM relies on hardware to perform most guest state consistency checks, i.e. KVM needs to be able to reflect a failed nested VM-Enter into L1 irrespective of (un)restricted guest behavior. Cc: Jim Mattson Cc: stable@vger.kernel.org Fixes: bddd82d19e2e ("KVM: nVMX: KVM needs to unset "unrestricted guest" VM-execution control in vmcs02 if vmcs12 doesn't set it") Signed-off-by: Sean Christopherson Message-Id: <20230613203037.1968489-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3d011d62d969..df461f387e20 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1513,6 +1513,11 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long old_rflags; + /* + * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU + * is an unrestricted guest in order to mark L2 as needing emulation + * if L1 runs L2 as a restricted guest. + */ if (is_unrestricted_guest(vcpu)) { kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); vmx->rflags = rflags; @@ -3265,7 +3270,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG); hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); - if (is_unrestricted_guest(vcpu)) + if (enable_unrestricted_guest) hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; else { hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; @@ -3293,7 +3298,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } #endif - if (enable_ept && !is_unrestricted_guest(vcpu)) { + if (enable_ept && !enable_unrestricted_guest) { /* * Ensure KVM has an up-to-date snapshot of the guest's CR3. If * the below code _enables_ CR3 exiting, vmx_cache_reg() will @@ -3424,7 +3429,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) * this bit, even if host CR4.MCE == 0. */ hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); - if (is_unrestricted_guest(vcpu)) + if (enable_unrestricted_guest) hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; else if (vmx->rmode.vm86_active) hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; @@ -3444,7 +3449,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vcpu->arch.cr4 = cr4; kvm_register_mark_available(vcpu, VCPU_EXREG_CR4); - if (!is_unrestricted_guest(vcpu)) { + if (!enable_unrestricted_guest) { if (enable_ept) { if (!is_paging(vcpu)) { hw_cr4 &= ~X86_CR4_PAE; -- cgit v1.2.3-58-ga151 From 5a7591176c47cce363c1eed704241e5d1c42c5a6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jun 2023 13:30:37 -0700 Subject: KVM: selftests: Expand x86's sregs test to cover illegal CR0 values Add coverage to x86's set_sregs_test to verify KVM rejects vendor-agnostic illegal CR0 values, i.e. CR0 values whose legality doesn't depend on the current VMX mode. KVM historically has neglected to reject bad CR0s from userspace, i.e. would happily accept a completely bogus CR0 via KVM_SET_SREGS{2}. Punt VMX specific subtests to future work, as they would require quite a bit more effort, and KVM gets coverage for CR0 checks in general through other means, e.g. KVM-Unit-Tests. Signed-off-by: Sean Christopherson Message-Id: <20230613203037.1968489-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/x86_64/set_sregs_test.c | 70 ++++++++++++---------- 1 file changed, 39 insertions(+), 31 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c index a284fcef6ed7..3610981d9162 100644 --- a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c +++ b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c @@ -22,26 +22,25 @@ #include "kvm_util.h" #include "processor.h" -static void test_cr4_feature_bit(struct kvm_vcpu *vcpu, struct kvm_sregs *orig, - uint64_t feature_bit) -{ - struct kvm_sregs sregs; - int rc; - - /* Skip the sub-test, the feature is supported. */ - if (orig->cr4 & feature_bit) - return; - - memcpy(&sregs, orig, sizeof(sregs)); - sregs.cr4 |= feature_bit; - - rc = _vcpu_sregs_set(vcpu, &sregs); - TEST_ASSERT(rc, "KVM allowed unsupported CR4 bit (0x%lx)", feature_bit); - - /* Sanity check that KVM didn't change anything. */ - vcpu_sregs_get(vcpu, &sregs); - TEST_ASSERT(!memcmp(&sregs, orig, sizeof(sregs)), "KVM modified sregs"); -} +#define TEST_INVALID_CR_BIT(vcpu, cr, orig, bit) \ +do { \ + struct kvm_sregs new; \ + int rc; \ + \ + /* Skip the sub-test, the feature/bit is supported. */ \ + if (orig.cr & bit) \ + break; \ + \ + memcpy(&new, &orig, sizeof(sregs)); \ + new.cr |= bit; \ + \ + rc = _vcpu_sregs_set(vcpu, &new); \ + TEST_ASSERT(rc, "KVM allowed invalid " #cr " bit (0x%lx)", bit); \ + \ + /* Sanity check that KVM didn't change anything. */ \ + vcpu_sregs_get(vcpu, &new); \ + TEST_ASSERT(!memcmp(&new, &orig, sizeof(new)), "KVM modified sregs"); \ +} while (0) static uint64_t calc_supported_cr4_feature_bits(void) { @@ -80,7 +79,7 @@ int main(int argc, char *argv[]) struct kvm_vcpu *vcpu; struct kvm_vm *vm; uint64_t cr4; - int rc; + int rc, i; /* * Create a dummy VM, specifically to avoid doing KVM_SET_CPUID2, and @@ -92,6 +91,7 @@ int main(int argc, char *argv[]) vcpu_sregs_get(vcpu, &sregs); + sregs.cr0 = 0; sregs.cr4 |= calc_supported_cr4_feature_bits(); cr4 = sregs.cr4; @@ -103,16 +103,24 @@ int main(int argc, char *argv[]) sregs.cr4, cr4); /* Verify all unsupported features are rejected by KVM. */ - test_cr4_feature_bit(vcpu, &sregs, X86_CR4_UMIP); - test_cr4_feature_bit(vcpu, &sregs, X86_CR4_LA57); - test_cr4_feature_bit(vcpu, &sregs, X86_CR4_VMXE); - test_cr4_feature_bit(vcpu, &sregs, X86_CR4_SMXE); - test_cr4_feature_bit(vcpu, &sregs, X86_CR4_FSGSBASE); - test_cr4_feature_bit(vcpu, &sregs, X86_CR4_PCIDE); - test_cr4_feature_bit(vcpu, &sregs, X86_CR4_OSXSAVE); - test_cr4_feature_bit(vcpu, &sregs, X86_CR4_SMEP); - test_cr4_feature_bit(vcpu, &sregs, X86_CR4_SMAP); - test_cr4_feature_bit(vcpu, &sregs, X86_CR4_PKE); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_UMIP); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_LA57); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_VMXE); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_SMXE); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_FSGSBASE); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_PCIDE); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_OSXSAVE); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_SMEP); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_SMAP); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_PKE); + + for (i = 32; i < 64; i++) + TEST_INVALID_CR_BIT(vcpu, cr0, sregs, BIT(i)); + + /* NW without CD is illegal, as is PG without PE. */ + TEST_INVALID_CR_BIT(vcpu, cr0, sregs, X86_CR0_NW); + TEST_INVALID_CR_BIT(vcpu, cr0, sregs, X86_CR0_PG); + kvm_vm_free(vm); /* Create a "real" VM and verify APIC_BASE can be set. */ -- cgit v1.2.3-58-ga151