From 2b01281273738bf2d6551da48d65db2df3f28998 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:08:45 +0000 Subject: KVM: Register /dev/kvm as the _very_ last thing during initialization Register /dev/kvm, i.e. expose KVM to userspace, only after all other setup has completed. Once /dev/kvm is exposed, userspace can start invoking KVM ioctls, creating VMs, etc... If userspace creates a VM before KVM is done with its configuration, bad things may happen, e.g. KVM will fail to properly migrate vCPU state if a VM is created before KVM has registered preemption notifiers. Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 13e88297f999..28a1a02f5228 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5988,12 +5988,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, kvm_chardev_ops.owner = module; - r = misc_register(&kvm_dev); - if (r) { - pr_err("kvm: misc device register failed\n"); - goto out_unreg; - } - register_syscore_ops(&kvm_syscore_ops); kvm_preempt_ops.sched_in = kvm_sched_in; @@ -6002,11 +5996,24 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, kvm_init_debug(); r = kvm_vfio_ops_init(); - WARN_ON(r); + if (WARN_ON_ONCE(r)) + goto err_vfio; + + /* + * Registration _must_ be the very last thing done, as this exposes + * /dev/kvm to userspace, i.e. all infrastructure must be setup! + */ + r = misc_register(&kvm_dev); + if (r) { + pr_err("kvm: misc device register failed\n"); + goto err_register; + } return 0; -out_unreg: +err_register: + kvm_vfio_ops_exit(); +err_vfio: kvm_async_pf_deinit(); out_free_4: for_each_possible_cpu(cpu) @@ -6032,8 +6039,14 @@ void kvm_exit(void) { int cpu; - debugfs_remove_recursive(kvm_debugfs_dir); + /* + * Note, unregistering /dev/kvm doesn't strictly need to come first, + * fops_get(), a.k.a. try_module_get(), prevents acquiring references + * to KVM while the module is being stopped. + */ misc_deregister(&kvm_dev); + + debugfs_remove_recursive(kvm_debugfs_dir); for_each_possible_cpu(cpu) free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); kmem_cache_destroy(kvm_vcpu_cache); -- cgit v1.2.3-58-ga151 From 5910ccf03de4fd6d321cf30cab7a06f1436a2053 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:08:46 +0000 Subject: KVM: Initialize IRQ FD after arch hardware setup Move initialization of KVM's IRQ FD workqueue below arch hardware setup as a step towards consolidating arch "init" and "hardware setup", and eventually towards dropping the hooks entirely. There is no dependency on the workqueue being created before hardware setup, the workqueue is used only when destroying VMs, i.e. only needs to be created before /dev/kvm is exposed to userspace. Move the destruction of the workqueue before the arch hooks to maintain symmetry, and so that arch code can move away from the hooks without having to worry about ordering changes. Reword the comment about kvm_irqfd_init() needing to come after kvm_arch_init() to call out that kvm_arch_init() must come before common KVM does _anything_, as x86 very subtly relies on that behavior to deal with multiple calls to kvm_init(), e.g. if userspace attempts to load kvm_amd.ko and kvm_intel.ko. Tag the code with a FIXME, as x86's subtle requirement is gross, and invoking an arch callback as the very first action in a helper that is called only from arch code is silly. Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 28a1a02f5228..6793a7e3525a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5921,24 +5921,19 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, int r; int cpu; - r = kvm_arch_init(opaque); - if (r) - goto out_fail; - /* - * kvm_arch_init makes sure there's at most one caller - * for architectures that support multiple implementations, - * like intel and amd on x86. - * kvm_arch_init must be called before kvm_irqfd_init to avoid creating - * conflicts in case kvm is already setup for another implementation. + * FIXME: Get rid of kvm_arch_init(), vendor code should call arch code + * directly. Note, kvm_arch_init() _must_ be called before anything + * else as x86 relies on checks buried in kvm_arch_init() to guard + * against multiple calls to kvm_init(). */ - r = kvm_irqfd_init(); + r = kvm_arch_init(opaque); if (r) - goto out_irqfd; + return r; if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { r = -ENOMEM; - goto out_free_0; + goto err_hw_enabled; } r = kvm_arch_hardware_setup(opaque); @@ -5982,9 +5977,13 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, } } + r = kvm_irqfd_init(); + if (r) + goto err_irqfd; + r = kvm_async_pf_init(); if (r) - goto out_free_4; + goto err_async_pf; kvm_chardev_ops.owner = module; @@ -6015,6 +6014,9 @@ err_register: kvm_vfio_ops_exit(); err_vfio: kvm_async_pf_deinit(); +err_async_pf: + kvm_irqfd_exit(); +err_irqfd: out_free_4: for_each_possible_cpu(cpu) free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); @@ -6026,11 +6028,8 @@ out_free_2: kvm_arch_hardware_unsetup(); out_free_1: free_cpumask_var(cpus_hardware_enabled); -out_free_0: - kvm_irqfd_exit(); -out_irqfd: +err_hw_enabled: kvm_arch_exit(); -out_fail: return r; } EXPORT_SYMBOL_GPL(kvm_init); @@ -6055,9 +6054,9 @@ void kvm_exit(void) unregister_reboot_notifier(&kvm_reboot_notifier); cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); on_each_cpu(hardware_disable_nolock, NULL, 1); + kvm_irqfd_exit(); kvm_arch_hardware_unsetup(); kvm_arch_exit(); - kvm_irqfd_exit(); free_cpumask_var(cpus_hardware_enabled); kvm_vfio_ops_exit(); } -- cgit v1.2.3-58-ga151 From c9650228efbad65c1b9681198f3502de08a42944 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:08:47 +0000 Subject: KVM: Allocate cpus_hardware_enabled after arch hardware setup Allocate cpus_hardware_enabled after arch hardware setup so that arch "init" and "hardware setup" are called back-to-back and thus can be combined in a future patch. cpus_hardware_enabled is never used before kvm_create_vm(), i.e. doesn't have a dependency with hardware setup and only needs to be allocated before /dev/kvm is exposed to userspace. Free the object before the arch hooks are invoked to maintain symmetry, and so that arch code can move away from the hooks without having to worry about ordering changes. Signed-off-by: Sean Christopherson Reviewed-by: Yuan Yao Message-Id: <20221130230934.1014142-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6793a7e3525a..63983588c93b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5931,15 +5931,15 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, if (r) return r; + r = kvm_arch_hardware_setup(opaque); + if (r < 0) + goto err_hw_setup; + if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { r = -ENOMEM; goto err_hw_enabled; } - r = kvm_arch_hardware_setup(opaque); - if (r < 0) - goto out_free_1; - c.ret = &r; c.opaque = opaque; for_each_online_cpu(cpu) { @@ -6025,10 +6025,10 @@ out_free_3: unregister_reboot_notifier(&kvm_reboot_notifier); cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); out_free_2: - kvm_arch_hardware_unsetup(); -out_free_1: free_cpumask_var(cpus_hardware_enabled); err_hw_enabled: + kvm_arch_hardware_unsetup(); +err_hw_setup: kvm_arch_exit(); return r; } @@ -6055,9 +6055,9 @@ void kvm_exit(void) cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); on_each_cpu(hardware_disable_nolock, NULL, 1); kvm_irqfd_exit(); + free_cpumask_var(cpus_hardware_enabled); kvm_arch_hardware_unsetup(); kvm_arch_exit(); - free_cpumask_var(cpus_hardware_enabled); kvm_vfio_ops_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); -- cgit v1.2.3-58-ga151 From 73b8dc04132c1f8d4f9250a39f1fc9878ba2ee13 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:08:48 +0000 Subject: KVM: Teardown VFIO ops earlier in kvm_exit() Move the call to kvm_vfio_ops_exit() further up kvm_exit() to try and bring some amount of symmetry to the setup order in kvm_init(), and more importantly so that the arch hooks are invoked dead last by kvm_exit(). This will allow arch code to move away from the arch hooks without any change in ordering between arch code and common code in kvm_exit(). That kvm_vfio_ops_exit() is called last appears to be 100% arbitrary. It was bolted on after the fact by commit 571ee1b68598 ("kvm: vfio: fix unregister kvm_device_ops of vfio"). The nullified kvm_device_ops_table is also local to kvm_main.c and is used only when there are active VMs, so unless arch code is doing something truly bizarre, nullifying the table earlier in kvm_exit() is little more than a nop. Signed-off-by: Sean Christopherson Reviewed-by: Cornelia Huck Reviewed-by: Eric Farman Message-Id: <20221130230934.1014142-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 63983588c93b..70264378da41 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -6049,6 +6049,7 @@ void kvm_exit(void) for_each_possible_cpu(cpu) free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); kmem_cache_destroy(kvm_vcpu_cache); + kvm_vfio_ops_exit(); kvm_async_pf_deinit(); unregister_syscore_ops(&kvm_syscore_ops); unregister_reboot_notifier(&kvm_reboot_notifier); @@ -6058,7 +6059,6 @@ void kvm_exit(void) free_cpumask_var(cpus_hardware_enabled); kvm_arch_hardware_unsetup(); kvm_arch_exit(); - kvm_vfio_ops_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); -- cgit v1.2.3-58-ga151 From 63a1bd8ad1ac9e4e8bfcd5914c8899606e04898d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:08:53 +0000 Subject: KVM: Drop arch hardware (un)setup hooks Drop kvm_arch_hardware_setup() and kvm_arch_hardware_unsetup() now that all implementations are nops. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Eric Farman # s390 Acked-by: Anup Patel Message-Id: <20221130230934.1014142-10-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/arm64/include/asm/kvm_host.h | 1 - arch/arm64/kvm/arm.c | 5 ----- arch/mips/include/asm/kvm_host.h | 1 - arch/mips/kvm/mips.c | 5 ----- arch/powerpc/include/asm/kvm_host.h | 1 - arch/powerpc/kvm/powerpc.c | 5 ----- arch/riscv/include/asm/kvm_host.h | 1 - arch/riscv/kvm/main.c | 5 ----- arch/s390/kvm/kvm-s390.c | 10 ---------- arch/x86/kvm/x86.c | 10 ---------- include/linux/kvm_host.h | 2 -- virt/kvm/kvm_main.c | 7 ------- 12 files changed, 53 deletions(-) (limited to 'virt') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 35a159d131b5..f50951c51d3b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -943,7 +943,6 @@ static inline bool kvm_system_needs_idmapped_vectors(void) void kvm_arm_vcpu_ptrauth_trap(struct kvm_vcpu *vcpu); -static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 9c5573bc4614..e6c21b804c13 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -63,11 +63,6 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; } -int kvm_arch_hardware_setup(void *opaque) -{ - return 0; -} - int kvm_arch_check_processor_compat(void *opaque) { return 0; diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 5cedb28e8a40..28f0ba97db71 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -888,7 +888,6 @@ extern unsigned long kvm_mips_get_ramsize(struct kvm *kvm); extern int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_mips_interrupt *irq); -static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) {} diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index a25e0b73ee70..af29490d9740 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -135,11 +135,6 @@ void kvm_arch_hardware_disable(void) kvm_mips_callbacks->hardware_disable(); } -int kvm_arch_hardware_setup(void *opaque) -{ - return 0; -} - int kvm_arch_check_processor_compat(void *opaque) { return 0; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index caea15dcb91d..5d2c3a487e73 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -877,7 +877,6 @@ struct kvm_vcpu_arch { #define __KVM_HAVE_CREATE_DEVICE static inline void kvm_arch_hardware_disable(void) {} -static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 04494a4fb37a..5faf69421f13 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -440,11 +440,6 @@ int kvm_arch_hardware_enable(void) return 0; } -int kvm_arch_hardware_setup(void *opaque) -{ - return 0; -} - int kvm_arch_check_processor_compat(void *opaque) { return kvmppc_core_check_processor_compat(); diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index 93f43a3e7886..a79b7d1514db 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -230,7 +230,6 @@ struct kvm_vcpu_arch { bool pause; }; -static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c index 58c5489d3031..afd6400a9e80 100644 --- a/arch/riscv/kvm/main.c +++ b/arch/riscv/kvm/main.c @@ -25,11 +25,6 @@ int kvm_arch_check_processor_compat(void *opaque) return 0; } -int kvm_arch_hardware_setup(void *opaque) -{ - return 0; -} - int kvm_arch_hardware_enable(void) { unsigned long hideleg, hedeleg; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 97c7ccd189eb..829e6e046003 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -329,16 +329,6 @@ static struct notifier_block kvm_clock_notifier = { .notifier_call = kvm_clock_sync, }; -int kvm_arch_hardware_setup(void *opaque) -{ - return 0; -} - -void kvm_arch_hardware_unsetup(void) -{ - -} - static void allow_cpu_feat(unsigned long nr) { set_bit_inv(nr, kvm_s390_available_cpu_feat); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4e7a71cab828..089f0eeea850 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12033,16 +12033,6 @@ void kvm_arch_hardware_disable(void) drop_user_return_notifiers(); } -int kvm_arch_hardware_setup(void *opaque) -{ - return 0; -} - -void kvm_arch_hardware_unsetup(void) -{ - -} - int kvm_arch_check_processor_compat(void *opaque) { struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4f26b244f6d0..7c1009c0e66d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1447,8 +1447,6 @@ static inline void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) {} int kvm_arch_hardware_enable(void); void kvm_arch_hardware_disable(void); -int kvm_arch_hardware_setup(void *opaque); -void kvm_arch_hardware_unsetup(void); int kvm_arch_check_processor_compat(void *opaque); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 70264378da41..8393347fd35f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5931,10 +5931,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, if (r) return r; - r = kvm_arch_hardware_setup(opaque); - if (r < 0) - goto err_hw_setup; - if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { r = -ENOMEM; goto err_hw_enabled; @@ -6027,8 +6023,6 @@ out_free_3: out_free_2: free_cpumask_var(cpus_hardware_enabled); err_hw_enabled: - kvm_arch_hardware_unsetup(); -err_hw_setup: kvm_arch_exit(); return r; } @@ -6057,7 +6051,6 @@ void kvm_exit(void) on_each_cpu(hardware_disable_nolock, NULL, 1); kvm_irqfd_exit(); free_cpumask_var(cpus_hardware_enabled); - kvm_arch_hardware_unsetup(); kvm_arch_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); -- cgit v1.2.3-58-ga151 From a578a0a9e3526483ad1904fac019d95e7089fb34 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:09:13 +0000 Subject: KVM: Drop kvm_arch_{init,exit}() hooks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop kvm_arch_init() and kvm_arch_exit() now that all implementations are nops. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Eric Farman # s390 Reviewed-by: Philippe Mathieu-Daudé Acked-by: Anup Patel Message-Id: <20221130230934.1014142-30-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/arm.c | 11 ----------- arch/mips/kvm/mips.c | 10 ---------- arch/powerpc/include/asm/kvm_host.h | 1 - arch/powerpc/kvm/powerpc.c | 5 ----- arch/riscv/kvm/main.c | 9 --------- arch/s390/kvm/kvm-s390.c | 10 ---------- arch/x86/kvm/x86.c | 10 ---------- include/linux/kvm_host.h | 3 --- virt/kvm/kvm_main.c | 19 ++----------------- 9 files changed, 2 insertions(+), 76 deletions(-) (limited to 'virt') diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index ea8057fa113f..04ed741f7b9d 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2301,17 +2301,6 @@ out_err: return err; } -int kvm_arch_init(void *opaque) -{ - return 0; -} - -/* NOP: Compiling as a module not supported */ -void kvm_arch_exit(void) -{ - -} - static int __init early_kvm_mode_cfg(char *arg) { if (!arg) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index ae7a24342fdf..3cade648827a 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1010,16 +1010,6 @@ long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) return r; } -int kvm_arch_init(void *opaque) -{ - return 0; -} - -void kvm_arch_exit(void) -{ - -} - int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 5d2c3a487e73..0a80e80c7b9e 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -881,7 +881,6 @@ static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} -static inline void kvm_arch_exit(void) {} static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index d44b85ba8cef..01d0f9935e6c 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -2539,11 +2539,6 @@ void kvmppc_init_lpid(unsigned long nr_lpids_param) } EXPORT_SYMBOL_GPL(kvmppc_init_lpid); -int kvm_arch_init(void *opaque) -{ - return 0; -} - EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ppc_instr); void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry) diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c index 1cd220e96c26..be951feee27a 100644 --- a/arch/riscv/kvm/main.c +++ b/arch/riscv/kvm/main.c @@ -65,15 +65,6 @@ void kvm_arch_hardware_disable(void) csr_write(CSR_HIDELEG, 0); } -int kvm_arch_init(void *opaque) -{ - return 0; -} - -void kvm_arch_exit(void) -{ -} - static int __init riscv_kvm_init(void) { const char *str; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 66d162723d21..25b08b956888 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -541,16 +541,6 @@ static void __kvm_s390_exit(void) debug_unregister(kvm_s390_dbf_uv); } -int kvm_arch_init(void *opaque) -{ - return 0; -} - -void kvm_arch_exit(void) -{ - -} - /* Section: device related */ long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 78cb6a46dd50..accf5a7ab8df 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9292,16 +9292,6 @@ static inline void kvm_ops_update(struct kvm_x86_init_ops *ops) kvm_pmu_ops_update(ops->pmu_ops); } -int kvm_arch_init(void *opaque) -{ - return 0; -} - -void kvm_arch_exit(void) -{ - -} - static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) { u64 host_pat; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7c1009c0e66d..62d977bb1448 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1423,9 +1423,6 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg); int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu); -int kvm_arch_init(void *opaque); -void kvm_arch_exit(void); - void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8393347fd35f..597ff734b312 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5921,20 +5921,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, int r; int cpu; - /* - * FIXME: Get rid of kvm_arch_init(), vendor code should call arch code - * directly. Note, kvm_arch_init() _must_ be called before anything - * else as x86 relies on checks buried in kvm_arch_init() to guard - * against multiple calls to kvm_init(). - */ - r = kvm_arch_init(opaque); - if (r) - return r; - - if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { - r = -ENOMEM; - goto err_hw_enabled; - } + if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) + return -ENOMEM; c.ret = &r; c.opaque = opaque; @@ -6022,8 +6010,6 @@ out_free_3: cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); out_free_2: free_cpumask_var(cpus_hardware_enabled); -err_hw_enabled: - kvm_arch_exit(); return r; } EXPORT_SYMBOL_GPL(kvm_init); @@ -6051,7 +6037,6 @@ void kvm_exit(void) on_each_cpu(hardware_disable_nolock, NULL, 1); kvm_irqfd_exit(); free_cpumask_var(cpus_hardware_enabled); - kvm_arch_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); -- cgit v1.2.3-58-ga151 From 81a1cf9f89a6b71e71bfd7d43837ce9235e70b38 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:09:16 +0000 Subject: KVM: Drop kvm_arch_check_processor_compat() hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop kvm_arch_check_processor_compat() and its support code now that all architecture implementations are nops. Signed-off-by: Sean Christopherson Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Eric Farman # s390 Acked-by: Anup Patel Reviewed-by: Kai Huang Message-Id: <20221130230934.1014142-33-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/arm.c | 7 +------ arch/mips/kvm/mips.c | 7 +------ arch/powerpc/kvm/book3s.c | 2 +- arch/powerpc/kvm/e500.c | 2 +- arch/powerpc/kvm/e500mc.c | 2 +- arch/powerpc/kvm/powerpc.c | 5 ----- arch/riscv/kvm/main.c | 7 +------ arch/s390/kvm/kvm-s390.c | 7 +------ arch/x86/kvm/svm/svm.c | 4 ++-- arch/x86/kvm/vmx/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 5 ----- include/linux/kvm_host.h | 4 +--- virt/kvm/kvm_main.c | 24 +----------------------- 13 files changed, 13 insertions(+), 67 deletions(-) (limited to 'virt') diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 04ed741f7b9d..698787ed87e9 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -63,11 +63,6 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; } -int kvm_arch_check_processor_compat(void *opaque) -{ - return 0; -} - int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { @@ -2285,7 +2280,7 @@ static __init int kvm_arm_init(void) * FIXME: Do something reasonable if kvm_init() fails after pKVM * hypervisor protection is finalized. */ - err = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + err = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); if (err) goto out_subs; diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 3cade648827a..36c8991b5d39 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -135,11 +135,6 @@ void kvm_arch_hardware_disable(void) kvm_mips_callbacks->hardware_disable(); } -int kvm_arch_check_processor_compat(void *opaque) -{ - return 0; -} - extern void kvm_init_loongson_ipi(struct kvm *kvm); int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) @@ -1636,7 +1631,7 @@ static int __init kvm_mips_init(void) register_die_notifier(&kvm_mips_csr_die_notifier); - ret = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + ret = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); if (ret) { unregister_die_notifier(&kvm_mips_csr_die_notifier); return ret; diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 87283a0e33d8..57f4e7896d67 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -1052,7 +1052,7 @@ static int kvmppc_book3s_init(void) { int r; - r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + r = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); if (r) return r; #ifdef CONFIG_KVM_BOOK3S_32_HANDLER diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index 0ea61190ec04..b0f695428733 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -531,7 +531,7 @@ static int __init kvmppc_e500_init(void) flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers + ivor[max_ivor] + handler_len); - r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); + r = kvm_init(sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); if (r) goto err_out; kvm_ops_e500.owner = THIS_MODULE; diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index 795667f7ebf0..611532a0dedc 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -404,7 +404,7 @@ static int __init kvmppc_e500mc_init(void) */ kvmppc_init_lpid(KVMPPC_NR_LPIDS/threads_per_core); - r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); + r = kvm_init(sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); if (r) goto err_out; kvm_ops_e500mc.owner = THIS_MODULE; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 01d0f9935e6c..f5b4ff6bfc89 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -440,11 +440,6 @@ int kvm_arch_hardware_enable(void) return 0; } -int kvm_arch_check_processor_compat(void *opaque) -{ - return 0; -} - int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { struct kvmppc_ops *kvm_ops = NULL; diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c index be951feee27a..e2da56ed9069 100644 --- a/arch/riscv/kvm/main.c +++ b/arch/riscv/kvm/main.c @@ -20,11 +20,6 @@ long kvm_arch_dev_ioctl(struct file *filp, return -EINVAL; } -int kvm_arch_check_processor_compat(void *opaque) -{ - return 0; -} - int kvm_arch_hardware_enable(void) { unsigned long hideleg, hedeleg; @@ -110,7 +105,7 @@ static int __init riscv_kvm_init(void) kvm_info("VMID %ld bits available\n", kvm_riscv_gstage_vmid_bits()); - return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + return kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); } module_init(riscv_kvm_init); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 25b08b956888..7ad8252e92c2 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -262,11 +262,6 @@ int kvm_arch_hardware_enable(void) return 0; } -int kvm_arch_check_processor_compat(void *opaque) -{ - return 0; -} - /* forward declarations */ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, unsigned long end); @@ -5716,7 +5711,7 @@ static int __init kvm_s390_init(void) if (r) return r; - r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + r = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); if (r) { __kvm_s390_exit(); return r; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e55f8e54e25a..6db44797e85e 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5109,8 +5109,8 @@ static int __init svm_init(void) * Common KVM initialization _must_ come last, after this, /dev/kvm is * exposed to userspace! */ - r = kvm_init(NULL, sizeof(struct vcpu_svm), - __alignof__(struct vcpu_svm), THIS_MODULE); + r = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm), + THIS_MODULE); if (r) goto err_kvm_init; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e996034809de..3df6b771d816 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8635,8 +8635,8 @@ static int __init vmx_init(void) * Common KVM initialization _must_ come last, after this, /dev/kvm is * exposed to userspace! */ - r = kvm_init(NULL, sizeof(struct vcpu_vmx), - __alignof__(struct vcpu_vmx), THIS_MODULE); + r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), + THIS_MODULE); if (r) goto err_kvm_init; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d94e6e8692cc..17cbd17c9077 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12084,11 +12084,6 @@ void kvm_arch_hardware_disable(void) drop_user_return_notifiers(); } -int kvm_arch_check_processor_compat(void *opaque) -{ - return 0; -} - bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) { return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 62d977bb1448..20e207e4c6c1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -956,8 +956,7 @@ static inline void kvm_irqfd_exit(void) { } #endif -int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, - struct module *module); +int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module); void kvm_exit(void); void kvm_get_kvm(struct kvm *kvm); @@ -1444,7 +1443,6 @@ static inline void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) {} int kvm_arch_hardware_enable(void); void kvm_arch_hardware_disable(void); -int kvm_arch_check_processor_compat(void *opaque); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 597ff734b312..e13b369cfc1b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5902,36 +5902,14 @@ void kvm_unregister_perf_callbacks(void) } #endif -struct kvm_cpu_compat_check { - void *opaque; - int *ret; -}; - -static void check_processor_compat(void *data) +int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) { - struct kvm_cpu_compat_check *c = data; - - *c->ret = kvm_arch_check_processor_compat(c->opaque); -} - -int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, - struct module *module) -{ - struct kvm_cpu_compat_check c; int r; int cpu; if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) return -ENOMEM; - c.ret = &r; - c.opaque = opaque; - for_each_online_cpu(cpu) { - smp_call_function_single(cpu, check_processor_compat, &c, 1); - if (r < 0) - goto out_free_2; - } - r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting", kvm_starting_cpu, kvm_dying_cpu); if (r) -- cgit v1.2.3-58-ga151 From aaf12a7b4323eb7d94677bcefc286ff6b772ed1c Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Wed, 30 Nov 2022 23:09:25 +0000 Subject: KVM: Rename and move CPUHP_AP_KVM_STARTING to ONLINE section The CPU STARTING section doesn't allow callbacks to fail. Move KVM's hotplug callback to ONLINE section so that it can abort onlining a CPU in certain cases to avoid potentially breaking VMs running on existing CPUs. For example, when KVM fails to enable hardware virtualization on the hotplugged CPU. Place KVM's hotplug state before CPUHP_AP_SCHED_WAIT_EMPTY as it ensures when offlining a CPU, all user tasks and non-pinned kernel tasks have left the CPU, i.e. there cannot be a vCPU task around. So, it is safe for KVM's CPU offline callback to disable hardware virtualization at that point. Likewise, KVM's online callback can enable hardware virtualization before any vCPU task gets a chance to run on hotplugged CPUs. Drop kvm_x86_check_processor_compatibility()'s WARN that IRQs are disabled, as the ONLINE section runs with IRQs disabled. The WARN wasn't intended to be a requirement, e.g. disabling preemption is sufficient, the IRQ thing was purely an aggressive sanity check since the helper was only ever invoked via SMP function call. Rename KVM's CPU hotplug callbacks accordingly. Suggested-by: Thomas Gleixner Signed-off-by: Chao Gao Signed-off-by: Isaku Yamahata Reviewed-by: Yuan Yao [sean: drop WARN that IRQs are disabled] Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-42-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 -- include/linux/cpuhotplug.h | 2 +- virt/kvm/kvm_main.c | 30 ++++++++++++++++++++++-------- 3 files changed, 23 insertions(+), 11 deletions(-) (limited to 'virt') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6575d9e7b9b6..f2971821ec26 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9298,8 +9298,6 @@ static int kvm_x86_check_processor_compatibility(void) { struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); - WARN_ON(!irqs_disabled()); - if (__cr4_reserved_bits(cpu_has, c) != __cr4_reserved_bits(cpu_has, &boot_cpu_data)) return -EIO; diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 5cae6bd22f7f..5b2f8147d1ae 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -187,7 +187,6 @@ enum cpuhp_state { CPUHP_AP_CSKY_TIMER_STARTING, CPUHP_AP_TI_GP_TIMER_STARTING, CPUHP_AP_HYPERV_TIMER_STARTING, - CPUHP_AP_KVM_STARTING, /* Must be the last timer callback */ CPUHP_AP_DUMMY_TIMER_STARTING, CPUHP_AP_ARM_XEN_STARTING, @@ -202,6 +201,7 @@ enum cpuhp_state { /* Online section invoked on the hotplugged CPU from the hotplug thread */ CPUHP_AP_ONLINE_IDLE, + CPUHP_AP_KVM_ONLINE, CPUHP_AP_SCHED_WAIT_EMPTY, CPUHP_AP_SMPBOOT_THREADS, CPUHP_AP_X86_VDSO_VMA_ONLINE, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e13b369cfc1b..ee1005cb99e1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5114,13 +5114,27 @@ static void hardware_enable_nolock(void *junk) } } -static int kvm_starting_cpu(unsigned int cpu) +static int kvm_online_cpu(unsigned int cpu) { + int ret = 0; + + /* + * Abort the CPU online process if hardware virtualization cannot + * be enabled. Otherwise running VMs would encounter unrecoverable + * errors when scheduled to this CPU. + */ raw_spin_lock(&kvm_count_lock); - if (kvm_usage_count) + if (kvm_usage_count) { + WARN_ON_ONCE(atomic_read(&hardware_enable_failed)); + hardware_enable_nolock(NULL); + if (atomic_read(&hardware_enable_failed)) { + atomic_set(&hardware_enable_failed, 0); + ret = -EIO; + } + } raw_spin_unlock(&kvm_count_lock); - return 0; + return ret; } static void hardware_disable_nolock(void *junk) @@ -5133,7 +5147,7 @@ static void hardware_disable_nolock(void *junk) kvm_arch_hardware_disable(); } -static int kvm_dying_cpu(unsigned int cpu) +static int kvm_offline_cpu(unsigned int cpu) { raw_spin_lock(&kvm_count_lock); if (kvm_usage_count) @@ -5910,8 +5924,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) return -ENOMEM; - r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting", - kvm_starting_cpu, kvm_dying_cpu); + r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online", + kvm_online_cpu, kvm_offline_cpu); if (r) goto out_free_2; register_reboot_notifier(&kvm_reboot_notifier); @@ -5985,7 +5999,7 @@ out_free_4: kmem_cache_destroy(kvm_vcpu_cache); out_free_3: unregister_reboot_notifier(&kvm_reboot_notifier); - cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); + cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); out_free_2: free_cpumask_var(cpus_hardware_enabled); return r; @@ -6011,7 +6025,7 @@ void kvm_exit(void) kvm_async_pf_deinit(); unregister_syscore_ops(&kvm_syscore_ops); unregister_reboot_notifier(&kvm_reboot_notifier); - cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); + cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); on_each_cpu(hardware_disable_nolock, NULL, 1); kvm_irqfd_exit(); free_cpumask_var(cpus_hardware_enabled); -- cgit v1.2.3-58-ga151 From e4aa7f88af1a123863530af4a238ce64ec8fad5a Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Wed, 30 Nov 2022 23:09:26 +0000 Subject: KVM: Disable CPU hotplug during hardware enabling/disabling Disable CPU hotplug when enabling/disabling hardware to prevent the corner case where if the following sequence occurs: 1. A hotplugged CPU marks itself online in cpu_online_mask 2. The hotplugged CPU enables interrupt before invoking KVM's ONLINE callback 3 hardware_{en,dis}able_all() is invoked on another CPU the hotplugged CPU will be included in on_each_cpu() and thus get sent through hardware_{en,dis}able_nolock() before kvm_online_cpu() is called. start_secondary { ... set_cpu_online(smp_processor_id(), true); <- 1 ... local_irq_enable(); <- 2 ... cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); <- 3 } KVM currently fudges around this race by keeping track of which CPUs have done hardware enabling (see commit 1b6c016818a5 "KVM: Keep track of which cpus have virtualization enabled"), but that's an inefficient, convoluted, and hacky solution. Signed-off-by: Chao Gao [sean: split to separate patch, write changelog] Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-43-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 11 ++++++++++- virt/kvm/kvm_main.c | 12 ++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'virt') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f2971821ec26..c3ac88036b52 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9296,7 +9296,16 @@ static inline void kvm_ops_update(struct kvm_x86_init_ops *ops) static int kvm_x86_check_processor_compatibility(void) { - struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + int cpu = smp_processor_id(); + struct cpuinfo_x86 *c = &cpu_data(cpu); + + /* + * Compatibility checks are done when loading KVM and when enabling + * hardware, e.g. during CPU hotplug, to ensure all online CPUs are + * compatible, i.e. KVM should never perform a compatibility check on + * an offline CPU. + */ + WARN_ON(!cpu_online(cpu)); if (__cr4_reserved_bits(cpu_has, c) != __cr4_reserved_bits(cpu_has, &boot_cpu_data)) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ee1005cb99e1..2d6203ecab88 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5167,15 +5167,26 @@ static void hardware_disable_all_nolock(void) static void hardware_disable_all(void) { + cpus_read_lock(); raw_spin_lock(&kvm_count_lock); hardware_disable_all_nolock(); raw_spin_unlock(&kvm_count_lock); + cpus_read_unlock(); } static int hardware_enable_all(void) { int r = 0; + /* + * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu() + * is called, and so on_each_cpu() between them includes the CPU that + * is being onlined. As a result, hardware_enable_nolock() may get + * invoked before kvm_online_cpu(), which also enables hardware if the + * usage count is non-zero. Disable CPU hotplug to avoid attempting to + * enable hardware multiple times. + */ + cpus_read_lock(); raw_spin_lock(&kvm_count_lock); kvm_usage_count++; @@ -5190,6 +5201,7 @@ static int hardware_enable_all(void) } raw_spin_unlock(&kvm_count_lock); + cpus_read_unlock(); return r; } -- cgit v1.2.3-58-ga151 From 2c106f29004d2a6dbdc375cd9c0c8f981f475f47 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:09:27 +0000 Subject: KVM: Ensure CPU is stable during low level hardware enable/disable Use the non-raw smp_processor_id() in the low hardware enable/disable helpers as KVM absolutely relies on the CPU being stable, e.g. KVM would end up with incorrect state if the task were migrated between accessing cpus_hardware_enabled and actually enabling/disabling hardware. Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-44-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2d6203ecab88..0c3b9d6fa1b1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5097,7 +5097,7 @@ static struct miscdevice kvm_dev = { static void hardware_enable_nolock(void *junk) { - int cpu = raw_smp_processor_id(); + int cpu = smp_processor_id(); int r; if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) @@ -5139,7 +5139,7 @@ static int kvm_online_cpu(unsigned int cpu) static void hardware_disable_nolock(void *junk) { - int cpu = raw_smp_processor_id(); + int cpu = smp_processor_id(); if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) return; -- cgit v1.2.3-58-ga151 From 0bf50497f03b3d892c470c7d1a10a3e9c3c95821 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 30 Nov 2022 23:09:28 +0000 Subject: KVM: Drop kvm_count_lock and instead protect kvm_usage_count with kvm_lock Drop kvm_count_lock and instead protect kvm_usage_count with kvm_lock now that KVM hooks CPU hotplug during the ONLINE phase, which can sleep. Previously, KVM hooked the STARTING phase, which is not allowed to sleep and thus could not take kvm_lock (a mutex). This effectively allows the task that's initiating hardware enabling/disabling to preempted and/or migrated. Note, the Documentation/virt/kvm/locking.rst statement that kvm_count_lock is "raw" because hardware enabling/disabling needs to be atomic with respect to migration is wrong on multiple fronts. First, while regular spinlocks can be preempted, the task holding the lock cannot be migrated. Second, preventing migration is not required. on_each_cpu() disables preemption, which ensures that cpus_hardware_enabled correctly reflects hardware state. The task may be preempted/migrated between bumping kvm_usage_count and invoking on_each_cpu(), but that's perfectly ok as kvm_usage_count is still protected, e.g. other tasks that call hardware_enable_all() will be blocked until the preempted/migrated owner exits its critical section. KVM does have lockless accesses to kvm_usage_count in the suspend/resume flows, but those are safe because all tasks must be frozen prior to suspending CPUs, and a task cannot be frozen while it holds one or more locks (userspace tasks are frozen via a fake signal). Preemption doesn't need to be explicitly disabled in the hotplug path. The hotplug thread is pinned to the CPU that's being hotplugged, and KVM only cares about having a stable CPU, i.e. to ensure hardware is enabled on the correct CPU. Lockep, i.e. check_preemption_disabled(), plays nice with this state too, as is_percpu_thread() is true for the hotplug thread. Signed-off-by: Isaku Yamahata Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-45-seanjc@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/locking.rst | 19 ++++++++++--------- virt/kvm/kvm_main.c | 36 ++++++++++++++++++++++++------------ 2 files changed, 34 insertions(+), 21 deletions(-) (limited to 'virt') diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index 0f6067d13622..1147836742cc 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -9,6 +9,8 @@ KVM Lock Overview The acquisition orders for mutexes are as follows: +- cpus_read_lock() is taken outside kvm_lock + - kvm->lock is taken outside vcpu->mutex - kvm->lock is taken outside kvm->slots_lock and kvm->irq_lock @@ -225,15 +227,10 @@ time it will be set using the Dirty tracking mechanism described above. :Type: mutex :Arch: any :Protects: - vm_list - -``kvm_count_lock`` -^^^^^^^^^^^^^^^^^^ - -:Type: raw_spinlock_t -:Arch: any -:Protects: - hardware virtualization enable/disable -:Comment: 'raw' because hardware enabling/disabling must be atomic /wrt - migration. + - kvm_usage_count + - hardware virtualization enable/disable +:Comment: KVM also disables CPU hotplug via cpus_read_lock() during + enable/disable. ``kvm->mn_invalidate_lock`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -297,3 +294,7 @@ time it will be set using the Dirty tracking mechanism described above. :Type: mutex :Arch: x86 :Protects: loading a vendor module (kvm_amd or kvm_intel) +:Comment: Exists because using kvm_lock leads to deadlock. cpu_hotplug_lock is + taken outside of kvm_lock, e.g. in KVM's CPU online/offline callbacks, and + many operations need to take cpu_hotplug_lock when loading a vendor module, + e.g. updating static calls. diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0c3b9d6fa1b1..f6caee790dc8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -100,7 +100,6 @@ EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); */ DEFINE_MUTEX(kvm_lock); -static DEFINE_RAW_SPINLOCK(kvm_count_lock); LIST_HEAD(vm_list); static cpumask_var_t cpus_hardware_enabled; @@ -5123,17 +5122,18 @@ static int kvm_online_cpu(unsigned int cpu) * be enabled. Otherwise running VMs would encounter unrecoverable * errors when scheduled to this CPU. */ - raw_spin_lock(&kvm_count_lock); + mutex_lock(&kvm_lock); if (kvm_usage_count) { WARN_ON_ONCE(atomic_read(&hardware_enable_failed)); hardware_enable_nolock(NULL); + if (atomic_read(&hardware_enable_failed)) { atomic_set(&hardware_enable_failed, 0); ret = -EIO; } } - raw_spin_unlock(&kvm_count_lock); + mutex_unlock(&kvm_lock); return ret; } @@ -5149,10 +5149,10 @@ static void hardware_disable_nolock(void *junk) static int kvm_offline_cpu(unsigned int cpu) { - raw_spin_lock(&kvm_count_lock); + mutex_lock(&kvm_lock); if (kvm_usage_count) hardware_disable_nolock(NULL); - raw_spin_unlock(&kvm_count_lock); + mutex_unlock(&kvm_lock); return 0; } @@ -5168,9 +5168,9 @@ static void hardware_disable_all_nolock(void) static void hardware_disable_all(void) { cpus_read_lock(); - raw_spin_lock(&kvm_count_lock); + mutex_lock(&kvm_lock); hardware_disable_all_nolock(); - raw_spin_unlock(&kvm_count_lock); + mutex_unlock(&kvm_lock); cpus_read_unlock(); } @@ -5187,7 +5187,7 @@ static int hardware_enable_all(void) * enable hardware multiple times. */ cpus_read_lock(); - raw_spin_lock(&kvm_count_lock); + mutex_lock(&kvm_lock); kvm_usage_count++; if (kvm_usage_count == 1) { @@ -5200,7 +5200,7 @@ static int hardware_enable_all(void) } } - raw_spin_unlock(&kvm_count_lock); + mutex_unlock(&kvm_lock); cpus_read_unlock(); return r; @@ -5806,6 +5806,17 @@ static void kvm_init_debug(void) static int kvm_suspend(void) { + /* + * Secondary CPUs and CPU hotplug are disabled across the suspend/resume + * callbacks, i.e. no need to acquire kvm_lock to ensure the usage count + * is stable. Assert that kvm_lock is not held to ensure the system + * isn't suspended while KVM is enabling hardware. Hardware enabling + * can be preempted, but the task cannot be frozen until it has dropped + * all locks (userspace tasks are frozen via a fake signal). + */ + lockdep_assert_not_held(&kvm_lock); + lockdep_assert_irqs_disabled(); + if (kvm_usage_count) hardware_disable_nolock(NULL); return 0; @@ -5813,10 +5824,11 @@ static int kvm_suspend(void) static void kvm_resume(void) { - if (kvm_usage_count) { - lockdep_assert_not_held(&kvm_count_lock); + lockdep_assert_not_held(&kvm_lock); + lockdep_assert_irqs_disabled(); + + if (kvm_usage_count) hardware_enable_nolock(NULL); - } } static struct syscore_ops kvm_syscore_ops = { -- cgit v1.2.3-58-ga151 From 667a83bf6a30b5660d02712dce9fb07e99abde38 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 30 Nov 2022 23:09:29 +0000 Subject: KVM: Remove on_each_cpu(hardware_disable_nolock) in kvm_exit() Drop the superfluous invocation of hardware_disable_nolock() during kvm_exit(), as it's nothing more than a glorified nop. KVM automatically disables hardware on all CPUs when the last VM is destroyed, and kvm_exit() cannot be called until the last VM goes away as the calling module is pinned by an elevated refcount of the fops associated with /dev/kvm. This holds true even on x86, where the caller of kvm_exit() is not kvm.ko, but is instead a dependent module, kvm_amd.ko or kvm_intel.ko, as kvm_chardev_ops.owner is set to the module that calls kvm_init(), not hardcoded to the base kvm.ko module. Signed-off-by: Isaku Yamahata [sean: rework changelog] Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-46-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 1 - 1 file changed, 1 deletion(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f6caee790dc8..d1ed34856412 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -6050,7 +6050,6 @@ void kvm_exit(void) unregister_syscore_ops(&kvm_syscore_ops); unregister_reboot_notifier(&kvm_reboot_notifier); cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); - on_each_cpu(hardware_disable_nolock, NULL, 1); kvm_irqfd_exit(); free_cpumask_var(cpus_hardware_enabled); } -- cgit v1.2.3-58-ga151 From 37d258818562bbb5baf487489848640abceafbd9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:09:30 +0000 Subject: KVM: Use a per-CPU variable to track which CPUs have enabled virtualization Use a per-CPU variable instead of a shared bitmap to track which CPUs have successfully enabled virtualization hardware. Using a per-CPU bool avoids the need for an additional allocation, and arguably yields easier to read code. Using a bitmap would be advantageous if KVM used it to avoid generating IPIs to CPUs that failed to enable hardware, but that's an extreme edge case and not worth optimizing, and the low level helpers would still want to keep their individual checks as attempting to enable virtualization hardware when it's already enabled can be problematic, e.g. Intel's VMXON will fault. Opportunistically change the order in hardware_enable_nolock() to set the flag if and only if hardware enabling is successful, instead of speculatively setting the flag and then clearing it on failure. Add a comment explaining that the check in hardware_disable_nolock() isn't simply paranoia. Waaay back when, commit 1b6c016818a5 ("KVM: Keep track of which cpus have virtualization enabled"), added the logic as a guards against CPU hotplug racing with hardware enable/disable. Now that KVM has eliminated the race by taking cpu_hotplug_lock for read (via cpus_read_lock()) when enabling or disabling hardware, at first glance it appears that the check is now superfluous, i.e. it's tempting to remove the per-CPU flag entirely... Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-47-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 41 ++++++++++++++++++----------------------- 1 file changed, 18 insertions(+), 23 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d1ed34856412..bcfc0bc15153 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -102,7 +102,7 @@ EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); DEFINE_MUTEX(kvm_lock); LIST_HEAD(vm_list); -static cpumask_var_t cpus_hardware_enabled; +static DEFINE_PER_CPU(bool, hardware_enabled); static int kvm_usage_count; static atomic_t hardware_enable_failed; @@ -5096,21 +5096,17 @@ static struct miscdevice kvm_dev = { static void hardware_enable_nolock(void *junk) { - int cpu = smp_processor_id(); - int r; - - if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) + if (__this_cpu_read(hardware_enabled)) return; - cpumask_set_cpu(cpu, cpus_hardware_enabled); - - r = kvm_arch_hardware_enable(); - - if (r) { - cpumask_clear_cpu(cpu, cpus_hardware_enabled); + if (kvm_arch_hardware_enable()) { atomic_inc(&hardware_enable_failed); - pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu); + pr_info("kvm: enabling virtualization on CPU%d failed\n", + raw_smp_processor_id()); + return; } + + __this_cpu_write(hardware_enabled, true); } static int kvm_online_cpu(unsigned int cpu) @@ -5139,12 +5135,16 @@ static int kvm_online_cpu(unsigned int cpu) static void hardware_disable_nolock(void *junk) { - int cpu = smp_processor_id(); - - if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) + /* + * Note, hardware_disable_all_nolock() tells all online CPUs to disable + * hardware, not just CPUs that successfully enabled hardware! + */ + if (!__this_cpu_read(hardware_enabled)) return; - cpumask_clear_cpu(cpu, cpus_hardware_enabled); + kvm_arch_hardware_disable(); + + __this_cpu_write(hardware_enabled, false); } static int kvm_offline_cpu(unsigned int cpu) @@ -5945,13 +5945,11 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) int r; int cpu; - if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) - return -ENOMEM; - r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online", kvm_online_cpu, kvm_offline_cpu); if (r) - goto out_free_2; + return r; + register_reboot_notifier(&kvm_reboot_notifier); /* A kmem cache lets us meet the alignment requirements of fx_save. */ @@ -6024,8 +6022,6 @@ out_free_4: out_free_3: unregister_reboot_notifier(&kvm_reboot_notifier); cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); -out_free_2: - free_cpumask_var(cpus_hardware_enabled); return r; } EXPORT_SYMBOL_GPL(kvm_init); @@ -6051,7 +6047,6 @@ void kvm_exit(void) unregister_reboot_notifier(&kvm_reboot_notifier); cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); kvm_irqfd_exit(); - free_cpumask_var(cpus_hardware_enabled); } EXPORT_SYMBOL_GPL(kvm_exit); -- cgit v1.2.3-58-ga151 From e6fb7d6eb4217badb018169ec2871da8e92f6555 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 30 Nov 2022 23:09:31 +0000 Subject: KVM: Make hardware_enable_failed a local variable in the "enable all" path Rework detecting hardware enabling errors to use a local variable in the "enable all" path to track whether or not enabling was successful across all CPUs. Using a global variable complicates paths that enable hardware only on the current CPU, e.g. kvm_resume() and kvm_online_cpu(). Opportunistically add a WARN if hardware enabling fails during kvm_resume(), KVM is all kinds of hosed if CPU0 fails to enable hardware. The WARN is largely futile in the current code, as KVM BUG()s on spurious faults on VMX instructions, e.g. attempting to run a vCPU on CPU if hardware enabling fails will explode. ------------[ cut here ]------------ kernel BUG at arch/x86/kvm/x86.c:508! invalid opcode: 0000 [#1] SMP CPU: 3 PID: 1009 Comm: CPU 4/KVM Not tainted 6.1.0-rc1+ #11 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:kvm_spurious_fault+0xa/0x10 Call Trace: vmx_vcpu_load_vmcs+0x192/0x230 [kvm_intel] vmx_vcpu_load+0x16/0x60 [kvm_intel] kvm_arch_vcpu_load+0x32/0x1f0 vcpu_load+0x2f/0x40 kvm_arch_vcpu_ioctl_run+0x19/0x9d0 kvm_vcpu_ioctl+0x271/0x660 __x64_sys_ioctl+0x80/0xb0 do_syscall_64+0x2b/0x50 entry_SYSCALL_64_after_hwframe+0x46/0xb0 But, the WARN may provide a breadcrumb to understand what went awry, and someday KVM may fix one or both of those bugs, e.g. by finding a way to eat spurious faults no matter the context (easier said than done due to side effects of certain operations, e.g. Intel's VMCLEAR). Signed-off-by: Isaku Yamahata [sean: rebase, WARN on failure in kvm_resume()] Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-48-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index bcfc0bc15153..2d26a898ac6c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -104,7 +104,6 @@ LIST_HEAD(vm_list); static DEFINE_PER_CPU(bool, hardware_enabled); static int kvm_usage_count; -static atomic_t hardware_enable_failed; static struct kmem_cache *kvm_vcpu_cache; @@ -5094,19 +5093,25 @@ static struct miscdevice kvm_dev = { &kvm_chardev_ops, }; -static void hardware_enable_nolock(void *junk) +static int __hardware_enable_nolock(void) { if (__this_cpu_read(hardware_enabled)) - return; + return 0; if (kvm_arch_hardware_enable()) { - atomic_inc(&hardware_enable_failed); pr_info("kvm: enabling virtualization on CPU%d failed\n", raw_smp_processor_id()); - return; + return -EIO; } __this_cpu_write(hardware_enabled, true); + return 0; +} + +static void hardware_enable_nolock(void *failed) +{ + if (__hardware_enable_nolock()) + atomic_inc(failed); } static int kvm_online_cpu(unsigned int cpu) @@ -5119,16 +5124,8 @@ static int kvm_online_cpu(unsigned int cpu) * errors when scheduled to this CPU. */ mutex_lock(&kvm_lock); - if (kvm_usage_count) { - WARN_ON_ONCE(atomic_read(&hardware_enable_failed)); - - hardware_enable_nolock(NULL); - - if (atomic_read(&hardware_enable_failed)) { - atomic_set(&hardware_enable_failed, 0); - ret = -EIO; - } - } + if (kvm_usage_count) + ret = __hardware_enable_nolock(); mutex_unlock(&kvm_lock); return ret; } @@ -5176,6 +5173,7 @@ static void hardware_disable_all(void) static int hardware_enable_all(void) { + atomic_t failed = ATOMIC_INIT(0); int r = 0; /* @@ -5191,10 +5189,9 @@ static int hardware_enable_all(void) kvm_usage_count++; if (kvm_usage_count == 1) { - atomic_set(&hardware_enable_failed, 0); - on_each_cpu(hardware_enable_nolock, NULL, 1); + on_each_cpu(hardware_enable_nolock, &failed, 1); - if (atomic_read(&hardware_enable_failed)) { + if (atomic_read(&failed)) { hardware_disable_all_nolock(); r = -EBUSY; } @@ -5828,7 +5825,7 @@ static void kvm_resume(void) lockdep_assert_irqs_disabled(); if (kvm_usage_count) - hardware_enable_nolock(NULL); + WARN_ON_ONCE(__hardware_enable_nolock()); } static struct syscore_ops kvm_syscore_ops = { -- cgit v1.2.3-58-ga151 From 35774a9f94db08df78e1c08d2b097666deec4e76 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:09:32 +0000 Subject: KVM: Register syscore (suspend/resume) ops early in kvm_init() Register the suspend/resume notifier hooks at the same time KVM registers its reboot notifier so that all the code in kvm_init() that deals with enabling/disabling hardware is bundled together. Opportunstically move KVM's implementations to reside near the reboot notifier code for the same reason. Bunching the code together will allow architectures to opt out of KVM's generic hardware enable/disable logic with minimal #ifdeffery. Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-49-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 68 ++++++++++++++++++++++++++--------------------------- 1 file changed, 34 insertions(+), 34 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2d26a898ac6c..6c0bd9a8e27e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5223,6 +5223,38 @@ static struct notifier_block kvm_reboot_notifier = { .priority = 0, }; +static int kvm_suspend(void) +{ + /* + * Secondary CPUs and CPU hotplug are disabled across the suspend/resume + * callbacks, i.e. no need to acquire kvm_lock to ensure the usage count + * is stable. Assert that kvm_lock is not held to ensure the system + * isn't suspended while KVM is enabling hardware. Hardware enabling + * can be preempted, but the task cannot be frozen until it has dropped + * all locks (userspace tasks are frozen via a fake signal). + */ + lockdep_assert_not_held(&kvm_lock); + lockdep_assert_irqs_disabled(); + + if (kvm_usage_count) + hardware_disable_nolock(NULL); + return 0; +} + +static void kvm_resume(void) +{ + lockdep_assert_not_held(&kvm_lock); + lockdep_assert_irqs_disabled(); + + if (kvm_usage_count) + WARN_ON_ONCE(__hardware_enable_nolock()); +} + +static struct syscore_ops kvm_syscore_ops = { + .suspend = kvm_suspend, + .resume = kvm_resume, +}; + static void kvm_io_bus_destroy(struct kvm_io_bus *bus) { int i; @@ -5801,38 +5833,6 @@ static void kvm_init_debug(void) } } -static int kvm_suspend(void) -{ - /* - * Secondary CPUs and CPU hotplug are disabled across the suspend/resume - * callbacks, i.e. no need to acquire kvm_lock to ensure the usage count - * is stable. Assert that kvm_lock is not held to ensure the system - * isn't suspended while KVM is enabling hardware. Hardware enabling - * can be preempted, but the task cannot be frozen until it has dropped - * all locks (userspace tasks are frozen via a fake signal). - */ - lockdep_assert_not_held(&kvm_lock); - lockdep_assert_irqs_disabled(); - - if (kvm_usage_count) - hardware_disable_nolock(NULL); - return 0; -} - -static void kvm_resume(void) -{ - lockdep_assert_not_held(&kvm_lock); - lockdep_assert_irqs_disabled(); - - if (kvm_usage_count) - WARN_ON_ONCE(__hardware_enable_nolock()); -} - -static struct syscore_ops kvm_syscore_ops = { - .suspend = kvm_suspend, - .resume = kvm_resume, -}; - static inline struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) { @@ -5948,6 +5948,7 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) return r; register_reboot_notifier(&kvm_reboot_notifier); + register_syscore_ops(&kvm_syscore_ops); /* A kmem cache lets us meet the alignment requirements of fx_save. */ if (!vcpu_align) @@ -5982,8 +5983,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) kvm_chardev_ops.owner = module; - register_syscore_ops(&kvm_syscore_ops); - kvm_preempt_ops.sched_in = kvm_sched_in; kvm_preempt_ops.sched_out = kvm_sched_out; @@ -6017,6 +6016,7 @@ out_free_4: free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); kmem_cache_destroy(kvm_vcpu_cache); out_free_3: + unregister_syscore_ops(&kvm_syscore_ops); unregister_reboot_notifier(&kvm_reboot_notifier); cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); return r; -- cgit v1.2.3-58-ga151 From 441f7bfa99fe2b8a7e504aa72047e20579e88a5d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:09:33 +0000 Subject: KVM: Opt out of generic hardware enabling on s390 and PPC Allow architectures to opt out of the generic hardware enabling logic, and opt out on both s390 and PPC, which don't need to manually enable virtualization as it's always on (when available). In addition to letting s390 and PPC drop a bit of dead code, this will hopefully also allow ARM to clean up its related code, e.g. ARM has its own per-CPU flag to track which CPUs have enable hardware due to the need to keep hardware enabled indefinitely when pKVM is enabled. Signed-off-by: Sean Christopherson Acked-by: Anup Patel Message-Id: <20221130230934.1014142-50-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/Kconfig | 1 + arch/mips/kvm/Kconfig | 1 + arch/powerpc/include/asm/kvm_host.h | 1 - arch/powerpc/kvm/powerpc.c | 5 ----- arch/riscv/kvm/Kconfig | 1 + arch/s390/include/asm/kvm_host.h | 1 - arch/s390/kvm/kvm-s390.c | 6 ------ arch/x86/kvm/Kconfig | 1 + include/linux/kvm_host.h | 4 ++++ virt/kvm/Kconfig | 3 +++ virt/kvm/kvm_main.c | 30 ++++++++++++++++++++++++------ 11 files changed, 35 insertions(+), 19 deletions(-) (limited to 'virt') diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 05da3c8f7e88..ca6eadeb7d1a 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -21,6 +21,7 @@ if VIRTUALIZATION menuconfig KVM bool "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM + select KVM_GENERIC_HARDWARE_ENABLING select MMU_NOTIFIER select PREEMPT_NOTIFIERS select HAVE_KVM_CPU_RELAX_INTERCEPT diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 91d197bee9c0..29e51649203b 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -28,6 +28,7 @@ config KVM select MMU_NOTIFIER select SRCU select INTERVAL_TREE + select KVM_GENERIC_HARDWARE_ENABLING help Support for hosting Guest kernels. diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 0a80e80c7b9e..959f566a455c 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -876,7 +876,6 @@ struct kvm_vcpu_arch { #define __KVM_HAVE_ARCH_WQP #define __KVM_HAVE_CREATE_DEVICE -static inline void kvm_arch_hardware_disable(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index f5b4ff6bfc89..4c5405fc5538 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -435,11 +435,6 @@ int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, } EXPORT_SYMBOL_GPL(kvmppc_ld); -int kvm_arch_hardware_enable(void) -{ - return 0; -} - int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { struct kvmppc_ops *kvm_ops = NULL; diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index f36a737d5f96..d5a658a047a7 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -20,6 +20,7 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support (EXPERIMENTAL)" depends on RISCV_SBI && MMU + select KVM_GENERIC_HARDWARE_ENABLING select MMU_NOTIFIER select PREEMPT_NOTIFIERS select KVM_MMIO diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index d67ce719d16a..2bbc3d54959d 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -1031,7 +1031,6 @@ extern char sie_exit; extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc); extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc); -static inline void kvm_arch_hardware_disable(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 7ad8252e92c2..bd25076aa19b 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -256,12 +256,6 @@ debug_info_t *kvm_s390_dbf; debug_info_t *kvm_s390_dbf_uv; /* Section: not file related */ -int kvm_arch_hardware_enable(void) -{ - /* every s390 is virtualization enabled ;-) */ - return 0; -} - /* forward declarations */ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, unsigned long end); diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index fbeaa9ddef59..8e578311ca9d 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -49,6 +49,7 @@ config KVM select SRCU select INTERVAL_TREE select HAVE_KVM_PM_NOTIFIER if PM + select KVM_GENERIC_HARDWARE_ENABLING help Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 20e207e4c6c1..109b18e2789c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1441,8 +1441,10 @@ void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_ static inline void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) {} #endif +#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING int kvm_arch_hardware_enable(void); void kvm_arch_hardware_disable(void); +#endif int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); @@ -2074,7 +2076,9 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) } } +#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING extern bool kvm_rebooting; +#endif extern unsigned int halt_poll_ns; extern unsigned int halt_poll_ns_grow; diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 9fb1ff6f19e5..b74916de5183 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -92,3 +92,6 @@ config KVM_XFER_TO_GUEST_WORK config HAVE_KVM_PM_NOTIFIER bool + +config KVM_GENERIC_HARDWARE_ENABLING + bool diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6c0bd9a8e27e..3d0678579c5e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -102,9 +102,6 @@ EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); DEFINE_MUTEX(kvm_lock); LIST_HEAD(vm_list); -static DEFINE_PER_CPU(bool, hardware_enabled); -static int kvm_usage_count; - static struct kmem_cache *kvm_vcpu_cache; static __read_mostly struct preempt_ops kvm_preempt_ops; @@ -146,9 +143,6 @@ static void hardware_disable_all(void); static void kvm_io_bus_destroy(struct kvm_io_bus *bus); -__visible bool kvm_rebooting; -EXPORT_SYMBOL_GPL(kvm_rebooting); - #define KVM_EVENT_CREATE_VM 0 #define KVM_EVENT_DESTROY_VM 1 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); @@ -5093,6 +5087,13 @@ static struct miscdevice kvm_dev = { &kvm_chardev_ops, }; +#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING +__visible bool kvm_rebooting; +EXPORT_SYMBOL_GPL(kvm_rebooting); + +static DEFINE_PER_CPU(bool, hardware_enabled); +static int kvm_usage_count; + static int __hardware_enable_nolock(void) { if (__this_cpu_read(hardware_enabled)) @@ -5254,6 +5255,17 @@ static struct syscore_ops kvm_syscore_ops = { .suspend = kvm_suspend, .resume = kvm_resume, }; +#else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ +static int hardware_enable_all(void) +{ + return 0; +} + +static void hardware_disable_all(void) +{ + +} +#endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ static void kvm_io_bus_destroy(struct kvm_io_bus *bus) { @@ -5942,6 +5954,7 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) int r; int cpu; +#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online", kvm_online_cpu, kvm_offline_cpu); if (r) @@ -5949,6 +5962,7 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) register_reboot_notifier(&kvm_reboot_notifier); register_syscore_ops(&kvm_syscore_ops); +#endif /* A kmem cache lets us meet the alignment requirements of fx_save. */ if (!vcpu_align) @@ -6016,9 +6030,11 @@ out_free_4: free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); kmem_cache_destroy(kvm_vcpu_cache); out_free_3: +#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING unregister_syscore_ops(&kvm_syscore_ops); unregister_reboot_notifier(&kvm_reboot_notifier); cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); +#endif return r; } EXPORT_SYMBOL_GPL(kvm_init); @@ -6040,9 +6056,11 @@ void kvm_exit(void) kmem_cache_destroy(kvm_vcpu_cache); kvm_vfio_ops_exit(); kvm_async_pf_deinit(); +#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING unregister_syscore_ops(&kvm_syscore_ops); unregister_reboot_notifier(&kvm_reboot_notifier); cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); +#endif kvm_irqfd_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); -- cgit v1.2.3-58-ga151 From 9f1a4c004869d3c8061f286fec4d8096dd099b84 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:09:34 +0000 Subject: KVM: Clean up error labels in kvm_init() Convert the last two "out" lables to "err" labels now that the dust has settled, i.e. now that there are no more planned changes to the order of things in kvm_init(). Use "err" instead of "out" as it's easier to describe what failed than it is to describe what needs to be unwound, e.g. if allocating a per-CPU kick mask fails, KVM needs to free any masks that were allocated, and of course needs to unwind previous operations. Reported-by: Chao Gao Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-51-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3d0678579c5e..0f17b8786792 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5976,14 +5976,14 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) NULL); if (!kvm_vcpu_cache) { r = -ENOMEM; - goto out_free_3; + goto err_vcpu_cache; } for_each_possible_cpu(cpu) { if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu), GFP_KERNEL, cpu_to_node(cpu))) { r = -ENOMEM; - goto out_free_4; + goto err_cpu_kick_mask; } } @@ -6025,11 +6025,11 @@ err_vfio: err_async_pf: kvm_irqfd_exit(); err_irqfd: -out_free_4: +err_cpu_kick_mask: for_each_possible_cpu(cpu) free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); kmem_cache_destroy(kvm_vcpu_cache); -out_free_3: +err_vcpu_cache: #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING unregister_syscore_ops(&kvm_syscore_ops); unregister_reboot_notifier(&kvm_reboot_notifier); -- cgit v1.2.3-58-ga151