diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-11-02 11:24:14 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-11-02 11:24:14 -0700 |
commit | d7e0a795bf37a13554c80cfc5ba97abedf53f391 (patch) | |
tree | 26f107fbe530b1bd0912a748b808cbe476bfbf49 /arch | |
parent | 44261f8e287d1b02a2e4bfbd7399fb8d37d1ee24 (diff) | |
parent | 52cf891d8dbd7592261fa30f373410b97f22b76c (diff) |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Paolo Bonzini:
"ARM:
- More progress on the protected VM front, now with the full fixed
feature set as well as the limitation of some hypercalls after
initialisation.
- Cleanup of the RAZ/WI sysreg handling, which was pointlessly
complicated
- Fixes for the vgic placement in the IPA space, together with a
bunch of selftests
- More memcg accounting of the memory allocated on behalf of a guest
- Timer and vgic selftests
- Workarounds for the Apple M1 broken vgic implementation
- KConfig cleanups
- New kvmarm.mode=none option, for those who really dislike us
RISC-V:
- New KVM port.
x86:
- New API to control TSC offset from userspace
- TSC scaling for nested hypervisors on SVM
- Switch masterclock protection from raw_spin_lock to seqcount
- Clean up function prototypes in the page fault code and avoid
repeated memslot lookups
- Convey the exit reason to userspace on emulation failure
- Configure time between NX page recovery iterations
- Expose Predictive Store Forwarding Disable CPUID leaf
- Allocate page tracking data structures lazily (if the i915 KVM-GT
functionality is not compiled in)
- Cleanups, fixes and optimizations for the shadow MMU code
s390:
- SIGP Fixes
- initial preparations for lazy destroy of secure VMs
- storage key improvements/fixes
- Log the guest CPNC
Starting from this release, KVM-PPC patches will come from Michael
Ellerman's PPC tree"
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (227 commits)
RISC-V: KVM: fix boolreturn.cocci warnings
RISC-V: KVM: remove unneeded semicolon
RISC-V: KVM: Fix GPA passed to __kvm_riscv_hfence_gvma_xyz() functions
RISC-V: KVM: Factor-out FP virtualization into separate sources
KVM: s390: add debug statement for diag 318 CPNC data
KVM: s390: pv: properly handle page flags for protected guests
KVM: s390: Fix handle_sske page fault handling
KVM: x86: SGX must obey the KVM_INTERNAL_ERROR_EMULATION protocol
KVM: x86: On emulation failure, convey the exit reason, etc. to userspace
KVM: x86: Get exit_reason as part of kvm_x86_ops.get_exit_info
KVM: x86: Clarify the kvm_run.emulation_failure structure layout
KVM: s390: Add a routine for setting userspace CPU state
KVM: s390: Simplify SIGP Set Arch handling
KVM: s390: pv: avoid stalls when making pages secure
KVM: s390: pv: avoid stalls for kvm_s390_pv_init_vm
KVM: s390: pv: avoid double free of sida page
KVM: s390: pv: add macros for UVC CC values
s390/mm: optimize reset_guest_reference_bit()
s390/mm: optimize set_guest_storage_key()
s390/mm: no need for pte_alloc_map_lock() if we know the pmd is present
...
Diffstat (limited to 'arch')
106 files changed, 7680 insertions, 1489 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3a00dfb0711e..de65d2fa0657 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -185,6 +185,7 @@ config ARM64 select HAVE_GCC_PLUGINS select HAVE_HW_BREAKPOINT if PERF_EVENTS select HAVE_IRQ_TIME_ACCOUNTING + select HAVE_KVM select HAVE_NMI select HAVE_PATA_PLATFORM select HAVE_PERF_EVENTS diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 327120c0089f..a39fcf318c77 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -295,6 +295,7 @@ #define MDCR_EL2_HPMFZO (UL(1) << 29) #define MDCR_EL2_MTPME (UL(1) << 28) #define MDCR_EL2_TDCC (UL(1) << 27) +#define MDCR_EL2_HLP (UL(1) << 26) #define MDCR_EL2_HCCD (UL(1) << 23) #define MDCR_EL2_TTRF (UL(1) << 19) #define MDCR_EL2_HPMD (UL(1) << 17) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 6486b1db268e..50d5e4de244c 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -44,31 +44,39 @@ #define KVM_HOST_SMCCC_FUNC(name) KVM_HOST_SMCCC_ID(__KVM_HOST_SMCCC_FUNC_##name) #define __KVM_HOST_SMCCC_FUNC___kvm_hyp_init 0 -#define __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run 1 -#define __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context 2 -#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa 3 -#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid 4 -#define __KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context 5 -#define __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff 6 -#define __KVM_HOST_SMCCC_FUNC___kvm_enable_ssbs 7 -#define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_gic_config 8 -#define __KVM_HOST_SMCCC_FUNC___vgic_v3_read_vmcr 9 -#define __KVM_HOST_SMCCC_FUNC___vgic_v3_write_vmcr 10 -#define __KVM_HOST_SMCCC_FUNC___vgic_v3_init_lrs 11 -#define __KVM_HOST_SMCCC_FUNC___kvm_get_mdcr_el2 12 -#define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs 13 -#define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs 14 -#define __KVM_HOST_SMCCC_FUNC___pkvm_init 15 -#define __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp 16 -#define __KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping 17 -#define __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector 18 -#define __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize 19 -#define __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc 20 #ifndef __ASSEMBLY__ #include <linux/mm.h> +enum __kvm_host_smccc_func { + /* Hypercalls available only prior to pKVM finalisation */ + /* __KVM_HOST_SMCCC_FUNC___kvm_hyp_init */ + __KVM_HOST_SMCCC_FUNC___kvm_get_mdcr_el2 = __KVM_HOST_SMCCC_FUNC___kvm_hyp_init + 1, + __KVM_HOST_SMCCC_FUNC___pkvm_init, + __KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping, + __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector, + __KVM_HOST_SMCCC_FUNC___kvm_enable_ssbs, + __KVM_HOST_SMCCC_FUNC___vgic_v3_init_lrs, + __KVM_HOST_SMCCC_FUNC___vgic_v3_get_gic_config, + __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize, + + /* Hypercalls available after pKVM finalisation */ + __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp, + __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc, + __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run, + __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context, + __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa, + __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid, + __KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context, + __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff, + __KVM_HOST_SMCCC_FUNC___vgic_v3_read_vmcr, + __KVM_HOST_SMCCC_FUNC___vgic_v3_write_vmcr, + __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs, + __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs, + __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_init_traps, +}; + #define DECLARE_KVM_VHE_SYM(sym) extern char sym[] #define DECLARE_KVM_NVHE_SYM(sym) extern char kvm_nvhe_sym(sym)[] diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index fd418955e31e..f4871e47b2d0 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -396,7 +396,10 @@ static inline bool kvm_vcpu_is_be(struct kvm_vcpu *vcpu) if (vcpu_mode_is_32bit(vcpu)) return !!(*vcpu_cpsr(vcpu) & PSR_AA32_E_BIT); - return !!(vcpu_read_sys_reg(vcpu, SCTLR_EL1) & (1 << 25)); + if (vcpu_mode_priv(vcpu)) + return !!(vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_EE); + else + return !!(vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_EL1_E0E); } static inline unsigned long vcpu_data_guest_to_host(struct kvm_vcpu *vcpu, diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index f8be56d5342b..4be8486042a7 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -58,6 +58,7 @@ enum kvm_mode { KVM_MODE_DEFAULT, KVM_MODE_PROTECTED, + KVM_MODE_NONE, }; enum kvm_mode kvm_get_mode(void); @@ -771,7 +772,6 @@ int kvm_set_ipa_limit(void); #define __KVM_HAVE_ARCH_VM_ALLOC struct kvm *kvm_arch_alloc_vm(void); -void kvm_arch_free_vm(struct kvm *kvm); int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type); @@ -780,6 +780,8 @@ static inline bool kvm_vm_is_protected(struct kvm *kvm) return false; } +void kvm_init_protected_traps(struct kvm_vcpu *vcpu); + int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature); bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 657d0c94cf82..5afd14ab15b9 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -115,7 +115,12 @@ int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus, void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt); #endif +extern u64 kvm_nvhe_sym(id_aa64pfr0_el1_sys_val); +extern u64 kvm_nvhe_sym(id_aa64pfr1_el1_sys_val); +extern u64 kvm_nvhe_sym(id_aa64isar0_el1_sys_val); +extern u64 kvm_nvhe_sym(id_aa64isar1_el1_sys_val); extern u64 kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val); extern u64 kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val); +extern u64 kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val); #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 027dbe004df4..16b3f1a1d468 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -1160,6 +1160,7 @@ #define ICH_HCR_TC (1 << 10) #define ICH_HCR_TALL0 (1 << 11) #define ICH_HCR_TALL1 (1 << 12) +#define ICH_HCR_TDIR (1 << 14) #define ICH_HCR_EOIcount_SHIFT 27 #define ICH_HCR_EOIcount_MASK (0x1f << ICH_HCR_EOIcount_SHIFT) @@ -1192,6 +1193,8 @@ #define ICH_VTR_SEIS_MASK (1 << ICH_VTR_SEIS_SHIFT) #define ICH_VTR_A3V_SHIFT 21 #define ICH_VTR_A3V_MASK (1 << ICH_VTR_A3V_SHIFT) +#define ICH_VTR_TDS_SHIFT 19 +#define ICH_VTR_TDS_MASK (1 << ICH_VTR_TDS_SHIFT) #define ARM64_FEATURE_FIELD_BITS 4 diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 6f6ff072acbd..44369b99a57e 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -1128,5 +1128,6 @@ bool cpus_are_stuck_in_kernel(void) { bool smp_spin_tables = (num_possible_cpus() > 1 && !have_cpu_die()); - return !!cpus_stuck_in_kernel || smp_spin_tables; + return !!cpus_stuck_in_kernel || smp_spin_tables || + is_protected_kvm_enabled(); } diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index d7eec0b43744..8ffcbe29395e 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -4,6 +4,7 @@ # source "virt/lib/Kconfig" +source "virt/kvm/Kconfig" menuconfig VIRTUALIZATION bool "Virtualization" @@ -19,7 +20,7 @@ if VIRTUALIZATION menuconfig KVM bool "Kernel-based Virtual Machine (KVM) support" - depends on OF + depends on HAVE_KVM select MMU_NOTIFIER select PREEMPT_NOTIFIERS select HAVE_KVM_CPU_RELAX_INTERCEPT @@ -43,12 +44,9 @@ menuconfig KVM If unsure, say N. -if KVM - -source "virt/kvm/Kconfig" - config NVHE_EL2_DEBUG bool "Debug mode for non-VHE EL2 object" + depends on KVM help Say Y here to enable the debug mode for the non-VHE KVM EL2 object. Failure reports will BUG() in the hypervisor. This is intended for @@ -56,6 +54,4 @@ config NVHE_EL2_DEBUG If unsure, say N. -endif # KVM - endif # VIRTUALIZATION diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index fe102cd2e518..f5490afe1ebf 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -291,18 +291,12 @@ long kvm_arch_dev_ioctl(struct file *filp, struct kvm *kvm_arch_alloc_vm(void) { - if (!has_vhe()) - return kzalloc(sizeof(struct kvm), GFP_KERNEL); - - return vzalloc(sizeof(struct kvm)); -} + size_t sz = sizeof(struct kvm); -void kvm_arch_free_vm(struct kvm *kvm) -{ if (!has_vhe()) - kfree(kvm); - else - vfree(kvm); + return kzalloc(sz, GFP_KERNEL_ACCOUNT); + + return __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM | __GFP_ZERO); } int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) @@ -620,6 +614,14 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) ret = kvm_arm_pmu_v3_enable(vcpu); + /* + * Initialize traps for protected VMs. + * NOTE: Move to run in EL2 directly, rather than via a hypercall, once + * the code is in place for first run initialization at EL2. + */ + if (kvm_vm_is_protected(kvm)) + kvm_call_hyp_nvhe(__pkvm_vcpu_init_traps, vcpu); + return ret; } @@ -1579,25 +1581,33 @@ static void cpu_set_hyp_vector(void) kvm_call_hyp_nvhe(__pkvm_cpu_set_vector, data->slot); } -static void cpu_hyp_reinit(void) +static void cpu_hyp_init_context(void) { kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt); - cpu_hyp_reset(); - - if (is_kernel_in_hyp_mode()) - kvm_timer_init_vhe(); - else + if (!is_kernel_in_hyp_mode()) cpu_init_hyp_mode(); +} +static void cpu_hyp_init_features(void) +{ cpu_set_hyp_vector(); - kvm_arm_init_debug(); + if (is_kernel_in_hyp_mode()) + kvm_timer_init_vhe(); + if (vgic_present) kvm_vgic_init_cpu_hardware(); } +static void cpu_hyp_reinit(void) +{ + cpu_hyp_reset(); + cpu_hyp_init_context(); + cpu_hyp_init_features(); +} + static void _kvm_arch_hardware_enable(void *discard) { if (!__this_cpu_read(kvm_arm_hardware_enabled)) { @@ -1788,10 +1798,17 @@ static int do_pkvm_init(u32 hyp_va_bits) int ret; preempt_disable(); - hyp_install_host_vector(); + cpu_hyp_init_context(); ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size, num_possible_cpus(), kern_hyp_va(per_cpu_base), hyp_va_bits); + cpu_hyp_init_features(); + + /* + * The stub hypercalls are now disabled, so set our local flag to + * prevent a later re-init attempt in kvm_arch_hardware_enable(). + */ + __this_cpu_write(kvm_arm_hardware_enabled, 1); preempt_enable(); return ret; @@ -1802,8 +1819,13 @@ static int kvm_hyp_init_protection(u32 hyp_va_bits) void *addr = phys_to_virt(hyp_mem_base); int ret; + kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); + kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1); + kvm_nvhe_sym(id_aa64isar1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1); kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1); ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP); if (ret) @@ -1971,9 +1993,25 @@ out_err: return err; } -static void _kvm_host_prot_finalize(void *discard) +static void _kvm_host_prot_finalize(void *arg) { - WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize)); + int *err = arg; + + if (WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize))) + WRITE_ONCE(*err, -EINVAL); +} + +static int pkvm_drop_host_privileges(void) +{ + int ret = 0; + + /* + * Flip the static key upfront as that may no longer be possible + * once the host stage 2 is installed. + */ + static_branch_enable(&kvm_protected_mode_initialized); + on_each_cpu(_kvm_host_prot_finalize, &ret, 1); + return ret; } static int finalize_hyp_mode(void) @@ -1987,15 +2025,7 @@ static int finalize_hyp_mode(void) * None of other sections should ever be introspected. */ kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start); - - /* - * Flip the static key upfront as that may no longer be possible - * once the host stage 2 is installed. - */ - static_branch_enable(&kvm_protected_mode_initialized); - on_each_cpu(_kvm_host_prot_finalize, NULL, 1); - - return 0; + return pkvm_drop_host_privileges(); } struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) @@ -2064,6 +2094,11 @@ int kvm_arch_init(void *opaque) return -ENODEV; } + if (kvm_get_mode() == KVM_MODE_NONE) { + kvm_info("KVM disabled from command line\n"); + return -ENODEV; + } + in_hyp_mode = is_kernel_in_hyp_mode(); if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) || @@ -2137,8 +2172,15 @@ static int __init early_kvm_mode_cfg(char *arg) return 0; } - if (strcmp(arg, "nvhe") == 0 && !WARN_ON(is_kernel_in_hyp_mode())) + if (strcmp(arg, "nvhe") == 0 && !WARN_ON(is_kernel_in_hyp_mode())) { + kvm_mode = KVM_MODE_DEFAULT; return 0; + } + + if (strcmp(arg, "none") == 0) { + kvm_mode = KVM_MODE_NONE; + return 0; + } return -EINVAL; } diff --git a/arch/arm64/kvm/hyp/include/hyp/fault.h b/arch/arm64/kvm/hyp/include/hyp/fault.h new file mode 100644 index 000000000000..1b8a2dcd712f --- /dev/null +++ b/arch/arm64/kvm/hyp/include/hyp/fault.h @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#ifndef __ARM64_KVM_HYP_FAULT_H__ +#define __ARM64_KVM_HYP_FAULT_H__ + +#include <asm/kvm_asm.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> + +static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) +{ + u64 par, tmp; + + /* + * Resolve the IPA the hard way using the guest VA. + * + * Stage-1 translation already validated the memory access + * rights. As such, we can use the EL1 translation regime, and + * don't have to distinguish between EL0 and EL1 access. + * + * We do need to save/restore PAR_EL1 though, as we haven't + * saved the guest context yet, and we may return early... + */ + par = read_sysreg_par(); + if (!__kvm_at("s1e1r", far)) + tmp = read_sysreg_par(); + else + tmp = SYS_PAR_EL1_F; /* back to the guest */ + write_sysreg(par, par_el1); + + if (unlikely(tmp & SYS_PAR_EL1_F)) + return false; /* Translation failed, back to guest */ + + /* Convert PAR to HPFAR format */ + *hpfar = PAR_TO_HPFAR(tmp); + return true; +} + +static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault) +{ + u64 hpfar, far; + + far = read_sysreg_el2(SYS_FAR); + + /* + * The HPFAR can be invalid if the stage 2 fault did not + * happen during a stage 1 page table walk (the ESR_EL2.S1PTW + * bit is clear) and one of the two following cases are true: + * 1. The fault was due to a permission fault + * 2. The processor carries errata 834220 + * + * Therefore, for all non S1PTW faults where we either have a + * permission fault or the errata workaround is enabled, we + * resolve the IPA using the AT instruction. + */ + if (!(esr & ESR_ELx_S1PTW) && + (cpus_have_final_cap(ARM64_WORKAROUND_834220) || + (esr & ESR_ELx_FSC_TYPE) == FSC_PERM)) { + if (!__translate_far_to_hpfar(far, &hpfar)) + return false; + } else { + hpfar = read_sysreg(hpfar_el2); + } + + fault->far_el2 = far; + fault->hpfar_el2 = hpfar; + return true; +} + +#endif diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index d5a47b93ef9b..7a0af1d39303 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -8,6 +8,7 @@ #define __ARM64_KVM_HYP_SWITCH_H__ #include <hyp/adjust_pc.h> +#include <hyp/fault.h> #include <linux/arm-smccc.h> #include <linux/kvm_host.h> @@ -137,78 +138,9 @@ static inline void ___deactivate_traps(struct kvm_vcpu *vcpu) } } -static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) -{ - u64 par, tmp; - - /* - * Resolve the IPA the hard way using the guest VA. - * - * Stage-1 translation already validated the memory access - * rights. As such, we can use the EL1 translation regime, and - * don't have to distinguish between EL0 and EL1 access. - * - * We do need to save/restore PAR_EL1 though, as we haven't - * saved the guest context yet, and we may return early... - */ - par = read_sysreg_par(); - if (!__kvm_at("s1e1r", far)) - tmp = read_sysreg_par(); - else - tmp = SYS_PAR_EL1_F; /* back to the guest */ - write_sysreg(par, par_el1); - - if (unlikely(tmp & SYS_PAR_EL1_F)) - return false; /* Translation failed, back to guest */ - - /* Convert PAR to HPFAR format */ - *hpfar = PAR_TO_HPFAR(tmp); - return true; -} - -static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault) -{ - u64 hpfar, far; - - far = read_sysreg_el2(SYS_FAR); - - /* - * The HPFAR can be invalid if the stage 2 fault did not - * happen during a stage 1 page table walk (the ESR_EL2.S1PTW - * bit is clear) and one of the two following cases are true: - * 1. The fault was due to a permission fault - * 2. The processor carries errata 834220 - * - * Therefore, for all non S1PTW faults where we either have a - * permission fault or the errata workaround is enabled, we - * resolve the IPA using the AT instruction. - */ - if (!(esr & ESR_ELx_S1PTW) && - (cpus_have_final_cap(ARM64_WORKAROUND_834220) || - (esr & ESR_ELx_FSC_TYPE) == FSC_PERM)) { - if (!__translate_far_to_hpfar(far, &hpfar)) - return false; - } else { - hpfar = read_sysreg(hpfar_el2); - } - - fault->far_el2 = far; - fault->hpfar_el2 = hpfar; - return true; -} - static inline bool __populate_fault_info(struct kvm_vcpu *vcpu) { - u8 ec; - u64 esr; - - esr = vcpu->arch.fault.esr_el2; - ec = ESR_ELx_EC(esr); - - if (ec != ESR_ELx_EC_DABT_LOW && ec != ESR_ELx_EC_IABT_LOW) - return true; - - return __get_fault_info(esr, &vcpu->arch.fault); + return __get_fault_info(vcpu->arch.fault.esr_el2, &vcpu->arch.fault); } static inline void __hyp_sve_save_host(struct kvm_vcpu *vcpu) @@ -229,8 +161,13 @@ static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu) write_sysreg_el1(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR); } -/* Check for an FPSIMD/SVE trap and handle as appropriate */ -static inline bool __hyp_handle_fpsimd(struct kvm_vcpu *vcpu) +/* + * We trap the first access to the FP/SIMD to save the host context and + * restore the guest context lazily. + * If FP/SIMD is not implemented, handle the trap and inject an undefined + * instruction exception to the guest. Similarly for trapped SVE accesses. + */ +static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) { bool sve_guest, sve_host; u8 esr_ec; @@ -248,9 +185,6 @@ static inline bool __hyp_handle_fpsimd(struct kvm_vcpu *vcpu) } esr_ec = kvm_vcpu_trap_get_class(vcpu); - if (esr_ec != ESR_ELx_EC_FP_ASIMD && - esr_ec != ESR_ELx_EC_SVE) - return false; /* Don't handle SVE traps for non-SVE vcpus here: */ if (!sve_guest && esr_ec != ESR_ELx_EC_FP_ASIMD) @@ -352,14 +286,6 @@ static inline bool handle_tx2_tvm(struct kvm_vcpu *vcpu) static inline bool esr_is_ptrauth_trap(u32 esr) { - u32 ec = ESR_ELx_EC(esr); - - if (ec == ESR_ELx_EC_PAC) - return true; - - if (ec != ESR_ELx_EC_SYS64) - return false; - switch (esr_sys64_to_sysreg(esr)) { case SYS_APIAKEYLO_EL1: case SYS_APIAKEYHI_EL1: @@ -388,13 +314,12 @@ static inline bool esr_is_ptrauth_trap(u32 esr) DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); -static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu) +static bool kvm_hyp_handle_ptrauth(struct kvm_vcpu *vcpu, u64 *exit_code) { struct kvm_cpu_context *ctxt; u64 val; - if (!vcpu_has_ptrauth(vcpu) || - !esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu))) + if (!vcpu_has_ptrauth(vcpu)) return false; ctxt = this_cpu_ptr(&kvm_hyp_ctxt); @@ -413,6 +338,90 @@ static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu) return true; } +static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) && + handle_tx2_tvm(vcpu)) + return true; + + if (static_branch_unlikely(&vgic_v3_cpuif_trap) && + __vgic_v3_perform_cpuif_access(vcpu) == 1) + return true; + + if (esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu))) + return kvm_hyp_handle_ptrauth(vcpu, exit_code); + + return false; +} + +static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + if (static_branch_unlikely(&vgic_v3_cpuif_trap) && + __vgic_v3_perform_cpuif_access(vcpu) == 1) + return true; + + return false; +} + +static bool kvm_hyp_handle_iabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + if (!__populate_fault_info(vcpu)) + return true; + + return false; +} + +static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + if (!__populate_fault_info(vcpu)) + return true; + + if (static_branch_unlikely(&vgic_v2_cpuif_trap)) { + bool valid; + + valid = kvm_vcpu_trap_get_fault_type(vcpu) == FSC_FAULT && + kvm_vcpu_dabt_isvalid(vcpu) && + !kvm_vcpu_abt_issea(vcpu) && + !kvm_vcpu_abt_iss1tw(vcpu); + + if (valid) { + int ret = __vgic_v2_perform_cpuif_access(vcpu); + + if (ret == 1) + return true; + + /* Promote an illegal access to an SError.*/ + if (ret == -1) + *exit_code = ARM_EXCEPTION_EL1_SERROR; + } + } + + return false; +} + +typedef bool (*exit_handler_fn)(struct kvm_vcpu *, u64 *); + +static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu); + +/* + * Allow the hypervisor to handle the exit with an exit handler if it has one. + * + * Returns true if the hypervisor handled the exit, and control should go back + * to the guest, or false if it hasn't. + */ +static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu); + exit_handler_fn fn; + + fn = handlers[kvm_vcpu_trap_get_class(vcpu)]; + + if (fn) + return fn(vcpu, exit_code); + + return false; +} + /* * Return true when we were able to fixup the guest exit and should return to * the guest, false when we should restore the host state and return to the @@ -447,59 +456,9 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) if (*exit_code != ARM_EXCEPTION_TRAP) goto exit; - if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) && - kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_SYS64 && - handle_tx2_tvm(vcpu)) + /* Check if there's an exit handler and allow it to handle the exit. */ + if (kvm_hyp_handle_exit(vcpu, exit_code)) goto guest; - - /* - * We trap the first access to the FP/SIMD to save the host context - * and restore the guest context lazily. - * If FP/SIMD is not implemented, handle the trap and inject an - * undefined instruction exception to the guest. - * Similarly for trapped SVE accesses. - */ - if (__hyp_handle_fpsimd(vcpu)) - goto guest; - - if (__hyp_handle_ptrauth(vcpu)) - goto guest; - - if (!__populate_fault_info(vcpu)) - goto guest; - - if (static_branch_unlikely(&vgic_v2_cpuif_trap)) { - bool valid; - - valid = kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_DABT_LOW && - kvm_vcpu_trap_get_fault_type(vcpu) == FSC_FAULT && - kvm_vcpu_dabt_isvalid(vcpu) && - !kvm_vcpu_abt_issea(vcpu) && - !kvm_vcpu_abt_iss1tw(vcpu); - - if (valid) { - int ret = __vgic_v2_perform_cpuif_access(vcpu); - - if (ret == 1) - goto guest; - - /* Promote an illegal access to an SError.*/ - if (ret == -1) - *exit_code = ARM_EXCEPTION_EL1_SERROR; - - goto exit; - } - } - - if (static_branch_unlikely(&vgic_v3_cpuif_trap) && - (kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_SYS64 || - kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_CP15_32)) { - int ret = __vgic_v3_perform_cpuif_access(vcpu); - - if (ret == 1) - goto guest; - } - exit: /* Return to the host kernel and handle the exit */ return false; diff --git a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h new file mode 100644 index 000000000000..eea1f6a53723 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h @@ -0,0 +1,200 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 Google LLC + * Author: Fuad Tabba <tabba@google.com> + */ + +#ifndef __ARM64_KVM_FIXED_CONFIG_H__ +#define __ARM64_KVM_FIXED_CONFIG_H__ + +#include <asm/sysreg.h> + +/* + * This file contains definitions for features to be allowed or restricted for + * guest virtual machines, depending on the mode KVM is running in and on the + * type of guest that is running. + * + * The ALLOW masks represent a bitmask of feature fields that are allowed + * without any restrictions as long as they are supported by the system. + * + * The RESTRICT_UNSIGNED masks, if present, represent unsigned fields for + * features that are restricted to support at most the specified feature. + * + * If a feature field is not present in either, than it is not supported. + * + * The approach taken for protected VMs is to allow features that are: + * - Needed by common Linux distributions (e.g., floating point) + * - Trivial to support, e.g., supporting the feature does not introduce or + * require tracking of additional state in KVM + * - Cannot be trapped or prevent the guest from using anyway + */ + +/* + * Allow for protected VMs: + * - Floating-point and Advanced SIMD + * - Data Independent Timing + */ +#define PVM_ID_AA64PFR0_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64PFR0_FP) | \ + ARM64_FEATURE_MASK(ID_AA64PFR0_ASIMD) | \ + ARM64_FEATURE_MASK(ID_AA64PFR0_DIT) \ + ) + +/* + * Restrict to the following *unsigned* features for protected VMs: + * - AArch64 guests only (no support for AArch32 guests): + * AArch32 adds complexity in trap handling, emulation, condition codes, + * etc... + * - RAS (v1) + * Supported by KVM + */ +#define PVM_ID_AA64PFR0_RESTRICT_UNSIGNED (\ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL0), ID_AA64PFR0_ELx_64BIT_ONLY) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1), ID_AA64PFR0_ELx_64BIT_ONLY) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL2), ID_AA64PFR0_ELx_64BIT_ONLY) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL3), ID_AA64PFR0_ELx_64BIT_ONLY) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_RAS), ID_AA64PFR0_RAS_V1) \ + ) + +/* + * Allow for protected VMs: + * - Branch Target Identification + * - Speculative Store Bypassing + */ +#define PVM_ID_AA64PFR1_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64PFR1_BT) | \ + ARM64_FEATURE_MASK(ID_AA64PFR1_SSBS) \ + ) + +/* + * Allow for protected VMs: + * - Mixed-endian + * - Distinction between Secure and Non-secure Memory + * - Mixed-endian at EL0 only + * - Non-context synchronizing exception entry and exit + */ +#define PVM_ID_AA64MMFR0_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64MMFR0_BIGENDEL) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR0_SNSMEM) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR0_BIGENDEL0) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR0_EXS) \ + ) + +/* + * Restrict to the following *unsigned* features for protected VMs: + * - 40-bit IPA + * - 16-bit ASID + */ +#define PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED (\ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_PARANGE), ID_AA64MMFR0_PARANGE_40) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_ASID), ID_AA64MMFR0_ASID_16) \ + ) + +/* + * Allow for protected VMs: + * - Hardware translation table updates to Access flag and Dirty state + * - Number of VMID bits from CPU + * - Hierarchical Permission Disables + * - Privileged Access Never + * - SError interrupt exceptions from speculative reads + * - Enhanced Translation Synchronization + */ +#define PVM_ID_AA64MMFR1_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64MMFR1_HADBS) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR1_VMIDBITS) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR1_HPD) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR1_PAN) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR1_SPECSEI) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR1_ETS) \ + ) + +/* + * Allow for protected VMs: + * - Common not Private translations + * - User Access Override + * - IESB bit in the SCTLR_ELx registers + * - Unaligned single-copy atomicity and atomic functions + * - ESR_ELx.EC value on an exception by read access to feature ID space + * - TTL field in address operations. + * - Break-before-make sequences when changing translation block size + * - E0PDx mechanism + */ +#define PVM_ID_AA64MMFR2_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64MMFR2_CNP) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_UAO) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_IESB) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_AT) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_IDS) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_TTL) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_BBM) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_E0PD) \ + ) + +/* + * No support for Scalable Vectors for protected VMs: + * Requires additional support from KVM, e.g., context-switching and + * trapping at EL2 + */ +#define PVM_ID_AA64ZFR0_ALLOW (0ULL) + +/* + * No support for debug, including breakpoints, and watchpoints for protected + * VMs: + * The Arm architecture mandates support for at least the Armv8 debug + * architecture, which would include at least 2 hardware breakpoints and + * watchpoints. Providing that support to protected guests adds + * considerable state and complexity. Therefore, the reserved value of 0 is + * used for debug-related fields. + */ +#define PVM_ID_AA64DFR0_ALLOW (0ULL) +#define PVM_ID_AA64DFR1_ALLOW (0ULL) + +/* + * No support for implementation defined features. + */ +#define PVM_ID_AA64AFR0_ALLOW (0ULL) +#define PVM_ID_AA64AFR1_ALLOW (0ULL) + +/* + * No restrictions on instructions implemented in AArch64. + */ +#define PVM_ID_AA64ISAR0_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64ISAR0_AES) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_SHA1) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_SHA2) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_CRC32) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_ATOMICS) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_RDM) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_SHA3) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_SM3) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_SM4) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_DP) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_FHM) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_TS) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_TLB) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_RNDR) \ + ) + +#define PVM_ID_AA64ISAR1_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64ISAR1_DPB) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_APA) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_API) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_JSCVT) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_FCMA) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_LRCPC) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_GPA) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_GPI) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_FRINTTS) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_SB) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_SPECRES) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_BF16) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_DGH) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_I8MM) \ + ) + +u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id); +bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code); +bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code); +int kvm_check_pvm_sysreg_table(void); + +#endif /* __ARM64_KVM_FIXED_CONFIG_H__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h index 1e6d995968a1..45a84f0ade04 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h +++ b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h @@ -15,4 +15,6 @@ #define DECLARE_REG(type, name, ctxt, reg) \ type name = (type)cpu_reg(ctxt, (reg)) +void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu); + #endif /* __ARM64_KVM_NVHE_TRAP_HANDLER_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index 8d741f71377f..c3c11974fa3b 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -14,7 +14,7 @@ lib-objs := $(addprefix ../../../lib/, $(lib-objs)) obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ hyp-main.o hyp-smp.o psci-relay.o early_alloc.o stub.o page_alloc.o \ - cache.o setup.o mm.o mem_protect.o + cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o obj-y += $(lib-objs) diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index 4b652ffb591d..0c6116d34e18 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -110,17 +110,14 @@ SYM_FUNC_START(__hyp_do_panic) b __host_enter_for_panic SYM_FUNC_END(__hyp_do_panic) -.macro host_el1_sync_vect - .align 7 -.L__vect_start\@: - stp x0, x1, [sp, #-16]! - mrs x0, esr_el2 - lsr x0, x0, #ESR_ELx_EC_SHIFT - cmp x0, #ESR_ELx_EC_HVC64 - b.ne __host_exit - +SYM_FUNC_START(__host_hvc) ldp x0, x1, [sp] // Don't fixup the stack yet + /* No stub for you, sonny Jim */ +alternative_if ARM64_KVM_PROTECTED_MODE + b __host_exit +alternative_else_nop_endif + /* Check for a stub HVC call */ cmp x0, #HVC_STUB_HCALL_NR b.hs __host_exit @@ -137,6 +134,17 @@ SYM_FUNC_END(__hyp_do_panic) ldr x5, =__kvm_handle_stub_hvc hyp_pa x5, x6 br x5 +SYM_FUNC_END(__host_hvc) + +.macro host_el1_sync_vect + .align 7 +.L__vect_start\@: + stp x0, x1, [sp, #-16]! + mrs x0, esr_el2 + lsr x0, x0, #ESR_ELx_EC_SHIFT + cmp x0, #ESR_ELx_EC_HVC64 + b.eq __host_hvc + b __host_exit .L__vect_end\@: .if ((.L__vect_end\@ - .L__vect_start\@) > 0x80) .error "host_el1_sync_vect larger than vector entry" diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 2da6aa8da868..b096bf009144 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -4,7 +4,7 @@ * Author: Andrew Scull <ascull@google.com> */ -#include <hyp/switch.h> +#include <hyp/adjust_pc.h> #include <asm/pgtable-types.h> #include <asm/kvm_asm.h> @@ -160,41 +160,65 @@ static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt) { cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize(); } + +static void handle___pkvm_vcpu_init_traps(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); + + __pkvm_vcpu_init_traps(kern_hyp_va(vcpu)); +} + typedef void (*hcall_t)(struct kvm_cpu_context *); #define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x static const hcall_t host_hcall[] = { - HANDLE_FUNC(__kvm_vcpu_run), + /* ___kvm_hyp_init */ + HANDLE_FUNC(__kvm_get_mdcr_el2), + HANDLE_FUNC(__pkvm_init), + HANDLE_FUNC(__pkvm_create_private_mapping), + HANDLE_FUNC(__pkvm_cpu_set_vector), + HANDLE_FUNC(__kvm_enable_ssbs), + HANDLE_FUNC(__vgic_v3_init_lrs), + HANDLE_FUNC(__vgic_v3_get_gic_config), + HANDLE_FUNC(__pkvm_prot_finalize), + + HANDLE_FUNC(__pkvm_host_share_hyp), HANDLE_FUNC(__kvm_adjust_pc), + HANDLE_FUNC(__kvm_vcpu_run), HANDLE_FUNC(__kvm_flush_vm_context), HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa), HANDLE_FUNC(__kvm_tlb_flush_vmid), HANDLE_FUNC(__kvm_flush_cpu_context), HANDLE_FUNC(__kvm_timer_set_cntvoff), - HANDLE_FUNC(__kvm_enable_ssbs), - HANDLE_FUNC(__vgic_v3_get_gic_config), HANDLE_FUNC(__vgic_v3_read_vmcr), HANDLE_FUNC(__vgic_v3_write_vmcr), - HANDLE_FUNC(__vgic_v3_init_lrs), - HANDLE_FUNC(__kvm_get_mdcr_el2), HANDLE_FUNC(__vgic_v3_save_aprs), HANDLE_FUNC(__vgic_v3_restore_aprs), - HANDLE_FUNC(__pkvm_init), - HANDLE_FUNC(__pkvm_cpu_set_vector), - HANDLE_FUNC(__pkvm_host_share_hyp), - HANDLE_FUNC(__pkvm_create_private_mapping), - HANDLE_FUNC(__pkvm_prot_finalize), + HANDLE_FUNC(__pkvm_vcpu_init_traps), }; static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(unsigned long, id, host_ctxt, 0); + unsigned long hcall_min = 0; hcall_t hfn; + /* + * If pKVM has been initialised then reject any calls to the + * early "privileged" hypercalls. Note that we cannot reject + * calls to __pkvm_prot_finalize for two reasons: (1) The static + * key used to determine initialisation must be toggled prior to + * finalisation and (2) finalisation is performed on a per-CPU + * basis. This is all fine, however, since __pkvm_prot_finalize + * returns -EPERM after the first call for a given CPU. + */ + if (static_branch_unlikely(&kvm_protected_mode_initialized)) + hcall_min = __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize; + id -= KVM_HOST_SMCCC_ID(0); - if (unlikely(id >= ARRAY_SIZE(host_hcall))) + if (unlikely(id < hcall_min || id >= ARRAY_SIZE(host_hcall))) goto inval; hfn = host_hcall[id]; diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 34eeb524b686..c1a90dd022b8 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -11,7 +11,7 @@ #include <asm/kvm_pgtable.h> #include <asm/stage2_pgtable.h> -#include <hyp/switch.h> +#include <hyp/fault.h> #include <nvhe/gfp.h> #include <nvhe/memory.h> @@ -25,12 +25,6 @@ struct host_kvm host_kvm; static struct hyp_pool host_s2_pool; -/* - * Copies of the host's CPU features registers holding sanitized values. - */ -u64 id_aa64mmfr0_el1_sys_val; -u64 id_aa64mmfr1_el1_sys_val; - const u8 pkvm_hyp_id = 1; static void *host_s2_zalloc_pages_exact(size_t size) @@ -134,6 +128,9 @@ int __pkvm_prot_finalize(void) struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu; struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); + if (params->hcr_el2 & HCR_VM) + return -EPERM; + params->vttbr = kvm_get_vttbr(mmu); params->vtcr = host_kvm.arch.vtcr; params->hcr_el2 |= HCR_VM; diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c new file mode 100644 index 000000000000..99c8d8b73e70 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021 Google LLC + * Author: Fuad Tabba <tabba@google.com> + */ + +#include <linux/kvm_host.h> +#include <linux/mm.h> +#include <nvhe/fixed_config.h> +#include <nvhe/trap_handler.h> + +/* + * Set trap register values based on features in ID_AA64PFR0. + */ +static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu) +{ + const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR0_EL1); + u64 hcr_set = HCR_RW; + u64 hcr_clear = 0; + u64 cptr_set = 0; + + /* Protected KVM does not support AArch32 guests. */ + BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL0), + PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_ELx_64BIT_ONLY); + BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1), + PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_ELx_64BIT_ONLY); + + /* + * Linux guests assume support for floating-point and Advanced SIMD. Do + * not change the trapping behavior for these from the KVM default. + */ + BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_FP), + PVM_ID_AA64PFR0_ALLOW)); + BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_ASIMD), + PVM_ID_AA64PFR0_ALLOW)); + + /* Trap RAS unless all current versions are supported */ + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_RAS), feature_ids) < + ID_AA64PFR0_RAS_V1P1) { + hcr_set |= HCR_TERR | HCR_TEA; + hcr_clear |= HCR_FIEN; + } + + /* Trap AMU */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_AMU), feature_ids)) { + hcr_clear |= HCR_AMVOFFEN; + cptr_set |= CPTR_EL2_TAM; + } + + /* Trap SVE */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_SVE), feature_ids)) + cptr_set |= CPTR_EL2_TZ; + + vcpu->arch.hcr_el2 |= hcr_set; + vcpu->arch.hcr_el2 &= ~hcr_clear; + vcpu->arch.cptr_el2 |= cptr_set; +} + +/* + * Set trap register values based on features in ID_AA64PFR1. + */ +static void pvm_init_traps_aa64pfr1(struct kvm_vcpu *vcpu) +{ + const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR1_EL1); + u64 hcr_set = 0; + u64 hcr_clear = 0; + + /* Memory Tagging: Trap and Treat as Untagged if not supported. */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_MTE), feature_ids)) { + hcr_set |= HCR_TID5; + hcr_clear |= HCR_DCT | HCR_ATA; + } + + vcpu->arch.hcr_el2 |= hcr_set; + vcpu->arch.hcr_el2 &= ~hcr_clear; +} + +/* + * Set trap register values based on features in ID_AA64DFR0. + */ +static void pvm_init_traps_aa64dfr0(struct kvm_vcpu *vcpu) +{ + const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64DFR0_EL1); + u64 mdcr_set = 0; + u64 mdcr_clear = 0; + u64 cptr_set = 0; + + /* Trap/constrain PMU */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_PMUVER), feature_ids)) { + mdcr_set |= MDCR_EL2_TPM | MDCR_EL2_TPMCR; + mdcr_clear |= MDCR_EL2_HPME | MDCR_EL2_MTPME | + MDCR_EL2_HPMN_MASK; + } + + /* Trap Debug */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_DEBUGVER), feature_ids)) + mdcr_set |= MDCR_EL2_TDRA | MDCR_EL2_TDA | MDCR_EL2_TDE; + + /* Trap OS Double Lock */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_DOUBLELOCK), feature_ids)) + mdcr_set |= MDCR_EL2_TDOSA; + + /* Trap SPE */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_PMSVER), feature_ids)) { + mdcr_set |= MDCR_EL2_TPMS; + mdcr_clear |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT; + } + + /* Trap Trace Filter */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_TRACE_FILT), feature_ids)) + mdcr_set |= MDCR_EL2_TTRF; + + /* Trap Trace */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_TRACEVER), feature_ids)) + cptr_set |= CPTR_EL2_TTA; + + vcpu->arch.mdcr_el2 |= mdcr_set; + vcpu->arch.mdcr_el2 &= ~mdcr_clear; + vcpu->arch.cptr_el2 |= cptr_set; +} + +/* + * Set trap register values based on features in ID_AA64MMFR0. + */ +static void pvm_init_traps_aa64mmfr0(struct kvm_vcpu *vcpu) +{ + const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR0_EL1); + u64 mdcr_set = 0; + + /* Trap Debug Communications Channel registers */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_FGT), feature_ids)) + mdcr_set |= MDCR_EL2_TDCC; + + vcpu->arch.mdcr_el2 |= mdcr_set; +} + +/* + * Set trap register values based on features in ID_AA64MMFR1. + */ +static void pvm_init_traps_aa64mmfr1(struct kvm_vcpu *vcpu) +{ + const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR1_EL1); + u64 hcr_set = 0; + + /* Trap LOR */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_LOR), feature_ids)) + hcr_set |= HCR_TLOR; + + vcpu->arch.hcr_el2 |= hcr_set; +} + +/* + * Set baseline trap register values. + */ +static void pvm_init_trap_regs(struct kvm_vcpu *vcpu) +{ + const u64 hcr_trap_feat_regs = HCR_TID3; + const u64 hcr_trap_impdef = HCR_TACR | HCR_TIDCP | HCR_TID1; + + /* + * Always trap: + * - Feature id registers: to control features exposed to guests + * - Implementation-defined features + */ + vcpu->arch.hcr_el2 |= hcr_trap_feat_regs | hcr_trap_impdef; + + /* Clear res0 and set res1 bits to trap potential new features. */ + vcpu->arch.hcr_el2 &= ~(HCR_RES0); + vcpu->arch.mdcr_el2 &= ~(MDCR_EL2_RES0); + vcpu->arch.cptr_el2 |= CPTR_NVHE_EL2_RES1; + vcpu->arch.cptr_el2 &= ~(CPTR_NVHE_EL2_RES0); +} + +/* + * Initialize trap register values for protected VMs. + */ +void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) +{ + pvm_init_trap_regs(vcpu); + pvm_init_traps_aa64pfr0(vcpu); + pvm_init_traps_aa64pfr1(vcpu); + pvm_init_traps_aa64dfr0(vcpu); + pvm_init_traps_aa64mmfr0(vcpu); + pvm_init_traps_aa64mmfr1(vcpu); +} diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 57c27846320f..862c7b514e20 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -10,6 +10,7 @@ #include <asm/kvm_pgtable.h> #include <nvhe/early_alloc.h> +#include <nvhe/fixed_config.h> #include <nvhe/gfp.h> #include <nvhe/memory.h> #include <nvhe/mem_protect.h> @@ -260,6 +261,8 @@ int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus, void (*fn)(phys_addr_t params_pa, void *finalize_fn_va); int ret; + BUG_ON(kvm_check_pvm_sysreg_table()); + if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size)) return -EINVAL; diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index a34b01cc8ab9..c0e3fed26d93 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -27,6 +27,7 @@ #include <asm/processor.h> #include <asm/thread_info.h> +#include <nvhe/fixed_config.h> #include <nvhe/mem_protect.h> /* Non-VHE specific context */ @@ -158,6 +159,101 @@ static void __pmu_switch_to_host(struct kvm_cpu_context *host_ctxt) write_sysreg(pmu->events_host, pmcntenset_el0); } +/** + * Handler for protected VM MSR, MRS or System instruction execution in AArch64. + * + * Returns true if the hypervisor has handled the exit, and control should go + * back to the guest, or false if it hasn't. + */ +static bool kvm_handle_pvm_sys64(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + /* + * Make sure we handle the exit for workarounds and ptrauth + * before the pKVM handling, as the latter could decide to + * UNDEF. + */ + return (kvm_hyp_handle_sysreg(vcpu, exit_code) || + kvm_handle_pvm_sysreg(vcpu, exit_code)); +} + +/** + * Handler for protected floating-point and Advanced SIMD accesses. + * + * Returns true if the hypervisor has handled the exit, and control should go + * back to the guest, or false if it hasn't. + */ +static bool kvm_handle_pvm_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + /* Linux guests assume support for floating-point and Advanced SIMD. */ + BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_FP), + PVM_ID_AA64PFR0_ALLOW)); + BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_ASIMD), + PVM_ID_AA64PFR0_ALLOW)); + + return kvm_hyp_handle_fpsimd(vcpu, exit_code); +} + +static const exit_handler_fn hyp_exit_handlers[] = { + [0 ... ESR_ELx_EC_MAX] = NULL, + [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, + [ESR_ELx_EC_SYS64] = kvm_hyp_handle_sysreg, + [ESR_ELx_EC_SVE] = kvm_hyp_handle_fpsimd, + [ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd, + [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low, + [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low, + [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth, +}; + +static const exit_handler_fn pvm_exit_handlers[] = { + [0 ... ESR_ELx_EC_MAX] = NULL, + [ESR_ELx_EC_SYS64] = kvm_handle_pvm_sys64, + [ESR_ELx_EC_SVE] = kvm_handle_pvm_restricted, + [ESR_ELx_EC_FP_ASIMD] = kvm_handle_pvm_fpsimd, + [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low, + [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low, + [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth, +}; + +static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) +{ + if (unlikely(kvm_vm_is_protected(kern_hyp_va(vcpu->kvm)))) + return pvm_exit_handlers; + + return hyp_exit_handlers; +} + +/* + * Some guests (e.g., protected VMs) are not be allowed to run in AArch32. + * The ARMv8 architecture does not give the hypervisor a mechanism to prevent a + * guest from dropping to AArch32 EL0 if implemented by the CPU. If the + * hypervisor spots a guest in such a state ensure it is handled, and don't + * trust the host to spot or fix it. The check below is based on the one in + * kvm_arch_vcpu_ioctl_run(). + * + * Returns false if the guest ran in AArch32 when it shouldn't have, and + * thus should exit to the host, or true if a the guest run loop can continue. + */ +static bool handle_aarch32_guest(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + struct kvm *kvm = kern_hyp_va(vcpu->kvm); + + if (kvm_vm_is_protected(kvm) && vcpu_mode_is_32bit(vcpu)) { + /* + * As we have caught the guest red-handed, decide that it isn't + * fit for purpose anymore by making the vcpu invalid. The VMM + * can try and fix it by re-initializing the vcpu with + * KVM_ARM_VCPU_INIT, however, this is likely not possible for + * protected VMs. + */ + vcpu->arch.target = -1; + *exit_code &= BIT(ARM_EXIT_WITH_SERROR_BIT); + *exit_code |= ARM_EXCEPTION_IL; + return false; + } + + return true; +} + /* Switch to the guest for legacy non-VHE systems */ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) { @@ -220,6 +316,9 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) /* Jump in the fire! */ exit_code = __guest_enter(vcpu); + if (unlikely(!handle_aarch32_guest(vcpu, &exit_code))) + break; + /* And we're baaack! */ } while (fixup_guest_exit(vcpu, &exit_code)); diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c new file mode 100644 index 000000000000..3787ee6fb1a2 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -0,0 +1,487 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021 Google LLC + * Author: Fuad Tabba <tabba@google.com> + */ + +#include <linux/irqchip/arm-gic-v3.h> + +#include <asm/kvm_asm.h> +#include <asm/kvm_mmu.h> + +#include <hyp/adjust_pc.h> + +#include <nvhe/fixed_config.h> + +#include "../../sys_regs.h" + +/* + * Copies of the host's CPU features registers holding sanitized values at hyp. + */ +u64 id_aa64pfr0_el1_sys_val; +u64 id_aa64pfr1_el1_sys_val; +u64 id_aa64isar0_el1_sys_val; +u64 id_aa64isar1_el1_sys_val; +u64 id_aa64mmfr0_el1_sys_val; +u64 id_aa64mmfr1_el1_sys_val; +u64 id_aa64mmfr2_el1_sys_val; + +/* + * Inject an unknown/undefined exception to an AArch64 guest while most of its + * sysregs are live. + */ +static void inject_undef64(struct kvm_vcpu *vcpu) +{ + u32 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); + + *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); + *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR); + + vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA64_EL1 | + KVM_ARM64_EXCEPT_AA64_ELx_SYNC | + KVM_ARM64_PENDING_EXCEPTION); + + __kvm_adjust_pc(vcpu); + + write_sysreg_el1(esr, SYS_ESR); + write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR); + write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR); + write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR); +} + +/* + * Returns the restricted features values of the feature register based on the + * limitations in restrict_fields. + * A feature id field value of 0b0000 does not impose any restrictions. + * Note: Use only for unsigned feature field values. + */ +static u64 get_restricted_features_unsigned(u64 sys_reg_val, + u64 restrict_fields) +{ + u64 value = 0UL; + u64 mask = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); + + /* + * According to the Arm Architecture Reference Manual, feature fields + * use increasing values to indicate increases in functionality. + * Iterate over the restricted feature fields and calculate the minimum + * unsigned value between the one supported by the system, and what the + * value is being restricted to. + */ + while (sys_reg_val && restrict_fields) { + value |= min(sys_reg_val & mask, restrict_fields & mask); + sys_reg_val &= ~mask; + restrict_fields &= ~mask; + mask <<= ARM64_FEATURE_FIELD_BITS; + } + + return value; +} + +/* + * Functions that return the value of feature id registers for protected VMs + * based on allowed features, system features, and KVM support. + */ + +static u64 get_pvm_id_aa64pfr0(const struct kvm_vcpu *vcpu) +{ + const struct kvm *kvm = (const struct kvm *)kern_hyp_va(vcpu->kvm); + u64 set_mask = 0; + u64 allow_mask = PVM_ID_AA64PFR0_ALLOW; + + if (!vcpu_has_sve(vcpu)) + allow_mask &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_SVE); + + set_mask |= get_restricted_features_unsigned(id_aa64pfr0_el1_sys_val, + PVM_ID_AA64PFR0_RESTRICT_UNSIGNED); + + /* Spectre and Meltdown mitigation in KVM */ + set_mask |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV2), + (u64)kvm->arch.pfr0_csv2); + set_mask |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV3), + (u64)kvm->arch.pfr0_csv3); + + return (id_aa64pfr0_el1_sys_val & allow_mask) | set_mask; +} + +static u64 get_pvm_id_aa64pfr1(const struct kvm_vcpu *vcpu) +{ + const struct kvm *kvm = (const struct kvm *)kern_hyp_va(vcpu->kvm); + u64 allow_mask = PVM_ID_AA64PFR1_ALLOW; + + if (!kvm_has_mte(kvm)) + allow_mask &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_MTE); + + return id_aa64pfr1_el1_sys_val & allow_mask; +} + +static u64 get_pvm_id_aa64zfr0(const struct kvm_vcpu *vcpu) +{ + /* + * No support for Scalable Vectors, therefore, hyp has no sanitized + * copy of the feature id register. + */ + BUILD_BUG_ON(PVM_ID_AA64ZFR0_ALLOW != 0ULL); + return 0; +} + +static u64 get_pvm_id_aa64dfr0(const struct kvm_vcpu *vcpu) +{ + /* + * No support for debug, including breakpoints, and watchpoints, + * therefore, pKVM has no sanitized copy of the feature id register. + */ + BUILD_BUG_ON(PVM_ID_AA64DFR0_ALLOW != 0ULL); + return 0; +} + +static u64 get_pvm_id_aa64dfr1(const struct kvm_vcpu *vcpu) +{ + /* + * No support for debug, therefore, hyp has no sanitized copy of the + * feature id register. + */ + BUILD_BUG_ON(PVM_ID_AA64DFR1_ALLOW != 0ULL); + return 0; +} + +static u64 get_pvm_id_aa64afr0(const struct kvm_vcpu *vcpu) +{ + /* + * No support for implementation defined features, therefore, hyp has no + * sanitized copy of the feature id register. + */ + BUILD_BUG_ON(PVM_ID_AA64AFR0_ALLOW != 0ULL); + return 0; +} + +static u64 get_pvm_id_aa64afr1(const struct kvm_vcpu *vcpu) +{ + /* + * No support for implementation defined features, therefore, hyp has no + * sanitized copy of the feature id register. + */ + BUILD_BUG_ON(PVM_ID_AA64AFR1_ALLOW != 0ULL); + return 0; +} + +static u64 get_pvm_id_aa64isar0(const struct kvm_vcpu *vcpu) +{ + return id_aa64isar0_el1_sys_val & PVM_ID_AA64ISAR0_ALLOW; +} + +static u64 get_pvm_id_aa64isar1(const struct kvm_vcpu *vcpu) +{ + u64 allow_mask = PVM_ID_AA64ISAR1_ALLOW; + + if (!vcpu_has_ptrauth(vcpu)) + allow_mask &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_APA) | + ARM64_FEATURE_MASK(ID_AA64ISAR1_API) | + ARM64_FEATURE_MASK(ID_AA64ISAR1_GPA) | + ARM64_FEATURE_MASK(ID_AA64ISAR1_GPI)); + + return id_aa64isar1_el1_sys_val & allow_mask; +} + +static u64 get_pvm_id_aa64mmfr0(const struct kvm_vcpu *vcpu) +{ + u64 set_mask; + + set_mask = get_restricted_features_unsigned(id_aa64mmfr0_el1_sys_val, + PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED); + + return (id_aa64mmfr0_el1_sys_val & PVM_ID_AA64MMFR0_ALLOW) | set_mask; +} + +static u64 get_pvm_id_aa64mmfr1(const struct kvm_vcpu *vcpu) +{ + return id_aa64mmfr1_el1_sys_val & PVM_ID_AA64MMFR1_ALLOW; +} + +static u64 get_pvm_id_aa64mmfr2(const struct kvm_vcpu *vcpu) +{ + return id_aa64mmfr2_el1_sys_val & PVM_ID_AA64MMFR2_ALLOW; +} + +/* Read a sanitized cpufeature ID register by its encoding */ +u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id) +{ + switch (id) { + case SYS_ID_AA64PFR0_EL1: + return get_pvm_id_aa64pfr0(vcpu); + case SYS_ID_AA64PFR1_EL1: + return get_pvm_id_aa64pfr1(vcpu); + case SYS_ID_AA64ZFR0_EL1: + return get_pvm_id_aa64zfr0(vcpu); + case SYS_ID_AA64DFR0_EL1: + return get_pvm_id_aa64dfr0(vcpu); + case SYS_ID_AA64DFR1_EL1: + return get_pvm_id_aa64dfr1(vcpu); + case SYS_ID_AA64AFR0_EL1: + return get_pvm_id_aa64afr0(vcpu); + case SYS_ID_AA64AFR1_EL1: + return get_pvm_id_aa64afr1(vcpu); + case SYS_ID_AA64ISAR0_EL1: + return get_pvm_id_aa64isar0(vcpu); + case SYS_ID_AA64ISAR1_EL1: + return get_pvm_id_aa64isar1(vcpu); + case SYS_ID_AA64MMFR0_EL1: + return get_pvm_id_aa64mmfr0(vcpu); + case SYS_ID_AA64MMFR1_EL1: + return get_pvm_id_aa64mmfr1(vcpu); + case SYS_ID_AA64MMFR2_EL1: + return get_pvm_id_aa64mmfr2(vcpu); + default: + /* + * Should never happen because all cases are covered in + * pvm_sys_reg_descs[]. + */ + WARN_ON(1); + break; + } + + return 0; +} + +static u64 read_id_reg(const struct kvm_vcpu *vcpu, + struct sys_reg_desc const *r) +{ + return pvm_read_id_reg(vcpu, reg_to_encoding(r)); +} + +/* Handler to RAZ/WI sysregs */ +static bool pvm_access_raz_wi(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (!p->is_write) + p->regval = 0; + + return true; +} + +/* + * Accessor for AArch32 feature id registers. + * + * The value of these registers is "unknown" according to the spec if AArch32 + * isn't supported. + */ +static bool pvm_access_id_aarch32(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) { + inject_undef64(vcpu); + return false; + } + + /* + * No support for AArch32 guests, therefore, pKVM has no sanitized copy + * of AArch32 feature id registers. + */ + BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1), + PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) > ID_AA64PFR0_ELx_64BIT_ONLY); + + return pvm_access_raz_wi(vcpu, p, r); +} + +/* + * Accessor for AArch64 feature id registers. + * + * If access is allowed, set the regval to the protected VM's view of the + * register and return true. + * Otherwise, inject an undefined exception and return false. + */ +static bool pvm_access_id_aarch64(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) { + inject_undef64(vcpu); + return false; + } + + p->regval = read_id_reg(vcpu, r); + return true; +} + +static bool pvm_gic_read_sre(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + /* pVMs only support GICv3. 'nuf said. */ + if (!p->is_write) + p->regval = ICC_SRE_EL1_DIB | ICC_SRE_EL1_DFB | ICC_SRE_EL1_SRE; + + return true; +} + +/* Mark the specified system register as an AArch32 feature id register. */ +#define AARCH32(REG) { SYS_DESC(REG), .access = pvm_access_id_aarch32 } + +/* Mark the specified system register as an AArch64 feature id register. */ +#define AARCH64(REG) { SYS_DESC(REG), .access = pvm_access_id_aarch64 } + +/* Mark the specified system register as Read-As-Zero/Write-Ignored */ +#define RAZ_WI(REG) { SYS_DESC(REG), .access = pvm_access_raz_wi } + +/* Mark the specified system register as not being handled in hyp. */ +#define HOST_HANDLED(REG) { SYS_DESC(REG), .access = NULL } + +/* + * Architected system registers. + * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 + * + * NOTE: Anything not explicitly listed here is *restricted by default*, i.e., + * it will lead to injecting an exception into the guest. + */ +static const struct sys_reg_desc pvm_sys_reg_descs[] = { + /* Cache maintenance by set/way operations are restricted. */ + + /* Debug and Trace Registers are restricted. */ + + /* AArch64 mappings of the AArch32 ID registers */ + /* CRm=1 */ + AARCH32(SYS_ID_PFR0_EL1), + AARCH32(SYS_ID_PFR1_EL1), + AARCH32(SYS_ID_DFR0_EL1), + AARCH32(SYS_ID_AFR0_EL1), + AARCH32(SYS_ID_MMFR0_EL1), + AARCH32(SYS_ID_MMFR1_EL1), + AARCH32(SYS_ID_MMFR2_EL1), + AARCH32(SYS_ID_MMFR3_EL1), + + /* CRm=2 */ + AARCH32(SYS_ID_ISAR0_EL1), + AARCH32(SYS_ID_ISAR1_EL1), + AARCH32(SYS_ID_ISAR2_EL1), + AARCH32(SYS_ID_ISAR3_EL1), + AARCH32(SYS_ID_ISAR4_EL1), + AARCH32(SYS_ID_ISAR5_EL1), + AARCH32(SYS_ID_MMFR4_EL1), + AARCH32(SYS_ID_ISAR6_EL1), + + /* CRm=3 */ + AARCH32(SYS_MVFR0_EL1), + AARCH32(SYS_MVFR1_EL1), + AARCH32(SYS_MVFR2_EL1), + AARCH32(SYS_ID_PFR2_EL1), + AARCH32(SYS_ID_DFR1_EL1), + AARCH32(SYS_ID_MMFR5_EL1), + + /* AArch64 ID registers */ + /* CRm=4 */ + AARCH64(SYS_ID_AA64PFR0_EL1), + AARCH64(SYS_ID_AA64PFR1_EL1), + AARCH64(SYS_ID_AA64ZFR0_EL1), + AARCH64(SYS_ID_AA64DFR0_EL1), + AARCH64(SYS_ID_AA64DFR1_EL1), + AARCH64(SYS_ID_AA64AFR0_EL1), + AARCH64(SYS_ID_AA64AFR1_EL1), + AARCH64(SYS_ID_AA64ISAR0_EL1), + AARCH64(SYS_ID_AA64ISAR1_EL1), + AARCH64(SYS_ID_AA64MMFR0_EL1), + AARCH64(SYS_ID_AA64MMFR1_EL1), + AARCH64(SYS_ID_AA64MMFR2_EL1), + + /* Scalable Vector Registers are restricted. */ + + RAZ_WI(SYS_ERRIDR_EL1), + RAZ_WI(SYS_ERRSELR_EL1), + RAZ_WI(SYS_ERXFR_EL1), + RAZ_WI(SYS_ERXCTLR_EL1), + RAZ_WI(SYS_ERXSTATUS_EL1), + RAZ_WI(SYS_ERXADDR_EL1), + RAZ_WI(SYS_ERXMISC0_EL1), + RAZ_WI(SYS_ERXMISC1_EL1), + + /* Performance Monitoring Registers are restricted. */ + + /* Limited Ordering Regions Registers are restricted. */ + + HOST_HANDLED(SYS_ICC_SGI1R_EL1), + HOST_HANDLED(SYS_ICC_ASGI1R_EL1), + HOST_HANDLED(SYS_ICC_SGI0R_EL1), + { SYS_DESC(SYS_ICC_SRE_EL1), .access = pvm_gic_read_sre, }, + + HOST_HANDLED(SYS_CCSIDR_EL1), + HOST_HANDLED(SYS_CLIDR_EL1), + HOST_HANDLED(SYS_CSSELR_EL1), + HOST_HANDLED(SYS_CTR_EL0), + + /* Performance Monitoring Registers are restricted. */ + + /* Activity Monitoring Registers are restricted. */ + + HOST_HANDLED(SYS_CNTP_TVAL_EL0), + HOST_HANDLED(SYS_CNTP_CTL_EL0), + HOST_HANDLED(SYS_CNTP_CVAL_EL0), + + /* Performance Monitoring Registers are restricted. */ +}; + +/* + * Checks that the sysreg table is unique and in-order. + * + * Returns 0 if the table is consistent, or 1 otherwise. + */ +int kvm_check_pvm_sysreg_table(void) +{ + unsigned int i; + + for (i = 1; i < ARRAY_SIZE(pvm_sys_reg_descs); i++) { + if (cmp_sys_reg(&pvm_sys_reg_descs[i-1], &pvm_sys_reg_descs[i]) >= 0) + return 1; + } + + return 0; +} + +/* + * Handler for protected VM MSR, MRS or System instruction execution. + * + * Returns true if the hypervisor has handled the exit, and control should go + * back to the guest, or false if it hasn't, to be handled by the host. + */ +bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + const struct sys_reg_desc *r; + struct sys_reg_params params; + unsigned long esr = kvm_vcpu_get_esr(vcpu); + int Rt = kvm_vcpu_sys_get_rt(vcpu); + + params = esr_sys64_to_params(esr); + params.regval = vcpu_get_reg(vcpu, Rt); + + r = find_reg(¶ms, pvm_sys_reg_descs, ARRAY_SIZE(pvm_sys_reg_descs)); + + /* Undefined (RESTRICTED). */ + if (r == NULL) { + inject_undef64(vcpu); + return true; + } + + /* Handled by the host (HOST_HANDLED) */ + if (r->access == NULL) + return false; + + /* Handled by hyp: skip instruction if instructed to do so. */ + if (r->access(vcpu, ¶ms, r)) + __kvm_skip_instr(vcpu); + + if (!params.is_write) + vcpu_set_reg(vcpu, Rt, params.regval); + + return true; +} + +/** + * Handler for protected VM restricted exceptions. + * + * Inject an undefined exception into the guest and return true to indicate that + * the hypervisor has handled the exit, and control should go back to the guest. + */ +bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + inject_undef64(vcpu); + return true; +} diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 39f8f7f9227c..20db2f281cf2 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -695,9 +695,7 @@ static void __vgic_v3_read_iar(struct kvm_vcpu *vcpu, u32 vmcr, int rt) goto spurious; lr_val &= ~ICH_LR_STATE; - /* No active state for LPIs */ - if ((lr_val & ICH_LR_VIRTUAL_ID_MASK) <= VGIC_MAX_SPI) - lr_val |= ICH_LR_ACTIVE_BIT; + lr_val |= ICH_LR_ACTIVE_BIT; __gic_v3_set_lr(lr_val, lr); __vgic_v3_set_active_priority(lr_prio, vmcr, grp); vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK); @@ -764,20 +762,18 @@ static void __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) /* Drop priority in any case */ act_prio = __vgic_v3_clear_highest_active_priority(); - /* If EOIing an LPI, no deactivate to be performed */ - if (vid >= VGIC_MIN_LPI) - return; - - /* EOImode == 1, nothing to be done here */ - if (vmcr & ICH_VMCR_EOIM_MASK) - return; - lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val); if (lr == -1) { - __vgic_v3_bump_eoicount(); + /* Do not bump EOIcount for LPIs that aren't in the LRs */ + if (!(vid >= VGIC_MIN_LPI)) + __vgic_v3_bump_eoicount(); return; } + /* EOImode == 1 and not an LPI, nothing to be done here */ + if ((vmcr & ICH_VMCR_EOIM_MASK) && !(vid >= VGIC_MIN_LPI)) + return; + lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; /* If priorities or group do not match, the guest has fscked-up. */ @@ -987,8 +983,6 @@ static void __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT; /* IDbits */ val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT; - /* SEIS */ - val |= ((vtr >> 22) & 1) << ICC_CTLR_EL1_SEIS_SHIFT; /* A3V */ val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT; /* EOImode */ diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index ded2c66675f0..5a2cb5d9bc4b 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -96,6 +96,22 @@ void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu) __deactivate_traps_common(vcpu); } +static const exit_handler_fn hyp_exit_handlers[] = { + [0 ... ESR_ELx_EC_MAX] = NULL, + [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, + [ESR_ELx_EC_SYS64] = kvm_hyp_handle_sysreg, + [ESR_ELx_EC_SVE] = kvm_hyp_handle_fpsimd, + [ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd, + [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low, + [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low, + [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth, +}; + +static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) +{ + return hyp_exit_handlers; +} + /* Switch to the guest for VHE systems running in EL2 */ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) { diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 69bd1732a299..326cdfec74a1 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -512,7 +512,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) return -EINVAL; } - pgt = kzalloc(sizeof(*pgt), GFP_KERNEL); + pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT); if (!pgt) return -ENOMEM; diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 2af3c37445e0..a5e4bbf5e68f 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -978,7 +978,7 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) mutex_lock(&vcpu->kvm->lock); if (!vcpu->kvm->arch.pmu_filter) { - vcpu->kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL); + vcpu->kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL_ACCOUNT); if (!vcpu->kvm->arch.pmu_filter) { mutex_unlock(&vcpu->kvm->lock); return -ENOMEM; diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 09cd30a9aafb..426bd7fbc3fd 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -106,7 +106,7 @@ static int kvm_vcpu_finalize_sve(struct kvm_vcpu *vcpu) vl > SVE_VL_ARCH_MAX)) return -EIO; - buf = kzalloc(SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl)), GFP_KERNEL); + buf = kzalloc(SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl)), GFP_KERNEL_ACCOUNT); if (!buf) return -ENOMEM; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 1d46e185f31e..e3ec1a44f94d 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1064,7 +1064,12 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r, bool raz) { u32 id = reg_to_encoding(r); - u64 val = raz ? 0 : read_sanitised_ftr_reg(id); + u64 val; + + if (raz) + return 0; + + val = read_sanitised_ftr_reg(id); switch (id) { case SYS_ID_AA64PFR0_EL1: @@ -1075,16 +1080,15 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_CSV3); val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3); + if (irqchip_in_kernel(vcpu->kvm) && + vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_GIC); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_GIC), 1); + } break; case SYS_ID_AA64PFR1_EL1: - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_MTE); - if (kvm_has_mte(vcpu->kvm)) { - u64 pfr, mte; - - pfr = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); - mte = cpuid_feature_extract_unsigned_field(pfr, ID_AA64PFR1_MTE_SHIFT); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR1_MTE), mte); - } + if (!kvm_has_mte(vcpu->kvm)) + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_MTE); break; case SYS_ID_AA64ISAR1_EL1: if (!vcpu_has_ptrauth(vcpu)) @@ -1268,16 +1272,19 @@ static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, return __set_id_reg(vcpu, rd, uaddr, raz); } -static int get_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, +static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - return __get_id_reg(vcpu, rd, uaddr, true); + return __set_id_reg(vcpu, rd, uaddr, true); } -static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) +static int get_raz_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + const struct kvm_one_reg *reg, void __user *uaddr) { - return __set_id_reg(vcpu, rd, uaddr, true); + const u64 id = sys_reg_to_index(rd); + const u64 val = 0; + + return reg_to_user(uaddr, &val, id); } static int set_wi_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, @@ -1388,7 +1395,7 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, #define ID_UNALLOCATED(crm, op2) { \ Op0(3), Op1(0), CRn(0), CRm(crm), Op2(op2), \ .access = access_raz_id_reg, \ - .get_user = get_raz_id_reg, \ + .get_user = get_raz_reg, \ .set_user = set_raz_id_reg, \ } @@ -1400,7 +1407,7 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, #define ID_HIDDEN(name) { \ SYS_DESC(SYS_##name), \ .access = access_raz_id_reg, \ - .get_user = get_raz_id_reg, \ + .get_user = get_raz_reg, \ .set_user = set_raz_id_reg, \ } @@ -1642,7 +1649,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { * previously (and pointlessly) advertised in the past... */ { PMU_SYS_REG(SYS_PMSWINC_EL0), - .get_user = get_raz_id_reg, .set_user = set_wi_reg, + .get_user = get_raz_reg, .set_user = set_wi_reg, .access = access_pmswinc, .reset = NULL }, { PMU_SYS_REG(SYS_PMSELR_EL0), .access = access_pmselr, .reset = reset_pmselr, .reg = PMSELR_EL0 }, diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 340c51d87677..0a06d0648970 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -134,7 +134,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0); int i; - dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL); + dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL_ACCOUNT); if (!dist->spis) return -ENOMEM; diff --git a/arch/arm64/kvm/vgic/vgic-irqfd.c b/arch/arm64/kvm/vgic/vgic-irqfd.c index 79f8899b234c..475059bacedf 100644 --- a/arch/arm64/kvm/vgic/vgic-irqfd.c +++ b/arch/arm64/kvm/vgic/vgic-irqfd.c @@ -139,7 +139,7 @@ int kvm_vgic_setup_default_irq_routing(struct kvm *kvm) u32 nr = dist->nr_spis; int i, ret; - entries = kcalloc(nr, sizeof(*entries), GFP_KERNEL); + entries = kcalloc(nr, sizeof(*entries), GFP_KERNEL_ACCOUNT); if (!entries) return -ENOMEM; diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 61728c543eb9..089fc2ffcb43 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -48,7 +48,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, if (irq) return irq; - irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL); + irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL_ACCOUNT); if (!irq) return ERR_PTR(-ENOMEM); @@ -332,7 +332,7 @@ int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr) * we must be careful not to overrun the array. */ irq_count = READ_ONCE(dist->lpi_list_count); - intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL); + intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL_ACCOUNT); if (!intids) return -ENOMEM; @@ -985,7 +985,7 @@ static int vgic_its_alloc_collection(struct vgic_its *its, if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL)) return E_ITS_MAPC_COLLECTION_OOR; - collection = kzalloc(sizeof(*collection), GFP_KERNEL); + collection = kzalloc(sizeof(*collection), GFP_KERNEL_ACCOUNT); if (!collection) return -ENOMEM; @@ -1029,7 +1029,7 @@ static struct its_ite *vgic_its_alloc_ite(struct its_device *device, { struct its_ite *ite; - ite = kzalloc(sizeof(*ite), GFP_KERNEL); + ite = kzalloc(sizeof(*ite), GFP_KERNEL_ACCOUNT); if (!ite) return ERR_PTR(-ENOMEM); @@ -1150,7 +1150,7 @@ static struct its_device *vgic_its_alloc_device(struct vgic_its *its, { struct its_device *device; - device = kzalloc(sizeof(*device), GFP_KERNEL); + device = kzalloc(sizeof(*device), GFP_KERNEL_ACCOUNT); if (!device) return ERR_PTR(-ENOMEM); @@ -1847,7 +1847,7 @@ void vgic_lpi_translation_cache_init(struct kvm *kvm) struct vgic_translation_cache_entry *cte; /* An allocation failure is not fatal */ - cte = kzalloc(sizeof(*cte), GFP_KERNEL); + cte = kzalloc(sizeof(*cte), GFP_KERNEL_ACCOUNT); if (WARN_ON(!cte)) break; @@ -1888,7 +1888,7 @@ static int vgic_its_create(struct kvm_device *dev, u32 type) if (type != KVM_DEV_TYPE_ARM_VGIC_ITS) return -ENODEV; - its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL); + its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL_ACCOUNT); if (!its) return -ENOMEM; @@ -2710,8 +2710,8 @@ static int vgic_its_set_attr(struct kvm_device *dev, if (copy_from_user(&addr, uaddr, sizeof(addr))) return -EFAULT; - ret = vgic_check_ioaddr(dev->kvm, &its->vgic_its_base, - addr, SZ_64K); + ret = vgic_check_iorange(dev->kvm, its->vgic_its_base, + addr, SZ_64K, KVM_VGIC_V3_ITS_SIZE); if (ret) return ret; diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c index 7740995de982..0d000d2fe8d2 100644 --- a/arch/arm64/kvm/vgic/vgic-kvm-device.c +++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c @@ -14,17 +14,21 @@ /* common helpers */ -int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, - phys_addr_t addr, phys_addr_t alignment) +int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr, + phys_addr_t addr, phys_addr_t alignment, + phys_addr_t size) { - if (addr & ~kvm_phys_mask(kvm)) - return -E2BIG; + if (!IS_VGIC_ADDR_UNDEF(ioaddr)) + return -EEXIST; - if (!IS_ALIGNED(addr, alignment)) + if (!IS_ALIGNED(addr, alignment) || !IS_ALIGNED(size, alignment)) return -EINVAL; - if (!IS_VGIC_ADDR_UNDEF(*ioaddr)) - return -EEXIST; + if (addr + size < addr) + return -EINVAL; + + if (addr & ~kvm_phys_mask(kvm) || addr + size > kvm_phys_size(kvm)) + return -E2BIG; return 0; } @@ -57,7 +61,7 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) { int r = 0; struct vgic_dist *vgic = &kvm->arch.vgic; - phys_addr_t *addr_ptr, alignment; + phys_addr_t *addr_ptr, alignment, size; u64 undef_value = VGIC_ADDR_UNDEF; mutex_lock(&kvm->lock); @@ -66,16 +70,19 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); addr_ptr = &vgic->vgic_dist_base; alignment = SZ_4K; + size = KVM_VGIC_V2_DIST_SIZE; break; case KVM_VGIC_V2_ADDR_TYPE_CPU: r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); addr_ptr = &vgic->vgic_cpu_base; alignment = SZ_4K; + size = KVM_VGIC_V2_CPU_SIZE; break; case KVM_VGIC_V3_ADDR_TYPE_DIST: r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3); addr_ptr = &vgic->vgic_dist_base; alignment = SZ_64K; + size = KVM_VGIC_V3_DIST_SIZE; break; case KVM_VGIC_V3_ADDR_TYPE_REDIST: { struct vgic_redist_region *rdreg; @@ -140,7 +147,7 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) goto out; if (write) { - r = vgic_check_ioaddr(kvm, addr_ptr, *addr, alignment); + r = vgic_check_iorange(kvm, *addr_ptr, *addr, alignment, size); if (!r) *addr_ptr = *addr; } else { diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c index a09cdc0b953c..bf7ec4a78497 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c @@ -796,7 +796,9 @@ static int vgic_v3_alloc_redist_region(struct kvm *kvm, uint32_t index, struct vgic_dist *d = &kvm->arch.vgic; struct vgic_redist_region *rdreg; struct list_head *rd_regions = &d->rd_regions; - size_t size = count * KVM_VGIC_V3_REDIST_SIZE; + int nr_vcpus = atomic_read(&kvm->online_vcpus); + size_t size = count ? count * KVM_VGIC_V3_REDIST_SIZE + : nr_vcpus * KVM_VGIC_V3_REDIST_SIZE; int ret; /* cross the end of memory ? */ @@ -834,13 +836,13 @@ static int vgic_v3_alloc_redist_region(struct kvm *kvm, uint32_t index, if (vgic_v3_rdist_overlap(kvm, base, size)) return -EINVAL; - rdreg = kzalloc(sizeof(*rdreg), GFP_KERNEL); + rdreg = kzalloc(sizeof(*rdreg), GFP_KERNEL_ACCOUNT); if (!rdreg) return -ENOMEM; rdreg->base = VGIC_ADDR_UNDEF; - ret = vgic_check_ioaddr(kvm, &rdreg->base, base, SZ_64K); + ret = vgic_check_iorange(kvm, rdreg->base, base, SZ_64K, size); if (ret) goto free; diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 21a6207fb2ee..04f62c4b07fb 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -15,6 +15,7 @@ static bool group0_trap; static bool group1_trap; static bool common_trap; +static bool dir_trap; static bool gicv4_enable; void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) @@ -296,6 +297,8 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu) vgic_v3->vgic_hcr |= ICH_HCR_TALL1; if (common_trap) vgic_v3->vgic_hcr |= ICH_HCR_TC; + if (dir_trap) + vgic_v3->vgic_hcr |= ICH_HCR_TDIR; } int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq) @@ -483,8 +486,10 @@ bool vgic_v3_check_base(struct kvm *kvm) return false; list_for_each_entry(rdreg, &d->rd_regions, list) { - if (rdreg->base + vgic_v3_rd_region_size(kvm, rdreg) < - rdreg->base) + size_t sz = vgic_v3_rd_region_size(kvm, rdreg); + + if (vgic_check_iorange(kvm, VGIC_ADDR_UNDEF, + rdreg->base, SZ_64K, sz)) return false; } @@ -671,11 +676,23 @@ int vgic_v3_probe(const struct gic_kvm_info *info) group1_trap = true; } - if (group0_trap || group1_trap || common_trap) { - kvm_info("GICv3 sysreg trapping enabled ([%s%s%s], reduced performance)\n", + if (kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) { + kvm_info("GICv3 with locally generated SEI\n"); + + group0_trap = true; + group1_trap = true; + if (ich_vtr_el2 & ICH_VTR_TDS_MASK) + dir_trap = true; + else + common_trap = true; + } + + if (group0_trap || group1_trap || common_trap | dir_trap) { + kvm_info("GICv3 sysreg trapping enabled ([%s%s%s%s], reduced performance)\n", group0_trap ? "G0" : "", group1_trap ? "G1" : "", - common_trap ? "C" : ""); + common_trap ? "C" : "", + dir_trap ? "D" : ""); static_branch_enable(&vgic_v3_cpuif_trap); } diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index c1845d8f5f7e..772dd15a22c7 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -246,7 +246,7 @@ int vgic_v4_init(struct kvm *kvm) nr_vcpus = atomic_read(&kvm->online_vcpus); dist->its_vm.vpes = kcalloc(nr_vcpus, sizeof(*dist->its_vm.vpes), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!dist->its_vm.vpes) return -ENOMEM; diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 14a9218641f5..3fd6c86a7ef3 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -172,8 +172,9 @@ void vgic_kick_vcpus(struct kvm *kvm); void vgic_irq_handle_resampling(struct vgic_irq *irq, bool lr_deactivated, bool lr_pending); -int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, - phys_addr_t addr, phys_addr_t alignment); +int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr, + phys_addr_t addr, phys_addr_t alignment, + phys_addr_t size); void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 75c6f264c626..562aa878b266 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1073,7 +1073,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_MAX_VCPUS; break; case KVM_CAP_MAX_VCPU_ID: - r = KVM_MAX_VCPU_ID; + r = KVM_MAX_VCPU_IDS; break; case KVM_CAP_MIPS_FPU: /* We don't handle systems with inconsistent cpu_has_fpu */ diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index caaa0f592d8e..3d31f2c59e43 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -434,7 +434,7 @@ extern int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu); #define SPLIT_HACK_OFFS 0xfb000000 /* - * This packs a VCPU ID from the [0..KVM_MAX_VCPU_ID) space down to the + * This packs a VCPU ID from the [0..KVM_MAX_VCPU_IDS) space down to the * [0..KVM_MAX_VCPUS) space, using knowledge of the guest's core stride * (but not its actual threading mode, which is not available) to avoid * collisions. diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 3aed653373a5..e4d23193eba7 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -33,11 +33,11 @@ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE #include <asm/kvm_book3s_asm.h> /* for MAX_SMT_THREADS */ -#define KVM_MAX_VCPU_ID (MAX_SMT_THREADS * KVM_MAX_VCORES) +#define KVM_MAX_VCPU_IDS (MAX_SMT_THREADS * KVM_MAX_VCORES) #define KVM_MAX_NESTED_GUESTS KVMPPC_NR_LPIDS #else -#define KVM_MAX_VCPU_ID KVM_MAX_VCPUS +#define KVM_MAX_VCPU_IDS KVM_MAX_VCPUS #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #define __KVM_HAVE_ARCH_INTC_INITIALIZED diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index a18db9e16ea4..225008882958 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -1928,7 +1928,7 @@ int kvmppc_xive_set_nr_servers(struct kvmppc_xive *xive, u64 addr) pr_devel("%s nr_servers=%u\n", __func__, nr_servers); - if (!nr_servers || nr_servers > KVM_MAX_VCPU_ID) + if (!nr_servers || nr_servers > KVM_MAX_VCPU_IDS) return -EINVAL; mutex_lock(&xive->lock); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index b4e6f70b97b9..8ab90ce8738f 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -649,7 +649,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_MAX_VCPUS; break; case KVM_CAP_MAX_VCPU_ID: - r = KVM_MAX_VCPU_ID; + r = KVM_MAX_VCPU_IDS; break; #ifdef CONFIG_PPC_BOOK3S_64 case KVM_CAP_PPC_GET_SMMU_INFO: diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index c28b743eba57..a34c531be4e7 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -566,3 +566,5 @@ menu "Power management options" source "kernel/power/Kconfig" endmenu + +source "arch/riscv/kvm/Kconfig" diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 0eb4568fbd29..58c1a28e20bb 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -100,6 +100,7 @@ endif head-y := arch/riscv/kernel/head.o core-$(CONFIG_RISCV_ERRATA_ALTERNATIVE) += arch/riscv/errata/ +core-$(CONFIG_KVM) += arch/riscv/kvm/ libs-y += arch/riscv/lib/ libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index 87ac65696871..5046f431645c 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -58,22 +58,32 @@ /* Interrupt causes (minus the high bit) */ #define IRQ_S_SOFT 1 +#define IRQ_VS_SOFT 2 #define IRQ_M_SOFT 3 #define IRQ_S_TIMER 5 +#define IRQ_VS_TIMER 6 #define IRQ_M_TIMER 7 #define IRQ_S_EXT 9 +#define IRQ_VS_EXT 10 #define IRQ_M_EXT 11 /* Exception causes */ #define EXC_INST_MISALIGNED 0 #define EXC_INST_ACCESS 1 +#define EXC_INST_ILLEGAL 2 #define EXC_BREAKPOINT 3 #define EXC_LOAD_ACCESS 5 #define EXC_STORE_ACCESS 7 #define EXC_SYSCALL 8 +#define EXC_HYPERVISOR_SYSCALL 9 +#define EXC_SUPERVISOR_SYSCALL 10 #define EXC_INST_PAGE_FAULT 12 #define EXC_LOAD_PAGE_FAULT 13 #define EXC_STORE_PAGE_FAULT 15 +#define EXC_INST_GUEST_PAGE_FAULT 20 +#define EXC_LOAD_GUEST_PAGE_FAULT 21 +#define EXC_VIRTUAL_INST_FAULT 22 +#define EXC_STORE_GUEST_PAGE_FAULT 23 /* PMP configuration */ #define PMP_R 0x01 @@ -85,6 +95,58 @@ #define PMP_A_NAPOT 0x18 #define PMP_L 0x80 +/* HSTATUS flags */ +#ifdef CONFIG_64BIT +#define HSTATUS_VSXL _AC(0x300000000, UL) +#define HSTATUS_VSXL_SHIFT 32 +#endif +#define HSTATUS_VTSR _AC(0x00400000, UL) +#define HSTATUS_VTW _AC(0x00200000, UL) +#define HSTATUS_VTVM _AC(0x00100000, UL) +#define HSTATUS_VGEIN _AC(0x0003f000, UL) +#define HSTATUS_VGEIN_SHIFT 12 +#define HSTATUS_HU _AC(0x00000200, UL) +#define HSTATUS_SPVP _AC(0x00000100, UL) +#define HSTATUS_SPV _AC(0x00000080, UL) +#define HSTATUS_GVA _AC(0x00000040, UL) +#define HSTATUS_VSBE _AC(0x00000020, UL) + +/* HGATP flags */ +#define HGATP_MODE_OFF _AC(0, UL) +#define HGATP_MODE_SV32X4 _AC(1, UL) +#define HGATP_MODE_SV39X4 _AC(8, UL) +#define HGATP_MODE_SV48X4 _AC(9, UL) + +#define HGATP32_MODE_SHIFT 31 +#define HGATP32_VMID_SHIFT 22 +#define HGATP32_VMID_MASK _AC(0x1FC00000, UL) +#define HGATP32_PPN _AC(0x003FFFFF, UL) + +#define HGATP64_MODE_SHIFT 60 +#define HGATP64_VMID_SHIFT 44 +#define HGATP64_VMID_MASK _AC(0x03FFF00000000000, UL) +#define HGATP64_PPN _AC(0x00000FFFFFFFFFFF, UL) + +#define HGATP_PAGE_SHIFT 12 + +#ifdef CONFIG_64BIT +#define HGATP_PPN HGATP64_PPN +#define HGATP_VMID_SHIFT HGATP64_VMID_SHIFT +#define HGATP_VMID_MASK HGATP64_VMID_MASK +#define HGATP_MODE_SHIFT HGATP64_MODE_SHIFT +#else +#define HGATP_PPN HGATP32_PPN +#define HGATP_VMID_SHIFT HGATP32_VMID_SHIFT +#define HGATP_VMID_MASK HGATP32_VMID_MASK +#define HGATP_MODE_SHIFT HGATP32_MODE_SHIFT +#endif + +/* VSIP & HVIP relation */ +#define VSIP_TO_HVIP_SHIFT (IRQ_VS_SOFT - IRQ_S_SOFT) +#define VSIP_VALID_MASK ((_AC(1, UL) << IRQ_S_SOFT) | \ + (_AC(1, UL) << IRQ_S_TIMER) | \ + (_AC(1, UL) << IRQ_S_EXT)) + /* symbolic CSR names: */ #define CSR_CYCLE 0xc00 #define CSR_TIME 0xc01 @@ -104,6 +166,31 @@ #define CSR_SIP 0x144 #define CSR_SATP 0x180 +#define CSR_VSSTATUS 0x200 +#define CSR_VSIE 0x204 +#define CSR_VSTVEC 0x205 +#define CSR_VSSCRATCH 0x240 +#define CSR_VSEPC 0x241 +#define CSR_VSCAUSE 0x242 +#define CSR_VSTVAL 0x243 +#define CSR_VSIP 0x244 +#define CSR_VSATP 0x280 + +#define CSR_HSTATUS 0x600 +#define CSR_HEDELEG 0x602 +#define CSR_HIDELEG 0x603 +#define CSR_HIE 0x604 +#define CSR_HTIMEDELTA 0x605 +#define CSR_HCOUNTEREN 0x606 +#define CSR_HGEIE 0x607 +#define CSR_HTIMEDELTAH 0x615 +#define CSR_HTVAL 0x643 +#define CSR_HIP 0x644 +#define CSR_HVIP 0x645 +#define CSR_HTINST 0x64a +#define CSR_HGATP 0x680 +#define CSR_HGEIP 0xe12 + #define CSR_MSTATUS 0x300 #define CSR_MISA 0x301 #define CSR_MIE 0x304 diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h new file mode 100644 index 000000000000..25ba21f98504 --- /dev/null +++ b/arch/riscv/include/asm/kvm_host.h @@ -0,0 +1,264 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2019 Western Digital Corporation or its affiliates. + * + * Authors: + * Anup Patel <anup.patel@wdc.com> + */ + +#ifndef __RISCV_KVM_HOST_H__ +#define __RISCV_KVM_HOST_H__ + +#include <linux/types.h> +#include <linux/kvm.h> +#include <linux/kvm_types.h> +#include <asm/kvm_vcpu_fp.h> +#include <asm/kvm_vcpu_timer.h> + +#ifdef CONFIG_64BIT +#define KVM_MAX_VCPUS (1U << 16) +#else +#define KVM_MAX_VCPUS (1U << 9) +#endif + +#define KVM_HALT_POLL_NS_DEFAULT 500000 + +#define KVM_VCPU_MAX_FEATURES 0 + +#define KVM_REQ_SLEEP \ + KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_VCPU_RESET KVM_ARCH_REQ(1) +#define KVM_REQ_UPDATE_HGATP KVM_ARCH_REQ(2) + +struct kvm_vm_stat { + struct kvm_vm_stat_generic generic; +}; + +struct kvm_vcpu_stat { + struct kvm_vcpu_stat_generic generic; + u64 ecall_exit_stat; + u64 wfi_exit_stat; + u64 mmio_exit_user; + u64 mmio_exit_kernel; + u64 exits; +}; + +struct kvm_arch_memory_slot { +}; + +struct kvm_vmid { + /* + * Writes to vmid_version and vmid happen with vmid_lock held + * whereas reads happen without any lock held. + */ + unsigned long vmid_version; + unsigned long vmid; +}; + +struct kvm_arch { + /* stage2 vmid */ + struct kvm_vmid vmid; + + /* stage2 page table */ + pgd_t *pgd; + phys_addr_t pgd_phys; + + /* Guest Timer */ + struct kvm_guest_timer timer; +}; + +struct kvm_mmio_decode { + unsigned long insn; + int insn_len; + int len; + int shift; + int return_handled; +}; + +struct kvm_sbi_context { + int return_handled; +}; + +#define KVM_MMU_PAGE_CACHE_NR_OBJS 32 + +struct kvm_mmu_page_cache { + int nobjs; + void *objects[KVM_MMU_PAGE_CACHE_NR_OBJS]; +}; + +struct kvm_cpu_trap { + unsigned long sepc; + unsigned long scause; + unsigned long stval; + unsigned long htval; + unsigned long htinst; +}; + +struct kvm_cpu_context { + unsigned long zero; + unsigned long ra; + unsigned long sp; + unsigned long gp; + unsigned long tp; + unsigned long t0; + unsigned long t1; + unsigned long t2; + unsigned long s0; + unsigned long s1; + unsigned long a0; + unsigned long a1; + unsigned long a2; + unsigned long a3; + unsigned long a4; + unsigned long a5; + unsigned long a6; + unsigned long a7; + unsigned long s2; + unsigned long s3; + unsigned long s4; + unsigned long s5; + unsigned long s6; + unsigned long s7; + unsigned long s8; + unsigned long s9; + unsigned long s10; + unsigned long s11; + unsigned long t3; + unsigned long t4; + unsigned long t5; + unsigned long t6; + unsigned long sepc; + unsigned long sstatus; + unsigned long hstatus; + union __riscv_fp_state fp; +}; + +struct kvm_vcpu_csr { + unsigned long vsstatus; + unsigned long vsie; + unsigned long vstvec; + unsigned long vsscratch; + unsigned long vsepc; + unsigned long vscause; + unsigned long vstval; + unsigned long hvip; + unsigned long vsatp; + unsigned long scounteren; +}; + +struct kvm_vcpu_arch { + /* VCPU ran at least once */ + bool ran_atleast_once; + + /* ISA feature bits (similar to MISA) */ + unsigned long isa; + + /* SSCRATCH, STVEC, and SCOUNTEREN of Host */ + unsigned long host_sscratch; + unsigned long host_stvec; + unsigned long host_scounteren; + + /* CPU context of Host */ + struct kvm_cpu_context host_context; + + /* CPU context of Guest VCPU */ + struct kvm_cpu_context guest_context; + + /* CPU CSR context of Guest VCPU */ + struct kvm_vcpu_csr guest_csr; + + /* CPU context upon Guest VCPU reset */ + struct kvm_cpu_context guest_reset_context; + + /* CPU CSR context upon Guest VCPU reset */ + struct kvm_vcpu_csr guest_reset_csr; + + /* + * VCPU interrupts + * + * We have a lockless approach for tracking pending VCPU interrupts + * implemented using atomic bitops. The irqs_pending bitmap represent + * pending interrupts whereas irqs_pending_mask represent bits changed + * in irqs_pending. Our approach is modeled around multiple producer + * and single consumer problem where the consumer is the VCPU itself. + */ + unsigned long irqs_pending; + unsigned long irqs_pending_mask; + + /* VCPU Timer */ + struct kvm_vcpu_timer timer; + + /* MMIO instruction details */ + struct kvm_mmio_decode mmio_decode; + + /* SBI context */ + struct kvm_sbi_context sbi_context; + + /* Cache pages needed to program page tables with spinlock held */ + struct kvm_mmu_page_cache mmu_page_cache; + + /* VCPU power-off state */ + bool power_off; + + /* Don't run the VCPU (blocked) */ + bool pause; + + /* SRCU lock index for in-kernel run loop */ + int srcu_idx; +}; + +static inline void kvm_arch_hardware_unsetup(void) {} +static inline void kvm_arch_sync_events(struct kvm *kvm) {} +static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} +static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} + +#define KVM_ARCH_WANT_MMU_NOTIFIER + +void __kvm_riscv_hfence_gvma_vmid_gpa(unsigned long gpa_divby_4, + unsigned long vmid); +void __kvm_riscv_hfence_gvma_vmid(unsigned long vmid); +void __kvm_riscv_hfence_gvma_gpa(unsigned long gpa_divby_4); +void __kvm_riscv_hfence_gvma_all(void); + +int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, + struct kvm_memory_slot *memslot, + gpa_t gpa, unsigned long hva, bool is_write); +void kvm_riscv_stage2_flush_cache(struct kvm_vcpu *vcpu); +int kvm_riscv_stage2_alloc_pgd(struct kvm *kvm); +void kvm_riscv_stage2_free_pgd(struct kvm *kvm); +void kvm_riscv_stage2_update_hgatp(struct kvm_vcpu *vcpu); +void kvm_riscv_stage2_mode_detect(void); +unsigned long kvm_riscv_stage2_mode(void); + +void kvm_riscv_stage2_vmid_detect(void); +unsigned long kvm_riscv_stage2_vmid_bits(void); +int kvm_riscv_stage2_vmid_init(struct kvm *kvm); +bool kvm_riscv_stage2_vmid_ver_changed(struct kvm_vmid *vmid); +void kvm_riscv_stage2_vmid_update(struct kvm_vcpu *vcpu); + +void __kvm_riscv_unpriv_trap(void); + +unsigned long kvm_riscv_vcpu_unpriv_read(struct kvm_vcpu *vcpu, + bool read_insn, + unsigned long guest_addr, + struct kvm_cpu_trap *trap); +void kvm_riscv_vcpu_trap_redirect(struct kvm_vcpu *vcpu, + struct kvm_cpu_trap *trap); +int kvm_riscv_vcpu_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run); +int kvm_riscv_vcpu_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, + struct kvm_cpu_trap *trap); + +void __kvm_riscv_switch_to(struct kvm_vcpu_arch *vcpu_arch); + +int kvm_riscv_vcpu_set_interrupt(struct kvm_vcpu *vcpu, unsigned int irq); +int kvm_riscv_vcpu_unset_interrupt(struct kvm_vcpu *vcpu, unsigned int irq); +void kvm_riscv_vcpu_flush_interrupts(struct kvm_vcpu *vcpu); +void kvm_riscv_vcpu_sync_interrupts(struct kvm_vcpu *vcpu); +bool kvm_riscv_vcpu_has_interrupts(struct kvm_vcpu *vcpu, unsigned long mask); +void kvm_riscv_vcpu_power_off(struct kvm_vcpu *vcpu); +void kvm_riscv_vcpu_power_on(struct kvm_vcpu *vcpu); + +int kvm_riscv_vcpu_sbi_return(struct kvm_vcpu *vcpu, struct kvm_run *run); +int kvm_riscv_vcpu_sbi_ecall(struct kvm_vcpu *vcpu, struct kvm_run *run); + +#endif /* __RISCV_KVM_HOST_H__ */ diff --git a/arch/riscv/include/asm/kvm_types.h b/arch/riscv/include/asm/kvm_types.h new file mode 100644 index 000000000000..e476b404eb67 --- /dev/null +++ b/arch/riscv/include/asm/kvm_types.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_RISCV_KVM_TYPES_H +#define _ASM_RISCV_KVM_TYPES_H + +#define KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE 40 + +#endif /* _ASM_RISCV_KVM_TYPES_H */ diff --git a/arch/riscv/include/asm/kvm_vcpu_fp.h b/arch/riscv/include/asm/kvm_vcpu_fp.h new file mode 100644 index 000000000000..4da9b8e0f050 --- /dev/null +++ b/arch/riscv/include/asm/kvm_vcpu_fp.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 Western Digital Corporation or its affiliates. + * + * Authors: + * Atish Patra <atish.patra@wdc.com> + * Anup Patel <anup.patel@wdc.com> + */ + +#ifndef __KVM_VCPU_RISCV_FP_H +#define __KVM_VCPU_RISCV_FP_H + +#include <linux/types.h> + +struct kvm_cpu_context; + +#ifdef CONFIG_FPU +void __kvm_riscv_fp_f_save(struct kvm_cpu_context *context); +void __kvm_riscv_fp_f_restore(struct kvm_cpu_context *context); +void __kvm_riscv_fp_d_save(struct kvm_cpu_context *context); +void __kvm_riscv_fp_d_restore(struct kvm_cpu_context *context); + +void kvm_riscv_vcpu_fp_reset(struct kvm_vcpu *vcpu); +void kvm_riscv_vcpu_guest_fp_save(struct kvm_cpu_context *cntx, + unsigned long isa); +void kvm_riscv_vcpu_guest_fp_restore(struct kvm_cpu_context *cntx, + unsigned long isa); +void kvm_riscv_vcpu_host_fp_save(struct kvm_cpu_context *cntx); +void kvm_riscv_vcpu_host_fp_restore(struct kvm_cpu_context *cntx); +#else +static inline void kvm_riscv_vcpu_fp_reset(struct kvm_vcpu *vcpu) +{ +} +static inline void kvm_riscv_vcpu_guest_fp_save(struct kvm_cpu_context *cntx, + unsigned long isa) +{ +} +static inline void kvm_riscv_vcpu_guest_fp_restore( + struct kvm_cpu_context *cntx, + unsigned long isa) +{ +} +static inline void kvm_riscv_vcpu_host_fp_save(struct kvm_cpu_context *cntx) +{ +} +static inline void kvm_riscv_vcpu_host_fp_restore( + struct kvm_cpu_context *cntx) +{ +} +#endif + +int kvm_riscv_vcpu_get_reg_fp(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg, + unsigned long rtype); +int kvm_riscv_vcpu_set_reg_fp(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg, + unsigned long rtype); + +#endif diff --git a/arch/riscv/include/asm/kvm_vcpu_timer.h b/arch/riscv/include/asm/kvm_vcpu_timer.h new file mode 100644 index 000000000000..375281eb49e0 --- /dev/null +++ b/arch/riscv/include/asm/kvm_vcpu_timer.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2019 Western Digital Corporation or its affiliates. + * + * Authors: + * Atish Patra <atish.patra@wdc.com> + */ + +#ifndef __KVM_VCPU_RISCV_TIMER_H +#define __KVM_VCPU_RISCV_TIMER_H + +#include <linux/hrtimer.h> + +struct kvm_guest_timer { + /* Mult & Shift values to get nanoseconds from cycles */ + u32 nsec_mult; + u32 nsec_shift; + /* Time delta value */ + u64 time_delta; +}; + +struct kvm_vcpu_timer { + /* Flag for whether init is done */ + bool init_done; + /* Flag for whether timer event is configured */ + bool next_set; + /* Next timer event cycles */ + u64 next_cycles; + /* Underlying hrtimer instance */ + struct hrtimer hrt; +}; + +int kvm_riscv_vcpu_timer_next_event(struct kvm_vcpu *vcpu, u64 ncycles); +int kvm_riscv_vcpu_get_reg_timer(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg); +int kvm_riscv_vcpu_set_reg_timer(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg); +int kvm_riscv_vcpu_timer_init(struct kvm_vcpu *vcpu); +int kvm_riscv_vcpu_timer_deinit(struct kvm_vcpu *vcpu); +int kvm_riscv_vcpu_timer_reset(struct kvm_vcpu *vcpu); +void kvm_riscv_vcpu_timer_restore(struct kvm_vcpu *vcpu); +int kvm_riscv_guest_timer_init(struct kvm *kvm); + +#endif diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h new file mode 100644 index 000000000000..f808ad1ce500 --- /dev/null +++ b/arch/riscv/include/uapi/asm/kvm.h @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (C) 2019 Western Digital Corporation or its affiliates. + * + * Authors: + * Anup Patel <anup.patel@wdc.com> + */ + +#ifndef __LINUX_KVM_RISCV_H +#define __LINUX_KVM_RISCV_H + +#ifndef __ASSEMBLY__ + +#include <linux/types.h> +#include <asm/ptrace.h> + +#define __KVM_HAVE_READONLY_MEM + +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 + +#define KVM_INTERRUPT_SET -1U +#define KVM_INTERRUPT_UNSET -2U + +/* for KVM_GET_REGS and KVM_SET_REGS */ +struct kvm_regs { +}; + +/* for KVM_GET_FPU and KVM_SET_FPU */ +struct kvm_fpu { +}; + +/* KVM Debug exit structure */ +struct kvm_debug_exit_arch { +}; + +/* for KVM_SET_GUEST_DEBUG */ +struct kvm_guest_debug_arch { +}; + +/* definition of registers in kvm_run */ +struct kvm_sync_regs { +}; + +/* for KVM_GET_SREGS and KVM_SET_SREGS */ +struct kvm_sregs { +}; + +/* CONFIG registers for KVM_GET_ONE_REG and KVM_SET_ONE_REG */ +struct kvm_riscv_config { + unsigned long isa; +}; + +/* CORE registers for KVM_GET_ONE_REG and KVM_SET_ONE_REG */ +struct kvm_riscv_core { + struct user_regs_struct regs; + unsigned long mode; +}; + +/* Possible privilege modes for kvm_riscv_core */ +#define KVM_RISCV_MODE_S 1 +#define KVM_RISCV_MODE_U 0 + +/* CSR registers for KVM_GET_ONE_REG and KVM_SET_ONE_REG */ +struct kvm_riscv_csr { + unsigned long sstatus; + unsigned long sie; + unsigned long stvec; + unsigned long sscratch; + unsigned long sepc; + unsigned long scause; + unsigned long stval; + unsigned long sip; + unsigned long satp; + unsigned long scounteren; +}; + +/* TIMER registers for KVM_GET_ONE_REG and KVM_SET_ONE_REG */ +struct kvm_riscv_timer { + __u64 frequency; + __u64 time; + __u64 compare; + __u64 state; +}; + +/* Possible states for kvm_riscv_timer */ +#define KVM_RISCV_TIMER_STATE_OFF 0 +#define KVM_RISCV_TIMER_STATE_ON 1 + +#define KVM_REG_SIZE(id) \ + (1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT)) + +/* If you need to interpret the index values, here is the key: */ +#define KVM_REG_RISCV_TYPE_MASK 0x00000000FF000000 +#define KVM_REG_RISCV_TYPE_SHIFT 24 + +/* Config registers are mapped as type 1 */ +#define KVM_REG_RISCV_CONFIG (0x01 << KVM_REG_RISCV_TYPE_SHIFT) +#define KVM_REG_RISCV_CONFIG_REG(name) \ + (offsetof(struct kvm_riscv_config, name) / sizeof(unsigned long)) + +/* Core registers are mapped as type 2 */ +#define KVM_REG_RISCV_CORE (0x02 << KVM_REG_RISCV_TYPE_SHIFT) +#define KVM_REG_RISCV_CORE_REG(name) \ + (offsetof(struct kvm_riscv_core, name) / sizeof(unsigned long)) + +/* Control and status registers are mapped as type 3 */ +#define KVM_REG_RISCV_CSR (0x03 << KVM_REG_RISCV_TYPE_SHIFT) +#define KVM_REG_RISCV_CSR_REG(name) \ + (offsetof(struct kvm_riscv_csr, name) / sizeof(unsigned long)) + +/* Timer registers are mapped as type 4 */ +#define KVM_REG_RISCV_TIMER (0x04 << KVM_REG_RISCV_TYPE_SHIFT) +#define KVM_REG_RISCV_TIMER_REG(name) \ + (offsetof(struct kvm_riscv_timer, name) / sizeof(__u64)) + +/* F extension registers are mapped as type 5 */ +#define KVM_REG_RISCV_FP_F (0x05 << KVM_REG_RISCV_TYPE_SHIFT) +#define KVM_REG_RISCV_FP_F_REG(name) \ + (offsetof(struct __riscv_f_ext_state, name) / sizeof(__u32)) + +/* D extension registers are mapped as type 6 */ +#define KVM_REG_RISCV_FP_D (0x06 << KVM_REG_RISCV_TYPE_SHIFT) +#define KVM_REG_RISCV_FP_D_REG(name) \ + (offsetof(struct __riscv_d_ext_state, name) / sizeof(__u64)) + +#endif + +#endif /* __LINUX_KVM_RISCV_H */ diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c index 478d9f02dab5..253126e4beef 100644 --- a/arch/riscv/kernel/asm-offsets.c +++ b/arch/riscv/kernel/asm-offsets.c @@ -7,7 +7,9 @@ #define GENERATING_ASM_OFFSETS #include <linux/kbuild.h> +#include <linux/mm.h> #include <linux/sched.h> +#include <asm/kvm_host.h> #include <asm/thread_info.h> #include <asm/ptrace.h> @@ -110,6 +112,160 @@ void asm_offsets(void) OFFSET(PT_BADADDR, pt_regs, badaddr); OFFSET(PT_CAUSE, pt_regs, cause); + OFFSET(KVM_ARCH_GUEST_ZERO, kvm_vcpu_arch, guest_context.zero); + OFFSET(KVM_ARCH_GUEST_RA, kvm_vcpu_arch, guest_context.ra); + OFFSET(KVM_ARCH_GUEST_SP, kvm_vcpu_arch, guest_context.sp); + OFFSET(KVM_ARCH_GUEST_GP, kvm_vcpu_arch, guest_context.gp); + OFFSET(KVM_ARCH_GUEST_TP, kvm_vcpu_arch, guest_context.tp); + OFFSET(KVM_ARCH_GUEST_T0, kvm_vcpu_arch, guest_context.t0); + OFFSET(KVM_ARCH_GUEST_T1, kvm_vcpu_arch, guest_context.t1); + OFFSET(KVM_ARCH_GUEST_T2, kvm_vcpu_arch, guest_context.t2); + OFFSET(KVM_ARCH_GUEST_S0, kvm_vcpu_arch, guest_context.s0); + OFFSET(KVM_ARCH_GUEST_S1, kvm_vcpu_arch, guest_context.s1); + OFFSET(KVM_ARCH_GUEST_A0, kvm_vcpu_arch, guest_context.a0); + OFFSET(KVM_ARCH_GUEST_A1, kvm_vcpu_arch, guest_context.a1); + OFFSET(KVM_ARCH_GUEST_A2, kvm_vcpu_arch, guest_context.a2); + OFFSET(KVM_ARCH_GUEST_A3, kvm_vcpu_arch, guest_context.a3); + OFFSET(KVM_ARCH_GUEST_A4, kvm_vcpu_arch, guest_context.a4); + OFFSET(KVM_ARCH_GUEST_A5, kvm_vcpu_arch, guest_context.a5); + OFFSET(KVM_ARCH_GUEST_A6, kvm_vcpu_arch, guest_context.a6); + OFFSET(KVM_ARCH_GUEST_A7, kvm_vcpu_arch, guest_context.a7); + OFFSET(KVM_ARCH_GUEST_S2, kvm_vcpu_arch, guest_context.s2); + OFFSET(KVM_ARCH_GUEST_S3, kvm_vcpu_arch, guest_context.s3); + OFFSET(KVM_ARCH_GUEST_S4, kvm_vcpu_arch, guest_context.s4); + OFFSET(KVM_ARCH_GUEST_S5, kvm_vcpu_arch, guest_context.s5); + OFFSET(KVM_ARCH_GUEST_S6, kvm_vcpu_arch, guest_context.s6); + OFFSET(KVM_ARCH_GUEST_S7, kvm_vcpu_arch, guest_context.s7); + OFFSET(KVM_ARCH_GUEST_S8, kvm_vcpu_arch, guest_context.s8); + OFFSET(KVM_ARCH_GUEST_S9, kvm_vcpu_arch, guest_context.s9); + OFFSET(KVM_ARCH_GUEST_S10, kvm_vcpu_arch, guest_context.s10); + OFFSET(KVM_ARCH_GUEST_S11, kvm_vcpu_arch, guest_context.s11); + OFFSET(KVM_ARCH_GUEST_T3, kvm_vcpu_arch, guest_context.t3); + OFFSET(KVM_ARCH_GUEST_T4, kvm_vcpu_arch, guest_context.t4); + OFFSET(KVM_ARCH_GUEST_T5, kvm_vcpu_arch, guest_context.t5); + OFFSET(KVM_ARCH_GUEST_T6, kvm_vcpu_arch, guest_context.t6); + OFFSET(KVM_ARCH_GUEST_SEPC, kvm_vcpu_arch, guest_context.sepc); + OFFSET(KVM_ARCH_GUEST_SSTATUS, kvm_vcpu_arch, guest_context.sstatus); + OFFSET(KVM_ARCH_GUEST_HSTATUS, kvm_vcpu_arch, guest_context.hstatus); + OFFSET(KVM_ARCH_GUEST_SCOUNTEREN, kvm_vcpu_arch, guest_csr.scounteren); + + OFFSET(KVM_ARCH_HOST_ZERO, kvm_vcpu_arch, host_context.zero); + OFFSET(KVM_ARCH_HOST_RA, kvm_vcpu_arch, host_context.ra); + OFFSET(KVM_ARCH_HOST_SP, kvm_vcpu_arch, host_context.sp); + OFFSET(KVM_ARCH_HOST_GP, kvm_vcpu_arch, host_context.gp); + OFFSET(KVM_ARCH_HOST_TP, kvm_vcpu_arch, host_context.tp); + OFFSET(KVM_ARCH_HOST_T0, kvm_vcpu_arch, host_context.t0); + OFFSET(KVM_ARCH_HOST_T1, kvm_vcpu_arch, host_context.t1); + OFFSET(KVM_ARCH_HOST_T2, kvm_vcpu_arch, host_context.t2); + OFFSET(KVM_ARCH_HOST_S0, kvm_vcpu_arch, host_context.s0); + OFFSET(KVM_ARCH_HOST_S1, kvm_vcpu_arch, host_context.s1); + OFFSET(KVM_ARCH_HOST_A0, kvm_vcpu_arch, host_context.a0); + OFFSET(KVM_ARCH_HOST_A1, kvm_vcpu_arch, host_context.a1); + OFFSET(KVM_ARCH_HOST_A2, kvm_vcpu_arch, host_context.a2); + OFFSET(KVM_ARCH_HOST_A3, kvm_vcpu_arch, host_context.a3); + OFFSET(KVM_ARCH_HOST_A4, kvm_vcpu_arch, host_context.a4); + OFFSET(KVM_ARCH_HOST_A5, kvm_vcpu_arch, host_context.a5); + OFFSET(KVM_ARCH_HOST_A6, kvm_vcpu_arch, host_context.a6); + OFFSET(KVM_ARCH_HOST_A7, kvm_vcpu_arch, host_context.a7); + OFFSET(KVM_ARCH_HOST_S2, kvm_vcpu_arch, host_context.s2); + OFFSET(KVM_ARCH_HOST_S3, kvm_vcpu_arch, host_context.s3); + OFFSET(KVM_ARCH_HOST_S4, kvm_vcpu_arch, host_context.s4); + OFFSET(KVM_ARCH_HOST_S5, kvm_vcpu_arch, host_context.s5); + OFFSET(KVM_ARCH_HOST_S6, kvm_vcpu_arch, host_context.s6); + OFFSET(KVM_ARCH_HOST_S7, kvm_vcpu_arch, host_context.s7); + OFFSET(KVM_ARCH_HOST_S8, kvm_vcpu_arch, host_context.s8); + OFFSET(KVM_ARCH_HOST_S9, kvm_vcpu_arch, host_context.s9); + OFFSET(KVM_ARCH_HOST_S10, kvm_vcpu_arch, host_context.s10); + OFFSET(KVM_ARCH_HOST_S11, kvm_vcpu_arch, host_context.s11); + OFFSET(KVM_ARCH_HOST_T3, kvm_vcpu_arch, host_context.t3); + OFFSET(KVM_ARCH_HOST_T4, kvm_vcpu_arch, host_context.t4); + OFFSET(KVM_ARCH_HOST_T5, kvm_vcpu_arch, host_context.t5); + OFFSET(KVM_ARCH_HOST_T6, kvm_vcpu_arch, host_context.t6); + OFFSET(KVM_ARCH_HOST_SEPC, kvm_vcpu_arch, host_context.sepc); + OFFSET(KVM_ARCH_HOST_SSTATUS, kvm_vcpu_arch, host_context.sstatus); + OFFSET(KVM_ARCH_HOST_HSTATUS, kvm_vcpu_arch, host_context.hstatus); + OFFSET(KVM_ARCH_HOST_SSCRATCH, kvm_vcpu_arch, host_sscratch); + OFFSET(KVM_ARCH_HOST_STVEC, kvm_vcpu_arch, host_stvec); + OFFSET(KVM_ARCH_HOST_SCOUNTEREN, kvm_vcpu_arch, host_scounteren); + + OFFSET(KVM_ARCH_TRAP_SEPC, kvm_cpu_trap, sepc); + OFFSET(KVM_ARCH_TRAP_SCAUSE, kvm_cpu_trap, scause); + OFFSET(KVM_ARCH_TRAP_STVAL, kvm_cpu_trap, stval); + OFFSET(KVM_ARCH_TRAP_HTVAL, kvm_cpu_trap, htval); + OFFSET(KVM_ARCH_TRAP_HTINST, kvm_cpu_trap, htinst); + + /* F extension */ + + OFFSET(KVM_ARCH_FP_F_F0, kvm_cpu_context, fp.f.f[0]); + OFFSET(KVM_ARCH_FP_F_F1, kvm_cpu_context, fp.f.f[1]); + OFFSET(KVM_ARCH_FP_F_F2, kvm_cpu_context, fp.f.f[2]); + OFFSET(KVM_ARCH_FP_F_F3, kvm_cpu_context, fp.f.f[3]); + OFFSET(KVM_ARCH_FP_F_F4, kvm_cpu_context, fp.f.f[4]); + OFFSET(KVM_ARCH_FP_F_F5, kvm_cpu_context, fp.f.f[5]); + OFFSET(KVM_ARCH_FP_F_F6, kvm_cpu_context, fp.f.f[6]); + OFFSET(KVM_ARCH_FP_F_F7, kvm_cpu_context, fp.f.f[7]); + OFFSET(KVM_ARCH_FP_F_F8, kvm_cpu_context, fp.f.f[8]); + OFFSET(KVM_ARCH_FP_F_F9, kvm_cpu_context, fp.f.f[9]); + OFFSET(KVM_ARCH_FP_F_F10, kvm_cpu_context, fp.f.f[10]); + OFFSET(KVM_ARCH_FP_F_F11, kvm_cpu_context, fp.f.f[11]); + OFFSET(KVM_ARCH_FP_F_F12, kvm_cpu_context, fp.f.f[12]); + OFFSET(KVM_ARCH_FP_F_F13, kvm_cpu_context, fp.f.f[13]); + OFFSET(KVM_ARCH_FP_F_F14, kvm_cpu_context, fp.f.f[14]); + OFFSET(KVM_ARCH_FP_F_F15, kvm_cpu_context, fp.f.f[15]); + OFFSET(KVM_ARCH_FP_F_F16, kvm_cpu_context, fp.f.f[16]); + OFFSET(KVM_ARCH_FP_F_F17, kvm_cpu_context, fp.f.f[17]); + OFFSET(KVM_ARCH_FP_F_F18, kvm_cpu_context, fp.f.f[18]); + OFFSET(KVM_ARCH_FP_F_F19, kvm_cpu_context, fp.f.f[19]); + OFFSET(KVM_ARCH_FP_F_F20, kvm_cpu_context, fp.f.f[20]); + OFFSET(KVM_ARCH_FP_F_F21, kvm_cpu_context, fp.f.f[21]); + OFFSET(KVM_ARCH_FP_F_F22, kvm_cpu_context, fp.f.f[22]); + OFFSET(KVM_ARCH_FP_F_F23, kvm_cpu_context, fp.f.f[23]); + OFFSET(KVM_ARCH_FP_F_F24, kvm_cpu_context, fp.f.f[24]); + OFFSET(KVM_ARCH_FP_F_F25, kvm_cpu_context, fp.f.f[25]); + OFFSET(KVM_ARCH_FP_F_F26, kvm_cpu_context, fp.f.f[26]); + OFFSET(KVM_ARCH_FP_F_F27, kvm_cpu_context, fp.f.f[27]); + OFFSET(KVM_ARCH_FP_F_F28, kvm_cpu_context, fp.f.f[28]); + OFFSET(KVM_ARCH_FP_F_F29, kvm_cpu_context, fp.f.f[29]); + OFFSET(KVM_ARCH_FP_F_F30, kvm_cpu_context, fp.f.f[30]); + OFFSET(KVM_ARCH_FP_F_F31, kvm_cpu_context, fp.f.f[31]); + OFFSET(KVM_ARCH_FP_F_FCSR, kvm_cpu_context, fp.f.fcsr); + + /* D extension */ + + OFFSET(KVM_ARCH_FP_D_F0, kvm_cpu_context, fp.d.f[0]); + OFFSET(KVM_ARCH_FP_D_F1, kvm_cpu_context, fp.d.f[1]); + OFFSET(KVM_ARCH_FP_D_F2, kvm_cpu_context, fp.d.f[2]); + OFFSET(KVM_ARCH_FP_D_F3, kvm_cpu_context, fp.d.f[3]); + OFFSET(KVM_ARCH_FP_D_F4, kvm_cpu_context, fp.d.f[4]); + OFFSET(KVM_ARCH_FP_D_F5, kvm_cpu_context, fp.d.f[5]); + OFFSET(KVM_ARCH_FP_D_F6, kvm_cpu_context, fp.d.f[6]); + OFFSET(KVM_ARCH_FP_D_F7, kvm_cpu_context, fp.d.f[7]); + OFFSET(KVM_ARCH_FP_D_F8, kvm_cpu_context, fp.d.f[8]); + OFFSET(KVM_ARCH_FP_D_F9, kvm_cpu_context, fp.d.f[9]); + OFFSET(KVM_ARCH_FP_D_F10, kvm_cpu_context, fp.d.f[10]); + OFFSET(KVM_ARCH_FP_D_F11, kvm_cpu_context, fp.d.f[11]); + OFFSET(KVM_ARCH_FP_D_F12, kvm_cpu_context, fp.d.f[12]); + OFFSET(KVM_ARCH_FP_D_F13, kvm_cpu_context, fp.d.f[13]); + OFFSET(KVM_ARCH_FP_D_F14, kvm_cpu_context, fp.d.f[14]); + OFFSET(KVM_ARCH_FP_D_F15, kvm_cpu_context, fp.d.f[15]); + OFFSET(KVM_ARCH_FP_D_F16, kvm_cpu_context, fp.d.f[16]); + OFFSET(KVM_ARCH_FP_D_F17, kvm_cpu_context, fp.d.f[17]); + OFFSET(KVM_ARCH_FP_D_F18, kvm_cpu_context, fp.d.f[18]); + OFFSET(KVM_ARCH_FP_D_F19, kvm_cpu_context, fp.d.f[19]); + OFFSET(KVM_ARCH_FP_D_F20, kvm_cpu_context, fp.d.f[20]); + OFFSET(KVM_ARCH_FP_D_F21, kvm_cpu_context, fp.d.f[21]); + OFFSET(KVM_ARCH_FP_D_F22, kvm_cpu_context, fp.d.f[22]); + OFFSET(KVM_ARCH_FP_D_F23, kvm_cpu_context, fp.d.f[23]); + OFFSET(KVM_ARCH_FP_D_F24, kvm_cpu_context, fp.d.f[24]); + OFFSET(KVM_ARCH_FP_D_F25, kvm_cpu_context, fp.d.f[25]); + OFFSET(KVM_ARCH_FP_D_F26, kvm_cpu_context, fp.d.f[26]); + OFFSET(KVM_ARCH_FP_D_F27, kvm_cpu_context, fp.d.f[27]); + OFFSET(KVM_ARCH_FP_D_F28, kvm_cpu_context, fp.d.f[28]); + OFFSET(KVM_ARCH_FP_D_F29, kvm_cpu_context, fp.d.f[29]); + OFFSET(KVM_ARCH_FP_D_F30, kvm_cpu_context, fp.d.f[30]); + OFFSET(KVM_ARCH_FP_D_F31, kvm_cpu_context, fp.d.f[31]); + OFFSET(KVM_ARCH_FP_D_FCSR, kvm_cpu_context, fp.d.fcsr); + /* * THREAD_{F,X}* might be larger than a S-type offset can handle, but * these are used in performance-sensitive assembly so we can't resort diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig new file mode 100644 index 000000000000..f5a342fa1b1d --- /dev/null +++ b/arch/riscv/kvm/Kconfig @@ -0,0 +1,35 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# KVM configuration +# + +source "virt/kvm/Kconfig" + +menuconfig VIRTUALIZATION + bool "Virtualization" + help + Say Y here to get to see options for using your Linux host to run + other operating systems inside virtual machines (guests). + This option alone does not add any kernel code. + + If you say N, all options in this submenu will be skipped and + disabled. + +if VIRTUALIZATION + +config KVM + tristate "Kernel-based Virtual Machine (KVM) support (EXPERIMENTAL)" + depends on RISCV_SBI && MMU + select MMU_NOTIFIER + select PREEMPT_NOTIFIERS + select KVM_MMIO + select KVM_GENERIC_DIRTYLOG_READ_PROTECT + select HAVE_KVM_VCPU_ASYNC_IOCTL + select HAVE_KVM_EVENTFD + select SRCU + help + Support hosting virtualized guest machines. + + If unsure, say N. + +endif # VIRTUALIZATION diff --git a/arch/riscv/kvm/Makefile b/arch/riscv/kvm/Makefile new file mode 100644 index 000000000000..30cdd1df0098 --- /dev/null +++ b/arch/riscv/kvm/Makefile @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for RISC-V KVM support +# + +ccflags-y += -I $(srctree)/$(src) + +KVM := ../../../virt/kvm + +obj-$(CONFIG_KVM) += kvm.o + +kvm-y += $(KVM)/kvm_main.o +kvm-y += $(KVM)/coalesced_mmio.o +kvm-y += $(KVM)/binary_stats.o +kvm-y += $(KVM)/eventfd.o +kvm-y += main.o +kvm-y += vm.o +kvm-y += vmid.o +kvm-y += tlb.o +kvm-y += mmu.o +kvm-y += vcpu.o +kvm-y += vcpu_exit.o +kvm-y += vcpu_fp.o +kvm-y += vcpu_switch.o +kvm-y += vcpu_sbi.o +kvm-y += vcpu_timer.o diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c new file mode 100644 index 000000000000..421ecf4e6360 --- /dev/null +++ b/arch/riscv/kvm/main.c @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Western Digital Corporation or its affiliates. + * + * Authors: + * Anup Patel <anup.patel@wdc.com> + */ + +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/module.h> +#include <linux/kvm_host.h> +#include <asm/csr.h> +#include <asm/hwcap.h> +#include <asm/sbi.h> + +long kvm_arch_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + return -EINVAL; +} + +int kvm_arch_check_processor_compat(void *opaque) +{ + return 0; +} + +int kvm_arch_hardware_setup(void *opaque) +{ + return 0; +} + +int kvm_arch_hardware_enable(void) +{ + unsigned long hideleg, hedeleg; + + hedeleg = 0; + hedeleg |= (1UL << EXC_INST_MISALIGNED); + hedeleg |= (1UL << EXC_BREAKPOINT); + hedeleg |= (1UL << EXC_SYSCALL); + hedeleg |= (1UL << EXC_INST_PAGE_FAULT); + hedeleg |= (1UL << EXC_LOAD_PAGE_FAULT); + hedeleg |= (1UL << EXC_STORE_PAGE_FAULT); + csr_write(CSR_HEDELEG, hedeleg); + + hideleg = 0; + hideleg |= (1UL << IRQ_VS_SOFT); + hideleg |= (1UL << IRQ_VS_TIMER); + hideleg |= (1UL << IRQ_VS_EXT); + csr_write(CSR_HIDELEG, hideleg); + + csr_write(CSR_HCOUNTEREN, -1UL); + + csr_write(CSR_HVIP, 0); + + return 0; +} + +void kvm_arch_hardware_disable(void) +{ + csr_write(CSR_HEDELEG, 0); + csr_write(CSR_HIDELEG, 0); +} + +int kvm_arch_init(void *opaque) +{ + const char *str; + + if (!riscv_isa_extension_available(NULL, h)) { + kvm_info("hypervisor extension not available\n"); + return -ENODEV; + } + + if (sbi_spec_is_0_1()) { + kvm_info("require SBI v0.2 or higher\n"); + return -ENODEV; + } + + if (sbi_probe_extension(SBI_EXT_RFENCE) <= 0) { + kvm_info("require SBI RFENCE extension\n"); + return -ENODEV; + } + + kvm_riscv_stage2_mode_detect(); + + kvm_riscv_stage2_vmid_detect(); + + kvm_info("hypervisor extension available\n"); + + switch (kvm_riscv_stage2_mode()) { + case HGATP_MODE_SV32X4: + str = "Sv32x4"; + break; + case HGATP_MODE_SV39X4: + str = "Sv39x4"; + break; + case HGATP_MODE_SV48X4: + str = "Sv48x4"; + break; + default: + return -ENODEV; + } + kvm_info("using %s G-stage page table format\n", str); + + kvm_info("VMID %ld bits available\n", kvm_riscv_stage2_vmid_bits()); + + return 0; +} + +void kvm_arch_exit(void) +{ +} + +static int riscv_kvm_init(void) +{ + return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); +} +module_init(riscv_kvm_init); diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c new file mode 100644 index 000000000000..d81bae8eb55e --- /dev/null +++ b/arch/riscv/kvm/mmu.c @@ -0,0 +1,802 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Western Digital Corporation or its affiliates. + * + * Authors: + * Anup Patel <anup.patel@wdc.com> + */ + +#include <linux/bitops.h> +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/hugetlb.h> +#include <linux/module.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> +#include <linux/kvm_host.h> +#include <linux/sched/signal.h> +#include <asm/csr.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/sbi.h> + +#ifdef CONFIG_64BIT +static unsigned long stage2_mode = (HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT); +static unsigned long stage2_pgd_levels = 3; +#define stage2_index_bits 9 +#else +static unsigned long stage2_mode = (HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT); +static unsigned long stage2_pgd_levels = 2; +#define stage2_index_bits 10 +#endif + +#define stage2_pgd_xbits 2 +#define stage2_pgd_size (1UL << (HGATP_PAGE_SHIFT + stage2_pgd_xbits)) +#define stage2_gpa_bits (HGATP_PAGE_SHIFT + \ + (stage2_pgd_levels * stage2_index_bits) + \ + stage2_pgd_xbits) +#define stage2_gpa_size ((gpa_t)(1ULL << stage2_gpa_bits)) + +#define stage2_pte_leaf(__ptep) \ + (pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)) + +static inline unsigned long stage2_pte_index(gpa_t addr, u32 level) +{ + unsigned long mask; + unsigned long shift = HGATP_PAGE_SHIFT + (stage2_index_bits * level); + + if (level == (stage2_pgd_levels - 1)) + mask = (PTRS_PER_PTE * (1UL << stage2_pgd_xbits)) - 1; + else + mask = PTRS_PER_PTE - 1; + + return (addr >> shift) & mask; +} + +static inline unsigned long stage2_pte_page_vaddr(pte_t pte) +{ + return (unsigned long)pfn_to_virt(pte_val(pte) >> _PAGE_PFN_SHIFT); +} + +static int stage2_page_size_to_level(unsigned long page_size, u32 *out_level) +{ + u32 i; + unsigned long psz = 1UL << 12; + + for (i = 0; i < stage2_pgd_levels; i++) { + if (page_size == (psz << (i * stage2_index_bits))) { + *out_level = i; + return 0; + } + } + + return -EINVAL; +} + +static int stage2_level_to_page_size(u32 level, unsigned long *out_pgsize) +{ + if (stage2_pgd_levels < level) + return -EINVAL; + + *out_pgsize = 1UL << (12 + (level * stage2_index_bits)); + + return 0; +} + +static int stage2_cache_topup(struct kvm_mmu_page_cache *pcache, + int min, int max) +{ + void *page; + + BUG_ON(max > KVM_MMU_PAGE_CACHE_NR_OBJS); + if (pcache->nobjs >= min) + return 0; + while (pcache->nobjs < max) { + page = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + return -ENOMEM; + pcache->objects[pcache->nobjs++] = page; + } + + return 0; +} + +static void stage2_cache_flush(struct kvm_mmu_page_cache *pcache) +{ + while (pcache && pcache->nobjs) + free_page((unsigned long)pcache->objects[--pcache->nobjs]); +} + +static void *stage2_cache_alloc(struct kvm_mmu_page_cache *pcache) +{ + void *p; + + if (!pcache) + return NULL; + + BUG_ON(!pcache->nobjs); + p = pcache->objects[--pcache->nobjs]; + + return p; +} + +static bool stage2_get_leaf_entry(struct kvm *kvm, gpa_t addr, + pte_t **ptepp, u32 *ptep_level) +{ + pte_t *ptep; + u32 current_level = stage2_pgd_levels - 1; + + *ptep_level = current_level; + ptep = (pte_t *)kvm->arch.pgd; + ptep = &ptep[stage2_pte_index(addr, current_level)]; + while (ptep && pte_val(*ptep)) { + if (stage2_pte_leaf(ptep)) { + *ptep_level = current_level; + *ptepp = ptep; + return true; + } + + if (current_level) { + current_level--; + *ptep_level = current_level; + ptep = (pte_t *)stage2_pte_page_vaddr(*ptep); + ptep = &ptep[stage2_pte_index(addr, current_level)]; + } else { + ptep = NULL; + } + } + + return false; +} + +static void stage2_remote_tlb_flush(struct kvm *kvm, u32 level, gpa_t addr) +{ + struct cpumask hmask; + unsigned long size = PAGE_SIZE; + struct kvm_vmid *vmid = &kvm->arch.vmid; + + if (stage2_level_to_page_size(level, &size)) + return; + addr &= ~(size - 1); + + /* + * TODO: Instead of cpu_online_mask, we should only target CPUs + * where the Guest/VM is running. + */ + preempt_disable(); + riscv_cpuid_to_hartid_mask(cpu_online_mask, &hmask); + sbi_remote_hfence_gvma_vmid(cpumask_bits(&hmask), addr, size, + READ_ONCE(vmid->vmid)); + preempt_enable(); +} + +static int stage2_set_pte(struct kvm *kvm, u32 level, + struct kvm_mmu_page_cache *pcache, + gpa_t addr, const pte_t *new_pte) +{ + u32 current_level = stage2_pgd_levels - 1; + pte_t *next_ptep = (pte_t *)kvm->arch.pgd; + pte_t *ptep = &next_ptep[stage2_pte_index(addr, current_level)]; + + if (current_level < level) + return -EINVAL; + + while (current_level != level) { + if (stage2_pte_leaf(ptep)) + return -EEXIST; + + if (!pte_val(*ptep)) { + next_ptep = stage2_cache_alloc(pcache); + if (!next_ptep) + return -ENOMEM; + *ptep = pfn_pte(PFN_DOWN(__pa(next_ptep)), + __pgprot(_PAGE_TABLE)); + } else { + if (stage2_pte_leaf(ptep)) + return -EEXIST; + next_ptep = (pte_t *)stage2_pte_page_vaddr(*ptep); + } + + current_level--; + ptep = &next_ptep[stage2_pte_index(addr, current_level)]; + } + + *ptep = *new_pte; + if (stage2_pte_leaf(ptep)) + stage2_remote_tlb_flush(kvm, current_level, addr); + + return 0; +} + +static int stage2_map_page(struct kvm *kvm, + struct kvm_mmu_page_cache *pcache, + gpa_t gpa, phys_addr_t hpa, + unsigned long page_size, + bool page_rdonly, bool page_exec) +{ + int ret; + u32 level = 0; + pte_t new_pte; + pgprot_t prot; + + ret = stage2_page_size_to_level(page_size, &level); + if (ret) + return ret; + + /* + * A RISC-V implementation can choose to either: + * 1) Update 'A' and 'D' PTE bits in hardware + * 2) Generate page fault when 'A' and/or 'D' bits are not set + * PTE so that software can update these bits. + * + * We support both options mentioned above. To achieve this, we + * always set 'A' and 'D' PTE bits at time of creating stage2 + * mapping. To support KVM dirty page logging with both options + * mentioned above, we will write-protect stage2 PTEs to track + * dirty pages. + */ + + if (page_exec) { + if (page_rdonly) + prot = PAGE_READ_EXEC; + else + prot = PAGE_WRITE_EXEC; + } else { + if (page_rdonly) + prot = PAGE_READ; + else + prot = PAGE_WRITE; + } + new_pte = pfn_pte(PFN_DOWN(hpa), prot); + new_pte = pte_mkdirty(new_pte); + + return stage2_set_pte(kvm, level, pcache, gpa, &new_pte); +} + +enum stage2_op { + STAGE2_OP_NOP = 0, /* Nothing */ + STAGE2_OP_CLEAR, /* Clear/Unmap */ + STAGE2_OP_WP, /* Write-protect */ +}; + +static void stage2_op_pte(struct kvm *kvm, gpa_t addr, + pte_t *ptep, u32 ptep_level, enum stage2_op op) +{ + int i, ret; + pte_t *next_ptep; + u32 next_ptep_level; + unsigned long next_page_size, page_size; + + ret = stage2_level_to_page_size(ptep_level, &page_size); + if (ret) + return; + + BUG_ON(addr & (page_size - 1)); + + if (!pte_val(*ptep)) + return; + + if (ptep_level && !stage2_pte_leaf(ptep)) { + next_ptep = (pte_t *)stage2_pte_page_vaddr(*ptep); + next_ptep_level = ptep_level - 1; + ret = stage2_level_to_page_size(next_ptep_level, + &next_page_size); + if (ret) + return; + + if (op == STAGE2_OP_CLEAR) + set_pte(ptep, __pte(0)); + for (i = 0; i < PTRS_PER_PTE; i++) + stage2_op_pte(kvm, addr + i * next_page_size, + &next_ptep[i], next_ptep_level, op); + if (op == STAGE2_OP_CLEAR) + put_page(virt_to_page(next_ptep)); + } else { + if (op == STAGE2_OP_CLEAR) + set_pte(ptep, __pte(0)); + else if (op == STAGE2_OP_WP) + set_pte(ptep, __pte(pte_val(*ptep) & ~_PAGE_WRITE)); + stage2_remote_tlb_flush(kvm, ptep_level, addr); + } +} + +static void stage2_unmap_range(struct kvm *kvm, gpa_t start, + gpa_t size, bool may_block) +{ + int ret; + pte_t *ptep; + u32 ptep_level; + bool found_leaf; + unsigned long page_size; + gpa_t addr = start, end = start + size; + + while (addr < end) { + found_leaf = stage2_get_leaf_entry(kvm, addr, + &ptep, &ptep_level); + ret = stage2_level_to_page_size(ptep_level, &page_size); + if (ret) + break; + + if (!found_leaf) + goto next; + + if (!(addr & (page_size - 1)) && ((end - addr) >= page_size)) + stage2_op_pte(kvm, addr, ptep, + ptep_level, STAGE2_OP_CLEAR); + +next: + addr += page_size; + + /* + * If the range is too large, release the kvm->mmu_lock + * to prevent starvation and lockup detector warnings. + */ + if (may_block && addr < end) + cond_resched_lock(&kvm->mmu_lock); + } +} + +static void stage2_wp_range(struct kvm *kvm, gpa_t start, gpa_t end) +{ + int ret; + pte_t *ptep; + u32 ptep_level; + bool found_leaf; + gpa_t addr = start; + unsigned long page_size; + + while (addr < end) { + found_leaf = stage2_get_leaf_entry(kvm, addr, + &ptep, &ptep_level); + ret = stage2_level_to_page_size(ptep_level, &page_size); + if (ret) + break; + + if (!found_leaf) + goto next; + + if (!(addr & (page_size - 1)) && ((end - addr) >= page_size)) + stage2_op_pte(kvm, addr, ptep, + ptep_level, STAGE2_OP_WP); + +next: + addr += page_size; + } +} + +static void stage2_wp_memory_region(struct kvm *kvm, int slot) +{ + struct kvm_memslots *slots = kvm_memslots(kvm); + struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); + phys_addr_t start = memslot->base_gfn << PAGE_SHIFT; + phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; + + spin_lock(&kvm->mmu_lock); + stage2_wp_range(kvm, start, end); + spin_unlock(&kvm->mmu_lock); + kvm_flush_remote_tlbs(kvm); +} + +static int stage2_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa, + unsigned long size, bool writable) +{ + pte_t pte; + int ret = 0; + unsigned long pfn; + phys_addr_t addr, end; + struct kvm_mmu_page_cache pcache = { 0, }; + + end = (gpa + size + PAGE_SIZE - 1) & PAGE_MASK; + pfn = __phys_to_pfn(hpa); + + for (addr = gpa; addr < end; addr += PAGE_SIZE) { + pte = pfn_pte(pfn, PAGE_KERNEL); + + if (!writable) + pte = pte_wrprotect(pte); + + ret = stage2_cache_topup(&pcache, + stage2_pgd_levels, + KVM_MMU_PAGE_CACHE_NR_OBJS); + if (ret) + goto out; + + spin_lock(&kvm->mmu_lock); + ret = stage2_set_pte(kvm, 0, &pcache, addr, &pte); + spin_unlock(&kvm->mmu_lock); + if (ret) + goto out; + + pfn++; + } + +out: + stage2_cache_flush(&pcache); + return ret; +} + +void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, + unsigned long mask) +{ + phys_addr_t base_gfn = slot->base_gfn + gfn_offset; + phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; + phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; + + stage2_wp_range(kvm, start, end); +} + +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) +{ +} + +void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, + const struct kvm_memory_slot *memslot) +{ + kvm_flush_remote_tlbs(kvm); +} + +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free) +{ +} + +void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) +{ +} + +void kvm_arch_flush_shadow_all(struct kvm *kvm) +{ + kvm_riscv_stage2_free_pgd(kvm); +} + +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ +} + +void kvm_arch_commit_memory_region(struct kvm *kvm, + const struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot *old, + const struct kvm_memory_slot *new, + enum kvm_mr_change change) +{ + /* + * At this point memslot has been committed and there is an + * allocated dirty_bitmap[], dirty pages will be tracked while + * the memory slot is write protected. + */ + if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) + stage2_wp_memory_region(kvm, mem->slot); +} + +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + const struct kvm_userspace_memory_region *mem, + enum kvm_mr_change change) +{ + hva_t hva = mem->userspace_addr; + hva_t reg_end = hva + mem->memory_size; + bool writable = !(mem->flags & KVM_MEM_READONLY); + int ret = 0; + + if (change != KVM_MR_CREATE && change != KVM_MR_MOVE && + change != KVM_MR_FLAGS_ONLY) + return 0; + + /* + * Prevent userspace from creating a memory region outside of the GPA + * space addressable by the KVM guest GPA space. + */ + if ((memslot->base_gfn + memslot->npages) >= + (stage2_gpa_size >> PAGE_SHIFT)) + return -EFAULT; + + mmap_read_lock(current->mm); + + /* + * A memory region could potentially cover multiple VMAs, and + * any holes between them, so iterate over all of them to find + * out if we can map any of them right now. + * + * +--------------------------------------------+ + * +---------------+----------------+ +----------------+ + * | : VMA 1 | VMA 2 | | VMA 3 : | + * +---------------+----------------+ +----------------+ + * | memory region | + * +--------------------------------------------+ + */ + do { + struct vm_area_struct *vma = find_vma(current->mm, hva); + hva_t vm_start, vm_end; + + if (!vma || vma->vm_start >= reg_end) + break; + + /* + * Mapping a read-only VMA is only allowed if the + * memory region is configured as read-only. + */ + if (writable && !(vma->vm_flags & VM_WRITE)) { + ret = -EPERM; + break; + } + + /* Take the intersection of this VMA with the memory region */ + vm_start = max(hva, vma->vm_start); + vm_end = min(reg_end, vma->vm_end); + + if (vma->vm_flags & VM_PFNMAP) { + gpa_t gpa = mem->guest_phys_addr + + (vm_start - mem->userspace_addr); + phys_addr_t pa; + + pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT; + pa += vm_start - vma->vm_start; + + /* IO region dirty page logging not allowed */ + if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) { + ret = -EINVAL; + goto out; + } + + ret = stage2_ioremap(kvm, gpa, pa, + vm_end - vm_start, writable); + if (ret) + break; + } + hva = vm_end; + } while (hva < reg_end); + + if (change == KVM_MR_FLAGS_ONLY) + goto out; + + spin_lock(&kvm->mmu_lock); + if (ret) + stage2_unmap_range(kvm, mem->guest_phys_addr, + mem->memory_size, false); + spin_unlock(&kvm->mmu_lock); + +out: + mmap_read_unlock(current->mm); + return ret; +} + +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) +{ + if (!kvm->arch.pgd) + return false; + + stage2_unmap_range(kvm, range->start << PAGE_SHIFT, + (range->end - range->start) << PAGE_SHIFT, + range->may_block); + return false; +} + +bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +{ + int ret; + kvm_pfn_t pfn = pte_pfn(range->pte); + + if (!kvm->arch.pgd) + return false; + + WARN_ON(range->end - range->start != 1); + + ret = stage2_map_page(kvm, NULL, range->start << PAGE_SHIFT, + __pfn_to_phys(pfn), PAGE_SIZE, true, true); + if (ret) { + kvm_debug("Failed to map stage2 page (error %d)\n", ret); + return true; + } + + return false; +} + +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +{ + pte_t *ptep; + u32 ptep_level = 0; + u64 size = (range->end - range->start) << PAGE_SHIFT; + + if (!kvm->arch.pgd) + return false; + + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE); + + if (!stage2_get_leaf_entry(kvm, range->start << PAGE_SHIFT, + &ptep, &ptep_level)) + return false; + + return ptep_test_and_clear_young(NULL, 0, ptep); +} + +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +{ + pte_t *ptep; + u32 ptep_level = 0; + u64 size = (range->end - range->start) << PAGE_SHIFT; + + if (!kvm->arch.pgd) + return false; + + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE); + + if (!stage2_get_leaf_entry(kvm, range->start << PAGE_SHIFT, + &ptep, &ptep_level)) + return false; + + return pte_young(*ptep); +} + +int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, + struct kvm_memory_slot *memslot, + gpa_t gpa, unsigned long hva, bool is_write) +{ + int ret; + kvm_pfn_t hfn; + bool writeable; + short vma_pageshift; + gfn_t gfn = gpa >> PAGE_SHIFT; + struct vm_area_struct *vma; + struct kvm *kvm = vcpu->kvm; + struct kvm_mmu_page_cache *pcache = &vcpu->arch.mmu_page_cache; + bool logging = (memslot->dirty_bitmap && + !(memslot->flags & KVM_MEM_READONLY)) ? true : false; + unsigned long vma_pagesize, mmu_seq; + + mmap_read_lock(current->mm); + + vma = find_vma_intersection(current->mm, hva, hva + 1); + if (unlikely(!vma)) { + kvm_err("Failed to find VMA for hva 0x%lx\n", hva); + mmap_read_unlock(current->mm); + return -EFAULT; + } + + if (is_vm_hugetlb_page(vma)) + vma_pageshift = huge_page_shift(hstate_vma(vma)); + else + vma_pageshift = PAGE_SHIFT; + vma_pagesize = 1ULL << vma_pageshift; + if (logging || (vma->vm_flags & VM_PFNMAP)) + vma_pagesize = PAGE_SIZE; + + if (vma_pagesize == PMD_SIZE || vma_pagesize == PGDIR_SIZE) + gfn = (gpa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT; + + mmap_read_unlock(current->mm); + + if (vma_pagesize != PGDIR_SIZE && + vma_pagesize != PMD_SIZE && + vma_pagesize != PAGE_SIZE) { + kvm_err("Invalid VMA page size 0x%lx\n", vma_pagesize); + return -EFAULT; + } + + /* We need minimum second+third level pages */ + ret = stage2_cache_topup(pcache, stage2_pgd_levels, + KVM_MMU_PAGE_CACHE_NR_OBJS); + if (ret) { + kvm_err("Failed to topup stage2 cache\n"); + return ret; + } + + mmu_seq = kvm->mmu_notifier_seq; + + hfn = gfn_to_pfn_prot(kvm, gfn, is_write, &writeable); + if (hfn == KVM_PFN_ERR_HWPOISON) { + send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, + vma_pageshift, current); + return 0; + } + if (is_error_noslot_pfn(hfn)) + return -EFAULT; + + /* + * If logging is active then we allow writable pages only + * for write faults. + */ + if (logging && !is_write) + writeable = false; + + spin_lock(&kvm->mmu_lock); + + if (mmu_notifier_retry(kvm, mmu_seq)) + goto out_unlock; + + if (writeable) { + kvm_set_pfn_dirty(hfn); + mark_page_dirty(kvm, gfn); + ret = stage2_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT, + vma_pagesize, false, true); + } else { + ret = stage2_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT, + vma_pagesize, true, true); + } + + if (ret) + kvm_err("Failed to map in stage2\n"); + +out_unlock: + spin_unlock(&kvm->mmu_lock); + kvm_set_pfn_accessed(hfn); + kvm_release_pfn_clean(hfn); + return ret; +} + +void kvm_riscv_stage2_flush_cache(struct kvm_vcpu *vcpu) +{ + stage2_cache_flush(&vcpu->arch.mmu_page_cache); +} + +int kvm_riscv_stage2_alloc_pgd(struct kvm *kvm) +{ + struct page *pgd_page; + + if (kvm->arch.pgd != NULL) { + kvm_err("kvm_arch already initialized?\n"); + return -EINVAL; + } + + pgd_page = alloc_pages(GFP_KERNEL | __GFP_ZERO, + get_order(stage2_pgd_size)); + if (!pgd_page) + return -ENOMEM; + kvm->arch.pgd = page_to_virt(pgd_page); + kvm->arch.pgd_phys = page_to_phys(pgd_page); + + return 0; +} + +void kvm_riscv_stage2_free_pgd(struct kvm *kvm) +{ + void *pgd = NULL; + + spin_lock(&kvm->mmu_lock); + if (kvm->arch.pgd) { + stage2_unmap_range(kvm, 0UL, stage2_gpa_size, false); + pgd = READ_ONCE(kvm->arch.pgd); + kvm->arch.pgd = NULL; + kvm->arch.pgd_phys = 0; + } + spin_unlock(&kvm->mmu_lock); + + if (pgd) + free_pages((unsigned long)pgd, get_order(stage2_pgd_size)); +} + +void kvm_riscv_stage2_update_hgatp(struct kvm_vcpu *vcpu) +{ + unsigned long hgatp = stage2_mode; + struct kvm_arch *k = &vcpu->kvm->arch; + + hgatp |= (READ_ONCE(k->vmid.vmid) << HGATP_VMID_SHIFT) & + HGATP_VMID_MASK; + hgatp |= (k->pgd_phys >> PAGE_SHIFT) & HGATP_PPN; + + csr_write(CSR_HGATP, hgatp); + + if (!kvm_riscv_stage2_vmid_bits()) + __kvm_riscv_hfence_gvma_all(); +} + +void kvm_riscv_stage2_mode_detect(void) +{ +#ifdef CONFIG_64BIT + /* Try Sv48x4 stage2 mode */ + csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT); + if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) { + stage2_mode = (HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT); + stage2_pgd_levels = 4; + } + csr_write(CSR_HGATP, 0); + + __kvm_riscv_hfence_gvma_all(); +#endif +} + +unsigned long kvm_riscv_stage2_mode(void) +{ + return stage2_mode >> HGATP_MODE_SHIFT; +} diff --git a/arch/riscv/kvm/tlb.S b/arch/riscv/kvm/tlb.S new file mode 100644 index 000000000000..899f75d60bad --- /dev/null +++ b/arch/riscv/kvm/tlb.S @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 Western Digital Corporation or its affiliates. + * + * Authors: + * Anup Patel <anup.patel@wdc.com> + */ + +#include <linux/linkage.h> +#include <asm/asm.h> + + .text + .altmacro + .option norelax + + /* + * Instruction encoding of hfence.gvma is: + * HFENCE.GVMA rs1, rs2 + * HFENCE.GVMA zero, rs2 + * HFENCE.GVMA rs1 + * HFENCE.GVMA + * + * rs1!=zero and rs2!=zero ==> HFENCE.GVMA rs1, rs2 + * rs1==zero and rs2!=zero ==> HFENCE.GVMA zero, rs2 + * rs1!=zero and rs2==zero ==> HFENCE.GVMA rs1 + * rs1==zero and rs2==zero ==> HFENCE.GVMA + * + * Instruction encoding of HFENCE.GVMA is: + * 0110001 rs2(5) rs1(5) 000 00000 1110011 + */ + +ENTRY(__kvm_riscv_hfence_gvma_vmid_gpa) + /* + * rs1 = a0 (GPA >> 2) + * rs2 = a1 (VMID) + * HFENCE.GVMA a0, a1 + * 0110001 01011 01010 000 00000 1110011 + */ + .word 0x62b50073 + ret +ENDPROC(__kvm_riscv_hfence_gvma_vmid_gpa) + +ENTRY(__kvm_riscv_hfence_gvma_vmid) + /* + * rs1 = zero + * rs2 = a0 (VMID) + * HFENCE.GVMA zero, a0 + * 0110001 01010 00000 000 00000 1110011 + */ + .word 0x62a00073 + ret +ENDPROC(__kvm_riscv_hfence_gvma_vmid) + +ENTRY(__kvm_riscv_hfence_gvma_gpa) + /* + * rs1 = a0 (GPA >> 2) + * rs2 = zero + * HFENCE.GVMA a0 + * 0110001 00000 01010 000 00000 1110011 + */ + .word 0x62050073 + ret +ENDPROC(__kvm_riscv_hfence_gvma_gpa) + +ENTRY(__kvm_riscv_hfence_gvma_all) + /* + * rs1 = zero + * rs2 = zero + * HFENCE.GVMA + * 0110001 00000 00000 000 00000 1110011 + */ + .word 0x62000073 + ret +ENDPROC(__kvm_riscv_hfence_gvma_all) diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c new file mode 100644 index 000000000000..e3d3aed46184 --- /dev/null +++ b/arch/riscv/kvm/vcpu.c @@ -0,0 +1,825 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Western Digital Corporation or its affiliates. + * + * Authors: + * Anup Patel <anup.patel@wdc.com> + */ + +#include <linux/bitops.h> +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/kdebug.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> +#include <linux/sched/signal.h> +#include <linux/fs.h> +#include <linux/kvm_host.h> +#include <asm/csr.h> +#include <asm/hwcap.h> + +const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { + KVM_GENERIC_VCPU_STATS(), + STATS_DESC_COUNTER(VCPU, ecall_exit_stat), + STATS_DESC_COUNTER(VCPU, wfi_exit_stat), + STATS_DESC_COUNTER(VCPU, mmio_exit_user), + STATS_DESC_COUNTER(VCPU, mmio_exit_kernel), + STATS_DESC_COUNTER(VCPU, exits) +}; + +const struct kvm_stats_header kvm_vcpu_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vcpu_stats_desc), +}; + +#define KVM_RISCV_ISA_ALLOWED (riscv_isa_extension_mask(a) | \ + riscv_isa_extension_mask(c) | \ + riscv_isa_extension_mask(d) | \ + riscv_isa_extension_mask(f) | \ + riscv_isa_extension_mask(i) | \ + riscv_isa_extension_mask(m) | \ + riscv_isa_extension_mask(s) | \ + riscv_isa_extension_mask(u)) + +static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; + struct kvm_vcpu_csr *reset_csr = &vcpu->arch.guest_reset_csr; + struct kvm_cpu_context *cntx = &vcpu->arch.guest_context; + struct kvm_cpu_context *reset_cntx = &vcpu->arch.guest_reset_context; + + memcpy(csr, reset_csr, sizeof(*csr)); + + memcpy(cntx, reset_cntx, sizeof(*cntx)); + + kvm_riscv_vcpu_fp_reset(vcpu); + + kvm_riscv_vcpu_timer_reset(vcpu); + + WRITE_ONCE(vcpu->arch.irqs_pending, 0); + WRITE_ONCE(vcpu->arch.irqs_pending_mask, 0); +} + +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) +{ + return 0; +} + +int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) +{ + struct kvm_cpu_context *cntx; + + /* Mark this VCPU never ran */ + vcpu->arch.ran_atleast_once = false; + + /* Setup ISA features available to VCPU */ + vcpu->arch.isa = riscv_isa_extension_base(NULL) & KVM_RISCV_ISA_ALLOWED; + + /* Setup reset state of shadow SSTATUS and HSTATUS CSRs */ + cntx = &vcpu->arch.guest_reset_context; + cntx->sstatus = SR_SPP | SR_SPIE; + cntx->hstatus = 0; + cntx->hstatus |= HSTATUS_VTW; + cntx->hstatus |= HSTATUS_SPVP; + cntx->hstatus |= HSTATUS_SPV; + + /* Setup VCPU timer */ + kvm_riscv_vcpu_timer_init(vcpu); + + /* Reset VCPU */ + kvm_riscv_reset_vcpu(vcpu); + + return 0; +} + +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +{ +} + +void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + /* Cleanup VCPU timer */ + kvm_riscv_vcpu_timer_deinit(vcpu); + + /* Flush the pages pre-allocated for Stage2 page table mappings */ + kvm_riscv_stage2_flush_cache(vcpu); +} + +int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) +{ + return kvm_riscv_vcpu_has_interrupts(vcpu, 1UL << IRQ_VS_TIMER); +} + +void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) +{ +} + +void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) +{ +} + +int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) +{ + return (kvm_riscv_vcpu_has_interrupts(vcpu, -1UL) && + !vcpu->arch.power_off && !vcpu->arch.pause); +} + +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) +{ + return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; +} + +bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) +{ + return (vcpu->arch.guest_context.sstatus & SR_SPP) ? true : false; +} + +vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + +static int kvm_riscv_vcpu_get_reg_config(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_CONFIG); + unsigned long reg_val; + + if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) + return -EINVAL; + + switch (reg_num) { + case KVM_REG_RISCV_CONFIG_REG(isa): + reg_val = vcpu->arch.isa; + break; + default: + return -EINVAL; + } + + if (copy_to_user(uaddr, ®_val, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + return 0; +} + +static int kvm_riscv_vcpu_set_reg_config(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_CONFIG); + unsigned long reg_val; + + if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) + return -EINVAL; + + if (copy_from_user(®_val, uaddr, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + switch (reg_num) { + case KVM_REG_RISCV_CONFIG_REG(isa): + if (!vcpu->arch.ran_atleast_once) { + vcpu->arch.isa = reg_val; + vcpu->arch.isa &= riscv_isa_extension_base(NULL); + vcpu->arch.isa &= KVM_RISCV_ISA_ALLOWED; + kvm_riscv_vcpu_fp_reset(vcpu); + } else { + return -EOPNOTSUPP; + } + break; + default: + return -EINVAL; + } + + return 0; +} + +static int kvm_riscv_vcpu_get_reg_core(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + struct kvm_cpu_context *cntx = &vcpu->arch.guest_context; + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_CORE); + unsigned long reg_val; + + if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) + return -EINVAL; + if (reg_num >= sizeof(struct kvm_riscv_core) / sizeof(unsigned long)) + return -EINVAL; + + if (reg_num == KVM_REG_RISCV_CORE_REG(regs.pc)) + reg_val = cntx->sepc; + else if (KVM_REG_RISCV_CORE_REG(regs.pc) < reg_num && + reg_num <= KVM_REG_RISCV_CORE_REG(regs.t6)) + reg_val = ((unsigned long *)cntx)[reg_num]; + else if (reg_num == KVM_REG_RISCV_CORE_REG(mode)) + reg_val = (cntx->sstatus & SR_SPP) ? + KVM_RISCV_MODE_S : KVM_RISCV_MODE_U; + else + return -EINVAL; + + if (copy_to_user(uaddr, ®_val, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + return 0; +} + +static int kvm_riscv_vcpu_set_reg_core(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + struct kvm_cpu_context *cntx = &vcpu->arch.guest_context; + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_CORE); + unsigned long reg_val; + + if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) + return -EINVAL; + if (reg_num >= sizeof(struct kvm_riscv_core) / sizeof(unsigned long)) + return -EINVAL; + + if (copy_from_user(®_val, uaddr, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + if (reg_num == KVM_REG_RISCV_CORE_REG(regs.pc)) + cntx->sepc = reg_val; + else if (KVM_REG_RISCV_CORE_REG(regs.pc) < reg_num && + reg_num <= KVM_REG_RISCV_CORE_REG(regs.t6)) + ((unsigned long *)cntx)[reg_num] = reg_val; + else if (reg_num == KVM_REG_RISCV_CORE_REG(mode)) { + if (reg_val == KVM_RISCV_MODE_S) + cntx->sstatus |= SR_SPP; + else + cntx->sstatus &= ~SR_SPP; + } else + return -EINVAL; + + return 0; +} + +static int kvm_riscv_vcpu_get_reg_csr(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_CSR); + unsigned long reg_val; + + if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) + return -EINVAL; + if (reg_num >= sizeof(struct kvm_riscv_csr) / sizeof(unsigned long)) + return -EINVAL; + + if (reg_num == KVM_REG_RISCV_CSR_REG(sip)) { + kvm_riscv_vcpu_flush_interrupts(vcpu); + reg_val = (csr->hvip >> VSIP_TO_HVIP_SHIFT) & VSIP_VALID_MASK; + } else + reg_val = ((unsigned long *)csr)[reg_num]; + + if (copy_to_user(uaddr, ®_val, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + return 0; +} + +static int kvm_riscv_vcpu_set_reg_csr(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_CSR); + unsigned long reg_val; + + if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) + return -EINVAL; + if (reg_num >= sizeof(struct kvm_riscv_csr) / sizeof(unsigned long)) + return -EINVAL; + + if (copy_from_user(®_val, uaddr, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + if (reg_num == KVM_REG_RISCV_CSR_REG(sip)) { + reg_val &= VSIP_VALID_MASK; + reg_val <<= VSIP_TO_HVIP_SHIFT; + } + + ((unsigned long *)csr)[reg_num] = reg_val; + + if (reg_num == KVM_REG_RISCV_CSR_REG(sip)) + WRITE_ONCE(vcpu->arch.irqs_pending_mask, 0); + + return 0; +} + +static int kvm_riscv_vcpu_set_reg(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_CONFIG) + return kvm_riscv_vcpu_set_reg_config(vcpu, reg); + else if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_CORE) + return kvm_riscv_vcpu_set_reg_core(vcpu, reg); + else if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_CSR) + return kvm_riscv_vcpu_set_reg_csr(vcpu, reg); + else if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_TIMER) + return kvm_riscv_vcpu_set_reg_timer(vcpu, reg); + else if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_FP_F) + return kvm_riscv_vcpu_set_reg_fp(vcpu, reg, + KVM_REG_RISCV_FP_F); + else if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_FP_D) + return kvm_riscv_vcpu_set_reg_fp(vcpu, reg, + KVM_REG_RISCV_FP_D); + + return -EINVAL; +} + +static int kvm_riscv_vcpu_get_reg(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_CONFIG) + return kvm_riscv_vcpu_get_reg_config(vcpu, reg); + else if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_CORE) + return kvm_riscv_vcpu_get_reg_core(vcpu, reg); + else if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_CSR) + return kvm_riscv_vcpu_get_reg_csr(vcpu, reg); + else if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_TIMER) + return kvm_riscv_vcpu_get_reg_timer(vcpu, reg); + else if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_FP_F) + return kvm_riscv_vcpu_get_reg_fp(vcpu, reg, + KVM_REG_RISCV_FP_F); + else if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_FP_D) + return kvm_riscv_vcpu_get_reg_fp(vcpu, reg, + KVM_REG_RISCV_FP_D); + + return -EINVAL; +} + +long kvm_arch_vcpu_async_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = (void __user *)arg; + + if (ioctl == KVM_INTERRUPT) { + struct kvm_interrupt irq; + + if (copy_from_user(&irq, argp, sizeof(irq))) + return -EFAULT; + + if (irq.irq == KVM_INTERRUPT_SET) + return kvm_riscv_vcpu_set_interrupt(vcpu, IRQ_VS_EXT); + else + return kvm_riscv_vcpu_unset_interrupt(vcpu, IRQ_VS_EXT); + } + + return -ENOIOCTLCMD; +} + +long kvm_arch_vcpu_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = (void __user *)arg; + long r = -EINVAL; + + switch (ioctl) { + case KVM_SET_ONE_REG: + case KVM_GET_ONE_REG: { + struct kvm_one_reg reg; + + r = -EFAULT; + if (copy_from_user(®, argp, sizeof(reg))) + break; + + if (ioctl == KVM_SET_ONE_REG) + r = kvm_riscv_vcpu_set_reg(vcpu, ®); + else + r = kvm_riscv_vcpu_get_reg(vcpu, ®); + break; + } + default: + break; + } + + return r; +} + +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + return -EINVAL; +} + +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + return -EINVAL; +} + +int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + return -EINVAL; +} + +int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + return -EINVAL; +} + +int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, + struct kvm_translation *tr) +{ + return -EINVAL; +} + +int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + return -EINVAL; +} + +int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + return -EINVAL; +} + +void kvm_riscv_vcpu_flush_interrupts(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; + unsigned long mask, val; + + if (READ_ONCE(vcpu->arch.irqs_pending_mask)) { + mask = xchg_acquire(&vcpu->arch.irqs_pending_mask, 0); + val = READ_ONCE(vcpu->arch.irqs_pending) & mask; + + csr->hvip &= ~mask; + csr->hvip |= val; + } +} + +void kvm_riscv_vcpu_sync_interrupts(struct kvm_vcpu *vcpu) +{ + unsigned long hvip; + struct kvm_vcpu_arch *v = &vcpu->arch; + struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; + + /* Read current HVIP and VSIE CSRs */ + csr->vsie = csr_read(CSR_VSIE); + + /* Sync-up HVIP.VSSIP bit changes does by Guest */ + hvip = csr_read(CSR_HVIP); + if ((csr->hvip ^ hvip) & (1UL << IRQ_VS_SOFT)) { + if (hvip & (1UL << IRQ_VS_SOFT)) { + if (!test_and_set_bit(IRQ_VS_SOFT, + &v->irqs_pending_mask)) + set_bit(IRQ_VS_SOFT, &v->irqs_pending); + } else { + if (!test_and_set_bit(IRQ_VS_SOFT, + &v->irqs_pending_mask)) + clear_bit(IRQ_VS_SOFT, &v->irqs_pending); + } + } +} + +int kvm_riscv_vcpu_set_interrupt(struct kvm_vcpu *vcpu, unsigned int irq) +{ + if (irq != IRQ_VS_SOFT && + irq != IRQ_VS_TIMER && + irq != IRQ_VS_EXT) + return -EINVAL; + + set_bit(irq, &vcpu->arch.irqs_pending); + smp_mb__before_atomic(); + set_bit(irq, &vcpu->arch.irqs_pending_mask); + + kvm_vcpu_kick(vcpu); + + return 0; +} + +int kvm_riscv_vcpu_unset_interrupt(struct kvm_vcpu *vcpu, unsigned int irq) +{ + if (irq != IRQ_VS_SOFT && + irq != IRQ_VS_TIMER && + irq != IRQ_VS_EXT) + return -EINVAL; + + clear_bit(irq, &vcpu->arch.irqs_pending); + smp_mb__before_atomic(); + set_bit(irq, &vcpu->arch.irqs_pending_mask); + + return 0; +} + +bool kvm_riscv_vcpu_has_interrupts(struct kvm_vcpu *vcpu, unsigned long mask) +{ + unsigned long ie = ((vcpu->arch.guest_csr.vsie & VSIP_VALID_MASK) + << VSIP_TO_HVIP_SHIFT) & mask; + + return (READ_ONCE(vcpu->arch.irqs_pending) & ie) ? true : false; +} + +void kvm_riscv_vcpu_power_off(struct kvm_vcpu *vcpu) +{ + vcpu->arch.power_off = true; + kvm_make_request(KVM_REQ_SLEEP, vcpu); + kvm_vcpu_kick(vcpu); +} + +void kvm_riscv_vcpu_power_on(struct kvm_vcpu *vcpu) +{ + vcpu->arch.power_off = false; + kvm_vcpu_wake_up(vcpu); +} + +int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + if (vcpu->arch.power_off) + mp_state->mp_state = KVM_MP_STATE_STOPPED; + else + mp_state->mp_state = KVM_MP_STATE_RUNNABLE; + + return 0; +} + +int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + int ret = 0; + + switch (mp_state->mp_state) { + case KVM_MP_STATE_RUNNABLE: + vcpu->arch.power_off = false; + break; + case KVM_MP_STATE_STOPPED: + kvm_riscv_vcpu_power_off(vcpu); + break; + default: + ret = -EINVAL; + } + + return ret; +} + +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) +{ + /* TODO; To be implemented later. */ + return -EINVAL; +} + +void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; + + csr_write(CSR_VSSTATUS, csr->vsstatus); + csr_write(CSR_VSIE, csr->vsie); + csr_write(CSR_VSTVEC, csr->vstvec); + csr_write(CSR_VSSCRATCH, csr->vsscratch); + csr_write(CSR_VSEPC, csr->vsepc); + csr_write(CSR_VSCAUSE, csr->vscause); + csr_write(CSR_VSTVAL, csr->vstval); + csr_write(CSR_HVIP, csr->hvip); + csr_write(CSR_VSATP, csr->vsatp); + + kvm_riscv_stage2_update_hgatp(vcpu); + + kvm_riscv_vcpu_timer_restore(vcpu); + + kvm_riscv_vcpu_host_fp_save(&vcpu->arch.host_context); + kvm_riscv_vcpu_guest_fp_restore(&vcpu->arch.guest_context, + vcpu->arch.isa); + + vcpu->cpu = cpu; +} + +void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; + + vcpu->cpu = -1; + + kvm_riscv_vcpu_guest_fp_save(&vcpu->arch.guest_context, + vcpu->arch.isa); + kvm_riscv_vcpu_host_fp_restore(&vcpu->arch.host_context); + + csr_write(CSR_HGATP, 0); + + csr->vsstatus = csr_read(CSR_VSSTATUS); + csr->vsie = csr_read(CSR_VSIE); + csr->vstvec = csr_read(CSR_VSTVEC); + csr->vsscratch = csr_read(CSR_VSSCRATCH); + csr->vsepc = csr_read(CSR_VSEPC); + csr->vscause = csr_read(CSR_VSCAUSE); + csr->vstval = csr_read(CSR_VSTVAL); + csr->hvip = csr_read(CSR_HVIP); + csr->vsatp = csr_read(CSR_VSATP); +} + +static void kvm_riscv_check_vcpu_requests(struct kvm_vcpu *vcpu) +{ + struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu); + + if (kvm_request_pending(vcpu)) { + if (kvm_check_request(KVM_REQ_SLEEP, vcpu)) { + rcuwait_wait_event(wait, + (!vcpu->arch.power_off) && (!vcpu->arch.pause), + TASK_INTERRUPTIBLE); + + if (vcpu->arch.power_off || vcpu->arch.pause) { + /* + * Awaken to handle a signal, request to + * sleep again later. + */ + kvm_make_request(KVM_REQ_SLEEP, vcpu); + } + } + + if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu)) + kvm_riscv_reset_vcpu(vcpu); + + if (kvm_check_request(KVM_REQ_UPDATE_HGATP, vcpu)) + kvm_riscv_stage2_update_hgatp(vcpu); + + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) + __kvm_riscv_hfence_gvma_all(); + } +} + +static void kvm_riscv_update_hvip(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; + + csr_write(CSR_HVIP, csr->hvip); +} + +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) +{ + int ret; + struct kvm_cpu_trap trap; + struct kvm_run *run = vcpu->run; + + /* Mark this VCPU ran at least once */ + vcpu->arch.ran_atleast_once = true; + + vcpu->arch.srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + + /* Process MMIO value returned from user-space */ + if (run->exit_reason == KVM_EXIT_MMIO) { + ret = kvm_riscv_vcpu_mmio_return(vcpu, vcpu->run); + if (ret) { + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->arch.srcu_idx); + return ret; + } + } + + /* Process SBI value returned from user-space */ + if (run->exit_reason == KVM_EXIT_RISCV_SBI) { + ret = kvm_riscv_vcpu_sbi_return(vcpu, vcpu->run); + if (ret) { + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->arch.srcu_idx); + return ret; + } + } + + if (run->immediate_exit) { + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->arch.srcu_idx); + return -EINTR; + } + + vcpu_load(vcpu); + + kvm_sigset_activate(vcpu); + + ret = 1; + run->exit_reason = KVM_EXIT_UNKNOWN; + while (ret > 0) { + /* Check conditions before entering the guest */ + cond_resched(); + + kvm_riscv_stage2_vmid_update(vcpu); + + kvm_riscv_check_vcpu_requests(vcpu); + + preempt_disable(); + + local_irq_disable(); + + /* + * Exit if we have a signal pending so that we can deliver + * the signal to user space. + */ + if (signal_pending(current)) { + ret = -EINTR; + run->exit_reason = KVM_EXIT_INTR; + } + + /* + * Ensure we set mode to IN_GUEST_MODE after we disable + * interrupts and before the final VCPU requests check. + * See the comment in kvm_vcpu_exiting_guest_mode() and + * Documentation/virtual/kvm/vcpu-requests.rst + */ + vcpu->mode = IN_GUEST_MODE; + + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->arch.srcu_idx); + smp_mb__after_srcu_read_unlock(); + + /* + * We might have got VCPU interrupts updated asynchronously + * so update it in HW. + */ + kvm_riscv_vcpu_flush_interrupts(vcpu); + + /* Update HVIP CSR for current CPU */ + kvm_riscv_update_hvip(vcpu); + + if (ret <= 0 || + kvm_riscv_stage2_vmid_ver_changed(&vcpu->kvm->arch.vmid) || + kvm_request_pending(vcpu)) { + vcpu->mode = OUTSIDE_GUEST_MODE; + local_irq_enable(); + preempt_enable(); + vcpu->arch.srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + continue; + } + + guest_enter_irqoff(); + + __kvm_riscv_switch_to(&vcpu->arch); + + vcpu->mode = OUTSIDE_GUEST_MODE; + vcpu->stat.exits++; + + /* + * Save SCAUSE, STVAL, HTVAL, and HTINST because we might + * get an interrupt between __kvm_riscv_switch_to() and + * local_irq_enable() which can potentially change CSRs. + */ + trap.sepc = vcpu->arch.guest_context.sepc; + trap.scause = csr_read(CSR_SCAUSE); + trap.stval = csr_read(CSR_STVAL); + trap.htval = csr_read(CSR_HTVAL); + trap.htinst = csr_read(CSR_HTINST); + + /* Syncup interrupts state with HW */ + kvm_riscv_vcpu_sync_interrupts(vcpu); + + /* + * We may have taken a host interrupt in VS/VU-mode (i.e. + * while executing the guest). This interrupt is still + * pending, as we haven't serviced it yet! + * + * We're now back in HS-mode with interrupts disabled + * so enabling the interrupts now will have the effect + * of taking the interrupt again, in HS-mode this time. + */ + local_irq_enable(); + + /* + * We do local_irq_enable() before calling guest_exit() so + * that if a timer interrupt hits while running the guest + * we account that tick as being spent in the guest. We + * enable preemption after calling guest_exit() so that if + * we get preempted we make sure ticks after that is not + * counted as guest time. + */ + guest_exit(); + + preempt_enable(); + + vcpu->arch.srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + + ret = kvm_riscv_vcpu_exit(vcpu, run, &trap); + } + + kvm_sigset_deactivate(vcpu); + + vcpu_put(vcpu); + + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->arch.srcu_idx); + + return ret; +} diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c new file mode 100644 index 000000000000..7f2d742ae4c6 --- /dev/null +++ b/arch/riscv/kvm/vcpu_exit.c @@ -0,0 +1,701 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Western Digital Corporation or its affiliates. + * + * Authors: + * Anup Patel <anup.patel@wdc.com> + */ + +#include <linux/bitops.h> +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/kvm_host.h> +#include <asm/csr.h> + +#define INSN_OPCODE_MASK 0x007c +#define INSN_OPCODE_SHIFT 2 +#define INSN_OPCODE_SYSTEM 28 + +#define INSN_MASK_WFI 0xffffffff +#define INSN_MATCH_WFI 0x10500073 + +#define INSN_MATCH_LB 0x3 +#define INSN_MASK_LB 0x707f +#define INSN_MATCH_LH 0x1003 +#define INSN_MASK_LH 0x707f +#define INSN_MATCH_LW 0x2003 +#define INSN_MASK_LW 0x707f +#define INSN_MATCH_LD 0x3003 +#define INSN_MASK_LD 0x707f +#define INSN_MATCH_LBU 0x4003 +#define INSN_MASK_LBU 0x707f +#define INSN_MATCH_LHU 0x5003 +#define INSN_MASK_LHU 0x707f +#define INSN_MATCH_LWU 0x6003 +#define INSN_MASK_LWU 0x707f +#define INSN_MATCH_SB 0x23 +#define INSN_MASK_SB 0x707f +#define INSN_MATCH_SH 0x1023 +#define INSN_MASK_SH 0x707f +#define INSN_MATCH_SW 0x2023 +#define INSN_MASK_SW 0x707f +#define INSN_MATCH_SD 0x3023 +#define INSN_MASK_SD 0x707f + +#define INSN_MATCH_C_LD 0x6000 +#define INSN_MASK_C_LD 0xe003 +#define INSN_MATCH_C_SD 0xe000 +#define INSN_MASK_C_SD 0xe003 +#define INSN_MATCH_C_LW 0x4000 +#define INSN_MASK_C_LW 0xe003 +#define INSN_MATCH_C_SW 0xc000 +#define INSN_MASK_C_SW 0xe003 +#define INSN_MATCH_C_LDSP 0x6002 +#define INSN_MASK_C_LDSP 0xe003 +#define INSN_MATCH_C_SDSP 0xe002 +#define INSN_MASK_C_SDSP 0xe003 +#define INSN_MATCH_C_LWSP 0x4002 +#define INSN_MASK_C_LWSP 0xe003 +#define INSN_MATCH_C_SWSP 0xc002 +#define INSN_MASK_C_SWSP 0xe003 + +#define INSN_16BIT_MASK 0x3 + +#define INSN_IS_16BIT(insn) (((insn) & INSN_16BIT_MASK) != INSN_16BIT_MASK) + +#define INSN_LEN(insn) (INSN_IS_16BIT(insn) ? 2 : 4) + +#ifdef CONFIG_64BIT +#define LOG_REGBYTES 3 +#else +#define LOG_REGBYTES 2 +#endif +#define REGBYTES (1 << LOG_REGBYTES) + +#define SH_RD 7 +#define SH_RS1 15 +#define SH_RS2 20 +#define SH_RS2C 2 + +#define RV_X(x, s, n) (((x) >> (s)) & ((1 << (n)) - 1)) +#define RVC_LW_IMM(x) ((RV_X(x, 6, 1) << 2) | \ + (RV_X(x, 10, 3) << 3) | \ + (RV_X(x, 5, 1) << 6)) +#define RVC_LD_IMM(x) ((RV_X(x, 10, 3) << 3) | \ + (RV_X(x, 5, 2) << 6)) +#define RVC_LWSP_IMM(x) ((RV_X(x, 4, 3) << 2) | \ + (RV_X(x, 12, 1) << 5) | \ + (RV_X(x, 2, 2) << 6)) +#define RVC_LDSP_IMM(x) ((RV_X(x, 5, 2) << 3) | \ + (RV_X(x, 12, 1) << 5) | \ + (RV_X(x, 2, 3) << 6)) +#define RVC_SWSP_IMM(x) ((RV_X(x, 9, 4) << 2) | \ + (RV_X(x, 7, 2) << 6)) +#define RVC_SDSP_IMM(x) ((RV_X(x, 10, 3) << 3) | \ + (RV_X(x, 7, 3) << 6)) +#define RVC_RS1S(insn) (8 + RV_X(insn, SH_RD, 3)) +#define RVC_RS2S(insn) (8 + RV_X(insn, SH_RS2C, 3)) +#define RVC_RS2(insn) RV_X(insn, SH_RS2C, 5) + +#define SHIFT_RIGHT(x, y) \ + ((y) < 0 ? ((x) << -(y)) : ((x) >> (y))) + +#define REG_MASK \ + ((1 << (5 + LOG_REGBYTES)) - (1 << LOG_REGBYTES)) + +#define REG_OFFSET(insn, pos) \ + (SHIFT_RIGHT((insn), (pos) - LOG_REGBYTES) & REG_MASK) + +#define REG_PTR(insn, pos, regs) \ + ((ulong *)((ulong)(regs) + REG_OFFSET(insn, pos))) + +#define GET_RM(insn) (((insn) >> 12) & 7) + +#define GET_RS1(insn, regs) (*REG_PTR(insn, SH_RS1, regs)) +#define GET_RS2(insn, regs) (*REG_PTR(insn, SH_RS2, regs)) +#define GET_RS1S(insn, regs) (*REG_PTR(RVC_RS1S(insn), 0, regs)) +#define GET_RS2S(insn, regs) (*REG_PTR(RVC_RS2S(insn), 0, regs)) +#define GET_RS2C(insn, regs) (*REG_PTR(insn, SH_RS2C, regs)) +#define GET_SP(regs) (*REG_PTR(2, 0, regs)) +#define SET_RD(insn, regs, val) (*REG_PTR(insn, SH_RD, regs) = (val)) +#define IMM_I(insn) ((s32)(insn) >> 20) +#define IMM_S(insn) (((s32)(insn) >> 25 << 5) | \ + (s32)(((insn) >> 7) & 0x1f)) +#define MASK_FUNCT3 0x7000 + +static int truly_illegal_insn(struct kvm_vcpu *vcpu, + struct kvm_run *run, + ulong insn) +{ + struct kvm_cpu_trap utrap = { 0 }; + + /* Redirect trap to Guest VCPU */ + utrap.sepc = vcpu->arch.guest_context.sepc; + utrap.scause = EXC_INST_ILLEGAL; + utrap.stval = insn; + kvm_riscv_vcpu_trap_redirect(vcpu, &utrap); + + return 1; +} + +static int system_opcode_insn(struct kvm_vcpu *vcpu, + struct kvm_run *run, + ulong insn) +{ + if ((insn & INSN_MASK_WFI) == INSN_MATCH_WFI) { + vcpu->stat.wfi_exit_stat++; + if (!kvm_arch_vcpu_runnable(vcpu)) { + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->arch.srcu_idx); + kvm_vcpu_block(vcpu); + vcpu->arch.srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); + } + vcpu->arch.guest_context.sepc += INSN_LEN(insn); + return 1; + } + + return truly_illegal_insn(vcpu, run, insn); +} + +static int virtual_inst_fault(struct kvm_vcpu *vcpu, struct kvm_run *run, + struct kvm_cpu_trap *trap) +{ + unsigned long insn = trap->stval; + struct kvm_cpu_trap utrap = { 0 }; + struct kvm_cpu_context *ct; + + if (unlikely(INSN_IS_16BIT(insn))) { + if (insn == 0) { + ct = &vcpu->arch.guest_context; + insn = kvm_riscv_vcpu_unpriv_read(vcpu, true, + ct->sepc, + &utrap); + if (utrap.scause) { + utrap.sepc = ct->sepc; + kvm_riscv_vcpu_trap_redirect(vcpu, &utrap); + return 1; + } + } + if (INSN_IS_16BIT(insn)) + return truly_illegal_insn(vcpu, run, insn); + } + + switch ((insn & INSN_OPCODE_MASK) >> INSN_OPCODE_SHIFT) { + case INSN_OPCODE_SYSTEM: + return system_opcode_insn(vcpu, run, insn); + default: + return truly_illegal_insn(vcpu, run, insn); + } +} + +static int emulate_load(struct kvm_vcpu *vcpu, struct kvm_run *run, + unsigned long fault_addr, unsigned long htinst) +{ + u8 data_buf[8]; + unsigned long insn; + int shift = 0, len = 0, insn_len = 0; + struct kvm_cpu_trap utrap = { 0 }; + struct kvm_cpu_context *ct = &vcpu->arch.guest_context; + + /* Determine trapped instruction */ + if (htinst & 0x1) { + /* + * Bit[0] == 1 implies trapped instruction value is + * transformed instruction or custom instruction. + */ + insn = htinst | INSN_16BIT_MASK; + insn_len = (htinst & BIT(1)) ? INSN_LEN(insn) : 2; + } else { + /* + * Bit[0] == 0 implies trapped instruction value is + * zero or special value. + */ + insn = kvm_riscv_vcpu_unpriv_read(vcpu, true, ct->sepc, + &utrap); + if (utrap.scause) { + /* Redirect trap if we failed to read instruction */ + utrap.sepc = ct->sepc; + kvm_riscv_vcpu_trap_redirect(vcpu, &utrap); + return 1; + } + insn_len = INSN_LEN(insn); + } + + /* Decode length of MMIO and shift */ + if ((insn & INSN_MASK_LW) == INSN_MATCH_LW) { + len = 4; + shift = 8 * (sizeof(ulong) - len); + } else if ((insn & INSN_MASK_LB) == INSN_MATCH_LB) { + len = 1; + shift = 8 * (sizeof(ulong) - len); + } else if ((insn & INSN_MASK_LBU) == INSN_MATCH_LBU) { + len = 1; + shift = 8 * (sizeof(ulong) - len); +#ifdef CONFIG_64BIT + } else if ((insn & INSN_MASK_LD) == INSN_MATCH_LD) { + len = 8; + shift = 8 * (sizeof(ulong) - len); + } else if ((insn & INSN_MASK_LWU) == INSN_MATCH_LWU) { + len = 4; +#endif + } else if ((insn & INSN_MASK_LH) == INSN_MATCH_LH) { + len = 2; + shift = 8 * (sizeof(ulong) - len); + } else if ((insn & INSN_MASK_LHU) == INSN_MATCH_LHU) { + len = 2; +#ifdef CONFIG_64BIT + } else if ((insn & INSN_MASK_C_LD) == INSN_MATCH_C_LD) { + len = 8; + shift = 8 * (sizeof(ulong) - len); + insn = RVC_RS2S(insn) << SH_RD; + } else if ((insn & INSN_MASK_C_LDSP) == INSN_MATCH_C_LDSP && + ((insn >> SH_RD) & 0x1f)) { + len = 8; + shift = 8 * (sizeof(ulong) - len); +#endif + } else if ((insn & INSN_MASK_C_LW) == INSN_MATCH_C_LW) { + len = 4; + shift = 8 * (sizeof(ulong) - len); + insn = RVC_RS2S(insn) << SH_RD; + } else if ((insn & INSN_MASK_C_LWSP) == INSN_MATCH_C_LWSP && + ((insn >> SH_RD) & 0x1f)) { + len = 4; + shift = 8 * (sizeof(ulong) - len); + } else { + return -EOPNOTSUPP; + } + + /* Fault address should be aligned to length of MMIO */ + if (fault_addr & (len - 1)) + return -EIO; + + /* Save instruction decode info */ + vcpu->arch.mmio_decode.insn = insn; + vcpu->arch.mmio_decode.insn_len = insn_len; + vcpu->arch.mmio_decode.shift = shift; + vcpu->arch.mmio_decode.len = len; + vcpu->arch.mmio_decode.return_handled = 0; + + /* Update MMIO details in kvm_run struct */ + run->mmio.is_write = false; + run->mmio.phys_addr = fault_addr; + run->mmio.len = len; + + /* Try to handle MMIO access in the kernel */ + if (!kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_addr, len, data_buf)) { + /* Successfully handled MMIO access in the kernel so resume */ + memcpy(run->mmio.data, data_buf, len); + vcpu->stat.mmio_exit_kernel++; + kvm_riscv_vcpu_mmio_return(vcpu, run); + return 1; + } + + /* Exit to userspace for MMIO emulation */ + vcpu->stat.mmio_exit_user++; + run->exit_reason = KVM_EXIT_MMIO; + + return 0; +} + +static int emulate_store(struct kvm_vcpu *vcpu, struct kvm_run *run, + unsigned long fault_addr, unsigned long htinst) +{ + u8 data8; + u16 data16; + u32 data32; + u64 data64; + ulong data; + unsigned long insn; + int len = 0, insn_len = 0; + struct kvm_cpu_trap utrap = { 0 }; + struct kvm_cpu_context *ct = &vcpu->arch.guest_context; + + /* Determine trapped instruction */ + if (htinst & 0x1) { + /* + * Bit[0] == 1 implies trapped instruction value is + * transformed instruction or custom instruction. + */ + insn = htinst | INSN_16BIT_MASK; + insn_len = (htinst & BIT(1)) ? INSN_LEN(insn) : 2; + } else { + /* + * Bit[0] == 0 implies trapped instruction value is + * zero or special value. + */ + insn = kvm_riscv_vcpu_unpriv_read(vcpu, true, ct->sepc, + &utrap); + if (utrap.scause) { + /* Redirect trap if we failed to read instruction */ + utrap.sepc = ct->sepc; + kvm_riscv_vcpu_trap_redirect(vcpu, &utrap); + return 1; + } + insn_len = INSN_LEN(insn); + } + + data = GET_RS2(insn, &vcpu->arch.guest_context); + data8 = data16 = data32 = data64 = data; + + if ((insn & INSN_MASK_SW) == INSN_MATCH_SW) { + len = 4; + } else if ((insn & INSN_MASK_SB) == INSN_MATCH_SB) { + len = 1; +#ifdef CONFIG_64BIT + } else if ((insn & INSN_MASK_SD) == INSN_MATCH_SD) { + len = 8; +#endif + } else if ((insn & INSN_MASK_SH) == INSN_MATCH_SH) { + len = 2; +#ifdef CONFIG_64BIT + } else if ((insn & INSN_MASK_C_SD) == INSN_MATCH_C_SD) { + len = 8; + data64 = GET_RS2S(insn, &vcpu->arch.guest_context); + } else if ((insn & INSN_MASK_C_SDSP) == INSN_MATCH_C_SDSP && + ((insn >> SH_RD) & 0x1f)) { + len = 8; + data64 = GET_RS2C(insn, &vcpu->arch.guest_context); +#endif + } else if ((insn & INSN_MASK_C_SW) == INSN_MATCH_C_SW) { + len = 4; + data32 = GET_RS2S(insn, &vcpu->arch.guest_context); + } else if ((insn & INSN_MASK_C_SWSP) == INSN_MATCH_C_SWSP && + ((insn >> SH_RD) & 0x1f)) { + len = 4; + data32 = GET_RS2C(insn, &vcpu->arch.guest_context); + } else { + return -EOPNOTSUPP; + } + + /* Fault address should be aligned to length of MMIO */ + if (fault_addr & (len - 1)) + return -EIO; + + /* Save instruction decode info */ + vcpu->arch.mmio_decode.insn = insn; + vcpu->arch.mmio_decode.insn_len = insn_len; + vcpu->arch.mmio_decode.shift = 0; + vcpu->arch.mmio_decode.len = len; + vcpu->arch.mmio_decode.return_handled = 0; + + /* Copy data to kvm_run instance */ + switch (len) { + case 1: + *((u8 *)run->mmio.data) = data8; + break; + case 2: + *((u16 *)run->mmio.data) = data16; + break; + case 4: + *((u32 *)run->mmio.data) = data32; + break; + case 8: + *((u64 *)run->mmio.data) = data64; + break; + default: + return -EOPNOTSUPP; + } + + /* Update MMIO details in kvm_run struct */ + run->mmio.is_write = true; + run->mmio.phys_addr = fault_addr; + run->mmio.len = len; + + /* Try to handle MMIO access in the kernel */ + if (!kvm_io_bus_write(vcpu, KVM_MMIO_BUS, + fault_addr, len, run->mmio.data)) { + /* Successfully handled MMIO access in the kernel so resume */ + vcpu->stat.mmio_exit_kernel++; + kvm_riscv_vcpu_mmio_return(vcpu, run); + return 1; + } + + /* Exit to userspace for MMIO emulation */ + vcpu->stat.mmio_exit_user++; + run->exit_reason = KVM_EXIT_MMIO; + + return 0; +} + +static int stage2_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run, + struct kvm_cpu_trap *trap) +{ + struct kvm_memory_slot *memslot; + unsigned long hva, fault_addr; + bool writeable; + gfn_t gfn; + int ret; + + fault_addr = (trap->htval << 2) | (trap->stval & 0x3); + gfn = fault_addr >> PAGE_SHIFT; + memslot = gfn_to_memslot(vcpu->kvm, gfn); + hva = gfn_to_hva_memslot_prot(memslot, gfn, &writeable); + + if (kvm_is_error_hva(hva) || + (trap->scause == EXC_STORE_GUEST_PAGE_FAULT && !writeable)) { + switch (trap->scause) { + case EXC_LOAD_GUEST_PAGE_FAULT: + return emulate_load(vcpu, run, fault_addr, + trap->htinst); + case EXC_STORE_GUEST_PAGE_FAULT: + return emulate_store(vcpu, run, fault_addr, + trap->htinst); + default: + return -EOPNOTSUPP; + }; + } + + ret = kvm_riscv_stage2_map(vcpu, memslot, fault_addr, hva, + (trap->scause == EXC_STORE_GUEST_PAGE_FAULT) ? true : false); + if (ret < 0) + return ret; + + return 1; +} + +/** + * kvm_riscv_vcpu_unpriv_read -- Read machine word from Guest memory + * + * @vcpu: The VCPU pointer + * @read_insn: Flag representing whether we are reading instruction + * @guest_addr: Guest address to read + * @trap: Output pointer to trap details + */ +unsigned long kvm_riscv_vcpu_unpriv_read(struct kvm_vcpu *vcpu, + bool read_insn, + unsigned long guest_addr, + struct kvm_cpu_trap *trap) +{ + register unsigned long taddr asm("a0") = (unsigned long)trap; + register unsigned long ttmp asm("a1"); + register unsigned long val asm("t0"); + register unsigned long tmp asm("t1"); + register unsigned long addr asm("t2") = guest_addr; + unsigned long flags; + unsigned long old_stvec, old_hstatus; + + local_irq_save(flags); + + old_hstatus = csr_swap(CSR_HSTATUS, vcpu->arch.guest_context.hstatus); + old_stvec = csr_swap(CSR_STVEC, (ulong)&__kvm_riscv_unpriv_trap); + + if (read_insn) { + /* + * HLVX.HU instruction + * 0110010 00011 rs1 100 rd 1110011 + */ + asm volatile ("\n" + ".option push\n" + ".option norvc\n" + "add %[ttmp], %[taddr], 0\n" + /* + * HLVX.HU %[val], (%[addr]) + * HLVX.HU t0, (t2) + * 0110010 00011 00111 100 00101 1110011 + */ + ".word 0x6433c2f3\n" + "andi %[tmp], %[val], 3\n" + "addi %[tmp], %[tmp], -3\n" + "bne %[tmp], zero, 2f\n" + "addi %[addr], %[addr], 2\n" + /* + * HLVX.HU %[tmp], (%[addr]) + * HLVX.HU t1, (t2) + * 0110010 00011 00111 100 00110 1110011 + */ + ".word 0x6433c373\n" + "sll %[tmp], %[tmp], 16\n" + "add %[val], %[val], %[tmp]\n" + "2:\n" + ".option pop" + : [val] "=&r" (val), [tmp] "=&r" (tmp), + [taddr] "+&r" (taddr), [ttmp] "+&r" (ttmp), + [addr] "+&r" (addr) : : "memory"); + + if (trap->scause == EXC_LOAD_PAGE_FAULT) + trap->scause = EXC_INST_PAGE_FAULT; + } else { + /* + * HLV.D instruction + * 0110110 00000 rs1 100 rd 1110011 + * + * HLV.W instruction + * 0110100 00000 rs1 100 rd 1110011 + */ + asm volatile ("\n" + ".option push\n" + ".option norvc\n" + "add %[ttmp], %[taddr], 0\n" +#ifdef CONFIG_64BIT + /* + * HLV.D %[val], (%[addr]) + * HLV.D t0, (t2) + * 0110110 00000 00111 100 00101 1110011 + */ + ".word 0x6c03c2f3\n" +#else + /* + * HLV.W %[val], (%[addr]) + * HLV.W t0, (t2) + * 0110100 00000 00111 100 00101 1110011 + */ + ".word 0x6803c2f3\n" +#endif + ".option pop" + : [val] "=&r" (val), + [taddr] "+&r" (taddr), [ttmp] "+&r" (ttmp) + : [addr] "r" (addr) : "memory"); + } + + csr_write(CSR_STVEC, old_stvec); + csr_write(CSR_HSTATUS, old_hstatus); + + local_irq_restore(flags); + + return val; +} + +/** + * kvm_riscv_vcpu_trap_redirect -- Redirect trap to Guest + * + * @vcpu: The VCPU pointer + * @trap: Trap details + */ +void kvm_riscv_vcpu_trap_redirect(struct kvm_vcpu *vcpu, + struct kvm_cpu_trap *trap) +{ + unsigned long vsstatus = csr_read(CSR_VSSTATUS); + + /* Change Guest SSTATUS.SPP bit */ + vsstatus &= ~SR_SPP; + if (vcpu->arch.guest_context.sstatus & SR_SPP) + vsstatus |= SR_SPP; + + /* Change Guest SSTATUS.SPIE bit */ + vsstatus &= ~SR_SPIE; + if (vsstatus & SR_SIE) + vsstatus |= SR_SPIE; + + /* Clear Guest SSTATUS.SIE bit */ + vsstatus &= ~SR_SIE; + + /* Update Guest SSTATUS */ + csr_write(CSR_VSSTATUS, vsstatus); + + /* Update Guest SCAUSE, STVAL, and SEPC */ + csr_write(CSR_VSCAUSE, trap->scause); + csr_write(CSR_VSTVAL, trap->stval); + csr_write(CSR_VSEPC, trap->sepc); + + /* Set Guest PC to Guest exception vector */ + vcpu->arch.guest_context.sepc = csr_read(CSR_VSTVEC); +} + +/** + * kvm_riscv_vcpu_mmio_return -- Handle MMIO loads after user space emulation + * or in-kernel IO emulation + * + * @vcpu: The VCPU pointer + * @run: The VCPU run struct containing the mmio data + */ +int kvm_riscv_vcpu_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + u8 data8; + u16 data16; + u32 data32; + u64 data64; + ulong insn; + int len, shift; + + if (vcpu->arch.mmio_decode.return_handled) + return 0; + + vcpu->arch.mmio_decode.return_handled = 1; + insn = vcpu->arch.mmio_decode.insn; + + if (run->mmio.is_write) + goto done; + + len = vcpu->arch.mmio_decode.len; + shift = vcpu->arch.mmio_decode.shift; + + switch (len) { + case 1: + data8 = *((u8 *)run->mmio.data); + SET_RD(insn, &vcpu->arch.guest_context, + (ulong)data8 << shift >> shift); + break; + case 2: + data16 = *((u16 *)run->mmio.data); + SET_RD(insn, &vcpu->arch.guest_context, + (ulong)data16 << shift >> shift); + break; + case 4: + data32 = *((u32 *)run->mmio.data); + SET_RD(insn, &vcpu->arch.guest_context, + (ulong)data32 << shift >> shift); + break; + case 8: + data64 = *((u64 *)run->mmio.data); + SET_RD(insn, &vcpu->arch.guest_context, + (ulong)data64 << shift >> shift); + break; + default: + return -EOPNOTSUPP; + } + +done: + /* Move to next instruction */ + vcpu->arch.guest_context.sepc += vcpu->arch.mmio_decode.insn_len; + + return 0; +} + +/* + * Return > 0 to return to guest, < 0 on error, 0 (and set exit_reason) on + * proper exit to userspace. + */ +int kvm_riscv_vcpu_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, + struct kvm_cpu_trap *trap) +{ + int ret; + + /* If we got host interrupt then do nothing */ + if (trap->scause & CAUSE_IRQ_FLAG) + return 1; + + /* Handle guest traps */ + ret = -EFAULT; + run->exit_reason = KVM_EXIT_UNKNOWN; + switch (trap->scause) { + case EXC_VIRTUAL_INST_FAULT: + if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV) + ret = virtual_inst_fault(vcpu, run, trap); + break; + case EXC_INST_GUEST_PAGE_FAULT: + case EXC_LOAD_GUEST_PAGE_FAULT: + case EXC_STORE_GUEST_PAGE_FAULT: + if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV) + ret = stage2_page_fault(vcpu, run, trap); + break; + case EXC_SUPERVISOR_SYSCALL: + if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV) + ret = kvm_riscv_vcpu_sbi_ecall(vcpu, run); + break; + default: + break; + } + + /* Print details in-case of error */ + if (ret < 0) { + kvm_err("VCPU exit error %d\n", ret); + kvm_err("SEPC=0x%lx SSTATUS=0x%lx HSTATUS=0x%lx\n", + vcpu->arch.guest_context.sepc, + vcpu->arch.guest_context.sstatus, + vcpu->arch.guest_context.hstatus); + kvm_err("SCAUSE=0x%lx STVAL=0x%lx HTVAL=0x%lx HTINST=0x%lx\n", + trap->scause, trap->stval, trap->htval, trap->htinst); + } + + return ret; +} diff --git a/arch/riscv/kvm/vcpu_fp.c b/arch/riscv/kvm/vcpu_fp.c new file mode 100644 index 000000000000..1b070152578f --- /dev/null +++ b/arch/riscv/kvm/vcpu_fp.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Western Digital Corporation or its affiliates. + * + * Authors: + * Atish Patra <atish.patra@wdc.com> + * Anup Patel <anup.patel@wdc.com> + */ + +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/kvm_host.h> +#include <linux/uaccess.h> + +#ifdef CONFIG_FPU +void kvm_riscv_vcpu_fp_reset(struct kvm_vcpu *vcpu) +{ + unsigned long isa = vcpu->arch.isa; + struct kvm_cpu_context *cntx = &vcpu->arch.guest_context; + + cntx->sstatus &= ~SR_FS; + if (riscv_isa_extension_available(&isa, f) || + riscv_isa_extension_available(&isa, d)) + cntx->sstatus |= SR_FS_INITIAL; + else + cntx->sstatus |= SR_FS_OFF; +} + +void kvm_riscv_vcpu_fp_clean(struct kvm_cpu_context *cntx) +{ + cntx->sstatus &= ~SR_FS; + cntx->sstatus |= SR_FS_CLEAN; +} + +void kvm_riscv_vcpu_guest_fp_save(struct kvm_cpu_context *cntx, + unsigned long isa) +{ + if ((cntx->sstatus & SR_FS) == SR_FS_DIRTY) { + if (riscv_isa_extension_available(&isa, d)) + __kvm_riscv_fp_d_save(cntx); + else if (riscv_isa_extension_available(&isa, f)) + __kvm_riscv_fp_f_save(cntx); + kvm_riscv_vcpu_fp_clean(cntx); + } +} + +void kvm_riscv_vcpu_guest_fp_restore(struct kvm_cpu_context *cntx, + unsigned long isa) +{ + if ((cntx->sstatus & SR_FS) != SR_FS_OFF) { + if (riscv_isa_extension_available(&isa, d)) + __kvm_riscv_fp_d_restore(cntx); + else if (riscv_isa_extension_available(&isa, f)) + __kvm_riscv_fp_f_restore(cntx); + kvm_riscv_vcpu_fp_clean(cntx); + } +} + +void kvm_riscv_vcpu_host_fp_save(struct kvm_cpu_context *cntx) +{ + /* No need to check host sstatus as it can be modified outside */ + if (riscv_isa_extension_available(NULL, d)) + __kvm_riscv_fp_d_save(cntx); + else if (riscv_isa_extension_available(NULL, f)) + __kvm_riscv_fp_f_save(cntx); +} + +void kvm_riscv_vcpu_host_fp_restore(struct kvm_cpu_context *cntx) +{ + if (riscv_isa_extension_available(NULL, d)) + __kvm_riscv_fp_d_restore(cntx); + else if (riscv_isa_extension_available(NULL, f)) + __kvm_riscv_fp_f_restore(cntx); +} +#endif + +int kvm_riscv_vcpu_get_reg_fp(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg, + unsigned long rtype) +{ + struct kvm_cpu_context *cntx = &vcpu->arch.guest_context; + unsigned long isa = vcpu->arch.isa; + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + rtype); + void *reg_val; + + if ((rtype == KVM_REG_RISCV_FP_F) && + riscv_isa_extension_available(&isa, f)) { + if (KVM_REG_SIZE(reg->id) != sizeof(u32)) + return -EINVAL; + if (reg_num == KVM_REG_RISCV_FP_F_REG(fcsr)) + reg_val = &cntx->fp.f.fcsr; + else if ((KVM_REG_RISCV_FP_F_REG(f[0]) <= reg_num) && + reg_num <= KVM_REG_RISCV_FP_F_REG(f[31])) + reg_val = &cntx->fp.f.f[reg_num]; + else + return -EINVAL; + } else if ((rtype == KVM_REG_RISCV_FP_D) && + riscv_isa_extension_available(&isa, d)) { + if (reg_num == KVM_REG_RISCV_FP_D_REG(fcsr)) { + if (KVM_REG_SIZE(reg->id) != sizeof(u32)) + return -EINVAL; + reg_val = &cntx->fp.d.fcsr; + } else if ((KVM_REG_RISCV_FP_D_REG(f[0]) <= reg_num) && + reg_num <= KVM_REG_RISCV_FP_D_REG(f[31])) { + if (KVM_REG_SIZE(reg->id) != sizeof(u64)) + return -EINVAL; + reg_val = &cntx->fp.d.f[reg_num]; + } else + return -EINVAL; + } else + return -EINVAL; + + if (copy_to_user(uaddr, reg_val, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + return 0; +} + +int kvm_riscv_vcpu_set_reg_fp(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg, + unsigned long rtype) +{ + struct kvm_cpu_context *cntx = &vcpu->arch.guest_context; + unsigned long isa = vcpu->arch.isa; + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + rtype); + void *reg_val; + + if ((rtype == KVM_REG_RISCV_FP_F) && + riscv_isa_extension_available(&isa, f)) { + if (KVM_REG_SIZE(reg->id) != sizeof(u32)) + return -EINVAL; + if (reg_num == KVM_REG_RISCV_FP_F_REG(fcsr)) + reg_val = &cntx->fp.f.fcsr; + else if ((KVM_REG_RISCV_FP_F_REG(f[0]) <= reg_num) && + reg_num <= KVM_REG_RISCV_FP_F_REG(f[31])) + reg_val = &cntx->fp.f.f[reg_num]; + else + return -EINVAL; + } else if ((rtype == KVM_REG_RISCV_FP_D) && + riscv_isa_extension_available(&isa, d)) { + if (reg_num == KVM_REG_RISCV_FP_D_REG(fcsr)) { + if (KVM_REG_SIZE(reg->id) != sizeof(u32)) + return -EINVAL; + reg_val = &cntx->fp.d.fcsr; + } else if ((KVM_REG_RISCV_FP_D_REG(f[0]) <= reg_num) && + reg_num <= KVM_REG_RISCV_FP_D_REG(f[31])) { + if (KVM_REG_SIZE(reg->id) != sizeof(u64)) + return -EINVAL; + reg_val = &cntx->fp.d.f[reg_num]; + } else + return -EINVAL; + } else + return -EINVAL; + + if (copy_from_user(reg_val, uaddr, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + return 0; +} diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c new file mode 100644 index 000000000000..eb3c045edf11 --- /dev/null +++ b/arch/riscv/kvm/vcpu_sbi.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0 +/** + * Copyright (c) 2019 Western Digital Corporation or its affiliates. + * + * Authors: + * Atish Patra <atish.patra@wdc.com> + */ + +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/kvm_host.h> +#include <asm/csr.h> +#include <asm/sbi.h> +#include <asm/kvm_vcpu_timer.h> + +#define SBI_VERSION_MAJOR 0 +#define SBI_VERSION_MINOR 1 + +static void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, + struct kvm_run *run) +{ + struct kvm_cpu_context *cp = &vcpu->arch.guest_context; + + vcpu->arch.sbi_context.return_handled = 0; + vcpu->stat.ecall_exit_stat++; + run->exit_reason = KVM_EXIT_RISCV_SBI; + run->riscv_sbi.extension_id = cp->a7; + run->riscv_sbi.function_id = cp->a6; + run->riscv_sbi.args[0] = cp->a0; + run->riscv_sbi.args[1] = cp->a1; + run->riscv_sbi.args[2] = cp->a2; + run->riscv_sbi.args[3] = cp->a3; + run->riscv_sbi.args[4] = cp->a4; + run->riscv_sbi.args[5] = cp->a5; + run->riscv_sbi.ret[0] = cp->a0; + run->riscv_sbi.ret[1] = cp->a1; +} + +int kvm_riscv_vcpu_sbi_return(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + struct kvm_cpu_context *cp = &vcpu->arch.guest_context; + + /* Handle SBI return only once */ + if (vcpu->arch.sbi_context.return_handled) + return 0; + vcpu->arch.sbi_context.return_handled = 1; + + /* Update return values */ + cp->a0 = run->riscv_sbi.ret[0]; + cp->a1 = run->riscv_sbi.ret[1]; + + /* Move to next instruction */ + vcpu->arch.guest_context.sepc += 4; + + return 0; +} + +#ifdef CONFIG_RISCV_SBI_V01 + +static void kvm_sbi_system_shutdown(struct kvm_vcpu *vcpu, + struct kvm_run *run, u32 type) +{ + int i; + struct kvm_vcpu *tmp; + + kvm_for_each_vcpu(i, tmp, vcpu->kvm) + tmp->arch.power_off = true; + kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP); + + memset(&run->system_event, 0, sizeof(run->system_event)); + run->system_event.type = type; + run->exit_reason = KVM_EXIT_SYSTEM_EVENT; +} + +int kvm_riscv_vcpu_sbi_ecall(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + ulong hmask; + int i, ret = 1; + u64 next_cycle; + struct kvm_vcpu *rvcpu; + bool next_sepc = true; + struct cpumask cm, hm; + struct kvm *kvm = vcpu->kvm; + struct kvm_cpu_trap utrap = { 0 }; + struct kvm_cpu_context *cp = &vcpu->arch.guest_context; + + if (!cp) + return -EINVAL; + + switch (cp->a7) { + case SBI_EXT_0_1_CONSOLE_GETCHAR: + case SBI_EXT_0_1_CONSOLE_PUTCHAR: + /* + * The CONSOLE_GETCHAR/CONSOLE_PUTCHAR SBI calls cannot be + * handled in kernel so we forward these to user-space + */ + kvm_riscv_vcpu_sbi_forward(vcpu, run); + next_sepc = false; + ret = 0; + break; + case SBI_EXT_0_1_SET_TIMER: +#if __riscv_xlen == 32 + next_cycle = ((u64)cp->a1 << 32) | (u64)cp->a0; +#else + next_cycle = (u64)cp->a0; +#endif + kvm_riscv_vcpu_timer_next_event(vcpu, next_cycle); + break; + case SBI_EXT_0_1_CLEAR_IPI: + kvm_riscv_vcpu_unset_interrupt(vcpu, IRQ_VS_SOFT); + break; + case SBI_EXT_0_1_SEND_IPI: + if (cp->a0) + hmask = kvm_riscv_vcpu_unpriv_read(vcpu, false, cp->a0, + &utrap); + else + hmask = (1UL << atomic_read(&kvm->online_vcpus)) - 1; + if (utrap.scause) { + utrap.sepc = cp->sepc; + kvm_riscv_vcpu_trap_redirect(vcpu, &utrap); + next_sepc = false; + break; + } + for_each_set_bit(i, &hmask, BITS_PER_LONG) { + rvcpu = kvm_get_vcpu_by_id(vcpu->kvm, i); + kvm_riscv_vcpu_set_interrupt(rvcpu, IRQ_VS_SOFT); + } + break; + case SBI_EXT_0_1_SHUTDOWN: + kvm_sbi_system_shutdown(vcpu, run, KVM_SYSTEM_EVENT_SHUTDOWN); + next_sepc = false; + ret = 0; + break; + case SBI_EXT_0_1_REMOTE_FENCE_I: + case SBI_EXT_0_1_REMOTE_SFENCE_VMA: + case SBI_EXT_0_1_REMOTE_SFENCE_VMA_ASID: + if (cp->a0) + hmask = kvm_riscv_vcpu_unpriv_read(vcpu, false, cp->a0, + &utrap); + else + hmask = (1UL << atomic_read(&kvm->online_vcpus)) - 1; + if (utrap.scause) { + utrap.sepc = cp->sepc; + kvm_riscv_vcpu_trap_redirect(vcpu, &utrap); + next_sepc = false; + break; + } + cpumask_clear(&cm); + for_each_set_bit(i, &hmask, BITS_PER_LONG) { + rvcpu = kvm_get_vcpu_by_id(vcpu->kvm, i); + if (rvcpu->cpu < 0) + continue; + cpumask_set_cpu(rvcpu->cpu, &cm); + } + riscv_cpuid_to_hartid_mask(&cm, &hm); + if (cp->a7 == SBI_EXT_0_1_REMOTE_FENCE_I) + sbi_remote_fence_i(cpumask_bits(&hm)); + else if (cp->a7 == SBI_EXT_0_1_REMOTE_SFENCE_VMA) + sbi_remote_hfence_vvma(cpumask_bits(&hm), + cp->a1, cp->a2); + else + sbi_remote_hfence_vvma_asid(cpumask_bits(&hm), + cp->a1, cp->a2, cp->a3); + break; + default: + /* Return error for unsupported SBI calls */ + cp->a0 = SBI_ERR_NOT_SUPPORTED; + break; + } + + if (next_sepc) + cp->sepc += 4; + + return ret; +} + +#else + +int kvm_riscv_vcpu_sbi_ecall(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + kvm_riscv_vcpu_sbi_forward(vcpu, run); + return 0; +} + +#endif diff --git a/arch/riscv/kvm/vcpu_switch.S b/arch/riscv/kvm/vcpu_switch.S new file mode 100644 index 000000000000..029a28a195c6 --- /dev/null +++ b/arch/riscv/kvm/vcpu_switch.S @@ -0,0 +1,400 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 Western Digital Corporation or its affiliates. + * + * Authors: + * Anup Patel <anup.patel@wdc.com> + */ + +#include <linux/linkage.h> +#include <asm/asm.h> +#include <asm/asm-offsets.h> +#include <asm/csr.h> + + .text + .altmacro + .option norelax + +ENTRY(__kvm_riscv_switch_to) + /* Save Host GPRs (except A0 and T0-T6) */ + REG_S ra, (KVM_ARCH_HOST_RA)(a0) + REG_S sp, (KVM_ARCH_HOST_SP)(a0) + REG_S gp, (KVM_ARCH_HOST_GP)(a0) + REG_S tp, (KVM_ARCH_HOST_TP)(a0) + REG_S s0, (KVM_ARCH_HOST_S0)(a0) + REG_S s1, (KVM_ARCH_HOST_S1)(a0) + REG_S a1, (KVM_ARCH_HOST_A1)(a0) + REG_S a2, (KVM_ARCH_HOST_A2)(a0) + REG_S a3, (KVM_ARCH_HOST_A3)(a0) + REG_S a4, (KVM_ARCH_HOST_A4)(a0) + REG_S a5, (KVM_ARCH_HOST_A5)(a0) + REG_S a6, (KVM_ARCH_HOST_A6)(a0) + REG_S a7, (KVM_ARCH_HOST_A7)(a0) + REG_S s2, (KVM_ARCH_HOST_S2)(a0) + REG_S s3, (KVM_ARCH_HOST_S3)(a0) + REG_S s4, (KVM_ARCH_HOST_S4)(a0) + REG_S s5, (KVM_ARCH_HOST_S5)(a0) + REG_S s6, (KVM_ARCH_HOST_S6)(a0) + REG_S s7, (KVM_ARCH_HOST_S7)(a0) + REG_S s8, (KVM_ARCH_HOST_S8)(a0) + REG_S s9, (KVM_ARCH_HOST_S9)(a0) + REG_S s10, (KVM_ARCH_HOST_S10)(a0) + REG_S s11, (KVM_ARCH_HOST_S11)(a0) + + /* Save Host and Restore Guest SSTATUS */ + REG_L t0, (KVM_ARCH_GUEST_SSTATUS)(a0) + csrrw t0, CSR_SSTATUS, t0 + REG_S t0, (KVM_ARCH_HOST_SSTATUS)(a0) + + /* Save Host and Restore Guest HSTATUS */ + REG_L t1, (KVM_ARCH_GUEST_HSTATUS)(a0) + csrrw t1, CSR_HSTATUS, t1 + REG_S t1, (KVM_ARCH_HOST_HSTATUS)(a0) + + /* Save Host and Restore Guest SCOUNTEREN */ + REG_L t2, (KVM_ARCH_GUEST_SCOUNTEREN)(a0) + csrrw t2, CSR_SCOUNTEREN, t2 + REG_S t2, (KVM_ARCH_HOST_SCOUNTEREN)(a0) + + /* Save Host SSCRATCH and change it to struct kvm_vcpu_arch pointer */ + csrrw t3, CSR_SSCRATCH, a0 + REG_S t3, (KVM_ARCH_HOST_SSCRATCH)(a0) + + /* Save Host STVEC and change it to return path */ + la t4, __kvm_switch_return + csrrw t4, CSR_STVEC, t4 + REG_S t4, (KVM_ARCH_HOST_STVEC)(a0) + + /* Restore Guest SEPC */ + REG_L t0, (KVM_ARCH_GUEST_SEPC)(a0) + csrw CSR_SEPC, t0 + + /* Restore Guest GPRs (except A0) */ + REG_L ra, (KVM_ARCH_GUEST_RA)(a0) + REG_L sp, (KVM_ARCH_GUEST_SP)(a0) + REG_L gp, (KVM_ARCH_GUEST_GP)(a0) + REG_L tp, (KVM_ARCH_GUEST_TP)(a0) + REG_L t0, (KVM_ARCH_GUEST_T0)(a0) + REG_L t1, (KVM_ARCH_GUEST_T1)(a0) + REG_L t2, (KVM_ARCH_GUEST_T2)(a0) + REG_L s0, (KVM_ARCH_GUEST_S0)(a0) + REG_L s1, (KVM_ARCH_GUEST_S1)(a0) + REG_L a1, (KVM_ARCH_GUEST_A1)(a0) + REG_L a2, (KVM_ARCH_GUEST_A2)(a0) + REG_L a3, (KVM_ARCH_GUEST_A3)(a0) + REG_L a4, (KVM_ARCH_GUEST_A4)(a0) + REG_L a5, (KVM_ARCH_GUEST_A5)(a0) + REG_L a6, (KVM_ARCH_GUEST_A6)(a0) + REG_L a7, (KVM_ARCH_GUEST_A7)(a0) + REG_L s2, (KVM_ARCH_GUEST_S2)(a0) + REG_L s3, (KVM_ARCH_GUEST_S3)(a0) + REG_L s4, (KVM_ARCH_GUEST_S4)(a0) + REG_L s5, (KVM_ARCH_GUEST_S5)(a0) + REG_L s6, (KVM_ARCH_GUEST_S6)(a0) + REG_L s7, (KVM_ARCH_GUEST_S7)(a0) + REG_L s8, (KVM_ARCH_GUEST_S8)(a0) + REG_L s9, (KVM_ARCH_GUEST_S9)(a0) + REG_L s10, (KVM_ARCH_GUEST_S10)(a0) + REG_L s11, (KVM_ARCH_GUEST_S11)(a0) + REG_L t3, (KVM_ARCH_GUEST_T3)(a0) + REG_L t4, (KVM_ARCH_GUEST_T4)(a0) + REG_L t5, (KVM_ARCH_GUEST_T5)(a0) + REG_L t6, (KVM_ARCH_GUEST_T6)(a0) + + /* Restore Guest A0 */ + REG_L a0, (KVM_ARCH_GUEST_A0)(a0) + + /* Resume Guest */ + sret + + /* Back to Host */ + .align 2 +__kvm_switch_return: + /* Swap Guest A0 with SSCRATCH */ + csrrw a0, CSR_SSCRATCH, a0 + + /* Save Guest GPRs (except A0) */ + REG_S ra, (KVM_ARCH_GUEST_RA)(a0) + REG_S sp, (KVM_ARCH_GUEST_SP)(a0) + REG_S gp, (KVM_ARCH_GUEST_GP)(a0) + REG_S tp, (KVM_ARCH_GUEST_TP)(a0) + REG_S t0, (KVM_ARCH_GUEST_T0)(a0) + REG_S t1, (KVM_ARCH_GUEST_T1)(a0) + REG_S t2, (KVM_ARCH_GUEST_T2)(a0) + REG_S s0, (KVM_ARCH_GUEST_S0)(a0) + REG_S s1, (KVM_ARCH_GUEST_S1)(a0) + REG_S a1, (KVM_ARCH_GUEST_A1)(a0) + REG_S a2, (KVM_ARCH_GUEST_A2)(a0) + REG_S a3, (KVM_ARCH_GUEST_A3)(a0) + REG_S a4, (KVM_ARCH_GUEST_A4)(a0) + REG_S a5, (KVM_ARCH_GUEST_A5)(a0) + REG_S a6, (KVM_ARCH_GUEST_A6)(a0) + REG_S a7, (KVM_ARCH_GUEST_A7)(a0) + REG_S s2, (KVM_ARCH_GUEST_S2)(a0) + REG_S s3, (KVM_ARCH_GUEST_S3)(a0) + REG_S s4, (KVM_ARCH_GUEST_S4)(a0) + REG_S s5, (KVM_ARCH_GUEST_S5)(a0) + REG_S s6, (KVM_ARCH_GUEST_S6)(a0) + REG_S s7, (KVM_ARCH_GUEST_S7)(a0) + REG_S s8, (KVM_ARCH_GUEST_S8)(a0) + REG_S s9, (KVM_ARCH_GUEST_S9)(a0) + REG_S s10, (KVM_ARCH_GUEST_S10)(a0) + REG_S s11, (KVM_ARCH_GUEST_S11)(a0) + REG_S t3, (KVM_ARCH_GUEST_T3)(a0) + REG_S t4, (KVM_ARCH_GUEST_T4)(a0) + REG_S t5, (KVM_ARCH_GUEST_T5)(a0) + REG_S t6, (KVM_ARCH_GUEST_T6)(a0) + + /* Save Guest SEPC */ + csrr t0, CSR_SEPC + REG_S t0, (KVM_ARCH_GUEST_SEPC)(a0) + + /* Restore Host STVEC */ + REG_L t1, (KVM_ARCH_HOST_STVEC)(a0) + csrw CSR_STVEC, t1 + + /* Save Guest A0 and Restore Host SSCRATCH */ + REG_L t2, (KVM_ARCH_HOST_SSCRATCH)(a0) + csrrw t2, CSR_SSCRATCH, t2 + REG_S t2, (KVM_ARCH_GUEST_A0)(a0) + + /* Save Guest and Restore Host SCOUNTEREN */ + REG_L t3, (KVM_ARCH_HOST_SCOUNTEREN)(a0) + csrrw t3, CSR_SCOUNTEREN, t3 + REG_S t3, (KVM_ARCH_GUEST_SCOUNTEREN)(a0) + + /* Save Guest and Restore Host HSTATUS */ + REG_L t4, (KVM_ARCH_HOST_HSTATUS)(a0) + csrrw t4, CSR_HSTATUS, t4 + REG_S t4, (KVM_ARCH_GUEST_HSTATUS)(a0) + + /* Save Guest and Restore Host SSTATUS */ + REG_L t5, (KVM_ARCH_HOST_SSTATUS)(a0) + csrrw t5, CSR_SSTATUS, t5 + REG_S t5, (KVM_ARCH_GUEST_SSTATUS)(a0) + + /* Restore Host GPRs (except A0 and T0-T6) */ + REG_L ra, (KVM_ARCH_HOST_RA)(a0) + REG_L sp, (KVM_ARCH_HOST_SP)(a0) + REG_L gp, (KVM_ARCH_HOST_GP)(a0) + REG_L tp, (KVM_ARCH_HOST_TP)(a0) + REG_L s0, (KVM_ARCH_HOST_S0)(a0) + REG_L s1, (KVM_ARCH_HOST_S1)(a0) + REG_L a1, (KVM_ARCH_HOST_A1)(a0) + REG_L a2, (KVM_ARCH_HOST_A2)(a0) + REG_L a3, (KVM_ARCH_HOST_A3)(a0) + REG_L a4, (KVM_ARCH_HOST_A4)(a0) + REG_L a5, (KVM_ARCH_HOST_A5)(a0) + REG_L a6, (KVM_ARCH_HOST_A6)(a0) + REG_L a7, (KVM_ARCH_HOST_A7)(a0) + REG_L s2, (KVM_ARCH_HOST_S2)(a0) + REG_L s3, (KVM_ARCH_HOST_S3)(a0) + REG_L s4, (KVM_ARCH_HOST_S4)(a0) + REG_L s5, (KVM_ARCH_HOST_S5)(a0) + REG_L s6, (KVM_ARCH_HOST_S6)(a0) + REG_L s7, (KVM_ARCH_HOST_S7)(a0) + REG_L s8, (KVM_ARCH_HOST_S8)(a0) + REG_L s9, (KVM_ARCH_HOST_S9)(a0) + REG_L s10, (KVM_ARCH_HOST_S10)(a0) + REG_L s11, (KVM_ARCH_HOST_S11)(a0) + + /* Return to C code */ + ret +ENDPROC(__kvm_riscv_switch_to) + +ENTRY(__kvm_riscv_unpriv_trap) + /* + * We assume that faulting unpriv load/store instruction is + * 4-byte long and blindly increment SEPC by 4. + * + * The trap details will be saved at address pointed by 'A0' + * register and we use 'A1' register as temporary. + */ + csrr a1, CSR_SEPC + REG_S a1, (KVM_ARCH_TRAP_SEPC)(a0) + addi a1, a1, 4 + csrw CSR_SEPC, a1 + csrr a1, CSR_SCAUSE + REG_S a1, (KVM_ARCH_TRAP_SCAUSE)(a0) + csrr a1, CSR_STVAL + REG_S a1, (KVM_ARCH_TRAP_STVAL)(a0) + csrr a1, CSR_HTVAL + REG_S a1, (KVM_ARCH_TRAP_HTVAL)(a0) + csrr a1, CSR_HTINST + REG_S a1, (KVM_ARCH_TRAP_HTINST)(a0) + sret +ENDPROC(__kvm_riscv_unpriv_trap) + +#ifdef CONFIG_FPU + .align 3 + .global __kvm_riscv_fp_f_save +__kvm_riscv_fp_f_save: + csrr t2, CSR_SSTATUS + li t1, SR_FS + csrs CSR_SSTATUS, t1 + frcsr t0 + fsw f0, KVM_ARCH_FP_F_F0(a0) + fsw f1, KVM_ARCH_FP_F_F1(a0) + fsw f2, KVM_ARCH_FP_F_F2(a0) + fsw f3, KVM_ARCH_FP_F_F3(a0) + fsw f4, KVM_ARCH_FP_F_F4(a0) + fsw f5, KVM_ARCH_FP_F_F5(a0) + fsw f6, KVM_ARCH_FP_F_F6(a0) + fsw f7, KVM_ARCH_FP_F_F7(a0) + fsw f8, KVM_ARCH_FP_F_F8(a0) + fsw f9, KVM_ARCH_FP_F_F9(a0) + fsw f10, KVM_ARCH_FP_F_F10(a0) + fsw f11, KVM_ARCH_FP_F_F11(a0) + fsw f12, KVM_ARCH_FP_F_F12(a0) + fsw f13, KVM_ARCH_FP_F_F13(a0) + fsw f14, KVM_ARCH_FP_F_F14(a0) + fsw f15, KVM_ARCH_FP_F_F15(a0) + fsw f16, KVM_ARCH_FP_F_F16(a0) + fsw f17, KVM_ARCH_FP_F_F17(a0) + fsw f18, KVM_ARCH_FP_F_F18(a0) + fsw f19, KVM_ARCH_FP_F_F19(a0) + fsw f20, KVM_ARCH_FP_F_F20(a0) + fsw f21, KVM_ARCH_FP_F_F21(a0) + fsw f22, KVM_ARCH_FP_F_F22(a0) + fsw f23, KVM_ARCH_FP_F_F23(a0) + fsw f24, KVM_ARCH_FP_F_F24(a0) + fsw f25, KVM_ARCH_FP_F_F25(a0) + fsw f26, KVM_ARCH_FP_F_F26(a0) + fsw f27, KVM_ARCH_FP_F_F27(a0) + fsw f28, KVM_ARCH_FP_F_F28(a0) + fsw f29, KVM_ARCH_FP_F_F29(a0) + fsw f30, KVM_ARCH_FP_F_F30(a0) + fsw f31, KVM_ARCH_FP_F_F31(a0) + sw t0, KVM_ARCH_FP_F_FCSR(a0) + csrw CSR_SSTATUS, t2 + ret + + .align 3 + .global __kvm_riscv_fp_d_save +__kvm_riscv_fp_d_save: + csrr t2, CSR_SSTATUS + li t1, SR_FS + csrs CSR_SSTATUS, t1 + frcsr t0 + fsd f0, KVM_ARCH_FP_D_F0(a0) + fsd f1, KVM_ARCH_FP_D_F1(a0) + fsd f2, KVM_ARCH_FP_D_F2(a0) + fsd f3, KVM_ARCH_FP_D_F3(a0) + fsd f4, KVM_ARCH_FP_D_F4(a0) + fsd f5, KVM_ARCH_FP_D_F5(a0) + fsd f6, KVM_ARCH_FP_D_F6(a0) + fsd f7, KVM_ARCH_FP_D_F7(a0) + fsd f8, KVM_ARCH_FP_D_F8(a0) + fsd f9, KVM_ARCH_FP_D_F9(a0) + fsd f10, KVM_ARCH_FP_D_F10(a0) + fsd f11, KVM_ARCH_FP_D_F11(a0) + fsd f12, KVM_ARCH_FP_D_F12(a0) + fsd f13, KVM_ARCH_FP_D_F13(a0) + fsd f14, KVM_ARCH_FP_D_F14(a0) + fsd f15, KVM_ARCH_FP_D_F15(a0) + fsd f16, KVM_ARCH_FP_D_F16(a0) + fsd f17, KVM_ARCH_FP_D_F17(a0) + fsd f18, KVM_ARCH_FP_D_F18(a0) + fsd f19, KVM_ARCH_FP_D_F19(a0) + fsd f20, KVM_ARCH_FP_D_F20(a0) + fsd f21, KVM_ARCH_FP_D_F21(a0) + fsd f22, KVM_ARCH_FP_D_F22(a0) + fsd f23, KVM_ARCH_FP_D_F23(a0) + fsd f24, KVM_ARCH_FP_D_F24(a0) + fsd f25, KVM_ARCH_FP_D_F25(a0) + fsd f26, KVM_ARCH_FP_D_F26(a0) + fsd f27, KVM_ARCH_FP_D_F27(a0) + fsd f28, KVM_ARCH_FP_D_F28(a0) + fsd f29, KVM_ARCH_FP_D_F29(a0) + fsd f30, KVM_ARCH_FP_D_F30(a0) + fsd f31, KVM_ARCH_FP_D_F31(a0) + sw t0, KVM_ARCH_FP_D_FCSR(a0) + csrw CSR_SSTATUS, t2 + ret + + .align 3 + .global __kvm_riscv_fp_f_restore +__kvm_riscv_fp_f_restore: + csrr t2, CSR_SSTATUS + li t1, SR_FS + lw t0, KVM_ARCH_FP_F_FCSR(a0) + csrs CSR_SSTATUS, t1 + flw f0, KVM_ARCH_FP_F_F0(a0) + flw f1, KVM_ARCH_FP_F_F1(a0) + flw f2, KVM_ARCH_FP_F_F2(a0) + flw f3, KVM_ARCH_FP_F_F3(a0) + flw f4, KVM_ARCH_FP_F_F4(a0) + flw f5, KVM_ARCH_FP_F_F5(a0) + flw f6, KVM_ARCH_FP_F_F6(a0) + flw f7, KVM_ARCH_FP_F_F7(a0) + flw f8, KVM_ARCH_FP_F_F8(a0) + flw f9, KVM_ARCH_FP_F_F9(a0) + flw f10, KVM_ARCH_FP_F_F10(a0) + flw f11, KVM_ARCH_FP_F_F11(a0) + flw f12, KVM_ARCH_FP_F_F12(a0) + flw f13, KVM_ARCH_FP_F_F13(a0) + flw f14, KVM_ARCH_FP_F_F14(a0) + flw f15, KVM_ARCH_FP_F_F15(a0) + flw f16, KVM_ARCH_FP_F_F16(a0) + flw f17, KVM_ARCH_FP_F_F17(a0) + flw f18, KVM_ARCH_FP_F_F18(a0) + flw f19, KVM_ARCH_FP_F_F19(a0) + flw f20, KVM_ARCH_FP_F_F20(a0) + flw f21, KVM_ARCH_FP_F_F21(a0) + flw f22, KVM_ARCH_FP_F_F22(a0) + flw f23, KVM_ARCH_FP_F_F23(a0) + flw f24, KVM_ARCH_FP_F_F24(a0) + flw f25, KVM_ARCH_FP_F_F25(a0) + flw f26, KVM_ARCH_FP_F_F26(a0) + flw f27, KVM_ARCH_FP_F_F27(a0) + flw f28, KVM_ARCH_FP_F_F28(a0) + flw f29, KVM_ARCH_FP_F_F29(a0) + flw f30, KVM_ARCH_FP_F_F30(a0) + flw f31, KVM_ARCH_FP_F_F31(a0) + fscsr t0 + csrw CSR_SSTATUS, t2 + ret + + .align 3 + .global __kvm_riscv_fp_d_restore +__kvm_riscv_fp_d_restore: + csrr t2, CSR_SSTATUS + li t1, SR_FS + lw t0, KVM_ARCH_FP_D_FCSR(a0) + csrs CSR_SSTATUS, t1 + fld f0, KVM_ARCH_FP_D_F0(a0) + fld f1, KVM_ARCH_FP_D_F1(a0) + fld f2, KVM_ARCH_FP_D_F2(a0) + fld f3, KVM_ARCH_FP_D_F3(a0) + fld f4, KVM_ARCH_FP_D_F4(a0) + fld f5, KVM_ARCH_FP_D_F5(a0) + fld f6, KVM_ARCH_FP_D_F6(a0) + fld f7, KVM_ARCH_FP_D_F7(a0) + fld f8, KVM_ARCH_FP_D_F8(a0) + fld f9, KVM_ARCH_FP_D_F9(a0) + fld f10, KVM_ARCH_FP_D_F10(a0) + fld f11, KVM_ARCH_FP_D_F11(a0) + fld f12, KVM_ARCH_FP_D_F12(a0) + fld f13, KVM_ARCH_FP_D_F13(a0) + fld f14, KVM_ARCH_FP_D_F14(a0) + fld f15, KVM_ARCH_FP_D_F15(a0) + fld f16, KVM_ARCH_FP_D_F16(a0) + fld f17, KVM_ARCH_FP_D_F17(a0) + fld f18, KVM_ARCH_FP_D_F18(a0) + fld f19, KVM_ARCH_FP_D_F19(a0) + fld f20, KVM_ARCH_FP_D_F20(a0) + fld f21, KVM_ARCH_FP_D_F21(a0) + fld f22, KVM_ARCH_FP_D_F22(a0) + fld f23, KVM_ARCH_FP_D_F23(a0) + fld f24, KVM_ARCH_FP_D_F24(a0) + fld f25, KVM_ARCH_FP_D_F25(a0) + fld f26, KVM_ARCH_FP_D_F26(a0) + fld f27, KVM_ARCH_FP_D_F27(a0) + fld f28, KVM_ARCH_FP_D_F28(a0) + fld f29, KVM_ARCH_FP_D_F29(a0) + fld f30, KVM_ARCH_FP_D_F30(a0) + fld f31, KVM_ARCH_FP_D_F31(a0) + fscsr t0 + csrw CSR_SSTATUS, t2 + ret +#endif diff --git a/arch/riscv/kvm/vcpu_timer.c b/arch/riscv/kvm/vcpu_timer.c new file mode 100644 index 000000000000..5c4c37ff2d48 --- /dev/null +++ b/arch/riscv/kvm/vcpu_timer.c @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Western Digital Corporation or its affiliates. + * + * Authors: + * Atish Patra <atish.patra@wdc.com> + */ + +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/kvm_host.h> +#include <linux/uaccess.h> +#include <clocksource/timer-riscv.h> +#include <asm/csr.h> +#include <asm/delay.h> +#include <asm/kvm_vcpu_timer.h> + +static u64 kvm_riscv_current_cycles(struct kvm_guest_timer *gt) +{ + return get_cycles64() + gt->time_delta; +} + +static u64 kvm_riscv_delta_cycles2ns(u64 cycles, + struct kvm_guest_timer *gt, + struct kvm_vcpu_timer *t) +{ + unsigned long flags; + u64 cycles_now, cycles_delta, delta_ns; + + local_irq_save(flags); + cycles_now = kvm_riscv_current_cycles(gt); + if (cycles_now < cycles) + cycles_delta = cycles - cycles_now; + else + cycles_delta = 0; + delta_ns = (cycles_delta * gt->nsec_mult) >> gt->nsec_shift; + local_irq_restore(flags); + + return delta_ns; +} + +static enum hrtimer_restart kvm_riscv_vcpu_hrtimer_expired(struct hrtimer *h) +{ + u64 delta_ns; + struct kvm_vcpu_timer *t = container_of(h, struct kvm_vcpu_timer, hrt); + struct kvm_vcpu *vcpu = container_of(t, struct kvm_vcpu, arch.timer); + struct kvm_guest_timer *gt = &vcpu->kvm->arch.timer; + + if (kvm_riscv_current_cycles(gt) < t->next_cycles) { + delta_ns = kvm_riscv_delta_cycles2ns(t->next_cycles, gt, t); + hrtimer_forward_now(&t->hrt, ktime_set(0, delta_ns)); + return HRTIMER_RESTART; + } + + t->next_set = false; + kvm_riscv_vcpu_set_interrupt(vcpu, IRQ_VS_TIMER); + + return HRTIMER_NORESTART; +} + +static int kvm_riscv_vcpu_timer_cancel(struct kvm_vcpu_timer *t) +{ + if (!t->init_done || !t->next_set) + return -EINVAL; + + hrtimer_cancel(&t->hrt); + t->next_set = false; + + return 0; +} + +int kvm_riscv_vcpu_timer_next_event(struct kvm_vcpu *vcpu, u64 ncycles) +{ + struct kvm_vcpu_timer *t = &vcpu->arch.timer; + struct kvm_guest_timer *gt = &vcpu->kvm->arch.timer; + u64 delta_ns; + + if (!t->init_done) + return -EINVAL; + + kvm_riscv_vcpu_unset_interrupt(vcpu, IRQ_VS_TIMER); + + delta_ns = kvm_riscv_delta_cycles2ns(ncycles, gt, t); + t->next_cycles = ncycles; + hrtimer_start(&t->hrt, ktime_set(0, delta_ns), HRTIMER_MODE_REL); + t->next_set = true; + + return 0; +} + +int kvm_riscv_vcpu_get_reg_timer(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + struct kvm_vcpu_timer *t = &vcpu->arch.timer; + struct kvm_guest_timer *gt = &vcpu->kvm->arch.timer; + u64 __user *uaddr = (u64 __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_TIMER); + u64 reg_val; + + if (KVM_REG_SIZE(reg->id) != sizeof(u64)) + return -EINVAL; + if (reg_num >= sizeof(struct kvm_riscv_timer) / sizeof(u64)) + return -EINVAL; + + switch (reg_num) { + case KVM_REG_RISCV_TIMER_REG(frequency): + reg_val = riscv_timebase; + break; + case KVM_REG_RISCV_TIMER_REG(time): + reg_val = kvm_riscv_current_cycles(gt); + break; + case KVM_REG_RISCV_TIMER_REG(compare): + reg_val = t->next_cycles; + break; + case KVM_REG_RISCV_TIMER_REG(state): + reg_val = (t->next_set) ? KVM_RISCV_TIMER_STATE_ON : + KVM_RISCV_TIMER_STATE_OFF; + break; + default: + return -EINVAL; + } + + if (copy_to_user(uaddr, ®_val, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + return 0; +} + +int kvm_riscv_vcpu_set_reg_timer(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + struct kvm_vcpu_timer *t = &vcpu->arch.timer; + struct kvm_guest_timer *gt = &vcpu->kvm->arch.timer; + u64 __user *uaddr = (u64 __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_TIMER); + u64 reg_val; + int ret = 0; + + if (KVM_REG_SIZE(reg->id) != sizeof(u64)) + return -EINVAL; + if (reg_num >= sizeof(struct kvm_riscv_timer) / sizeof(u64)) + return -EINVAL; + + if (copy_from_user(®_val, uaddr, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + switch (reg_num) { + case KVM_REG_RISCV_TIMER_REG(frequency): + ret = -EOPNOTSUPP; + break; + case KVM_REG_RISCV_TIMER_REG(time): + gt->time_delta = reg_val - get_cycles64(); + break; + case KVM_REG_RISCV_TIMER_REG(compare): + t->next_cycles = reg_val; + break; + case KVM_REG_RISCV_TIMER_REG(state): + if (reg_val == KVM_RISCV_TIMER_STATE_ON) + ret = kvm_riscv_vcpu_timer_next_event(vcpu, reg_val); + else + ret = kvm_riscv_vcpu_timer_cancel(t); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +int kvm_riscv_vcpu_timer_init(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_timer *t = &vcpu->arch.timer; + + if (t->init_done) + return -EINVAL; + + hrtimer_init(&t->hrt, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + t->hrt.function = kvm_riscv_vcpu_hrtimer_expired; + t->init_done = true; + t->next_set = false; + + return 0; +} + +int kvm_riscv_vcpu_timer_deinit(struct kvm_vcpu *vcpu) +{ + int ret; + + ret = kvm_riscv_vcpu_timer_cancel(&vcpu->arch.timer); + vcpu->arch.timer.init_done = false; + + return ret; +} + +int kvm_riscv_vcpu_timer_reset(struct kvm_vcpu *vcpu) +{ + return kvm_riscv_vcpu_timer_cancel(&vcpu->arch.timer); +} + +void kvm_riscv_vcpu_timer_restore(struct kvm_vcpu *vcpu) +{ + struct kvm_guest_timer *gt = &vcpu->kvm->arch.timer; + +#ifdef CONFIG_64BIT + csr_write(CSR_HTIMEDELTA, gt->time_delta); +#else + csr_write(CSR_HTIMEDELTA, (u32)(gt->time_delta)); + csr_write(CSR_HTIMEDELTAH, (u32)(gt->time_delta >> 32)); +#endif +} + +int kvm_riscv_guest_timer_init(struct kvm *kvm) +{ + struct kvm_guest_timer *gt = &kvm->arch.timer; + + riscv_cs_get_mult_shift(>->nsec_mult, >->nsec_shift); + gt->time_delta = -get_cycles64(); + + return 0; +} diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c new file mode 100644 index 000000000000..26399df15b63 --- /dev/null +++ b/arch/riscv/kvm/vm.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Western Digital Corporation or its affiliates. + * + * Authors: + * Anup Patel <anup.patel@wdc.com> + */ + +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/module.h> +#include <linux/uaccess.h> +#include <linux/kvm_host.h> + +const struct _kvm_stats_desc kvm_vm_stats_desc[] = { + KVM_GENERIC_VM_STATS() +}; +static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == + sizeof(struct kvm_vm_stat) / sizeof(u64)); + +const struct kvm_stats_header kvm_vm_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vm_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vm_stats_desc), +}; + +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) +{ + int r; + + r = kvm_riscv_stage2_alloc_pgd(kvm); + if (r) + return r; + + r = kvm_riscv_stage2_vmid_init(kvm); + if (r) { + kvm_riscv_stage2_free_pgd(kvm); + return r; + } + + return kvm_riscv_guest_timer_init(kvm); +} + +void kvm_arch_destroy_vm(struct kvm *kvm) +{ + int i; + + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + if (kvm->vcpus[i]) { + kvm_vcpu_destroy(kvm->vcpus[i]); + kvm->vcpus[i] = NULL; + } + } + atomic_set(&kvm->online_vcpus, 0); +} + +int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) +{ + int r; + + switch (ext) { + case KVM_CAP_IOEVENTFD: + case KVM_CAP_DEVICE_CTRL: + case KVM_CAP_USER_MEMORY: + case KVM_CAP_SYNC_MMU: + case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: + case KVM_CAP_ONE_REG: + case KVM_CAP_READONLY_MEM: + case KVM_CAP_MP_STATE: + case KVM_CAP_IMMEDIATE_EXIT: + r = 1; + break; + case KVM_CAP_NR_VCPUS: + r = num_online_cpus(); + break; + case KVM_CAP_MAX_VCPUS: + r = KVM_MAX_VCPUS; + break; + case KVM_CAP_NR_MEMSLOTS: + r = KVM_USER_MEM_SLOTS; + break; + default: + r = 0; + break; + } + + return r; +} + +long kvm_arch_vm_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + return -EINVAL; +} diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c new file mode 100644 index 000000000000..2c6253b293bc --- /dev/null +++ b/arch/riscv/kvm/vmid.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Western Digital Corporation or its affiliates. + * + * Authors: + * Anup Patel <anup.patel@wdc.com> + */ + +#include <linux/bitops.h> +#include <linux/cpumask.h> +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/module.h> +#include <linux/kvm_host.h> +#include <asm/csr.h> +#include <asm/sbi.h> + +static unsigned long vmid_version = 1; +static unsigned long vmid_next; +static unsigned long vmid_bits; +static DEFINE_SPINLOCK(vmid_lock); + +void kvm_riscv_stage2_vmid_detect(void) +{ + unsigned long old; + + /* Figure-out number of VMID bits in HW */ + old = csr_read(CSR_HGATP); + csr_write(CSR_HGATP, old | HGATP_VMID_MASK); + vmid_bits = csr_read(CSR_HGATP); + vmid_bits = (vmid_bits & HGATP_VMID_MASK) >> HGATP_VMID_SHIFT; + vmid_bits = fls_long(vmid_bits); + csr_write(CSR_HGATP, old); + + /* We polluted local TLB so flush all guest TLB */ + __kvm_riscv_hfence_gvma_all(); + + /* We don't use VMID bits if they are not sufficient */ + if ((1UL << vmid_bits) < num_possible_cpus()) + vmid_bits = 0; +} + +unsigned long kvm_riscv_stage2_vmid_bits(void) +{ + return vmid_bits; +} + +int kvm_riscv_stage2_vmid_init(struct kvm *kvm) +{ + /* Mark the initial VMID and VMID version invalid */ + kvm->arch.vmid.vmid_version = 0; + kvm->arch.vmid.vmid = 0; + + return 0; +} + +bool kvm_riscv_stage2_vmid_ver_changed(struct kvm_vmid *vmid) +{ + if (!vmid_bits) + return false; + + return unlikely(READ_ONCE(vmid->vmid_version) != + READ_ONCE(vmid_version)); +} + +void kvm_riscv_stage2_vmid_update(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_vcpu *v; + struct cpumask hmask; + struct kvm_vmid *vmid = &vcpu->kvm->arch.vmid; + + if (!kvm_riscv_stage2_vmid_ver_changed(vmid)) + return; + + spin_lock(&vmid_lock); + + /* + * We need to re-check the vmid_version here to ensure that if + * another vcpu already allocated a valid vmid for this vm. + */ + if (!kvm_riscv_stage2_vmid_ver_changed(vmid)) { + spin_unlock(&vmid_lock); + return; + } + + /* First user of a new VMID version? */ + if (unlikely(vmid_next == 0)) { + WRITE_ONCE(vmid_version, READ_ONCE(vmid_version) + 1); + vmid_next = 1; + + /* + * We ran out of VMIDs so we increment vmid_version and + * start assigning VMIDs from 1. + * + * This also means existing VMIDs assignement to all Guest + * instances is invalid and we have force VMID re-assignement + * for all Guest instances. The Guest instances that were not + * running will automatically pick-up new VMIDs because will + * call kvm_riscv_stage2_vmid_update() whenever they enter + * in-kernel run loop. For Guest instances that are already + * running, we force VM exits on all host CPUs using IPI and + * flush all Guest TLBs. + */ + riscv_cpuid_to_hartid_mask(cpu_online_mask, &hmask); + sbi_remote_hfence_gvma(cpumask_bits(&hmask), 0, 0); + } + + vmid->vmid = vmid_next; + vmid_next++; + vmid_next &= (1 << vmid_bits) - 1; + + WRITE_ONCE(vmid->vmid_version, READ_ONCE(vmid_version)); + + spin_unlock(&vmid_lock); + + /* Request stage2 page table update for all VCPUs */ + kvm_for_each_vcpu(i, v, vcpu->kvm) + kvm_make_request(KVM_REQ_UPDATE_HGATP, v); +} diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index b61426c9ef17..e43416950245 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1074,8 +1074,9 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, pte_t res; res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); + /* At this point the reference through the mapping is still present */ if (mm_is_protected(mm) && pte_present(res)) - uv_convert_from_secure(pte_val(res) & PAGE_MASK); + uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK); return res; } @@ -1091,8 +1092,9 @@ static inline pte_t ptep_clear_flush(struct vm_area_struct *vma, pte_t res; res = ptep_xchg_direct(vma->vm_mm, addr, ptep, __pte(_PAGE_INVALID)); + /* At this point the reference through the mapping is still present */ if (mm_is_protected(vma->vm_mm) && pte_present(res)) - uv_convert_from_secure(pte_val(res) & PAGE_MASK); + uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK); return res; } @@ -1116,8 +1118,9 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, } else { res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); } + /* At this point the reference through the mapping is still present */ if (mm_is_protected(mm) && pte_present(res)) - uv_convert_from_secure(pte_val(res) & PAGE_MASK); + uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK); return res; } diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index fe92a4caf5ec..72d3e49c2860 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -18,6 +18,11 @@ #include <asm/page.h> #include <asm/gmap.h> +#define UVC_CC_OK 0 +#define UVC_CC_ERROR 1 +#define UVC_CC_BUSY 2 +#define UVC_CC_PARTIAL 3 + #define UVC_RC_EXECUTED 0x0001 #define UVC_RC_INV_CMD 0x0002 #define UVC_RC_INV_STATE 0x0003 @@ -351,8 +356,9 @@ static inline int is_prot_virt_host(void) } int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb); -int uv_destroy_page(unsigned long paddr); +int uv_destroy_owned_page(unsigned long paddr); int uv_convert_from_secure(unsigned long paddr); +int uv_convert_owned_from_secure(unsigned long paddr); int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr); void setup_uv(void); @@ -360,7 +366,7 @@ void setup_uv(void); #define is_prot_virt_host() 0 static inline void setup_uv(void) {} -static inline int uv_destroy_page(unsigned long paddr) +static inline int uv_destroy_owned_page(unsigned long paddr) { return 0; } @@ -369,6 +375,11 @@ static inline int uv_convert_from_secure(unsigned long paddr) { return 0; } + +static inline int uv_convert_owned_from_secure(unsigned long paddr) +{ + return 0; +} #endif #endif /* _ASM_S390_UV_H */ diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index 5a656c7b7a67..8b0e62507d62 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -100,7 +100,7 @@ static int uv_pin_shared(unsigned long paddr) * * @paddr: Absolute host address of page to be destroyed */ -int uv_destroy_page(unsigned long paddr) +static int uv_destroy_page(unsigned long paddr) { struct uv_cb_cfs uvcb = { .header.cmd = UVC_CMD_DESTR_SEC_STOR, @@ -121,6 +121,22 @@ int uv_destroy_page(unsigned long paddr) } /* + * The caller must already hold a reference to the page + */ +int uv_destroy_owned_page(unsigned long paddr) +{ + struct page *page = phys_to_page(paddr); + int rc; + + get_page(page); + rc = uv_destroy_page(paddr); + if (!rc) + clear_bit(PG_arch_1, &page->flags); + put_page(page); + return rc; +} + +/* * Requests the Ultravisor to encrypt a guest page and make it * accessible to the host for paging (export). * @@ -140,6 +156,22 @@ int uv_convert_from_secure(unsigned long paddr) } /* + * The caller must already hold a reference to the page + */ +int uv_convert_owned_from_secure(unsigned long paddr) +{ + struct page *page = phys_to_page(paddr); + int rc; + + get_page(page); + rc = uv_convert_from_secure(paddr); + if (!rc) + clear_bit(PG_arch_1, &page->flags); + put_page(page); + return rc; +} + +/* * Calculate the expected ref_count for a page that would otherwise have no * further pins. This was cribbed from similar functions in other places in * the kernel, but with some slight modifications. We know that a secure @@ -165,7 +197,7 @@ static int make_secure_pte(pte_t *ptep, unsigned long addr, { pte_t entry = READ_ONCE(*ptep); struct page *page; - int expected, rc = 0; + int expected, cc = 0; if (!pte_present(entry)) return -ENXIO; @@ -181,12 +213,25 @@ static int make_secure_pte(pte_t *ptep, unsigned long addr, if (!page_ref_freeze(page, expected)) return -EBUSY; set_bit(PG_arch_1, &page->flags); - rc = uv_call(0, (u64)uvcb); + /* + * If the UVC does not succeed or fail immediately, we don't want to + * loop for long, or we might get stall notifications. + * On the other hand, this is a complex scenario and we are holding a lot of + * locks, so we can't easily sleep and reschedule. We try only once, + * and if the UVC returned busy or partial completion, we return + * -EAGAIN and we let the callers deal with it. + */ + cc = __uv_call(0, (u64)uvcb); page_ref_unfreeze(page, expected); - /* Return -ENXIO if the page was not mapped, -EINVAL otherwise */ - if (rc) - rc = uvcb->rc == 0x10a ? -ENXIO : -EINVAL; - return rc; + /* + * Return -ENXIO if the page was not mapped, -EINVAL for other errors. + * If busy or partially completed, return -EAGAIN. + */ + if (cc == UVC_CC_OK) + return 0; + else if (cc == UVC_CC_BUSY || cc == UVC_CC_PARTIAL) + return -EAGAIN; + return uvcb->rc == 0x10a ? -ENXIO : -EINVAL; } /* @@ -212,7 +257,7 @@ again: uaddr = __gmap_translate(gmap, gaddr); if (IS_ERR_VALUE(uaddr)) goto out; - vma = find_vma(gmap->mm, uaddr); + vma = vma_lookup(gmap->mm, uaddr); if (!vma) goto out; /* @@ -239,6 +284,10 @@ out: mmap_read_unlock(gmap->mm); if (rc == -EAGAIN) { + /* + * If we are here because the UVC returned busy or partial + * completion, this is just a useless check, but it is safe. + */ wait_on_page_writeback(page); } else if (rc == -EBUSY) { /* diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 2bd8f854f1b4..d07ff646d844 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -518,6 +518,11 @@ static int handle_pv_uvc(struct kvm_vcpu *vcpu) */ if (rc == -EINVAL) return 0; + /* + * If we got -EAGAIN here, we simply return it. It will eventually + * get propagated all the way to userspace, which should then try + * again. + */ return rc; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 1c97493d21e1..c6257f625929 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2487,8 +2487,8 @@ long kvm_arch_vm_ioctl(struct file *filp, case KVM_S390_PV_COMMAND: { struct kvm_pv_cmd args; - /* protvirt means user sigp */ - kvm->arch.user_cpu_state_ctrl = 1; + /* protvirt means user cpu state */ + kvm_s390_set_user_cpu_state_ctrl(kvm); r = 0; if (!is_prot_virt_host()) { r = -EINVAL; @@ -3802,7 +3802,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, vcpu_load(vcpu); /* user space knows about this interface - let it control the state */ - vcpu->kvm->arch.user_cpu_state_ctrl = 1; + kvm_s390_set_user_cpu_state_ctrl(vcpu->kvm); switch (mp_state->mp_state) { case KVM_MP_STATE_STOPPED: @@ -4255,6 +4255,7 @@ static void sync_regs_fmt2(struct kvm_vcpu *vcpu) if (kvm_run->kvm_dirty_regs & KVM_SYNC_DIAG318) { vcpu->arch.diag318_info.val = kvm_run->s.regs.diag318; vcpu->arch.sie_block->cpnc = vcpu->arch.diag318_info.cpnc; + VCPU_EVENT(vcpu, 3, "setting cpnc to %d", vcpu->arch.diag318_info.cpnc); } /* * If userspace sets the riccb (e.g. after migration) to a valid state, diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 52bc8fbaa60a..c07a050d757d 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -208,6 +208,15 @@ static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm) return kvm->arch.user_cpu_state_ctrl != 0; } +static inline void kvm_s390_set_user_cpu_state_ctrl(struct kvm *kvm) +{ + if (kvm->arch.user_cpu_state_ctrl) + return; + + VM_EVENT(kvm, 3, "%s", "ENABLE: Userspace CPU state control"); + kvm->arch.user_cpu_state_ctrl = 1; +} + /* implemented in pv.c */ int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 53da4ceb16a3..417154b314a6 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -397,6 +397,8 @@ static int handle_sske(struct kvm_vcpu *vcpu) mmap_read_unlock(current->mm); if (rc == -EFAULT) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + if (rc == -EAGAIN) + continue; if (rc < 0) return rc; start += PAGE_SIZE; diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index c8841f476e91..00d272d134c2 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -16,18 +16,17 @@ int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) { - int cc = 0; + int cc; - if (kvm_s390_pv_cpu_get_handle(vcpu)) { - cc = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), - UVC_CMD_DESTROY_SEC_CPU, rc, rrc); + if (!kvm_s390_pv_cpu_get_handle(vcpu)) + return 0; + + cc = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), UVC_CMD_DESTROY_SEC_CPU, rc, rrc); + + KVM_UV_EVENT(vcpu->kvm, 3, "PROTVIRT DESTROY VCPU %d: rc %x rrc %x", + vcpu->vcpu_id, *rc, *rrc); + WARN_ONCE(cc, "protvirt destroy cpu failed rc %x rrc %x", *rc, *rrc); - KVM_UV_EVENT(vcpu->kvm, 3, - "PROTVIRT DESTROY VCPU %d: rc %x rrc %x", - vcpu->vcpu_id, *rc, *rrc); - WARN_ONCE(cc, "protvirt destroy cpu failed rc %x rrc %x", - *rc, *rrc); - } /* Intended memory leak for something that should never happen. */ if (!cc) free_pages(vcpu->arch.pv.stor_base, @@ -196,7 +195,7 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) uvcb.conf_base_stor_origin = (u64)kvm->arch.pv.stor_base; uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var; - cc = uv_call(0, (u64)&uvcb); + cc = uv_call_sched(0, (u64)&uvcb); *rc = uvcb.header.rc; *rrc = uvcb.header.rrc; KVM_UV_EVENT(kvm, 3, "PROTVIRT CREATE VM: handle %llx len %llx rc %x rrc %x", diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 683036c1c92a..cf4de80bd541 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -151,22 +151,10 @@ static int __sigp_stop_and_store_status(struct kvm_vcpu *vcpu, static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter, u64 *status_reg) { - unsigned int i; - struct kvm_vcpu *v; - bool all_stopped = true; - - kvm_for_each_vcpu(i, v, vcpu->kvm) { - if (v == vcpu) - continue; - if (!is_vcpu_stopped(v)) - all_stopped = false; - } - *status_reg &= 0xffffffff00000000UL; /* Reject set arch order, with czam we're always in z/Arch mode. */ - *status_reg |= (all_stopped ? SIGP_STATUS_INVALID_PARAMETER : - SIGP_STATUS_INCORRECT_STATE); + *status_reg |= SIGP_STATUS_INVALID_PARAMETER; return SIGP_CC_STATUS_STORED; } diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 4d3b33ce81c6..dfee0ebb2fac 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -672,6 +672,7 @@ EXPORT_SYMBOL_GPL(gmap_fault); */ void __gmap_zap(struct gmap *gmap, unsigned long gaddr) { + struct vm_area_struct *vma; unsigned long vmaddr; spinlock_t *ptl; pte_t *ptep; @@ -681,11 +682,17 @@ void __gmap_zap(struct gmap *gmap, unsigned long gaddr) gaddr >> PMD_SHIFT); if (vmaddr) { vmaddr |= gaddr & ~PMD_MASK; + + vma = vma_lookup(gmap->mm, vmaddr); + if (!vma || is_vm_hugetlb_page(vma)) + return; + /* Get pointer to the page table entry */ ptep = get_locked_pte(gmap->mm, vmaddr, &ptl); - if (likely(ptep)) + if (likely(ptep)) { ptep_zap_unused(gmap->mm, vmaddr, ptep, 0); - pte_unmap_unlock(ptep, ptl); + pte_unmap_unlock(ptep, ptl); + } } } EXPORT_SYMBOL_GPL(__gmap_zap); @@ -2677,8 +2684,10 @@ static int __s390_reset_acc(pte_t *ptep, unsigned long addr, { pte_t pte = READ_ONCE(*ptep); + /* There is a reference through the mapping */ if (pte_present(pte)) - WARN_ON_ONCE(uv_destroy_page(pte_val(pte) & PAGE_MASK)); + WARN_ON_ONCE(uv_destroy_owned_page(pte_val(pte) & PAGE_MASK)); + return 0; } diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 034721a68d8f..c16232cd0ec5 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -429,22 +429,36 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, } #ifdef CONFIG_PGSTE -static pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr) +static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp) { + struct vm_area_struct *vma; pgd_t *pgd; p4d_t *p4d; pud_t *pud; - pmd_t *pmd; + + /* We need a valid VMA, otherwise this is clearly a fault. */ + vma = vma_lookup(mm, addr); + if (!vma) + return -EFAULT; pgd = pgd_offset(mm, addr); - p4d = p4d_alloc(mm, pgd, addr); - if (!p4d) - return NULL; - pud = pud_alloc(mm, p4d, addr); - if (!pud) - return NULL; - pmd = pmd_alloc(mm, pud, addr); - return pmd; + if (!pgd_present(*pgd)) + return -ENOENT; + + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) + return -ENOENT; + + pud = pud_offset(p4d, addr); + if (!pud_present(*pud)) + return -ENOENT; + + /* Large PUDs are not supported yet. */ + if (pud_large(*pud)) + return -EFAULT; + + *pmdp = pmd_offset(pud, addr); + return 0; } #endif @@ -778,14 +792,23 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp; pte_t *ptep; - pmdp = pmd_alloc_map(mm, addr); - if (unlikely(!pmdp)) + /* + * If we don't have a PTE table and if there is no huge page mapped, + * we can ignore attempts to set the key to 0, because it already is 0. + */ + switch (pmd_lookup(mm, addr, &pmdp)) { + case -ENOENT: + return key ? -EFAULT : 0; + case 0: + break; + default: return -EFAULT; + } ptl = pmd_lock(mm, pmdp); if (!pmd_present(*pmdp)) { spin_unlock(ptl); - return -EFAULT; + return key ? -EFAULT : 0; } if (pmd_large(*pmdp)) { @@ -801,10 +824,7 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, } spin_unlock(ptl); - ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl); - if (unlikely(!ptep)) - return -EFAULT; - + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); new = old = pgste_get_lock(ptep); pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT | PGSTE_ACC_BITS | PGSTE_FP_BIT); @@ -881,14 +901,23 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) pte_t *ptep; int cc = 0; - pmdp = pmd_alloc_map(mm, addr); - if (unlikely(!pmdp)) + /* + * If we don't have a PTE table and if there is no huge page mapped, + * the storage key is 0 and there is nothing for us to do. + */ + switch (pmd_lookup(mm, addr, &pmdp)) { + case -ENOENT: + return 0; + case 0: + break; + default: return -EFAULT; + } ptl = pmd_lock(mm, pmdp); if (!pmd_present(*pmdp)) { spin_unlock(ptl); - return -EFAULT; + return 0; } if (pmd_large(*pmdp)) { @@ -900,10 +929,7 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) } spin_unlock(ptl); - ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl); - if (unlikely(!ptep)) - return -EFAULT; - + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); new = old = pgste_get_lock(ptep); /* Reset guest reference bit only */ pgste_val(new) &= ~PGSTE_GR_BIT; @@ -935,15 +961,24 @@ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp; pte_t *ptep; - pmdp = pmd_alloc_map(mm, addr); - if (unlikely(!pmdp)) + /* + * If we don't have a PTE table and if there is no huge page mapped, + * the storage key is 0. + */ + *key = 0; + + switch (pmd_lookup(mm, addr, &pmdp)) { + case -ENOENT: + return 0; + case 0: + break; + default: return -EFAULT; + } ptl = pmd_lock(mm, pmdp); if (!pmd_present(*pmdp)) { - /* Not yet mapped memory has a zero key */ spin_unlock(ptl); - *key = 0; return 0; } @@ -956,10 +991,7 @@ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, } spin_unlock(ptl); - ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl); - if (unlikely(!ptep)) - return -EFAULT; - + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); pgste = pgste_get_lock(ptep); *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; paddr = pte_val(*ptep) & PAGE_MASK; @@ -988,6 +1020,7 @@ EXPORT_SYMBOL(get_guest_storage_key); int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, unsigned long *oldpte, unsigned long *oldpgste) { + struct vm_area_struct *vma; unsigned long pgstev; spinlock_t *ptl; pgste_t pgste; @@ -997,6 +1030,10 @@ int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, WARN_ON_ONCE(orc > ESSA_MAX); if (unlikely(orc > ESSA_MAX)) return -EINVAL; + + vma = vma_lookup(mm, hva); + if (!vma || is_vm_hugetlb_page(vma)) + return -EFAULT; ptep = get_locked_pte(mm, hva, &ptl); if (unlikely(!ptep)) return -EFAULT; @@ -1089,10 +1126,14 @@ EXPORT_SYMBOL(pgste_perform_essa); int set_pgste_bits(struct mm_struct *mm, unsigned long hva, unsigned long bits, unsigned long value) { + struct vm_area_struct *vma; spinlock_t *ptl; pgste_t new; pte_t *ptep; + vma = vma_lookup(mm, hva); + if (!vma || is_vm_hugetlb_page(vma)) + return -EFAULT; ptep = get_locked_pte(mm, hva, &ptl); if (unlikely(!ptep)) return -EFAULT; @@ -1117,9 +1158,13 @@ EXPORT_SYMBOL(set_pgste_bits); */ int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep) { + struct vm_area_struct *vma; spinlock_t *ptl; pte_t *ptep; + vma = vma_lookup(mm, hva); + if (!vma || is_vm_hugetlb_page(vma)) + return -EFAULT; ptep = get_locked_pte(mm, hva, &ptl); if (unlikely(!ptep)) return -EFAULT; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 32f300dade5e..2acf37cc1991 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -50,7 +50,7 @@ * so ratio of 4 should be enough. */ #define KVM_VCPU_ID_RATIO 4 -#define KVM_MAX_VCPU_ID (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO) +#define KVM_MAX_VCPU_IDS (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO) /* memory slots that are not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 3 @@ -407,6 +407,7 @@ struct kvm_mmu_root_info { #define KVM_HAVE_MMU_RWLOCK struct kvm_mmu_page; +struct kvm_page_fault; /* * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit, @@ -416,8 +417,7 @@ struct kvm_mmu_page; struct kvm_mmu { unsigned long (*get_guest_pgd)(struct kvm_vcpu *vcpu); u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index); - int (*page_fault)(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 err, - bool prefault); + int (*page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); void (*inject_page_fault)(struct kvm_vcpu *vcpu, struct x86_exception *fault); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t gva_or_gpa, @@ -499,7 +499,6 @@ struct kvm_pmu { u64 fixed_ctr_ctrl; u64 global_ctrl; u64 global_status; - u64 global_ovf_ctrl; u64 counter_bitmask[2]; u64 global_ctrl_mask; u64 global_ovf_ctrl_mask; @@ -581,7 +580,6 @@ struct kvm_vcpu_hv { struct kvm_hyperv_exit exit; struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT]; DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT); - cpumask_t tlb_flush; bool enforce_cpuid; struct { u32 features_eax; /* HYPERV_CPUID_FEATURES.EAX */ @@ -1073,7 +1071,7 @@ struct kvm_arch { atomic_t apic_map_dirty; /* Protects apic_access_memslot_enabled and apicv_inhibit_reasons */ - struct mutex apicv_update_lock; + struct rw_semaphore apicv_update_lock; bool apic_access_memslot_enabled; unsigned long apicv_inhibit_reasons; @@ -1087,17 +1085,23 @@ struct kvm_arch { unsigned long irq_sources_bitmap; s64 kvmclock_offset; + + /* + * This also protects nr_vcpus_matched_tsc which is read from a + * preemption-disabled region, so it must be a raw spinlock. + */ raw_spinlock_t tsc_write_lock; u64 last_tsc_nsec; u64 last_tsc_write; u32 last_tsc_khz; + u64 last_tsc_offset; u64 cur_tsc_nsec; u64 cur_tsc_write; u64 cur_tsc_offset; u64 cur_tsc_generation; int nr_vcpus_matched_tsc; - raw_spinlock_t pvclock_gtod_sync_lock; + seqcount_raw_spinlock_t pvclock_sc; bool use_master_clock; u64 master_kernel_ns; u64 master_cycle_now; @@ -1207,10 +1211,11 @@ struct kvm_arch { #endif /* CONFIG_X86_64 */ /* - * If set, rmaps have been allocated for all memslots and should be - * allocated for any newly created or modified memslots. + * If set, at least one shadow root has been allocated. This flag + * is used as one input when determining whether certain memslot + * related allocations are necessary. */ - bool memslots_have_rmaps; + bool shadow_root_allocated; #if IS_ENABLED(CONFIG_HYPERV) hpa_t hv_root_tdp; @@ -1296,6 +1301,8 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical) } struct kvm_x86_ops { + const char *name; + int (*hardware_enable)(void); void (*hardware_disable)(void); void (*hardware_unsetup)(void); @@ -1405,10 +1412,11 @@ struct kvm_x86_ops { void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu, u64 multiplier); /* - * Retrieve somewhat arbitrary exit information. Intended to be used - * only from within tracepoints to avoid VMREADs when tracing is off. + * Retrieve somewhat arbitrary exit information. Intended to + * be used only from within tracepoints or error paths. */ - void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2, + void (*get_exit_info)(struct kvm_vcpu *vcpu, u32 *reason, + u64 *info1, u64 *info2, u32 *exit_int_info, u32 *exit_int_info_err_code); int (*check_intercept)(struct kvm_vcpu *vcpu, @@ -1541,6 +1549,8 @@ static inline struct kvm *kvm_arch_alloc_vm(void) { return __vmalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); } + +#define __KVM_HAVE_ARCH_VM_FREE void kvm_arch_free_vm(struct kvm *kvm); #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB @@ -1657,6 +1667,9 @@ extern u64 kvm_mce_cap_supported; int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type); int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, void *insn, int insn_len); +void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, + u64 *data, u8 ndata); +void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu); void kvm_enable_efer_bits(u64); bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); @@ -1713,9 +1726,6 @@ void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); -int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gfn_t gfn, void *data, int offset, int len, - u32 access); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr); @@ -1864,7 +1874,6 @@ u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier); unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); -void kvm_make_mclock_inprogress_request(struct kvm *kvm); void kvm_make_scan_ioapic_request(struct kvm *kvm); void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, unsigned long *vcpu_bitmap); @@ -1933,6 +1942,9 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) int kvm_cpu_dirty_log_size(void); -int alloc_all_memslots_rmaps(struct kvm *kvm); +int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); + +#define KVM_CLOCK_VALID_FLAGS \ + (KVM_CLOCK_TSC_STABLE | KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC) #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h index 6a5f3acf2b33..9d4a3b1b25b9 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -49,8 +49,12 @@ struct kvm_page_track_notifier_node { int kvm_page_track_init(struct kvm *kvm); void kvm_page_track_cleanup(struct kvm *kvm); +bool kvm_page_track_write_tracking_enabled(struct kvm *kvm); +int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot); + void kvm_page_track_free_memslot(struct kvm_memory_slot *slot); -int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, +int kvm_page_track_create_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, unsigned long npages); void kvm_slot_page_track_add_page(struct kvm *kvm, @@ -59,8 +63,9 @@ void kvm_slot_page_track_add_page(struct kvm *kvm, void kvm_slot_page_track_remove_page(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, enum kvm_page_track_mode mode); -bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, - enum kvm_page_track_mode mode); +bool kvm_slot_page_track_is_active(struct kvm_vcpu *vcpu, + struct kvm_memory_slot *slot, gfn_t gfn, + enum kvm_page_track_mode mode); void kvm_page_track_register_notifier(struct kvm *kvm, diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 2ef1f6513c68..5a776a08f78c 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -504,4 +504,8 @@ struct kvm_pmu_event_filter { #define KVM_PMU_EVENT_ALLOW 0 #define KVM_PMU_EVENT_DENY 1 +/* for KVM_{GET,SET,HAS}_DEVICE_ATTR */ +#define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */ +#define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */ + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index e28f6a5d14f1..766ffe3ba313 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -291,8 +291,10 @@ void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)) { if (handler) kvm_posted_intr_wakeup_handler = handler; - else + else { kvm_posted_intr_wakeup_handler = dummy_handler; + synchronize_rcu(); + } } EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler); diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ac69894eab88..619186138176 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -129,4 +129,7 @@ config KVM_MMU_AUDIT This option adds a R/W kVM module parameter 'mmu_audit', which allows auditing of KVM MMU events at runtime. +config KVM_EXTERNAL_WRITE_TRACKING + bool + endif # VIRTUALIZATION diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 751aa85a3001..2d70edb0f323 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -53,9 +53,16 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted) return ret; } +/* + * This one is tied to SSB in the user API, and not + * visible in /proc/cpuinfo. + */ +#define KVM_X86_FEATURE_PSFD (13*32+28) /* Predictive Store Forwarding Disable */ + #define F feature_bit #define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0) + static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index) { @@ -500,7 +507,8 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_mask(CPUID_8000_0008_EBX, F(CLZERO) | F(XSAVEERPTR) | F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | - F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) + F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) | + __feature_bit(KVM_X86_FEATURE_PSFD) ); /* diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 9a144ca8e146..28b1a4e57827 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4222,6 +4222,11 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt) if (enable_vmware_backdoor && is_vmware_backdoor_pmc(rcx)) return X86EMUL_CONTINUE; + /* + * If CR4.PCE is set, the SDM requires CPL=0 or CR0.PE=0. The CR0.PE + * check however is unnecessary because CPL is always 0 outside + * protected mode. + */ if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || ctxt->ops->check_pmc(ctxt, rcx)) return emulate_gp(ctxt, 0); diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index d5124b520f76..4f15c0165c05 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -112,7 +112,7 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, if (!!auto_eoi_old == !!auto_eoi_new) return; - mutex_lock(&vcpu->kvm->arch.apicv_update_lock); + down_write(&vcpu->kvm->arch.apicv_update_lock); if (auto_eoi_new) hv->synic_auto_eoi_used++; @@ -123,7 +123,7 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, !hv->synic_auto_eoi_used, APICV_INHIBIT_REASON_HYPERV); - mutex_unlock(&vcpu->kvm->arch.apicv_update_lock); + up_write(&vcpu->kvm->arch.apicv_update_lock); } static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, @@ -1754,7 +1754,6 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool int i; gpa_t gpa; struct kvm *kvm = vcpu->kvm; - struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct hv_tlb_flush_ex flush_ex; struct hv_tlb_flush flush; u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; @@ -1836,18 +1835,19 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool } } - cpumask_clear(&hv_vcpu->tlb_flush); - - vcpu_mask = all_cpus ? NULL : - sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, - vp_bitmap, vcpu_bitmap); - /* * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't * analyze it here, flush TLB regardless of the specified address space. */ - kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST, - NULL, vcpu_mask, &hv_vcpu->tlb_flush); + if (all_cpus) { + kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH_GUEST); + } else { + vcpu_mask = sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, + vp_bitmap, vcpu_bitmap); + + kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST, + vcpu_mask); + } ret_success: /* We always do full TLB flush, set 'Reps completed' = 'Rep Count' */ diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 8c065da73f8e..816a82515dcd 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -96,7 +96,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic) { ioapic->rtc_status.pending_eoi = 0; - bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID + 1); + bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_IDS); } static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index bbd4a5d18b5d..e66e620c3bed 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -39,13 +39,13 @@ struct kvm_vcpu; struct dest_map { /* vcpu bitmap where IRQ has been sent */ - DECLARE_BITMAP(map, KVM_MAX_VCPU_ID + 1); + DECLARE_BITMAP(map, KVM_MAX_VCPU_IDS); /* * Vector sent to a given vcpu, only valid when * the vcpu's bit in map is set */ - u8 vectors[KVM_MAX_VCPU_ID + 1]; + u8 vectors[KVM_MAX_VCPU_IDS]; }; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index e9688a9f7b57..9ae6168d381e 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -44,9 +44,8 @@ #define PT32_ROOT_LEVEL 2 #define PT32E_ROOT_LEVEL 3 -#define KVM_MMU_CR4_ROLE_BITS (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | \ - X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE | \ - X86_CR4_LA57) +#define KVM_MMU_CR4_ROLE_BITS (X86_CR4_PSE | X86_CR4_PAE | X86_CR4_LA57 | \ + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE) #define KVM_MMU_CR0_ROLE_BITS (X86_CR0_PG | X86_CR0_WP) @@ -80,6 +79,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); +void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu); static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) { @@ -114,17 +114,91 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu) vcpu->arch.mmu->shadow_root_level); } -int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - bool prefault); +struct kvm_page_fault { + /* arguments to kvm_mmu_do_page_fault. */ + const gpa_t addr; + const u32 error_code; + const bool prefetch; + + /* Derived from error_code. */ + const bool exec; + const bool write; + const bool present; + const bool rsvd; + const bool user; + + /* Derived from mmu and global state. */ + const bool is_tdp; + const bool nx_huge_page_workaround_enabled; + + /* + * Whether a >4KB mapping can be created or is forbidden due to NX + * hugepages. + */ + bool huge_page_disallowed; + + /* + * Maximum page size that can be created for this fault; input to + * FNAME(fetch), __direct_map and kvm_tdp_mmu_map. + */ + u8 max_level; + + /* + * Page size that can be created based on the max_level and the + * page size used by the host mapping. + */ + u8 req_level; + + /* + * Page size that will be created based on the req_level and + * huge_page_disallowed. + */ + u8 goal_level; + + /* Shifted addr, or result of guest page table walk if addr is a gva. */ + gfn_t gfn; + + /* The memslot containing gfn. May be NULL. */ + struct kvm_memory_slot *slot; + + /* Outputs of kvm_faultin_pfn. */ + kvm_pfn_t pfn; + hva_t hva; + bool map_writable; +}; + +int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); + +extern int nx_huge_pages; +static inline bool is_nx_huge_page_enabled(void) +{ + return READ_ONCE(nx_huge_pages); +} static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u32 err, bool prefault) + u32 err, bool prefetch) { + struct kvm_page_fault fault = { + .addr = cr2_or_gpa, + .error_code = err, + .exec = err & PFERR_FETCH_MASK, + .write = err & PFERR_WRITE_MASK, + .present = err & PFERR_PRESENT_MASK, + .rsvd = err & PFERR_RSVD_MASK, + .user = err & PFERR_USER_MASK, + .prefetch = prefetch, + .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault), + .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(), + + .max_level = KVM_MAX_HUGEPAGE_LEVEL, + .req_level = PG_LEVEL_4K, + .goal_level = PG_LEVEL_4K, + }; #ifdef CONFIG_RETPOLINE - if (likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault)) - return kvm_tdp_page_fault(vcpu, cr2_or_gpa, err, prefault); + if (fault.is_tdp) + return kvm_tdp_page_fault(vcpu, &fault); #endif - return vcpu->arch.mmu->page_fault(vcpu, cr2_or_gpa, err, prefault); + return vcpu->arch.mmu->page_fault(vcpu, &fault); } /* @@ -230,14 +304,26 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); int kvm_mmu_post_init_vm(struct kvm *kvm); void kvm_mmu_pre_destroy_vm(struct kvm *kvm); -static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) +static inline bool kvm_shadow_root_allocated(struct kvm *kvm) { /* - * Read memslot_have_rmaps before rmap pointers. Hence, threads reading - * memslots_have_rmaps in any lock context are guaranteed to see the - * pointers. Pairs with smp_store_release in alloc_all_memslots_rmaps. + * Read shadow_root_allocated before related pointers. Hence, threads + * reading shadow_root_allocated in any lock context are guaranteed to + * see the pointers. Pairs with smp_store_release in + * mmu_first_shadow_root_alloc. */ - return smp_load_acquire(&kvm->arch.memslots_have_rmaps); + return smp_load_acquire(&kvm->arch.shadow_root_allocated); +} + +#ifdef CONFIG_X86_64 +static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; } +#else +static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; } +#endif + +static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) +{ + return !is_tdp_mmu_enabled(kvm) || kvm_shadow_root_allocated(kvm); } static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0cc58901bf7a..323b5057d08f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -58,6 +58,7 @@ extern bool itlb_multihit_kvm_mitigation; int __read_mostly nx_huge_pages = -1; +static uint __read_mostly nx_huge_pages_recovery_period_ms; #ifdef CONFIG_PREEMPT_RT /* Recovery can cause latency spikes, disable it for PREEMPT_RT. */ static uint __read_mostly nx_huge_pages_recovery_ratio = 0; @@ -66,23 +67,26 @@ static uint __read_mostly nx_huge_pages_recovery_ratio = 60; #endif static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); -static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp); +static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp); static const struct kernel_param_ops nx_huge_pages_ops = { .set = set_nx_huge_pages, .get = param_get_bool, }; -static const struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { - .set = set_nx_huge_pages_recovery_ratio, +static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = { + .set = set_nx_huge_pages_recovery_param, .get = param_get_uint, }; module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644); __MODULE_PARM_TYPE(nx_huge_pages, "bool"); -module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops, +module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops, &nx_huge_pages_recovery_ratio, 0644); __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint"); +module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops, + &nx_huge_pages_recovery_period_ms, 0644); +__MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint"); static bool __read_mostly force_flush_and_sync_on_reuse; module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644); @@ -1071,20 +1075,6 @@ static bool rmap_can_add(struct kvm_vcpu *vcpu) return kvm_mmu_memory_cache_nr_free_objects(mc); } -static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) -{ - struct kvm_memory_slot *slot; - struct kvm_mmu_page *sp; - struct kvm_rmap_head *rmap_head; - - sp = sptep_to_sp(spte); - kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); - return pte_list_add(vcpu, spte, rmap_head); -} - - static void rmap_remove(struct kvm *kvm, u64 *spte) { struct kvm_memslots *slots; @@ -1097,9 +1087,9 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); /* - * Unlike rmap_add and rmap_recycle, rmap_remove does not run in the - * context of a vCPU so have to determine which memslots to use based - * on context information in sp->role. + * Unlike rmap_add, rmap_remove does not run in the context of a vCPU + * so we have to determine which memslots to use based on context + * information in sp->role. */ slots = kvm_memslots_for_spte_role(kvm, sp->role); @@ -1639,19 +1629,23 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, #define RMAP_RECYCLE_THRESHOLD 1000 -static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) +static void rmap_add(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, + u64 *spte, gfn_t gfn) { - struct kvm_memory_slot *slot; - struct kvm_rmap_head *rmap_head; struct kvm_mmu_page *sp; + struct kvm_rmap_head *rmap_head; + int rmap_count; sp = sptep_to_sp(spte); - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); + rmap_count = pte_list_add(vcpu, spte, rmap_head); - kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0)); - kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, - KVM_PAGES_PER_HPAGE(sp->role.level)); + if (rmap_count > RMAP_RECYCLE_THRESHOLD) { + kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0)); + kvm_flush_remote_tlbs_with_address( + vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); + } } bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) @@ -1795,7 +1789,7 @@ static void mark_unsync(u64 *spte) static int nonpaging_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { - return 0; + return -1; } #define KVM_PAGE_ARRAY_NR 16 @@ -1909,12 +1903,14 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { - if (vcpu->arch.mmu->sync_page(vcpu, sp) == 0) { + int ret = vcpu->arch.mmu->sync_page(vcpu, sp); + + if (ret < 0) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return false; } - return true; + return !!ret; } static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm, @@ -1931,17 +1927,6 @@ static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm, return true; } -static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu, - struct list_head *invalid_list, - bool remote_flush, bool local_flush) -{ - if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush)) - return; - - if (local_flush) - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); -} - #ifdef CONFIG_KVM_MMU_AUDIT #include "mmu_audit.c" #else @@ -2044,7 +2029,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu, protected |= rmap_write_protect(vcpu, sp->gfn); if (protected) { - kvm_flush_remote_tlbs(vcpu->kvm); + kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true); flush = false; } @@ -2054,7 +2039,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu, mmu_pages_clear_parents(&parents); } if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) { - kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); + kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); if (!can_yield) { kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); return -EINTR; @@ -2065,7 +2050,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu, } } - kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); + kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); return 0; } @@ -2149,7 +2134,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, break; WARN_ON(!list_empty(&invalid_list)); - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + kvm_flush_remote_tlbs(vcpu->kvm); } __clear_sp_write_flooding_count(sp); @@ -2229,7 +2214,7 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator, u64 spte) { - if (is_last_spte(spte, iterator->level)) { + if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) { iterator->level = 0; return; } @@ -2591,7 +2576,8 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must * be write-protected. */ -int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) +int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, + gfn_t gfn, bool can_unsync, bool prefetch) { struct kvm_mmu_page *sp; bool locked = false; @@ -2601,7 +2587,7 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) * track machinery is used to write-protect upper-level shadow pages, * i.e. this guards the role.level == 4K assertion below! */ - if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) + if (kvm_slot_page_track_is_active(vcpu, slot, gfn, KVM_PAGE_TRACK_WRITE)) return -EPERM; /* @@ -2617,6 +2603,9 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) if (sp->unsync) continue; + if (prefetch) + return -EEXIST; + /* * TDP MMU page faults require an additional spinlock as they * run with mmu_lock held for read, not write, and the unsync @@ -2680,48 +2669,30 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) * (sp->unsync = true) * * The write barrier below ensures that 1.1 happens before 1.2 and thus - * the situation in 2.4 does not arise. The implicit barrier in 2.2 - * pairs with this write barrier. + * the situation in 2.4 does not arise. It pairs with the read barrier + * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3. */ smp_wmb(); return 0; } -static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned int pte_access, int level, - gfn_t gfn, kvm_pfn_t pfn, bool speculative, - bool can_unsync, bool host_writable) -{ - u64 spte; - struct kvm_mmu_page *sp; - int ret; - - sp = sptep_to_sp(sptep); - - ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative, - can_unsync, host_writable, sp_ad_disabled(sp), &spte); - - if (spte & PT_WRITABLE_MASK) - kvm_vcpu_mark_page_dirty(vcpu, gfn); - - if (*sptep == spte) - ret |= SET_SPTE_SPURIOUS; - else if (mmu_spte_update(sptep, spte)) - ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; - return ret; -} - -static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned int pte_access, bool write_fault, int level, - gfn_t gfn, kvm_pfn_t pfn, bool speculative, - bool host_writable) +static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, + u64 *sptep, unsigned int pte_access, gfn_t gfn, + kvm_pfn_t pfn, struct kvm_page_fault *fault) { + struct kvm_mmu_page *sp = sptep_to_sp(sptep); + int level = sp->role.level; int was_rmapped = 0; - int rmap_count; - int set_spte_ret; int ret = RET_PF_FIXED; bool flush = false; + bool wrprot; + u64 spte; + + /* Prefetching always gets a writable pfn. */ + bool host_writable = !fault || fault->map_writable; + bool prefetch = !fault || fault->prefetch; + bool write_fault = fault && fault->write; pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, *sptep, write_fault, gfn); @@ -2752,52 +2723,36 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, was_rmapped = 1; } - set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn, - speculative, true, host_writable); - if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) { + wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch, + true, host_writable, &spte); + + if (*sptep == spte) { + ret = RET_PF_SPURIOUS; + } else { + trace_kvm_mmu_set_spte(level, gfn, sptep); + flush |= mmu_spte_update(sptep, spte); + } + + if (wrprot) { if (write_fault) ret = RET_PF_EMULATE; - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } - if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush) + if (flush) kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, KVM_PAGES_PER_HPAGE(level)); - /* - * The fault is fully spurious if and only if the new SPTE and old SPTE - * are identical, and emulation is not required. - */ - if ((set_spte_ret & SET_SPTE_SPURIOUS) && ret == RET_PF_FIXED) { - WARN_ON_ONCE(!was_rmapped); - return RET_PF_SPURIOUS; - } - pgprintk("%s: setting spte %llx\n", __func__, *sptep); - trace_kvm_mmu_set_spte(level, gfn, sptep); if (!was_rmapped) { + WARN_ON_ONCE(ret == RET_PF_SPURIOUS); kvm_update_page_stats(vcpu->kvm, level, 1); - rmap_count = rmap_add(vcpu, sptep, gfn); - if (rmap_count > RMAP_RECYCLE_THRESHOLD) - rmap_recycle(vcpu, sptep, gfn); + rmap_add(vcpu, slot, sptep, gfn); } return ret; } -static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, - bool no_dirty_log) -{ - struct kvm_memory_slot *slot; - - slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); - if (!slot) - return KVM_PFN_ERR_FAULT; - - return gfn_to_pfn_memslot_atomic(slot, gfn); -} - static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *start, u64 *end) @@ -2818,8 +2773,8 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, return -1; for (i = 0; i < ret; i++, gfn++, start++) { - mmu_set_spte(vcpu, start, access, false, sp->role.level, gfn, - page_to_pfn(pages[i]), true, true); + mmu_set_spte(vcpu, slot, start, access, gfn, + page_to_pfn(pages[i]), NULL); put_page(pages[i]); } @@ -2842,11 +2797,13 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, if (!start) continue; if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) - break; + return; start = NULL; } else if (!start) start = spte; } + if (start) + direct_pte_prefetch_many(vcpu, sp, start, spte); } static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) @@ -2924,52 +2881,46 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, return min(host_level, max_level); } -int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, - int max_level, kvm_pfn_t *pfnp, - bool huge_page_disallowed, int *req_level) +void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - struct kvm_memory_slot *slot; - kvm_pfn_t pfn = *pfnp; + struct kvm_memory_slot *slot = fault->slot; kvm_pfn_t mask; - int level; - *req_level = PG_LEVEL_4K; + fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled; - if (unlikely(max_level == PG_LEVEL_4K)) - return PG_LEVEL_4K; + if (unlikely(fault->max_level == PG_LEVEL_4K)) + return; - if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn)) - return PG_LEVEL_4K; + if (is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn)) + return; - slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true); - if (!slot) - return PG_LEVEL_4K; + if (kvm_slot_dirty_track_enabled(slot)) + return; /* * Enforce the iTLB multihit workaround after capturing the requested * level, which will be used to do precise, accurate accounting. */ - *req_level = level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level); - if (level == PG_LEVEL_4K || huge_page_disallowed) - return PG_LEVEL_4K; + fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, + fault->gfn, fault->pfn, + fault->max_level); + if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) + return; /* * mmu_notifier_retry() was successful and mmu_lock is held, so * the pmd can't be split from under us. */ - mask = KVM_PAGES_PER_HPAGE(level) - 1; - VM_BUG_ON((gfn & mask) != (pfn & mask)); - *pfnp = pfn & ~mask; - - return level; + fault->goal_level = fault->req_level; + mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1; + VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask)); + fault->pfn &= ~mask; } -void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, - kvm_pfn_t *pfnp, int *goal_levelp) +void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level) { - int level = *goal_levelp; - - if (cur_level == level && level > PG_LEVEL_4K && + if (cur_level > PG_LEVEL_4K && + cur_level == fault->goal_level && is_shadow_present_pte(spte) && !is_large_pte(spte)) { /* @@ -2979,42 +2930,33 @@ void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, * patching back for them into pfn the next 9 bits of * the address. */ - u64 page_mask = KVM_PAGES_PER_HPAGE(level) - - KVM_PAGES_PER_HPAGE(level - 1); - *pfnp |= gfn & page_mask; - (*goal_levelp)--; + u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) - + KVM_PAGES_PER_HPAGE(cur_level - 1); + fault->pfn |= fault->gfn & page_mask; + fault->goal_level--; } } -static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - int map_writable, int max_level, kvm_pfn_t pfn, - bool prefault, bool is_tdp) +static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); - bool write = error_code & PFERR_WRITE_MASK; - bool exec = error_code & PFERR_FETCH_MASK; - bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; - int level, req_level, ret; - gfn_t gfn = gpa >> PAGE_SHIFT; - gfn_t base_gfn = gfn; + int ret; + gfn_t base_gfn = fault->gfn; - level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, - huge_page_disallowed, &req_level); + kvm_mmu_hugepage_adjust(vcpu, fault); - trace_kvm_mmu_spte_requested(gpa, level, pfn); - for_each_shadow_entry(vcpu, gpa, it) { + trace_kvm_mmu_spte_requested(fault); + for_each_shadow_entry(vcpu, fault->addr, it) { /* * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. */ - if (nx_huge_page_workaround_enabled) - disallowed_hugepage_adjust(*it.sptep, gfn, it.level, - &pfn, &level); + if (fault->nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(fault, *it.sptep, it.level); - base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); - if (it.level == level) + base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + if (it.level == fault->goal_level) break; drop_large_spte(vcpu, it.sptep); @@ -3025,14 +2967,16 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, it.level - 1, true, ACC_ALL); link_shadow_page(vcpu, it.sptep, sp); - if (is_tdp && huge_page_disallowed && - req_level >= it.level) + if (fault->is_tdp && fault->huge_page_disallowed && + fault->req_level >= it.level) account_huge_nx_page(vcpu->kvm, sp); } - ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL, - write, level, base_gfn, pfn, prefault, - map_writable); + if (WARN_ON_ONCE(it.level != fault->goal_level)) + return -EFAULT; + + ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL, + base_gfn, fault->pfn, fault); if (ret == RET_PF_SPURIOUS) return ret; @@ -3064,18 +3008,19 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) return -EFAULT; } -static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, - kvm_pfn_t pfn, unsigned int access, - int *ret_val) +static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, + unsigned int access, int *ret_val) { /* The pfn is invalid, report the error! */ - if (unlikely(is_error_pfn(pfn))) { - *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn); + if (unlikely(is_error_pfn(fault->pfn))) { + *ret_val = kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn); return true; } - if (unlikely(is_noslot_pfn(pfn))) { - vcpu_cache_mmio_info(vcpu, gva, gfn, + if (unlikely(!fault->slot)) { + gva_t gva = fault->is_tdp ? 0 : fault->addr; + + vcpu_cache_mmio_info(vcpu, gva, fault->gfn, access & shadow_mmio_access_mask); /* * If MMIO caching is disabled, emulate immediately without @@ -3091,18 +3036,17 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, return false; } -static bool page_fault_can_be_fast(u32 error_code) +static bool page_fault_can_be_fast(struct kvm_page_fault *fault) { /* * Do not fix the mmio spte with invalid generation number which * need to be updated by slow page fault path. */ - if (unlikely(error_code & PFERR_RSVD_MASK)) + if (fault->rsvd) return false; /* See if the page fault is due to an NX violation */ - if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)) - == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)))) + if (unlikely(fault->exec && fault->present)) return false; /* @@ -3119,9 +3063,7 @@ static bool page_fault_can_be_fast(u32 error_code) * accesses to a present page. */ - return shadow_acc_track_mask != 0 || - ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)) - == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)); + return shadow_acc_track_mask != 0 || (fault->write && fault->present); } /* @@ -3129,13 +3071,9 @@ static bool page_fault_can_be_fast(u32 error_code) * someone else modified the SPTE from its original value. */ static bool -fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, +fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, u64 *sptep, u64 old_spte, u64 new_spte) { - gfn_t gfn; - - WARN_ON(!sp->role.direct); - /* * Theoretically we could also set dirty bit (and flush TLB) here in * order to eliminate unnecessary PML logging. See comments in @@ -3151,24 +3089,18 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (cmpxchg64(sptep, old_spte, new_spte) != old_spte) return false; - if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) { - /* - * The gfn of direct spte is stable since it is - * calculated by sp->gfn. - */ - gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); - kvm_vcpu_mark_page_dirty(vcpu, gfn); - } + if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) + mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn); return true; } -static bool is_access_allowed(u32 fault_err_code, u64 spte) +static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte) { - if (fault_err_code & PFERR_FETCH_MASK) + if (fault->exec) return is_executable_pte(spte); - if (fault_err_code & PFERR_WRITE_MASK) + if (fault->write) return is_writable_pte(spte); /* Fault was on Read access */ @@ -3193,9 +3125,6 @@ static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte) for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) { sptep = iterator.sptep; *spte = old_spte; - - if (!is_shadow_present_pte(old_spte)) - break; } return sptep; @@ -3204,7 +3133,7 @@ static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte) /* * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. */ -static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) +static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_mmu_page *sp; int ret = RET_PF_INVALID; @@ -3212,7 +3141,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) u64 *sptep = NULL; uint retry_count = 0; - if (!page_fault_can_be_fast(error_code)) + if (!page_fault_can_be_fast(fault)) return ret; walk_shadow_page_lockless_begin(vcpu); @@ -3221,9 +3150,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) u64 new_spte; if (is_tdp_mmu(vcpu->arch.mmu)) - sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, gpa, &spte); + sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte); else - sptep = fast_pf_get_last_sptep(vcpu, gpa, &spte); + sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte); if (!is_shadow_present_pte(spte)) break; @@ -3242,7 +3171,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) * Need not check the access of upper level table entries since * they are always ACC_ALL. */ - if (is_access_allowed(error_code, spte)) { + if (is_access_allowed(fault, spte)) { ret = RET_PF_SPURIOUS; break; } @@ -3257,7 +3186,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) * be removed in the fast path only if the SPTE was * write-protected for dirty-logging or access tracking. */ - if ((error_code & PFERR_WRITE_MASK) && + if (fault->write && spte_can_locklessly_be_made_writable(spte)) { new_spte |= PT_WRITABLE_MASK; @@ -3278,7 +3207,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) /* Verify that the fault can be handled in the fast path */ if (new_spte == spte || - !is_access_allowed(error_code, new_spte)) + !is_access_allowed(fault, new_spte)) break; /* @@ -3286,7 +3215,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) * since the gfn is not stable for indirect shadow page. See * Documentation/virt/kvm/locking.rst to get more detail. */ - if (fast_pf_fix_direct_spte(vcpu, sp, sptep, spte, new_spte)) { + if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) { ret = RET_PF_FIXED; break; } @@ -3299,7 +3228,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) } while (true); - trace_fast_page_fault(vcpu, gpa, error_code, sptep, spte, ret); + trace_fast_page_fault(vcpu, fault, sptep, spte, ret); walk_shadow_page_lockless_end(vcpu); return ret; @@ -3472,6 +3401,67 @@ out_unlock: return r; } +static int mmu_first_shadow_root_alloc(struct kvm *kvm) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *slot; + int r = 0, i; + + /* + * Check if this is the first shadow root being allocated before + * taking the lock. + */ + if (kvm_shadow_root_allocated(kvm)) + return 0; + + mutex_lock(&kvm->slots_arch_lock); + + /* Recheck, under the lock, whether this is the first shadow root. */ + if (kvm_shadow_root_allocated(kvm)) + goto out_unlock; + + /* + * Check if anything actually needs to be allocated, e.g. all metadata + * will be allocated upfront if TDP is disabled. + */ + if (kvm_memslots_have_rmaps(kvm) && + kvm_page_track_write_tracking_enabled(kvm)) + goto out_success; + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(slot, slots) { + /* + * Both of these functions are no-ops if the target is + * already allocated, so unconditionally calling both + * is safe. Intentionally do NOT free allocations on + * failure to avoid having to track which allocations + * were made now versus when the memslot was created. + * The metadata is guaranteed to be freed when the slot + * is freed, and will be kept/used if userspace retries + * KVM_RUN instead of killing the VM. + */ + r = memslot_rmap_alloc(slot, slot->npages); + if (r) + goto out_unlock; + r = kvm_page_track_write_tracking_alloc(slot); + if (r) + goto out_unlock; + } + } + + /* + * Ensure that shadow_root_allocated becomes true strictly after + * all the related pointers are set. + */ +out_success: + smp_store_release(&kvm->arch.shadow_root_allocated, true); + +out_unlock: + mutex_unlock(&kvm->slots_arch_lock); + return r; +} + static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; @@ -3502,7 +3492,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) } } - r = alloc_all_memslots_rmaps(vcpu->kvm); + r = mmu_first_shadow_root_alloc(vcpu->kvm); if (r) return r; @@ -3653,6 +3643,33 @@ err_pml4: #endif } +static bool is_unsync_root(hpa_t root) +{ + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(root)) + return false; + + /* + * The read barrier orders the CPU's read of SPTE.W during the page table + * walk before the reads of sp->unsync/sp->unsync_children here. + * + * Even if another CPU was marking the SP as unsync-ed simultaneously, + * any guest page table changes are not guaranteed to be visible anyway + * until this VCPU issues a TLB flush strictly after those changes are + * made. We only need to ensure that the other CPU sets these flags + * before any actual changes to the page tables are made. The comments + * in mmu_try_to_unsync_pages() describe what could go wrong if this + * requirement isn't satisfied. + */ + smp_rmb(); + sp = to_shadow_page(root); + if (sp->unsync || sp->unsync_children) + return true; + + return false; +} + void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) { int i; @@ -3670,18 +3687,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) hpa_t root = vcpu->arch.mmu->root_hpa; sp = to_shadow_page(root); - /* - * Even if another CPU was marking the SP as unsync-ed - * simultaneously, any guest page table changes are not - * guaranteed to be visible anyway until this VCPU issues a TLB - * flush strictly after those changes are made. We only need to - * ensure that the other CPU sets these flags before any actual - * changes to the page tables are made. The comments in - * mmu_try_to_unsync_pages() describe what could go wrong if - * this requirement isn't satisfied. - */ - if (!smp_load_acquire(&sp->unsync) && - !smp_load_acquire(&sp->unsync_children)) + if (!is_unsync_root(root)) return; write_lock(&vcpu->kvm->mmu_lock); @@ -3711,6 +3717,19 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) write_unlock(&vcpu->kvm->mmu_lock); } +void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu) +{ + unsigned long roots_to_free = 0; + int i; + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (is_unsync_root(vcpu->arch.mmu->prev_roots[i].hpa)) + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); + + /* sync prev_roots by simply freeing them */ + kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); +} + static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr, u32 access, struct x86_exception *exception) { @@ -3763,9 +3782,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level spte = mmu_spte_get_lockless(iterator.sptep); sptes[leaf] = spte; - - if (!is_shadow_present_pte(spte)) - break; } return leaf; @@ -3856,20 +3872,19 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) } static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, - u32 error_code, gfn_t gfn) + struct kvm_page_fault *fault) { - if (unlikely(error_code & PFERR_RSVD_MASK)) + if (unlikely(fault->rsvd)) return false; - if (!(error_code & PFERR_PRESENT_MASK) || - !(error_code & PFERR_WRITE_MASK)) + if (!fault->present || !fault->write) return false; /* * guest is writing the page which is write tracked which can * not be fixed by page fault handler. */ - if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) + if (kvm_slot_page_track_is_active(vcpu, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE)) return true; return false; @@ -3881,11 +3896,8 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) u64 spte; walk_shadow_page_lockless_begin(vcpu); - for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { + for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) clear_sp_write_flooding_count(iterator.sptep); - if (!is_shadow_present_pte(spte)) - break; - } walk_shadow_page_lockless_end(vcpu); } @@ -3903,11 +3915,9 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } -static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva, - bool write, bool *writable, int *r) +static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int *r) { - struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + struct kvm_memory_slot *slot = fault->slot; bool async; /* @@ -3921,8 +3931,9 @@ static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, if (!kvm_is_visible_memslot(slot)) { /* Don't expose private memslots to L2. */ if (is_guest_mode(vcpu)) { - *pfn = KVM_PFN_NOSLOT; - *writable = false; + fault->slot = NULL; + fault->pfn = KVM_PFN_NOSLOT; + fault->map_writable = false; return false; } /* @@ -3939,46 +3950,46 @@ static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, } async = false; - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, - write, writable, hva); + fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async, + fault->write, &fault->map_writable, + &fault->hva); if (!async) return false; /* *pfn has correct page already */ - if (!prefault && kvm_can_do_async_pf(vcpu)) { - trace_kvm_try_async_get_page(cr2_or_gpa, gfn); - if (kvm_find_async_pf_gfn(vcpu, gfn)) { - trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn); + if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) { + trace_kvm_try_async_get_page(fault->addr, fault->gfn); + if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) { + trace_kvm_async_pf_doublefault(fault->addr, fault->gfn); kvm_make_request(KVM_REQ_APF_HALT, vcpu); goto out_retry; - } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn)) + } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) goto out_retry; } - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, - write, writable, hva); + fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL, + fault->write, &fault->map_writable, + &fault->hva); + return false; out_retry: *r = RET_PF_RETRY; return true; } -static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - bool prefault, int max_level, bool is_tdp) +static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu); - bool write = error_code & PFERR_WRITE_MASK; - bool map_writable; - gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; - kvm_pfn_t pfn; - hva_t hva; int r; - if (page_fault_handle_page_track(vcpu, error_code, gfn)) + fault->gfn = fault->addr >> PAGE_SHIFT; + fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn); + + if (page_fault_handle_page_track(vcpu, fault)) return RET_PF_EMULATE; - r = fast_page_fault(vcpu, gpa, error_code); + r = fast_page_fault(vcpu, fault); if (r != RET_PF_INVALID) return r; @@ -3989,11 +4000,10 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (kvm_faultin_pfn(vcpu, prefault, gfn, gpa, &pfn, &hva, - write, &map_writable, &r)) + if (kvm_faultin_pfn(vcpu, fault, &r)) return r; - if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r)) + if (handle_abnormal_pfn(vcpu, fault, ACC_ALL, &r)) return r; r = RET_PF_RETRY; @@ -4003,36 +4013,34 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, else write_lock(&vcpu->kvm->mmu_lock); - if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) + if (fault->slot && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva)) goto out_unlock; r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; if (is_tdp_mmu_fault) - r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level, - pfn, prefault); + r = kvm_tdp_mmu_map(vcpu, fault); else - r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn, - prefault, is_tdp); + r = __direct_map(vcpu, fault); out_unlock: if (is_tdp_mmu_fault) read_unlock(&vcpu->kvm->mmu_lock); else write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(pfn); + kvm_release_pfn_clean(fault->pfn); return r; } -static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, - u32 error_code, bool prefault) +static int nonpaging_page_fault(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) { - pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code); + pgprintk("%s: gva %lx error %x\n", __func__, fault->addr, fault->error_code); /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */ - return direct_page_fault(vcpu, gpa & PAGE_MASK, error_code, prefault, - PG_LEVEL_2M, false); + fault->max_level = PG_LEVEL_2M; + return direct_page_fault(vcpu, fault); } int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, @@ -4068,23 +4076,19 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, } EXPORT_SYMBOL_GPL(kvm_handle_page_fault); -int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - bool prefault) +int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - int max_level; - - for (max_level = KVM_MAX_HUGEPAGE_LEVEL; - max_level > PG_LEVEL_4K; - max_level--) { - int page_num = KVM_PAGES_PER_HPAGE(max_level); - gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1); + while (fault->max_level > PG_LEVEL_4K) { + int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); + gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1); if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) break; + + --fault->max_level; } - return direct_page_fault(vcpu, gpa, error_code, prefault, - max_level, true); + return direct_page_fault(vcpu, fault); } static void nonpaging_init_context(struct kvm_mmu *context) @@ -4205,7 +4209,7 @@ static unsigned long get_cr3(struct kvm_vcpu *vcpu) } static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, - unsigned int access, int *nr_present) + unsigned int access) { if (unlikely(is_mmio_spte(*sptep))) { if (gfn != get_mmio_spte_gfn(*sptep)) { @@ -4213,7 +4217,6 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, return true; } - (*nr_present)++; mark_mmio_spte(vcpu, sptep, gfn, access); return true; } @@ -5212,7 +5215,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, LIST_HEAD(invalid_list); u64 entry, gentry, *spte; int npte; - bool remote_flush, local_flush; + bool flush = false; /* * If we don't have indirect shadow pages, it means no page is @@ -5221,8 +5224,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) return; - remote_flush = local_flush = false; - pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); /* @@ -5251,18 +5252,17 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (!spte) continue; - local_flush = true; while (npte--) { entry = *spte; mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL); if (gentry && sp->role.level != PG_LEVEL_4K) ++vcpu->kvm->stat.mmu_pde_zapped; if (need_remote_flush(entry, *spte)) - remote_flush = true; + flush = true; ++spte; } } - kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush); + kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); write_unlock(&vcpu->kvm->mmu_lock); } @@ -5473,8 +5473,8 @@ slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot, } static __always_inline bool -slot_handle_leaf(struct kvm *kvm, const struct kvm_memory_slot *memslot, - slot_level_handler fn, bool flush_on_yield) +slot_handle_level_4k(struct kvm *kvm, const struct kvm_memory_slot *memslot, + slot_level_handler fn, bool flush_on_yield) { return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield); @@ -5694,13 +5694,7 @@ void kvm_mmu_init_vm(struct kvm *kvm) spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); - if (!kvm_mmu_init_tdp_mmu(kvm)) - /* - * No smp_load/store wrappers needed here as we are in - * VM init and there cannot be any memslots / other threads - * accessing this struct kvm yet. - */ - kvm->arch.memslots_have_rmaps = true; + kvm_mmu_init_tdp_mmu(kvm); node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; @@ -5716,55 +5710,58 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) kvm_mmu_uninit_tdp_mmu(kvm); } +static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) +{ + const struct kvm_memory_slot *memslot; + struct kvm_memslots *slots; + bool flush = false; + gfn_t start, end; + int i; + + if (!kvm_memslots_have_rmaps(kvm)) + return flush; + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(memslot, slots) { + start = max(gfn_start, memslot->base_gfn); + end = min(gfn_end, memslot->base_gfn + memslot->npages); + if (start >= end) + continue; + + flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, + PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, + start, end - 1, true, flush); + } + } + + return flush; +} + /* * Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end * (not including it) */ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; + bool flush; int i; - bool flush = false; write_lock(&kvm->mmu_lock); kvm_inc_notifier_count(kvm, gfn_start, gfn_end); - if (kvm_memslots_have_rmaps(kvm)) { - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - slots = __kvm_memslots(kvm, i); - kvm_for_each_memslot(memslot, slots) { - gfn_t start, end; - - start = max(gfn_start, memslot->base_gfn); - end = min(gfn_end, memslot->base_gfn + memslot->npages); - if (start >= end) - continue; - - flush = slot_handle_level_range(kvm, - (const struct kvm_memory_slot *) memslot, - kvm_zap_rmapp, PG_LEVEL_4K, - KVM_MAX_HUGEPAGE_LEVEL, start, - end - 1, true, flush); - } - } - if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, - gfn_end - gfn_start); - } + flush = __kvm_zap_rmaps(kvm, gfn_start, gfn_end); if (is_tdp_mmu_enabled(kvm)) { for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start, gfn_end, flush); - if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, - gfn_end - gfn_start); } if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, + gfn_end - gfn_start); kvm_dec_notifier_count(kvm, gfn_start, gfn_end); @@ -5860,7 +5857,12 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); - flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true); + /* + * Zap only 4k SPTEs since the legacy MMU only supports dirty + * logging at a 4k granularity and never creates collapsible + * 2m SPTEs during dirty logging. + */ + flush = slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true); if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, slot); write_unlock(&kvm->mmu_lock); @@ -5897,8 +5899,11 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); - flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, - false); + /* + * Clear dirty bits only on 4k SPTEs since the legacy MMU only + * support dirty logging at a 4k granularity. + */ + flush = slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false); write_unlock(&kvm->mmu_lock); } @@ -6176,18 +6181,24 @@ void kvm_mmu_module_exit(void) mmu_audit_disable(); } -static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp) +static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp) { - unsigned int old_val; + bool was_recovery_enabled, is_recovery_enabled; + uint old_period, new_period; int err; - old_val = nx_huge_pages_recovery_ratio; + was_recovery_enabled = nx_huge_pages_recovery_ratio; + old_period = nx_huge_pages_recovery_period_ms; + err = param_set_uint(val, kp); if (err) return err; - if (READ_ONCE(nx_huge_pages) && - !old_val && nx_huge_pages_recovery_ratio) { + is_recovery_enabled = nx_huge_pages_recovery_ratio; + new_period = nx_huge_pages_recovery_period_ms; + + if (READ_ONCE(nx_huge_pages) && is_recovery_enabled && + (!was_recovery_enabled || old_period > new_period)) { struct kvm *kvm; mutex_lock(&kvm_lock); @@ -6250,8 +6261,17 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) static long get_nx_lpage_recovery_timeout(u64 start_time) { - return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio) - ? start_time + 60 * HZ - get_jiffies_64() + uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio); + uint period = READ_ONCE(nx_huge_pages_recovery_period_ms); + + if (!period && ratio) { + /* Make sure the period is not less than one second. */ + ratio = min(ratio, 3600u); + period = 60 * 60 * 1000 / ratio; + } + + return READ_ONCE(nx_huge_pages) && ratio + ? start_time + msecs_to_jiffies(period) - get_jiffies_64() : MAX_SCHEDULE_TIMEOUT; } diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index bf2bdbf333c2..52c6527b1a06 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -118,13 +118,8 @@ static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) kvm_x86_ops.cpu_dirty_log_size; } -extern int nx_huge_pages; -static inline bool is_nx_huge_page_enabled(void) -{ - return READ_ONCE(nx_huge_pages); -} - -int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync); +int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, + gfn_t gfn, bool can_unsync, bool prefetch); void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); @@ -155,19 +150,11 @@ enum { RET_PF_SPURIOUS, }; -/* Bits which may be returned by set_spte() */ -#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) -#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) -#define SET_SPTE_SPURIOUS BIT(2) - int kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, kvm_pfn_t pfn, int max_level); -int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, - int max_level, kvm_pfn_t *pfnp, - bool huge_page_disallowed, int *req_level); -void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, - kvm_pfn_t *pfnp, int *goal_levelp); +void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); +void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index 2924a4081a19..b8151bbca36a 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -252,9 +252,9 @@ TRACE_EVENT( TRACE_EVENT( fast_page_fault, - TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code, + TP_PROTO(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, u64 *sptep, u64 old_spte, int ret), - TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, ret), + TP_ARGS(vcpu, fault, sptep, old_spte, ret), TP_STRUCT__entry( __field(int, vcpu_id) @@ -268,8 +268,8 @@ TRACE_EVENT( TP_fast_assign( __entry->vcpu_id = vcpu->vcpu_id; - __entry->cr2_or_gpa = cr2_or_gpa; - __entry->error_code = error_code; + __entry->cr2_or_gpa = fault->addr; + __entry->error_code = fault->error_code; __entry->sptep = sptep; __entry->old_spte = old_spte; __entry->new_spte = *sptep; @@ -367,8 +367,8 @@ TRACE_EVENT( TRACE_EVENT( kvm_mmu_spte_requested, - TP_PROTO(gpa_t addr, int level, kvm_pfn_t pfn), - TP_ARGS(addr, level, pfn), + TP_PROTO(struct kvm_page_fault *fault), + TP_ARGS(fault), TP_STRUCT__entry( __field(u64, gfn) @@ -377,9 +377,9 @@ TRACE_EVENT( ), TP_fast_assign( - __entry->gfn = addr >> PAGE_SHIFT; - __entry->pfn = pfn | (__entry->gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); - __entry->level = level; + __entry->gfn = fault->gfn; + __entry->pfn = fault->pfn | (fault->gfn & (KVM_PAGES_PER_HPAGE(fault->goal_level) - 1)); + __entry->level = fault->goal_level; ), TP_printk("gfn %llx pfn %llx level %d", diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 21427e84a82e..cc4eb5b7fb76 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -19,6 +19,12 @@ #include "mmu.h" #include "mmu_internal.h" +bool kvm_page_track_write_tracking_enabled(struct kvm *kvm) +{ + return IS_ENABLED(CONFIG_KVM_EXTERNAL_WRITE_TRACKING) || + !tdp_enabled || kvm_shadow_root_allocated(kvm); +} + void kvm_page_track_free_memslot(struct kvm_memory_slot *slot) { int i; @@ -29,12 +35,17 @@ void kvm_page_track_free_memslot(struct kvm_memory_slot *slot) } } -int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, +int kvm_page_track_create_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, unsigned long npages) { - int i; + int i; for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { + if (i == KVM_PAGE_TRACK_WRITE && + !kvm_page_track_write_tracking_enabled(kvm)) + continue; + slot->arch.gfn_track[i] = kvcalloc(npages, sizeof(*slot->arch.gfn_track[i]), GFP_KERNEL_ACCOUNT); @@ -57,6 +68,21 @@ static inline bool page_track_mode_is_valid(enum kvm_page_track_mode mode) return true; } +int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot) +{ + unsigned short *gfn_track; + + if (slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE]) + return 0; + + gfn_track = kvcalloc(slot->npages, sizeof(*gfn_track), GFP_KERNEL_ACCOUNT); + if (gfn_track == NULL) + return -ENOMEM; + + slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE] = gfn_track; + return 0; +} + static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn, enum kvm_page_track_mode mode, short count) { @@ -92,6 +118,10 @@ void kvm_slot_page_track_add_page(struct kvm *kvm, if (WARN_ON(!page_track_mode_is_valid(mode))) return; + if (WARN_ON(mode == KVM_PAGE_TRACK_WRITE && + !kvm_page_track_write_tracking_enabled(kvm))) + return; + update_gfn_track(slot, gfn, mode, 1); /* @@ -126,6 +156,10 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm, if (WARN_ON(!page_track_mode_is_valid(mode))) return; + if (WARN_ON(mode == KVM_PAGE_TRACK_WRITE && + !kvm_page_track_write_tracking_enabled(kvm))) + return; + update_gfn_track(slot, gfn, mode, -1); /* @@ -139,19 +173,22 @@ EXPORT_SYMBOL_GPL(kvm_slot_page_track_remove_page); /* * check if the corresponding access on the specified guest page is tracked. */ -bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, - enum kvm_page_track_mode mode) +bool kvm_slot_page_track_is_active(struct kvm_vcpu *vcpu, + struct kvm_memory_slot *slot, gfn_t gfn, + enum kvm_page_track_mode mode) { - struct kvm_memory_slot *slot; int index; if (WARN_ON(!page_track_mode_is_valid(mode))) return false; - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (!slot) return false; + if (mode == KVM_PAGE_TRACK_WRITE && + !kvm_page_track_write_tracking_enabled(vcpu->kvm)) + return false; + index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K); return !!READ_ONCE(slot->arch.gfn_track[mode][index]); } diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 913d52a7923e..f87d36898c44 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -561,6 +561,7 @@ static bool FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, pt_element_t gpte, bool no_dirty_log) { + struct kvm_memory_slot *slot; unsigned pte_access; gfn_t gfn; kvm_pfn_t pfn; @@ -573,30 +574,21 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, gfn = gpte_to_gfn(gpte); pte_access = sp->role.access & FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); - pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, + + slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log && (pte_access & ACC_WRITE_MASK)); - if (is_error_pfn(pfn)) + if (!slot) return false; - /* - * we call mmu_set_spte() with host_writable = true because - * pte_prefetch_gfn_to_pfn always gets a writable pfn. - */ - mmu_set_spte(vcpu, spte, pte_access, false, PG_LEVEL_4K, gfn, pfn, - true, true); + pfn = gfn_to_pfn_memslot_atomic(slot, gfn); + if (is_error_pfn(pfn)) + return false; + mmu_set_spte(vcpu, slot, spte, pte_access, gfn, pfn, NULL); kvm_release_pfn_clean(pfn); return true; } -static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *spte, const void *pte) -{ - pt_element_t gpte = *(const pt_element_t *)pte; - - FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false); -} - static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, struct guest_walker *gw, int level) { @@ -663,21 +655,16 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, * If the guest tries to write a write-protected page, we need to * emulate this operation, return 1 to indicate this case. */ -static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, - struct guest_walker *gw, u32 error_code, - int max_level, kvm_pfn_t pfn, bool map_writable, - bool prefault) +static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, + struct guest_walker *gw) { - bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); - bool write_fault = error_code & PFERR_WRITE_MASK; - bool exec = error_code & PFERR_FETCH_MASK; - bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; unsigned int direct_access, access; - int top_level, level, req_level, ret; - gfn_t base_gfn = gw->gfn; + int top_level, ret; + gfn_t base_gfn = fault->gfn; + WARN_ON_ONCE(gw->gfn != base_gfn); direct_access = gw->pte_access; top_level = vcpu->arch.mmu->root_level; @@ -695,7 +682,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) goto out_gpte_changed; - for (shadow_walk_init(&it, vcpu, addr); + for (shadow_walk_init(&it, vcpu, fault->addr); shadow_walk_okay(&it) && it.level > gw->level; shadow_walk_next(&it)) { gfn_t table_gfn; @@ -707,7 +694,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, if (!is_shadow_present_pte(*it.sptep)) { table_gfn = gw->table_gfn[it.level - 2]; access = gw->pt_access[it.level - 2]; - sp = kvm_mmu_get_page(vcpu, table_gfn, addr, + sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr, it.level-1, false, access); /* * We must synchronize the pagetable before linking it @@ -741,10 +728,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, link_shadow_page(vcpu, it.sptep, sp); } - level = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn, - huge_page_disallowed, &req_level); + kvm_mmu_hugepage_adjust(vcpu, fault); - trace_kvm_mmu_spte_requested(addr, gw->level, pfn); + trace_kvm_mmu_spte_requested(fault); for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { clear_sp_write_flooding_count(it.sptep); @@ -753,12 +739,11 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. */ - if (nx_huge_page_workaround_enabled) - disallowed_hugepage_adjust(*it.sptep, gw->gfn, it.level, - &pfn, &level); + if (fault->nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(fault, *it.sptep, it.level); - base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); - if (it.level == level) + base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + if (it.level == fault->goal_level) break; validate_direct_spte(vcpu, it.sptep, direct_access); @@ -766,16 +751,20 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, drop_large_spte(vcpu, it.sptep); if (!is_shadow_present_pte(*it.sptep)) { - sp = kvm_mmu_get_page(vcpu, base_gfn, addr, + sp = kvm_mmu_get_page(vcpu, base_gfn, fault->addr, it.level - 1, true, direct_access); link_shadow_page(vcpu, it.sptep, sp); - if (huge_page_disallowed && req_level >= it.level) + if (fault->huge_page_disallowed && + fault->req_level >= it.level) account_huge_nx_page(vcpu->kvm, sp); } } - ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, - it.level, base_gfn, pfn, prefault, map_writable); + if (WARN_ON_ONCE(it.level != fault->goal_level)) + return -EFAULT; + + ret = mmu_set_spte(vcpu, fault->slot, it.sptep, gw->pte_access, + base_gfn, fault->pfn, fault); if (ret == RET_PF_SPURIOUS) return ret; @@ -841,45 +830,40 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, * Returns: 1 if we need to emulate the instruction, 0 otherwise, or * a negative value on error. */ -static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, - bool prefault) +static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - bool write_fault = error_code & PFERR_WRITE_MASK; - bool user_fault = error_code & PFERR_USER_MASK; struct guest_walker walker; int r; - kvm_pfn_t pfn; - hva_t hva; unsigned long mmu_seq; - bool map_writable, is_self_change_mapping; - int max_level; + bool is_self_change_mapping; - pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); + pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code); + WARN_ON_ONCE(fault->is_tdp); /* + * Look up the guest pte for the faulting address. * If PFEC.RSVD is set, this is a shadow page fault. * The bit needs to be cleared before walking guest page tables. */ - error_code &= ~PFERR_RSVD_MASK; - - /* - * Look up the guest pte for the faulting address. - */ - r = FNAME(walk_addr)(&walker, vcpu, addr, error_code); + r = FNAME(walk_addr)(&walker, vcpu, fault->addr, + fault->error_code & ~PFERR_RSVD_MASK); /* * The page is not mapped by the guest. Let the guest handle it. */ if (!r) { pgprintk("%s: guest page fault\n", __func__); - if (!prefault) + if (!fault->prefetch) kvm_inject_emulated_page_fault(vcpu, &walker.fault); return RET_PF_RETRY; } - if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) { - shadow_page_table_clear_flood(vcpu, addr); + fault->gfn = walker.gfn; + fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn); + + if (page_fault_handle_page_track(vcpu, fault)) { + shadow_page_table_clear_flood(vcpu, fault->addr); return RET_PF_EMULATE; } @@ -890,29 +874,28 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, vcpu->arch.write_fault_to_shadow_pgtable = false; is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, - &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); + &walker, fault->user, &vcpu->arch.write_fault_to_shadow_pgtable); if (is_self_change_mapping) - max_level = PG_LEVEL_4K; + fault->max_level = PG_LEVEL_4K; else - max_level = walker.level; + fault->max_level = walker.level; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (kvm_faultin_pfn(vcpu, prefault, walker.gfn, addr, &pfn, &hva, - write_fault, &map_writable, &r)) + if (kvm_faultin_pfn(vcpu, fault, &r)) return r; - if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r)) + if (handle_abnormal_pfn(vcpu, fault, walker.pte_access, &r)) return r; /* * Do not change pte_access if the pfn is a mmio page, otherwise * we will cache the incorrect access into mmio spte. */ - if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) && - !is_cr0_wp(vcpu->arch.mmu) && !user_fault && !is_noslot_pfn(pfn)) { + if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) && + !is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) { walker.pte_access |= ACC_WRITE_MASK; walker.pte_access &= ~ACC_USER_MASK; @@ -928,20 +911,19 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, r = RET_PF_RETRY; write_lock(&vcpu->kvm->mmu_lock); - if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) + if (fault->slot && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva)) goto out_unlock; kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; - r = FNAME(fetch)(vcpu, addr, &walker, error_code, max_level, pfn, - map_writable, prefault); + r = FNAME(fetch)(vcpu, fault, &walker); kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(pfn); + kvm_release_pfn_clean(fault->pfn); return r; } @@ -1007,10 +989,10 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) sizeof(pt_element_t))) break; - FNAME(update_pte)(vcpu, sp, sptep, &gpte); + FNAME(prefetch_gpte)(vcpu, sp, sptep, gpte, false); } - if (!is_shadow_present_pte(*sptep) || !sp->unsync_children) + if (!sp->unsync_children) break; } write_unlock(&vcpu->kvm->mmu_lock); @@ -1066,14 +1048,19 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr, * Using the cached information from sp->gfns is safe because: * - The spte has a reference to the struct page, so the pfn for a given gfn * can't change unless all sptes pointing to it are nuked first. + * + * Returns + * < 0: the sp should be zapped + * 0: the sp is synced and no tlb flushing is required + * > 0: the sp is synced and tlb flushing is required */ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { union kvm_mmu_page_role mmu_role = vcpu->arch.mmu->mmu_role.base; - int i, nr_present = 0; + int i; bool host_writable; gpa_t first_pte_gpa; - int set_spte_ret = 0; + bool flush = false; /* * Ignore various flags when verifying that it's safe to sync a shadow @@ -1098,11 +1085,13 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) */ if (WARN_ON_ONCE(sp->role.direct || (sp->role.word ^ mmu_role.word) & ~sync_role_ign.word)) - return 0; + return -1; first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + u64 *sptep, spte; + struct kvm_memory_slot *slot; unsigned pte_access; pt_element_t gpte; gpa_t pte_gpa; @@ -1115,10 +1104,10 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, sizeof(pt_element_t))) - return 0; + return -1; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { - set_spte_ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; + flush = true; continue; } @@ -1127,30 +1116,27 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) pte_access &= FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); - if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access, - &nr_present)) + if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access)) continue; if (gfn != sp->gfns[i]) { drop_spte(vcpu->kvm, &sp->spt[i]); - set_spte_ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; + flush = true; continue; } - nr_present++; - - host_writable = sp->spt[i] & shadow_host_writable_mask; + sptep = &sp->spt[i]; + spte = *sptep; + host_writable = spte & shadow_host_writable_mask; + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + make_spte(vcpu, sp, slot, pte_access, gfn, + spte_to_pfn(spte), spte, true, false, + host_writable, &spte); - set_spte_ret |= set_spte(vcpu, &sp->spt[i], - pte_access, PG_LEVEL_4K, - gfn, spte_to_pfn(sp->spt[i]), - true, false, host_writable); + flush |= mmu_spte_update(sptep, spte); } - if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH) - kvm_flush_remote_tlbs(vcpu->kvm); - - return nr_present; + return flush; } #undef pt_element_t diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 3e97cdb13eb7..0c76c45fdb68 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -89,15 +89,17 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) E820_TYPE_RAM); } -int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, - gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative, - bool can_unsync, bool host_writable, bool ad_disabled, - u64 *new_spte) +bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + struct kvm_memory_slot *slot, + unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, + u64 old_spte, bool prefetch, bool can_unsync, + bool host_writable, u64 *new_spte) { + int level = sp->role.level; u64 spte = SPTE_MMU_PRESENT_MASK; - int ret = 0; + bool wrprot = false; - if (ad_disabled) + if (sp->role.ad_disabled) spte |= SPTE_TDP_AD_DISABLED_MASK; else if (kvm_vcpu_ad_need_write_protect(vcpu)) spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK; @@ -109,7 +111,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, * read access. See FNAME(gpte_access) in paging_tmpl.h. */ spte |= shadow_present_mask; - if (!speculative) + if (!prefetch) spte |= spte_shadow_accessed_mask(spte); if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && @@ -150,7 +152,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, * is responsibility of kvm_mmu_get_page / kvm_mmu_sync_roots. * Same reasoning can be applied to dirty page accounting. */ - if (!can_unsync && is_writable_pte(old_spte)) + if (is_writable_pte(old_spte)) goto out; /* @@ -159,10 +161,10 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, * e.g. it's write-tracked (upper-level SPs) or has one or more * shadow pages and unsync'ing pages is not allowed. */ - if (mmu_try_to_unsync_pages(vcpu, gfn, can_unsync)) { + if (mmu_try_to_unsync_pages(vcpu, slot, gfn, can_unsync, prefetch)) { pgprintk("%s: found shadow page for %llx, marking ro\n", __func__, gfn); - ret |= SET_SPTE_WRITE_PROTECTED_PT; + wrprot = true; pte_access &= ~ACC_WRITE_MASK; spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); } @@ -171,16 +173,22 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, if (pte_access & ACC_WRITE_MASK) spte |= spte_shadow_dirty_mask(spte); - if (speculative) +out: + if (prefetch) spte = mark_spte_for_access_track(spte); -out: WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level), "spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level, get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level)); + if ((spte & PT_WRITABLE_MASK) && kvm_slot_dirty_track_enabled(slot)) { + /* Enforced by kvm_mmu_hugepage_adjust. */ + WARN_ON(level > PG_LEVEL_4K); + mark_page_dirty_in_slot(vcpu->kvm, slot, gfn); + } + *new_spte = spte; - return ret; + return wrprot; } u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index eb7b227fc6cf..cc432f9a966b 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -310,12 +310,7 @@ static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, u64 spte, int level) { - /* - * Use a bitwise-OR instead of a logical-OR to aggregate the reserved - * bits and EPT's invalid memtype/XWR checks to avoid an extra Jcc - * (this is extremely unlikely to be short-circuited as true). - */ - return __is_bad_mt_xwr(rsvd_check, spte) | + return __is_bad_mt_xwr(rsvd_check, spte) || __is_rsvd_bits_set(rsvd_check, spte, level); } @@ -334,15 +329,11 @@ static inline u64 get_mmio_spte_generation(u64 spte) return gen; } -/* Bits which may be returned by set_spte() */ -#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) -#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) -#define SET_SPTE_SPURIOUS BIT(2) - -int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, - gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative, - bool can_unsync, bool host_writable, bool ad_disabled, - u64 *new_spte); +bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + struct kvm_memory_slot *slot, + unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, + u64 old_spte, bool prefetch, bool can_unsync, + bool host_writable, u64 *new_spte); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); u64 mark_spte_for_access_track(u64 spte); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 64ccfc1fa553..7c5dd83e52de 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -167,6 +167,7 @@ static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu, role.direct = true; role.gpte_is_8_bytes = true; role.access = ACC_ALL; + role.ad_disabled = !shadow_accessed_mask; return role; } @@ -489,8 +490,8 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, } /* - * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically - * and handle the associated bookkeeping, but do not mark the page dirty + * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically + * and handle the associated bookkeeping. Do not mark the page dirty * in KVM's dirty bitmaps. * * @kvm: kvm instance @@ -499,9 +500,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, * Returns: true if the SPTE was set, false if it was not. If false is returned, * this function will have no side-effects. */ -static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) +static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) { lockdep_assert_held_read(&kvm->mmu_lock); @@ -527,43 +528,6 @@ static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm, return true; } -/* - * tdp_mmu_map_set_spte_atomic - Set a leaf TDP MMU SPTE atomically to resolve a - * TDP page fault. - * - * @vcpu: The vcpu instance that took the TDP page fault. - * @iter: a tdp_iter instance currently on the SPTE that should be set - * @new_spte: The value the SPTE should be set to - * - * Returns: true if the SPTE was set, false if it was not. If false is returned, - * this function will have no side-effects. - */ -static inline bool tdp_mmu_map_set_spte_atomic(struct kvm_vcpu *vcpu, - struct tdp_iter *iter, - u64 new_spte) -{ - struct kvm *kvm = vcpu->kvm; - - if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte)) - return false; - - /* - * Use kvm_vcpu_gfn_to_memslot() instead of going through - * handle_changed_spte_dirty_log() to leverage vcpu->last_used_slot. - */ - if (is_writable_pte(new_spte)) { - struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, iter->gfn); - - if (slot && kvm_slot_dirty_track_enabled(slot)) { - /* Enforced by kvm_mmu_hugepage_adjust. */ - WARN_ON_ONCE(iter->level > PG_LEVEL_4K); - mark_page_dirty_in_slot(kvm, slot, iter->gfn); - } - } - - return true; -} - static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, struct tdp_iter *iter) { @@ -573,7 +537,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, * immediately installing a present entry in its place * before the TLBs are flushed. */ - if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, REMOVED_SPTE)) + if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE)) return false; kvm_flush_remote_tlbs_with_address(kvm, iter->gfn, @@ -929,26 +893,26 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) * Installs a last-level SPTE to handle a TDP page fault. * (NPT/EPT violation/misconfiguration) */ -static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, - int map_writable, - struct tdp_iter *iter, - kvm_pfn_t pfn, bool prefault) +static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault, + struct tdp_iter *iter) { + struct kvm_mmu_page *sp = sptep_to_sp(iter->sptep); u64 new_spte; int ret = RET_PF_FIXED; - int make_spte_ret = 0; + bool wrprot = false; - if (unlikely(is_noslot_pfn(pfn))) + WARN_ON(sp->role.level != fault->goal_level); + if (unlikely(!fault->slot)) new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); else - make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn, - pfn, iter->old_spte, prefault, true, - map_writable, !shadow_accessed_mask, - &new_spte); + wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn, + fault->pfn, iter->old_spte, fault->prefetch, true, + fault->map_writable, &new_spte); if (new_spte == iter->old_spte) ret = RET_PF_SPURIOUS; - else if (!tdp_mmu_map_set_spte_atomic(vcpu, iter, new_spte)) + else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) return RET_PF_RETRY; /* @@ -956,10 +920,9 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, * protected, emulation is needed. If the emulation was skipped, * the vCPU would have the same fault again. */ - if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) { - if (write) + if (wrprot) { + if (fault->write) ret = RET_PF_EMULATE; - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ @@ -986,37 +949,26 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing * page tables and SPTEs to translate the faulting guest physical address. */ -int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - int map_writable, int max_level, kvm_pfn_t pfn, - bool prefault) +int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); - bool write = error_code & PFERR_WRITE_MASK; - bool exec = error_code & PFERR_FETCH_MASK; - bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_mmu *mmu = vcpu->arch.mmu; struct tdp_iter iter; struct kvm_mmu_page *sp; u64 *child_pt; u64 new_spte; int ret; - gfn_t gfn = gpa >> PAGE_SHIFT; - int level; - int req_level; - level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, - huge_page_disallowed, &req_level); + kvm_mmu_hugepage_adjust(vcpu, fault); - trace_kvm_mmu_spte_requested(gpa, level, pfn); + trace_kvm_mmu_spte_requested(fault); rcu_read_lock(); - tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { - if (nx_huge_page_workaround_enabled) - disallowed_hugepage_adjust(iter.old_spte, gfn, - iter.level, &pfn, &level); + tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) { + if (fault->nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(fault, iter.old_spte, iter.level); - if (iter.level == level) + if (iter.level == fault->goal_level) break; /* @@ -1052,10 +1004,10 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, new_spte = make_nonleaf_spte(child_pt, !shadow_accessed_mask); - if (tdp_mmu_set_spte_atomic_no_dirty_log(vcpu->kvm, &iter, new_spte)) { + if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte)) { tdp_mmu_link_page(vcpu->kvm, sp, - huge_page_disallowed && - req_level >= iter.level); + fault->huge_page_disallowed && + fault->req_level >= iter.level); trace_kvm_mmu_get_page(sp, true); } else { @@ -1065,13 +1017,12 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, } } - if (iter.level != level) { + if (iter.level != fault->goal_level) { rcu_read_unlock(); return RET_PF_RETRY; } - ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter, - pfn, prefault); + ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter); rcu_read_unlock(); return ret; @@ -1241,8 +1192,7 @@ retry: new_spte = iter.old_spte & ~PT_WRITABLE_MASK; - if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter, - new_spte)) { + if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) { /* * The iter must explicitly re-read the SPTE because * the atomic cmpxchg failed. @@ -1310,8 +1260,7 @@ retry: continue; } - if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter, - new_spte)) { + if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) { /* * The iter must explicitly re-read the SPTE because * the atomic cmpxchg failed. diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 358f447d4012..476b133544dd 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -48,9 +48,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm); void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm); void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm); -int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - int map_writable, int max_level, kvm_pfn_t pfn, - bool prefault); +int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, bool flush); @@ -92,7 +90,6 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, #ifdef CONFIG_X86_64 bool kvm_mmu_init_tdp_mmu(struct kvm *kvm); void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); -static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; } static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } static inline bool is_tdp_mmu(struct kvm_mmu *mmu) @@ -114,7 +111,6 @@ static inline bool is_tdp_mmu(struct kvm_mmu *mmu) #else static inline bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return false; } static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} -static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; } static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; } #endif diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 510b833cbd39..f8b7bc04b3e7 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -238,6 +238,18 @@ static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size) kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1); } +static bool nested_svm_check_tlb_ctl(struct kvm_vcpu *vcpu, u8 tlb_ctl) +{ + /* Nested FLUSHBYASID is not supported yet. */ + switch(tlb_ctl) { + case TLB_CONTROL_DO_NOTHING: + case TLB_CONTROL_FLUSH_ALL_ASID: + return true; + default: + return false; + } +} + static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu, struct vmcb_control_area *control) { @@ -257,6 +269,9 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu, IOPM_SIZE))) return false; + if (CC(!nested_svm_check_tlb_ctl(vcpu, control->tlb_ctl))) + return false; + return true; } @@ -538,8 +553,17 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) if (nested_npt_enabled(svm)) nested_svm_init_mmu_context(vcpu); - svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset = - vcpu->arch.l1_tsc_offset + svm->nested.ctl.tsc_offset; + vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( + vcpu->arch.l1_tsc_offset, + svm->nested.ctl.tsc_offset, + svm->tsc_ratio_msr); + + svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset; + + if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { + WARN_ON(!svm->tsc_scaling_enabled); + nested_svm_update_tsc_ratio_msr(vcpu); + } svm->vmcb->control.int_ctl = (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) | @@ -550,9 +574,6 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) svm->vmcb->control.event_inj = svm->nested.ctl.event_inj; svm->vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err; - svm->vmcb->control.pause_filter_count = svm->nested.ctl.pause_filter_count; - svm->vmcb->control.pause_filter_thresh = svm->nested.ctl.pause_filter_thresh; - nested_svm_transition_tlb_flush(vcpu); /* Enter Guest-Mode */ @@ -810,11 +831,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.event_inj = svm->nested.ctl.event_inj; vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err; - vmcb12->control.pause_filter_count = - svm->vmcb->control.pause_filter_count; - vmcb12->control.pause_filter_thresh = - svm->vmcb->control.pause_filter_thresh; - nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); svm_switch_vmcb(svm, &svm->vmcb01); @@ -832,6 +848,12 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } + if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { + WARN_ON(!svm->tsc_scaling_enabled); + vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; + svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio); + } + svm->nested.ctl.nested_cr3 = 0; /* @@ -1219,6 +1241,16 @@ int nested_svm_exit_special(struct vcpu_svm *svm) return NESTED_EXIT_CONTINUE; } +void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + vcpu->arch.tsc_scaling_ratio = + kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio, + svm->tsc_ratio_msr); + svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio); +} + static int svm_get_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, u32 user_data_size) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 5847b05d29da..1964b9a174be 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2652,11 +2652,11 @@ void sev_es_init_vmcb(struct vcpu_svm *svm) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); } -void sev_es_create_vcpu(struct vcpu_svm *svm) +void sev_es_vcpu_reset(struct vcpu_svm *svm) { /* - * Set the GHCB MSR value as per the GHCB specification when creating - * a vCPU for an SEV-ES guest. + * Set the GHCB MSR value as per the GHCB specification when emulating + * vCPU RESET for an SEV-ES guest. */ set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX, GHCB_VERSION_MIN, diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 226482daa6eb..b36ca4e476c2 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -188,6 +188,13 @@ module_param(vls, int, 0444); static int vgif = true; module_param(vgif, int, 0444); +/* enable/disable LBR virtualization */ +static int lbrv = true; +module_param(lbrv, int, 0444); + +static int tsc_scaling = true; +module_param(tsc_scaling, int, 0444); + /* * enable / disable AVIC. Because the defaults differ for APICv * support between VMX and SVM we cannot use module_param_named. @@ -468,7 +475,7 @@ static int has_svm(void) static void svm_hardware_disable(void) { /* Make sure we clean up behind us */ - if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) + if (tsc_scaling) wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); cpu_svm_disable(); @@ -511,6 +518,10 @@ static int svm_hardware_enable(void) wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area)); if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { + /* + * Set the default value, even if we don't use TSC scaling + * to avoid having stale value in the msr + */ wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); __this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT); } @@ -931,6 +942,9 @@ static __init void svm_set_cpu_caps(void) if (npt_enabled) kvm_cpu_cap_set(X86_FEATURE_NPT); + if (tsc_scaling) + kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR); + /* Nested VM can receive #VMEXIT instead of triggering #GP */ kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); } @@ -978,10 +992,15 @@ static __init int svm_hardware_setup(void) if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) kvm_enable_efer_bits(EFER_FFXSR); - if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { - kvm_has_tsc_control = true; - kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; - kvm_tsc_scaling_ratio_frac_bits = 32; + if (tsc_scaling) { + if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { + tsc_scaling = false; + } else { + pr_info("TSC scaling supported\n"); + kvm_has_tsc_control = true; + kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; + kvm_tsc_scaling_ratio_frac_bits = 32; + } } tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); @@ -1061,6 +1080,13 @@ static __init int svm_hardware_setup(void) pr_info("Virtual GIF supported\n"); } + if (lbrv) { + if (!boot_cpu_has(X86_FEATURE_LBRV)) + lbrv = false; + else + pr_info("LBR virtualization supported\n"); + } + svm_set_cpu_caps(); /* @@ -1111,7 +1137,9 @@ static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu) static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) { - return kvm_default_tsc_scaling_ratio; + struct vcpu_svm *svm = to_svm(vcpu); + + return svm->tsc_ratio_msr; } static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) @@ -1123,7 +1151,7 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } -static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) +void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) { wrmsrl(MSR_AMD64_TSC_RATIO, multiplier); } @@ -1152,6 +1180,38 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, } } +static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (guest_cpuid_is_intel(vcpu)) { + /* + * We must intercept SYSENTER_EIP and SYSENTER_ESP + * accesses because the processor only stores 32 bits. + * For the same reason we cannot use virtual VMLOAD/VMSAVE. + */ + svm_set_intercept(svm, INTERCEPT_VMLOAD); + svm_set_intercept(svm, INTERCEPT_VMSAVE); + svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0); + } else { + /* + * If hardware supports Virtual VMLOAD VMSAVE then enable it + * in VMCB and clear intercepts to avoid #VMEXIT. + */ + if (vls) { + svm_clr_intercept(svm, INTERCEPT_VMLOAD); + svm_clr_intercept(svm, INTERCEPT_VMSAVE); + svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + } + /* No need to intercept these MSRs */ + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1); + } +} + static void init_vmcb(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1298,11 +1358,25 @@ static void init_vmcb(struct kvm_vcpu *vcpu) } svm_hv_init_vmcb(svm->vmcb); + init_vmcb_after_set_cpuid(vcpu); vmcb_mark_all_dirty(svm->vmcb); enable_gif(svm); +} + +static void __svm_vcpu_reset(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + svm_vcpu_init_msrpm(vcpu, svm->msrpm); + + svm_init_osvw(vcpu); + vcpu->arch.microcode_version = 0x01000065; + svm->tsc_ratio_msr = kvm_default_tsc_scaling_ratio; + + if (sev_es_guest(vcpu->kvm)) + sev_es_vcpu_reset(svm); } static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -1313,6 +1387,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) svm->virt_spec_ctrl = 0; init_vmcb(vcpu); + + if (!init_event) + __svm_vcpu_reset(vcpu); } void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb) @@ -1372,24 +1449,13 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) svm->vmcb01.ptr = page_address(vmcb01_page); svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); + svm_switch_vmcb(svm, &svm->vmcb01); if (vmsa_page) svm->vmsa = page_address(vmsa_page); svm->guest_state_loaded = false; - svm_switch_vmcb(svm, &svm->vmcb01); - init_vmcb(vcpu); - - svm_vcpu_init_msrpm(vcpu, svm->msrpm); - - svm_init_osvw(vcpu); - vcpu->arch.microcode_version = 0x01000065; - - if (sev_es_guest(vcpu->kvm)) - /* Perform SEV-ES specific VMCB creation updates */ - sev_es_create_vcpu(svm); - return 0; error_free_vmsa_page: @@ -1449,7 +1515,7 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) vmsave(__sme_page_pa(sd->save_area)); } - if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { + if (tsc_scaling) { u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio; if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) { __this_cpu_write(current_tsc_ratio, tsc_ratio); @@ -2659,6 +2725,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) struct vcpu_svm *svm = to_svm(vcpu); switch (msr_info->index) { + case MSR_AMD64_TSC_RATIO: + if (!msr_info->host_initiated && !svm->tsc_scaling_enabled) + return 1; + msr_info->data = svm->tsc_ratio_msr; + break; case MSR_STAR: msr_info->data = svm->vmcb01.ptr->save.star; break; @@ -2808,6 +2879,19 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) u32 ecx = msr->index; u64 data = msr->data; switch (ecx) { + case MSR_AMD64_TSC_RATIO: + if (!msr->host_initiated && !svm->tsc_scaling_enabled) + return 1; + + if (data & TSC_RATIO_RSVD) + return 1; + + svm->tsc_ratio_msr = data; + + if (svm->tsc_scaling_enabled && is_guest_mode(vcpu)) + nested_svm_update_tsc_ratio_msr(vcpu); + + break; case MSR_IA32_CR_PAT: if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) return 1; @@ -2920,7 +3004,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->tsc_aux = data; break; case MSR_IA32_DEBUGCTLMSR: - if (!boot_cpu_has(X86_FEATURE_LBRV)) { + if (!lbrv) { vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", __func__, data); break; @@ -3280,11 +3364,13 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) return svm_exit_handlers[exit_code](vcpu); } -static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2, +static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, + u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) { struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; + *reason = control->exit_code; *info1 = control->exit_info_1; *info2 = control->exit_info_2; *intr_info = control->exit_int_info; @@ -3301,7 +3387,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) struct kvm_run *kvm_run = vcpu->run; u32 exit_code = svm->vmcb->control.exit_code; - trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); + trace_kvm_exit(vcpu, KVM_ISA_SVM); /* SEV-ES guests must use the CR write traps to track CR registers. */ if (!sev_es_guest(vcpu->kvm)) { @@ -3314,7 +3400,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (is_guest_mode(vcpu)) { int vmexit; - trace_kvm_nested_vmexit(exit_code, vcpu, KVM_ISA_SVM); + trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM); vmexit = nested_svm_exit_special(svm); @@ -3782,8 +3868,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) pre_svm_run(vcpu); - WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu)); - sync_lapic_to_cr8(vcpu); if (unlikely(svm->asid != svm->vmcb->control.asid)) { @@ -4003,6 +4087,8 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) && guest_cpuid_has(vcpu, X86_FEATURE_NRIPS); + svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR); + svm_recalc_instruction_intercepts(vcpu, svm); /* For sev guests, the memory encryption bit is not reserved in CR3. */ @@ -4029,33 +4115,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_NESTED); } - - if (guest_cpuid_is_intel(vcpu)) { - /* - * We must intercept SYSENTER_EIP and SYSENTER_ESP - * accesses because the processor only stores 32 bits. - * For the same reason we cannot use virtual VMLOAD/VMSAVE. - */ - svm_set_intercept(svm, INTERCEPT_VMLOAD); - svm_set_intercept(svm, INTERCEPT_VMSAVE); - svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; - - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0); - } else { - /* - * If hardware supports Virtual VMLOAD VMSAVE then enable it - * in VMCB and clear intercepts to avoid #VMEXIT. - */ - if (vls) { - svm_clr_intercept(svm, INTERCEPT_VMLOAD); - svm_clr_intercept(svm, INTERCEPT_VMSAVE); - svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; - } - /* No need to intercept these MSRs */ - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1); - } + init_vmcb_after_set_cpuid(vcpu); } static bool svm_has_wbinvd_exit(void) @@ -4522,6 +4582,8 @@ static int svm_vm_init(struct kvm *kvm) } static struct kvm_x86_ops svm_x86_ops __initdata = { + .name = "kvm_amd", + .hardware_unsetup = svm_hardware_teardown, .hardware_enable = svm_hardware_enable, .hardware_disable = svm_hardware_disable, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index e63ac08115cf..5e9510d4574e 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -140,6 +140,8 @@ struct vcpu_svm { u64 next_rip; u64 spec_ctrl; + + u64 tsc_ratio_msr; /* * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be * translated into the appropriate L2_CFG bits on the host to @@ -160,7 +162,8 @@ struct vcpu_svm { unsigned long int3_rip; /* cached guest cpuid flags for faster access */ - bool nrips_enabled : 1; + bool nrips_enabled : 1; + bool tsc_scaling_enabled : 1; u32 ldr_reg; u32 dfr_reg; @@ -483,6 +486,8 @@ int nested_svm_check_permissions(struct kvm_vcpu *vcpu); int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); int nested_svm_exit_special(struct vcpu_svm *svm); +void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu); +void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier); void nested_load_control_from_vmcb12(struct vcpu_svm *svm, struct vmcb_control_area *control); void nested_sync_control_from_vmcb02(struct vcpu_svm *svm); @@ -562,7 +567,7 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu); int sev_handle_vmgexit(struct kvm_vcpu *vcpu); int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); void sev_es_init_vmcb(struct vcpu_svm *svm); -void sev_es_create_vcpu(struct vcpu_svm *svm); +void sev_es_vcpu_reset(struct vcpu_svm *svm); void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu); void sev_es_unmap_ghcb(struct vcpu_svm *svm); diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 03ebe368333e..953b0fcb21ee 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -288,8 +288,8 @@ TRACE_EVENT(kvm_apic, #define TRACE_EVENT_KVM_EXIT(name) \ TRACE_EVENT(name, \ - TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa), \ - TP_ARGS(exit_reason, vcpu, isa), \ + TP_PROTO(struct kvm_vcpu *vcpu, u32 isa), \ + TP_ARGS(vcpu, isa), \ \ TP_STRUCT__entry( \ __field( unsigned int, exit_reason ) \ @@ -303,11 +303,12 @@ TRACE_EVENT(name, \ ), \ \ TP_fast_assign( \ - __entry->exit_reason = exit_reason; \ __entry->guest_rip = kvm_rip_read(vcpu); \ __entry->isa = isa; \ __entry->vcpu_id = vcpu->vcpu_id; \ - static_call(kvm_x86_get_exit_info)(vcpu, &__entry->info1, \ + static_call(kvm_x86_get_exit_info)(vcpu, \ + &__entry->exit_reason, \ + &__entry->info1, \ &__entry->info2, \ &__entry->intr_info, \ &__entry->error_code); \ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index eedcebf58004..b4ee5e9f9e20 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -191,7 +191,7 @@ static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error) * failValid writes the error number to the current VMCS, which * can't be done if there isn't a current VMCS. */ - if (vmx->nested.current_vmptr == -1ull && + if (vmx->nested.current_vmptr == INVALID_GPA && !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) return nested_vmx_failInvalid(vcpu); @@ -218,7 +218,7 @@ static inline u64 vmx_control_msr(u32 low, u32 high) static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) { secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); - vmcs_write64(VMCS_LINK_POINTER, -1ull); + vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); vmx->nested.need_vmcs12_to_shadow_sync = false; } @@ -290,9 +290,10 @@ static void free_nested(struct kvm_vcpu *vcpu) vmx->nested.vmxon = false; vmx->nested.smm.vmxon = false; + vmx->nested.vmxon_ptr = INVALID_GPA; free_vpid(vmx->nested.vpid02); vmx->nested.posted_intr_nv = -1; - vmx->nested.current_vmptr = -1ull; + vmx->nested.current_vmptr = INVALID_GPA; if (enable_shadow_vmcs) { vmx_disable_shadow_vmcs(vmx); vmcs_clear(vmx->vmcs01.shadow_vmcs); @@ -709,7 +710,7 @@ static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *shadow; if (!nested_cpu_has_shadow_vmcs(vmcs12) || - vmcs12->vmcs_link_pointer == -1ull) + vmcs12->vmcs_link_pointer == INVALID_GPA) return; shadow = get_shadow_vmcs12(vcpu); @@ -727,7 +728,7 @@ static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx = to_vmx(vcpu); if (!nested_cpu_has_shadow_vmcs(vmcs12) || - vmcs12->vmcs_link_pointer == -1ull) + vmcs12->vmcs_link_pointer == INVALID_GPA) return; kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer, @@ -1994,7 +1995,7 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( } if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { - vmx->nested.current_vmptr = -1ull; + vmx->nested.current_vmptr = INVALID_GPA; nested_release_evmcs(vcpu); @@ -2178,7 +2179,7 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) } if (cpu_has_vmx_encls_vmexit()) - vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); + vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); /* * Set the MSR load/store lists to match L0's settings. Only the @@ -2197,7 +2198,7 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, { prepare_vmcs02_constant_state(vmx); - vmcs_write64(VMCS_LINK_POINTER, -1ull); + vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); if (enable_vpid) { if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) @@ -2949,7 +2950,7 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, struct vmcs12 *shadow; struct kvm_host_map map; - if (vmcs12->vmcs_link_pointer == -1ull) + if (vmcs12->vmcs_link_pointer == INVALID_GPA) return 0; if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) @@ -3216,7 +3217,7 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to * force VM-Entry to fail. */ - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); } } @@ -3527,7 +3528,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) } if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) && - vmx->nested.current_vmptr == -1ull)) + vmx->nested.current_vmptr == INVALID_GPA)) return nested_vmx_failInvalid(vcpu); vmcs12 = get_vmcs12(vcpu); @@ -4975,7 +4976,7 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (vmx->nested.current_vmptr == -1ull) + if (vmx->nested.current_vmptr == INVALID_GPA) return; copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); @@ -4995,7 +4996,7 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); - vmx->nested.current_vmptr = -1ull; + vmx->nested.current_vmptr = INVALID_GPA; } /* Emulate the VMXOFF instruction */ @@ -5090,12 +5091,12 @@ static int handle_vmread(struct kvm_vcpu *vcpu) return 1; /* - * In VMX non-root operation, when the VMCS-link pointer is -1ull, + * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, * any VMREAD sets the ALU flags for VMfailInvalid. */ - if (vmx->nested.current_vmptr == -1ull || + if (vmx->nested.current_vmptr == INVALID_GPA || (is_guest_mode(vcpu) && - get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)) + get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) return nested_vmx_failInvalid(vcpu); /* Decode instruction info and find the field to read */ @@ -5182,12 +5183,12 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return 1; /* - * In VMX non-root operation, when the VMCS-link pointer is -1ull, + * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, * any VMWRITE sets the ALU flags for VMfailInvalid. */ - if (vmx->nested.current_vmptr == -1ull || + if (vmx->nested.current_vmptr == INVALID_GPA || (is_guest_mode(vcpu) && - get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)) + get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) return nested_vmx_failInvalid(vcpu); if (instr_info & BIT(10)) @@ -5630,7 +5631,7 @@ bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, gpa_t bitmap, last_bitmap; u8 b; - last_bitmap = (gpa_t)-1; + last_bitmap = INVALID_GPA; b = -1; while (size > 0) { @@ -6065,7 +6066,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) goto reflect_vmexit; } - trace_kvm_nested_vmexit(exit_reason.full, vcpu, KVM_ISA_VMX); + trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX); /* If L0 (KVM) wants the exit, it trumps L1's desires. */ if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) @@ -6106,8 +6107,8 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, .format = KVM_STATE_NESTED_FORMAT_VMX, .size = sizeof(kvm_state), .hdr.vmx.flags = 0, - .hdr.vmx.vmxon_pa = -1ull, - .hdr.vmx.vmcs12_pa = -1ull, + .hdr.vmx.vmxon_pa = INVALID_GPA, + .hdr.vmx.vmcs12_pa = INVALID_GPA, .hdr.vmx.preemption_timer_deadline = 0, }; struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = @@ -6133,7 +6134,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, if (is_guest_mode(vcpu) && nested_cpu_has_shadow_vmcs(vmcs12) && - vmcs12->vmcs_link_pointer != -1ull) + vmcs12->vmcs_link_pointer != INVALID_GPA) kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); } @@ -6209,7 +6210,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, return -EFAULT; if (nested_cpu_has_shadow_vmcs(vmcs12) && - vmcs12->vmcs_link_pointer != -1ull) { + vmcs12->vmcs_link_pointer != INVALID_GPA) { if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, get_shadow_vmcs12(vcpu), VMCS12_SIZE)) return -EFAULT; @@ -6244,11 +6245,11 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) return -EINVAL; - if (kvm_state->hdr.vmx.vmxon_pa == -1ull) { + if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) { if (kvm_state->hdr.vmx.smm.flags) return -EINVAL; - if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) + if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) return -EINVAL; /* @@ -6302,7 +6303,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, vmx_leave_nested(vcpu); - if (kvm_state->hdr.vmx.vmxon_pa == -1ull) + if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) return 0; vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; @@ -6315,13 +6316,13 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, /* See vmx_has_valid_vmcs12. */ if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || - (kvm_state->hdr.vmx.vmcs12_pa != -1ull)) + (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)) return -EINVAL; else return 0; } - if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) { + if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) { if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) return -EINVAL; @@ -6366,7 +6367,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, ret = -EINVAL; if (nested_cpu_has_shadow_vmcs(vmcs12) && - vmcs12->vmcs_link_pointer != -1ull) { + vmcs12->vmcs_link_pointer != INVALID_GPA) { struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); if (kvm_state->size < diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 10cc4f65c4ef..b8e0d21b7c8a 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -365,7 +365,7 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = pmu->global_ctrl; return 0; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - msr_info->data = pmu->global_ovf_ctrl; + msr_info->data = 0; return 0; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || @@ -423,7 +423,6 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & pmu->global_ovf_ctrl_mask)) { if (!msr_info->host_initiated) pmu->global_status &= ~data; - pmu->global_ovf_ctrl = data; return 0; } break; @@ -588,8 +587,7 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu) pmc->counter = 0; } - pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = - pmu->global_ovf_ctrl = 0; + pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0; intel_pmu_release_guest_lbr_event(vcpu); } diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 6693ebdc0770..35e7ec91ae86 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -53,11 +53,9 @@ static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset, static void sgx_handle_emulation_failure(struct kvm_vcpu *vcpu, u64 addr, unsigned int size) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 2; - vcpu->run->internal.data[0] = addr; - vcpu->run->internal.data[1] = size; + uint64_t data[2] = { addr, size }; + + __kvm_prepare_emulation_failure_exit(vcpu, data, ARRAY_SIZE(data)); } static int sgx_read_hva(struct kvm_vcpu *vcpu, unsigned long hva, void *data, @@ -112,9 +110,7 @@ static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr) * but the error code isn't (yet) plumbed through the ENCLS helpers. */ if (trapnr == PF_VECTOR && !boot_cpu_has(X86_FEATURE_SGX2)) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; + kvm_prepare_emulation_failure_exit(vcpu); return 0; } @@ -155,9 +151,7 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu, sgx_12_0 = kvm_find_cpuid_entry(vcpu, 0x12, 0); sgx_12_1 = kvm_find_cpuid_entry(vcpu, 0x12, 1); if (!sgx_12_0 || !sgx_12_1) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; + kvm_prepare_emulation_failure_exit(vcpu); return 0; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index fb9e4ac3df22..76861b66bbcf 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1059,8 +1059,8 @@ static void pt_guest_enter(struct vcpu_vmx *vmx) rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { wrmsrl(MSR_IA32_RTIT_CTL, 0); - pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range); - pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range); + pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); + pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); } } @@ -1070,12 +1070,16 @@ static void pt_guest_exit(struct vcpu_vmx *vmx) return; if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { - pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range); - pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range); + pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); + pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); } - /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */ - wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); + /* + * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest, + * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary. + */ + if (vmx->pt_desc.host.ctl) + wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); } void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, @@ -1456,16 +1460,16 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) * cause a #GP fault. */ value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET; - if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2)) + if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2)) return 1; value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET; - if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2)) + if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2)) return 1; value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET; - if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2)) + if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2)) return 1; value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET; - if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2)) + if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2)) return 1; return 0; @@ -1886,8 +1890,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; if (!vmx_pt_mode_is_host_guest() || - (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, - PT_CAP_num_address_ranges))) + (index >= 2 * vmx->pt_desc.num_address_ranges)) return 1; if (index % 2) msr_info->data = vmx->pt_desc.guest.addr_b[index / 2]; @@ -2202,8 +2205,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!pt_can_write_msr(vmx)) return 1; index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; - if (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, - PT_CAP_num_address_ranges)) + if (index >= 2 * vmx->pt_desc.num_address_ranges) return 1; if (is_noncanonical_address(data, vcpu)) return 1; @@ -3879,7 +3881,7 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag); vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag); vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag); - for (i = 0; i < vmx->pt_desc.addr_range; i++) { + for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) { vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); } @@ -4328,10 +4330,6 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) #define VMX_XSS_EXIT_BITMAP 0 -/* - * Noting that the initialization of Guest-state Area of VMCS is in - * vmx_vcpu_reset(). - */ static void init_vmcs(struct vcpu_vmx *vmx) { if (nested) @@ -4340,7 +4338,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) if (cpu_has_vmx_msr_bitmap()) vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); - vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ + vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */ /* Control */ pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); @@ -4436,10 +4434,40 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmx_setup_uret_msrs(vmx); } +static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + init_vmcs(vmx); + + if (nested) + memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); + + vcpu_setup_sgx_lepubkeyhash(vcpu); + + vmx->nested.posted_intr_nv = -1; + vmx->nested.vmxon_ptr = INVALID_GPA; + vmx->nested.current_vmptr = INVALID_GPA; + vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; + + vcpu->arch.microcode_version = 0x100000000ULL; + vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; + + /* + * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR + * or POSTED_INTR_WAKEUP_VECTOR. + */ + vmx->pi_desc.nv = POSTED_INTR_VECTOR; + vmx->pi_desc.sn = 1; +} + static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (!init_event) + __vmx_vcpu_reset(vcpu); + vmx->rmode.vm86_active = 0; vmx->spec_ctrl = 0; @@ -4449,6 +4477,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_set_cr8(vcpu, 0); vmx_segment_cache_clear(vmx); + kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS); seg_setup(VCPU_SREG_CS); vmcs_write16(GUEST_CS_SELECTOR, 0xf000); @@ -5379,10 +5408,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (vmx->emulation_required && !vmx->rmode.vm86_active && vcpu->arch.exception.pending) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = - KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; + kvm_prepare_emulation_failure_exit(vcpu); return 0; } @@ -5633,11 +5659,13 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { static const int kvm_vmx_max_exit_handlers = ARRAY_SIZE(kvm_vmx_exit_handlers); -static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2, +static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, + u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); + *reason = vmx->exit_reason.full; *info1 = vmx_get_exit_qual(vcpu); if (!(vmx->exit_reason.failed_vmentry)) { *info2 = vmx->idt_vectoring_info; @@ -6406,6 +6434,7 @@ static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: return nested; case MSR_AMD64_VIRT_SPEC_CTRL: + case MSR_AMD64_TSC_RATIO: /* This is AMD only. */ return false; default: @@ -6782,7 +6811,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) if (likely(!vmx->exit_reason.failed_vmentry)) vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - trace_kvm_exit(vmx->exit_reason.full, vcpu, KVM_ISA_VMX); + trace_kvm_exit(vcpu, KVM_ISA_VMX); if (unlikely(vmx->exit_reason.failed_vmentry)) return EXIT_FASTPATH_NONE; @@ -6813,7 +6842,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) { struct vmx_uret_msr *tsx_ctrl; struct vcpu_vmx *vmx; - int i, cpu, err; + int i, err; BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); vmx = to_vmx(vcpu); @@ -6834,10 +6863,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) goto free_vpid; } - for (i = 0; i < kvm_nr_uret_msrs; ++i) { - vmx->guest_uret_msrs[i].data = 0; + for (i = 0; i < kvm_nr_uret_msrs; ++i) vmx->guest_uret_msrs[i].mask = -1ull; - } if (boot_cpu_has(X86_FEATURE_RTM)) { /* * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception. @@ -6874,12 +6901,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) } vmx->loaded_vmcs = &vmx->vmcs01; - cpu = get_cpu(); - vmx_vcpu_load(vcpu, cpu); - vcpu->cpu = cpu; - init_vmcs(vmx); - vmx_vcpu_put(vcpu); - put_cpu(); + if (cpu_need_virtualize_apic_accesses(vcpu)) { err = alloc_apic_access_page(vcpu->kvm); if (err) @@ -6892,27 +6914,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) goto free_vmcs; } - if (nested) - memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); - else - memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs)); - - vcpu_setup_sgx_lepubkeyhash(vcpu); - - vmx->nested.posted_intr_nv = -1; - vmx->nested.current_vmptr = -1ull; - vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; - - vcpu->arch.microcode_version = 0x100000000ULL; - vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; - - /* - * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR - * or POSTED_INTR_WAKEUP_VECTOR. - */ - vmx->pi_desc.nv = POSTED_INTR_VECTOR; - vmx->pi_desc.sn = 1; - return 0; free_vmcs: @@ -7127,12 +7128,13 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) } /* Get the number of configurable Address Ranges for filtering */ - vmx->pt_desc.addr_range = intel_pt_validate_cap(vmx->pt_desc.caps, + vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_num_address_ranges); /* Initialize and clear the no dependency bits */ vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS | - RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC); + RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC | + RTIT_CTL_BRANCH_EN); /* * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise @@ -7150,12 +7152,11 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ); /* - * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn BranchEn and - * MTCFreq can be set + * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set */ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc)) vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN | - RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE); + RTIT_CTL_MTC_RANGE); /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite)) @@ -7175,7 +7176,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN; /* unmask address range configure area */ - for (i = 0; i < vmx->pt_desc.addr_range; i++) + for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4)); } @@ -7551,6 +7552,8 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu) static void hardware_unsetup(void) { + kvm_set_posted_intr_wakeup_handler(NULL); + if (nested) nested_vmx_hardware_unsetup(); @@ -7566,6 +7569,8 @@ static bool vmx_check_apicv_inhibit_reasons(ulong bit) } static struct kvm_x86_ops vmx_x86_ops __initdata = { + .name = "kvm_intel", + .hardware_unsetup = hardware_unsetup, .hardware_enable = hardware_enable, @@ -7879,8 +7884,6 @@ static __init int hardware_setup(void) vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit; } - kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); - kvm_mce_cap_supported |= MCG_LMCE_P; if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) @@ -7904,6 +7907,9 @@ static __init int hardware_setup(void) r = alloc_kvm_area(); if (r) nested_vmx_hardware_unsetup(); + + kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); + return r; } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 592217fd7d92..e7db42e3b0ce 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -62,7 +62,7 @@ struct pt_ctx { struct pt_desc { u64 ctl_bitmask; - u32 addr_range; + u32 num_address_ranges; u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; struct pt_ctx host; struct pt_ctx guest; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2686f2edb47c..c1c4e2b05a63 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -790,30 +790,6 @@ bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) } EXPORT_SYMBOL_GPL(kvm_require_dr); -/* - * This function will be used to read from the physical memory of the currently - * running guest. The difference to kvm_vcpu_read_guest_page is that this function - * can read from guest physical or from the guest's guest physical memory. - */ -int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gfn_t ngfn, void *data, int offset, int len, - u32 access) -{ - struct x86_exception exception; - gfn_t real_gfn; - gpa_t ngpa; - - ngpa = gfn_to_gpa(ngfn); - real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception); - if (real_gfn == UNMAPPED_GVA) - return -EFAULT; - - real_gfn = gpa_to_gfn(real_gfn); - - return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len); -} -EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); - static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) { return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2); @@ -825,34 +801,38 @@ static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) { gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; - unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; + gpa_t real_gpa; int i; int ret; u64 pdpte[ARRAY_SIZE(mmu->pdptrs)]; - ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte, - offset * sizeof(u64), sizeof(pdpte), - PFERR_USER_MASK|PFERR_WRITE_MASK); - if (ret < 0) { - ret = 0; - goto out; - } + /* + * If the MMU is nested, CR3 holds an L2 GPA and needs to be translated + * to an L1 GPA. + */ + real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(pdpt_gfn), + PFERR_USER_MASK | PFERR_WRITE_MASK, NULL); + if (real_gpa == UNMAPPED_GVA) + return 0; + + /* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */ + ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(real_gpa), pdpte, + cr3 & GENMASK(11, 5), sizeof(pdpte)); + if (ret < 0) + return 0; + for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { if ((pdpte[i] & PT_PRESENT_MASK) && (pdpte[i] & pdptr_rsvd_bits(vcpu))) { - ret = 0; - goto out; + return 0; } } - ret = 1; memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); vcpu->arch.pdptrs_from_userspace = false; -out: - - return ret; + return 1; } EXPORT_SYMBOL_GPL(load_pdptrs); @@ -993,7 +973,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) /* * Do not allow the guest to set bits that we do not support * saving. However, xcr0 bit 0 is always set, even if the - * emulated CPU does not support XSAVE (see fx_init). + * emulated CPU does not support XSAVE (see kvm_vcpu_reset()). */ valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP; if (xcr0 & ~valid_bits) @@ -1042,9 +1022,28 @@ EXPORT_SYMBOL_GPL(kvm_is_valid_cr4); void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4) { - if (((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS) || - (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) + /* + * If any role bit is changed, the MMU needs to be reset. + * + * If CR4.PCIDE is changed 1 -> 0, the guest TLB must be flushed. + * If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB + * according to the SDM; however, stale prev_roots could be reused + * incorrectly in the future after a MOV to CR3 with NOFLUSH=1, so we + * free them all. KVM_REQ_MMU_RELOAD is fit for the both cases; it + * is slow, but changing CR4.PCIDE is a rare case. + * + * If CR4.PGE is changed, the guest TLB must be flushed. + * + * Note: resetting MMU is a superset of KVM_REQ_MMU_RELOAD and + * KVM_REQ_MMU_RELOAD is a superset of KVM_REQ_TLB_FLUSH_GUEST, hence + * the usage of "else if". + */ + if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS) kvm_mmu_reset_context(vcpu); + else if ((cr4 ^ old_cr4) & X86_CR4_PCIDE) + kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); + else if ((cr4 ^ old_cr4) & X86_CR4_PGE) + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); } EXPORT_SYMBOL_GPL(kvm_post_set_cr4); @@ -1092,6 +1091,18 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid) int i; /* + * MOV CR3 and INVPCID are usually not intercepted when using TDP, but + * this is reachable when running EPT=1 and unrestricted_guest=0, and + * also via the emulator. KVM's TDP page tables are not in the scope of + * the invalidation, but the guest's TLB entries need to be flushed as + * the CPU may have cached entries in its TLB for the target PCID. + */ + if (unlikely(tdp_enabled)) { + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + return; + } + + /* * If neither the current CR3 nor any of the prev_roots use the given * PCID, then nothing needs to be done here because a resync will * happen anyway before switching to any other CR3. @@ -1101,6 +1112,14 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid) kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } + /* + * If PCID is disabled, there is no need to free prev_roots even if the + * PCIDs for them are also 0, because MOV to CR3 always flushes the TLB + * with PCIDE=0. + */ + if (!kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) + return; + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); @@ -1381,6 +1400,7 @@ static const u32 emulated_msrs_all[] = { MSR_PLATFORM_INFO, MSR_MISC_FEATURES_ENABLES, MSR_AMD64_VIRT_SPEC_CTRL, + MSR_AMD64_TSC_RATIO, MSR_IA32_POWER_CTL, MSR_IA32_UCODE_REV, @@ -2454,13 +2474,64 @@ static inline bool kvm_check_tsc_unstable(void) return check_tsc_unstable(); } +/* + * Infers attempts to synchronize the guest's tsc from host writes. Sets the + * offset for the vcpu and tracks the TSC matching generation that the vcpu + * participates in. + */ +static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc, + u64 ns, bool matched) +{ + struct kvm *kvm = vcpu->kvm; + + lockdep_assert_held(&kvm->arch.tsc_write_lock); + + /* + * We also track th most recent recorded KHZ, write and time to + * allow the matching interval to be extended at each write. + */ + kvm->arch.last_tsc_nsec = ns; + kvm->arch.last_tsc_write = tsc; + kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; + kvm->arch.last_tsc_offset = offset; + + vcpu->arch.last_guest_tsc = tsc; + + kvm_vcpu_write_tsc_offset(vcpu, offset); + + if (!matched) { + /* + * We split periods of matched TSC writes into generations. + * For each generation, we track the original measured + * nanosecond time, offset, and write, so if TSCs are in + * sync, we can match exact offset, and if not, we can match + * exact software computation in compute_guest_tsc() + * + * These values are tracked in kvm->arch.cur_xxx variables. + */ + kvm->arch.cur_tsc_generation++; + kvm->arch.cur_tsc_nsec = ns; + kvm->arch.cur_tsc_write = tsc; + kvm->arch.cur_tsc_offset = offset; + kvm->arch.nr_vcpus_matched_tsc = 0; + } else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) { + kvm->arch.nr_vcpus_matched_tsc++; + } + + /* Keep track of which generation this VCPU has synchronized to */ + vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; + vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; + vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; + + kvm_track_tsc_matching(vcpu); +} + static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) { struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; - bool matched; - bool already_matched; + bool matched = false; bool synchronizing = false; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); @@ -2506,51 +2577,10 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) offset = kvm_compute_l1_tsc_offset(vcpu, data); } matched = true; - already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation); - } else { - /* - * We split periods of matched TSC writes into generations. - * For each generation, we track the original measured - * nanosecond time, offset, and write, so if TSCs are in - * sync, we can match exact offset, and if not, we can match - * exact software computation in compute_guest_tsc() - * - * These values are tracked in kvm->arch.cur_xxx variables. - */ - kvm->arch.cur_tsc_generation++; - kvm->arch.cur_tsc_nsec = ns; - kvm->arch.cur_tsc_write = data; - kvm->arch.cur_tsc_offset = offset; - matched = false; } - /* - * We also track th most recent recorded KHZ, write and time to - * allow the matching interval to be extended at each write. - */ - kvm->arch.last_tsc_nsec = ns; - kvm->arch.last_tsc_write = data; - kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; - - vcpu->arch.last_guest_tsc = data; - - /* Keep track of which generation this VCPU has synchronized to */ - vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; - vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; - vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; - - kvm_vcpu_write_tsc_offset(vcpu, offset); + __kvm_synchronize_tsc(vcpu, offset, data, ns, matched); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); - - raw_spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags); - if (!matched) { - kvm->arch.nr_vcpus_matched_tsc = 0; - } else if (!already_matched) { - kvm->arch.nr_vcpus_matched_tsc++; - } - - kvm_track_tsc_matching(vcpu); - raw_spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags); } static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, @@ -2738,6 +2768,7 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) int vclock_mode; bool host_tsc_clocksource, vcpus_matched; + lockdep_assert_held(&kvm->arch.tsc_write_lock); vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == atomic_read(&kvm->online_vcpus)); @@ -2762,68 +2793,101 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) #endif } -void kvm_make_mclock_inprogress_request(struct kvm *kvm) +static void kvm_make_mclock_inprogress_request(struct kvm *kvm) { kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); } -static void kvm_gen_update_masterclock(struct kvm *kvm) +static void __kvm_start_pvclock_update(struct kvm *kvm) { -#ifdef CONFIG_X86_64 - int i; - struct kvm_vcpu *vcpu; - struct kvm_arch *ka = &kvm->arch; - unsigned long flags; - - kvm_hv_invalidate_tsc_page(kvm); + raw_spin_lock_irq(&kvm->arch.tsc_write_lock); + write_seqcount_begin(&kvm->arch.pvclock_sc); +} +static void kvm_start_pvclock_update(struct kvm *kvm) +{ kvm_make_mclock_inprogress_request(kvm); /* no guest entries from this point */ - raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); - pvclock_update_vm_gtod_copy(kvm); - raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); + __kvm_start_pvclock_update(kvm); +} + +static void kvm_end_pvclock_update(struct kvm *kvm) +{ + struct kvm_arch *ka = &kvm->arch; + struct kvm_vcpu *vcpu; + int i; + write_seqcount_end(&ka->pvclock_sc); + raw_spin_unlock_irq(&ka->tsc_write_lock); kvm_for_each_vcpu(i, vcpu, kvm) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); /* guest entries allowed */ kvm_for_each_vcpu(i, vcpu, kvm) kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); -#endif } -u64 get_kvmclock_ns(struct kvm *kvm) +static void kvm_update_masterclock(struct kvm *kvm) +{ + kvm_hv_invalidate_tsc_page(kvm); + kvm_start_pvclock_update(kvm); + pvclock_update_vm_gtod_copy(kvm); + kvm_end_pvclock_update(kvm); +} + +/* Called within read_seqcount_begin/retry for kvm->pvclock_sc. */ +static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) { struct kvm_arch *ka = &kvm->arch; struct pvclock_vcpu_time_info hv_clock; - unsigned long flags; - u64 ret; - - raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); - if (!ka->use_master_clock) { - raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); - return get_kvmclock_base_ns() + ka->kvmclock_offset; - } - - hv_clock.tsc_timestamp = ka->master_cycle_now; - hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; - raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); /* both __this_cpu_read() and rdtsc() should be on the same cpu */ get_cpu(); - if (__this_cpu_read(cpu_tsc_khz)) { + data->flags = 0; + if (ka->use_master_clock && __this_cpu_read(cpu_tsc_khz)) { +#ifdef CONFIG_X86_64 + struct timespec64 ts; + + if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) { + data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec; + data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC; + } else +#endif + data->host_tsc = rdtsc(); + + data->flags |= KVM_CLOCK_TSC_STABLE; + hv_clock.tsc_timestamp = ka->master_cycle_now; + hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, &hv_clock.tsc_shift, &hv_clock.tsc_to_system_mul); - ret = __pvclock_read_cycles(&hv_clock, rdtsc()); - } else - ret = get_kvmclock_base_ns() + ka->kvmclock_offset; + data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc); + } else { + data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset; + } put_cpu(); +} - return ret; +static void get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) +{ + struct kvm_arch *ka = &kvm->arch; + unsigned seq; + + do { + seq = read_seqcount_begin(&ka->pvclock_sc); + __get_kvmclock(kvm, data); + } while (read_seqcount_retry(&ka->pvclock_sc, seq)); +} + +u64 get_kvmclock_ns(struct kvm *kvm) +{ + struct kvm_clock_data data; + + get_kvmclock(kvm, &data); + return data.clock; } static void kvm_setup_pvclock_page(struct kvm_vcpu *v, @@ -2888,6 +2952,7 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v, static int kvm_guest_time_update(struct kvm_vcpu *v) { unsigned long flags, tgt_tsc_khz; + unsigned seq; struct kvm_vcpu_arch *vcpu = &v->arch; struct kvm_arch *ka = &v->kvm->arch; s64 kernel_ns; @@ -2902,13 +2967,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) * If the host uses TSC clock, then passthrough TSC as stable * to the guest. */ - raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); - use_master_clock = ka->use_master_clock; - if (use_master_clock) { - host_tsc = ka->master_cycle_now; - kernel_ns = ka->master_kernel_ns; - } - raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); + do { + seq = read_seqcount_begin(&ka->pvclock_sc); + use_master_clock = ka->use_master_clock; + if (use_master_clock) { + host_tsc = ka->master_cycle_now; + kernel_ns = ka->master_kernel_ns; + } + } while (read_seqcount_retry(&ka->pvclock_sc, seq)); /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); @@ -3179,15 +3245,14 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) ++vcpu->stat.tlb_flush; if (!tdp_enabled) { - /* + /* * A TLB flush on behalf of the guest is equivalent to * INVPCID(all), toggling CR4.PGE, etc., which requires - * a forced sync of the shadow page tables. Unload the - * entire MMU here and the subsequent load will sync the - * shadow page tables, and also flush the TLB. + * a forced sync of the shadow page tables. Ensure all the + * roots are synced and the guest TLB in hardware is clean. */ - kvm_mmu_unload(vcpu); - return; + kvm_mmu_sync_roots(vcpu); + kvm_mmu_sync_prev_roots(vcpu); } static_call(kvm_x86_tlb_flush_guest)(vcpu); @@ -4028,6 +4093,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM: case KVM_CAP_SREGS2: case KVM_CAP_EXIT_ON_EMULATION_FAILURE: + case KVM_CAP_VCPU_ATTRIBUTES: r = 1; break; case KVM_CAP_EXIT_HYPERCALL: @@ -4048,7 +4114,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_SYNC_X86_VALID_FIELDS; break; case KVM_CAP_ADJUST_CLOCK: - r = KVM_CLOCK_TSC_STABLE; + r = KVM_CLOCK_VALID_FLAGS; break; case KVM_CAP_X86_DISABLE_EXITS: r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE | @@ -4077,7 +4143,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_MAX_VCPUS; break; case KVM_CAP_MAX_VCPU_ID: - r = KVM_MAX_VCPU_ID; + r = KVM_MAX_VCPU_IDS; break; case KVM_CAP_PV_MMU: /* obsolete */ r = 0; @@ -4775,6 +4841,115 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) return 0; } +static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + int r; + + switch (attr->attr) { + case KVM_VCPU_TSC_OFFSET: + r = 0; + break; + default: + r = -ENXIO; + } + + return r; +} + +static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr; + int r; + + if ((u64)(unsigned long)uaddr != attr->addr) + return -EFAULT; + + switch (attr->attr) { + case KVM_VCPU_TSC_OFFSET: + r = -EFAULT; + if (put_user(vcpu->arch.l1_tsc_offset, uaddr)) + break; + r = 0; + break; + default: + r = -ENXIO; + } + + return r; +} + +static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr; + struct kvm *kvm = vcpu->kvm; + int r; + + if ((u64)(unsigned long)uaddr != attr->addr) + return -EFAULT; + + switch (attr->attr) { + case KVM_VCPU_TSC_OFFSET: { + u64 offset, tsc, ns; + unsigned long flags; + bool matched; + + r = -EFAULT; + if (get_user(offset, uaddr)) + break; + + raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); + + matched = (vcpu->arch.virtual_tsc_khz && + kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz && + kvm->arch.last_tsc_offset == offset); + + tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset; + ns = get_kvmclock_base_ns(); + + __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched); + raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); + + r = 0; + break; + } + default: + r = -ENXIO; + } + + return r; +} + +static int kvm_vcpu_ioctl_device_attr(struct kvm_vcpu *vcpu, + unsigned int ioctl, + void __user *argp) +{ + struct kvm_device_attr attr; + int r; + + if (copy_from_user(&attr, argp, sizeof(attr))) + return -EFAULT; + + if (attr.group != KVM_VCPU_TSC_CTRL) + return -ENXIO; + + switch (ioctl) { + case KVM_HAS_DEVICE_ATTR: + r = kvm_arch_tsc_has_attr(vcpu, &attr); + break; + case KVM_GET_DEVICE_ATTR: + r = kvm_arch_tsc_get_attr(vcpu, &attr); + break; + case KVM_SET_DEVICE_ATTR: + r = kvm_arch_tsc_set_attr(vcpu, &attr); + break; + } + + return r; +} + static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, struct kvm_enable_cap *cap) { @@ -5229,6 +5404,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = __set_sregs2(vcpu, u.sregs2); break; } + case KVM_HAS_DEVICE_ATTR: + case KVM_GET_DEVICE_ATTR: + case KVM_SET_DEVICE_ATTR: + r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp); + break; default: r = -EINVAL; } @@ -5712,6 +5892,63 @@ int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state) } #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */ +static int kvm_vm_ioctl_get_clock(struct kvm *kvm, void __user *argp) +{ + struct kvm_clock_data data = { 0 }; + + get_kvmclock(kvm, &data); + if (copy_to_user(argp, &data, sizeof(data))) + return -EFAULT; + + return 0; +} + +static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp) +{ + struct kvm_arch *ka = &kvm->arch; + struct kvm_clock_data data; + u64 now_raw_ns; + + if (copy_from_user(&data, argp, sizeof(data))) + return -EFAULT; + + /* + * Only KVM_CLOCK_REALTIME is used, but allow passing the + * result of KVM_GET_CLOCK back to KVM_SET_CLOCK. + */ + if (data.flags & ~KVM_CLOCK_VALID_FLAGS) + return -EINVAL; + + kvm_hv_invalidate_tsc_page(kvm); + kvm_start_pvclock_update(kvm); + pvclock_update_vm_gtod_copy(kvm); + + /* + * This pairs with kvm_guest_time_update(): when masterclock is + * in use, we use master_kernel_ns + kvmclock_offset to set + * unsigned 'system_time' so if we use get_kvmclock_ns() (which + * is slightly ahead) here we risk going negative on unsigned + * 'system_time' when 'data.clock' is very small. + */ + if (data.flags & KVM_CLOCK_REALTIME) { + u64 now_real_ns = ktime_get_real_ns(); + + /* + * Avoid stepping the kvmclock backwards. + */ + if (now_real_ns > data.realtime) + data.clock += now_real_ns - data.realtime; + } + + if (ka->use_master_clock) + now_raw_ns = ka->master_kernel_ns; + else + now_raw_ns = get_kvmclock_base_ns(); + ka->kvmclock_offset = data.clock - now_raw_ns; + kvm_end_pvclock_update(kvm); + return 0; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -5955,60 +6192,12 @@ set_pit2_out: break; } #endif - case KVM_SET_CLOCK: { - struct kvm_arch *ka = &kvm->arch; - struct kvm_clock_data user_ns; - u64 now_ns; - - r = -EFAULT; - if (copy_from_user(&user_ns, argp, sizeof(user_ns))) - goto out; - - r = -EINVAL; - if (user_ns.flags) - goto out; - - r = 0; - /* - * TODO: userspace has to take care of races with VCPU_RUN, so - * kvm_gen_update_masterclock() can be cut down to locked - * pvclock_update_vm_gtod_copy(). - */ - kvm_gen_update_masterclock(kvm); - - /* - * This pairs with kvm_guest_time_update(): when masterclock is - * in use, we use master_kernel_ns + kvmclock_offset to set - * unsigned 'system_time' so if we use get_kvmclock_ns() (which - * is slightly ahead) here we risk going negative on unsigned - * 'system_time' when 'user_ns.clock' is very small. - */ - raw_spin_lock_irq(&ka->pvclock_gtod_sync_lock); - if (kvm->arch.use_master_clock) - now_ns = ka->master_kernel_ns; - else - now_ns = get_kvmclock_base_ns(); - ka->kvmclock_offset = user_ns.clock - now_ns; - raw_spin_unlock_irq(&ka->pvclock_gtod_sync_lock); - - kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE); + case KVM_SET_CLOCK: + r = kvm_vm_ioctl_set_clock(kvm, argp); break; - } - case KVM_GET_CLOCK: { - struct kvm_clock_data user_ns; - u64 now_ns; - - now_ns = get_kvmclock_ns(kvm); - user_ns.clock = now_ns; - user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0; - memset(&user_ns.pad, 0, sizeof(user_ns.pad)); - - r = -EFAULT; - if (copy_to_user(argp, &user_ns, sizeof(user_ns))) - goto out; - r = 0; + case KVM_GET_CLOCK: + r = kvm_vm_ioctl_get_clock(kvm, argp); break; - } case KVM_MEMORY_ENCRYPT_OP: { r = -ENOTTY; if (kvm_x86_ops.mem_enc_op) @@ -7375,28 +7564,77 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) } EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); -static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu) +static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data, + u8 ndata, u8 *insn_bytes, u8 insn_size) { - struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; - u32 insn_size = ctxt->fetch.end - ctxt->fetch.data; struct kvm_run *run = vcpu->run; + u64 info[5]; + u8 info_start; + + /* + * Zero the whole array used to retrieve the exit info, as casting to + * u32 for select entries will leave some chunks uninitialized. + */ + memset(&info, 0, sizeof(info)); + + static_call(kvm_x86_get_exit_info)(vcpu, (u32 *)&info[0], &info[1], + &info[2], (u32 *)&info[3], + (u32 *)&info[4]); run->exit_reason = KVM_EXIT_INTERNAL_ERROR; run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION; - run->emulation_failure.ndata = 0; + + /* + * There's currently space for 13 entries, but 5 are used for the exit + * reason and info. Restrict to 4 to reduce the maintenance burden + * when expanding kvm_run.emulation_failure in the future. + */ + if (WARN_ON_ONCE(ndata > 4)) + ndata = 4; + + /* Always include the flags as a 'data' entry. */ + info_start = 1; run->emulation_failure.flags = 0; if (insn_size) { - run->emulation_failure.ndata = 3; + BUILD_BUG_ON((sizeof(run->emulation_failure.insn_size) + + sizeof(run->emulation_failure.insn_bytes) != 16)); + info_start += 2; run->emulation_failure.flags |= KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES; run->emulation_failure.insn_size = insn_size; memset(run->emulation_failure.insn_bytes, 0x90, sizeof(run->emulation_failure.insn_bytes)); - memcpy(run->emulation_failure.insn_bytes, - ctxt->fetch.data, insn_size); + memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size); } + + memcpy(&run->internal.data[info_start], info, sizeof(info)); + memcpy(&run->internal.data[info_start + ARRAY_SIZE(info)], data, + ndata * sizeof(data[0])); + + run->emulation_failure.ndata = info_start + ARRAY_SIZE(info) + ndata; +} + +static void prepare_emulation_ctxt_failure_exit(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; + + prepare_emulation_failure_exit(vcpu, NULL, 0, ctxt->fetch.data, + ctxt->fetch.end - ctxt->fetch.data); +} + +void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data, + u8 ndata) +{ + prepare_emulation_failure_exit(vcpu, data, ndata, NULL, 0); } +EXPORT_SYMBOL_GPL(__kvm_prepare_emulation_failure_exit); + +void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu) +{ + __kvm_prepare_emulation_failure_exit(vcpu, NULL, 0); +} +EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit); static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) { @@ -7412,16 +7650,14 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) if (kvm->arch.exit_on_emulation_error || (emulation_type & EMULTYPE_SKIP)) { - prepare_emulation_failure_exit(vcpu); + prepare_emulation_ctxt_failure_exit(vcpu); return 0; } kvm_queue_exception(vcpu, UD_VECTOR); if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; + prepare_emulation_ctxt_failure_exit(vcpu); return 0; } @@ -8021,14 +8257,13 @@ static void tsc_khz_changed(void *data) static void kvm_hyperv_tsc_notifier(void) { struct kvm *kvm; - struct kvm_vcpu *vcpu; int cpu; - unsigned long flags; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) kvm_make_mclock_inprogress_request(kvm); + /* no guest entries from this point */ hyperv_stop_tsc_emulation(); /* TSC frequency always matches when on Hyper-V */ @@ -8037,18 +8272,11 @@ static void kvm_hyperv_tsc_notifier(void) kvm_max_guest_tsc_khz = tsc_khz; list_for_each_entry(kvm, &vm_list, vm_list) { - struct kvm_arch *ka = &kvm->arch; - - raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); + __kvm_start_pvclock_update(kvm); pvclock_update_vm_gtod_copy(kvm); - raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); - - kvm_for_each_vcpu(cpu, vcpu, kvm) - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - - kvm_for_each_vcpu(cpu, vcpu, kvm) - kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); + kvm_end_pvclock_update(kvm); } + mutex_unlock(&kvm_lock); } #endif @@ -8289,18 +8517,20 @@ int kvm_arch_init(void *opaque) int r; if (kvm_x86_ops.hardware_enable) { - printk(KERN_ERR "kvm: already loaded the other module\n"); + pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name); r = -EEXIST; goto out; } if (!ops->cpu_has_kvm_support()) { - pr_err_ratelimited("kvm: no hardware support\n"); + pr_err_ratelimited("kvm: no hardware support for '%s'\n", + ops->runtime_ops->name); r = -EOPNOTSUPP; goto out; } if (ops->disabled_by_bios()) { - pr_err_ratelimited("kvm: disabled by bios\n"); + pr_err_ratelimited("kvm: support for '%s' disabled by bios\n", + ops->runtime_ops->name); r = -EOPNOTSUPP; goto out; } @@ -8485,7 +8715,7 @@ EXPORT_SYMBOL_GPL(kvm_apicv_activated); static void kvm_apicv_init(struct kvm *kvm) { - mutex_init(&kvm->arch.apicv_update_lock); + init_rwsem(&kvm->arch.apicv_update_lock); if (enable_apicv) clear_bit(APICV_INHIBIT_REASON_DISABLE, @@ -9140,14 +9370,7 @@ static void process_smi(struct kvm_vcpu *vcpu) void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, unsigned long *vcpu_bitmap) { - cpumask_var_t cpus; - - zalloc_cpumask_var(&cpus, GFP_ATOMIC); - - kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, - NULL, vcpu_bitmap, cpus); - - free_cpumask_var(cpus); + kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, vcpu_bitmap); } void kvm_make_scan_ioapic_request(struct kvm *kvm) @@ -9162,7 +9385,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) if (!lapic_in_kernel(vcpu)) return; - mutex_lock(&vcpu->kvm->arch.apicv_update_lock); + down_read(&vcpu->kvm->arch.apicv_update_lock); activate = kvm_apicv_activated(vcpu->kvm); if (vcpu->arch.apicv_active == activate) @@ -9182,7 +9405,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); out: - mutex_unlock(&vcpu->kvm->arch.apicv_update_lock); + up_read(&vcpu->kvm->arch.apicv_update_lock); } EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); @@ -9190,6 +9413,8 @@ void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) { unsigned long old, new; + lockdep_assert_held_write(&kvm->arch.apicv_update_lock); + if (!kvm_x86_ops.check_apicv_inhibit_reasons || !static_call(kvm_x86_check_apicv_inhibit_reasons)(bit)) return; @@ -9203,6 +9428,18 @@ void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) if (!!old != !!new) { trace_kvm_apicv_update_request(activate, bit); + /* + * Kick all vCPUs before setting apicv_inhibit_reasons to avoid + * false positives in the sanity check WARN in svm_vcpu_run(). + * This task will wait for all vCPUs to ack the kick IRQ before + * updating apicv_inhibit_reasons, and all other vCPUs will + * block on acquiring apicv_update_lock so that vCPUs can't + * redo svm_vcpu_run() without seeing the new inhibit state. + * + * Note, holding apicv_update_lock and taking it in the read + * side (handling the request) also prevents other vCPUs from + * servicing the request with a stale apicv_inhibit_reasons. + */ kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE); kvm->arch.apicv_inhibit_reasons = new; if (new) { @@ -9216,9 +9453,9 @@ EXPORT_SYMBOL_GPL(__kvm_request_apicv_update); void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) { - mutex_lock(&kvm->arch.apicv_update_lock); + down_write(&kvm->arch.apicv_update_lock); __kvm_request_apicv_update(kvm, activate, bit); - mutex_unlock(&kvm->arch.apicv_update_lock); + up_write(&kvm->arch.apicv_update_lock); } EXPORT_SYMBOL_GPL(kvm_request_apicv_update); @@ -9330,7 +9567,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) __kvm_migrate_timers(vcpu); if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu)) - kvm_gen_update_masterclock(vcpu->kvm); + kvm_update_masterclock(vcpu->kvm); if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu)) kvm_gen_kvmclock_update(vcpu); if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { @@ -9537,6 +9774,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } for (;;) { + /* + * Assert that vCPU vs. VM APICv state is consistent. An APICv + * update must kick and wait for all vCPUs before toggling the + * per-VM state, and responsing vCPUs must wait for the update + * to complete before servicing KVM_REQ_APICV_UPDATE. + */ + WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu)); + exit_fastpath = static_call(kvm_x86_run)(vcpu); if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) break; @@ -10485,16 +10730,6 @@ static int sync_regs(struct kvm_vcpu *vcpu) return 0; } -static void fx_init(struct kvm_vcpu *vcpu) -{ - /* - * Ensure guest xcr0 is valid for loading - */ - vcpu->arch.xcr0 = XFEATURE_MASK_FP; - - vcpu->arch.cr0 |= X86_CR0_ET; -} - int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) @@ -10556,8 +10791,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) goto free_emulate_ctxt; } - fx_init(vcpu); - vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); @@ -10654,9 +10887,19 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { + struct kvm_cpuid_entry2 *cpuid_0x1; unsigned long old_cr0 = kvm_read_cr0(vcpu); unsigned long new_cr0; - u32 eax, dummy; + + /* + * Several of the "set" flows, e.g. ->set_cr0(), read other registers + * to handle side effects. RESET emulation hits those flows and relies + * on emulated/virtualized registers, including those that are loaded + * into hardware, to be zeroed at vCPU creation. Use CRs as a sentinel + * to detect improper or missing initialization. + */ + WARN_ON_ONCE(!init_event && + (old_cr0 || kvm_read_cr3(vcpu) || kvm_read_cr4(vcpu))); kvm_lapic_reset(vcpu, init_event); @@ -10715,21 +10958,19 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.xcr0 = XFEATURE_MASK_FP; } + /* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */ memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); - vcpu->arch.regs_avail = ~0; - vcpu->arch.regs_dirty = ~0; + kvm_register_mark_dirty(vcpu, VCPU_REGS_RSP); /* * Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon) * if no CPUID match is found. Note, it's impossible to get a match at * RESET since KVM emulates RESET before exposing the vCPU to userspace, - * i.e. it'simpossible for kvm_cpuid() to find a valid entry on RESET. - * But, go through the motions in case that's ever remedied. + * i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry + * on RESET. But, go through the motions in case that's ever remedied. */ - eax = 1; - if (!kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true)) - eax = 0x600; - kvm_rdx_write(vcpu, eax); + cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1, 0); + kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600); vcpu->arch.ia32_xss = 0; @@ -10981,13 +11222,14 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) void kvm_arch_free_vm(struct kvm *kvm) { kfree(to_kvm_hv(kvm)->hv_pa_pg); - vfree(kvm); + __kvm_arch_free_vm(kvm); } int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int ret; + unsigned long flags; if (type) return -EINVAL; @@ -11011,10 +11253,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); - raw_spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); - + seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock); kvm->arch.kvmclock_offset = -get_kvmclock_base_ns(); + + raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); pvclock_update_vm_gtod_copy(kvm); + raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); kvm->arch.guest_can_read_msr_platform_info = true; @@ -11211,8 +11455,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) kvm_page_track_free_memslot(slot); } -static int memslot_rmap_alloc(struct kvm_memory_slot *slot, - unsigned long npages) +int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages) { const int sz = sizeof(*slot->arch.rmap[0]); int i; @@ -11234,50 +11477,6 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot, return 0; } -int alloc_all_memslots_rmaps(struct kvm *kvm) -{ - struct kvm_memslots *slots; - struct kvm_memory_slot *slot; - int r, i; - - /* - * Check if memslots alreday have rmaps early before acquiring - * the slots_arch_lock below. - */ - if (kvm_memslots_have_rmaps(kvm)) - return 0; - - mutex_lock(&kvm->slots_arch_lock); - - /* - * Read memslots_have_rmaps again, under the slots arch lock, - * before allocating the rmaps - */ - if (kvm_memslots_have_rmaps(kvm)) { - mutex_unlock(&kvm->slots_arch_lock); - return 0; - } - - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - slots = __kvm_memslots(kvm, i); - kvm_for_each_memslot(slot, slots) { - r = memslot_rmap_alloc(slot, slot->npages); - if (r) { - mutex_unlock(&kvm->slots_arch_lock); - return r; - } - } - } - - /* - * Ensure that memslots_have_rmaps becomes true strictly after - * all the rmap pointers are set. - */ - smp_store_release(&kvm->arch.memslots_have_rmaps, true); - mutex_unlock(&kvm->slots_arch_lock); - return 0; -} - static int kvm_alloc_memslot_metadata(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned long npages) @@ -11328,7 +11527,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm, } } - if (kvm_page_track_create_memslot(slot, npages)) + if (kvm_page_track_create_memslot(kvm, slot, npages)) goto out_free; return 0; @@ -11926,6 +12125,15 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, return static_call(kvm_x86_update_pi_irte)(kvm, host_irq, guest_irq, set); } +bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new) +{ + if (new->type != KVM_IRQ_ROUTING_MSI) + return true; + + return !!memcmp(&old->msi, &new->msi, sizeof(new->msi)); +} + bool kvm_vector_hashing_enabled(void) { return vector_hashing; @@ -12007,9 +12215,7 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, * doesn't seem to be a real use-case behind such requests, just return * KVM_EXIT_INTERNAL_ERROR for now. */ - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; + kvm_prepare_emulation_failure_exit(vcpu); return 0; } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 7d66d63dc55a..ea264c4502e4 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -343,8 +343,6 @@ extern bool enable_vmware_backdoor; extern int pi_inject_timer; -extern struct static_key kvm_no_apic_vcpu; - extern bool report_ignored_msrs; static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) |