summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/virt/kvm/api.rst78
-rw-r--r--arch/x86/include/asm/kvm_host.h24
-rw-r--r--arch/x86/include/uapi/asm/kvm.h1
-rw-r--r--arch/x86/kvm/cpuid.c12
-rw-r--r--arch/x86/kvm/cpuid.h18
-rw-r--r--arch/x86/kvm/emulate.c71
-rw-r--r--arch/x86/kvm/hyperv.c3
-rw-r--r--arch/x86/kvm/kvm_emulate.h1
-rw-r--r--arch/x86/kvm/lapic.c6
-rw-r--r--arch/x86/kvm/lapic.h3
-rw-r--r--arch/x86/kvm/mmu.h27
-rw-r--r--arch/x86/kvm/mmu/mmu.c2
-rw-r--r--arch/x86/kvm/mmu/spte.c26
-rw-r--r--arch/x86/kvm/pmu.c2
-rw-r--r--arch/x86/kvm/svm/sev.c4
-rw-r--r--arch/x86/kvm/svm/svm.c15
-rw-r--r--arch/x86/kvm/svm/svm.h2
-rw-r--r--arch/x86/kvm/trace.h9
-rw-r--r--arch/x86/kvm/vmx/main.c2
-rw-r--r--arch/x86/kvm/vmx/nested.c8
-rw-r--r--arch/x86/kvm/vmx/vmx.c29
-rw-r--r--arch/x86/kvm/vmx/vmx.h2
-rw-r--r--arch/x86/kvm/vmx/x86_ops.h2
-rw-r--r--arch/x86/kvm/x86.c112
-rw-r--r--arch/x86/kvm/x86.h19
-rw-r--r--include/uapi/linux/kvm.h2
-rw-r--r--tools/testing/selftests/kvm/Makefile1
-rw-r--r--tools/testing/selftests/kvm/include/x86_64/apic.h8
-rw-r--r--tools/testing/selftests/kvm/include/x86_64/processor.h18
-rw-r--r--tools/testing/selftests/kvm/lib/x86_64/processor.c11
-rw-r--r--tools/testing/selftests/kvm/x86_64/apic_bus_clock_test.c194
31 files changed, 503 insertions, 209 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 6ef7a08b745f..798ad65f4fee 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6483,9 +6483,12 @@ More architecture-specific flags detailing state of the VCPU that may
affect the device's behavior. Current defined flags::
/* x86, set if the VCPU is in system management mode */
- #define KVM_RUN_X86_SMM (1 << 0)
+ #define KVM_RUN_X86_SMM (1 << 0)
/* x86, set if bus lock detected in VM */
- #define KVM_RUN_BUS_LOCK (1 << 1)
+ #define KVM_RUN_X86_BUS_LOCK (1 << 1)
+ /* x86, set if the VCPU is executing a nested (L2) guest */
+ #define KVM_RUN_X86_GUEST_MODE (1 << 2)
+
/* arm64, set for KVM_EXIT_DEBUG */
#define KVM_DEBUG_ARCH_HSR_HIGH_VALID (1 << 0)
@@ -7831,29 +7834,31 @@ Valid bits in args[0] are::
#define KVM_BUS_LOCK_DETECTION_OFF (1 << 0)
#define KVM_BUS_LOCK_DETECTION_EXIT (1 << 1)
-Enabling this capability on a VM provides userspace with a way to select
-a policy to handle the bus locks detected in guest. Userspace can obtain
-the supported modes from the result of KVM_CHECK_EXTENSION and define it
-through the KVM_ENABLE_CAP.
+Enabling this capability on a VM provides userspace with a way to select a
+policy to handle the bus locks detected in guest. Userspace can obtain the
+supported modes from the result of KVM_CHECK_EXTENSION and define it through
+the KVM_ENABLE_CAP. The supported modes are mutually-exclusive.
-KVM_BUS_LOCK_DETECTION_OFF and KVM_BUS_LOCK_DETECTION_EXIT are supported
-currently and mutually exclusive with each other. More bits can be added in
-the future.
+This capability allows userspace to force VM exits on bus locks detected in the
+guest, irrespective whether or not the host has enabled split-lock detection
+(which triggers an #AC exception that KVM intercepts). This capability is
+intended to mitigate attacks where a malicious/buggy guest can exploit bus
+locks to degrade the performance of the whole system.
-With KVM_BUS_LOCK_DETECTION_OFF set, bus locks in guest will not cause vm exits
-so that no additional actions are needed. This is the default mode.
+If KVM_BUS_LOCK_DETECTION_OFF is set, KVM doesn't force guest bus locks to VM
+exit, although the host kernel's split-lock #AC detection still applies, if
+enabled.
-With KVM_BUS_LOCK_DETECTION_EXIT set, vm exits happen when bus lock detected
-in VM. KVM just exits to userspace when handling them. Userspace can enforce
-its own throttling or other policy based mitigations.
+If KVM_BUS_LOCK_DETECTION_EXIT is set, KVM enables a CPU feature that ensures
+bus locks in the guest trigger a VM exit, and KVM exits to userspace for all
+such VM exits, e.g. to allow userspace to throttle the offending guest and/or
+apply some other policy-based mitigation. When exiting to userspace, KVM sets
+KVM_RUN_X86_BUS_LOCK in vcpu-run->flags, and conditionally sets the exit_reason
+to KVM_EXIT_X86_BUS_LOCK.
-This capability is aimed to address the thread that VM can exploit bus locks to
-degree the performance of the whole system. Once the userspace enable this
-capability and select the KVM_BUS_LOCK_DETECTION_EXIT mode, KVM will set the
-KVM_RUN_BUS_LOCK flag in vcpu-run->flags field and exit to userspace. Concerning
-the bus lock vm exit can be preempted by a higher priority VM exit, the exit
-notifications to userspace can be KVM_EXIT_BUS_LOCK or other reasons.
-KVM_RUN_BUS_LOCK flag is used to distinguish between them.
+Note! Detected bus locks may be coincident with other exits to userspace, i.e.
+KVM_RUN_X86_BUS_LOCK should be checked regardless of the primary exit reason if
+userspace wants to take action on all detected bus locks.
7.23 KVM_CAP_PPC_DAWR1
----------------------
@@ -8137,6 +8142,37 @@ error/annotated fault.
See KVM_EXIT_MEMORY_FAULT for more information.
+7.35 KVM_CAP_X86_APIC_BUS_CYCLES_NS
+-----------------------------------
+
+:Architectures: x86
+:Target: VM
+:Parameters: args[0] is the desired APIC bus clock rate, in nanoseconds
+:Returns: 0 on success, -EINVAL if args[0] contains an invalid value for the
+ frequency or if any vCPUs have been created, -ENXIO if a virtual
+ local APIC has not been created using KVM_CREATE_IRQCHIP.
+
+This capability sets the VM's APIC bus clock frequency, used by KVM's in-kernel
+virtual APIC when emulating APIC timers. KVM's default value can be retrieved
+by KVM_CHECK_EXTENSION.
+
+Note: Userspace is responsible for correctly configuring CPUID 0x15, a.k.a. the
+core crystal clock frequency, if a non-zero CPUID 0x15 is exposed to the guest.
+
+7.36 KVM_CAP_X86_GUEST_MODE
+------------------------------
+
+:Architectures: x86
+:Returns: Informational only, -EINVAL on direct KVM_ENABLE_CAP.
+
+The presence of this capability indicates that KVM_RUN will update the
+KVM_RUN_X86_GUEST_MODE bit in kvm_run.flags to indicate whether the
+vCPU was executing nested guest code when it exited.
+
+KVM exits with the register state of either the L1 or L2 guest
+depending on which executed at the time of an exit. Userspace must
+take care to differentiate between these cases.
+
8. Other capabilities.
======================
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 058a5c67979d..d5101f52e76c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1208,7 +1208,7 @@ enum kvm_apicv_inhibit {
* APIC acceleration is disabled by a module parameter
* and/or not supported in hardware.
*/
- APICV_INHIBIT_REASON_DISABLE,
+ APICV_INHIBIT_REASON_DISABLED,
/*
* APIC acceleration is inhibited because AutoEOI feature is
@@ -1278,8 +1278,27 @@ enum kvm_apicv_inhibit {
* mapping between logical ID and vCPU.
*/
APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED,
+
+ NR_APICV_INHIBIT_REASONS,
};
+#define __APICV_INHIBIT_REASON(reason) \
+ { BIT(APICV_INHIBIT_REASON_##reason), #reason }
+
+#define APICV_INHIBIT_REASONS \
+ __APICV_INHIBIT_REASON(DISABLED), \
+ __APICV_INHIBIT_REASON(HYPERV), \
+ __APICV_INHIBIT_REASON(ABSENT), \
+ __APICV_INHIBIT_REASON(BLOCKIRQ), \
+ __APICV_INHIBIT_REASON(PHYSICAL_ID_ALIASED), \
+ __APICV_INHIBIT_REASON(APIC_ID_MODIFIED), \
+ __APICV_INHIBIT_REASON(APIC_BASE_MODIFIED), \
+ __APICV_INHIBIT_REASON(NESTED), \
+ __APICV_INHIBIT_REASON(IRQWIN), \
+ __APICV_INHIBIT_REASON(PIT_REINJ), \
+ __APICV_INHIBIT_REASON(SEV), \
+ __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED)
+
struct kvm_arch {
unsigned long n_used_mmu_pages;
unsigned long n_requested_mmu_pages;
@@ -1365,6 +1384,7 @@ struct kvm_arch {
u32 default_tsc_khz;
bool user_set_tsc;
+ u64 apic_bus_cycle_ns;
seqcount_raw_spinlock_t pvclock_sc;
bool use_master_clock;
@@ -1709,7 +1729,6 @@ struct kvm_x86_ops {
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
- bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason);
const unsigned long required_apicv_inhibits;
bool allow_apicv_in_x2apic_without_x2apic_virtualization;
void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
@@ -1855,7 +1874,6 @@ struct kvm_arch_async_pf {
};
extern u32 __read_mostly kvm_nr_uret_msrs;
-extern u64 __read_mostly host_efer;
extern bool __read_mostly allow_smaller_maxphyaddr;
extern bool __read_mostly enable_apicv;
extern struct kvm_x86_ops kvm_x86_ops;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 988b5204d636..bf57a824f722 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -106,6 +106,7 @@ struct kvm_ioapic_state {
#define KVM_RUN_X86_SMM (1 << 0)
#define KVM_RUN_X86_BUS_LOCK (1 << 1)
+#define KVM_RUN_X86_GUEST_MODE (1 << 2)
/* for KVM_GET_REGS and KVM_SET_REGS */
struct kvm_regs {
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index f2f2be5d1141..8cf4ca2ae79d 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -335,6 +335,18 @@ static bool kvm_cpuid_has_hyperv(struct kvm_cpuid_entry2 *entries, int nent)
#endif
}
+static bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *entry;
+
+ entry = kvm_find_cpuid_entry(vcpu, 0);
+ if (!entry)
+ return false;
+
+ return is_guest_vendor_amd(entry->ebx, entry->ecx, entry->edx) ||
+ is_guest_vendor_hygon(entry->ebx, entry->ecx, entry->edx);
+}
+
static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 23dbb9eb277c..41697cca354e 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -102,24 +102,6 @@ static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu,
*reg &= ~__feature_bit(x86_feature);
}
-static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 0);
- return best &&
- (is_guest_vendor_amd(best->ebx, best->ecx, best->edx) ||
- is_guest_vendor_hygon(best->ebx, best->ecx, best->edx));
-}
-
-static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 0);
- return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx);
-}
-
static inline bool guest_cpuid_is_amd_compatible(struct kvm_vcpu *vcpu)
{
return vcpu->arch.is_amd_compatible;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5d4c86133453..1acd97c6fa53 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2354,50 +2354,6 @@ setup_syscalls_segments(struct desc_struct *cs, struct desc_struct *ss)
ss->avl = 0;
}
-static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
-{
- u32 eax, ebx, ecx, edx;
-
- eax = ecx = 0;
- ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true);
- return is_guest_vendor_intel(ebx, ecx, edx);
-}
-
-static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
-{
- const struct x86_emulate_ops *ops = ctxt->ops;
- u32 eax, ebx, ecx, edx;
-
- /*
- * syscall should always be enabled in longmode - so only become
- * vendor specific (cpuid) if other modes are active...
- */
- if (ctxt->mode == X86EMUL_MODE_PROT64)
- return true;
-
- eax = 0x00000000;
- ecx = 0x00000000;
- ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true);
- /*
- * remark: Intel CPUs only support "syscall" in 64bit longmode. Also a
- * 64bit guest with a 32bit compat-app running will #UD !! While this
- * behaviour can be fixed (by emulating) into AMD response - CPUs of
- * AMD can't behave like Intel.
- */
- if (is_guest_vendor_intel(ebx, ecx, edx))
- return false;
-
- if (is_guest_vendor_amd(ebx, ecx, edx) ||
- is_guest_vendor_hygon(ebx, ecx, edx))
- return true;
-
- /*
- * default: (not Intel, not AMD, not Hygon), apply Intel's
- * stricter rules...
- */
- return false;
-}
-
static int em_syscall(struct x86_emulate_ctxt *ctxt)
{
const struct x86_emulate_ops *ops = ctxt->ops;
@@ -2411,7 +2367,15 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
ctxt->mode == X86EMUL_MODE_VM86)
return emulate_ud(ctxt);
- if (!(em_syscall_is_enabled(ctxt)))
+ /*
+ * Intel compatible CPUs only support SYSCALL in 64-bit mode, whereas
+ * AMD allows SYSCALL in any flavor of protected mode. Note, it's
+ * infeasible to emulate Intel behavior when running on AMD hardware,
+ * as SYSCALL won't fault in the "wrong" mode, i.e. there is no #UD
+ * for KVM to trap-and-emulate, unlike emulating AMD on Intel.
+ */
+ if (ctxt->mode != X86EMUL_MODE_PROT64 &&
+ ctxt->ops->guest_cpuid_is_intel_compatible(ctxt))
return emulate_ud(ctxt);
ops->get_msr(ctxt, MSR_EFER, &efer);
@@ -2471,11 +2435,11 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
return emulate_gp(ctxt, 0);
/*
- * Not recognized on AMD in compat mode (but is recognized in legacy
- * mode).
+ * Intel's architecture allows SYSENTER in compatibility mode, but AMD
+ * does not. Note, AMD does allow SYSENTER in legacy protected mode.
*/
- if ((ctxt->mode != X86EMUL_MODE_PROT64) && (efer & EFER_LMA)
- && !vendor_intel(ctxt))
+ if ((ctxt->mode != X86EMUL_MODE_PROT64) && (efer & EFER_LMA) &&
+ !ctxt->ops->guest_cpuid_is_intel_compatible(ctxt))
return emulate_ud(ctxt);
/* sysenter/sysexit have not been tested in 64bit mode. */
@@ -2647,7 +2611,14 @@ static void string_registers_quirk(struct x86_emulate_ctxt *ctxt)
* manner when ECX is zero due to REP-string optimizations.
*/
#ifdef CONFIG_X86_64
- if (ctxt->ad_bytes != 4 || !vendor_intel(ctxt))
+ u32 eax, ebx, ecx, edx;
+
+ if (ctxt->ad_bytes != 4)
+ return;
+
+ eax = ecx = 0;
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true);
+ if (!is_guest_vendor_intel(ebx, ecx, edx))
return;
*reg_write(ctxt, VCPU_REGS_RCX) = 0;
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 8a47f8541eab..5c31e715d2ad 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1737,7 +1737,8 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
data = (u64)vcpu->arch.virtual_tsc_khz * 1000;
break;
case HV_X64_MSR_APIC_FREQUENCY:
- data = APIC_BUS_FREQUENCY;
+ data = div64_u64(1000000000ULL,
+ vcpu->kvm->arch.apic_bus_cycle_ns);
break;
default:
kvm_pr_unimpl_rdmsr(vcpu, msr);
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 29ea4313e1bb..55a18e2f2dcd 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -223,6 +223,7 @@ struct x86_emulate_ops {
bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt);
bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt);
bool (*guest_has_rdpid)(struct x86_emulate_ctxt *ctxt);
+ bool (*guest_cpuid_is_intel_compatible)(struct x86_emulate_ctxt *ctxt);
void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index acd7d48100a1..4e44c267959a 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1557,7 +1557,8 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
remaining = 0;
ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period);
- return div64_u64(ns, (APIC_BUS_CYCLE_NS * apic->divide_count));
+ return div64_u64(ns, (apic->vcpu->kvm->arch.apic_bus_cycle_ns *
+ apic->divide_count));
}
static void __report_tpr_access(struct kvm_lapic *apic, bool write)
@@ -1973,7 +1974,8 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
static inline u64 tmict_to_ns(struct kvm_lapic *apic, u32 tmict)
{
- return (u64)tmict * APIC_BUS_CYCLE_NS * (u64)apic->divide_count;
+ return (u64)tmict * apic->vcpu->kvm->arch.apic_bus_cycle_ns *
+ (u64)apic->divide_count;
}
static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor)
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index a69e706b9080..9030778e3077 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -16,8 +16,7 @@
#define APIC_DEST_NOSHORT 0x0
#define APIC_DEST_MASK 0x800
-#define APIC_BUS_CYCLE_NS 1
-#define APIC_BUS_FREQUENCY (1000000000ULL / APIC_BUS_CYCLE_NS)
+#define APIC_BUS_CYCLE_NS_DEFAULT 1
#define APIC_BROADCAST 0xFF
#define X2APIC_BROADCAST 0xFFFFFFFFul
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index dc80e72e4848..f2e7e5c9b9ef 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -57,12 +57,6 @@ static __always_inline u64 rsvd_bits(int s, int e)
return ((2ULL << (e - s)) - 1) << s;
}
-/*
- * The number of non-reserved physical address bits irrespective of features
- * that repurpose legal bits, e.g. MKTME.
- */
-extern u8 __read_mostly shadow_phys_bits;
-
static inline gfn_t kvm_mmu_max_gfn(void)
{
/*
@@ -76,30 +70,11 @@ static inline gfn_t kvm_mmu_max_gfn(void)
* than hardware's real MAXPHYADDR. Using the host MAXPHYADDR
* disallows such SPTEs entirely and simplifies the TDP MMU.
*/
- int max_gpa_bits = likely(tdp_enabled) ? shadow_phys_bits : 52;
+ int max_gpa_bits = likely(tdp_enabled) ? kvm_host.maxphyaddr : 52;
return (1ULL << (max_gpa_bits - PAGE_SHIFT)) - 1;
}
-static inline u8 kvm_get_shadow_phys_bits(void)
-{
- /*
- * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
- * in CPU detection code, but the processor treats those reduced bits as
- * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
- * the physical address bits reported by CPUID.
- */
- if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
- return cpuid_eax(0x80000008) & 0xff;
-
- /*
- * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
- * custom CPUID. Proceed with whatever the kernel found since these features
- * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
- */
- return boot_cpu_data.x86_phys_bits;
-}
-
u8 kvm_mmu_get_max_tdp_level(void);
void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 4e0e9963066f..44ae8455f91e 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5109,7 +5109,7 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
static inline u64 reserved_hpa_bits(void)
{
- return rsvd_bits(shadow_phys_bits, 63);
+ return rsvd_bits(kvm_host.maxphyaddr, 63);
}
/*
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 59cac37615b6..8275b96b6596 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -43,7 +43,25 @@ u64 __read_mostly shadow_acc_track_mask;
u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
-u8 __read_mostly shadow_phys_bits;
+static u8 __init kvm_get_host_maxphyaddr(void)
+{
+ /*
+ * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
+ * in CPU detection code, but the processor treats those reduced bits as
+ * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
+ * the physical address bits reported by CPUID, i.e. the raw MAXPHYADDR,
+ * when reasoning about CPU behavior with respect to MAXPHYADDR.
+ */
+ if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
+ return cpuid_eax(0x80000008) & 0xff;
+
+ /*
+ * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
+ * custom CPUID. Proceed with whatever the kernel found since these features
+ * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
+ */
+ return boot_cpu_data.x86_phys_bits;
+}
void __init kvm_mmu_spte_module_init(void)
{
@@ -55,6 +73,8 @@ void __init kvm_mmu_spte_module_init(void)
* will change when the vendor module is (re)loaded.
*/
allow_mmio_caching = enable_mmio_caching;
+
+ kvm_host.maxphyaddr = kvm_get_host_maxphyaddr();
}
static u64 generation_mmio_spte_mask(u64 gen)
@@ -441,8 +461,6 @@ void kvm_mmu_reset_all_pte_masks(void)
u8 low_phys_bits;
u64 mask;
- shadow_phys_bits = kvm_get_shadow_phys_bits();
-
/*
* If the CPU has 46 or less physical address bits, then set an
* appropriate mask to guard against L1TF attacks. Otherwise, it is
@@ -494,7 +512,7 @@ void kvm_mmu_reset_all_pte_masks(void)
* 52-bit physical addresses then there are no reserved PA bits in the
* PTEs and so the reserved PA approach must be disabled.
*/
- if (shadow_phys_bits < 52)
+ if (kvm_host.maxphyaddr < 52)
mask = BIT_ULL(51) | PT_PRESENT_MASK;
else
mask = 0;
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index f9149c9fc275..7f0fdbc8240f 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -194,7 +194,7 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
attr.sample_period = get_sample_period(pmc, pmc->counter);
if ((attr.config & HSW_IN_TX_CHECKPOINTED) &&
- guest_cpuid_is_intel(pmc->vcpu)) {
+ (boot_cpu_has(X86_FEATURE_RTM) || boot_cpu_has(X86_FEATURE_HLE))) {
/*
* HSW_IN_TX_CHECKPOINTED is not supported with nonzero
* period. Just clear the sample period so at least
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index df8818759698..8cb439a5c77a 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -4401,9 +4401,9 @@ void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_are
* isn't saved by VMRUN, that isn't already saved by VMSAVE (performed
* by common SVM code).
*/
- hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+ hostsa->xcr0 = kvm_host.xcr0;
hostsa->pkru = read_pkru();
- hostsa->xss = host_xss;
+ hostsa->xss = kvm_host.xss;
/*
* If DebugSwap is enabled, debug registers are loaded but NOT saved by
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 80ac33a5effe..f011849e7ef8 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -53,6 +53,7 @@
#include "svm_onhyperv.h"
MODULE_AUTHOR("Qumranet");
+MODULE_DESCRIPTION("KVM support for SVM (AMD-V) extensions");
MODULE_LICENSE("GPL");
#ifdef MODULE
@@ -1202,7 +1203,7 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (guest_cpuid_is_intel(vcpu)) {
+ if (guest_cpuid_is_intel_compatible(vcpu)) {
/*
* We must intercept SYSENTER_EIP and SYSENTER_ESP
* accesses because the processor only stores 32 bits.
@@ -2890,12 +2891,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_SYSENTER_EIP:
msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
- if (guest_cpuid_is_intel(vcpu))
+ if (guest_cpuid_is_intel_compatible(vcpu))
msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
break;
case MSR_IA32_SYSENTER_ESP:
msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
- if (guest_cpuid_is_intel(vcpu))
+ if (guest_cpuid_is_intel_compatible(vcpu))
msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
break;
case MSR_TSC_AUX:
@@ -3122,11 +3123,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
* 32 bit part of these msrs to support Intel's
* implementation of SYSENTER/SYSEXIT.
*/
- svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
+ svm->sysenter_eip_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0;
break;
case MSR_IA32_SYSENTER_ESP:
svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
- svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
+ svm->sysenter_esp_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0;
break;
case MSR_TSC_AUX:
/*
@@ -4387,11 +4388,11 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV);
/*
- * Intercept VMLOAD if the vCPU mode is Intel in order to emulate that
+ * Intercept VMLOAD if the vCPU model is Intel in order to emulate that
* VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing
* SVM on Intel is bonkers and extremely unlikely to work).
*/
- if (!guest_cpuid_is_intel(vcpu))
+ if (!guest_cpuid_is_intel_compatible(vcpu))
kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER);
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index d2397b98bbf0..0eb87bb6c5a8 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -668,7 +668,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
/* avic.c */
#define AVIC_REQUIRED_APICV_INHIBITS \
( \
- BIT(APICV_INHIBIT_REASON_DISABLE) | \
+ BIT(APICV_INHIBIT_REASON_DISABLED) | \
BIT(APICV_INHIBIT_REASON_ABSENT) | \
BIT(APICV_INHIBIT_REASON_HYPERV) | \
BIT(APICV_INHIBIT_REASON_NESTED) | \
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 10ad5d32fcc3..e30d01ef5a61 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1375,6 +1375,10 @@ TRACE_EVENT(kvm_hv_stimer_cleanup,
__entry->vcpu_id, __entry->timer_index)
);
+#define kvm_print_apicv_inhibit_reasons(inhibits) \
+ (inhibits), (inhibits) ? " " : "", \
+ (inhibits) ? __print_flags(inhibits, "|", APICV_INHIBIT_REASONS) : ""
+
TRACE_EVENT(kvm_apicv_inhibit_changed,
TP_PROTO(int reason, bool set, unsigned long inhibits),
TP_ARGS(reason, set, inhibits),
@@ -1391,9 +1395,10 @@ TRACE_EVENT(kvm_apicv_inhibit_changed,
__entry->inhibits = inhibits;
),
- TP_printk("%s reason=%u, inhibits=0x%lx",
+ TP_printk("%s reason=%u, inhibits=0x%lx%s%s",
__entry->set ? "set" : "cleared",
- __entry->reason, __entry->inhibits)
+ __entry->reason,
+ kvm_print_apicv_inhibit_reasons(__entry->inhibits))
);
TRACE_EVENT(kvm_apicv_accept_irq,
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index c7a86be0f30e..d8e2e853be51 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -8,7 +8,7 @@
#include "posted_intr.h"
#define VMX_REQUIRED_APICV_INHIBITS \
- (BIT(APICV_INHIBIT_REASON_DISABLE)| \
+ (BIT(APICV_INHIBIT_REASON_DISABLED) | \
BIT(APICV_INHIBIT_REASON_ABSENT) | \
BIT(APICV_INHIBIT_REASON_HYPERV) | \
BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 643935a0f70a..75b4f41d9926 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2425,7 +2425,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
if (cpu_has_load_ia32_efer()) {
if (guest_efer & EFER_LMA)
exec_control |= VM_ENTRY_IA32E_MODE;
- if (guest_efer != host_efer)
+ if (guest_efer != kvm_host.efer)
exec_control |= VM_ENTRY_LOAD_IA32_EFER;
}
vm_entry_controls_set(vmx, exec_control);
@@ -2438,7 +2438,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
* bits may be modified by vmx_set_efer() in prepare_vmcs02().
*/
exec_control = __vm_exit_controls_get(vmcs01);
- if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
+ if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer)
exec_control |= VM_EXIT_LOAD_IA32_EFER;
else
exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
@@ -4665,7 +4665,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
return vmcs_read64(GUEST_IA32_EFER);
if (cpu_has_load_ia32_efer())
- return host_efer;
+ return kvm_host.efer;
for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
@@ -4676,7 +4676,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
if (efer_msr)
return efer_msr->data;
- return host_efer;
+ return kvm_host.efer;
}
static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 078386582782..bedb9ba96918 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -74,6 +74,7 @@
#include "posted_intr.h"
MODULE_AUTHOR("Qumranet");
+MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions");
MODULE_LICENSE("GPL");
#ifdef MODULE
@@ -259,7 +260,7 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
return 0;
}
- if (host_arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
+ if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
return 0;
}
@@ -404,7 +405,7 @@ static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
* and VM-Exit.
*/
vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) &&
- (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
+ (kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
!boot_cpu_has_bug(X86_BUG_MDS) &&
!boot_cpu_has_bug(X86_BUG_TAA);
@@ -1123,12 +1124,12 @@ static bool update_transition_efer(struct vcpu_vmx *vmx)
* atomically, since it's faster than switching it manually.
*/
if (cpu_has_load_ia32_efer() ||
- (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
+ (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) {
if (!(guest_efer & EFER_LMA))
guest_efer &= ~EFER_LME;
- if (guest_efer != host_efer)
+ if (guest_efer != kvm_host.efer)
add_atomic_switch_msr(vmx, MSR_EFER,
- guest_efer, host_efer, false);
+ guest_efer, kvm_host.efer, false);
else
clear_atomic_switch_msr(vmx, MSR_EFER);
return false;
@@ -1141,7 +1142,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx)
clear_atomic_switch_msr(vmx, MSR_EFER);
guest_efer &= ~ignore_bits;
- guest_efer |= host_efer & ignore_bits;
+ guest_efer |= kvm_host.efer & ignore_bits;
vmx->guest_uret_msrs[i].data = guest_efer;
vmx->guest_uret_msrs[i].mask = ~ignore_bits;
@@ -4392,7 +4393,7 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
}
if (cpu_has_load_ia32_efer())
- vmcs_write64(HOST_IA32_EFER, host_efer);
+ vmcs_write64(HOST_IA32_EFER, kvm_host.efer);
}
void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
@@ -8394,18 +8395,16 @@ static void __init vmx_setup_me_spte_mask(void)
u64 me_mask = 0;
/*
- * kvm_get_shadow_phys_bits() returns shadow_phys_bits. Use
- * the former to avoid exposing shadow_phys_bits.
- *
* On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to
- * shadow_phys_bits. On MKTME and/or TDX capable systems,
+ * kvm_host.maxphyaddr. On MKTME and/or TDX capable systems,
* boot_cpu_data.x86_phys_bits holds the actual physical address
- * w/o the KeyID bits, and shadow_phys_bits equals to MAXPHYADDR
- * reported by CPUID. Those bits between are KeyID bits.
+ * w/o the KeyID bits, and kvm_host.maxphyaddr equals to
+ * MAXPHYADDR reported by CPUID. Those bits between are KeyID bits.
*/
- if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits())
+ if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr)
me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
- kvm_get_shadow_phys_bits() - 1);
+ kvm_host.maxphyaddr - 1);
+
/*
* Unlike SME, host kernel doesn't support setting up any
* MKTME KeyID on Intel platforms. No memory encryption
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 7b64e271a931..08d7d67fe760 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -727,7 +727,7 @@ static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu)
return true;
return allow_smaller_maxphyaddr &&
- cpuid_maxphyaddr(vcpu) < kvm_get_shadow_phys_bits();
+ cpuid_maxphyaddr(vcpu) < kvm_host.maxphyaddr;
}
static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index 3cb0be94e779..6016883c8533 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -46,7 +46,6 @@ bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu);
void vmx_migrate_timers(struct kvm_vcpu *vcpu);
void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu);
-bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason);
void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr);
void vmx_hwapic_isr_update(int max_isr);
bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu);
@@ -111,7 +110,6 @@ u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu);
u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu);
void vmx_write_tsc_offset(struct kvm_vcpu *vcpu);
void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu);
-void vmx_request_immediate_exit(struct kvm_vcpu *vcpu);
void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
#ifdef CONFIG_X86_64
int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e8c34f8e1964..e90e1a74564e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -100,6 +100,9 @@
struct kvm_caps kvm_caps __read_mostly;
EXPORT_SYMBOL_GPL(kvm_caps);
+struct kvm_host_values kvm_host __read_mostly;
+EXPORT_SYMBOL_GPL(kvm_host);
+
#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e))
#define emul_to_vcpu(ctxt) \
@@ -220,21 +223,12 @@ static struct kvm_user_return_msrs __percpu *user_return_msrs;
| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
| XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE)
-u64 __read_mostly host_efer;
-EXPORT_SYMBOL_GPL(host_efer);
-
bool __read_mostly allow_smaller_maxphyaddr = 0;
EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
bool __read_mostly enable_apicv = true;
EXPORT_SYMBOL_GPL(enable_apicv);
-u64 __read_mostly host_xss;
-EXPORT_SYMBOL_GPL(host_xss);
-
-u64 __read_mostly host_arch_capabilities;
-EXPORT_SYMBOL_GPL(host_arch_capabilities);
-
const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
KVM_GENERIC_VM_STATS(),
STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
@@ -308,8 +302,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = {
sizeof(kvm_vcpu_stats_desc),
};
-u64 __read_mostly host_xcr0;
-
static struct kmem_cache *x86_emulator_cache;
/*
@@ -1016,11 +1008,11 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
- if (vcpu->arch.xcr0 != host_xcr0)
+ if (vcpu->arch.xcr0 != kvm_host.xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
- vcpu->arch.ia32_xss != host_xss)
+ vcpu->arch.ia32_xss != kvm_host.xss)
wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
}
@@ -1047,12 +1039,12 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
- if (vcpu->arch.xcr0 != host_xcr0)
- xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
+ if (vcpu->arch.xcr0 != kvm_host.xcr0)
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
- vcpu->arch.ia32_xss != host_xss)
- wrmsrl(MSR_IA32_XSS, host_xss);
+ vcpu->arch.ia32_xss != kvm_host.xss)
+ wrmsrl(MSR_IA32_XSS, kvm_host.xss);
}
}
@@ -1619,7 +1611,7 @@ static bool kvm_is_immutable_feature_msr(u32 msr)
static u64 kvm_get_arch_capabilities(void)
{
- u64 data = host_arch_capabilities & KVM_SUPPORTED_ARCH_CAP;
+ u64 data = kvm_host.arch_capabilities & KVM_SUPPORTED_ARCH_CAP;
/*
* If nx_huge_pages is enabled, KVM's shadow paging will ensure that
@@ -1877,11 +1869,11 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
* incomplete and conflicting architectural behavior. Current
* AMD CPUs completely ignore bits 63:32, i.e. they aren't
* reserved and always read as zeros. Enforce Intel's reserved
- * bits check if and only if the guest CPU is Intel, and clear
- * the bits in all other cases. This ensures cross-vendor
- * migration will provide consistent behavior for the guest.
+ * bits check if the guest CPU is Intel compatible, otherwise
+ * clear the bits. This ensures cross-vendor migration will
+ * provide consistent behavior for the guest.
*/
- if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0)
+ if (guest_cpuid_is_intel_compatible(vcpu) && (data >> 32) != 0)
return 1;
data = (u32)data;
@@ -4703,11 +4695,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
case KVM_CAP_IRQFD_RESAMPLE:
case KVM_CAP_MEMORY_FAULT_INFO:
+ case KVM_CAP_X86_GUEST_MODE:
r = 1;
break;
case KVM_CAP_PRE_FAULT_MEMORY:
r = tdp_enabled;
break;
+ case KVM_CAP_X86_APIC_BUS_CYCLES_NS:
+ r = APIC_BUS_CYCLE_NS_DEFAULT;
+ break;
case KVM_CAP_EXIT_HYPERCALL:
r = KVM_EXIT_HYPERCALL_VALID_MASK;
break;
@@ -5891,8 +5887,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EINVAL;
if (!lapic_in_kernel(vcpu))
goto out;
- u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
- GFP_KERNEL_ACCOUNT);
+ u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
r = -ENOMEM;
if (!u.lapic)
@@ -6085,7 +6080,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave))
break;
- u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
+ u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
r = -ENOMEM;
if (!u.xsave)
break;
@@ -6116,7 +6111,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
case KVM_GET_XSAVE2: {
int size = vcpu->arch.guest_fpu.uabi_size;
- u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT);
+ u.xsave = kzalloc(size, GFP_KERNEL);
r = -ENOMEM;
if (!u.xsave)
break;
@@ -6134,7 +6129,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
case KVM_GET_XCRS: {
- u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
+ u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
r = -ENOMEM;
if (!u.xcrs)
break;
@@ -6756,6 +6751,30 @@ split_irqchip_unlock:
}
mutex_unlock(&kvm->lock);
break;
+ case KVM_CAP_X86_APIC_BUS_CYCLES_NS: {
+ u64 bus_cycle_ns = cap->args[0];
+ u64 unused;
+
+ /*
+ * Guard against overflow in tmict_to_ns(). 128 is the highest
+ * divide value that can be programmed in APIC_TDCR.
+ */
+ r = -EINVAL;
+ if (!bus_cycle_ns ||
+ check_mul_overflow((u64)U32_MAX * 128, bus_cycle_ns, &unused))
+ break;
+
+ r = 0;
+ mutex_lock(&kvm->lock);
+ if (!irqchip_in_kernel(kvm))
+ r = -ENXIO;
+ else if (kvm->created_vcpus)
+ r = -EINVAL;
+ else
+ kvm->arch.apic_bus_cycle_ns = bus_cycle_ns;
+ mutex_unlock(&kvm->lock);
+ break;
+ }
default:
r = -EINVAL;
break;
@@ -8535,6 +8554,11 @@ static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt)
return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID);
}
+static bool emulator_guest_cpuid_is_intel_compatible(struct x86_emulate_ctxt *ctxt)
+{
+ return guest_cpuid_is_intel_compatible(emul_to_vcpu(ctxt));
+}
+
static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
{
return kvm_register_read_raw(emul_to_vcpu(ctxt), reg);
@@ -8633,6 +8657,7 @@ static const struct x86_emulate_ops emulate_ops = {
.guest_has_movbe = emulator_guest_has_movbe,
.guest_has_fxsr = emulator_guest_has_fxsr,
.guest_has_rdpid = emulator_guest_has_rdpid,
+ .guest_cpuid_is_intel_compatible = emulator_guest_cpuid_is_intel_compatible,
.set_nmi_mask = emulator_set_nmi_mask,
.is_smm = emulator_is_smm,
.is_guest_mode = emulator_is_guest_mode,
@@ -9014,19 +9039,17 @@ EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu)
{
- u32 shadow;
-
if (kvm_get_rflags(vcpu) & X86_EFLAGS_RF)
return true;
/*
- * Intel CPUs inhibit code #DBs when MOV/POP SS blocking is active,
- * but AMD CPUs do not. MOV/POP SS blocking is rare, check that first
- * to avoid the relatively expensive CPUID lookup.
+ * Intel compatible CPUs inhibit code #DBs when MOV/POP SS blocking is
+ * active, but AMD compatible CPUs do not.
*/
- shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
- return (shadow & KVM_X86_SHADOW_INT_MOV_SS) &&
- guest_cpuid_is_intel(vcpu);
+ if (!guest_cpuid_is_intel_compatible(vcpu))
+ return false;
+
+ return static_call(kvm_x86_get_interrupt_shadow)(vcpu) & KVM_X86_SHADOW_INT_MOV_SS;
}
static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu,
@@ -9786,19 +9809,19 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
kvm_caps.supported_mce_cap = MCG_CTL_P | MCG_SER_P;
if (boot_cpu_has(X86_FEATURE_XSAVE)) {
- host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
- kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
+ kvm_host.xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+ kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0;
}
- rdmsrl_safe(MSR_EFER, &host_efer);
+ rdmsrl_safe(MSR_EFER, &kvm_host.efer);
if (boot_cpu_has(X86_FEATURE_XSAVES))
- rdmsrl(MSR_IA32_XSS, host_xss);
+ rdmsrl(MSR_IA32_XSS, kvm_host.xss);
kvm_init_pmu_capability(ops->pmu_ops);
if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, host_arch_capabilities);
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, kvm_host.arch_capabilities);
r = ops->hardware_setup();
if (r != 0)
@@ -10023,6 +10046,10 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_apicv_activated);
static void set_or_clear_apicv_inhibit(unsigned long *inhibits,
enum kvm_apicv_inhibit reason, bool set)
{
+ const struct trace_print_flags apicv_inhibits[] = { APICV_INHIBIT_REASONS };
+
+ BUILD_BUG_ON(ARRAY_SIZE(apicv_inhibits) != NR_APICV_INHIBIT_REASONS);
+
if (set)
__set_bit(reason, inhibits);
else
@@ -10034,7 +10061,7 @@ static void set_or_clear_apicv_inhibit(unsigned long *inhibits,
static void kvm_apicv_init(struct kvm *kvm)
{
enum kvm_apicv_inhibit reason = enable_apicv ? APICV_INHIBIT_REASON_ABSENT :
- APICV_INHIBIT_REASON_DISABLE;
+ APICV_INHIBIT_REASON_DISABLED;
set_or_clear_apicv_inhibit(&kvm->arch.apicv_inhibit_reasons, reason, true);
@@ -10255,6 +10282,8 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
if (is_smm(vcpu))
kvm_run->flags |= KVM_RUN_X86_SMM;
+ if (is_guest_mode(vcpu))
+ kvm_run->flags |= KVM_RUN_X86_GUEST_MODE;
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@ -12629,6 +12658,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz;
+ kvm->arch.apic_bus_cycle_ns = APIC_BUS_CYCLE_NS_DEFAULT;
kvm->arch.guest_can_read_msr_platform_info = true;
kvm->arch.enable_pmu = enable_pmu;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index d80a4c6b5a38..a88c65d3ea26 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -33,6 +33,20 @@ struct kvm_caps {
u64 supported_perf_cap;
};
+struct kvm_host_values {
+ /*
+ * The host's raw MAXPHYADDR, i.e. the number of non-reserved physical
+ * address bits irrespective of features that repurpose legal bits,
+ * e.g. MKTME.
+ */
+ u8 maxphyaddr;
+
+ u64 efer;
+ u64 xcr0;
+ u64 xss;
+ u64 arch_capabilities;
+};
+
void kvm_spurious_fault(void);
#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \
@@ -325,11 +339,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int emulation_type, void *insn, int insn_len);
fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
-extern u64 host_xcr0;
-extern u64 host_xss;
-extern u64 host_arch_capabilities;
-
extern struct kvm_caps kvm_caps;
+extern struct kvm_host_values kvm_host;
extern bool enable_pmu;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 1053e7c5eab9..637efc055145 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -931,6 +931,8 @@ struct kvm_enable_cap {
#define KVM_CAP_GUEST_MEMFD 234
#define KVM_CAP_VM_TYPES 235
#define KVM_CAP_PRE_FAULT_MEMORY 236
+#define KVM_CAP_X86_APIC_BUS_CYCLES_NS 237
+#define KVM_CAP_X86_GUEST_MODE 238
struct kvm_irq_routing_irqchip {
__u32 irqchip;
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index b073b818be37..b084ba2262a0 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -112,6 +112,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state
TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test
+TEST_GEN_PROGS_x86_64 += x86_64/apic_bus_clock_test
TEST_GEN_PROGS_x86_64 += x86_64/xapic_ipi_test
TEST_GEN_PROGS_x86_64 += x86_64/xapic_state_test
TEST_GEN_PROGS_x86_64 += x86_64/xcr0_cpuid_test
diff --git a/tools/testing/selftests/kvm/include/x86_64/apic.h b/tools/testing/selftests/kvm/include/x86_64/apic.h
index bed316fdecd5..0f268b55fa06 100644
--- a/tools/testing/selftests/kvm/include/x86_64/apic.h
+++ b/tools/testing/selftests/kvm/include/x86_64/apic.h
@@ -60,6 +60,14 @@
#define APIC_VECTOR_MASK 0x000FF
#define APIC_ICR2 0x310
#define SET_APIC_DEST_FIELD(x) ((x) << 24)
+#define APIC_LVTT 0x320
+#define APIC_LVT_TIMER_ONESHOT (0 << 17)
+#define APIC_LVT_TIMER_PERIODIC (1 << 17)
+#define APIC_LVT_TIMER_TSCDEADLINE (2 << 17)
+#define APIC_LVT_MASKED (1 << 16)
+#define APIC_TMICT 0x380
+#define APIC_TMCCT 0x390
+#define APIC_TDCR 0x3E0
void apic_disable(void);
void xapic_enable(void);
diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index c0c7c1fe93f9..a0c1440017bb 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -23,6 +23,7 @@
extern bool host_cpu_is_intel;
extern bool host_cpu_is_amd;
+extern uint64_t guest_tsc_khz;
/* Forced emulation prefix, used to invoke the emulator unconditionally. */
#define KVM_FEP "ud2; .byte 'k', 'v', 'm';"
@@ -816,6 +817,23 @@ static inline void cpu_relax(void)
asm volatile("rep; nop" ::: "memory");
}
+static inline void udelay(unsigned long usec)
+{
+ uint64_t start, now, cycles;
+
+ GUEST_ASSERT(guest_tsc_khz);
+ cycles = guest_tsc_khz / 1000 * usec;
+
+ /*
+ * Deliberately don't PAUSE, a.k.a. cpu_relax(), so that the delay is
+ * as accurate as possible, e.g. doesn't trigger PAUSE-Loop VM-Exits.
+ */
+ start = rdtsc();
+ do {
+ now = rdtsc();
+ } while (now - start < cycles);
+}
+
#define ud2() \
__asm__ __volatile__( \
"ud2\n" \
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index 594b061aef52..153739f2e201 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -25,6 +25,7 @@ vm_vaddr_t exception_handlers;
bool host_cpu_is_amd;
bool host_cpu_is_intel;
bool is_forced_emulation_enabled;
+uint64_t guest_tsc_khz;
static void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent)
{
@@ -616,6 +617,11 @@ void assert_on_unhandled_exception(struct kvm_vcpu *vcpu)
void kvm_arch_vm_post_create(struct kvm_vm *vm)
{
+ int r;
+
+ TEST_ASSERT(kvm_has_cap(KVM_CAP_GET_TSC_KHZ),
+ "Require KVM_GET_TSC_KHZ to provide udelay() to guest.");
+
vm_create_irqchip(vm);
vm_init_descriptor_tables(vm);
@@ -628,6 +634,11 @@ void kvm_arch_vm_post_create(struct kvm_vm *vm)
vm_sev_ioctl(vm, KVM_SEV_INIT2, &init);
}
+
+ r = __vm_ioctl(vm, KVM_GET_TSC_KHZ, NULL);
+ TEST_ASSERT(r > 0, "KVM_GET_TSC_KHZ did not provide a valid TSC frequency.");
+ guest_tsc_khz = r;
+ sync_global_to_guest(vm, guest_tsc_khz);
}
void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code)
diff --git a/tools/testing/selftests/kvm/x86_64/apic_bus_clock_test.c b/tools/testing/selftests/kvm/x86_64/apic_bus_clock_test.c
new file mode 100644
index 000000000000..f8916bb34405
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/apic_bus_clock_test.c
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024 Intel Corporation
+ *
+ * Verify KVM correctly emulates the APIC bus frequency when the VMM configures
+ * the frequency via KVM_CAP_X86_APIC_BUS_CYCLES_NS. Start the APIC timer by
+ * programming TMICT (timer initial count) to the largest value possible (so
+ * that the timer will not expire during the test). Then, after an arbitrary
+ * amount of time has elapsed, verify TMCCT (timer current count) is within 1%
+ * of the expected value based on the time elapsed, the APIC bus frequency, and
+ * the programmed TDCR (timer divide configuration register).
+ */
+
+#include "apic.h"
+#include "test_util.h"
+
+/*
+ * Possible TDCR values with matching divide count. Used to modify APIC
+ * timer frequency.
+ */
+static const struct {
+ const uint32_t tdcr;
+ const uint32_t divide_count;
+} tdcrs[] = {
+ {0x0, 2},
+ {0x1, 4},
+ {0x2, 8},
+ {0x3, 16},
+ {0x8, 32},
+ {0x9, 64},
+ {0xa, 128},
+ {0xb, 1},
+};
+
+static bool is_x2apic;
+
+static void apic_enable(void)
+{
+ if (is_x2apic)
+ x2apic_enable();
+ else
+ xapic_enable();
+}
+
+static uint32_t apic_read_reg(unsigned int reg)
+{
+ return is_x2apic ? x2apic_read_reg(reg) : xapic_read_reg(reg);
+}
+
+static void apic_write_reg(unsigned int reg, uint32_t val)
+{
+ if (is_x2apic)
+ x2apic_write_reg(reg, val);
+ else
+ xapic_write_reg(reg, val);
+}
+
+static void apic_guest_code(uint64_t apic_hz, uint64_t delay_ms)
+{
+ uint64_t tsc_hz = guest_tsc_khz * 1000;
+ const uint32_t tmict = ~0u;
+ uint64_t tsc0, tsc1, freq;
+ uint32_t tmcct;
+ int i;
+
+ apic_enable();
+
+ /*
+ * Setup one-shot timer. The vector does not matter because the
+ * interrupt should not fire.
+ */
+ apic_write_reg(APIC_LVTT, APIC_LVT_TIMER_ONESHOT | APIC_LVT_MASKED);
+
+ for (i = 0; i < ARRAY_SIZE(tdcrs); i++) {
+ apic_write_reg(APIC_TDCR, tdcrs[i].tdcr);
+ apic_write_reg(APIC_TMICT, tmict);
+
+ tsc0 = rdtsc();
+ udelay(delay_ms * 1000);
+ tmcct = apic_read_reg(APIC_TMCCT);
+ tsc1 = rdtsc();
+
+ /*
+ * Stop the timer _after_ reading the current, final count, as
+ * writing the initial counter also modifies the current count.
+ */
+ apic_write_reg(APIC_TMICT, 0);
+
+ freq = (tmict - tmcct) * tdcrs[i].divide_count * tsc_hz / (tsc1 - tsc0);
+ /* Check if measured frequency is within 5% of configured frequency. */
+ __GUEST_ASSERT(freq < apic_hz * 105 / 100 && freq > apic_hz * 95 / 100,
+ "Frequency = %lu (wanted %lu - %lu), bus = %lu, div = %u, tsc = %lu",
+ freq, apic_hz * 95 / 100, apic_hz * 105 / 100,
+ apic_hz, tdcrs[i].divide_count, tsc_hz);
+ }
+
+ GUEST_DONE();
+}
+
+static void test_apic_bus_clock(struct kvm_vcpu *vcpu)
+{
+ bool done = false;
+ struct ucall uc;
+
+ while (!done) {
+ vcpu_run(vcpu);
+
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_DONE:
+ done = true;
+ break;
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ break;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ break;
+ }
+ }
+}
+
+static void run_apic_bus_clock_test(uint64_t apic_hz, uint64_t delay_ms,
+ bool x2apic)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ int ret;
+
+ is_x2apic = x2apic;
+
+ vm = vm_create(1);
+
+ sync_global_to_guest(vm, is_x2apic);
+
+ vm_enable_cap(vm, KVM_CAP_X86_APIC_BUS_CYCLES_NS,
+ NSEC_PER_SEC / apic_hz);
+
+ vcpu = vm_vcpu_add(vm, 0, apic_guest_code);
+ vcpu_args_set(vcpu, 2, apic_hz, delay_ms);
+
+ ret = __vm_enable_cap(vm, KVM_CAP_X86_APIC_BUS_CYCLES_NS,
+ NSEC_PER_SEC / apic_hz);
+ TEST_ASSERT(ret < 0 && errno == EINVAL,
+ "Setting of APIC bus frequency after vCPU is created should fail.");
+
+ if (!is_x2apic)
+ virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA);
+
+ test_apic_bus_clock(vcpu);
+ kvm_vm_free(vm);
+}
+
+static void help(char *name)
+{
+ puts("");
+ printf("usage: %s [-h] [-d delay] [-f APIC bus freq]\n", name);
+ puts("");
+ printf("-d: Delay (in msec) guest uses to measure APIC bus frequency.\n");
+ printf("-f: The APIC bus frequency (in MHz) to be configured for the guest.\n");
+ puts("");
+}
+
+int main(int argc, char *argv[])
+{
+ /*
+ * Arbitrarilty default to 25MHz for the APIC bus frequency, which is
+ * different enough from the default 1GHz to be interesting.
+ */
+ uint64_t apic_hz = 25 * 1000 * 1000;
+ uint64_t delay_ms = 100;
+ int opt;
+
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_X86_APIC_BUS_CYCLES_NS));
+
+ while ((opt = getopt(argc, argv, "d:f:h")) != -1) {
+ switch (opt) {
+ case 'f':
+ apic_hz = atoi_positive("APIC bus frequency", optarg) * 1000 * 1000;
+ break;
+ case 'd':
+ delay_ms = atoi_positive("Delay in milliseconds", optarg);
+ break;
+ case 'h':
+ default:
+ help(argv[0]);
+ exit(KSFT_SKIP);
+ }
+ }
+
+ run_apic_bus_clock_test(apic_hz, delay_ms, false);
+ run_apic_bus_clock_test(apic_hz, delay_ms, true);
+}