path: root/arch/x86/kvm
author    Paolo Bonzini <pbonzini@redhat.com>    2022-07-29 09:46:01 -0400
committer Paolo Bonzini <pbonzini@redhat.com>    2022-08-01 03:21:00 -0400
commit    63f4b210414b65aa3103c54369cacbd0b1bdf02f (patch)
tree      2dc7b490d3a89306669c70256a41764ca52ab3b3 /arch/x86/kvm
parent    2e2e91158febfeb73b5d4f249440218304f34101 (diff)
parent    7edc3a68038ab151a8791ddb6217755a5e4a5809 (diff)
Merge remote-tracking branch 'kvm/next' into kvm-next-5.20
KVM/s390, KVM/x86 and common infrastructure changes for 5.20

x86:

* Permit guests to ignore single-bit ECC errors
* Fix races in gfn->pfn cache refresh; do not pin pages tracked by the cache
* Intel IPI virtualization
* Allow getting/setting pending triple fault with KVM_GET/SET_VCPU_EVENTS
* PEBS virtualization
* Simplify PMU emulation by just using PERF_TYPE_RAW events
* More accurate event reinjection on SVM (avoid retrying instructions)
* Allow getting/setting the state of the speaker port data bit
* Refuse starting the kvm-intel module if VM-Entry/VM-Exit controls are inconsistent
* "Notify" VM exit (detect microarchitectural hangs) for Intel
* Cleanups for MCE MSR emulation

s390:

* add an interface to provide a hypervisor dump for secure guests
* improve selftests to use TAP interface
* enable interpretive execution of zPCI instructions (for PCI passthrough)
* First part of deferred teardown
* CPU Topology
* PV attestation
* Minor fixes

Generic:

* new selftests API using struct kvm_vcpu instead of a (vm, id) tuple

x86:

* Use try_cmpxchg64 instead of cmpxchg64
* Bugfixes
* Ignore benign host accesses to PMU MSRs when PMU is disabled
* Allow disabling KVM's "MONITOR/MWAIT are NOPs!" behavior
* x86/MMU: Allow NX huge pages to be disabled on a per-vm basis
* Port eager page splitting to shadow MMU as well
* Enable CMCI capability by default and handle injected UCNA errors
* Expose pid of vcpu threads in debugfs
* x2AVIC support for AMD
* cleanup PIO emulation
* Fixes for LLDT/LTR emulation
* Don't require refcounted "struct page" to create huge SPTEs

x86 cleanups:

* Use separate namespaces for guest PTEs and shadow PTEs bitmasks
* PIO emulation
* Reorganize rmap API, mostly around rmap destruction
* Do not workaround very old KVM bugs for L0 that runs with nesting enabled
* new selftests API for CPUID
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--  arch/x86/kvm/cpuid.c | 115
-rw-r--r--  arch/x86/kvm/cpuid.h | 21
-rw-r--r--  arch/x86/kvm/debugfs.c | 4
-rw-r--r--  arch/x86/kvm/emulate.c | 49
-rw-r--r--  arch/x86/kvm/hyperv.c | 8
-rw-r--r--  arch/x86/kvm/i8254.c | 10
-rw-r--r--  arch/x86/kvm/i8254.h | 1
-rw-r--r--  arch/x86/kvm/kvm_emulate.h | 28
-rw-r--r--  arch/x86/kvm/lapic.c | 181
-rw-r--r--  arch/x86/kvm/lapic.h | 20
-rw-r--r--  arch/x86/kvm/mmu.h | 10
-rw-r--r--  arch/x86/kvm/mmu/mmu.c | 965
-rw-r--r--  arch/x86/kvm/mmu/mmu_internal.h | 40
-rw-r--r--  arch/x86/kvm/mmu/paging.h | 14
-rw-r--r--  arch/x86/kvm/mmu/paging_tmpl.h | 126
-rw-r--r--  arch/x86/kvm/mmu/spte.c | 43
-rw-r--r--  arch/x86/kvm/mmu/spte.h | 38
-rw-r--r--  arch/x86/kvm/mmu/tdp_iter.c | 15
-rw-r--r--  arch/x86/kvm/mmu/tdp_iter.h | 1
-rw-r--r--  arch/x86/kvm/mmu/tdp_mmu.c | 87
-rw-r--r--  arch/x86/kvm/pmu.c | 212
-rw-r--r--  arch/x86/kvm/pmu.h | 45
-rw-r--r--  arch/x86/kvm/svm/avic.c | 170
-rw-r--r--  arch/x86/kvm/svm/nested.c | 72
-rw-r--r--  arch/x86/kvm/svm/pmu.c | 62
-rw-r--r--  arch/x86/kvm/svm/sev.c | 20
-rw-r--r--  arch/x86/kvm/svm/svm.c | 273
-rw-r--r--  arch/x86/kvm/svm/svm.h | 39
-rw-r--r--  arch/x86/kvm/trace.h | 51
-rw-r--r--  arch/x86/kvm/vmx/capabilities.h | 57
-rw-r--r--  arch/x86/kvm/vmx/evmcs.c | 2
-rw-r--r--  arch/x86/kvm/vmx/evmcs.h | 1
-rw-r--r--  arch/x86/kvm/vmx/nested.c | 197
-rw-r--r--  arch/x86/kvm/vmx/nested.h | 5
-rw-r--r--  arch/x86/kvm/vmx/pmu_intel.c | 198
-rw-r--r--  arch/x86/kvm/vmx/posted_intr.c | 30
-rw-r--r--  arch/x86/kvm/vmx/posted_intr.h | 2
-rw-r--r--  arch/x86/kvm/vmx/sgx.c | 10
-rw-r--r--  arch/x86/kvm/vmx/vmcs.h | 1
-rw-r--r--  arch/x86/kvm/vmx/vmx.c | 367
-rw-r--r--  arch/x86/kvm/vmx/vmx.h | 95
-rw-r--r--  arch/x86/kvm/x86.c | 704
-rw-r--r--  arch/x86/kvm/x86.h | 35
-rw-r--r--  arch/x86/kvm/xen.c | 10
44 files changed, 2994 insertions, 1440 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index de6d44e07e34..75dcf7a72605 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -67,9 +67,17 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted)
#define F feature_bit
#define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0)
+/*
+ * Magic value used by KVM when querying userspace-provided CPUID entries and
+ * doesn't care about the CPUID index because the index of the function in
+ * question is not significant. Note, this magic value must have at least one
+ * bit set in bits[63:32] and must be consumed as a u64 by cpuid_entry2_find()
+ * to avoid false positives when processing guest CPUID input.
+ */
+#define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull
static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
- struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index)
+ struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index)
{
struct kvm_cpuid_entry2 *e;
int i;
@@ -77,9 +85,31 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
for (i = 0; i < nent; i++) {
e = &entries[i];
- if (e->function == function &&
- (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index))
+ if (e->function != function)
+ continue;
+
+ /*
+ * If the index isn't significant, use the first entry with a
+ * matching function. It's userspace's responsibility to not
+ * provide "duplicate" entries in all cases.
+ */
+ if (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index)
+ return e;
+
+
+ /*
+ * Similarly, use the first matching entry if KVM is doing a
+ * lookup (as opposed to emulating CPUID) for a function that's
+ * architecturally defined as not having a significant index.
+ */
+ if (index == KVM_CPUID_INDEX_NOT_SIGNIFICANT) {
+ /*
+ * Direct lookups from KVM should not diverge from what
+ * KVM defines internally (the architectural behavior).
+ */
+ WARN_ON_ONCE(cpuid_function_is_indexed(function));
+ return e;
+ }
}
return NULL;
@@ -96,7 +126,8 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu,
* The existing code assumes virtual address is 48-bit or 57-bit in the
* canonical address checks; exit if it is ever changed.
*/
- best = cpuid_entry2_find(entries, nent, 0x80000008, 0);
+ best = cpuid_entry2_find(entries, nent, 0x80000008,
+ KVM_CPUID_INDEX_NOT_SIGNIFICANT);
if (best) {
int vaddr_bits = (best->eax & 0xff00) >> 8;
@@ -151,7 +182,7 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
vcpu->arch.kvm_cpuid_base = 0;
for_each_possible_hypervisor_cpuid_base(function) {
- entry = kvm_find_cpuid_entry(vcpu, function, 0);
+ entry = kvm_find_cpuid_entry(vcpu, function);
if (entry) {
u32 signature[3];
@@ -177,7 +208,8 @@ static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *v
if (!base)
return NULL;
- return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, 0);
+ return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES,
+ KVM_CPUID_INDEX_NOT_SIGNIFICANT);
}
static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu)
@@ -200,7 +232,7 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
/*
* Calculate guest's supported XCR0 taking into account guest CPUID data and
- * supported_xcr0 (comprised of host configuration and KVM_SUPPORTED_XCR0).
+ * KVM's supported XCR0 (comprised of host's XCR0 and KVM_SUPPORTED_XCR0).
*/
static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent)
{
@@ -210,7 +242,7 @@ static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent)
if (!best)
return 0;
- return (best->eax | ((u64)best->edx << 32)) & supported_xcr0;
+ return (best->eax | ((u64)best->edx << 32)) & kvm_caps.supported_xcr0;
}
static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries,
@@ -219,7 +251,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e
struct kvm_cpuid_entry2 *best;
u64 guest_supported_xcr0 = cpuid_get_supported_xcr0(entries, nent);
- best = cpuid_entry2_find(entries, nent, 1, 0);
+ best = cpuid_entry2_find(entries, nent, 1, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
if (best) {
/* Update OSXSAVE bit */
if (boot_cpu_has(X86_FEATURE_XSAVE))
@@ -250,7 +282,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e
best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
- best = cpuid_entry2_find(entries, nent, 0x1, 0);
+ best = cpuid_entry2_find(entries, nent, 0x1, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
if (best)
cpuid_entry_change(best, X86_FEATURE_MWAIT,
vcpu->arch.ia32_misc_enable_msr &
@@ -285,7 +317,7 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
struct kvm_cpuid_entry2 *best;
u64 guest_supported_xcr0;
- best = kvm_find_cpuid_entry(vcpu, 1, 0);
+ best = kvm_find_cpuid_entry(vcpu, 1);
if (best && apic) {
if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER))
apic->lapic_timer.timer_mode_mask = 3 << 17;
@@ -325,10 +357,10 @@ int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
- best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
+ best = kvm_find_cpuid_entry(vcpu, 0x80000000);
if (!best || best->eax < 0x80000008)
goto not_found;
- best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
+ best = kvm_find_cpuid_entry(vcpu, 0x80000008);
if (best)
return best->eax & 0xff;
not_found:
@@ -868,7 +900,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
case 9:
break;
case 0xa: { /* Architectural Performance Monitoring */
- struct x86_pmu_capability cap;
union cpuid10_eax eax;
union cpuid10_edx edx;
@@ -877,30 +908,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
break;
}
- perf_get_x86_pmu_capability(&cap);
+ eax.split.version_id = kvm_pmu_cap.version;
+ eax.split.num_counters = kvm_pmu_cap.num_counters_gp;
+ eax.split.bit_width = kvm_pmu_cap.bit_width_gp;
+ eax.split.mask_length = kvm_pmu_cap.events_mask_len;
+ edx.split.num_counters_fixed = kvm_pmu_cap.num_counters_fixed;
+ edx.split.bit_width_fixed = kvm_pmu_cap.bit_width_fixed;
- /*
- * The guest architecture pmu is only supported if the architecture
- * pmu exists on the host and the module parameters allow it.
- */
- if (!cap.version || !enable_pmu)
- memset(&cap, 0, sizeof(cap));
-
- eax.split.version_id = min(cap.version, 2);
- eax.split.num_counters = cap.num_counters_gp;
- eax.split.bit_width = cap.bit_width_gp;
- eax.split.mask_length = cap.events_mask_len;
-
- edx.split.num_counters_fixed =
- min(cap.num_counters_fixed, KVM_PMC_MAX_FIXED);
- edx.split.bit_width_fixed = cap.bit_width_fixed;
- if (cap.version)
+ if (kvm_pmu_cap.version)
edx.split.anythread_deprecated = 1;
edx.split.reserved1 = 0;
edx.split.reserved2 = 0;
entry->eax = eax.full;
- entry->ebx = cap.events_mask;
+ entry->ebx = kvm_pmu_cap.events_mask;
entry->ecx = 0;
entry->edx = edx.full;
break;
@@ -923,8 +944,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
}
break;
case 0xd: {
- u64 permitted_xcr0 = supported_xcr0 & xstate_get_guest_group_perm();
- u64 permitted_xss = supported_xss;
+ u64 permitted_xcr0 = kvm_caps.supported_xcr0 & xstate_get_guest_group_perm();
+ u64 permitted_xss = kvm_caps.supported_xss;
entry->eax &= permitted_xcr0;
entry->ebx = xstate_required_size(permitted_xcr0, false);
@@ -1313,12 +1334,20 @@ out_free:
return r;
}
-struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
- u32 function, u32 index)
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu,
+ u32 function, u32 index)
{
return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent,
function, index);
}
+EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry_index);
+
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+ u32 function)
+{
+ return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent,
+ function, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
+}
EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
/*
@@ -1355,7 +1384,7 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index)
struct kvm_cpuid_entry2 *basic, *class;
u32 function = *fn_ptr;
- basic = kvm_find_cpuid_entry(vcpu, 0, 0);
+ basic = kvm_find_cpuid_entry(vcpu, 0);
if (!basic)
return NULL;
@@ -1364,11 +1393,11 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index)
return NULL;
if (function >= 0x40000000 && function <= 0x4fffffff)
- class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00, 0);
+ class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00);
else if (function >= 0xc0000000)
- class = kvm_find_cpuid_entry(vcpu, 0xc0000000, 0);
+ class = kvm_find_cpuid_entry(vcpu, 0xc0000000);
else
- class = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
+ class = kvm_find_cpuid_entry(vcpu, function & 0x80000000);
if (class && function <= class->eax)
return NULL;
@@ -1386,7 +1415,7 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index)
* the effective CPUID entry is the max basic leaf. Note, the index of
* the original requested leaf is observed!
*/
- return kvm_find_cpuid_entry(vcpu, basic->eax, index);
+ return kvm_find_cpuid_entry_index(vcpu, basic->eax, index);
}
bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
@@ -1396,7 +1425,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
struct kvm_cpuid_entry2 *entry;
bool exact, used_max_basic = false;
- entry = kvm_find_cpuid_entry(vcpu, function, index);
+ entry = kvm_find_cpuid_entry_index(vcpu, function, index);
exact = !!entry;
if (!entry && !exact_only) {
@@ -1425,7 +1454,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
* exists. EDX can be copied from any existing index.
*/
if (function == 0xb || function == 0x1f) {
- entry = kvm_find_cpuid_entry(vcpu, function, 1);
+ entry = kvm_find_cpuid_entry_index(vcpu, function, 1);
if (entry) {
*ecx = index & 0xff;
*edx = entry->edx;
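The lookup split above is easier to see with a small standalone C model (not kernel code; the struct, flag macro and sentinel below are simplified stand-ins): because the index parameter is widened to u64 and the "don't care" sentinel keeps bits set in [63:32], no guest-supplied 32-bit index can ever alias it.

#include <stdint.h>
#include <stdio.h>

#define INDEX_NOT_SIGNIFICANT (~0ULL)	/* models KVM_CPUID_INDEX_NOT_SIGNIFICANT */
#define FLAG_SIGNIFICANT_INDEX 1u	/* models KVM_CPUID_FLAG_SIGNIFCANT_INDEX */

struct entry { uint32_t function, index, flags; };

static const struct entry *find(const struct entry *e, int nent,
				uint32_t function, uint64_t index)
{
	for (int i = 0; i < nent; i++) {
		if (e[i].function != function)
			continue;
		/* exact index match, or the entry doesn't care about the index */
		if (!(e[i].flags & FLAG_SIGNIFICANT_INDEX) || e[i].index == index)
			return &e[i];
		/* internal lookups take the first entry for the function */
		if (index == INDEX_NOT_SIGNIFICANT)
			return &e[i];
	}
	return NULL;
}

int main(void)
{
	const struct entry table[] = {
		{ 0x7, 0, FLAG_SIGNIFICANT_INDEX },
		{ 0x7, 1, FLAG_SIGNIFICANT_INDEX },
	};

	/* a guest index of 0xffffffff does not alias the u64 sentinel */
	printf("guest 0xffffffff: %s\n",
	       find(table, 2, 0x7, 0xffffffffu) ? "matched" : "no match");
	printf("internal lookup hits index %u\n",
	       find(table, 2, 0x7, INDEX_NOT_SIGNIFICANT)->index);
	return 0;
}

This mirrors the split between kvm_find_cpuid_entry_index(), which emulates guest CPUID with a real 32-bit index, and kvm_find_cpuid_entry(), which passes the sentinel for functions whose index is architecturally insignificant.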
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 8a770b481d9d..b1658c0de847 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -13,8 +13,10 @@ void kvm_set_cpu_caps(void);
void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
void kvm_update_pv_runtime(struct kvm_vcpu *vcpu);
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu,
+ u32 function, u32 index);
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
- u32 function, u32 index);
+ u32 function);
int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries,
unsigned int type);
@@ -76,7 +78,7 @@ static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu,
const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature);
struct kvm_cpuid_entry2 *entry;
- entry = kvm_find_cpuid_entry(vcpu, cpuid.function, cpuid.index);
+ entry = kvm_find_cpuid_entry_index(vcpu, cpuid.function, cpuid.index);
if (!entry)
return NULL;
@@ -109,7 +111,7 @@ static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
- best = kvm_find_cpuid_entry(vcpu, 0, 0);
+ best = kvm_find_cpuid_entry(vcpu, 0);
return best &&
(is_guest_vendor_amd(best->ebx, best->ecx, best->edx) ||
is_guest_vendor_hygon(best->ebx, best->ecx, best->edx));
@@ -119,7 +121,7 @@ static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
- best = kvm_find_cpuid_entry(vcpu, 0, 0);
+ best = kvm_find_cpuid_entry(vcpu, 0);
return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx);
}
@@ -127,7 +129,7 @@ static inline int guest_cpuid_family(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
- best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+ best = kvm_find_cpuid_entry(vcpu, 0x1);
if (!best)
return -1;
@@ -138,18 +140,23 @@ static inline int guest_cpuid_model(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
- best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+ best = kvm_find_cpuid_entry(vcpu, 0x1);
if (!best)
return -1;
return x86_model(best->eax);
}
+static inline bool cpuid_model_is_consistent(struct kvm_vcpu *vcpu)
+{
+ return boot_cpu_data.x86_model == guest_cpuid_model(vcpu);
+}
+
static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
- best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+ best = kvm_find_cpuid_entry(vcpu, 0x1);
if (!best)
return -1;
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index 9240b3b7f8dd..cfed36aba2f7 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -48,7 +48,7 @@ DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_fops, vcpu_get_tsc_scaling_ratio, NULL,
static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val)
{
- *val = kvm_tsc_scaling_ratio_frac_bits;
+ *val = kvm_caps.tsc_scaling_ratio_frac_bits;
return 0;
}
@@ -66,7 +66,7 @@ void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_
debugfs_dentry, vcpu,
&vcpu_timer_advance_ns_fops);
- if (kvm_has_tsc_control) {
+ if (kvm_caps.has_tsc_control) {
debugfs_create_file("tsc-scaling-ratio", 0444,
debugfs_dentry, vcpu,
&vcpu_tsc_scaling_fops);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f8382abe22ff..047c583596bb 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -244,6 +244,9 @@ enum x86_transfer_type {
static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
{
+ if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt))
+ nr &= NR_EMULATOR_GPRS - 1;
+
if (!(ctxt->regs_valid & (1 << nr))) {
ctxt->regs_valid |= 1 << nr;
ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr);
@@ -253,6 +256,12 @@ static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr)
{
+ if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt))
+ nr &= NR_EMULATOR_GPRS - 1;
+
+ BUILD_BUG_ON(sizeof(ctxt->regs_dirty) * BITS_PER_BYTE < NR_EMULATOR_GPRS);
+ BUILD_BUG_ON(sizeof(ctxt->regs_valid) * BITS_PER_BYTE < NR_EMULATOR_GPRS);
+
ctxt->regs_valid |= 1 << nr;
ctxt->regs_dirty |= 1 << nr;
return &ctxt->_regs[nr];
@@ -266,9 +275,10 @@ static ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr)
static void writeback_registers(struct x86_emulate_ctxt *ctxt)
{
+ unsigned long dirty = ctxt->regs_dirty;
unsigned reg;
- for_each_set_bit(reg, (ulong *)&ctxt->regs_dirty, 16)
+ for_each_set_bit(reg, &dirty, NR_EMULATOR_GPRS)
ctxt->ops->write_gpr(ctxt, reg, ctxt->_regs[reg]);
}
@@ -615,7 +625,9 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
u32 error, bool valid)
{
- WARN_ON(vec > 0x1f);
+ if (KVM_EMULATOR_BUG_ON(vec > 0x1f, ctxt))
+ return X86EMUL_UNHANDLEABLE;
+
ctxt->exception.vector = vec;
ctxt->exception.error_code = error;
ctxt->exception.error_code_valid = valid;
@@ -1362,7 +1374,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
if (mc->pos < mc->end)
goto read_cached;
- WARN_ON((mc->end + size) >= sizeof(mc->data));
+ if (KVM_EMULATOR_BUG_ON((mc->end + size) >= sizeof(mc->data), ctxt))
+ return X86EMUL_UNHANDLEABLE;
rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, size,
&ctxt->exception);
@@ -1687,16 +1700,6 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
case VCPU_SREG_TR:
if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
goto exception;
- if (!seg_desc.p) {
- err_vec = NP_VECTOR;
- goto exception;
- }
- old_desc = seg_desc;
- seg_desc.type |= 2; /* busy */
- ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc,
- sizeof(seg_desc), &ctxt->exception);
- if (ret != X86EMUL_CONTINUE)
- return ret;
break;
case VCPU_SREG_LDTR:
if (seg_desc.s || seg_desc.type != 2)
@@ -1734,8 +1737,17 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
if (ret != X86EMUL_CONTINUE)
return ret;
if (emul_is_noncanonical_address(get_desc_base(&seg_desc) |
- ((u64)base3 << 32), ctxt))
- return emulate_gp(ctxt, 0);
+ ((u64)base3 << 32), ctxt))
+ return emulate_gp(ctxt, err_code);
+ }
+
+ if (seg == VCPU_SREG_TR) {
+ old_desc = seg_desc;
+ seg_desc.type |= 2; /* busy */
+ ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc,
+ sizeof(seg_desc), &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
}
load:
ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg);
@@ -2432,7 +2444,7 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7ff4) | X86_EFLAGS_FIXED;
ctxt->_eip = GET_SMSTATE(u32, smstate, 0x7ff0);
- for (i = 0; i < 8; i++)
+ for (i = 0; i < NR_EMULATOR_GPRS; i++)
*reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4);
val = GET_SMSTATE(u32, smstate, 0x7fcc);
@@ -2489,7 +2501,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
u16 selector;
int i, r;
- for (i = 0; i < 16; i++)
+ for (i = 0; i < NR_EMULATOR_GPRS; i++)
*reg_write(ctxt, i) = GET_SMSTATE(u64, smstate, 0x7ff8 - i * 8);
ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78);
@@ -5719,7 +5731,8 @@ writeback:
done:
if (rc == X86EMUL_PROPAGATE_FAULT) {
- WARN_ON(ctxt->exception.vector > 0x1f);
+ if (KVM_EMULATOR_BUG_ON(ctxt->exception.vector > 0x1f, ctxt))
+ return EMULATION_FAILED;
ctxt->have_exception = true;
}
if (rc == X86EMUL_INTERCEPTED)
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index e2e95a6fccfd..ed804447589c 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1992,7 +1992,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu)
struct kvm_cpuid_entry2 *entry;
struct kvm_vcpu_hv *hv_vcpu;
- entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0);
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE);
if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX) {
vcpu->arch.hyperv_enabled = true;
} else {
@@ -2005,7 +2005,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu)
hv_vcpu = to_hv_vcpu(vcpu);
- entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES, 0);
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES);
if (entry) {
hv_vcpu->cpuid_cache.features_eax = entry->eax;
hv_vcpu->cpuid_cache.features_ebx = entry->ebx;
@@ -2016,7 +2016,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu)
hv_vcpu->cpuid_cache.features_edx = 0;
}
- entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO, 0);
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO);
if (entry) {
hv_vcpu->cpuid_cache.enlightenments_eax = entry->eax;
hv_vcpu->cpuid_cache.enlightenments_ebx = entry->ebx;
@@ -2025,7 +2025,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu)
hv_vcpu->cpuid_cache.enlightenments_ebx = 0;
}
- entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, 0);
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES);
if (entry)
hv_vcpu->cpuid_cache.syndbg_cap_eax = entry->eax;
else
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 1c83076091af..e0a7a0e7a73c 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -591,7 +591,10 @@ static int speaker_ioport_write(struct kvm_vcpu *vcpu,
return -EOPNOTSUPP;
mutex_lock(&pit_state->lock);
- pit_state->speaker_data_on = (val >> 1) & 1;
+ if (val & (1 << 1))
+ pit_state->flags |= KVM_PIT_FLAGS_SPEAKER_DATA_ON;
+ else
+ pit_state->flags &= ~KVM_PIT_FLAGS_SPEAKER_DATA_ON;
pit_set_gate(pit, 2, val & 1);
mutex_unlock(&pit_state->lock);
return 0;
@@ -612,8 +615,9 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu,
refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1;
mutex_lock(&pit_state->lock);
- ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(pit, 2) |
- (pit_get_out(pit, 2) << 5) | (refresh_clock << 4));
+ ret = (!!(pit_state->flags & KVM_PIT_FLAGS_SPEAKER_DATA_ON) << 1) |
+ pit_get_gate(pit, 2) | (pit_get_out(pit, 2) << 5) |
+ (refresh_clock << 4);
if (len > sizeof(ret))
len = sizeof(ret);
memcpy(data, (char *)&ret, len);
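As a rough standalone illustration of the speaker port read above (the flag bit chosen here is a stand-in, not the real KVM_PIT_FLAGS_SPEAKER_DATA_ON value), the returned byte packs the channel-2 gate in bit 0, the speaker data bit in bit 1, the refresh clock in bit 4 and the channel-2 output in bit 5:

#include <stdint.h>
#include <stdio.h>

#define SPEAKER_DATA_ON (1u << 0)	/* stand-in for KVM_PIT_FLAGS_SPEAKER_DATA_ON */

static uint8_t speaker_port_read(uint32_t flags, int gate, int out, int refresh)
{
	return (uint8_t)((!!(flags & SPEAKER_DATA_ON) << 1) | (gate & 1) |
			 ((out & 1) << 5) | ((refresh & 1) << 4));
}

int main(void)
{
	/* data bit set, gate high, output high, refresh low -> 0x23 */
	printf("0x%02x\n", speaker_port_read(SPEAKER_DATA_ON, 1, 1, 0));
	return 0;
}

Moving the data bit from the removed speaker_data_on field into pit_state->flags is what lets the bit be saved and restored alongside the rest of the PIT state flags, per the "speaker port data bit" item in the merge description.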
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 394d9527da7e..a768212ba821 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -29,7 +29,6 @@ struct kvm_kpit_state {
bool is_periodic;
s64 period; /* unit: ns */
struct hrtimer timer;
- u32 speaker_data_on;
struct mutex lock;
atomic_t reinject;
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 8dff25d267b7..89246446d6aa 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -89,6 +89,7 @@ struct x86_instruction_info {
#define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */
struct x86_emulate_ops {
+ void (*vm_bugged)(struct x86_emulate_ctxt *ctxt);
/*
* read_gpr: read a general purpose register (rax - r15)
*
@@ -301,6 +302,18 @@ struct fastop;
typedef void (*fastop_t)(struct fastop *);
+/*
+ * The emulator's _regs array tracks only the GPRs, i.e. excludes RIP. RIP is
+ * tracked/accessed via _eip, and except for RIP relative addressing, which
+ * also uses _eip, RIP cannot be a register operand nor can it be an operand in
+ * a ModRM or SIB byte.
+ */
+#ifdef CONFIG_X86_64
+#define NR_EMULATOR_GPRS 16
+#else
+#define NR_EMULATOR_GPRS 8
+#endif
+
struct x86_emulate_ctxt {
void *vcpu;
const struct x86_emulate_ops *ops;
@@ -345,9 +358,9 @@ struct x86_emulate_ctxt {
u8 lock_prefix;
u8 rep_prefix;
/* bitmaps of registers in _regs[] that can be read */
- u32 regs_valid;
+ u16 regs_valid;
/* bitmaps of registers in _regs[] that have been written */
- u32 regs_dirty;
+ u16 regs_dirty;
/* modrm */
u8 modrm;
u8 modrm_mod;
@@ -363,7 +376,7 @@ struct x86_emulate_ctxt {
struct operand src2;
struct operand dst;
struct operand memop;
- unsigned long _regs[NR_VCPU_REGS];
+ unsigned long _regs[NR_EMULATOR_GPRS];
struct operand *memopp;
struct fetch_cache fetch;
struct read_cache io_read;
@@ -371,6 +384,15 @@ struct x86_emulate_ctxt {
bool is_branch;
};
+#define KVM_EMULATOR_BUG_ON(cond, ctxt) \
+({ \
+ int __ret = (cond); \
+ \
+ if (WARN_ON_ONCE(__ret)) \
+ ctxt->ops->vm_bugged(ctxt); \
+ unlikely(__ret); \
+})
+
/* Repeat String Operation Prefix */
#define REPE_PREFIX 0xf3
#define REPNE_PREFIX 0xf2
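A minimal user-space sketch of the bookkeeping this header now sizes with NR_EMULATOR_GPRS (the names and the backing array below are invented for illustration): regs_valid and regs_dirty fit in a u16 because at most 16 GPRs are tracked, reads fill the cache lazily, out-of-range register numbers are clamped the way callers of KVM_EMULATOR_BUG_ON do after flagging the bug, and writeback touches only dirty registers.

#include <stdint.h>
#include <stdio.h>

#define NR_GPRS 16			/* 64-bit build; 8 without CONFIG_X86_64 */

struct ctxt {
	uint16_t regs_valid;		/* bit n: _regs[n] holds a valid value */
	uint16_t regs_dirty;		/* bit n: _regs[n] must be written back */
	unsigned long _regs[NR_GPRS];
};

static unsigned long backing[NR_GPRS];	/* stands in for ops->read_gpr/write_gpr */

static unsigned long reg_read(struct ctxt *c, unsigned int nr)
{
	if (nr >= NR_GPRS)		/* the real macro also warns and bugs the VM */
		nr &= NR_GPRS - 1;
	if (!(c->regs_valid & (1u << nr))) {
		c->regs_valid |= 1u << nr;
		c->_regs[nr] = backing[nr];
	}
	return c->_regs[nr];
}

static void reg_write(struct ctxt *c, unsigned int nr, unsigned long val)
{
	if (nr >= NR_GPRS)
		nr &= NR_GPRS - 1;
	c->regs_valid |= 1u << nr;
	c->regs_dirty |= 1u << nr;
	c->_regs[nr] = val;
}

static void writeback_registers(struct ctxt *c)
{
	for (unsigned int nr = 0; nr < NR_GPRS; nr++)
		if (c->regs_dirty & (1u << nr))
			backing[nr] = c->_regs[nr];
}

int main(void)
{
	struct ctxt c = { 0 };

	backing[3] = 42;
	reg_write(&c, 3, reg_read(&c, 3) + 1);
	writeback_registers(&c);
	printf("%lu\n", backing[3]);	/* 43 */
	return 0;
}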
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0e68b4c937fc..e2ce3556915e 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -27,6 +27,7 @@
#include <linux/math64.h>
#include <linux/slab.h>
#include <asm/processor.h>
+#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/page.h>
#include <asm/current.h>
@@ -54,7 +55,7 @@
#define PRIo64 "o"
/* 14 is the version for Xeon and Pentium 8.4.8*/
-#define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16))
+#define APIC_VERSION 0x14UL
#define LAPIC_MMIO_LENGTH (1 << 12)
/* followed define is not in apicdef.h */
#define MAX_APIC_VECTOR 256
@@ -67,6 +68,8 @@ static bool lapic_timer_advance_dynamic __read_mostly;
#define LAPIC_TIMER_ADVANCE_NS_MAX 5000
/* step-by-step approximation to mitigate fluctuation */
#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
+static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data);
+static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data);
static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val)
{
@@ -398,14 +401,26 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val)
return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
}
+static inline bool kvm_lapic_lvt_supported(struct kvm_lapic *apic, int lvt_index)
+{
+ return apic->nr_lvt_entries > lvt_index;
+}
+
+static inline int kvm_apic_calc_nr_lvt_entries(struct kvm_vcpu *vcpu)
+{
+ return KVM_APIC_MAX_NR_LVT_ENTRIES - !(vcpu->arch.mcg_cap & MCG_CMCI_P);
+}
+
void kvm_apic_set_version(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- u32 v = APIC_VERSION;
+ u32 v = 0;
if (!lapic_in_kernel(vcpu))
return;
+ v = APIC_VERSION | ((apic->nr_lvt_entries - 1) << 16);
+
/*
* KVM emulates 82093AA datasheet (with in-kernel IOAPIC implementation)
* which doesn't have EOI register; Some buggy OSes (e.g. Windows with
@@ -419,12 +434,33 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
kvm_lapic_set_reg(apic, APIC_LVR, v);
}
-static const unsigned int apic_lvt_mask[KVM_APIC_LVT_NUM] = {
- LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */
- LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
- LVT_MASK | APIC_MODE_MASK, /* LVTPC */
- LINT_MASK, LINT_MASK, /* LVT0-1 */
- LVT_MASK /* LVTERR */
+void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu)
+{
+ int nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu);
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ int i;
+
+ if (!lapic_in_kernel(vcpu) || nr_lvt_entries == apic->nr_lvt_entries)
+ return;
+
+ /* Initialize/mask any "new" LVT entries. */
+ for (i = apic->nr_lvt_entries; i < nr_lvt_entries; i++)
+ kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED);
+
+ apic->nr_lvt_entries = nr_lvt_entries;
+
+ /* The number of LVT entries is reflected in the version register. */
+ kvm_apic_set_version(vcpu);
+}
+
+static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = {
+ [LVT_TIMER] = LVT_MASK, /* timer mode mask added at runtime */
+ [LVT_THERMAL_MONITOR] = LVT_MASK | APIC_MODE_MASK,
+ [LVT_PERFORMANCE_COUNTER] = LVT_MASK | APIC_MODE_MASK,
+ [LVT_LINT0] = LINT_MASK,
+ [LVT_LINT1] = LINT_MASK,
+ [LVT_ERROR] = LVT_MASK,
+ [LVT_CMCI] = LVT_MASK | APIC_MODE_MASK
};
static int find_highest_vector(void *bitmap)
@@ -518,14 +554,11 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
{
- struct kvm_vcpu *vcpu;
-
- vcpu = apic->vcpu;
-
- if (unlikely(vcpu->arch.apicv_active)) {
+ if (unlikely(apic->apicv_active)) {
/* need to update RVI */
kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
- static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic));
+ static_call_cond(kvm_x86_hwapic_irr_update)(apic->vcpu,
+ apic_find_highest_irr(apic));
} else {
apic->irr_pending = false;
kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
@@ -542,20 +575,16 @@ EXPORT_SYMBOL_GPL(kvm_apic_clear_irr);
static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
{
- struct kvm_vcpu *vcpu;
-
if (__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
return;
- vcpu = apic->vcpu;
-
/*
* With APIC virtualization enabled, all caching is disabled
* because the processor can modify ISR under the hood. Instead
* just set SVI.
*/
- if (unlikely(vcpu->arch.apicv_active))
- static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, vec);
+ if (unlikely(apic->apicv_active))
+ static_call_cond(kvm_x86_hwapic_isr_update)(vec);
else {
++apic->isr_count;
BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
@@ -589,12 +618,9 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic)
static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
{
- struct kvm_vcpu *vcpu;
if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
return;
- vcpu = apic->vcpu;
-
/*
* We do get here for APIC virtualization enabled if the guest
* uses the Hyper-V APIC enlightenment. In this case we may need
@@ -602,8 +628,8 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
* on the other hand isr_count and highest_isr_cache are unused
* and must be left alone.
*/
- if (unlikely(vcpu->arch.apicv_active))
- static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
+ if (unlikely(apic->apicv_active))
+ static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic));
else {
--apic->isr_count;
BUG_ON(apic->isr_count < 0);
@@ -801,17 +827,17 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
if (kvm_apic_broadcast(apic, mda))
return true;
- if (apic_x2apic_mode(apic))
- return mda == kvm_x2apic_id(apic);
-
/*
- * Hotplug hack: Make LAPIC in xAPIC mode also accept interrupts as if
- * it were in x2APIC mode. Hotplugged VCPUs start in xAPIC mode and
- * this allows unique addressing of VCPUs with APIC ID over 0xff.
- * The 0xff condition is needed because writeable xAPIC ID.
+ * Hotplug hack: Accept interrupts for vCPUs in xAPIC mode as if they
+ * were in x2APIC mode if the target APIC ID can't be encoded as an
+ * xAPIC ID. This allows unique addressing of hotplugged vCPUs (which
+ * start in xAPIC mode) with an APIC ID that is unaddressable in xAPIC
+ * mode. Match the x2APIC ID if and only if the target APIC ID can't
+ * be encoded in xAPIC to avoid spurious matches against a vCPU that
+ * changed its (addressable) xAPIC ID (which is writable).
*/
- if (kvm_x2apic_id(apic) > 0xff && mda == kvm_x2apic_id(apic))
- return true;
+ if (apic_x2apic_mode(apic) || mda > 0xff)
+ return mda == kvm_x2apic_id(apic);
return mda == kvm_xapic_id(apic);
}
@@ -1325,7 +1351,7 @@ void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high)
if (apic_x2apic_mode(apic))
irq.dest_id = icr_high;
else
- irq.dest_id = GET_APIC_DEST_FIELD(icr_high);
+ irq.dest_id = GET_XAPIC_DEST_FIELD(icr_high);
trace_kvm_apic_ipi(icr_low, irq.dest_id);
@@ -1444,6 +1470,9 @@ static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
APIC_REG_MASK(APIC_TMCCT) |
APIC_REG_MASK(APIC_TDCR);
+ if (kvm_lapic_lvt_supported(apic, LVT_CMCI))
+ valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI);
+
/*
* ARBPRI and ICR2 are not valid in x2APIC mode. WARN if KVM reads ICR
* in x2APIC mode as it's an 8-byte register in x2APIC and needs to be
@@ -1583,7 +1612,7 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
int vec = reg & APIC_VECTOR_MASK;
void *bitmap = apic->regs + APIC_ISR;
- if (vcpu->arch.apicv_active)
+ if (apic->apicv_active)
bitmap = apic->regs + APIC_IRR;
if (apic_test_vector(vec, bitmap))
@@ -1602,7 +1631,7 @@ static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles)
* that __delay() uses delay_tsc whenever the hardware has TSC, thus
* always for VMX enabled hardware.
*/
- if (vcpu->arch.tsc_scaling_ratio == kvm_default_tsc_scaling_ratio) {
+ if (vcpu->arch.tsc_scaling_ratio == kvm_caps.default_tsc_scaling_ratio) {
__delay(min(guest_cycles,
nsec_to_cycles(vcpu, timer_advance_ns)));
} else {
@@ -1700,7 +1729,7 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn)
if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use)
ktimer->expired_tscdeadline = ktimer->tscdeadline;
- if (!from_timer_fn && vcpu->arch.apicv_active) {
+ if (!from_timer_fn && apic->apicv_active) {
WARN_ON(kvm_get_running_vcpu() != vcpu);
kvm_apic_inject_pending_timer_irqs(apic);
return;
@@ -2052,6 +2081,16 @@ static void kvm_lapic_xapic_id_updated(struct kvm_lapic *apic)
kvm_set_apicv_inhibit(apic->vcpu->kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
}
+static int get_lvt_index(u32 reg)
+{
+ if (reg == APIC_LVTCMCI)
+ return LVT_CMCI;
+ if (reg < APIC_LVTT || reg > APIC_LVTERR)
+ return -1;
+ return array_index_nospec(
+ (reg - APIC_LVTT) >> 4, KVM_APIC_MAX_NR_LVT_ENTRIES);
+}
+
static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
{
int ret = 0;
@@ -2098,13 +2137,10 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
apic_set_spiv(apic, val & mask);
if (!(val & APIC_SPIV_APIC_ENABLED)) {
int i;
- u32 lvt_val;
- for (i = 0; i < KVM_APIC_LVT_NUM; i++) {
- lvt_val = kvm_lapic_get_reg(apic,
- APIC_LVTT + 0x10 * i);
- kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i,
- lvt_val | APIC_LVT_MASKED);
+ for (i = 0; i < apic->nr_lvt_entries; i++) {
+ kvm_lapic_set_reg(apic, APIC_LVTx(i),
+ kvm_lapic_get_reg(apic, APIC_LVTx(i)) | APIC_LVT_MASKED);
}
apic_update_lvtt(apic);
atomic_set(&apic->lapic_timer.pending, 0);
@@ -2133,16 +2169,15 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
case APIC_LVTTHMR:
case APIC_LVTPC:
case APIC_LVT1:
- case APIC_LVTERR: {
- /* TODO: Check vector */
- size_t size;
- u32 index;
-
+ case APIC_LVTERR:
+ case APIC_LVTCMCI: {
+ u32 index = get_lvt_index(reg);
+ if (!kvm_lapic_lvt_supported(apic, index)) {
+ ret = 1;
+ break;
+ }
if (!kvm_apic_sw_enabled(apic))
val |= APIC_LVT_MASKED;
- size = ARRAY_SIZE(apic_lvt_mask);
- index = array_index_nospec(
- (reg - APIC_LVTT) >> 4, size);
val &= apic_lvt_mask[index];
kvm_lapic_set_reg(apic, reg, val);
break;
@@ -2246,10 +2281,26 @@ EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
/* emulate APIC access in a trap manner */
void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
{
- u32 val = kvm_lapic_get_reg(vcpu->arch.apic, offset);
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u64 val;
+
+ if (apic_x2apic_mode(apic))
+ kvm_lapic_msr_read(apic, offset, &val);
+ else
+ val = kvm_lapic_get_reg(apic, offset);
- /* TODO: optimize to just emulate side effect w/o one more write */
- kvm_lapic_reg_write(vcpu->arch.apic, offset, val);
+ /*
+ * ICR is a single 64-bit register when x2APIC is enabled. For legacy
+ * xAPIC, ICR writes need to go down the common (slightly slower) path
+ * to get the upper half from ICR2.
+ */
+ if (apic_x2apic_mode(apic) && offset == APIC_ICR) {
+ kvm_apic_send_ipi(apic, (u32)val, (u32)(val >> 32));
+ trace_kvm_apic_write(APIC_ICR, val);
+ } else {
+ /* TODO: optimize to just emulate side effect w/o one more write */
+ kvm_lapic_reg_write(apic, offset, (u32)val);
+ }
}
EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode);
@@ -2344,8 +2395,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE))
kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
- if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE))
+ if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) {
+ kvm_vcpu_update_apicv(vcpu);
static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu);
+ }
apic->base_address = apic->vcpu->arch.apic_base &
MSR_IA32_APICBASE_BASE;
@@ -2361,7 +2414,7 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (vcpu->arch.apicv_active) {
+ if (apic->apicv_active) {
/* irr_pending is always true when apicv is activated. */
apic->irr_pending = true;
apic->isr_count = 1;
@@ -2401,8 +2454,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
kvm_apic_set_version(apic->vcpu);
- for (i = 0; i < KVM_APIC_LVT_NUM; i++)
- kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
+ for (i = 0; i < apic->nr_lvt_entries; i++)
+ kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED);
apic_update_lvtt(apic);
if (kvm_vcpu_is_reset_bsp(vcpu) &&
kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
@@ -2436,10 +2489,10 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.pv_eoi.msr_val = 0;
apic_update_ppr(apic);
- if (vcpu->arch.apicv_active) {
+ if (apic->apicv_active) {
static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1);
- static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, -1);
+ static_call_cond(kvm_x86_hwapic_isr_update)(-1);
}
vcpu->arch.apic_arb_prio = 0;
@@ -2532,6 +2585,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
}
apic->vcpu = vcpu;
+ apic->nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu);
+
hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
HRTIMER_MODE_ABS_HARD);
apic->lapic_timer.timer.function = apic_timer_fn;
@@ -2716,10 +2771,10 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
kvm_lapic_set_reg(apic, APIC_TMCCT, 0);
kvm_apic_update_apicv(vcpu);
apic->highest_isr_cache = -1;
- if (vcpu->arch.apicv_active) {
+ if (apic->apicv_active) {
static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic));
- static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
+ static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic));
}
kvm_make_request(KVM_REQ_EVENT, vcpu);
if (ioapic_in_kernel(vcpu->kvm))
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 65bb2a8cf145..117a46df5cc1 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -10,7 +10,6 @@
#define KVM_APIC_INIT 0
#define KVM_APIC_SIPI 1
-#define KVM_APIC_LVT_NUM 6
#define APIC_SHORT_MASK 0xc0000
#define APIC_DEST_NOSHORT 0x0
@@ -29,6 +28,20 @@ enum lapic_mode {
LAPIC_MODE_X2APIC = MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE,
};
+enum lapic_lvt_entry {
+ LVT_TIMER,
+ LVT_THERMAL_MONITOR,
+ LVT_PERFORMANCE_COUNTER,
+ LVT_LINT0,
+ LVT_LINT1,
+ LVT_ERROR,
+ LVT_CMCI,
+
+ KVM_APIC_MAX_NR_LVT_ENTRIES,
+};
+
+#define APIC_LVTx(x) ((x) == LVT_CMCI ? APIC_LVTCMCI : APIC_LVTT + 0x10 * (x))
+
struct kvm_timer {
struct hrtimer timer;
s64 period; /* unit: ns */
@@ -48,6 +61,7 @@ struct kvm_lapic {
struct kvm_timer lapic_timer;
u32 divide_count;
struct kvm_vcpu *vcpu;
+ bool apicv_active;
bool sw_enabled;
bool irr_pending;
bool lvt0_in_nmi_mode;
@@ -65,6 +79,7 @@ struct kvm_lapic {
struct gfn_to_hva_cache vapic_cache;
unsigned long pending_events;
unsigned int sipi_vector;
+ int nr_lvt_entries;
};
struct dest_map;
@@ -84,6 +99,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
void kvm_recalculate_apic_map(struct kvm *kvm);
void kvm_apic_set_version(struct kvm_vcpu *vcpu);
+void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu);
bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int shorthand, unsigned int dest, int dest_mode);
int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
@@ -204,7 +220,7 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic)
static inline bool kvm_vcpu_apicv_active(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.apic && vcpu->arch.apicv_active;
+ return lapic_in_kernel(vcpu) && vcpu->arch.apic->apicv_active;
}
static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
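A standalone sketch of the LVT indexing added above, assuming the architectural xAPIC register offsets (LVTT at 0x320, LVTCMCI at 0x2f0) and MCG_CMCI_P as bit 10 of the MCG capability word: the six legacy LVT registers sit at 0x10 strides while CMCI has its own offset, and the CMCI entry only exists when the vCPU advertises CMCI support.

#include <stdio.h>

enum { LVT_TIMER, LVT_THERMAL_MONITOR, LVT_PERFORMANCE_COUNTER,
       LVT_LINT0, LVT_LINT1, LVT_ERROR, LVT_CMCI, MAX_NR_LVT_ENTRIES };

#define APIC_LVTT	0x320
#define APIC_LVTCMCI	0x2f0
#define MCG_CMCI_P	(1ULL << 10)

#define APIC_LVTx(x)	((x) == LVT_CMCI ? APIC_LVTCMCI : APIC_LVTT + 0x10 * (x))

static int nr_lvt_entries(unsigned long long mcg_cap)
{
	return MAX_NR_LVT_ENTRIES - !(mcg_cap & MCG_CMCI_P);
}

int main(void)
{
	for (int i = 0; i < nr_lvt_entries(MCG_CMCI_P); i++)
		printf("LVT %d -> register 0x%x\n", i, (unsigned int)APIC_LVTx(i));
	printf("entries without CMCI: %d\n", nr_lvt_entries(0));	/* 6 */
	return 0;
}

The count feeds straight back into the version register: kvm_apic_set_version() above advertises (nr_lvt_entries - 1) as the maximum LVT index, which is why kvm_apic_after_set_mcg_cap() refreshes the version after exposing the CMCI entry.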
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index f8192864b496..a99acec925eb 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -6,11 +6,6 @@
#include "kvm_cache_regs.h"
#include "cpuid.h"
-#define PT64_PT_BITS 9
-#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
-#define PT32_PT_BITS 10
-#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
-
#define PT_WRITABLE_SHIFT 1
#define PT_USER_SHIFT 2
@@ -34,11 +29,6 @@
#define PT_DIR_PAT_SHIFT 12
#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
-#define PT32_DIR_PSE36_SIZE 4
-#define PT32_DIR_PSE36_SHIFT 13
-#define PT32_DIR_PSE36_MASK \
- (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
-
#define PT64_ROOT_5LEVEL 5
#define PT64_ROOT_4LEVEL 4
#define PT32_ROOT_LEVEL 2
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 17252f39bd7c..3e1317325e1f 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -53,8 +53,6 @@
#include <asm/kvm_page_track.h>
#include "trace.h"
-#include "paging.h"
-
extern bool itlb_multihit_kvm_mitigation;
int __read_mostly nx_huge_pages = -1;
@@ -111,26 +109,6 @@ module_param(dbg, bool, 0644);
#define PTE_PREFETCH_NUM 8
-#define PT32_LEVEL_BITS 10
-
-#define PT32_LEVEL_SHIFT(level) \
- (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
-
-#define PT32_LVL_OFFSET_MASK(level) \
- (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
- * PT32_LEVEL_BITS))) - 1))
-
-#define PT32_INDEX(address, level)\
- (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
-
-
-#define PT32_BASE_ADDR_MASK PAGE_MASK
-#define PT32_DIR_BASE_ADDR_MASK \
- (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
-#define PT32_LVL_ADDR_MASK(level) \
- (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
- * PT32_LEVEL_BITS))) - 1))
-
#include <trace/events/kvm.h>
/* make pte_list_desc fit well in cache lines */
@@ -326,13 +304,6 @@ static int is_cpuid_PSE36(void)
return 1;
}
-static gfn_t pse36_gfn_delta(u32 gpte)
-{
- int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
-
- return (gpte & PT32_DIR_PSE36_MASK) << shift;
-}
-
#ifdef CONFIG_X86_64
static void __set_spte(u64 *sptep, u64 spte)
{
@@ -432,7 +403,7 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
* The idea using the light way get the spte on x86_32 guest is from
* gup_get_pte (mm/gup.c).
*
- * An spte tlb flush may be pending, because kvm_set_pte_rmapp
+ * An spte tlb flush may be pending, because kvm_set_pte_rmap
* coalesces them and we are running out of the MMU lock. Therefore
* we need to protect against in-progress updates of the spte.
*
@@ -558,11 +529,12 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
* state bits, it is used to clear the last level sptep.
* Returns the old PTE.
*/
-static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
+static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
{
kvm_pfn_t pfn;
u64 old_spte = *sptep;
int level = sptep_to_sp(sptep)->role.level;
+ struct page *page;
if (!is_shadow_present_pte(old_spte) ||
!spte_has_volatile_bits(old_spte))
@@ -578,11 +550,13 @@ static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
pfn = spte_to_pfn(old_spte);
/*
- * KVM does not hold the refcount of the page used by
- * kvm mmu, before reclaiming the page, we should
- * unmap it from mmu first.
+ * KVM doesn't hold a reference to any pages mapped into the guest, and
+ * instead uses the mmu_notifier to ensure that KVM unmaps any pages
+ * before they are reclaimed. Sanity check that, if the pfn is backed
+ * by a refcounted page, the refcount is elevated.
*/
- WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
+ page = kvm_pfn_to_refcounted_page(pfn);
+ WARN_ON(page && !page_count(page));
if (is_accessed_spte(old_spte))
kvm_set_pfn_accessed(pfn);
@@ -682,7 +656,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
if (r)
return r;
if (maybe_indirect) {
- r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache,
+ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache,
PT64_ROOT_MAX_LEVEL);
if (r)
return r;
@@ -695,48 +669,79 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
- kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
}
-static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
-{
- return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
-}
-
static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
{
kmem_cache_free(pte_list_desc_cache, pte_list_desc);
}
+static bool sp_has_gptes(struct kvm_mmu_page *sp);
+
static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
{
if (sp->role.passthrough)
return sp->gfn;
if (!sp->role.direct)
- return sp->gfns[index];
+ return sp->shadowed_translation[index] >> PAGE_SHIFT;
- return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
+ return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS));
}
-static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
+/*
+ * For leaf SPTEs, fetch the *guest* access permissions being shadowed. Note
+ * that the SPTE itself may have more constrained access permissions than
+ * what the guest enforces. For example, a guest may create an executable
+ * huge PTE but KVM may disallow execution to mitigate iTLB multihit.
+ */
+static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
{
- if (sp->role.passthrough) {
- WARN_ON_ONCE(gfn != sp->gfn);
- return;
- }
+ if (sp_has_gptes(sp))
+ return sp->shadowed_translation[index] & ACC_ALL;
- if (!sp->role.direct) {
- sp->gfns[index] = gfn;
+ /*
+ * For direct MMUs (e.g. TDP or non-paging guests) or passthrough SPs,
+ * KVM is not shadowing any guest page tables, so the "guest access
+ * permissions" are just ACC_ALL.
+ *
+ * For direct SPs in indirect MMUs (shadow paging), i.e. when KVM
+ * is shadowing a guest huge page with small pages, the guest access
+ * permissions being shadowed are the access permissions of the huge
+ * page.
+ *
+ * In both cases, sp->role.access contains the correct access bits.
+ */
+ return sp->role.access;
+}
+
+static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index,
+ gfn_t gfn, unsigned int access)
+{
+ if (sp_has_gptes(sp)) {
+ sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access;
return;
}
- if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
- pr_err_ratelimited("gfn mismatch under direct page %llx "
- "(expected %llx, got %llx)\n",
- sp->gfn,
- kvm_mmu_page_get_gfn(sp, index), gfn);
+ WARN_ONCE(access != kvm_mmu_page_get_access(sp, index),
+ "access mismatch under %s page %llx (expected %u, got %u)\n",
+ sp->role.passthrough ? "passthrough" : "direct",
+ sp->gfn, kvm_mmu_page_get_access(sp, index), access);
+
+ WARN_ONCE(gfn != kvm_mmu_page_get_gfn(sp, index),
+ "gfn mismatch under %s page %llx (expected %llx, got %llx)\n",
+ sp->role.passthrough ? "passthrough" : "direct",
+ sp->gfn, kvm_mmu_page_get_gfn(sp, index), gfn);
+}
+
+static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index,
+ unsigned int access)
+{
+ gfn_t gfn = kvm_mmu_page_get_gfn(sp, index);
+
+ kvm_mmu_page_set_translation(sp, index, gfn, access);
}
/*
@@ -792,6 +797,9 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
KVM_PAGE_TRACK_WRITE);
kvm_mmu_gfn_disallow_lpage(slot, gfn);
+
+ if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
+ kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
}
void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -855,7 +863,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
/*
* Returns the number of pointers in the rmap chain, not counting the new one.
*/
-static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
+static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc;
@@ -866,7 +874,7 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
rmap_head->val = (unsigned long)spte;
} else if (!(rmap_head->val & 1)) {
rmap_printk("%p %llx 1->many\n", spte, *spte);
- desc = mmu_alloc_pte_list_desc(vcpu);
+ desc = kvm_mmu_memory_cache_alloc(cache);
desc->sptes[0] = (u64 *)rmap_head->val;
desc->sptes[1] = spte;
desc->spte_count = 2;
@@ -878,7 +886,7 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
while (desc->spte_count == PTE_LIST_EXT) {
count += PTE_LIST_EXT;
if (!desc->more) {
- desc->more = mmu_alloc_pte_list_desc(vcpu);
+ desc->more = kvm_mmu_memory_cache_alloc(cache);
desc = desc->more;
desc->spte_count = 0;
break;
@@ -913,7 +921,7 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
mmu_free_pte_list_desc(desc);
}
-static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
+static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc;
struct pte_list_desc *prev_desc;
@@ -949,15 +957,16 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
}
}
-static void pte_list_remove(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- u64 *sptep)
+static void kvm_zap_one_rmap_spte(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head, u64 *sptep)
{
mmu_spte_clear_track_bits(kvm, sptep);
- __pte_list_remove(sptep, rmap_head);
+ pte_list_remove(sptep, rmap_head);
}
-/* Return true if rmap existed, false otherwise */
-static bool pte_list_destroy(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+/* Return true if at least one SPTE was zapped, false otherwise */
+static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc, *next;
int i;
@@ -1030,7 +1039,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
struct kvm_rmap_head *rmap_head;
sp = sptep_to_sp(spte);
- gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
+ gfn = kvm_mmu_page_get_gfn(sp, spte_index(spte));
/*
* Unlike rmap_add, rmap_remove does not run in the context of a vCPU
@@ -1042,7 +1051,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
slot = __gfn_to_memslot(slots, gfn);
rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
- __pte_list_remove(spte, rmap_head);
+ pte_list_remove(spte, rmap_head);
}
/*
@@ -1129,26 +1138,18 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
rmap_remove(kvm, sptep);
}
-
-static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
+static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush)
{
- if (is_large_pte(*sptep)) {
- WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K);
- drop_spte(kvm, sptep);
- return true;
- }
+ struct kvm_mmu_page *sp;
- return false;
-}
+ sp = sptep_to_sp(sptep);
+ WARN_ON(sp->role.level == PG_LEVEL_4K);
-static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
-{
- if (__drop_large_spte(vcpu->kvm, sptep)) {
- struct kvm_mmu_page *sp = sptep_to_sp(sptep);
+ drop_spte(kvm, sptep);
- kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
+ if (flush)
+ kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
KVM_PAGES_PER_HPAGE(sp->role.level));
- }
}
/*
@@ -1383,22 +1384,22 @@ static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn)
return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
}
-static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- const struct kvm_memory_slot *slot)
+static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ const struct kvm_memory_slot *slot)
{
- return pte_list_destroy(kvm, rmap_head);
+ return kvm_zap_all_rmap_sptes(kvm, rmap_head);
}
-static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- pte_t unused)
+static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ pte_t unused)
{
- return kvm_zap_rmapp(kvm, rmap_head, slot);
+ return __kvm_zap_rmap(kvm, rmap_head, slot);
}
-static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- pte_t pte)
+static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ pte_t pte)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1417,7 +1418,7 @@ restart:
need_flush = true;
if (pte_write(pte)) {
- pte_list_remove(kvm, rmap_head, sptep);
+ kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
goto restart;
} else {
new_spte = kvm_mmu_changed_pte_notifier_make_spte(
@@ -1529,7 +1530,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
bool flush = false;
if (kvm_memslots_have_rmaps(kvm))
- flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
+ flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap);
if (is_tdp_mmu_enabled(kvm))
flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
@@ -1542,7 +1543,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
bool flush = false;
if (kvm_memslots_have_rmaps(kvm))
- flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp);
+ flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap);
if (is_tdp_mmu_enabled(kvm))
flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
@@ -1550,9 +1551,9 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
return flush;
}
-static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- pte_t unused)
+static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ pte_t unused)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1564,9 +1565,9 @@ static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
return young;
}
-static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn,
- int level, pte_t unused)
+static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int level, pte_t unused)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1579,31 +1580,43 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
#define RMAP_RECYCLE_THRESHOLD 1000
-static void rmap_add(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
- u64 *spte, gfn_t gfn)
+static void __rmap_add(struct kvm *kvm,
+ struct kvm_mmu_memory_cache *cache,
+ const struct kvm_memory_slot *slot,
+ u64 *spte, gfn_t gfn, unsigned int access)
{
struct kvm_mmu_page *sp;
struct kvm_rmap_head *rmap_head;
int rmap_count;
sp = sptep_to_sp(spte);
- kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
+ kvm_mmu_page_set_translation(sp, spte_index(spte), gfn, access);
+ kvm_update_page_stats(kvm, sp->role.level, 1);
+
rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
- rmap_count = pte_list_add(vcpu, spte, rmap_head);
+ rmap_count = pte_list_add(cache, spte, rmap_head);
if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
- kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0));
+ kvm_zap_all_rmap_sptes(kvm, rmap_head);
kvm_flush_remote_tlbs_with_address(
- vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
+ kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
}
}
+static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
+ u64 *spte, gfn_t gfn, unsigned int access)
+{
+ struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_pte_list_desc_cache;
+
+ __rmap_add(vcpu->kvm, cache, slot, spte, gfn, access);
+}
+
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
bool young = false;
if (kvm_memslots_have_rmaps(kvm))
- young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp);
+ young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap);
if (is_tdp_mmu_enabled(kvm))
young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
@@ -1616,7 +1629,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
bool young = false;
if (kvm_memslots_have_rmaps(kvm))
- young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp);
+ young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap);
if (is_tdp_mmu_enabled(kvm))
young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
@@ -1652,14 +1665,14 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}
-static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
+static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
{
MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
hlist_del(&sp->hash_link);
list_del(&sp->link);
free_page((unsigned long)sp->spt);
if (!sp->role.direct)
- free_page((unsigned long)sp->gfns);
+ free_page((unsigned long)sp->shadowed_translation);
kmem_cache_free(mmu_page_header_cache, sp);
}
@@ -1668,19 +1681,19 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)
return hash_64(gfn, KVM_MMU_HASH_SHIFT);
}
-static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
+static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache,
struct kvm_mmu_page *sp, u64 *parent_pte)
{
if (!parent_pte)
return;
- pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
+ pte_list_add(cache, parent_pte, &sp->parent_ptes);
}
static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
u64 *parent_pte)
{
- __pte_list_remove(parent_pte, &sp->parent_ptes);
+ pte_list_remove(parent_pte, &sp->parent_ptes);
}
static void drop_parent_pte(struct kvm_mmu_page *sp,
@@ -1690,27 +1703,6 @@ static void drop_parent_pte(struct kvm_mmu_page *sp,
mmu_spte_clear_no_track(parent_pte);
}
-static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
-{
- struct kvm_mmu_page *sp;
-
- sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
- sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
- if (!direct)
- sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache);
- set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
-
- /*
- * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
- * depends on valid pages being added to the head of the list. See
- * comments in kvm_zap_obsolete_pages().
- */
- sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
- list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
- kvm_mod_used_mmu_pages(vcpu->kvm, +1);
- return sp;
-}
-
static void mark_unsync(u64 *spte);
static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
{
@@ -1725,11 +1717,9 @@ static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
static void mark_unsync(u64 *spte)
{
struct kvm_mmu_page *sp;
- unsigned int index;
sp = sptep_to_sp(spte);
- index = spte - sp->spt;
- if (__test_and_set_bit(index, sp->unsync_child_bitmap))
+ if (__test_and_set_bit(spte_index(spte), sp->unsync_child_bitmap))
return;
if (sp->unsync_children++)
return;
@@ -1789,7 +1779,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
continue;
}
- child = to_shadow_page(ent & PT64_BASE_ADDR_MASK);
+ child = to_shadow_page(ent & SPTE_BASE_ADDR_MASK);
if (child->unsync_children) {
if (mmu_pages_add(pvec, child, i))
@@ -2019,36 +2009,24 @@ static void clear_sp_write_flooding_count(u64 *spte)
__clear_sp_write_flooding_count(sptep_to_sp(spte));
}
-static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
- gfn_t gfn,
- gva_t gaddr,
- unsigned level,
- int direct,
- unsigned int access)
+/*
+ * The vCPU is required when finding indirect shadow pages; the shadow
+ * page may already exist and syncing it needs the vCPU pointer in
+ * order to read guest page tables. Direct shadow pages are never
+ * unsync, thus @vcpu can be NULL if @role.direct is true.
+ */
+static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm,
+ struct kvm_vcpu *vcpu,
+ gfn_t gfn,
+ struct hlist_head *sp_list,
+ union kvm_mmu_page_role role)
{
- bool direct_mmu = vcpu->arch.mmu->root_role.direct;
- union kvm_mmu_page_role role;
- struct hlist_head *sp_list;
- unsigned quadrant;
struct kvm_mmu_page *sp;
int ret;
int collisions = 0;
LIST_HEAD(invalid_list);
- role = vcpu->arch.mmu->root_role;
- role.level = level;
- role.direct = direct;
- role.access = access;
- if (role.has_4_byte_gpte) {
- quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
- quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
- role.quadrant = quadrant;
- }
- if (level <= vcpu->arch.mmu->cpu_role.base.level)
- role.passthrough = 0;
-
- sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
- for_each_valid_sp(vcpu->kvm, sp, sp_list) {
+ for_each_valid_sp(kvm, sp, sp_list) {
if (sp->gfn != gfn) {
collisions++;
continue;
@@ -2064,16 +2042,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
* Unsync pages must not be left as is, because the new
* upper-level page will be write-protected.
*/
- if (level > PG_LEVEL_4K && sp->unsync)
- kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+ if (role.level > PG_LEVEL_4K && sp->unsync)
+ kvm_mmu_prepare_zap_page(kvm, sp,
&invalid_list);
continue;
}
- if (direct_mmu)
- goto trace_get_page;
+ /* unsync and write-flooding only apply to indirect SPs. */
+ if (sp->role.direct)
+ goto out;
if (sp->unsync) {
+ if (KVM_BUG_ON(!vcpu, kvm))
+ break;
+
/*
* The page is good, but is stale. kvm_sync_page does
* get the latest guest state, but (unlike mmu_unsync_children)
@@ -2092,37 +2074,160 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
WARN_ON(!list_empty(&invalid_list));
if (ret > 0)
- kvm_flush_remote_tlbs(vcpu->kvm);
+ kvm_flush_remote_tlbs(kvm);
}
__clear_sp_write_flooding_count(sp);
-trace_get_page:
- trace_kvm_mmu_get_page(sp, false);
goto out;
}
- ++vcpu->kvm->stat.mmu_cache_miss;
+ sp = NULL;
+ ++kvm->stat.mmu_cache_miss;
+
+out:
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+
+ if (collisions > kvm->stat.max_mmu_page_hash_collisions)
+ kvm->stat.max_mmu_page_hash_collisions = collisions;
+ return sp;
+}
+
+/* Caches used when allocating a new shadow page. */
+struct shadow_page_caches {
+ struct kvm_mmu_memory_cache *page_header_cache;
+ struct kvm_mmu_memory_cache *shadow_page_cache;
+ struct kvm_mmu_memory_cache *shadowed_info_cache;
+};
+
+static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
+ struct shadow_page_caches *caches,
+ gfn_t gfn,
+ struct hlist_head *sp_list,
+ union kvm_mmu_page_role role)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
+ sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
+ if (!role.direct)
+ sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
+
+ set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
- sp = kvm_mmu_alloc_page(vcpu, direct);
+ /*
+ * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
+ * depends on valid pages being added to the head of the list. See
+ * comments in kvm_zap_obsolete_pages().
+ */
+ sp->mmu_valid_gen = kvm->arch.mmu_valid_gen;
+ list_add(&sp->link, &kvm->arch.active_mmu_pages);
+ kvm_mod_used_mmu_pages(kvm, +1);
sp->gfn = gfn;
sp->role = role;
hlist_add_head(&sp->hash_link, sp_list);
- if (sp_has_gptes(sp)) {
- account_shadowed(vcpu->kvm, sp);
- if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn))
- kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
+ if (sp_has_gptes(sp))
+ account_shadowed(kvm, sp);
+
+ return sp;
+}
+
+/* Note, @vcpu may be NULL if @role.direct is true; see kvm_mmu_find_shadow_page. */
+static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm,
+ struct kvm_vcpu *vcpu,
+ struct shadow_page_caches *caches,
+ gfn_t gfn,
+ union kvm_mmu_page_role role)
+{
+ struct hlist_head *sp_list;
+ struct kvm_mmu_page *sp;
+ bool created = false;
+
+ sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
+
+ sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role);
+ if (!sp) {
+ created = true;
+ sp = kvm_mmu_alloc_shadow_page(kvm, caches, gfn, sp_list, role);
}
- trace_kvm_mmu_get_page(sp, true);
-out:
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
- if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
- vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
+ trace_kvm_mmu_get_page(sp, created);
return sp;
}
+static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
+ gfn_t gfn,
+ union kvm_mmu_page_role role)
+{
+ struct shadow_page_caches caches = {
+ .page_header_cache = &vcpu->arch.mmu_page_header_cache,
+ .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache,
+ .shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache,
+ };
+
+ return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role);
+}
+
+static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct,
+ unsigned int access)
+{
+ struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep);
+ union kvm_mmu_page_role role;
+
+ role = parent_sp->role;
+ role.level--;
+ role.access = access;
+ role.direct = direct;
+ role.passthrough = 0;
+
+ /*
+ * If the guest has 4-byte PTEs then that means it's using 32-bit,
+ * 2-level, non-PAE paging. KVM shadows such guests with PAE paging
+ * (i.e. 8-byte PTEs). The difference in PTE size means that KVM must
+ * shadow each guest page table with multiple shadow page tables, which
+ * requires extra bookkeeping in the role.
+ *
+ * Specifically, to shadow the guest's page directory (which covers a
+ * 4GiB address space), KVM uses 4 PAE page directories, each mapping
+ * 1GiB of the address space. @role.quadrant encodes which quarter of
+ * the address space each maps.
+ *
+ * To shadow the guest's page tables (which each map a 4MiB region), KVM
+ * uses 2 PAE page tables, each mapping a 2MiB region. For these,
+ * @role.quadrant encodes which half of the region they map.
+ *
+ * Concretely, a 4-byte PDE consumes bits 31:22, while an 8-byte PDE
+ * consumes bits 29:21. To consume bits 31:30, KVM uses 4 shadow
+ * PDPTEs; those 4 PAE page directories are pre-allocated and their
+ * quadrant is assigned in mmu_alloc_root(). A 4-byte PTE consumes
+ * bits 21:12, while an 8-byte PTE consumes bits 20:12. To consume
+ * bit 21 in the PTE (the child here), KVM propagates that bit to the
+ * quadrant, i.e. sets quadrant to '0' or '1'. The parent 8-byte PDE
+ * covers bit 21 (see above), thus the quadrant is calculated from the
+ * _least_ significant bit of the PDE index.
+ */
+ if (role.has_4_byte_gpte) {
+ WARN_ON_ONCE(role.level != PG_LEVEL_4K);
+ role.quadrant = spte_index(sptep) & 1;
+ }
+
+ return role;
+}
+
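
As a side note on the quadrant comment above, here is a standalone sketch (not part of the patch; constants spelled out purely for illustration) of how the least significant bit of a PAE PDE index reproduces address bit 21, i.e. the bit a 4-byte-PTE guest walk would otherwise consume:

/* Standalone illustration; not kernel code. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t addr = 0x00654321;                      /* arbitrary guest virtual address */
	unsigned int pde_index = (addr >> 21) & 0x1ff;   /* an 8-byte PDE consumes bits 29:21 */
	unsigned int quadrant = pde_index & 1;           /* i.e. address bit 21 */

	printf("addr=0x%08x pde_index=%u quadrant=%u bit21=%u\n",
	       addr, pde_index, quadrant, (addr >> 21) & 1);
	return 0;
}
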
+static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu,
+ u64 *sptep, gfn_t gfn,
+ bool direct, unsigned int access)
+{
+ union kvm_mmu_page_role role;
+
+ if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
+ return ERR_PTR(-EEXIST);
+
+ role = kvm_mmu_child_role(sptep, direct, access);
+ return kvm_mmu_get_shadow_page(vcpu, gfn, role);
+}
+
static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
struct kvm_vcpu *vcpu, hpa_t root,
u64 addr)
@@ -2145,7 +2250,7 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
iterator->shadow_addr
= vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
- iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
+ iterator->shadow_addr &= SPTE_BASE_ADDR_MASK;
--iterator->level;
if (!iterator->shadow_addr)
iterator->level = 0;
@@ -2164,7 +2269,7 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
if (iterator->level < PG_LEVEL_4K)
return false;
- iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
+ iterator->index = SPTE_INDEX(iterator->addr, iterator->level);
iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
return true;
}
@@ -2177,7 +2282,7 @@ static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
return;
}
- iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
+ iterator->shadow_addr = spte & SPTE_BASE_ADDR_MASK;
--iterator->level;
}
@@ -2186,23 +2291,38 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
__shadow_walk_next(iterator, *iterator->sptep);
}
-static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
- struct kvm_mmu_page *sp)
+static void __link_shadow_page(struct kvm *kvm,
+ struct kvm_mmu_memory_cache *cache, u64 *sptep,
+ struct kvm_mmu_page *sp, bool flush)
{
u64 spte;
BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
+ /*
+ * If an SPTE is present already, it must be a leaf and therefore
+ * a large one. Drop it, and flush the TLB if needed, before
+ * installing sp.
+ */
+ if (is_shadow_present_pte(*sptep))
+ drop_large_spte(kvm, sptep, flush);
+
spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));
mmu_spte_set(sptep, spte);
- mmu_page_add_parent_pte(vcpu, sp, sptep);
+ mmu_page_add_parent_pte(cache, sp, sptep);
if (sp->unsync_children || sp->unsync)
mark_unsync(sptep);
}
+static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
+ struct kvm_mmu_page *sp)
+{
+ __link_shadow_page(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, sptep, sp, true);
+}
+
static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned direct_access)
{
@@ -2216,7 +2336,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
* so we should update the spte at this point to get
* a new sp with the correct access.
*/
- child = to_shadow_page(*sptep & PT64_BASE_ADDR_MASK);
+ child = to_shadow_page(*sptep & SPTE_BASE_ADDR_MASK);
if (child->role.access == direct_access)
return;
@@ -2237,7 +2357,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
if (is_last_spte(pte, sp->role.level)) {
drop_spte(kvm, spte);
} else {
- child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
+ child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK);
drop_parent_pte(child, spte);
/*
@@ -2263,7 +2383,7 @@ static int kvm_mmu_page_unlink_children(struct kvm *kvm,
int zapped = 0;
unsigned i;
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+ for (i = 0; i < SPTE_ENT_PER_PAGE; ++i)
zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list);
return zapped;
@@ -2396,7 +2516,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
list_for_each_entry_safe(sp, nsp, invalid_list, link) {
WARN_ON(!sp->role.invalid || sp->root_count);
- kvm_mmu_free_page(sp);
+ kvm_mmu_free_shadow_page(sp);
}
}
@@ -2676,7 +2796,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
struct kvm_mmu_page *child;
u64 pte = *sptep;
- child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
+ child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK);
drop_parent_pte(child, sptep);
flush = true;
} else if (pfn != spte_to_pfn(*sptep)) {
@@ -2711,8 +2831,10 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
if (!was_rmapped) {
WARN_ON_ONCE(ret == RET_PF_SPURIOUS);
- kvm_update_page_stats(vcpu->kvm, level, 1);
- rmap_add(vcpu, slot, sptep, gfn);
+ rmap_add(vcpu, slot, sptep, gfn, pte_access);
+ } else {
+ /* Already rmapped but the pte_access bits may have changed. */
+ kvm_mmu_page_set_access(sp, spte_index(sptep), pte_access);
}
return ret;
@@ -2728,7 +2850,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
int i, ret;
gfn_t gfn;
- gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
+ gfn = kvm_mmu_page_get_gfn(sp, spte_index(start));
slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
if (!slot)
return -1;
@@ -2754,7 +2876,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
WARN_ON(!sp->role.direct);
- i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
+ i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1);
spte = sp->spt + i;
for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
@@ -2798,20 +2920,42 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
__direct_pte_prefetch(vcpu, sp, sptep);
}
-static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
+/*
+ * Lookup the mapping level for @gfn in the current mm.
+ *
+ * WARNING! Use of host_pfn_mapping_level() requires the caller and the end
+ * consumer to be tied into KVM's handlers for MMU notifier events!
+ *
+ * There are several ways to safely use this helper:
+ *
+ * - Check mmu_notifier_retry_hva() after grabbing the mapping level, before
+ * consuming it. In this case, mmu_lock doesn't need to be held during the
+ * lookup, but it does need to be held while checking the MMU notifier.
+ *
+ * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation
+ * event for the hva. This can be done by explicitly checking the MMU notifier
+ * or by ensuring that KVM already has a valid mapping that covers the hva.
+ *
+ * - Do not use the result to install new mappings, e.g. use the host mapping
+ * level only to decide whether or not to zap an entry. In this case, it's
+ * not required to hold mmu_lock (though it's highly likely the caller will
+ * want to hold mmu_lock anyways, e.g. to modify SPTEs).
+ *
+ * Note! The lookup can still race with modifications to host page tables, but
+ * the above "rules" ensure KVM will not _consume_ the result of the walk if a
+ * race with the primary MMU occurs.
+ */
+static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
const struct kvm_memory_slot *slot)
{
+ int level = PG_LEVEL_4K;
unsigned long hva;
unsigned long flags;
- int level = PG_LEVEL_4K;
pgd_t pgd;
p4d_t p4d;
pud_t pud;
pmd_t pmd;
- if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn))
- return PG_LEVEL_4K;
-
/*
* Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
* is not solely for performance, it's also necessary to avoid the
@@ -2823,16 +2967,19 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
hva = __gfn_to_hva_memslot(slot, gfn);
/*
- * Lookup the mapping level in the current mm. The information
- * may become stale soon, but it is safe to use as long as
- * 1) mmu_notifier_retry was checked after taking mmu_lock, and
- * 2) mmu_lock is taken now.
- *
- * We still need to disable IRQs to prevent concurrent tear down
- * of page tables.
+ * Disable IRQs to prevent concurrent tear down of host page tables,
+ * e.g. if the primary MMU promotes a P*D to a huge page and then frees
+ * the original page table.
*/
local_irq_save(flags);
+ /*
+ * Read each entry once. As above, a non-leaf entry can be promoted to
+ * a huge page _during_ this walk. Re-reading the entry could send the
+ * walk into the weeds, e.g. p*d_large() returns false (sees the old
+ * value) and then p*d_offset() walks into the target huge page instead
+ * of the old page table (sees the new value).
+ */
pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
if (pgd_none(pgd))
goto out;
@@ -2864,7 +3011,7 @@ out:
int kvm_mmu_max_mapping_level(struct kvm *kvm,
const struct kvm_memory_slot *slot, gfn_t gfn,
- kvm_pfn_t pfn, int max_level)
+ int max_level)
{
struct kvm_lpage_info *linfo;
int host_level;
@@ -2879,7 +3026,7 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
if (max_level == PG_LEVEL_4K)
return PG_LEVEL_4K;
- host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
+ host_level = host_pfn_mapping_level(kvm, gfn, slot);
return min(host_level, max_level);
}
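
The "read each entry once" rule in the comment above can be illustrated with a minimal userspace sketch (not part of the patch; READ_ONCE() is approximated here with a volatile access via the GNU __typeof__ extension). The entry is snapshotted into a local so the presence check and the later use cannot observe two different values:

/* Standalone illustration; not kernel code. */
#include <stdio.h>
#include <stdint.h>

#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))

static uint64_t shared_entry = 0x1234000ULL | 1;	/* "present" bit 0 set */

int main(void)
{
	uint64_t entry = READ_ONCE(shared_entry);	/* single snapshot */

	if (!(entry & 1)) {
		puts("not present");
		return 0;
	}

	/* Both the check above and this use see the same snapshot. */
	printf("entry points at 0x%llx\n", (unsigned long long)(entry & ~0xfffULL));
	return 0;
}
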
@@ -2893,7 +3040,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (unlikely(fault->max_level == PG_LEVEL_4K))
return;
- if (is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn))
+ if (is_error_noslot_pfn(fault->pfn))
return;
if (kvm_slot_dirty_track_enabled(slot))
@@ -2904,8 +3051,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
* level, which will be used to do precise, accurate accounting.
*/
fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
- fault->gfn, fault->pfn,
- fault->max_level);
+ fault->gfn, fault->max_level);
if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
return;
@@ -2961,13 +3107,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
if (it.level == fault->goal_level)
break;
- drop_large_spte(vcpu, it.sptep);
- if (is_shadow_present_pte(*it.sptep))
+ sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL);
+ if (sp == ERR_PTR(-EEXIST))
continue;
- sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
- it.level - 1, true, ACC_ALL);
-
link_shadow_page(vcpu, it.sptep, sp);
if (fault->is_tdp && fault->huge_page_disallowed &&
fault->req_level >= it.level)
@@ -3095,7 +3238,7 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
*
* Compare with set_spte where instead shadow_dirty_mask is set.
*/
- if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
+ if (!try_cmpxchg64(sptep, &old_spte, new_spte))
return false;
if (is_writable_pte(new_spte) && !is_writable_pte(old_spte))
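
The cmpxchg64() -> try_cmpxchg64() conversion above relies on try_cmpxchg refreshing the caller's expected value on failure. A minimal sketch of the same semantics using C11 atomics (illustrative only, outside the kernel):

/* Standalone illustration; not kernel code. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	_Atomic uint64_t spte = 0x1000;
	uint64_t old_spte = 0x2000;	/* stale expectation */
	uint64_t new_spte = 0x3000;

	bool ok = atomic_compare_exchange_strong(&spte, &old_spte, new_spte);

	/* Fails, and old_spte now holds the current value (0x1000). */
	printf("ok=%d old_spte=0x%llx\n", ok, (unsigned long long)old_spte);

	ok = atomic_compare_exchange_strong(&spte, &old_spte, new_spte);
	printf("ok=%d spte=0x%llx\n", ok, (unsigned long long)atomic_load(&spte));
	return 0;
}
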
@@ -3265,7 +3408,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
if (!VALID_PAGE(*root_hpa))
return;
- sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
+ sp = to_shadow_page(*root_hpa & SPTE_BASE_ADDR_MASK);
if (WARN_ON(!sp))
return;
@@ -3369,12 +3512,19 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
return ret;
}
-static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
- u8 level, bool direct)
+static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant,
+ u8 level)
{
+ union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
struct kvm_mmu_page *sp;
- sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
+ role.level = level;
+ role.quadrant = quadrant;
+
+ WARN_ON_ONCE(quadrant && !role.has_4_byte_gpte);
+ WARN_ON_ONCE(role.direct && role.has_4_byte_gpte);
+
+ sp = kvm_mmu_get_shadow_page(vcpu, gfn, role);
++sp->root_count;
return __pa(sp->spt);
@@ -3397,7 +3547,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
mmu->root.hpa = root;
} else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
- root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
+ root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level);
mmu->root.hpa = root;
} else if (shadow_root_level == PT32E_ROOT_LEVEL) {
if (WARN_ON_ONCE(!mmu->pae_root)) {
@@ -3408,8 +3558,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
for (i = 0; i < 4; ++i) {
WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
- root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
- i << 30, PT32_ROOT_LEVEL, true);
+ root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), 0,
+ PT32_ROOT_LEVEL);
mmu->pae_root[i] = root | PT_PRESENT_MASK |
shadow_me_value;
}
@@ -3493,9 +3643,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
struct kvm_mmu *mmu = vcpu->arch.mmu;
u64 pdptrs[4], pm_mask;
gfn_t root_gfn, root_pgd;
+ int quadrant, i, r;
hpa_t root;
- unsigned i;
- int r;
root_pgd = mmu->get_guest_pgd(vcpu);
root_gfn = root_pgd >> PAGE_SHIFT;
@@ -3533,7 +3682,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
*/
if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
root = mmu_alloc_root(vcpu, root_gfn, 0,
- mmu->root_role.level, false);
+ mmu->root_role.level);
mmu->root.hpa = root;
goto set_root_pgd;
}
@@ -3578,8 +3727,15 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
root_gfn = pdptrs[i] >> PAGE_SHIFT;
}
- root = mmu_alloc_root(vcpu, root_gfn, i << 30,
- PT32_ROOT_LEVEL, false);
+ /*
+ * If shadowing 32-bit non-PAE page tables, each PAE page
+ * directory maps one quarter of the guest's non-PAE page
+ * directory. Otherwise each PAE page directory shadows one guest
+ * PAE page directory so that quadrant should be 0.
+ */
+ quadrant = (mmu->cpu_role.base.level == PT32_ROOT_LEVEL) ? i : 0;
+
+ root = mmu_alloc_root(vcpu, root_gfn, quadrant, PT32_ROOT_LEVEL);
mmu->pae_root[i] = root | pm_mask;
}
@@ -3737,7 +3893,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
hpa_t root = vcpu->arch.mmu->pae_root[i];
if (IS_VALID_PAE_ROOT(root)) {
- root &= PT64_BASE_ADDR_MASK;
+ root &= SPTE_BASE_ADDR_MASK;
sp = to_shadow_page(root);
mmu_sync_children(vcpu, sp, true);
}
@@ -4155,14 +4311,26 @@ EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- while (fault->max_level > PG_LEVEL_4K) {
- int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
- gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1);
-
- if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
- break;
+ /*
+ * If the guest's MTRRs may be used to compute the "real" memtype,
+ * restrict the mapping level to ensure KVM uses a consistent memtype
+ * across the entire mapping. If the host MTRRs are ignored by TDP
+ * (shadow_memtype_mask is non-zero), and the VM has non-coherent DMA
+ * (DMA doesn't snoop CPU caches), KVM's ABI is to honor the memtype
+ * from the guest's MTRRs so that guest accesses to memory that is
+ * DMA'd aren't cached against the guest's wishes.
+ *
+ * Note, KVM may still ultimately ignore guest MTRRs for certain PFNs,
+ * e.g. KVM will force UC memtype for host MMIO.
+ */
+ if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
+ for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
+ int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
+ gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1);
- --fault->max_level;
+ if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
+ break;
+ }
}
return direct_page_fault(vcpu, fault);
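
For reference, a standalone sketch (not part of the patch) of the base-gfn alignment used in the MTRR consistency loop above; page_num is assumed to be the number of 4KiB pages per hugepage at the given level (512 for 2MiB, 512*512 for 1GiB), so masking with ~(page_num - 1) rounds the gfn down to the hugepage boundary:

/* Standalone illustration; not kernel code. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12

int main(void)
{
	uint64_t addr = 0x123456789000ULL;
	uint64_t gfn = addr >> PAGE_SHIFT;

	for (int level = 3; level >= 2; level--) {
		uint64_t page_num = 1ULL << ((level - 1) * 9);
		uint64_t base = gfn & ~(page_num - 1);

		printf("level %d: page_num=%llu base gfn=0x%llx\n", level,
		       (unsigned long long)page_num, (unsigned long long)base);
	}
	return 0;
}
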
@@ -4567,7 +4735,7 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
if (boot_cpu_is_amd())
__reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
- context->root_role.level, false,
+ context->root_role.level, true,
boot_cpu_has(X86_FEATURE_GBPAGES),
false, true);
else
@@ -5199,11 +5367,11 @@ static bool need_remote_flush(u64 old, u64 new)
return false;
if (!is_shadow_present_pte(new))
return true;
- if ((old ^ new) & PT64_BASE_ADDR_MASK)
+ if ((old ^ new) & SPTE_BASE_ADDR_MASK)
return true;
old ^= shadow_nx_mask;
new ^= shadow_nx_mask;
- return (old & ~new & PT64_PERM_MASK) != 0;
+ return (old & ~new & SPTE_PERM_MASK) != 0;
}
static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
@@ -5328,13 +5496,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
- /*
- * No need to care whether allocation memory is successful
- * or not since pte prefetch is skipped if it does not have
- * enough objects in the cache.
- */
- mmu_topup_memory_caches(vcpu, true);
-
write_lock(&vcpu->kvm->mmu_lock);
gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
@@ -5819,9 +5980,25 @@ int kvm_mmu_init_vm(struct kvm *kvm)
node->track_write = kvm_mmu_pte_write;
node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
kvm_page_track_register_notifier(kvm, node);
+
+ kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
+ kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
+
+ kvm->arch.split_shadow_page_cache.gfp_zero = __GFP_ZERO;
+
+ kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
+ kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
+
return 0;
}
+static void mmu_free_vm_memory_caches(struct kvm *kvm)
+{
+ kvm_mmu_free_memory_cache(&kvm->arch.split_desc_cache);
+ kvm_mmu_free_memory_cache(&kvm->arch.split_page_header_cache);
+ kvm_mmu_free_memory_cache(&kvm->arch.split_shadow_page_cache);
+}
+
void kvm_mmu_uninit_vm(struct kvm *kvm)
{
struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
@@ -5829,9 +6006,11 @@ void kvm_mmu_uninit_vm(struct kvm *kvm)
kvm_page_track_unregister_notifier(kvm, node);
kvm_mmu_uninit_tdp_mmu(kvm);
+
+ mmu_free_vm_memory_caches(kvm);
}
-static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
+static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
{
const struct kvm_memory_slot *memslot;
struct kvm_memslots *slots;
@@ -5853,8 +6032,7 @@ static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
if (WARN_ON_ONCE(start >= end))
continue;
- flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
+ flush = slot_handle_level_range(kvm, memslot, __kvm_zap_rmap,
PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
start, end - 1, true, flush);
}
@@ -5879,7 +6057,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
kvm_inc_notifier_count(kvm, gfn_start, gfn_end);
- flush = __kvm_zap_rmaps(kvm, gfn_start, gfn_end);
+ flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
if (is_tdp_mmu_enabled(kvm)) {
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
@@ -5950,15 +6128,249 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
}
+static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min)
+{
+ return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
+}
+
+static bool need_topup_split_caches_or_resched(struct kvm *kvm)
+{
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
+ return true;
+
+ /*
+ * In the worst case, SPLIT_DESC_CACHE_MIN_NR_OBJECTS descriptors are needed
+ * to split a single huge page. Calculating how many are actually needed
+ * is possible but not worth the complexity.
+ */
+ return need_topup(&kvm->arch.split_desc_cache, SPLIT_DESC_CACHE_MIN_NR_OBJECTS) ||
+ need_topup(&kvm->arch.split_page_header_cache, 1) ||
+ need_topup(&kvm->arch.split_shadow_page_cache, 1);
+}
+
+static int topup_split_caches(struct kvm *kvm)
+{
+ /*
+ * Allocating rmap list entries when splitting huge pages for nested
+ * MMUs is uncommon as KVM needs to use a list if and only if there is
+ * more than one rmap entry for a gfn, i.e. requires an L1 gfn to be
+ * aliased by multiple L2 gfns and/or from multiple nested roots with
+ * different roles. Aliasing gfns when using TDP is atypical for VMMs;
+ * a few gfns are often aliased during boot, e.g. when remapping BIOS,
+ * but aliasing rarely occurs post-boot or for many gfns. If there is
+ * only one rmap entry, rmap->val points directly at that one entry and
+ * doesn't need to allocate a list. Buffer the cache by the default
+ * capacity so that KVM doesn't have to drop mmu_lock to topup if KVM
+ * encounters an aliased gfn or two.
+ */
+ const int capacity = SPLIT_DESC_CACHE_MIN_NR_OBJECTS +
+ KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE;
+ int r;
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+ r = __kvm_mmu_topup_memory_cache(&kvm->arch.split_desc_cache, capacity,
+ SPLIT_DESC_CACHE_MIN_NR_OBJECTS);
+ if (r)
+ return r;
+
+ r = kvm_mmu_topup_memory_cache(&kvm->arch.split_page_header_cache, 1);
+ if (r)
+ return r;
+
+ return kvm_mmu_topup_memory_cache(&kvm->arch.split_shadow_page_cache, 1);
+}
+
+static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *huge_sptep)
+{
+ struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
+ struct shadow_page_caches caches = {};
+ union kvm_mmu_page_role role;
+ unsigned int access;
+ gfn_t gfn;
+
+ gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep));
+ access = kvm_mmu_page_get_access(huge_sp, spte_index(huge_sptep));
+
+ /*
+ * Note, huge page splitting always uses direct shadow pages, regardless
+ * of whether the huge page itself is mapped by a direct or indirect
+ * shadow page, since the huge page region itself is being directly
+ * mapped with smaller pages.
+ */
+ role = kvm_mmu_child_role(huge_sptep, /*direct=*/true, access);
+
+ /* Direct SPs do not require a shadowed_info_cache. */
+ caches.page_header_cache = &kvm->arch.split_page_header_cache;
+ caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache;
+
+ /* Safe to pass NULL for vCPU since requesting a direct SP. */
+ return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role);
+}
+
+static void shadow_mmu_split_huge_page(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ u64 *huge_sptep)
+{
+ struct kvm_mmu_memory_cache *cache = &kvm->arch.split_desc_cache;
+ u64 huge_spte = READ_ONCE(*huge_sptep);
+ struct kvm_mmu_page *sp;
+ bool flush = false;
+ u64 *sptep, spte;
+ gfn_t gfn;
+ int index;
+
+ sp = shadow_mmu_get_sp_for_split(kvm, huge_sptep);
+
+ for (index = 0; index < SPTE_ENT_PER_PAGE; index++) {
+ sptep = &sp->spt[index];
+ gfn = kvm_mmu_page_get_gfn(sp, index);
+
+ /*
+ * The SP may already have populated SPTEs, e.g. if this huge
+ * page is aliased by multiple sptes with the same access
+ * permissions. These entries are guaranteed to map the same
+ * gfn-to-pfn translation since the SP is direct, so no need to
+ * modify them.
+ *
+ * However, if a given SPTE points to a lower level page table,
+ * that lower level page table may only be partially populated.
+ * Installing such SPTEs would effectively unmap a portion of the
+ * huge page. Unmapping guest memory always requires a TLB flush
+ * since a subsequent operation on the unmapped regions would
+ * fail to detect the need to flush.
+ */
+ if (is_shadow_present_pte(*sptep)) {
+ flush |= !is_last_spte(*sptep, sp->role.level);
+ continue;
+ }
+
+ spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index);
+ mmu_spte_set(sptep, spte);
+ __rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access);
+ }
+
+ __link_shadow_page(kvm, cache, huge_sptep, sp, flush);
+}
+
+static int shadow_mmu_try_split_huge_page(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ u64 *huge_sptep)
+{
+ struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
+ int level, r = 0;
+ gfn_t gfn;
+ u64 spte;
+
+ /* Grab information for the tracepoint before dropping the MMU lock. */
+ gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep));
+ level = huge_sp->role.level;
+ spte = *huge_sptep;
+
+ if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES) {
+ r = -ENOSPC;
+ goto out;
+ }
+
+ if (need_topup_split_caches_or_resched(kvm)) {
+ write_unlock(&kvm->mmu_lock);
+ cond_resched();
+ /*
+ * If the topup succeeds, return -EAGAIN to indicate that the
+ * rmap iterator should be restarted because the MMU lock was
+ * dropped.
+ */
+ r = topup_split_caches(kvm) ?: -EAGAIN;
+ write_lock(&kvm->mmu_lock);
+ goto out;
+ }
+
+ shadow_mmu_split_huge_page(kvm, slot, huge_sptep);
+
+out:
+ trace_kvm_mmu_split_huge_page(gfn, spte, level, r);
+ return r;
+}
+
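
The `r = topup_split_caches(kvm) ?: -EAGAIN;` line above uses the GNU C two-operand conditional: it yields the left operand when non-zero, otherwise the right one, so a successful topup becomes -EAGAIN (restart the rmap walk) while a real error is propagated. A small illustrative sketch (gcc/clang extension, not standard C; the helper and error values are made up for the demo):

/* Standalone illustration; not kernel code. */
#include <stdio.h>

#define EAGAIN 11
#define ENOMEM 12

static int fake_topup(int fail)
{
	return fail ? -ENOMEM : 0;
}

int main(void)
{
	int r;

	r = fake_topup(0) ?: -EAGAIN;	/* success -> -EAGAIN, i.e. retry */
	printf("success path: r=%d\n", r);

	r = fake_topup(1) ?: -EAGAIN;	/* failure -> propagate -ENOMEM */
	printf("failure path: r=%d\n", r);
	return 0;
}
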
+static bool shadow_mmu_try_split_huge_pages(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ const struct kvm_memory_slot *slot)
+{
+ struct rmap_iterator iter;
+ struct kvm_mmu_page *sp;
+ u64 *huge_sptep;
+ int r;
+
+restart:
+ for_each_rmap_spte(rmap_head, &iter, huge_sptep) {
+ sp = sptep_to_sp(huge_sptep);
+
+ /* TDP MMU is enabled, so rmap only contains nested MMU SPs. */
+ if (WARN_ON_ONCE(!sp->role.guest_mode))
+ continue;
+
+ /* The rmaps should never contain non-leaf SPTEs. */
+ if (WARN_ON_ONCE(!is_large_pte(*huge_sptep)))
+ continue;
+
+ /* SPs with level >PG_LEVEL_4K should never be unsync. */
+ if (WARN_ON_ONCE(sp->unsync))
+ continue;
+
+ /* Don't bother splitting huge pages on invalid SPs. */
+ if (sp->role.invalid)
+ continue;
+
+ r = shadow_mmu_try_split_huge_page(kvm, slot, huge_sptep);
+
+ /*
+ * The split succeeded or needs to be retried because the MMU
+ * lock was dropped. Either way, restart the iterator to get it
+ * back into a consistent state.
+ */
+ if (!r || r == -EAGAIN)
+ goto restart;
+
+ /* The split failed and shouldn't be retried (e.g. -ENOMEM). */
+ break;
+ }
+
+ return false;
+}
+
+static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t start, gfn_t end,
+ int target_level)
+{
+ int level;
+
+ /*
+ * Split huge pages starting with KVM_MAX_HUGEPAGE_LEVEL and working
+ * down to the target level. This ensures pages are recursively split
+ * all the way to the target level. There's no need to split pages
+ * already at the target level.
+ */
+ for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) {
+ slot_handle_level_range(kvm, slot, shadow_mmu_try_split_huge_pages,
+ level, level, start, end - 1, true, false);
+ }
+}
+
/* Must be called with the mmu_lock held in write-mode. */
void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
const struct kvm_memory_slot *memslot,
u64 start, u64 end,
int target_level)
{
- if (is_tdp_mmu_enabled(kvm))
- kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end,
- target_level, false);
+ if (!is_tdp_mmu_enabled(kvm))
+ return;
+
+ if (kvm_memslots_have_rmaps(kvm))
+ kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
+
+ kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, false);
/*
* A TLB flush is unnecessary at this point for the same reasons as in
@@ -5973,12 +6385,19 @@ void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
u64 start = memslot->base_gfn;
u64 end = start + memslot->npages;
- if (is_tdp_mmu_enabled(kvm)) {
- read_lock(&kvm->mmu_lock);
- kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
- read_unlock(&kvm->mmu_lock);
+ if (!is_tdp_mmu_enabled(kvm))
+ return;
+
+ if (kvm_memslots_have_rmaps(kvm)) {
+ write_lock(&kvm->mmu_lock);
+ kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
+ write_unlock(&kvm->mmu_lock);
}
+ read_lock(&kvm->mmu_lock);
+ kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
+ read_unlock(&kvm->mmu_lock);
+
/*
* No TLB flush is necessary here. KVM will flush TLBs after
* write-protecting and/or clearing dirty on the newly split SPTEs to
@@ -6012,10 +6431,10 @@ restart:
* the guest, and the guest page table is using 4K page size
* mapping if the indirect sp has level = 1.
*/
- if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
+ if (sp->role.direct &&
sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
- pfn, PG_LEVEL_NUM)) {
- pte_list_remove(kvm, rmap_head, sptep);
+ PG_LEVEL_NUM)) {
+ kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
if (kvm_available_flush_tlb_with_range())
kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
@@ -6030,18 +6449,24 @@ restart:
return need_tlb_flush;
}
+static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
+ const struct kvm_memory_slot *slot)
+{
+ /*
+ * Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap
+ * pages that are already mapped at the maximum hugepage level.
+ */
+ if (slot_handle_level(kvm, slot, kvm_mmu_zap_collapsible_spte,
+ PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true))
+ kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
+}
+
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *slot)
{
if (kvm_memslots_have_rmaps(kvm)) {
write_lock(&kvm->mmu_lock);
- /*
- * Zap only 4k SPTEs since the legacy MMU only supports dirty
- * logging at a 4k granularity and never creates collapsible
- * 2m SPTEs during dirty logging.
- */
- if (slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true))
- kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
+ kvm_rmap_zap_collapsible_sptes(kvm, slot);
write_unlock(&kvm->mmu_lock);
}
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index bd2a26897b97..582def531d4d 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -20,6 +20,20 @@ extern bool dbg;
#define MMU_WARN_ON(x) do { } while (0)
#endif
+/* Page table builder macros common to shadow (host) PTEs and guest PTEs. */
+#define __PT_LEVEL_SHIFT(level, bits_per_level) \
+ (PAGE_SHIFT + ((level) - 1) * (bits_per_level))
+#define __PT_INDEX(address, level, bits_per_level) \
+ (((address) >> __PT_LEVEL_SHIFT(level, bits_per_level)) & ((1 << (bits_per_level)) - 1))
+
+#define __PT_LVL_ADDR_MASK(base_addr_mask, level, bits_per_level) \
+ ((base_addr_mask) & ~((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1))
+
+#define __PT_LVL_OFFSET_MASK(base_addr_mask, level, bits_per_level) \
+ ((base_addr_mask) & ((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1))
+
+#define __PT_ENT_PER_PAGE(bits_per_level) (1 << (bits_per_level))
+
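
A standalone sketch (not part of the patch) exercising the __PT_* builder macros with the two bits_per_level values the MMU uses, 9 for 64-bit/PAE PTEs and 10 for 32-bit non-PAE PTEs; PAGE_SHIFT is assumed to be 12:

/* Standalone illustration; not kernel code. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12

#define __PT_LEVEL_SHIFT(level, bits_per_level) \
	(PAGE_SHIFT + ((level) - 1) * (bits_per_level))
#define __PT_INDEX(address, level, bits_per_level) \
	(((address) >> __PT_LEVEL_SHIFT(level, bits_per_level)) & ((1 << (bits_per_level)) - 1))
#define __PT_ENT_PER_PAGE(bits_per_level) (1 << (bits_per_level))

int main(void)
{
	uint64_t addr = 0x7f1234567000ULL;

	/* 64-bit PTEs: 512 entries per table, 9 bits of index per level. */
	for (int level = 1; level <= 4; level++)
		printf("64-bit PTE level %d: shift %d index %llu (of %d)\n",
		       level, __PT_LEVEL_SHIFT(level, 9),
		       (unsigned long long)__PT_INDEX(addr, level, 9),
		       __PT_ENT_PER_PAGE(9));

	/* 32-bit PTEs: 1024 entries per table, 10 bits of index per level. */
	uint32_t addr32 = 0x12345678;
	for (int level = 1; level <= 2; level++)
		printf("32-bit PTE level %d: shift %d index %u (of %d)\n",
		       level, __PT_LEVEL_SHIFT(level, 10),
		       (unsigned)__PT_INDEX((uint64_t)addr32, level, 10),
		       __PT_ENT_PER_PAGE(10));
	return 0;
}
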
/*
* Unlike regular MMU roots, PAE "roots", a.k.a. PDPTEs/PDPTRs, have a PRESENT
* bit, and thus are guaranteed to be non-zero when valid. And, when a guest
@@ -53,8 +67,21 @@ struct kvm_mmu_page {
gfn_t gfn;
u64 *spt;
- /* hold the gfn of each spte inside spt */
- gfn_t *gfns;
+
+ /*
+ * Stores the result of the guest translation being shadowed by each
+ * SPTE. KVM shadows two types of guest translations: nGPA -> GPA
+ * (shadow EPT/NPT) and GVA -> GPA (traditional shadow paging). In both
+ * cases the result of the translation is a GPA and a set of access
+ * constraints.
+ *
+ * The GFN is stored in the upper bits (PAGE_SHIFT) and the shadowed
+ * access permissions are stored in the lower bits. Note, for
+ * convenience and uniformity across guests, the access permissions are
+ * stored in KVM format (e.g. ACC_EXEC_MASK) not the raw guest format.
+ */
+ u64 *shadowed_translation;
+
/* Currently serving as active root */
union {
int root_count;
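
A minimal sketch of one way the packing described above could look, with the gfn in the bits above PAGE_SHIFT and KVM-format access bits in the low bits; the helper names and ACC_* values here are illustrative, not taken from the kernel headers:

/* Standalone illustration; not kernel code. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define ACC_EXEC_MASK	1		/* illustrative KVM-style access bits */
#define ACC_WRITE_MASK	(1 << 1)
#define ACC_USER_MASK	(1 << 2)

static uint64_t pack_translation(uint64_t gfn, unsigned int access)
{
	return (gfn << PAGE_SHIFT) | access;
}

static uint64_t unpack_gfn(uint64_t entry)
{
	return entry >> PAGE_SHIFT;
}

static unsigned int unpack_access(uint64_t entry)
{
	return entry & ((1 << PAGE_SHIFT) - 1);
}

int main(void)
{
	uint64_t e = pack_translation(0xabcdef, ACC_WRITE_MASK | ACC_USER_MASK);

	printf("gfn=0x%llx access=0x%x\n",
	       (unsigned long long)unpack_gfn(e), unpack_access(e));
	return 0;
}
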
@@ -141,9 +168,9 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
unsigned int pte_list_count(struct kvm_rmap_head *rmap_head);
extern int nx_huge_pages;
-static inline bool is_nx_huge_page_enabled(void)
+static inline bool is_nx_huge_page_enabled(struct kvm *kvm)
{
- return READ_ONCE(nx_huge_pages);
+ return READ_ONCE(nx_huge_pages) && !kvm->arch.disable_nx_huge_pages;
}
struct kvm_page_fault {
@@ -242,7 +269,8 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
.user = err & PFERR_USER_MASK,
.prefetch = prefetch,
.is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault),
- .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(),
+ .nx_huge_page_workaround_enabled =
+ is_nx_huge_page_enabled(vcpu->kvm),
.max_level = KVM_MAX_HUGEPAGE_LEVEL,
.req_level = PG_LEVEL_4K,
@@ -281,7 +309,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int kvm_mmu_max_mapping_level(struct kvm *kvm,
const struct kvm_memory_slot *slot, gfn_t gfn,
- kvm_pfn_t pfn, int max_level);
+ int max_level);
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);
diff --git a/arch/x86/kvm/mmu/paging.h b/arch/x86/kvm/mmu/paging.h
deleted file mode 100644
index de8ab323bb70..000000000000
--- a/arch/x86/kvm/mmu/paging.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/* Shadow paging constants/helpers that don't need to be #undef'd. */
-#ifndef __KVM_X86_PAGING_H
-#define __KVM_X86_PAGING_H
-
-#define GUEST_PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
-#define PT64_LVL_ADDR_MASK(level) \
- (GUEST_PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
- * PT64_LEVEL_BITS))) - 1))
-#define PT64_LVL_OFFSET_MASK(level) \
- (GUEST_PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
- * PT64_LEVEL_BITS))) - 1))
-#endif /* __KVM_X86_PAGING_H */
-
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index db80f7ccaa4e..f5958071220c 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -16,25 +16,21 @@
*/
/*
- * We need the mmu code to access both 32-bit and 64-bit guest ptes,
- * so the code in this file is compiled twice, once per pte size.
+ * The MMU needs to be able to access/walk 32-bit and 64-bit guest page tables,
+ * as well as guest EPT tables, so the code in this file is compiled thrice,
+ * once per guest PTE type. The per-type defines are #undef'd at the end.
*/
#if PTTYPE == 64
#define pt_element_t u64
#define guest_walker guest_walker64
#define FNAME(name) paging##64_##name
- #define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK
- #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
- #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
- #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
- #define PT_LEVEL_BITS PT64_LEVEL_BITS
+ #define PT_LEVEL_BITS 9
#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
#define PT_HAVE_ACCESSED_DIRTY(mmu) true
#ifdef CONFIG_X86_64
#define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
- #define CMPXCHG "cmpxchgq"
#else
#define PT_MAX_FULL_LEVELS 2
#endif
@@ -42,36 +38,35 @@
#define pt_element_t u32
#define guest_walker guest_walker32
#define FNAME(name) paging##32_##name
- #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
- #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
- #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
- #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
- #define PT_LEVEL_BITS PT32_LEVEL_BITS
+ #define PT_LEVEL_BITS 10
#define PT_MAX_FULL_LEVELS 2
#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
#define PT_HAVE_ACCESSED_DIRTY(mmu) true
- #define CMPXCHG "cmpxchgl"
+
+ #define PT32_DIR_PSE36_SIZE 4
+ #define PT32_DIR_PSE36_SHIFT 13
+ #define PT32_DIR_PSE36_MASK \
+ (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
#elif PTTYPE == PTTYPE_EPT
#define pt_element_t u64
#define guest_walker guest_walkerEPT
#define FNAME(name) ept_##name
- #define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK
- #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
- #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
- #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
- #define PT_LEVEL_BITS PT64_LEVEL_BITS
+ #define PT_LEVEL_BITS 9
#define PT_GUEST_DIRTY_SHIFT 9
#define PT_GUEST_ACCESSED_SHIFT 8
#define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_role.base.ad_disabled)
- #ifdef CONFIG_X86_64
- #define CMPXCHG "cmpxchgq"
- #endif
#define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
#else
#error Invalid PTTYPE value
#endif
+/* Common logic, but per-type values. These also need to be undefined. */
+#define PT_BASE_ADDR_MASK ((pt_element_t)(((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)))
+#define PT_LVL_ADDR_MASK(lvl) __PT_LVL_ADDR_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS)
+#define PT_LVL_OFFSET_MASK(lvl) __PT_LVL_OFFSET_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS)
+#define PT_INDEX(addr, lvl) __PT_INDEX(addr, lvl, PT_LEVEL_BITS)
+
#define PT_GUEST_DIRTY_MASK (1 << PT_GUEST_DIRTY_SHIFT)
#define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT)
@@ -97,6 +92,15 @@ struct guest_walker {
struct x86_exception fault;
};
+#if PTTYPE == 32
+static inline gfn_t pse36_gfn_delta(u32 gpte)
+{
+ int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
+
+ return (gpte & PT32_DIR_PSE36_MASK) << shift;
+}
+#endif
+
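
A standalone sketch (not part of the patch) of the PSE-36 arithmetic above: bits 16:13 of a 4MiB guest PDE supply physical address bits 35:32, and shifting them by 32 - 13 - PAGE_SHIFT = 7 places them at bit 20 of the gfn, i.e. bit 32 of the physical address:

/* Standalone illustration; not kernel code. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT		12
#define PT32_DIR_PSE36_SIZE	4
#define PT32_DIR_PSE36_SHIFT	13
#define PT32_DIR_PSE36_MASK \
	(((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)

static uint64_t pse36_gfn_delta(uint32_t gpte)
{
	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;

	return (gpte & PT32_DIR_PSE36_MASK) << shift;
}

int main(void)
{
	uint32_t gpte = 0x5 << PT32_DIR_PSE36_SHIFT;	/* phys bits 35:32 = 0x5 */
	uint64_t delta = pse36_gfn_delta(gpte);

	printf("gfn delta = 0x%llx (physical delta 0x%llx)\n",
	       (unsigned long long)delta,
	       (unsigned long long)(delta << PAGE_SHIFT));
	return 0;
}
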
static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
{
return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
@@ -374,7 +378,7 @@ retry_walk:
* information to fix the exit_qualification or exit_info_1
* fields.
*/
- if (unlikely(real_gpa == UNMAPPED_GVA))
+ if (unlikely(real_gpa == INVALID_GPA))
return 0;
host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gpa_to_gfn(real_gpa),
@@ -421,11 +425,13 @@ retry_walk:
gfn = gpte_to_gfn_lvl(pte, walker->level);
gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
- if (PTTYPE == 32 && walker->level > PG_LEVEL_4K && is_cpuid_PSE36())
+#if PTTYPE == 32
+ if (walker->level > PG_LEVEL_4K && is_cpuid_PSE36())
gfn += pse36_gfn_delta(pte);
+#endif
real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault);
- if (real_gpa == UNMAPPED_GVA)
+ if (real_gpa == INVALID_GPA)
return 0;
walker->gfn = real_gpa >> PAGE_SHIFT;
@@ -589,7 +595,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
if (sp->role.direct)
return __direct_pte_prefetch(vcpu, sp, sptep);
- i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
+ i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1);
spte = sp->spt + i;
for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
@@ -642,14 +648,13 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
gfn_t table_gfn;
clear_sp_write_flooding_count(it.sptep);
- drop_large_spte(vcpu, it.sptep);
-
- sp = NULL;
- if (!is_shadow_present_pte(*it.sptep)) {
- table_gfn = gw->table_gfn[it.level - 2];
- access = gw->pt_access[it.level - 2];
- sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr,
- it.level-1, false, access);
+
+ table_gfn = gw->table_gfn[it.level - 2];
+ access = gw->pt_access[it.level - 2];
+ sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn,
+ false, access);
+
+ if (sp != ERR_PTR(-EEXIST)) {
/*
* We must synchronize the pagetable before linking it
* because the guest doesn't need to flush tlb when
@@ -678,7 +683,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
goto out_gpte_changed;
- if (sp)
+ if (sp != ERR_PTR(-EEXIST))
link_shadow_page(vcpu, it.sptep, sp);
}
@@ -702,16 +707,15 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
validate_direct_spte(vcpu, it.sptep, direct_access);
- drop_large_spte(vcpu, it.sptep);
+ sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn,
+ true, direct_access);
+ if (sp == ERR_PTR(-EEXIST))
+ continue;
- if (!is_shadow_present_pte(*it.sptep)) {
- sp = kvm_mmu_get_page(vcpu, base_gfn, fault->addr,
- it.level - 1, true, direct_access);
- link_shadow_page(vcpu, it.sptep, sp);
- if (fault->huge_page_disallowed &&
- fault->req_level >= it.level)
- account_huge_nx_page(vcpu->kvm, sp);
- }
+ link_shadow_page(vcpu, it.sptep, sp);
+ if (fault->huge_page_disallowed &&
+ fault->req_level >= it.level)
+ account_huge_nx_page(vcpu->kvm, sp);
}
if (WARN_ON_ONCE(it.level != fault->goal_level))
@@ -888,7 +892,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
WARN_ON(sp->role.level != PG_LEVEL_4K);
if (PTTYPE == 32)
- offset = sp->role.quadrant << PT64_LEVEL_BITS;
+ offset = sp->role.quadrant << SPTE_LEVEL_BITS;
return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
}
@@ -929,7 +933,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
break;
pte_gpa = FNAME(get_level1_sp_gpa)(sp);
- pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
+ pte_gpa += spte_index(sptep) * sizeof(pt_element_t);
mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL);
if (is_shadow_present_pte(old_spte))
@@ -958,7 +962,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
struct x86_exception *exception)
{
struct guest_walker walker;
- gpa_t gpa = UNMAPPED_GVA;
+ gpa_t gpa = INVALID_GPA;
int r;
#ifndef CONFIG_X86_64
@@ -978,7 +982,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
}
/*
- * Using the cached information from sp->gfns is safe because:
+ * Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()) is
+ * safe because:
* - The spte has a reference to the struct page, so the pfn for a given gfn
* can't change unless all sptes pointing to it are nuked first.
*
@@ -1023,7 +1028,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
- for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
+ for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
u64 *sptep, spte;
struct kvm_memory_slot *slot;
unsigned pte_access;
@@ -1053,12 +1058,23 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
continue;
- if (gfn != sp->gfns[i]) {
+ /*
+ * Drop the SPTE if the new protections would result in a RWX=0
+ * SPTE or if the gfn is changing. The RWX=0 case only affects
+ * EPT with execute-only support, i.e. EPT without an effective
+ * "present" bit, as all other paging modes will create a
+ * read-only SPTE if pte_access is zero.
+ */
+ if ((!pte_access && !shadow_present_mask) ||
+ gfn != kvm_mmu_page_get_gfn(sp, i)) {
drop_spte(vcpu->kvm, &sp->spt[i]);
flush = true;
continue;
}
+ /* Update the shadowed access bits in case they changed. */
+ kvm_mmu_page_set_access(sp, i, pte_access);
+
sptep = &sp->spt[i];
spte = *sptep;
host_writable = spte & shadow_host_writable_mask;
@@ -1070,6 +1086,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
flush |= mmu_spte_update(sptep, spte);
}
+ /*
+ * Note, any flush is purely for KVM's correctness, e.g. when dropping
+ * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier
+ * unmap or dirty logging event doesn't fail to flush. The guest is
+ * responsible for flushing the TLB to ensure any changes in protection
+ * bits are recognized, i.e. until the guest flushes or page faults on
+ * a relevant address, KVM is architecturally allowed to let vCPUs use
+ * cached translations with the old protection bits.
+ */
return flush;
}
@@ -1084,7 +1109,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
#undef PT_MAX_FULL_LEVELS
#undef gpte_to_gfn
#undef gpte_to_gfn_lvl
-#undef CMPXCHG
#undef PT_GUEST_ACCESSED_MASK
#undef PT_GUEST_DIRTY_MASK
#undef PT_GUEST_DIRTY_SHIFT
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index b5960bbde7f7..7314d27d57a4 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -33,6 +33,7 @@ u64 __read_mostly shadow_mmio_value;
u64 __read_mostly shadow_mmio_mask;
u64 __read_mostly shadow_mmio_access_mask;
u64 __read_mostly shadow_present_mask;
+u64 __read_mostly shadow_memtype_mask;
u64 __read_mostly shadow_me_value;
u64 __read_mostly shadow_me_mask;
u64 __read_mostly shadow_acc_track_mask;
@@ -129,6 +130,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 spte = SPTE_MMU_PRESENT_MASK;
bool wrprot = false;
+ WARN_ON_ONCE(!pte_access && !shadow_present_mask);
+
if (sp->role.ad_disabled)
spte |= SPTE_TDP_AD_DISABLED_MASK;
else if (kvm_mmu_page_ad_need_write_protect(sp))
@@ -145,7 +148,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
spte |= spte_shadow_accessed_mask(spte);
if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
- is_nx_huge_page_enabled()) {
+ is_nx_huge_page_enabled(vcpu->kvm)) {
pte_access &= ~ACC_EXEC_MASK;
}
@@ -159,10 +162,10 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (level > PG_LEVEL_4K)
spte |= PT_PAGE_SIZE_MASK;
- if (tdp_enabled)
- spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn,
- kvm_is_mmio_pfn(pfn));
+ if (shadow_memtype_mask)
+ spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn,
+ kvm_is_mmio_pfn(pfn));
if (host_writable)
spte |= shadow_host_writable_mask;
else
@@ -244,10 +247,10 @@ static u64 make_spte_executable(u64 spte)
* This is used during huge page splitting to build the SPTEs that make up the
* new page table.
*/
-u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index)
+u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, union kvm_mmu_page_role role,
+ int index)
{
u64 child_spte;
- int child_level;
if (WARN_ON_ONCE(!is_shadow_present_pte(huge_spte)))
return 0;
@@ -256,23 +259,23 @@ u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index)
return 0;
child_spte = huge_spte;
- child_level = huge_level - 1;
/*
* The child_spte already has the base address of the huge page being
* split. So we just have to OR in the offset to the page at the next
* lower level for the given index.
*/
- child_spte |= (index * KVM_PAGES_PER_HPAGE(child_level)) << PAGE_SHIFT;
+ child_spte |= (index * KVM_PAGES_PER_HPAGE(role.level)) << PAGE_SHIFT;
- if (child_level == PG_LEVEL_4K) {
+ if (role.level == PG_LEVEL_4K) {
child_spte &= ~PT_PAGE_SIZE_MASK;
/*
- * When splitting to a 4K page, mark the page executable as the
- * NX hugepage mitigation no longer applies.
+ * When splitting to a 4K page where execution is allowed, mark
+ * the page executable as the NX hugepage mitigation no longer
+ * applies.
*/
- if (is_nx_huge_page_enabled())
+ if ((role.access & ACC_EXEC_MASK) && is_nx_huge_page_enabled(kvm))
child_spte = make_spte_executable(child_spte);
}
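
For reference, a standalone sketch (not part of the patch) of the offset that huge page splitting ORs into each child SPTE; role.level is the level of the new child entries, so each index steps by KVM_PAGES_PER_HPAGE(child level) pages:

/* Standalone illustration; not kernel code. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define KVM_PAGES_PER_HPAGE(level) (1ULL << (((level) - 1) * 9))

int main(void)
{
	uint64_t huge_base = 0x40000000ULL;	/* 1GiB-aligned huge page */

	/* Splitting a 1GiB page: the children are 2MiB entries (level 2). */
	for (int index = 0; index < 4; index++) {
		uint64_t child = huge_base |
			((index * KVM_PAGES_PER_HPAGE(2)) << PAGE_SHIFT);

		printf("index %d -> child base 0x%llx\n", index,
		       (unsigned long long)child);
	}
	return 0;
}
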
@@ -299,7 +302,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
{
u64 new_spte;
- new_spte = old_spte & ~PT64_BASE_ADDR_MASK;
+ new_spte = old_spte & ~SPTE_BASE_ADDR_MASK;
new_spte |= (u64)new_pfn << PAGE_SHIFT;
new_spte &= ~PT_WRITABLE_MASK;
@@ -389,6 +392,13 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
shadow_nx_mask = 0ull;
shadow_x_mask = VMX_EPT_EXECUTABLE_MASK;
shadow_present_mask = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK;
+ /*
+ * EPT overrides the host MTRRs, and so KVM must program the desired
+ * memtype directly into the SPTEs. Note, this mask is just the mask
+ * of all bits that factor into the memtype; the actual memtype must be
+ * dynamically calculated, e.g. to ensure host MMIO is mapped UC.
+ */
+ shadow_memtype_mask = VMX_EPT_MT_MASK | VMX_EPT_IPAT_BIT;
shadow_acc_track_mask = VMX_EPT_RWX_MASK;
shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE;
shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE;
@@ -439,6 +449,13 @@ void kvm_mmu_reset_all_pte_masks(void)
shadow_nx_mask = PT64_NX_MASK;
shadow_x_mask = 0;
shadow_present_mask = PT_PRESENT_MASK;
+
+ /*
+ * For shadow paging and NPT, KVM uses PAT entry '0' to encode WB
+ * memtype in the SPTEs, i.e. relies on host MTRRs to provide the
+ * correct memtype (WB is the "weakest" memtype).
+ */
+ shadow_memtype_mask = 0;
shadow_acc_track_mask = 0;
shadow_me_mask = 0;
shadow_me_value = 0;
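The make_huge_page_split_spte() hunk above keeps the huge page's base address in each child SPTE and only ORs in the child's offset at the next lower level. A minimal userspace sketch of that offset arithmetic, with local constants standing in for KVM's PAGE_SHIFT/KVM_PAGES_PER_HPAGE (not kernel code):

/*
 * Child-SPTE offset math: each child gains index * pages-covered-at-its-level,
 * shifted into the physical address bits. All names below are local stand-ins.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define LEVEL_BITS	9	/* 512 entries per page table */

/* 4 KiB pages covered by one entry at 'level' (1 = 4K, 2 = 2M, 3 = 1G) */
static uint64_t pages_per_entry(int level)
{
	return 1ULL << ((level - 1) * LEVEL_BITS);
}

int main(void)
{
	uint64_t huge_pa = 0x40000000;	/* example 2M-aligned physical address */
	int child_level = 1;		/* splitting a 2M mapping into 4K children */

	for (int index = 0; index < 4; index++) {
		uint64_t child_pa = huge_pa +
			((uint64_t)index * pages_per_entry(child_level) << PAGE_SHIFT);

		printf("child %d -> 0x%llx\n", index, (unsigned long long)child_pa);
	}
	return 0;
}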
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 0127bb6e3c7d..cabe3fbb4f39 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -36,12 +36,12 @@ extern bool __read_mostly enable_mmio_caching;
static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
-#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
+#define SPTE_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
#else
-#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
+#define SPTE_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
#endif
-#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
+#define SPTE_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
| shadow_x_mask | shadow_nx_mask | shadow_me_mask)
#define ACC_EXEC_MASK 1
@@ -50,17 +50,13 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
/* The mask for the R/X bits in EPT PTEs */
-#define PT64_EPT_READABLE_MASK 0x1ull
-#define PT64_EPT_EXECUTABLE_MASK 0x4ull
+#define SPTE_EPT_READABLE_MASK 0x1ull
+#define SPTE_EPT_EXECUTABLE_MASK 0x4ull
-#define PT64_LEVEL_BITS 9
-
-#define PT64_LEVEL_SHIFT(level) \
- (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
-
-#define PT64_INDEX(address, level)\
- (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
-#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
+#define SPTE_LEVEL_BITS 9
+#define SPTE_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, SPTE_LEVEL_BITS)
+#define SPTE_INDEX(address, level) __PT_INDEX(address, level, SPTE_LEVEL_BITS)
+#define SPTE_ENT_PER_PAGE __PT_ENT_PER_PAGE(SPTE_LEVEL_BITS)
/*
* The mask/shift to use for saving the original R/X bits when marking the PTE
@@ -69,8 +65,8 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
* restored only when a write is attempted to the page. This mask obviously
* must not overlap the A/D type mask.
*/
-#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \
- PT64_EPT_EXECUTABLE_MASK)
+#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (SPTE_EPT_READABLE_MASK | \
+ SPTE_EPT_EXECUTABLE_MASK)
#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54
#define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \
SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
@@ -151,6 +147,7 @@ extern u64 __read_mostly shadow_mmio_value;
extern u64 __read_mostly shadow_mmio_mask;
extern u64 __read_mostly shadow_mmio_access_mask;
extern u64 __read_mostly shadow_present_mask;
+extern u64 __read_mostly shadow_memtype_mask;
extern u64 __read_mostly shadow_me_value;
extern u64 __read_mostly shadow_me_mask;
@@ -194,6 +191,12 @@ static inline bool is_removed_spte(u64 spte)
return spte == REMOVED_SPTE;
}
+/* Get an SPTE's index into its parent's page table (and the spt array). */
+static inline int spte_index(u64 *sptep)
+{
+ return ((unsigned long)sptep / sizeof(*sptep)) & (SPTE_ENT_PER_PAGE - 1);
+}
+
/*
* In some cases, we need to preserve the GFN of a non-present or reserved
* SPTE when we usurp the upper five bits of the physical address space to
@@ -282,7 +285,7 @@ static inline bool is_executable_pte(u64 spte)
static inline kvm_pfn_t spte_to_pfn(u64 pte)
{
- return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+ return (pte & SPTE_BASE_ADDR_MASK) >> PAGE_SHIFT;
}
static inline bool is_accessed_spte(u64 spte)
@@ -425,7 +428,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
u64 old_spte, bool prefetch, bool can_unsync,
bool host_writable, u64 *new_spte);
-u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index);
+u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte,
+ union kvm_mmu_page_role role, int index);
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access);
u64 mark_spte_for_access_track(u64 spte);
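The new spte_index() helper recovers an entry's slot in its parent page table purely from the entry's address: divide by the entry size and mask by entries-per-page. A standalone sketch of the same arithmetic, using an aligned userspace array as a stand-in for a shadow page table (none of these names are KVM symbols):

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define ENT_PER_PAGE	512	/* 4 KiB page / 8-byte entries */

static int entry_index(const uint64_t *entry)
{
	return ((uintptr_t)entry / sizeof(*entry)) & (ENT_PER_PAGE - 1);
}

int main(void)
{
	/* page-aligned allocation so the index math is exact */
	uint64_t *table = aligned_alloc(4096, 4096);

	if (!table)
		return 1;

	printf("index of &table[0]   = %d\n", entry_index(&table[0]));
	printf("index of &table[137] = %d\n", entry_index(&table[137]));
	printf("index of &table[511] = %d\n", entry_index(&table[511]));

	free(table);
	return 0;
}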
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index ee4802d7b36c..39b48e7d7d1a 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -11,7 +11,7 @@
static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
{
iter->sptep = iter->pt_path[iter->level - 1] +
- SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level);
+ SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level);
iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
}
@@ -116,8 +116,8 @@ static bool try_step_side(struct tdp_iter *iter)
* Check if the iterator is already at the end of the current page
* table.
*/
- if (SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level) ==
- (PT64_ENT_PER_PAGE - 1))
+ if (SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level) ==
+ (SPTE_ENT_PER_PAGE - 1))
return false;
iter->gfn += KVM_PAGES_PER_HPAGE(iter->level);
@@ -146,15 +146,6 @@ static bool try_step_up(struct tdp_iter *iter)
}
/*
- * Step the iterator back up a level in the paging structure. Should only be
- * used when the iterator is below the root level.
- */
-void tdp_iter_step_up(struct tdp_iter *iter)
-{
- WARN_ON(!try_step_up(iter));
-}
-
-/*
* Step to the next SPTE in a pre-order traversal of the paging structure.
* To get to the next SPTE, the iterator either steps down towards the goal
* GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index adfca0cf94d3..f0af385c56e0 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -114,6 +114,5 @@ void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
int min_level, gfn_t next_last_level_gfn);
void tdp_iter_next(struct tdp_iter *iter);
void tdp_iter_restart(struct tdp_iter *iter);
-void tdp_iter_step_up(struct tdp_iter *iter);
#endif /* __KVM_X86_MMU_TDP_ITER_H */
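The renamed SPTE_INDEX()/SPTE_LEVEL_SHIFT() macros used by the TDP iterator extract nine index bits per level, starting at PAGE_SHIFT for level 1. A hedged userspace sketch of that level/index math with local constants (not the kernel macros):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define LEVEL_BITS	9
#define ENT_PER_PAGE	(1u << LEVEL_BITS)

static unsigned int pt_index(uint64_t gfn, int level)
{
	uint64_t addr = gfn << PAGE_SHIFT;
	unsigned int shift = PAGE_SHIFT + (level - 1) * LEVEL_BITS;

	return (addr >> shift) & (ENT_PER_PAGE - 1);
}

int main(void)
{
	uint64_t gfn = 0x123456;	/* arbitrary example GFN */

	for (int level = 1; level <= 4; level++)
		printf("level %d index = %u\n", level, pt_index(gfn, level));

	/* The iterator's "last entry in this table?" side-step check: */
	printf("at end of level-1 table: %d\n",
	       pt_index(gfn, 1) == ENT_PER_PAGE - 1);
	return 0;
}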
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 7b9265d67131..bf2ccf9debca 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -425,7 +425,7 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
tdp_mmu_unlink_sp(kvm, sp, shared);
- for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
+ for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
tdp_ptep_t sptep = pt + i;
gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
u64 old_spte;
@@ -633,7 +633,6 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
u64 new_spte)
{
u64 *sptep = rcu_dereference(iter->sptep);
- u64 old_spte;
/*
* The caller is responsible for ensuring the old SPTE is not a REMOVED
@@ -649,17 +648,8 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
* Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
* does not hold the mmu_lock.
*/
- old_spte = cmpxchg64(sptep, iter->old_spte, new_spte);
- if (old_spte != iter->old_spte) {
- /*
- * The page table entry was modified by a different logical
- * CPU. Refresh iter->old_spte with the current value so the
- * caller operates on fresh data, e.g. if it retries
- * tdp_mmu_set_spte_atomic().
- */
- iter->old_spte = old_spte;
+ if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
return -EBUSY;
- }
__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
new_spte, iter->level, true);
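The switch from cmpxchg64() to try_cmpxchg64() above lets the explicit iter->old_spte refresh be dropped because a try_cmpxchg-style primitive writes the current memory value back into the caller's expected-value variable when the compare fails. A small illustration of that contract using the GCC/Clang __atomic builtin (userspace sketch, not the kernel helper):

#include <stdio.h>
#include <stdint.h>

static int try_cmpxchg64(uint64_t *ptr, uint64_t *old, uint64_t new)
{
	/* On failure, *old is updated with the value currently in *ptr. */
	return __atomic_compare_exchange_n(ptr, old, new, 0,
					   __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}

int main(void)
{
	uint64_t spte = 0x1000;		/* current value "in memory" */
	uint64_t expected = 0x2000;	/* stale snapshot held by the caller */

	if (!try_cmpxchg64(&spte, &expected, 0x3000))
		printf("lost the race, refreshed snapshot = 0x%llx\n",
		       (unsigned long long)expected);	/* now 0x1000 */

	/* Retry with the refreshed snapshot; this time the exchange succeeds. */
	if (try_cmpxchg64(&spte, &expected, 0x3000))
		printf("installed new value = 0x%llx\n", (unsigned long long)spte);
	return 0;
}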
@@ -934,9 +924,6 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
}
/*
- * Zap leafs SPTEs for the range of gfns, [start, end). Returns true if SPTEs
- * have been cleared and a TLB flush is needed before releasing the MMU lock.
- *
* If can_yield is true, will release the MMU lock and reschedule if the
* scheduler needs the CPU or there is contention on the MMU lock. If this
* function cannot yield, it will not release the MMU lock or reschedule and
@@ -979,10 +966,9 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
}
/*
- * Tears down the mappings for the range of gfns, [start, end), and frees the
- * non-root pages mapping GFNs strictly within that range. Returns true if
- * SPTEs have been cleared and a TLB flush is needed before releasing the
- * MMU lock.
+ * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
+ * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
+ * more SPTEs were zapped since the MMU lock was last acquired.
*/
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
bool can_yield, bool flush)
@@ -1487,8 +1473,8 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
* No need for atomics when writing to sp->spt since the page table has
* not been linked in yet and thus is not reachable from any other CPU.
*/
- for (i = 0; i < PT64_ENT_PER_PAGE; i++)
- sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i);
+ for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
+ sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);
/*
* Replace the huge spte with a pointer to the populated lower level
@@ -1507,7 +1493,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
* are overwriting from the page stats. But we have to manually update
* the page stats with the new present child pages.
*/
- kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE);
+ kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
out:
trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
@@ -1731,10 +1717,6 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}
-/*
- * Clear leaf entries which could be replaced by large mappings, for
- * GFNs within the slot.
- */
static void zap_collapsible_spte_range(struct kvm *kvm,
struct kvm_mmu_page *root,
const struct kvm_memory_slot *slot)
@@ -1743,61 +1725,52 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
gfn_t end = start + slot->npages;
struct tdp_iter iter;
int max_mapping_level;
- kvm_pfn_t pfn;
rcu_read_lock();
- tdp_root_for_each_pte(iter, root, start, end) {
+ for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
+retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
- if (!is_shadow_present_pte(iter.old_spte) ||
- !is_last_spte(iter.old_spte, iter.level))
+ if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
+ !is_shadow_present_pte(iter.old_spte))
continue;
/*
- * This is a leaf SPTE. Check if the PFN it maps can
- * be mapped at a higher level.
+ * Don't zap leaf SPTEs; if a leaf SPTE could be replaced with
+ * a large page size, then its parent would have been zapped
+ * instead of stepping down.
*/
- pfn = spte_to_pfn(iter.old_spte);
-
- if (kvm_is_reserved_pfn(pfn))
+ if (is_last_spte(iter.old_spte, iter.level))
continue;
- max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
- iter.gfn, pfn, PG_LEVEL_NUM);
-
- WARN_ON(max_mapping_level < iter.level);
-
/*
- * If this page is already mapped at the highest
- * viable level, there's nothing more to do.
+ * If iter.gfn resides outside of the slot, i.e. the page for
+ * the current level overlaps but is not contained by the slot,
+ * then the SPTE can't be made huge. More importantly, trying
+ * to query that info from slot->arch.lpage_info will cause an
+ * out-of-bounds access.
*/
- if (max_mapping_level == iter.level)
+ if (iter.gfn < start || iter.gfn >= end)
continue;
- /*
- * The page can be remapped at a higher level, so step
- * up to zap the parent SPTE.
- */
- while (max_mapping_level > iter.level)
- tdp_iter_step_up(&iter);
+ max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
+ iter.gfn, PG_LEVEL_NUM);
+ if (max_mapping_level < iter.level)
+ continue;
/* Note, a successful atomic zap also does a remote TLB flush. */
- tdp_mmu_zap_spte_atomic(kvm, &iter);
-
- /*
- * If the atomic zap fails, the iter will recurse back into
- * the same subtree to retry.
- */
+ if (tdp_mmu_zap_spte_atomic(kvm, &iter))
+ goto retry;
}
rcu_read_unlock();
}
/*
- * Clear non-leaf entries (and free associated page tables) which could
- * be replaced by large mappings, for GFNs within the slot.
+ * Zap non-leaf SPTEs (and free their associated page tables) which could
+ * be replaced by huge pages, for GFNs within the slot.
*/
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *slot)
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 3f868fed9114..02f9e4f245bd 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -16,6 +16,7 @@
#include <linux/bsearch.h>
#include <linux/sort.h>
#include <asm/perf_event.h>
+#include <asm/cpu_device_id.h>
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
@@ -24,6 +25,15 @@
/* This is enough to filter the vast majority of currently defined events. */
#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300
+struct x86_pmu_capability __read_mostly kvm_pmu_cap;
+EXPORT_SYMBOL_GPL(kvm_pmu_cap);
+
+static const struct x86_cpu_id vmx_icl_pebs_cpu[] = {
+ X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL),
+ X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL),
+ {}
+};
+
/* NOTE:
* - Each perf counter is defined as "struct kvm_pmc";
* - There are two types of perf counters: general purpose (gp) and fixed.
@@ -34,7 +44,9 @@
* However AMD doesn't support fixed-counters;
* - There are three types of index to access perf counters (PMC):
* 1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
- * has MSR_K7_PERFCTRn.
+ * has MSR_K7_PERFCTRn and, for families 15H and later,
+ * MSR_F15H_PERF_CTRn, where MSR_F15H_PERF_CTR[0-3] are
+ * aliased to MSR_K7_PERFCTRn.
* 2. MSR Index (named idx): This normally is used by RDPMC instruction.
* For instance AMD RDPMC instruction uses 0000_0003h in ECX to access
* C001_0007h (MSR_K7_PERCTR3). Intel has a similar mechanism, except
@@ -46,7 +58,8 @@
* between pmc and perf counters is as the following:
* * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters
* [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
- * * AMD: [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters
+ * * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
+ * and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
*/
static struct kvm_pmu_ops kvm_pmu_ops __read_mostly;
@@ -86,15 +99,22 @@ static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+ bool skip_pmi = false;
/* Ignore counters that have been reprogrammed already. */
if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi))
return;
- __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+ if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
+ /* Indicate PEBS overflow PMI to guest. */
+ skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT,
+ (unsigned long *)&pmu->global_status);
+ } else {
+ __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+ }
kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
- if (!pmc->intr)
+ if (!pmc->intr || skip_pmi)
return;
/*
@@ -124,6 +144,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
u64 config, bool exclude_user,
bool exclude_kernel, bool intr)
{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
struct perf_event *event;
struct perf_event_attr attr = {
.type = type,
@@ -135,9 +156,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
.exclude_kernel = exclude_kernel,
.config = config,
};
-
- if (type == PERF_TYPE_HARDWARE && config >= PERF_COUNT_HW_MAX)
- return;
+ bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable);
attr.sample_period = get_sample_period(pmc, pmc->counter);
@@ -150,6 +169,25 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
*/
attr.sample_period = 0;
}
+ if (pebs) {
+ /*
+ * A non-zero precision level for a guest event turns the ordinary
+ * guest event into a guest PEBS event and triggers the host
+ * PEBS PMI handler to determine whether the PEBS overflow PMI
+ * comes from the host counters or the guest.
+ *
+ * For most PEBS hardware events, the difference in the software
+ * precision levels of guest and host PEBS events will not affect
+ * the accuracy of the PEBS profiling result, because the "event IP"
+ * in the PEBS record is calibrated on the guest side.
+ *
+ * On Icelake everything is fine. Other hardware (GLC+, TNT+) that
+ * could possibly care here is unsupported and needs changes.
+ */
+ attr.precise_ip = 1;
+ if (x86_match_cpu(vmx_icl_pebs_cpu) && pmc->idx == 32)
+ attr.precise_ip = 3;
+ }
event = perf_event_create_kernel_counter(&attr, -1, current,
kvm_perf_overflow, pmc);
@@ -163,7 +201,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
pmc_to_pmu(pmc)->event_count++;
clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
pmc->is_paused = false;
- pmc->intr = intr;
+ pmc->intr = intr || pebs;
}
static void pmc_pause_counter(struct kvm_pmc *pmc)
@@ -189,6 +227,10 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
get_sample_period(pmc, pmc->counter)))
return false;
+ if (!test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) &&
+ pmc->perf_event->attr.precise_ip)
+ return false;
+
/* reuse perf_event to serve as pmc_reprogram_counter() does*/
perf_event_enable(pmc->perf_event);
pmc->is_paused = false;
@@ -205,115 +247,83 @@ static int cmp_u64(const void *pa, const void *pb)
return (a > b) - (a < b);
}
-void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
+static bool check_pmu_event_filter(struct kvm_pmc *pmc)
{
- u64 config;
- u32 type = PERF_TYPE_RAW;
- struct kvm *kvm = pmc->vcpu->kvm;
struct kvm_pmu_event_filter *filter;
- struct kvm_pmu *pmu = vcpu_to_pmu(pmc->vcpu);
+ struct kvm *kvm = pmc->vcpu->kvm;
bool allow_event = true;
+ __u64 key;
+ int idx;
- if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
- printk_once("kvm pmu: pin control bit is ignored\n");
-
- pmc->eventsel = eventsel;
-
- pmc_pause_counter(pmc);
-
- if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc))
- return;
+ if (!static_call(kvm_x86_pmu_hw_event_available)(pmc))
+ return false;
filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
- if (filter) {
- __u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB;
+ if (!filter)
+ goto out;
+ if (pmc_is_gp(pmc)) {
+ key = pmc->eventsel & AMD64_RAW_EVENT_MASK_NB;
if (bsearch(&key, filter->events, filter->nevents,
sizeof(__u64), cmp_u64))
allow_event = filter->action == KVM_PMU_EVENT_ALLOW;
else
allow_event = filter->action == KVM_PMU_EVENT_DENY;
+ } else {
+ idx = pmc->idx - INTEL_PMC_IDX_FIXED;
+ if (filter->action == KVM_PMU_EVENT_DENY &&
+ test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
+ allow_event = false;
+ if (filter->action == KVM_PMU_EVENT_ALLOW &&
+ !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
+ allow_event = false;
}
- if (!allow_event)
- return;
-
- if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
- ARCH_PERFMON_EVENTSEL_INV |
- ARCH_PERFMON_EVENTSEL_CMASK |
- HSW_IN_TX |
- HSW_IN_TX_CHECKPOINTED))) {
- config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc);
- if (config != PERF_COUNT_HW_MAX)
- type = PERF_TYPE_HARDWARE;
- }
-
- if (type == PERF_TYPE_RAW)
- config = eventsel & pmu->raw_event_mask;
-
- if (pmc->current_config == eventsel && pmc_resume_counter(pmc))
- return;
-
- pmc_release_perf_event(pmc);
- pmc->current_config = eventsel;
- pmc_reprogram_counter(pmc, type, config,
- !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
- !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
- eventsel & ARCH_PERFMON_EVENTSEL_INT);
+out:
+ return allow_event;
}
-EXPORT_SYMBOL_GPL(reprogram_gp_counter);
-void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
+void reprogram_counter(struct kvm_pmc *pmc)
{
- unsigned en_field = ctrl & 0x3;
- bool pmi = ctrl & 0x8;
- struct kvm_pmu_event_filter *filter;
- struct kvm *kvm = pmc->vcpu->kvm;
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+ u64 eventsel = pmc->eventsel;
+ u64 new_config = eventsel;
+ u8 fixed_ctr_ctrl;
pmc_pause_counter(pmc);
- if (!en_field || !pmc_is_enabled(pmc))
+ if (!pmc_speculative_in_use(pmc) || !pmc_is_enabled(pmc))
return;
- filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
- if (filter) {
- if (filter->action == KVM_PMU_EVENT_DENY &&
- test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
- return;
- if (filter->action == KVM_PMU_EVENT_ALLOW &&
- !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
- return;
- }
-
- if (pmc->current_config == (u64)ctrl && pmc_resume_counter(pmc))
+ if (!check_pmu_event_filter(pmc))
return;
- pmc_release_perf_event(pmc);
-
- pmc->current_config = (u64)ctrl;
- pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
- static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc),
- !(en_field & 0x2), /* exclude user */
- !(en_field & 0x1), /* exclude kernel */
- pmi);
-}
-EXPORT_SYMBOL_GPL(reprogram_fixed_counter);
+ if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
+ printk_once("kvm pmu: pin control bit is ignored\n");
-void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx)
-{
- struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, pmc_idx);
+ if (pmc_is_fixed(pmc)) {
+ fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
+ pmc->idx - INTEL_PMC_IDX_FIXED);
+ if (fixed_ctr_ctrl & 0x1)
+ eventsel |= ARCH_PERFMON_EVENTSEL_OS;
+ if (fixed_ctr_ctrl & 0x2)
+ eventsel |= ARCH_PERFMON_EVENTSEL_USR;
+ if (fixed_ctr_ctrl & 0x8)
+ eventsel |= ARCH_PERFMON_EVENTSEL_INT;
+ new_config = (u64)fixed_ctr_ctrl;
+ }
- if (!pmc)
+ if (pmc->current_config == new_config && pmc_resume_counter(pmc))
return;
- if (pmc_is_gp(pmc))
- reprogram_gp_counter(pmc, pmc->eventsel);
- else {
- int idx = pmc_idx - INTEL_PMC_IDX_FIXED;
- u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx);
+ pmc_release_perf_event(pmc);
- reprogram_fixed_counter(pmc, ctrl, idx);
- }
+ pmc->current_config = new_config;
+ pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
+ (eventsel & pmu->raw_event_mask),
+ !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
+ !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
+ eventsel & ARCH_PERFMON_EVENTSEL_INT);
}
EXPORT_SYMBOL_GPL(reprogram_counter);
@@ -329,8 +339,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
clear_bit(bit, pmu->reprogram_pmi);
continue;
}
-
- reprogram_counter(pmu, bit);
+ reprogram_counter(pmc);
}
/*
@@ -471,17 +480,6 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu)
kvm_pmu_refresh(vcpu);
}
-static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
-{
- struct kvm_pmu *pmu = pmc_to_pmu(pmc);
-
- if (pmc_is_fixed(pmc))
- return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
- pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3;
-
- return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
-}
-
/* Release perf_events for vPMCs that have been unused for a full time slice. */
void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
{
@@ -514,13 +512,12 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
{
- struct kvm_pmu *pmu = pmc_to_pmu(pmc);
u64 prev_count;
prev_count = pmc->counter;
pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc);
- reprogram_counter(pmu, pmc->idx);
+ reprogram_counter(pmc);
if (pmc->counter < prev_count)
__kvm_perf_overflow(pmc, false);
}
@@ -528,13 +525,8 @@ static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
unsigned int perf_hw_id)
{
- u64 old_eventsel = pmc->eventsel;
- unsigned int config;
-
- pmc->eventsel &= (ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK);
- config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc);
- pmc->eventsel = old_eventsel;
- return config == perf_hw_id;
+ return !((pmc->eventsel ^ perf_get_hw_event_config(perf_hw_id)) &
+ AMD64_RAW_EVENT_MASK_NB);
}
static inline bool cpl_is_matched(struct kvm_pmc *pmc)
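The new eventsel_match_perf_hw_id() compares two event selectors by XORing them and masking off everything except the bits that identify the event. A minimal sketch of that masked-compare idiom; MATCH_MASK below is a simplified stand-in for AMD64_RAW_EVENT_MASK_NB, chosen for illustration only:

#include <stdio.h>
#include <stdint.h>

#define EVENTSEL_EVENT	0x00ffULL	/* low 8 event-select bits */
#define EVENTSEL_UMASK	0xff00ULL	/* unit mask */
#define MATCH_MASK	(EVENTSEL_EVENT | EVENTSEL_UMASK)

static int events_match(uint64_t a, uint64_t b)
{
	/* Zero XOR under the mask means the identifying fields are equal. */
	return !((a ^ b) & MATCH_MASK);
}

int main(void)
{
	uint64_t guest_eventsel = 0x4300c0;	/* enable/OS/USR bits + event 0xc0 */
	uint64_t hw_event = 0xc0;		/* e.g. retired instructions */

	printf("match 0xc0: %d\n", events_match(guest_eventsel, hw_event));	/* 1 */
	printf("match 0xc2: %d\n", events_match(guest_eventsel, 0xc2));		/* 0 */
	return 0;
}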
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index e745f443b6a8..5cc5721f260b 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -8,6 +8,9 @@
#define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu))
#define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu)
+#define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | \
+ MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)
+
/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf)
@@ -22,7 +25,7 @@ struct kvm_event_hw_type_mapping {
};
struct kvm_pmu_ops {
- unsigned int (*pmc_perf_hw_id)(struct kvm_pmc *pmc);
+ bool (*hw_event_available)(struct kvm_pmc *pmc);
bool (*pmc_is_enabled)(struct kvm_pmc *pmc);
struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx);
struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu,
@@ -144,9 +147,43 @@ static inline void pmc_update_sample_period(struct kvm_pmc *pmc)
get_sample_period(pmc, pmc->counter));
}
-void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel);
-void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx);
-void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx);
+static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ if (pmc_is_fixed(pmc))
+ return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
+ pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3;
+
+ return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
+}
+
+extern struct x86_pmu_capability kvm_pmu_cap;
+
+static inline void kvm_init_pmu_capability(void)
+{
+ bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL;
+
+ perf_get_x86_pmu_capability(&kvm_pmu_cap);
+
+ /*
+ * For Intel, only support a guest architectural PMU
+ * on a host with an architectural PMU.
+ */
+ if ((is_intel && !kvm_pmu_cap.version) || !kvm_pmu_cap.num_counters_gp)
+ enable_pmu = false;
+
+ if (!enable_pmu) {
+ memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap));
+ return;
+ }
+
+ kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2);
+ kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed,
+ KVM_PMC_MAX_FIXED);
+}
+
+void reprogram_counter(struct kvm_pmc *pmc);
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
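reprogram_counter() above translates one 4-bit field of IA32_FIXED_CTR_CTRL (enable-OS, enable-USR, PMI) into the equivalent eventsel flags, using the same fixed_ctrl_field() accessor declared in this header. A standalone sketch of that decode, with the bit meanings reproduced locally:

#include <stdio.h>
#include <stdint.h>

#define FIXED_EN_OS	0x1	/* count in ring 0 */
#define FIXED_EN_USR	0x2	/* count in rings 1-3 */
#define FIXED_PMI	0x8	/* raise a PMI on overflow */

static uint8_t fixed_ctrl_field(uint64_t ctrl_reg, int idx)
{
	/* Each fixed counter owns one 4-bit nibble of the control register. */
	return (ctrl_reg >> (idx * 4)) & 0xf;
}

int main(void)
{
	uint64_t fixed_ctr_ctrl = 0xb0;	/* counter 1: OS + USR + PMI enabled */
	uint8_t field = fixed_ctrl_field(fixed_ctr_ctrl, 1);

	printf("counter 1: OS=%d USR=%d PMI=%d\n",
	       !!(field & FIXED_EN_OS),
	       !!(field & FIXED_EN_USR),
	       !!(field & FIXED_PMI));
	return 0;
}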
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index d1bc5820ea46..6919dee69f18 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -40,6 +40,9 @@
#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK)
+static bool force_avic;
+module_param_unsafe(force_avic, bool, 0444);
+
/* Note:
* This hash table is used to map VM_ID to a struct kvm_svm,
* when handling AMD IOMMU GALOG notification to schedule in
@@ -50,6 +53,7 @@ static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
static u32 next_vm_id = 0;
static bool next_vm_id_wrapped = 0;
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
+enum avic_modes avic_mode;
/*
* This is a wrapper of struct amd_iommu_ir_data.
@@ -59,6 +63,54 @@ struct amd_svm_iommu_ir {
void *data; /* Storing pointer to struct amd_ir_data */
};
+static void avic_activate_vmcb(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
+ vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
+
+ vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
+
+ /* Note:
+ * KVM can support hybrid-AVIC mode, where KVM emulates x2APIC
+ * MSR accesses, while interrupt injection to a running vCPU
+ * can be achieved using the AVIC doorbell. The AVIC hardware still
+ * accelerates MMIO accesses, but this does not cause any harm
+ * as the guest is not supposed to access xAPIC MMIO when it uses x2APIC.
+ */
+ if (apic_x2apic_mode(svm->vcpu.arch.apic) &&
+ avic_mode == AVIC_MODE_X2) {
+ vmcb->control.int_ctl |= X2APIC_MODE_MASK;
+ vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID;
+ /* Disabling MSR intercept for x2APIC registers */
+ svm_set_x2apic_msr_interception(svm, false);
+ } else {
+ /* For xAVIC and hybrid-xAVIC modes */
+ vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID;
+ /* Enabling MSR intercept for x2APIC registers */
+ svm_set_x2apic_msr_interception(svm, true);
+ }
+}
+
+static void avic_deactivate_vmcb(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
+ vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
+
+ /*
+ * If running nested and the guest uses its own MSR bitmap, there
+ * is no need to update L0's msr bitmap
+ */
+ if (is_guest_mode(&svm->vcpu) &&
+ vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))
+ return;
+
+ /* Enabling MSR intercept for x2APIC registers */
+ svm_set_x2apic_msr_interception(svm, true);
+}
/* Note:
* This function is called from IOMMU driver to notify
@@ -175,13 +227,12 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
- vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK;
if (kvm_apicv_activated(svm->vcpu.kvm))
- vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
+ avic_activate_vmcb(svm);
else
- vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
+ avic_deactivate_vmcb(svm);
}
static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
@@ -190,7 +241,8 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
u64 *avic_physical_id_table;
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
- if (index >= AVIC_MAX_PHYSICAL_ID_COUNT)
+ if ((avic_mode == AVIC_MODE_X1 && index > AVIC_MAX_PHYSICAL_ID) ||
+ (avic_mode == AVIC_MODE_X2 && index > X2AVIC_MAX_PHYSICAL_ID))
return NULL;
avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
@@ -237,7 +289,8 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
int id = vcpu->vcpu_id;
struct vcpu_svm *svm = to_svm(vcpu);
- if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
+ if ((avic_mode == AVIC_MODE_X1 && id > AVIC_MAX_PHYSICAL_ID) ||
+ (avic_mode == AVIC_MODE_X2 && id > X2AVIC_MAX_PHYSICAL_ID))
return -EINVAL;
if (!vcpu->arch.apic->regs)
@@ -279,8 +332,10 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu)
*/
int cpu = READ_ONCE(vcpu->cpu);
- if (cpu != get_cpu())
+ if (cpu != get_cpu()) {
wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
+ trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu));
+ }
put_cpu();
}
@@ -303,7 +358,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source
if (apic_x2apic_mode(source))
dest = icrh;
else
- dest = GET_APIC_DEST_FIELD(icrh);
+ dest = GET_XAPIC_DEST_FIELD(icrh);
if (dest_mode == APIC_DEST_PHYSICAL) {
/* broadcast destination, use slow path */
@@ -345,9 +400,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source
logid_index = cluster + __ffs(bitmap);
- if (apic_x2apic_mode(source)) {
- l1_physical_id = logid_index;
- } else {
+ if (!apic_x2apic_mode(source)) {
u32 *avic_logical_id_table =
page_address(kvm_svm->avic_logical_id_table_page);
@@ -362,6 +415,23 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source
l1_physical_id = logid_entry &
AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+ } else {
+ /*
+ * For x2APIC logical mode, the index cannot be leveraged.
+ * Instead, calculate the physical ID from the logical ID in ICRH.
+ */
+ int cluster = (icrh & 0xffff0000) >> 16;
+ int apic = ffs(icrh & 0xffff) - 1;
+
+ /*
+ * If the x2APIC logical ID sub-field (i.e. icrh[15:0])
+ * contains anything but a single bit, we cannot use the
+ * fast path, because it is limited to a single vCPU.
+ */
+ if (apic < 0 || icrh != (1 << apic))
+ return -EINVAL;
+
+ l1_physical_id = (cluster << 4) + apic;
}
}
@@ -396,9 +466,15 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
* since entered the guest will have processed pending IRQs at VMRUN.
*/
kvm_for_each_vcpu(i, vcpu, kvm) {
+ u32 dest;
+
+ if (apic_x2apic_mode(vcpu->arch.apic))
+ dest = icrh;
+ else
+ dest = GET_XAPIC_DEST_FIELD(icrh);
+
if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
- GET_APIC_DEST_FIELD(icrh),
- icrl & APIC_DEST_MASK)) {
+ dest, icrl & APIC_DEST_MASK)) {
vcpu->arch.apic->irr_pending = true;
svm_complete_interrupt_delivery(vcpu,
icrl & APIC_MODE_MASK,
@@ -514,8 +590,13 @@ static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
bool flat = svm->dfr_reg == APIC_DFR_FLAT;
- u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
+ u32 *entry;
+
+ /* Note: x2AVIC does not use logical APIC ID table */
+ if (apic_x2apic_mode(vcpu->arch.apic))
+ return;
+ entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
if (entry)
clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
}
@@ -527,6 +608,10 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
u32 id = kvm_xapic_id(vcpu->arch.apic);
+ /* AVIC does not support LDR update for x2APIC */
+ if (apic_x2apic_mode(vcpu->arch.apic))
+ return 0;
+
if (ldr == svm->ldr_reg)
return 0;
@@ -654,6 +739,18 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
avic_handle_ldr_update(vcpu);
}
+void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
+{
+ if (!lapic_in_kernel(vcpu) || avic_mode == AVIC_MODE_NONE)
+ return;
+
+ if (kvm_get_apic_mode(vcpu) == LAPIC_MODE_INVALID) {
+ WARN_ONCE(true, "Invalid local APIC state (vcpu_id=%d)", vcpu->vcpu_id);
+ return;
+ }
+ avic_refresh_apicv_exec_ctrl(vcpu);
+}
+
static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
{
int ret = 0;
@@ -906,7 +1003,6 @@ bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
BIT(APICV_INHIBIT_REASON_NESTED) |
BIT(APICV_INHIBIT_REASON_IRQWIN) |
BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
- BIT(APICV_INHIBIT_REASON_X2APIC) |
BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
BIT(APICV_INHIBIT_REASON_SEV) |
BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
@@ -968,7 +1064,6 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
return;
entry = READ_ONCE(*(svm->avic_physical_id_cache));
- WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
@@ -1016,9 +1111,9 @@ void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
* accordingly before re-activating.
*/
avic_apicv_post_state_restore(vcpu);
- vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
+ avic_activate_vmcb(svm);
} else {
- vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
+ avic_deactivate_vmcb(svm);
}
vmcb_mark_dirty(vmcb, VMCB_AVIC);
@@ -1058,3 +1153,44 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
avic_vcpu_load(vcpu, vcpu->cpu);
}
+
+/*
+ * Note:
+ * - The module param avic enables both xAPIC and x2APIC modes.
+ * - The hypervisor can support both xAVIC and x2AVIC in the same guest.
+ * - The mode can be switched at run-time.
+ */
+bool avic_hardware_setup(struct kvm_x86_ops *x86_ops)
+{
+ if (!npt_enabled)
+ return false;
+
+ if (boot_cpu_has(X86_FEATURE_AVIC)) {
+ avic_mode = AVIC_MODE_X1;
+ pr_info("AVIC enabled\n");
+ } else if (force_avic) {
+ /*
+ * Some older systems do not advertise AVIC support.
+ * See the Revision Guide for the specific AMD processor for more details.
+ */
+ avic_mode = AVIC_MODE_X1;
+ pr_warn("AVIC is not supported in CPUID but force enabled");
+ pr_warn("Your system might crash and burn");
+ }
+
+ /* AVIC is a prerequisite for x2AVIC. */
+ if (boot_cpu_has(X86_FEATURE_X2AVIC)) {
+ if (avic_mode == AVIC_MODE_X1) {
+ avic_mode = AVIC_MODE_X2;
+ pr_info("x2AVIC enabled\n");
+ } else {
+ pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled");
+ pr_warn(FW_BUG "Try enable AVIC using force_avic option");
+ }
+ }
+
+ if (avic_mode != AVIC_MODE_NONE)
+ amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+
+ return !!avic_mode;
+}
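The x2APIC branch added to avic_kick_target_vcpus_fast() treats ICRH as cluster:bitmask, takes the fast path only when exactly one destination bit is set, and derives the physical APIC ID as (cluster << 4) + bit. A userspace sketch that mirrors the guard in the hunk above (which compares the full ICRH value against the single bit):

#include <stdio.h>
#include <stdint.h>
#include <strings.h>	/* ffs() */

static int x2apic_logical_to_physical(uint32_t icrh, uint32_t *phys_id)
{
	uint32_t cluster = (icrh & 0xffff0000) >> 16;
	int apic = ffs(icrh & 0xffff) - 1;

	/* Anything other than exactly one destination bit: take the slow path. */
	if (apic < 0 || icrh != (1u << apic))
		return -1;

	*phys_id = (cluster << 4) + apic;
	return 0;
}

int main(void)
{
	uint32_t id;

	if (!x2apic_logical_to_physical(0x0004, &id))
		printf("physical APIC ID = %u\n", id);	/* bit 2 -> ID 2 */

	if (x2apic_logical_to_physical(0x0006, &id))
		printf("multiple destination bits -> slow path\n");
	return 0;
}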
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index ba7cd26f438f..76dcc8a3e849 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -230,6 +230,11 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
break;
p = msrpm_offsets[i];
+
+ /* x2APIC MSRs are always intercepted for the nested guest */
+ if (is_x2apic_msrpm_offset(p))
+ continue;
+
offset = svm->nested.ctl.msrpm_base_pa + (p * 4);
if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
@@ -320,7 +325,8 @@ static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu,
return false;
}
- if (CC(!kvm_is_valid_cr4(vcpu, save->cr4)))
+ /* Note, SVM doesn't have any additional restrictions on CR4. */
+ if (CC(!__kvm_is_valid_cr4(vcpu, save->cr4)))
return false;
if (CC(!kvm_valid_efer(vcpu, save->efer)))
@@ -371,6 +377,7 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
to->nested_ctl = from->nested_ctl;
to->event_inj = from->event_inj;
to->event_inj_err = from->event_inj_err;
+ to->next_rip = from->next_rip;
to->nested_cr3 = from->nested_cr3;
to->virt_ext = from->virt_ext;
to->pause_filter_count = from->pause_filter_count;
@@ -608,7 +615,33 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
}
}
-static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
+static inline bool is_evtinj_soft(u32 evtinj)
+{
+ u32 type = evtinj & SVM_EVTINJ_TYPE_MASK;
+ u8 vector = evtinj & SVM_EVTINJ_VEC_MASK;
+
+ if (!(evtinj & SVM_EVTINJ_VALID))
+ return false;
+
+ if (type == SVM_EVTINJ_TYPE_SOFT)
+ return true;
+
+ return type == SVM_EVTINJ_TYPE_EXEPT && kvm_exception_is_soft(vector);
+}
+
+static bool is_evtinj_nmi(u32 evtinj)
+{
+ u32 type = evtinj & SVM_EVTINJ_TYPE_MASK;
+
+ if (!(evtinj & SVM_EVTINJ_VALID))
+ return false;
+
+ return type == SVM_EVTINJ_TYPE_NMI;
+}
+
+static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
+ unsigned long vmcb12_rip,
+ unsigned long vmcb12_csbase)
{
u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK;
u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK;
@@ -650,7 +683,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
vmcb02->control.tsc_offset = vcpu->arch.tsc_offset;
- if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
+ if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) {
WARN_ON(!svm->tsc_scaling_enabled);
nested_svm_update_tsc_ratio_msr(vcpu);
}
@@ -664,6 +697,30 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
vmcb02->control.event_inj = svm->nested.ctl.event_inj;
vmcb02->control.event_inj_err = svm->nested.ctl.event_inj_err;
+ /*
+ * next_rip is consumed on VMRUN as the return address pushed on the
+ * stack for injected soft exceptions/interrupts. If nrips is exposed
+ * to L1, take it verbatim from vmcb12. If nrips is supported in
+ * hardware but not exposed to L1, stuff the actual L2 RIP to emulate
+ * what a nrips=0 CPU would do (L1 is responsible for advancing RIP
+ * prior to injecting the event).
+ */
+ if (svm->nrips_enabled)
+ vmcb02->control.next_rip = svm->nested.ctl.next_rip;
+ else if (boot_cpu_has(X86_FEATURE_NRIPS))
+ vmcb02->control.next_rip = vmcb12_rip;
+
+ svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj);
+ if (is_evtinj_soft(vmcb02->control.event_inj)) {
+ svm->soft_int_injected = true;
+ svm->soft_int_csbase = vmcb12_csbase;
+ svm->soft_int_old_rip = vmcb12_rip;
+ if (svm->nrips_enabled)
+ svm->soft_int_next_rip = svm->nested.ctl.next_rip;
+ else
+ svm->soft_int_next_rip = vmcb12_rip;
+ }
+
vmcb02->control.virt_ext = vmcb01->control.virt_ext &
LBR_CTL_ENABLE_MASK;
if (svm->lbrv_enabled)
@@ -745,7 +802,7 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr);
svm_switch_vmcb(svm, &svm->nested.vmcb02);
- nested_vmcb02_prepare_control(svm);
+ nested_vmcb02_prepare_control(svm, vmcb12->save.rip, vmcb12->save.cs.base);
nested_vmcb02_prepare_save(svm, vmcb12);
ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3,
@@ -834,6 +891,8 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
out_exit_err:
svm->nested.nested_run_pending = 0;
+ svm->nmi_l1_to_l2 = false;
+ svm->soft_int_injected = false;
svm->vmcb->control.exit_code = SVM_EXIT_ERR;
svm->vmcb->control.exit_code_hi = 0;
@@ -982,7 +1041,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
}
- if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
+ if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) {
WARN_ON(!svm->tsc_scaling_enabled);
vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
__svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
@@ -1421,6 +1480,7 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst,
dst->nested_ctl = from->nested_ctl;
dst->event_inj = from->event_inj;
dst->event_inj_err = from->event_inj_err;
+ dst->next_rip = from->next_rip;
dst->nested_cr3 = from->nested_cr3;
dst->virt_ext = from->virt_ext;
dst->pause_filter_count = from->pause_filter_count;
@@ -1605,7 +1665,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
nested_copy_vmcb_control_to_cache(svm, ctl);
svm_switch_vmcb(svm, &svm->nested.vmcb02);
- nested_vmcb02_prepare_control(svm);
+ nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip, svm->vmcb->save.cs.base);
/*
* While the nested guest CR3 is already checked and set by
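The is_evtinj_soft() and is_evtinj_nmi() helpers above classify an EVTINJ value by its valid bit, type field and vector. A self-contained sketch of that decode; the field-layout constants are reproduced locally to illustrate SVM's EVTINJ encoding (vector in bits 7:0, type in bits 10:8, valid in bit 31) and are assumptions of this sketch, not kernel headers:

#include <stdio.h>
#include <stdint.h>

#define EVTINJ_VEC_MASK		0x0ffu
#define EVTINJ_TYPE_MASK	0x700u
#define EVTINJ_TYPE_NMI		(2u << 8)
#define EVTINJ_TYPE_EXEPT	(3u << 8)
#define EVTINJ_TYPE_SOFT	(4u << 8)
#define EVTINJ_VALID		(1u << 31)

static const char *evtinj_kind(uint32_t evtinj)
{
	if (!(evtinj & EVTINJ_VALID))
		return "nothing injected";

	switch (evtinj & EVTINJ_TYPE_MASK) {
	case EVTINJ_TYPE_SOFT:	return "soft interrupt (INTn)";
	case EVTINJ_TYPE_NMI:	return "NMI";
	case EVTINJ_TYPE_EXEPT:	return "exception";
	default:		return "hardware interrupt";
	}
}

int main(void)
{
	uint32_t intn = EVTINJ_VALID | EVTINJ_TYPE_SOFT | 0x80;	/* INT 0x80 */
	uint32_t nmi = EVTINJ_VALID | EVTINJ_TYPE_NMI | 2;

	printf("0x%08x -> %s, vector %u\n", intn, evtinj_kind(intn),
	       intn & EVTINJ_VEC_MASK);
	printf("0x%08x -> %s\n", nmi, evtinj_kind(nmi));
	return 0;
}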
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 136039fc6d01..f24613a108c5 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -33,34 +33,6 @@ enum index {
INDEX_ERROR,
};
-/* duplicated from amd_perfmon_event_map, K7 and above should work. */
-static struct kvm_event_hw_type_mapping amd_event_mapping[] = {
- [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES },
- [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
- [2] = { 0x7d, 0x07, PERF_COUNT_HW_CACHE_REFERENCES },
- [3] = { 0x7e, 0x07, PERF_COUNT_HW_CACHE_MISSES },
- [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
- [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
- [6] = { 0xd0, 0x00, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
- [7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
-};
-
-/* duplicated from amd_f17h_perfmon_event_map. */
-static struct kvm_event_hw_type_mapping amd_f17h_event_mapping[] = {
- [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES },
- [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
- [2] = { 0x60, 0xff, PERF_COUNT_HW_CACHE_REFERENCES },
- [3] = { 0x64, 0x09, PERF_COUNT_HW_CACHE_MISSES },
- [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
- [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
- [6] = { 0x87, 0x02, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
- [7] = { 0x87, 0x01, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
-};
-
-/* amd_pmc_perf_hw_id depends on these being the same size */
-static_assert(ARRAY_SIZE(amd_event_mapping) ==
- ARRAY_SIZE(amd_f17h_event_mapping));
-
static unsigned int get_msr_base(struct kvm_pmu *pmu, enum pmu_type type)
{
struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
@@ -154,31 +126,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
return &pmu->gp_counters[msr_to_index(msr)];
}
-static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc)
+static bool amd_hw_event_available(struct kvm_pmc *pmc)
{
- struct kvm_event_hw_type_mapping *event_mapping;
- u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
- u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
- int i;
-
- /* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */
- if (WARN_ON(pmc_is_fixed(pmc)))
- return PERF_COUNT_HW_MAX;
-
- if (guest_cpuid_family(pmc->vcpu) >= 0x17)
- event_mapping = amd_f17h_event_mapping;
- else
- event_mapping = amd_event_mapping;
-
- for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++)
- if (event_mapping[i].eventsel == event_select
- && event_mapping[i].unit_mask == unit_mask)
- break;
-
- if (i == ARRAY_SIZE(amd_event_mapping))
- return PERF_COUNT_HW_MAX;
-
- return event_mapping[i].event_type;
+ return true;
}
/* check if a PMC is enabled by comparing it against global_ctrl bits. Because
@@ -286,8 +236,10 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
if (pmc) {
data &= ~pmu->reserved_bits;
- if (data != pmc->eventsel)
- reprogram_gp_counter(pmc, data);
+ if (data != pmc->eventsel) {
+ pmc->eventsel = data;
+ reprogram_counter(pmc);
+ }
return 0;
}
@@ -343,7 +295,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu)
}
struct kvm_pmu_ops amd_pmu_ops __initdata = {
- .pmc_perf_hw_id = amd_pmc_perf_hw_id,
+ .hw_event_available = amd_hw_event_available,
.pmc_is_enabled = amd_pmc_is_enabled,
.pmc_idx_to_pmc = amd_pmc_idx_to_pmc,
.rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc,
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 0c240ed04f96..b0e793e7d85c 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -603,6 +603,9 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
save->xss = svm->vcpu.arch.ia32_xss;
save->dr6 = svm->vcpu.arch.dr6;
+ pr_debug("Virtual Machine Save Area (VMSA):\n");
+ print_hex_dump(KERN_CONT, "", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false);
+
return 0;
}
@@ -1606,38 +1609,35 @@ static int sev_lock_vcpus_for_migration(struct kvm *kvm,
{
struct kvm_vcpu *vcpu;
unsigned long i, j;
- bool first = true;
kvm_for_each_vcpu(i, vcpu, kvm) {
if (mutex_lock_killable_nested(&vcpu->mutex, role))
goto out_unlock;
- if (first) {
+#ifdef CONFIG_PROVE_LOCKING
+ if (!i)
/*
* Reset the role to one that avoids colliding with
* the role used for the first vcpu mutex.
*/
role = SEV_NR_MIGRATION_ROLES;
- first = false;
- } else {
+ else
mutex_release(&vcpu->mutex.dep_map, _THIS_IP_);
- }
+#endif
}
return 0;
out_unlock:
- first = true;
kvm_for_each_vcpu(j, vcpu, kvm) {
if (i == j)
break;
- if (first)
- first = false;
- else
+#ifdef CONFIG_PROVE_LOCKING
+ if (j)
mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_);
-
+#endif
mutex_unlock(&vcpu->mutex);
}
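sev_lock_vcpus_for_migration() acquires every vCPU mutex in order and, on failure, unwinds exactly the mutexes taken before the failing index; the CONFIG_PROVE_LOCKING churn above only affects the lockdep annotations. A pthreads sketch of the lock-all-then-unwind pattern (the lockdep-specific calls have no userspace analogue and are omitted):

#include <stdio.h>
#include <pthread.h>

#define NR_VCPUS 4

static pthread_mutex_t vcpu_lock[NR_VCPUS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

static int lock_all_vcpus(void)
{
	int i, j;

	for (i = 0; i < NR_VCPUS; i++) {
		if (pthread_mutex_trylock(&vcpu_lock[i]))
			goto out_unlock;
	}
	return 0;

out_unlock:
	/* Unwind exactly the locks acquired before index i. */
	for (j = 0; j < i; j++)
		pthread_mutex_unlock(&vcpu_lock[j]);
	return -1;
}

int main(void)
{
	if (!lock_all_vcpus())
		printf("all vCPU locks held\n");
	return 0;
}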
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 44bbf25dfeb9..38f873cb6f2c 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -74,6 +74,8 @@ static uint64_t osvw_len = 4, osvw_status;
static DEFINE_PER_CPU(u64, current_tsc_ratio);
+#define X2APIC_MSR(x) (APIC_BASE_MSR + (x >> 4))
+
static const struct svm_direct_access_msrs {
u32 index; /* Index of the MSR */
bool always; /* True if intercept is initially cleared */
@@ -100,6 +102,38 @@ static const struct svm_direct_access_msrs {
{ .index = MSR_IA32_CR_PAT, .always = false },
{ .index = MSR_AMD64_SEV_ES_GHCB, .always = true },
{ .index = MSR_TSC_AUX, .always = false },
+ { .index = X2APIC_MSR(APIC_ID), .always = false },
+ { .index = X2APIC_MSR(APIC_LVR), .always = false },
+ { .index = X2APIC_MSR(APIC_TASKPRI), .always = false },
+ { .index = X2APIC_MSR(APIC_ARBPRI), .always = false },
+ { .index = X2APIC_MSR(APIC_PROCPRI), .always = false },
+ { .index = X2APIC_MSR(APIC_EOI), .always = false },
+ { .index = X2APIC_MSR(APIC_RRR), .always = false },
+ { .index = X2APIC_MSR(APIC_LDR), .always = false },
+ { .index = X2APIC_MSR(APIC_DFR), .always = false },
+ { .index = X2APIC_MSR(APIC_SPIV), .always = false },
+ { .index = X2APIC_MSR(APIC_ISR), .always = false },
+ { .index = X2APIC_MSR(APIC_TMR), .always = false },
+ { .index = X2APIC_MSR(APIC_IRR), .always = false },
+ { .index = X2APIC_MSR(APIC_ESR), .always = false },
+ { .index = X2APIC_MSR(APIC_ICR), .always = false },
+ { .index = X2APIC_MSR(APIC_ICR2), .always = false },
+
+ /*
+ * Note:
+ * AMD does not virtualize APIC TSC-deadline timer mode, but it is
+ * emulated by KVM. When setting APIC LVTT (0x832) register bit 18,
+ * the AVIC hardware would generate a #GP fault. Therefore, always
+ * intercept MSR 0x832, and do not set up a direct_access_msrs entry for it.
+ */
+ { .index = X2APIC_MSR(APIC_LVTTHMR), .always = false },
+ { .index = X2APIC_MSR(APIC_LVTPC), .always = false },
+ { .index = X2APIC_MSR(APIC_LVT0), .always = false },
+ { .index = X2APIC_MSR(APIC_LVT1), .always = false },
+ { .index = X2APIC_MSR(APIC_LVTERR), .always = false },
+ { .index = X2APIC_MSR(APIC_TMICT), .always = false },
+ { .index = X2APIC_MSR(APIC_TMCCT), .always = false },
+ { .index = X2APIC_MSR(APIC_TDCR), .always = false },
{ .index = MSR_INVALID, .always = false },
};
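The X2APIC_MSR() macro introduced above maps an xAPIC MMIO register offset to its x2APIC MSR number as APIC_BASE_MSR + (offset >> 4); that is how APIC_LVTT (offset 0x320) becomes the MSR 0x832 mentioned in the comment. A tiny sketch with the constants reproduced locally for illustration:

#include <stdio.h>

#define APIC_BASE_MSR	0x800
#define APIC_ICR	0x300
#define APIC_LVTT	0x320

#define X2APIC_MSR(x)	(APIC_BASE_MSR + ((x) >> 4))

int main(void)
{
	printf("APIC_ICR  (0x%03x) -> MSR 0x%x\n", APIC_ICR, X2APIC_MSR(APIC_ICR));
	printf("APIC_LVTT (0x%03x) -> MSR 0x%x\n", APIC_LVTT, X2APIC_MSR(APIC_LVTT));
	return 0;
}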
@@ -188,9 +222,6 @@ module_param(tsc_scaling, int, 0444);
static bool avic;
module_param(avic, bool, 0444);
-static bool force_avic;
-module_param_unsafe(force_avic, bool, 0444);
-
bool __read_mostly dump_invalid_vmcb;
module_param(dump_invalid_vmcb, bool, 0644);
@@ -342,9 +373,11 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
}
-static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
+ bool commit_side_effects)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long old_rflags;
/*
* SEV-ES does not expose the next RIP. The RIP update is controlled by
@@ -359,18 +392,75 @@ static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
}
if (!svm->next_rip) {
+ if (unlikely(!commit_side_effects))
+ old_rflags = svm->vmcb->save.rflags;
+
if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
return 0;
+
+ if (unlikely(!commit_side_effects))
+ svm->vmcb->save.rflags = old_rflags;
} else {
kvm_rip_write(vcpu, svm->next_rip);
}
done:
- svm_set_interrupt_shadow(vcpu, 0);
+ if (likely(commit_side_effects))
+ svm_set_interrupt_shadow(vcpu, 0);
return 1;
}
+static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ return __svm_skip_emulated_instruction(vcpu, true);
+}
+
+static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
+{
+ unsigned long rip, old_rip = kvm_rip_read(vcpu);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * Due to architectural shortcomings, the CPU doesn't always provide
+ * NextRIP, e.g. if KVM intercepted an exception that occurred while
+ * the CPU was vectoring an INTO/INT3 in the guest. Temporarily skip
+ * the instruction even if NextRIP is supported to acquire the next
+ * RIP so that it can be shoved into the NextRIP field, otherwise
+ * hardware will fail to advance guest RIP during event injection.
+ * Drop the exception/interrupt if emulation fails and effectively
+ * retry the instruction, it's the least awful option. If NRIPS is
+ * in use, the skip must not commit any side effects such as clearing
+ * the interrupt shadow or RFLAGS.RF.
+ */
+ if (!__svm_skip_emulated_instruction(vcpu, !nrips))
+ return -EIO;
+
+ rip = kvm_rip_read(vcpu);
+
+ /*
+ * Save the injection information, even when using next_rip, as the
+ * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection
+ * doesn't complete due to a VM-Exit occurring while the CPU is
+ * vectoring the event. Decoding the instruction isn't guaranteed to
+ * work as there may be no backing instruction, e.g. if the event is
+ * being injected by L1 for L2, or if the guest is patching INT3 into
+ * a different instruction.
+ */
+ svm->soft_int_injected = true;
+ svm->soft_int_csbase = svm->vmcb->save.cs.base;
+ svm->soft_int_old_rip = old_rip;
+ svm->soft_int_next_rip = rip;
+
+ if (nrips)
+ kvm_rip_write(vcpu, old_rip);
+
+ if (static_cpu_has(X86_FEATURE_NRIPS))
+ svm->vmcb->control.next_rip = rip;
+
+ return 0;
+}
+
static void svm_queue_exception(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -380,21 +470,9 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
kvm_deliver_exception_payload(vcpu);
- if (nr == BP_VECTOR && !nrips) {
- unsigned long rip, old_rip = kvm_rip_read(vcpu);
-
- /*
- * For guest debugging where we have to reinject #BP if some
- * INT3 is guest-owned:
- * Emulate nRIP by moving RIP forward. Will fail if injection
- * raises a fault that is not intercepted. Still better than
- * failing in all cases.
- */
- (void)svm_skip_emulated_instruction(vcpu);
- rip = kvm_rip_read(vcpu);
- svm->int3_rip = rip + svm->vmcb->save.cs.base;
- svm->int3_injected = rip - old_rip;
- }
+ if (kvm_exception_is_soft(nr) &&
+ svm_update_soft_interrupt_rip(vcpu))
+ return;
svm->vmcb->control.event_inj = nr
| SVM_EVTINJ_VALID
@@ -736,6 +814,29 @@ void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
}
}
+void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
+{
+ int i;
+
+ if (intercept == svm->x2avic_msrs_intercepted)
+ return;
+
+ if (avic_mode != AVIC_MODE_X2 ||
+ !apic_x2apic_mode(svm->vcpu.arch.apic))
+ return;
+
+ for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) {
+ int index = direct_access_msrs[i].index;
+
+ if ((index < APIC_BASE_MSR) ||
+ (index > APIC_BASE_MSR + 0xff))
+ continue;
+ set_msr_interception(&svm->vcpu, svm->msrpm, index,
+ !intercept, !intercept);
+ }
+
+ svm->x2avic_msrs_intercepted = intercept;
+}
void svm_vcpu_free_msrpm(u32 *msrpm)
{
@@ -1231,7 +1332,7 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
svm_init_osvw(vcpu);
vcpu->arch.microcode_version = 0x01000065;
- svm->tsc_ratio_msr = kvm_default_tsc_scaling_ratio;
+ svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio;
if (sev_es_guest(vcpu->kvm))
sev_es_vcpu_reset(svm);
@@ -1299,6 +1400,8 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu)
goto error_free_vmsa_page;
}
+ svm->x2avic_msrs_intercepted = true;
+
svm->vmcb01.ptr = page_address(vmcb01_page);
svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
svm_switch_vmcb(svm, &svm->vmcb01);
@@ -2345,6 +2448,7 @@ static int task_switch_interception(struct kvm_vcpu *vcpu)
kvm_clear_exception_queue(vcpu);
break;
case SVM_EXITINTINFO_TYPE_INTR:
+ case SVM_EXITINTINFO_TYPE_SOFT:
kvm_clear_interrupt_queue(vcpu);
break;
default:
@@ -3375,35 +3479,49 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
+
+ if (svm->nmi_l1_to_l2)
+ return;
+
vcpu->arch.hflags |= HF_NMI_MASK;
if (!sev_es_guest(vcpu->kvm))
svm_set_intercept(svm, INTERCEPT_IRET);
++vcpu->stat.nmi_injections;
}
-static void svm_inject_irq(struct kvm_vcpu *vcpu)
+static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ u32 type;
+
+ if (vcpu->arch.interrupt.soft) {
+ if (svm_update_soft_interrupt_rip(vcpu))
+ return;
- BUG_ON(!(gif_set(svm)));
+ type = SVM_EVTINJ_TYPE_SOFT;
+ } else {
+ type = SVM_EVTINJ_TYPE_INTR;
+ }
- trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
+ trace_kvm_inj_virq(vcpu->arch.interrupt.nr,
+ vcpu->arch.interrupt.soft, reinjected);
++vcpu->stat.irq_injections;
svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
- SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
+ SVM_EVTINJ_VALID | type;
}
void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
int trig_mode, int vector)
{
/*
- * vcpu->arch.apicv_active must be read after vcpu->mode.
+ * apic->apicv_active must be read after vcpu->mode.
* Pairs with smp_store_release in vcpu_enter_guest.
*/
bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE);
- if (!READ_ONCE(vcpu->arch.apicv_active)) {
+ /* Note, this is called iff the local APIC is in-kernel. */
+ if (!READ_ONCE(vcpu->arch.apic->apicv_active)) {
/* Process the interrupt via inject_pending_event */
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
@@ -3668,15 +3786,49 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
}
+static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector,
+ int type)
+{
+ bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT);
+ bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's
+ * associated with the original soft exception/interrupt. next_rip is
+ * cleared on all exits that can occur while vectoring an event, so KVM
+ * needs to manually set next_rip for re-injection. Unlike the !nrips
+ * case below, this needs to be done if and only if KVM is re-injecting
+ * the same event, i.e. if the event is a soft exception/interrupt,
+ * otherwise next_rip is unused on VMRUN.
+ */
+ if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) &&
+ kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase))
+ svm->vmcb->control.next_rip = svm->soft_int_next_rip;
+ /*
+ * If NRIPS isn't enabled, KVM must manually advance RIP prior to
+ * injecting the soft exception/interrupt. That advancement needs to
+ * be unwound if vectoring didn't complete. Note, the new event may
+ * not be the injected event, e.g. if KVM injected an INTn, the INTn
+ * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
+ * be the reported vectored event, but RIP still needs to be unwound.
+ */
+ else if (!nrips && (is_soft || is_exception) &&
+ kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase))
+ kvm_rip_write(vcpu, svm->soft_int_old_rip);
+}
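/*
 * Minimal sketch of the kvm_is_linear_rip() checks used above
 * (illustrative only; the helper name below is made up): KVM only
 * rewrites RIP/next_rip if the guest's current linear RIP (CS base +
 * RIP) still matches the linear RIP snapshotted at injection time,
 * i.e. if the guest has not made forward progress since the event
 * was injected.
 */
static inline bool soft_int_rip_unchanged(unsigned long cs_base,
					  unsigned long rip,
					  unsigned long injected_linear_rip)
{
	return cs_base + rip == injected_linear_rip;
}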
+
static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
u8 vector;
int type;
u32 exitintinfo = svm->vmcb->control.exit_int_info;
- unsigned int3_injected = svm->int3_injected;
+ bool nmi_l1_to_l2 = svm->nmi_l1_to_l2;
+ bool soft_int_injected = svm->soft_int_injected;
- svm->int3_injected = 0;
+ svm->nmi_l1_to_l2 = false;
+ svm->soft_int_injected = false;
/*
* If we've made progress since setting HF_IRET_MASK, we've
@@ -3701,9 +3853,13 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
+ if (soft_int_injected)
+ svm_complete_soft_interrupt(vcpu, vector, type);
+
switch (type) {
case SVM_EXITINTINFO_TYPE_NMI:
vcpu->arch.nmi_injected = true;
+ svm->nmi_l1_to_l2 = nmi_l1_to_l2;
break;
case SVM_EXITINTINFO_TYPE_EXEPT:
/*
@@ -3712,18 +3868,6 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
if (vector == X86_TRAP_VC)
break;
- /*
- * In case of software exceptions, do not reinject the vector,
- * but re-execute the instruction instead. Rewind RIP first
- * if we emulated INT3 before.
- */
- if (kvm_exception_is_soft(vector)) {
- if (vector == BP_VECTOR && int3_injected &&
- kvm_is_linear_rip(vcpu, svm->int3_rip))
- kvm_rip_write(vcpu,
- kvm_rip_read(vcpu) - int3_injected);
- break;
- }
if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
u32 err = svm->vmcb->control.exit_int_info_err;
kvm_requeue_exception_e(vcpu, vector, err);
@@ -3734,9 +3878,13 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
case SVM_EXITINTINFO_TYPE_INTR:
kvm_queue_interrupt(vcpu, vector, false);
break;
+ case SVM_EXITINTINFO_TYPE_SOFT:
+ kvm_queue_interrupt(vcpu, vector, true);
+ break;
default:
break;
}
+
}
static void svm_cancel_injection(struct kvm_vcpu *vcpu)
@@ -3952,7 +4100,7 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
hv_track_root_tdp(vcpu, root_hpa);
cr3 = vcpu->arch.cr3;
- } else if (vcpu->arch.mmu->root_role.level >= PT64_ROOT_4LEVEL) {
+ } else if (root_level >= PT64_ROOT_4LEVEL) {
cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
} else {
/* PCID in the guest should be impossible with a 32-bit MMU. */
@@ -4013,16 +4161,10 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
return true;
}
-static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
-{
- return 0;
-}
-
static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_cpuid_entry2 *best;
- struct kvm *kvm = vcpu->kvm;
vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
boot_cpu_has(X86_FEATURE_XSAVE) &&
@@ -4049,19 +4191,11 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
/* For sev guests, the memory encryption bit is not reserved in CR3. */
if (sev_guest(vcpu->kvm)) {
- best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0);
+ best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
if (best)
vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
}
- if (kvm_vcpu_apicv_active(vcpu)) {
- /*
- * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
- * is exposed to the guest, disable AVIC.
- */
- if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
- kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_X2APIC);
- }
init_vmcb_after_set_cpuid(vcpu);
}
@@ -4673,11 +4807,11 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.enable_nmi_window = svm_enable_nmi_window,
.enable_irq_window = svm_enable_irq_window,
.update_cr8_intercept = svm_update_cr8_intercept,
+ .set_virtual_apic_mode = avic_set_virtual_apic_mode,
.refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
.check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons,
.apicv_post_state_restore = avic_apicv_post_state_restore,
- .get_mt_mask = svm_get_mt_mask,
.get_exit_info = svm_get_exit_info,
.vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
@@ -4773,7 +4907,7 @@ static __init void svm_set_cpu_caps(void)
{
kvm_set_cpu_caps();
- supported_xss = 0;
+ kvm_caps.supported_xss = 0;
/* CPUID 0x80000001 and 0x8000000A (SVM features) */
if (nested) {
@@ -4849,7 +4983,8 @@ static __init int svm_hardware_setup(void)
init_msrpm_offsets();
- supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
+ kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
+ XFEATURE_MASK_BNDCSR);
if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
kvm_enable_efer_bits(EFER_FFXSR);
@@ -4859,11 +4994,11 @@ static __init int svm_hardware_setup(void)
tsc_scaling = false;
} else {
pr_info("TSC scaling supported\n");
- kvm_has_tsc_control = true;
+ kvm_caps.has_tsc_control = true;
}
}
- kvm_max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX;
- kvm_tsc_scaling_ratio_frac_bits = 32;
+ kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX;
+ kvm_caps.tsc_scaling_ratio_frac_bits = 32;
tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
@@ -4917,17 +5052,9 @@ static __init int svm_hardware_setup(void)
nrips = false;
}
- enable_apicv = avic = avic && npt_enabled && (boot_cpu_has(X86_FEATURE_AVIC) || force_avic);
+ enable_apicv = avic = avic && avic_hardware_setup(&svm_x86_ops);
- if (enable_apicv) {
- if (!boot_cpu_has(X86_FEATURE_AVIC)) {
- pr_warn("AVIC is not supported in CPUID but force enabled");
- pr_warn("Your system might crash and burn");
- } else
- pr_info("AVIC enabled\n");
-
- amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
- } else {
+ if (!enable_apicv) {
svm_x86_ops.vcpu_blocking = NULL;
svm_x86_ops.vcpu_unblocking = NULL;
svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 9223ac100ef5..6a7686bf6900 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -29,13 +29,21 @@
#define IOPM_SIZE PAGE_SIZE * 3
#define MSRPM_SIZE PAGE_SIZE * 2
-#define MAX_DIRECT_ACCESS_MSRS 21
-#define MSRPM_OFFSETS 16
+#define MAX_DIRECT_ACCESS_MSRS 46
+#define MSRPM_OFFSETS 32
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
extern int vgif;
extern bool intercept_smi;
+enum avic_modes {
+ AVIC_MODE_NONE = 0,
+ AVIC_MODE_X1,
+ AVIC_MODE_X2,
+};
+
+extern enum avic_modes avic_mode;
+
/*
* Clean bits in VMCB.
* VMCB_ALL_CLEAN_MASK might also need to
@@ -139,6 +147,7 @@ struct vmcb_ctrl_area_cached {
u64 nested_ctl;
u32 event_inj;
u32 event_inj_err;
+ u64 next_rip;
u64 nested_cr3;
u64 virt_ext;
u32 clean;
@@ -228,9 +237,12 @@ struct vcpu_svm {
bool nmi_singlestep;
u64 nmi_singlestep_guest_rflags;
+ bool nmi_l1_to_l2;
- unsigned int3_injected;
- unsigned long int3_rip;
+ unsigned long soft_int_csbase;
+ unsigned long soft_int_old_rip;
+ unsigned long soft_int_next_rip;
+ bool soft_int_injected;
/* optional nested SVM features that are enabled for this guest */
bool nrips_enabled : 1;
@@ -264,6 +276,8 @@ struct vcpu_svm {
struct vcpu_sev_es_state sev_es;
bool guest_state_loaded;
+
+ bool x2avic_msrs_intercepted;
};
struct svm_cpu_data {
@@ -509,6 +523,15 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm)
return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
}
+static inline bool is_x2apic_msrpm_offset(u32 offset)
+{
+ /* 4 msrs per u8, and 4 u8 in u32 */
+ u32 msr = offset * 16;
+
+ return (msr >= APIC_BASE_MSR) &&
+ (msr < (APIC_BASE_MSR + 0x100));
+}
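/*
 * Worked example for the helper above (illustrative only): with two
 * intercept bits per MSR, a u8 covers 4 MSRs and each u32 offset
 * covers 16 MSRs, so offset 0x80 maps to MSR 0x800 (APIC_BASE_MSR,
 * the first x2APIC MSR), offset 0x8f maps to MSR 0x8f0 (still inside
 * the 0x800-0x8ff range), and offset 0x90 maps to MSR 0x900, the
 * first MSR past the x2APIC range.
 */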
+
/* svm.c */
#define MSR_INVALID 0xffffffffU
@@ -534,6 +557,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value);
int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code);
void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
int read, int write);
+void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool disable);
void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
int trig_mode, int vec);
@@ -603,6 +627,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
/* avic.c */
+bool avic_hardware_setup(struct kvm_x86_ops *ops);
int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
@@ -613,18 +638,16 @@ int avic_init_vcpu(struct vcpu_svm *svm);
void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
void avic_vcpu_put(struct kvm_vcpu *vcpu);
void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu);
-void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason);
-void avic_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr);
-void avic_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr);
-bool avic_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu);
int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
void avic_ring_doorbell(struct kvm_vcpu *vcpu);
unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu);
+void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
+
/* sev.c */
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index de4762517569..2120d7c060a9 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -154,7 +154,7 @@ TRACE_EVENT(kvm_xen_hypercall,
TRACE_EVENT(kvm_pio,
TP_PROTO(unsigned int rw, unsigned int port, unsigned int size,
- unsigned int count, void *data),
+ unsigned int count, const void *data),
TP_ARGS(rw, port, size, count, data),
TP_STRUCT__entry(
@@ -333,18 +333,24 @@ TRACE_EVENT_KVM_EXIT(kvm_exit);
* Tracepoint for kvm interrupt injection:
*/
TRACE_EVENT(kvm_inj_virq,
- TP_PROTO(unsigned int irq),
- TP_ARGS(irq),
+ TP_PROTO(unsigned int vector, bool soft, bool reinjected),
+ TP_ARGS(vector, soft, reinjected),
TP_STRUCT__entry(
- __field( unsigned int, irq )
+ __field( unsigned int, vector )
+ __field( bool, soft )
+ __field( bool, reinjected )
),
TP_fast_assign(
- __entry->irq = irq;
+ __entry->vector = vector;
+ __entry->soft = soft;
+ __entry->reinjected = reinjected;
),
- TP_printk("irq %u", __entry->irq)
+ TP_printk("%s 0x%x%s",
+ __entry->soft ? "Soft/INTn" : "IRQ", __entry->vector,
+ __entry->reinjected ? " [reinjected]" : "")
);
#define EXS(x) { x##_VECTOR, "#" #x }
@@ -358,25 +364,30 @@ TRACE_EVENT(kvm_inj_virq,
* Tracepoint for kvm interrupt injection:
*/
TRACE_EVENT(kvm_inj_exception,
- TP_PROTO(unsigned exception, bool has_error, unsigned error_code),
- TP_ARGS(exception, has_error, error_code),
+ TP_PROTO(unsigned exception, bool has_error, unsigned error_code,
+ bool reinjected),
+ TP_ARGS(exception, has_error, error_code, reinjected),
TP_STRUCT__entry(
__field( u8, exception )
__field( u8, has_error )
__field( u32, error_code )
+ __field( bool, reinjected )
),
TP_fast_assign(
__entry->exception = exception;
__entry->has_error = has_error;
__entry->error_code = error_code;
+ __entry->reinjected = reinjected;
),
- TP_printk("%s (0x%x)",
+ TP_printk("%s%s%s%s%s",
__print_symbolic(__entry->exception, kvm_trace_sym_exc),
- /* FIXME: don't print error_code if not present */
- __entry->has_error ? __entry->error_code : 0)
+ !__entry->has_error ? "" : " (",
+ !__entry->has_error ? "" : __print_symbolic(__entry->error_code, { }),
+ !__entry->has_error ? "" : ")",
+ __entry->reinjected ? " [reinjected]" : "")
);
/*
@@ -1479,6 +1490,24 @@ TRACE_EVENT(kvm_avic_kick_vcpu_slowpath,
__entry->icrh, __entry->icrl, __entry->index)
);
+TRACE_EVENT(kvm_avic_doorbell,
+ TP_PROTO(u32 vcpuid, u32 apicid),
+ TP_ARGS(vcpuid, apicid),
+
+ TP_STRUCT__entry(
+ __field(u32, vcpuid)
+ __field(u32, apicid)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpuid = vcpuid;
+ __entry->apicid = apicid;
+ ),
+
+ TP_printk("vcpuid=%u, apicid=%u",
+ __entry->vcpuid, __entry->apicid)
+);
+
TRACE_EVENT(kvm_hv_timer_state,
TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use),
TP_ARGS(vcpu_id, hv_timer_in_use),
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index c0e24826a86f..c5e5dfef69c7 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -6,6 +6,8 @@
#include "../lapic.h"
#include "../x86.h"
+#include "../pmu.h"
+#include "../cpuid.h"
extern bool __read_mostly enable_vpid;
extern bool __read_mostly flexpriority_enabled;
@@ -13,6 +15,7 @@ extern bool __read_mostly enable_ept;
extern bool __read_mostly enable_unrestricted_guest;
extern bool __read_mostly enable_ept_ad_bits;
extern bool __read_mostly enable_pml;
+extern bool __read_mostly enable_ipiv;
extern int __read_mostly pt_mode;
#define PT_MODE_SYSTEM 0
@@ -59,6 +62,7 @@ struct vmcs_config {
u32 pin_based_exec_ctrl;
u32 cpu_based_exec_ctrl;
u32 cpu_based_2nd_exec_ctrl;
+ u64 cpu_based_3rd_exec_ctrl;
u32 vmexit_ctrl;
u32 vmentry_ctrl;
struct nested_vmx_msrs nested;
@@ -94,20 +98,17 @@ static inline bool cpu_has_vmx_posted_intr(void)
static inline bool cpu_has_load_ia32_efer(void)
{
- return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_EFER) &&
- (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_EFER);
+ return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_EFER;
}
static inline bool cpu_has_load_perf_global_ctrl(void)
{
- return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
- (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
+ return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
}
static inline bool cpu_has_vmx_mpx(void)
{
- return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
- (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS);
+ return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS;
}
static inline bool cpu_has_vmx_tpr_shadow(void)
@@ -131,6 +132,12 @@ static inline bool cpu_has_secondary_exec_ctrls(void)
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
}
+static inline bool cpu_has_tertiary_exec_ctrls(void)
+{
+ return vmcs_config.cpu_based_exec_ctrl &
+ CPU_BASED_ACTIVATE_TERTIARY_CONTROLS;
+}
+
static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -276,6 +283,11 @@ static inline bool cpu_has_vmx_apicv(void)
cpu_has_vmx_posted_intr();
}
+static inline bool cpu_has_vmx_ipiv(void)
+{
+ return vmcs_config.cpu_based_3rd_exec_ctrl & TERTIARY_EXEC_IPI_VIRT;
+}
+
static inline bool cpu_has_vmx_flexpriority(void)
{
return cpu_has_vmx_tpr_shadow() &&
@@ -363,7 +375,6 @@ static inline bool cpu_has_vmx_intel_pt(void)
rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
return (vmx_msr & MSR_IA32_VMX_MISC_INTEL_PT) &&
(vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) &&
- (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_IA32_RTIT_CTL) &&
(vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL);
}
@@ -385,23 +396,31 @@ static inline bool vmx_pt_mode_is_host_guest(void)
return pt_mode == PT_MODE_HOST_GUEST;
}
+static inline bool vmx_pebs_supported(void)
+{
+ return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept;
+}
+
static inline u64 vmx_get_perf_capabilities(void)
{
- u64 perf_cap = 0;
+ u64 perf_cap = PMU_CAP_FW_WRITES;
+ u64 host_perf_cap = 0;
if (!enable_pmu)
- return perf_cap;
+ return 0;
if (boot_cpu_has(X86_FEATURE_PDCM))
- rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap);
+ rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
- perf_cap &= PMU_CAP_LBR_FMT;
+ perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
- /*
- * Since counters are virtualized, KVM would support full
- * width counting unconditionally, even if the host lacks it.
- */
- return PMU_CAP_FW_WRITES | perf_cap;
+ if (vmx_pebs_supported()) {
+ perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
+ if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4)
+ perf_cap &= ~PERF_CAP_PEBS_BASELINE;
+ }
+
+ return perf_cap;
}
static inline u64 vmx_supported_debugctl(void)
@@ -417,4 +436,10 @@ static inline u64 vmx_supported_debugctl(void)
return debugctl;
}
+static inline bool cpu_has_notify_vmexit(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_NOTIFY_VM_EXITING;
+}
+
#endif /* __KVM_X86_VMX_CAPS_H */
diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c
index 87e3dc10edf4..6a61b1ae7942 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/evmcs.c
@@ -297,8 +297,10 @@ const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1);
#if IS_ENABLED(CONFIG_HYPERV)
__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
{
+ vmcs_conf->cpu_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_EXEC_CTRL;
vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL;
vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
+ vmcs_conf->cpu_based_3rd_exec_ctrl = 0;
vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h
index 8d70f9aea94b..f886a8ff0342 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/evmcs.h
@@ -50,6 +50,7 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs);
*/
#define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \
PIN_BASED_VMX_PREEMPTION_TIMER)
+#define EVMCS1_UNSUPPORTED_EXEC_CTRL (CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
#define EVMCS1_UNSUPPORTED_2NDEXEC \
(SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index ab135f9ef52f..ddd4367d4826 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -311,11 +311,12 @@ static void free_nested(struct kvm_vcpu *vcpu)
vmx->nested.cached_vmcs12 = NULL;
kfree(vmx->nested.cached_shadow_vmcs12);
vmx->nested.cached_shadow_vmcs12 = NULL;
- /* Unpin physical memory we referred to in the vmcs02 */
- if (vmx->nested.apic_access_page) {
- kvm_release_page_clean(vmx->nested.apic_access_page);
- vmx->nested.apic_access_page = NULL;
- }
+ /*
+ * Unpin physical memory we referred to in the vmcs02. The APIC access
+ * page's backing page (yeah, confusing) shouldn't actually be accessed,
+ * and if it is written, the contents are irrelevant.
+ */
+ kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false);
kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
vmx->nested.pi_desc = NULL;
@@ -1223,7 +1224,7 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
/* reserved */
BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
- u64 vmx_basic = vmx->nested.msrs.basic;
+ u64 vmx_basic = vmcs_config.nested.basic;
if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
return -EINVAL;
@@ -1246,36 +1247,42 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
return 0;
}
-static int
-vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
+static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index,
+ u32 **low, u32 **high)
{
- u64 supported;
- u32 *lowp, *highp;
-
switch (msr_index) {
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
- lowp = &vmx->nested.msrs.pinbased_ctls_low;
- highp = &vmx->nested.msrs.pinbased_ctls_high;
+ *low = &msrs->pinbased_ctls_low;
+ *high = &msrs->pinbased_ctls_high;
break;
case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
- lowp = &vmx->nested.msrs.procbased_ctls_low;
- highp = &vmx->nested.msrs.procbased_ctls_high;
+ *low = &msrs->procbased_ctls_low;
+ *high = &msrs->procbased_ctls_high;
break;
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
- lowp = &vmx->nested.msrs.exit_ctls_low;
- highp = &vmx->nested.msrs.exit_ctls_high;
+ *low = &msrs->exit_ctls_low;
+ *high = &msrs->exit_ctls_high;
break;
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
- lowp = &vmx->nested.msrs.entry_ctls_low;
- highp = &vmx->nested.msrs.entry_ctls_high;
+ *low = &msrs->entry_ctls_low;
+ *high = &msrs->entry_ctls_high;
break;
case MSR_IA32_VMX_PROCBASED_CTLS2:
- lowp = &vmx->nested.msrs.secondary_ctls_low;
- highp = &vmx->nested.msrs.secondary_ctls_high;
+ *low = &msrs->secondary_ctls_low;
+ *high = &msrs->secondary_ctls_high;
break;
default:
BUG();
}
+}
+
+static int
+vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
+{
+ u32 *lowp, *highp;
+ u64 supported;
+
+ vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp);
supported = vmx_control_msr(*lowp, *highp);
@@ -1287,6 +1294,7 @@ vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
return -EINVAL;
+ vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp);
*lowp = data;
*highp = data >> 32;
return 0;
@@ -1300,10 +1308,8 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
/* reserved */
GENMASK_ULL(13, 9) | BIT_ULL(31);
- u64 vmx_misc;
-
- vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
- vmx->nested.msrs.misc_high);
+ u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
+ vmcs_config.nested.misc_high);
if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
return -EINVAL;
@@ -1331,10 +1337,8 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
{
- u64 vmx_ept_vpid_cap;
-
- vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
- vmx->nested.msrs.vpid_caps);
+ u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps,
+ vmcs_config.nested.vpid_caps);
/* Every bit is either reserved or a feature bit. */
if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
@@ -1345,20 +1349,21 @@ static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
return 0;
}
-static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
+static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index)
{
- u64 *msr;
-
switch (msr_index) {
case MSR_IA32_VMX_CR0_FIXED0:
- msr = &vmx->nested.msrs.cr0_fixed0;
- break;
+ return &msrs->cr0_fixed0;
case MSR_IA32_VMX_CR4_FIXED0:
- msr = &vmx->nested.msrs.cr4_fixed0;
- break;
+ return &msrs->cr4_fixed0;
default:
BUG();
}
+}
+
+static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
+{
+ const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index);
/*
* 1 bits (which indicates bits which "must-be-1" during VMX operation)
@@ -1367,7 +1372,7 @@ static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
if (!is_bitwise_subset(data, *msr, -1ULL))
return -EINVAL;
- *msr = data;
+ *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data;
return 0;
}
@@ -1428,7 +1433,7 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
vmx->nested.msrs.vmcs_enum = data;
return 0;
case MSR_IA32_VMX_VMFUNC:
- if (data & ~vmx->nested.msrs.vmfunc_controls)
+ if (data & ~vmcs_config.nested.vmfunc_controls)
return -EINVAL;
vmx->nested.msrs.vmfunc_controls = data;
return 0;
@@ -2133,6 +2138,8 @@ static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
{
+ struct kvm *kvm = vmx->vcpu.kvm;
+
/*
* If vmcs02 hasn't been initialized, set the constant vmcs02 state
* according to L0's settings (vmcs12 is irrelevant here). Host
@@ -2175,6 +2182,9 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
if (cpu_has_vmx_encls_vmexit())
vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA);
+ if (kvm_notify_vmexit_enabled(kvm))
+ vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
+
/*
* Set the MSR load/store lists to match L0's settings. Only the
* addresses are constant (for vmcs02), the counts can change based
@@ -2514,11 +2524,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
} else {
kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
- vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
}
if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
- vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
+ vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs);
vmx_set_rflags(vcpu, vmcs12->guest_rflags);
/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
@@ -2547,7 +2557,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmx_get_l2_tsc_multiplier(vcpu));
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
- if (kvm_has_tsc_control)
+ if (kvm_caps.has_tsc_control)
vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
@@ -2613,6 +2623,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
}
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
+ intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) &&
WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
vmcs12->guest_ia32_perf_global_ctrl))) {
*entry_failure_code = ENTRY_FAIL_DEFAULT;
@@ -3158,8 +3169,6 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_host_map *map;
- struct page *page;
- u64 hpa;
if (!vcpu->arch.pdptrs_from_userspace &&
!nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
@@ -3174,23 +3183,12 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
- /*
- * Translate L1 physical address to host physical
- * address for vmcs02. Keep the page pinned, so this
- * physical address remains valid. We keep a reference
- * to it so we can release it later.
- */
- if (vmx->nested.apic_access_page) { /* shouldn't happen */
- kvm_release_page_clean(vmx->nested.apic_access_page);
- vmx->nested.apic_access_page = NULL;
- }
- page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
- if (!is_error_page(page)) {
- vmx->nested.apic_access_page = page;
- hpa = page_to_phys(vmx->nested.apic_access_page);
- vmcs_write64(APIC_ACCESS_ADDR, hpa);
+ map = &vmx->nested.apic_access_page_map;
+
+ if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) {
+ vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn));
} else {
- pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n",
+ pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n",
__func__);
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror =
@@ -3373,11 +3371,13 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
- if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
- vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ if (!vmx->nested.nested_run_pending ||
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
+ vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
if (kvm_mpx_supported() &&
- !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
- vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
+ (!vmx->nested.nested_run_pending ||
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
+ vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
/*
* Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
@@ -4096,8 +4096,6 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
vmcs12->guest_pending_dbg_exceptions =
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
- if (kvm_mpx_supported())
- vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
}
@@ -4336,7 +4334,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
vcpu->arch.pat = vmcs12->host_ia32_pat;
}
- if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+ if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
+ intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)))
WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
vmcs12->host_ia32_perf_global_ctrl));
@@ -4609,7 +4608,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
- if (kvm_has_tsc_control)
+ if (kvm_caps.has_tsc_control)
vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
if (vmx->nested.l1_tpr_threshold != -1)
@@ -4626,10 +4625,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
}
/* Unpin physical memory we referred to in vmcs02 */
- if (vmx->nested.apic_access_page) {
- kvm_release_page_clean(vmx->nested.apic_access_page);
- vmx->nested.apic_access_page = NULL;
- }
+ kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false);
kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
vmx->nested.pi_desc = NULL;
@@ -4828,28 +4824,6 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
return 0;
}
-void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu,
- bool vcpu_has_perf_global_ctrl)
-{
- struct vcpu_vmx *vmx;
-
- if (!nested_vmx_allowed(vcpu))
- return;
-
- vmx = to_vmx(vcpu);
- if (vcpu_has_perf_global_ctrl) {
- vmx->nested.msrs.entry_ctls_high |=
- VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
- vmx->nested.msrs.exit_ctls_high |=
- VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
- } else {
- vmx->nested.msrs.entry_ctls_high &=
- ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
- vmx->nested.msrs.exit_ctls_high &=
- ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
- }
-}
-
static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer,
int *ret)
{
@@ -4952,7 +4926,7 @@ out_vmcs02:
}
/* Emulate the VMXON instruction. */
-static int handle_vmon(struct kvm_vcpu *vcpu)
+static int handle_vmxon(struct kvm_vcpu *vcpu)
{
int ret;
gpa_t vmptr;
@@ -4962,20 +4936,25 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
| FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
/*
- * The Intel VMX Instruction Reference lists a bunch of bits that are
- * prerequisite to running VMXON, most notably cr4.VMXE must be set to
- * 1 (see vmx_is_valid_cr4() for when we allow the guest to set this).
- * Otherwise, we should fail with #UD. But most faulting conditions
- * have already been checked by hardware, prior to the VM-exit for
- * VMXON. We do test guest cr4.VMXE because processor CR4 always has
- * that bit set to 1 in non-root mode.
+ * Note, KVM cannot rely on hardware to perform the CR0/CR4 #UD checks
+ * that have higher priority than VM-Exit (see Intel SDM's pseudocode
+ * for VMXON), as KVM must load valid CR0/CR4 values into hardware while
+ * running the guest, i.e. KVM needs to check the _guest_ values.
+ *
+ * Rely on hardware for the other two pre-VM-Exit checks, !VM86 and
+ * !COMPATIBILITY modes. KVM may run the guest in VM86 to emulate Real
+ * Mode, but KVM will never take the guest out of those modes.
*/
- if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
+ if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) ||
+ !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
- /* CPL=0 must be checked manually. */
+ /*
+ * CPL=0 and all other checks that are lower priority than VM-Exit must
+ * be checked manually.
+ */
if (vmx_get_cpl(vcpu)) {
kvm_inject_gp(vcpu, 0);
return 1;
@@ -5044,7 +5023,7 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
}
/* Emulate the VMXOFF instruction */
-static int handle_vmoff(struct kvm_vcpu *vcpu)
+static int handle_vmxoff(struct kvm_vcpu *vcpu)
{
if (!nested_vmx_check_permission(vcpu))
return 1;
@@ -6111,6 +6090,9 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
case EXIT_REASON_ENCLS:
return nested_vmx_exit_handled_encls(vcpu, vmcs12);
+ case EXIT_REASON_NOTIFY:
+ /* Notify VM exit is not exposed to L1 */
+ return false;
default:
return true;
}
@@ -6775,6 +6757,9 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
+ if (vmx_umip_emulated())
+ msrs->cr4_fixed1 |= X86_CR4_UMIP;
+
msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
}
@@ -6818,8 +6803,8 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
exit_handlers[EXIT_REASON_VMREAD] = handle_vmread;
exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume;
exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite;
- exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff;
- exit_handlers[EXIT_REASON_VMON] = handle_vmon;
+ exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff;
+ exit_handlers[EXIT_REASON_VMON] = handle_vmxon;
exit_handlers[EXIT_REASON_INVEPT] = handle_invept;
exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid;
exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc;
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index c92cea0b8ccc..88b00a7359e4 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -32,8 +32,6 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata);
int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
u32 vmx_instruction_info, bool wr, int len, gva_t *ret);
-void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu,
- bool vcpu_has_perf_global_ctrl);
void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu);
bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
int size);
@@ -281,7 +279,8 @@ static inline bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0;
u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1;
- return fixed_bits_valid(val, fixed0, fixed1);
+ return fixed_bits_valid(val, fixed0, fixed1) &&
+ __kvm_is_valid_cr4(vcpu, val);
}
/* No difference in the restrictions on guest and host CR4 in VMX operation. */
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 37e9eb32e3d9..862c1a4d971b 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -37,23 +37,35 @@ static int fixed_pmc_events[] = {1, 0, 7};
static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
{
+ struct kvm_pmc *pmc;
+ u8 old_fixed_ctr_ctrl = pmu->fixed_ctr_ctrl;
int i;
+ pmu->fixed_ctr_ctrl = data;
for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
u8 new_ctrl = fixed_ctrl_field(data, i);
- u8 old_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, i);
- struct kvm_pmc *pmc;
-
- pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i);
+ u8 old_ctrl = fixed_ctrl_field(old_fixed_ctr_ctrl, i);
if (old_ctrl == new_ctrl)
continue;
+ pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i);
+
__set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use);
- reprogram_fixed_counter(pmc, new_ctrl, i);
+ reprogram_counter(pmc);
}
+}
- pmu->fixed_ctr_ctrl = data;
+static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
+{
+ if (pmc_idx < INTEL_PMC_IDX_FIXED) {
+ return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx,
+ MSR_P6_EVNTSEL0);
+ } else {
+ u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED;
+
+ return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0);
+ }
}
/* This function is called when the global control register has been updated. */
@@ -61,14 +73,18 @@ static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data)
{
int bit;
u64 diff = pmu->global_ctrl ^ data;
+ struct kvm_pmc *pmc;
pmu->global_ctrl = data;
- for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
- reprogram_counter(pmu, bit);
+ for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) {
+ pmc = intel_pmc_idx_to_pmc(pmu, bit);
+ if (pmc)
+ reprogram_counter(pmc);
+ }
}
-static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc)
+static bool intel_hw_event_available(struct kvm_pmc *pmc)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
@@ -82,15 +98,12 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc)
/* disable event that reported as not present by cpuid */
if ((i < 7) && !(pmu->available_event_types & (1 << i)))
- return PERF_COUNT_HW_MAX + 1;
+ return false;
break;
}
- if (i == ARRAY_SIZE(intel_arch_events))
- return PERF_COUNT_HW_MAX;
-
- return intel_arch_events[i].event_type;
+ return true;
}
/* check if a PMC is enabled by comparing it with global_ctrl bits. */
@@ -98,19 +111,10 @@ static bool intel_pmc_is_enabled(struct kvm_pmc *pmc)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
- return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
-}
-
-static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
-{
- if (pmc_idx < INTEL_PMC_IDX_FIXED)
- return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx,
- MSR_P6_EVNTSEL0);
- else {
- u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED;
+ if (!intel_pmu_has_perf_global_ctrl(pmu))
+ return true;
- return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0);
- }
+ return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
}
static bool intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
@@ -167,16 +171,6 @@ static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr)
return get_gp_pmc(pmu, msr, MSR_IA32_PMC0);
}
-bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu)
-{
- /*
- * As a first step, a guest could only enable LBR feature if its
- * cpu model is the same as the host because the LBR registers
- * would be pass-through to the guest and they're model specific.
- */
- return boot_cpu_data.x86_model == guest_cpuid_model(vcpu);
-}
-
bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu)
{
struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu);
@@ -205,6 +199,7 @@ static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index)
static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ u64 perf_capabilities;
int ret;
switch (msr) {
@@ -212,7 +207,18 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
case MSR_CORE_PERF_GLOBAL_STATUS:
case MSR_CORE_PERF_GLOBAL_CTRL:
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- ret = pmu->version > 1;
+ return intel_pmu_has_perf_global_ctrl(pmu);
+ break;
+ case MSR_IA32_PEBS_ENABLE:
+ ret = vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PEBS_FORMAT;
+ break;
+ case MSR_IA32_DS_AREA:
+ ret = guest_cpuid_has(vcpu, X86_FEATURE_DS);
+ break;
+ case MSR_PEBS_DATA_CFG:
+ perf_capabilities = vcpu_get_perf_capabilities(vcpu);
+ ret = (perf_capabilities & PERF_CAP_PEBS_BASELINE) &&
+ ((perf_capabilities & PERF_CAP_PEBS_FORMAT) > 3);
break;
default:
ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
@@ -361,6 +367,15 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
msr_info->data = 0;
return 0;
+ case MSR_IA32_PEBS_ENABLE:
+ msr_info->data = pmu->pebs_enable;
+ return 0;
+ case MSR_IA32_DS_AREA:
+ msr_info->data = pmu->ds_area;
+ return 0;
+ case MSR_PEBS_DATA_CFG:
+ msr_info->data = pmu->pebs_data_cfg;
+ return 0;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
(pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
@@ -395,7 +410,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_CORE_PERF_FIXED_CTR_CTRL:
if (pmu->fixed_ctr_ctrl == data)
return 0;
- if (!(data & 0xfffffffffffff444ull)) {
+ if (!(data & pmu->fixed_ctr_ctrl_mask)) {
reprogram_fixed_counters(pmu, data);
return 0;
}
@@ -421,6 +436,29 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 0;
}
break;
+ case MSR_IA32_PEBS_ENABLE:
+ if (pmu->pebs_enable == data)
+ return 0;
+ if (!(data & pmu->pebs_enable_mask)) {
+ pmu->pebs_enable = data;
+ return 0;
+ }
+ break;
+ case MSR_IA32_DS_AREA:
+ if (msr_info->host_initiated && data && !guest_cpuid_has(vcpu, X86_FEATURE_DS))
+ return 1;
+ if (is_noncanonical_address(data, vcpu))
+ return 1;
+ pmu->ds_area = data;
+ return 0;
+ case MSR_PEBS_DATA_CFG:
+ if (pmu->pebs_data_cfg == data)
+ return 0;
+ if (!(data & pmu->pebs_data_cfg_mask)) {
+ pmu->pebs_data_cfg = data;
+ return 0;
+ }
+ break;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
(pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
@@ -445,7 +483,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
(pmu->raw_event_mask & HSW_IN_TX_CHECKPOINTED))
reserved_bits ^= HSW_IN_TX_CHECKPOINTED;
if (!(data & reserved_bits)) {
- reprogram_gp_counter(pmc, data);
+ pmc->eventsel = data;
+ reprogram_counter(pmc);
return 0;
}
} else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false))
@@ -474,11 +513,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
-
- struct x86_pmu_capability x86_pmu;
struct kvm_cpuid_entry2 *entry;
union cpuid10_eax eax;
union cpuid10_edx edx;
+ u64 perf_capabilities;
+ u64 counter_mask;
+ int i;
pmu->nr_arch_gp_counters = 0;
pmu->nr_arch_fixed_counters = 0;
@@ -487,8 +527,13 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->version = 0;
pmu->reserved_bits = 0xffffffff00200000ull;
pmu->raw_event_mask = X86_RAW_EVENT_MASK;
+ pmu->global_ctrl_mask = ~0ull;
+ pmu->global_ovf_ctrl_mask = ~0ull;
+ pmu->fixed_ctr_ctrl_mask = ~0ull;
+ pmu->pebs_enable_mask = ~0ull;
+ pmu->pebs_data_cfg_mask = ~0ull;
- entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
+ entry = kvm_find_cpuid_entry(vcpu, 0xa);
if (!entry || !vcpu->kvm->arch.enable_pmu)
return;
eax.full = entry->eax;
@@ -498,13 +543,13 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
if (!pmu->version)
return;
- perf_get_x86_pmu_capability(&x86_pmu);
-
pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
- x86_pmu.num_counters_gp);
- eax.split.bit_width = min_t(int, eax.split.bit_width, x86_pmu.bit_width_gp);
+ kvm_pmu_cap.num_counters_gp);
+ eax.split.bit_width = min_t(int, eax.split.bit_width,
+ kvm_pmu_cap.bit_width_gp);
pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1;
- eax.split.mask_length = min_t(int, eax.split.mask_length, x86_pmu.events_mask_len);
+ eax.split.mask_length = min_t(int, eax.split.mask_length,
+ kvm_pmu_cap.events_mask_len);
pmu->available_event_types = ~entry->ebx &
((1ull << eax.split.mask_length) - 1);
@@ -514,17 +559,19 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->nr_arch_fixed_counters =
min3(ARRAY_SIZE(fixed_pmc_events),
(size_t) edx.split.num_counters_fixed,
- (size_t) x86_pmu.num_counters_fixed);
- edx.split.bit_width_fixed = min_t(int,
- edx.split.bit_width_fixed, x86_pmu.bit_width_fixed);
+ (size_t)kvm_pmu_cap.num_counters_fixed);
+ edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed,
+ kvm_pmu_cap.bit_width_fixed);
pmu->counter_bitmask[KVM_PMC_FIXED] =
((u64)1 << edx.split.bit_width_fixed) - 1;
setup_fixed_pmc_eventsel(pmu);
}
- pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) |
- (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
- pmu->global_ctrl_mask = ~pmu->global_ctrl;
+ for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
+ pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4));
+ counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) |
+ (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED));
+ pmu->global_ctrl_mask = counter_mask;
pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask
& ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF |
MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD);
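/*
 * Worked example (illustrative only): with 8 GP counters, 3 fixed
 * counters and INTEL_PMC_IDX_FIXED == 32, counter_mask leaves bits
 * 0-7 and 32-34 clear and every other bit set, so global_ctrl_mask
 * flags every unimplemented IA32_PERF_GLOBAL_CTRL bit as reserved.
 */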
@@ -532,7 +579,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->global_ovf_ctrl_mask &=
~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI;
- entry = kvm_find_cpuid_entry(vcpu, 7, 0);
+ entry = kvm_find_cpuid_entry_index(vcpu, 7, 0);
if (entry &&
(boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
(entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) {
@@ -545,16 +592,29 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
bitmap_set(pmu->all_valid_pmc_idx,
INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters);
- nested_vmx_pmu_refresh(vcpu,
- intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL));
-
- if (intel_pmu_lbr_is_compatible(vcpu))
+ if (cpuid_model_is_consistent(vcpu))
x86_perf_get_lbr(&lbr_desc->records);
else
lbr_desc->records.nr = 0;
if (lbr_desc->records.nr)
bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1);
+
+ perf_capabilities = vcpu_get_perf_capabilities(vcpu);
+ if (perf_capabilities & PERF_CAP_PEBS_FORMAT) {
+ if (perf_capabilities & PERF_CAP_PEBS_BASELINE) {
+ pmu->pebs_enable_mask = counter_mask;
+ pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE;
+ for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+ pmu->fixed_ctr_ctrl_mask &=
+ ~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4));
+ }
+ pmu->pebs_data_cfg_mask = ~0xff00000full;
+ } else {
+ pmu->pebs_enable_mask =
+ ~((1ull << pmu->nr_arch_gp_counters) - 1);
+ }
+ }
}
static void intel_pmu_init(struct kvm_vcpu *vcpu)
@@ -719,8 +779,28 @@ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
intel_pmu_release_guest_lbr_event(vcpu);
}
+void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu)
+{
+ struct kvm_pmc *pmc = NULL;
+ int bit;
+
+ for_each_set_bit(bit, (unsigned long *)&pmu->global_ctrl,
+ X86_PMC_IDX_MAX) {
+ pmc = intel_pmc_idx_to_pmc(pmu, bit);
+
+ if (!pmc || !pmc_speculative_in_use(pmc) ||
+ !intel_pmc_is_enabled(pmc))
+ continue;
+
+ if (pmc->perf_event && pmc->idx != pmc->perf_event->hw.idx) {
+ pmu->host_cross_mapped_mask |=
+ BIT_ULL(pmc->perf_event->hw.idx);
+ }
+ }
+}
+
struct kvm_pmu_ops intel_pmu_ops __initdata = {
- .pmc_perf_hw_id = intel_pmc_perf_hw_id,
+ .hw_event_available = intel_hw_event_available,
.pmc_is_enabled = intel_pmc_is_enabled,
.pmc_idx_to_pmc = intel_pmc_idx_to_pmc,
.rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc,
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 07e5fcf5a5aa..1b56c5e5c9fb 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -34,7 +34,7 @@ static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
return &(to_vmx(vcpu)->pi_desc);
}
-static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new)
+static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new)
{
/*
* PID.ON can be set at any time by a different vCPU or by hardware,
@@ -42,7 +42,7 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new)
* update must be retried with a fresh snapshot if an ON change causes
* the cmpxchg to fail.
*/
- if (cmpxchg64(&pi_desc->control, old, new) != old)
+ if (!try_cmpxchg64(&pi_desc->control, pold, new))
return -EBUSY;
return 0;
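/*
 * Sketch of the semantics relied on here (illustrative, not kernel
 * code; kernel-style u64 assumed): unlike cmpxchg64(), try_cmpxchg64()
 * updates the caller's "old" snapshot when the compare fails, so the
 * retry loops below no longer need to re-read pi_desc->control on
 * every iteration.  With the GCC/Clang builtin, the behavior is
 * roughly:
 */
static inline bool sketch_try_cmpxchg64(u64 *ptr, u64 *old, u64 new)
{
	/* On failure, *old is refreshed with the current value of *ptr. */
	return __atomic_compare_exchange_n(ptr, old, new, false,
					   __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}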
@@ -96,8 +96,9 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
if (!x2apic_mode)
dest = (dest << 8) & 0xFF00;
+ old.control = READ_ONCE(pi_desc->control);
do {
- old.control = new.control = READ_ONCE(pi_desc->control);
+ new.control = old.control;
/*
* Clear SN (as above) and refresh the destination APIC ID to
@@ -111,7 +112,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
* descriptor was modified on "put" to use the wakeup vector.
*/
new.nv = POSTED_INTR_VECTOR;
- } while (pi_try_set_control(pi_desc, old.control, new.control));
+ } while (pi_try_set_control(pi_desc, &old.control, new.control));
local_irq_restore(flags);
@@ -156,12 +157,12 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
WARN(pi_desc->sn, "PI descriptor SN field set before blocking");
+ old.control = READ_ONCE(pi_desc->control);
do {
- old.control = new.control = READ_ONCE(pi_desc->control);
-
/* set 'NV' to 'wakeup vector' */
+ new.control = old.control;
new.nv = POSTED_INTR_WAKEUP_VECTOR;
- } while (pi_try_set_control(pi_desc, old.control, new.control));
+ } while (pi_try_set_control(pi_desc, &old.control, new.control));
/*
* Send a wakeup IPI to this CPU if an interrupt may have been posted
@@ -177,11 +178,24 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
local_irq_restore(flags);
}
+static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu)
+{
+ /*
+ * The default posted interrupt vector does nothing when
+ * invoked outside guest mode. Return whether a blocked vCPU
+ * can be the target of posted interrupts, as is the case when
+ * using either IPI virtualization or VT-d PI, so that the
+ * notification vector is switched to the one that calls
+ * back to the pi_wakeup_handler() function.
+ */
+ return vmx_can_use_ipiv(vcpu) || vmx_can_use_vtd_pi(vcpu->kvm);
+}
+
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- if (!vmx_can_use_vtd_pi(vcpu->kvm))
+ if (!vmx_needs_pi_wakeup(vcpu))
return;
if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu))
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index 9a45d5c9f116..26992076552e 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -5,6 +5,8 @@
#define POSTED_INTR_ON 0
#define POSTED_INTR_SN 1
+#define PID_TABLE_ENTRY_VALID 1
+
/* Posted-Interrupt Descriptor */
struct pi_desc {
u32 pir[8]; /* Posted interrupt requested */
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index 35e7ec91ae86..aba8cebdc587 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -79,7 +79,7 @@ static int sgx_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t gva, bool write,
else
*gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, &ex);
- if (*gpa == UNMAPPED_GVA) {
+ if (*gpa == INVALID_GPA) {
kvm_inject_emulated_page_fault(vcpu, &ex);
return -EFAULT;
}
@@ -148,8 +148,8 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu,
u8 max_size_log2;
int trapnr, ret;
- sgx_12_0 = kvm_find_cpuid_entry(vcpu, 0x12, 0);
- sgx_12_1 = kvm_find_cpuid_entry(vcpu, 0x12, 1);
+ sgx_12_0 = kvm_find_cpuid_entry_index(vcpu, 0x12, 0);
+ sgx_12_1 = kvm_find_cpuid_entry_index(vcpu, 0x12, 1);
if (!sgx_12_0 || !sgx_12_1) {
kvm_prepare_emulation_failure_exit(vcpu);
return 0;
@@ -431,7 +431,7 @@ static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu)
if (!vcpu->kvm->arch.sgx_provisioning_allowed)
return true;
- guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 0);
+ guest_cpuid = kvm_find_cpuid_entry_index(vcpu, 0x12, 0);
if (!guest_cpuid)
return true;
@@ -439,7 +439,7 @@ static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu)
if (guest_cpuid->ebx != ebx || guest_cpuid->edx != edx)
return true;
- guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 1);
+ guest_cpuid = kvm_find_cpuid_entry_index(vcpu, 0x12, 1);
if (!guest_cpuid)
return true;
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 2b9d7a7e83f7..ac290a44a693 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -50,6 +50,7 @@ struct vmcs_controls_shadow {
u32 pin;
u32 exec;
u32 secondary_exec;
+ u64 tertiary_exec;
};
/*
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index be7c19374fdd..d7f8331d6f7e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -105,6 +105,9 @@ module_param(fasteoi, bool, S_IRUGO);
module_param(enable_apicv, bool, S_IRUGO);
+bool __read_mostly enable_ipiv = true;
+module_param(enable_ipiv, bool, 0444);
+
/*
* If nested=1, nested virtualization is supported, i.e., guests may use
* VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -116,6 +119,9 @@ module_param(nested, bool, S_IRUGO);
bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, S_IRUGO);
+static bool __read_mostly error_on_inconsistent_vmcs_config = true;
+module_param(error_on_inconsistent_vmcs_config, bool, 0444);
+
static bool __read_mostly dump_invalid_vmcs = 0;
module_param(dump_invalid_vmcs, bool, 0644);
@@ -443,18 +449,20 @@ asmlinkage void vmread_error(unsigned long field, bool fault)
noinline void vmwrite_error(unsigned long field, unsigned long value)
{
- vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n",
+ vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%u\n",
field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
}
noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
{
- vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr);
+ vmx_insn_failed("kvm: vmclear failed: %p/%llx err=%u\n",
+ vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}
noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
{
- vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr);
+ vmx_insn_failed("kvm: vmptrld failed: %p/%llx err=%u\n",
+ vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}
noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
@@ -1787,7 +1795,7 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
return vmcs12->tsc_multiplier;
- return kvm_default_tsc_scaling_ratio;
+ return kvm_caps.default_tsc_scaling_ratio;
}
static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
@@ -2111,6 +2119,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
(data & MSR_IA32_BNDCFGS_RSVD))
return 1;
+
+ if (is_guest_mode(vcpu) &&
+ ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) ||
+ (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS)))
+ get_vmcs12(vcpu)->guest_bndcfgs = data;
+
vmcs_write64(GUEST_BNDCFGS, data);
break;
case MSR_IA32_UMWAIT_CONTROL:
@@ -2312,7 +2326,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if ((data & PMU_CAP_LBR_FMT) !=
(vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT))
return 1;
- if (!intel_pmu_lbr_is_compatible(vcpu))
+ if (!cpuid_model_is_consistent(vcpu))
+ return 1;
+ }
+ if (data & PERF_CAP_PEBS_FORMAT) {
+ if ((data & PERF_CAP_PEBS_MASK) !=
+ (vmx_get_perf_capabilities() & PERF_CAP_PEBS_MASK))
+ return 1;
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_DS))
+ return 1;
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64))
+ return 1;
+ if (!cpuid_model_is_consistent(vcpu))
return 1;
}
ret = kvm_set_msr_common(vcpu, msr_info);
@@ -2489,6 +2514,15 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
return 0;
}
+static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
+{
+ u64 allowed;
+
+ rdmsrl(msr, allowed);
+
+ return ctl_opt & allowed;
+}
+
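/*
 * Note on the helper above (illustrative): the tertiary controls MSR
 * (IA32_VMX_PROCBASED_CTLS3) only reports allowed-1 settings across
 * its 64 bits, so unlike adjust_vmx_controls() there is no low/high
 * (allowed-0 vs. allowed-1) split to reconcile and a plain AND with
 * the desired optional controls is sufficient.
 */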
static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
struct vmx_capability *vmx_cap)
{
@@ -2497,8 +2531,26 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
u32 _pin_based_exec_control = 0;
u32 _cpu_based_exec_control = 0;
u32 _cpu_based_2nd_exec_control = 0;
+ u64 _cpu_based_3rd_exec_control = 0;
u32 _vmexit_control = 0;
u32 _vmentry_control = 0;
+ int i;
+
+ /*
+ * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory.
+ * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always
+ * intercepts writes to PAT and EFER, i.e. never enables those controls.
+ */
+ struct {
+ u32 entry_control;
+ u32 exit_control;
+ } const vmcs_entry_exit_pairs[] = {
+ { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL },
+ { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT },
+ { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER },
+ { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS },
+ { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL },
+ };
memset(vmcs_conf, 0, sizeof(*vmcs_conf));
min = CPU_BASED_HLT_EXITING |
@@ -2518,7 +2570,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
opt = CPU_BASED_TPR_SHADOW |
CPU_BASED_USE_MSR_BITMAPS |
- CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS |
+ CPU_BASED_ACTIVATE_TERTIARY_CONTROLS;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
&_cpu_based_exec_control) < 0)
return -EIO;
@@ -2551,7 +2604,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
SECONDARY_EXEC_PT_USE_GPA |
SECONDARY_EXEC_PT_CONCEAL_VMX |
SECONDARY_EXEC_ENABLE_VMFUNC |
- SECONDARY_EXEC_BUS_LOCK_DETECTION;
+ SECONDARY_EXEC_BUS_LOCK_DETECTION |
+ SECONDARY_EXEC_NOTIFY_VM_EXITING;
if (cpu_has_sgx())
opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
if (adjust_vmx_controls(min2, opt2,
@@ -2581,15 +2635,30 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_INVLPG_EXITING);
} else if (vmx_cap->ept) {
- vmx_cap->ept = 0;
pr_warn_once("EPT CAP should not exist if not support "
"1-setting enable EPT VM-execution control\n");
+
+ if (error_on_inconsistent_vmcs_config)
+ return -EIO;
+
+ vmx_cap->ept = 0;
}
if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
- vmx_cap->vpid) {
- vmx_cap->vpid = 0;
+ vmx_cap->vpid) {
pr_warn_once("VPID CAP should not exist if not support "
"1-setting enable VPID VM-execution control\n");
+
+ if (error_on_inconsistent_vmcs_config)
+ return -EIO;
+
+ vmx_cap->vpid = 0;
+ }
+
+ if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) {
+ u64 opt3 = TERTIARY_EXEC_IPI_VIRT;
+
+ _cpu_based_3rd_exec_control = adjust_vmx_controls64(opt3,
+ MSR_IA32_VMX_PROCBASED_CTLS3);
}
min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
@@ -2630,6 +2699,23 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
&_vmentry_control) < 0)
return -EIO;
+ for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) {
+ u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control;
+ u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control;
+
+ if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl))
+ continue;
+
+ pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n",
+ _vmentry_control & n_ctrl, _vmexit_control & x_ctrl);
+
+ if (error_on_inconsistent_vmcs_config)
+ return -EIO;
+
+ _vmentry_control &= ~n_ctrl;
+ _vmexit_control &= ~x_ctrl;
+ }
+
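The loop above requires each VM-Entry "load" control and its VM-Exit counterpart to be either both set or both clear; an inconsistent pair is reported and dropped (or treated as fatal when error_on_inconsistent_vmcs_config is set). A standalone sketch of the boolean check, using illustrative bit positions rather than the real VMCS encodings:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative bit positions only. */
#define ENTRY_LOAD_PAT (1u << 14)
#define EXIT_LOAD_PAT  (1u << 19)

static bool pair_is_consistent(unsigned int entry_ctls, unsigned int entry_bit,
                               unsigned int exit_ctls, unsigned int exit_bit)
{
        /* "!" normalizes each side to 0/1, so set/set and clear/clear match. */
        return !(entry_ctls & entry_bit) == !(exit_ctls & exit_bit);
}

int main(void)
{
        /* Entry control set, exit counterpart clear: inconsistent, prints 0. */
        printf("%d\n", pair_is_consistent(ENTRY_LOAD_PAT, ENTRY_LOAD_PAT,
                                          0, EXIT_LOAD_PAT));
        return 0;
}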
/*
* Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
* can't be used due to an errata where VM Exit may incorrectly clear
@@ -2678,6 +2764,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
+ vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control;
vmcs_conf->vmexit_ctrl = _vmexit_control;
vmcs_conf->vmentry_ctrl = _vmentry_control;
@@ -3230,8 +3317,8 @@ static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
/*
* We operate under the default treatment of SMM, so VMX cannot be
- * enabled under SMM. Note, whether or not VMXE is allowed at all is
- * handled by kvm_is_valid_cr4().
+ * enabled under SMM. Note, whether or not VMXE is allowed at all,
+ * i.e. is a reserved bit, is handled by common x86 code.
*/
if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))
return false;
@@ -3702,7 +3789,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
}
/* Set up identity-mapping pagetable for EPT in real mode */
- for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
+ for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) {
tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) {
@@ -3932,6 +4019,8 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
+ if (enable_ipiv)
+ vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW);
}
}
@@ -3977,20 +4066,26 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
u32 i;
/*
- * Set intercept permissions for all potentially passed through MSRs
- * again. They will automatically get filtered through the MSR filter,
- * so we are back in sync after this.
+ * Redo intercept permissions for MSRs that KVM is passing through to
+ * the guest. Disabling interception will check the new MSR filter and
+ * ensure that KVM enables interception if userspace wants to filter
+ * the MSR. MSRs that KVM is already intercepting don't need to be
+ * refreshed since KVM is going to intercept them regardless of what
+ * userspace wants.
*/
for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
u32 msr = vmx_possible_passthrough_msrs[i];
- bool read = test_bit(i, vmx->shadow_msr_intercept.read);
- bool write = test_bit(i, vmx->shadow_msr_intercept.write);
- vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, read);
- vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, write);
+ if (!test_bit(i, vmx->shadow_msr_intercept.read))
+ vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R);
+
+ if (!test_bit(i, vmx->shadow_msr_intercept.write))
+ vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W);
}
- pt_update_intercept_for_msr(vcpu);
+ /* PT MSRs can be passed through iff PT is exposed to the guest. */
+ if (vmx_pt_mode_is_host_guest())
+ pt_update_intercept_for_msr(vcpu);
}
static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
@@ -4085,7 +4180,8 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
if (!r)
return 0;
- if (!vcpu->arch.apicv_active)
+ /* Note, this is called iff the local APIC is in-kernel. */
+ if (!vcpu->arch.apic->apicv_active)
return -1;
if (pi_test_and_set_pir(vector, &vmx->pi_desc))
@@ -4259,15 +4355,19 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
}
pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
- if (cpu_has_secondary_exec_ctrls()) {
- if (kvm_vcpu_apicv_active(vcpu))
- secondary_exec_controls_setbit(vmx,
- SECONDARY_EXEC_APIC_REGISTER_VIRT |
- SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
- else
- secondary_exec_controls_clearbit(vmx,
- SECONDARY_EXEC_APIC_REGISTER_VIRT |
- SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+
+ if (kvm_vcpu_apicv_active(vcpu)) {
+ secondary_exec_controls_setbit(vmx,
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+ if (enable_ipiv)
+ tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT);
+ } else {
+ secondary_exec_controls_clearbit(vmx,
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+ if (enable_ipiv)
+ tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT);
}
vmx_update_msr_bitmap_x2apic(vcpu);
@@ -4299,6 +4399,20 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
return exec_control;
}
+static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx)
+{
+ u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl;
+
+ /*
+ * IPI virtualization relies on APICv. Disable IPI virtualization if
+ * APICv is inhibited.
+ */
+ if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu))
+ exec_control &= ~TERTIARY_EXEC_IPI_VIRT;
+
+ return exec_control;
+}
+
/*
* Adjust a single secondary execution control bit to intercept/allow an
* instruction in the guest. This is usually done based on whether or not a
@@ -4441,13 +4555,48 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
if (!vcpu->kvm->arch.bus_lock_detection_enabled)
exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
+ if (!kvm_notify_vmexit_enabled(vcpu->kvm))
+ exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING;
+
return exec_control;
}
+static inline int vmx_get_pid_table_order(struct kvm *kvm)
+{
+ return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table));
+}
+
+static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
+{
+ struct page *pages;
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
+
+ if (!irqchip_in_kernel(kvm) || !enable_ipiv)
+ return 0;
+
+ if (kvm_vmx->pid_table)
+ return 0;
+
+ pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, vmx_get_pid_table_order(kvm));
+ if (!pages)
+ return -ENOMEM;
+
+ kvm_vmx->pid_table = (void *)page_address(pages);
+ return 0;
+}
+
+static int vmx_vcpu_precreate(struct kvm *kvm)
+{
+ return vmx_alloc_ipiv_pid_table(kvm);
+}
+
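The sizing above allocates one 8-byte PID (posted-interrupt descriptor) pointer per possible vCPU ID and rounds up to a power-of-two number of pages, which is what get_order() computes. A rough userspace sketch of the same math, assuming 4 KiB pages:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096u

static unsigned int pid_table_order(unsigned int max_vcpu_ids)
{
        uint64_t bytes = (uint64_t)max_vcpu_ids * sizeof(uint64_t);
        unsigned int order = 0;

        /* Smallest order such that 2^order pages cover the table. */
        while (((uint64_t)PAGE_SIZE << order) < bytes)
                order++;
        return order;
}

int main(void)
{
        printf("%u\n", pid_table_order(1024));  /* 8 KiB table -> order 1 */
        return 0;
}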
#define VMX_XSS_EXIT_BITMAP 0
static void init_vmcs(struct vcpu_vmx *vmx)
{
+ struct kvm *kvm = vmx->vcpu.kvm;
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
+
if (nested)
nested_vmx_set_vmcs_shadowing_bitmap();
@@ -4464,6 +4613,9 @@ static void init_vmcs(struct vcpu_vmx *vmx)
if (cpu_has_secondary_exec_ctrls())
secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
+ if (cpu_has_tertiary_exec_ctrls())
+ tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
+
if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) {
vmcs_write64(EOI_EXIT_BITMAP0, 0);
vmcs_write64(EOI_EXIT_BITMAP1, 0);
@@ -4476,12 +4628,20 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
}
- if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
+ if (vmx_can_use_ipiv(&vmx->vcpu)) {
+ vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table));
+ vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1);
+ }
+
+ if (!kvm_pause_in_guest(kvm)) {
vmcs_write32(PLE_GAP, ple_gap);
vmx->ple_window = ple_window;
vmx->ple_window_dirty = true;
}
+ if (kvm_notify_vmexit_enabled(kvm))
+ vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
+
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
@@ -4652,13 +4812,13 @@ static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
}
-static void vmx_inject_irq(struct kvm_vcpu *vcpu)
+static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
uint32_t intr;
int irq = vcpu->arch.interrupt.nr;
- trace_kvm_inj_virq(irq);
+ trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected);
++vcpu->stat.irq_injections;
if (vmx->rmode.vm86_active) {
@@ -5770,6 +5930,32 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_notify(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qual = vmx_get_exit_qual(vcpu);
+ bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID;
+
+ ++vcpu->stat.notify_window_exits;
+
+ /*
+ * Notify VM exit happened while executing iret from NMI; the
+ * "blocked by NMI" bit has to be set before the next VM entry.
+ */
+ if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI))
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+
+ if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER ||
+ context_invalid) {
+ vcpu->run->exit_reason = KVM_EXIT_NOTIFY;
+ vcpu->run->notify.flags = context_invalid ?
+ KVM_NOTIFY_CONTEXT_INVALID : 0;
+ return 0;
+ }
+
+ return 1;
+}
+
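On the userspace side, a notify exit surfaces as KVM_EXIT_NOTIFY with the context-invalid status reported in kvm_run. A rough sketch of how a VMM run loop might react, assuming kernel headers that carry the KVM_EXIT_NOTIFY, kvm_run->notify and KVM_NOTIFY_CONTEXT_INVALID definitions added by this series:

#include <stdio.h>
#include <linux/kvm.h>

static int handle_exit_notify(struct kvm_run *run)
{
        if (run->notify.flags & KVM_NOTIFY_CONTEXT_INVALID) {
                /* VM context is corrupted; the guest can't safely continue. */
                fprintf(stderr, "notify exit: invalid context, stopping guest\n");
                return -1;
        }

        /* Otherwise purely informational: the notify window elapsed. */
        fprintf(stderr, "notify window exit\n");
        return 0;
}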
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -5827,6 +6013,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
[EXIT_REASON_ENCLS] = handle_encls,
[EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit,
+ [EXIT_REASON_NOTIFY] = handle_notify,
};
static const int kvm_vmx_max_exit_handlers =
@@ -5924,6 +6111,7 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 vmentry_ctl, vmexit_ctl;
u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
+ u64 tertiary_exec_control;
unsigned long cr4;
int efer_slot;
@@ -5937,9 +6125,16 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
cr4 = vmcs_readl(GUEST_CR4);
- secondary_exec_control = 0;
+
if (cpu_has_secondary_exec_ctrls())
secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ else
+ secondary_exec_control = 0;
+
+ if (cpu_has_tertiary_exec_ctrls())
+ tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL);
+ else
+ tertiary_exec_control = 0;
pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
@@ -6039,9 +6234,10 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
pr_err("*** Control State ***\n");
- pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
- pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
- pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
+ pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n",
+ cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control);
+ pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n",
+ pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl);
pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
vmcs_read32(EXCEPTION_BITMAP),
vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
@@ -6191,7 +6387,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
exit_reason.basic != EXIT_REASON_PML_FULL &&
exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
- exit_reason.basic != EXIT_REASON_TASK_SWITCH)) {
+ exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
+ exit_reason.basic != EXIT_REASON_NOTIFY)) {
int ndata = 3;
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -6453,7 +6650,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
put_page(page);
}
-static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+static void vmx_hwapic_isr_update(int max_isr)
{
u16 status;
u8 old;
@@ -6783,9 +6980,14 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
{
int i, nr_msrs;
struct perf_guest_switch_msr *msrs;
+ struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu);
+
+ pmu->host_cross_mapped_mask = 0;
+ if (pmu->pebs_enable & pmu->global_ctrl)
+ intel_pmu_cross_mapped_check(pmu);
/* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */
- msrs = perf_guest_get_msrs(&nr_msrs);
+ msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu);
if (!msrs)
return;
@@ -7166,6 +7368,10 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
goto free_vmcs;
}
+ if (vmx_can_use_ipiv(vcpu))
+ WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
+ __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID);
+
return 0;
free_vmcs:
@@ -7234,7 +7440,7 @@ static int __init vmx_check_processor_compat(void)
return 0;
}
-static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
u8 cache;
@@ -7310,7 +7516,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \
} while (0)
- entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+ entry = kvm_find_cpuid_entry(vcpu, 0x1);
cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME));
cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME));
cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC));
@@ -7326,7 +7532,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID));
cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE));
- entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
+ entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0);
cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE));
cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP));
cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP));
@@ -7337,23 +7543,6 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
#undef cr4_fixed1_update
}
-static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (kvm_mpx_supported()) {
- bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);
-
- if (mpx_enabled) {
- vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
- vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
- } else {
- vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
- vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
- }
- }
-}
-
static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7361,7 +7550,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
int i;
for (i = 0; i < PT_CPUID_LEAVES; i++) {
- best = kvm_find_cpuid_entry(vcpu, 0x14, i);
+ best = kvm_find_cpuid_entry_index(vcpu, 0x14, i);
if (!best)
return;
vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
@@ -7445,10 +7634,8 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
- if (nested_vmx_allowed(vcpu)) {
+ if (nested_vmx_allowed(vcpu))
nested_vmx_cr_fixed1_bits_update(vcpu);
- nested_vmx_entry_exit_ctls_update(vcpu);
- }
if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
@@ -7502,6 +7689,13 @@ static __init void vmx_set_cpu_caps(void)
kvm_cpu_cap_clear(X86_FEATURE_INVPCID);
if (vmx_pt_mode_is_host_guest())
kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
+ if (vmx_pebs_supported()) {
+ kvm_cpu_cap_check_and_set(X86_FEATURE_DS);
+ kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64);
+ }
+
+ if (!enable_pmu)
+ kvm_cpu_cap_clear(X86_FEATURE_PDCM);
if (!enable_sgx) {
kvm_cpu_cap_clear(X86_FEATURE_SGX);
@@ -7514,7 +7708,7 @@ static __init void vmx_set_cpu_caps(void)
kvm_cpu_cap_set(X86_FEATURE_UMIP);
/* CPUID 0xD.1 */
- supported_xss = 0;
+ kvm_caps.supported_xss = 0;
if (!cpu_has_vmx_xsaves())
kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
@@ -7655,9 +7849,9 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
delta_tsc = 0;
/* Convert to host delta tsc if tsc scaling is enabled */
- if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
+ if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio &&
delta_tsc && u64_shl_div_u64(delta_tsc,
- kvm_tsc_scaling_ratio_frac_bits,
+ kvm_caps.tsc_scaling_ratio_frac_bits,
vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc))
return -ERANGE;
@@ -7729,6 +7923,13 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ /*
+ * TODO: Implement custom flows for forcing the vCPU out/in of L2 on
+ * SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong, as
+ * SMI and RSM only modify state that is saved and restored via SMRAM.
+ * E.g. most MSRs are left untouched, but many are modified by VM-Exit
+ * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM.
+ */
vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
if (vmx->nested.smm.guest_mode)
nested_vmx_vmexit(vcpu, -1, 0, 0);
@@ -7802,6 +8003,13 @@ static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
return supported & BIT(reason);
}
+static void vmx_vm_destroy(struct kvm *kvm)
+{
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
+
+ free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
+}
+
static struct kvm_x86_ops vmx_x86_ops __initdata = {
.name = "kvm_intel",
@@ -7813,7 +8021,9 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.vm_size = sizeof(struct kvm_vmx),
.vm_init = vmx_vm_init,
+ .vm_destroy = vmx_vm_destroy,
+ .vcpu_precreate = vmx_vcpu_precreate,
.vcpu_create = vmx_vcpu_create,
.vcpu_free = vmx_vcpu_free,
.vcpu_reset = vmx_vcpu_reset,
@@ -8027,8 +8237,8 @@ static __init int hardware_setup(void)
}
if (!cpu_has_vmx_mpx())
- supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
- XFEATURE_MASK_BNDCSR);
+ kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
+ XFEATURE_MASK_BNDCSR);
if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
!(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
@@ -8091,12 +8301,16 @@ static __init int hardware_setup(void)
if (!enable_apicv)
vmx_x86_ops.sync_pir_to_irr = NULL;
+ if (!enable_apicv || !cpu_has_vmx_ipiv())
+ enable_ipiv = false;
+
if (cpu_has_vmx_tsc_scaling())
- kvm_has_tsc_control = true;
+ kvm_caps.has_tsc_control = true;
- kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
- kvm_tsc_scaling_ratio_frac_bits = 48;
- kvm_has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
+ kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
+ kvm_caps.tsc_scaling_ratio_frac_bits = 48;
+ kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
+ kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit();
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
@@ -8153,11 +8367,12 @@ static __init int hardware_setup(void)
vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
}
- kvm_mce_cap_supported |= MCG_LMCE_P;
+ kvm_caps.supported_mce_cap |= MCG_LMCE_P;
+ kvm_caps.supported_mce_cap |= MCG_CMCI_P;
if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
return -EINVAL;
- if (!enable_ept || !cpu_has_vmx_intel_pt())
+ if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt())
pt_mode = PT_MODE_SYSTEM;
if (pt_mode == PT_MODE_HOST_GUEST)
vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 1e7f9453894b..fb8e3480a9d7 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -92,10 +92,22 @@ union vmx_exit_reason {
u32 full;
};
+static inline bool intel_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu)
+{
+ /*
+ * Architecturally, Intel's SDM states that IA32_PERF_GLOBAL_CTRL is
+ * supported if "CPUID.0AH: EAX[7:0] > 0", i.e. if the PMU version is
+ * greater than zero. However, KVM only exposes and emulates the MSR
+ * to/for the guest if the guest PMU supports at least "Architectural
+ * Performance Monitoring Version 2".
+ */
+ return pmu->version > 1;
+}
+
#define vcpu_to_lbr_desc(vcpu) (&to_vmx(vcpu)->lbr_desc)
#define vcpu_to_lbr_records(vcpu) (&to_vmx(vcpu)->lbr_desc.records)
-bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu);
+void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu);
bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu);
int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu);
@@ -205,7 +217,7 @@ struct nested_vmx {
* Guest pages referred to in the vmcs02 with host-physical
* pointers, so we must keep them pinned while L2 runs.
*/
- struct page *apic_access_page;
+ struct kvm_host_map apic_access_page_map;
struct kvm_host_map virtual_apic_map;
struct kvm_host_map pi_desc_map;
@@ -220,9 +232,18 @@ struct nested_vmx {
bool has_preemption_timer_deadline;
bool preemption_timer_expired;
- /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
- u64 vmcs01_debugctl;
- u64 vmcs01_guest_bndcfgs;
+ /*
+ * Used to snapshot MSRs that are conditionally loaded on VM-Enter in
+ * order to propagate the guest's pre-VM-Enter value into vmcs02. For
+ * emulation of VMLAUNCH/VMRESUME, the snapshot will be of L1's value.
+ * For KVM_SET_NESTED_STATE, the snapshot is of L2's value, _if_
+ * userspace restores MSRs before nested state. If userspace restores
+ * MSRs after nested state, the snapshot holds garbage, but KVM can't
+ * detect that, and the garbage value in vmcs02 will be overwritten by
+ * MSR restoration in any case.
+ */
+ u64 pre_vmenter_debugctl;
+ u64 pre_vmenter_bndcfgs;
/* to migrate it to L1 if L2 writes to L1's CR8 directly */
int l1_tpr_threshold;
@@ -369,6 +390,8 @@ struct kvm_vmx {
unsigned int tss_addr;
bool ept_identity_pagetable_done;
gpa_t ept_identity_map_addr;
+ /* Posted Interrupt Descriptor (PID) table for IPI virtualization */
+ u64 *pid_table;
};
bool nested_vmx_allowed(struct kvm_vcpu *vcpu);
@@ -462,35 +485,36 @@ static inline u8 vmx_get_rvi(void)
return vmcs_read16(GUEST_INTR_STATUS) & 0xff;
}
-#define BUILD_CONTROLS_SHADOW(lname, uname) \
-static inline void lname##_controls_set(struct vcpu_vmx *vmx, u32 val) \
-{ \
- if (vmx->loaded_vmcs->controls_shadow.lname != val) { \
- vmcs_write32(uname, val); \
- vmx->loaded_vmcs->controls_shadow.lname = val; \
- } \
-} \
-static inline u32 __##lname##_controls_get(struct loaded_vmcs *vmcs) \
-{ \
- return vmcs->controls_shadow.lname; \
-} \
-static inline u32 lname##_controls_get(struct vcpu_vmx *vmx) \
-{ \
- return __##lname##_controls_get(vmx->loaded_vmcs); \
-} \
-static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u32 val) \
-{ \
- lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \
-} \
-static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u32 val) \
-{ \
- lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \
+#define BUILD_CONTROLS_SHADOW(lname, uname, bits) \
+static inline void lname##_controls_set(struct vcpu_vmx *vmx, u##bits val) \
+{ \
+ if (vmx->loaded_vmcs->controls_shadow.lname != val) { \
+ vmcs_write##bits(uname, val); \
+ vmx->loaded_vmcs->controls_shadow.lname = val; \
+ } \
+} \
+static inline u##bits __##lname##_controls_get(struct loaded_vmcs *vmcs) \
+{ \
+ return vmcs->controls_shadow.lname; \
+} \
+static inline u##bits lname##_controls_get(struct vcpu_vmx *vmx) \
+{ \
+ return __##lname##_controls_get(vmx->loaded_vmcs); \
+} \
+static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u##bits val) \
+{ \
+ lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \
+} \
+static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u##bits val) \
+{ \
+ lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \
}
-BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS)
-BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS)
-BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL)
-BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL)
-BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL)
+BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS, 32)
+BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS, 32)
+BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL, 32)
+BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL, 32)
+BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL, 32)
+BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64)
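For the 64-bit case added here, the macro instantiation expands to roughly the following (a hand-expanded sketch of the generated helpers, shown for the setter pair only):

static inline void tertiary_exec_controls_set(struct vcpu_vmx *vmx, u64 val)
{
        if (vmx->loaded_vmcs->controls_shadow.tertiary_exec != val) {
                vmcs_write64(TERTIARY_VM_EXEC_CONTROL, val);
                vmx->loaded_vmcs->controls_shadow.tertiary_exec = val;
        }
}

static inline void tertiary_exec_controls_setbit(struct vcpu_vmx *vmx, u64 val)
{
        tertiary_exec_controls_set(vmx, tertiary_exec_controls_get(vmx) | val);
}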
/*
* VMX_REGS_LAZY_LOAD_SET - The set of registers that will be updated in the
@@ -586,4 +610,9 @@ static inline int vmx_get_instr_info_reg2(u32 vmx_instr_info)
return (vmx_instr_info >> 28) & 0xf;
}
+static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu)
+{
+ return lapic_in_kernel(vcpu) && enable_ipiv;
+}
+
#endif /* __KVM_X86_VMX_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e5fa335a4ea7..33560bfa0cac 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -87,8 +87,11 @@
#define MAX_IO_MSRS 256
#define KVM_MAX_MCE_BANKS 32
-u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
-EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
+
+struct kvm_caps kvm_caps __read_mostly = {
+ .supported_mce_cap = MCG_CTL_P | MCG_SER_P,
+};
+EXPORT_SYMBOL_GPL(kvm_caps);
#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e))
@@ -151,19 +154,6 @@ module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
static bool __read_mostly kvmclock_periodic_sync = true;
module_param(kvmclock_periodic_sync, bool, S_IRUGO);
-bool __read_mostly kvm_has_tsc_control;
-EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
-u32 __read_mostly kvm_max_guest_tsc_khz;
-EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
-u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits;
-EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
-u64 __read_mostly kvm_max_tsc_scaling_ratio;
-EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
-u64 __read_mostly kvm_default_tsc_scaling_ratio;
-EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
-bool __read_mostly kvm_has_bus_lock_exit;
-EXPORT_SYMBOL_GPL(kvm_has_bus_lock_exit);
-
/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
static u32 __read_mostly tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
@@ -235,8 +225,6 @@ EXPORT_SYMBOL_GPL(enable_apicv);
u64 __read_mostly host_xss;
EXPORT_SYMBOL_GPL(host_xss);
-u64 __read_mostly supported_xss;
-EXPORT_SYMBOL_GPL(supported_xss);
const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
KVM_GENERIC_VM_STATS(),
@@ -298,7 +286,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
STATS_DESC_COUNTER(VCPU, directed_yield_successful),
STATS_DESC_COUNTER(VCPU, preemption_reported),
STATS_DESC_COUNTER(VCPU, preemption_other),
- STATS_DESC_IBOOLEAN(VCPU, guest_mode)
+ STATS_DESC_IBOOLEAN(VCPU, guest_mode),
+ STATS_DESC_COUNTER(VCPU, notify_window_exits),
};
const struct kvm_stats_header kvm_vcpu_stats_header = {
@@ -311,8 +300,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = {
};
u64 __read_mostly host_xcr0;
-u64 __read_mostly supported_xcr0;
-EXPORT_SYMBOL_GPL(supported_xcr0);
static struct kmem_cache *x86_emulator_cache;
@@ -862,7 +849,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
*/
real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn),
PFERR_USER_MASK | PFERR_WRITE_MASK, NULL);
- if (real_gpa == UNMAPPED_GVA)
+ if (real_gpa == INVALID_GPA)
return 0;
/* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */
@@ -1094,7 +1081,7 @@ int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv);
-bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
if (cr4 & cr4_reserved_bits)
return false;
@@ -1102,9 +1089,15 @@ bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
return false;
- return static_call(kvm_x86_is_valid_cr4)(vcpu, cr4);
+ return true;
+}
+EXPORT_SYMBOL_GPL(__kvm_is_valid_cr4);
+
+static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ return __kvm_is_valid_cr4(vcpu, cr4) &&
+ static_call(kvm_x86_is_valid_cr4)(vcpu, cr4);
}
-EXPORT_SYMBOL_GPL(kvm_is_valid_cr4);
void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
{
@@ -1450,6 +1443,7 @@ static const u32 msrs_to_save_all[] = {
MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
+ MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
@@ -2051,13 +2045,6 @@ int kvm_emulate_invd(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_emulate_invd);
-int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
-{
- pr_warn_once("kvm: MWAIT instruction emulated as NOP!\n");
- return kvm_emulate_as_nop(vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_mwait);
-
int kvm_handle_invalid_op(struct kvm_vcpu *vcpu)
{
kvm_queue_exception(vcpu, UD_VECTOR);
@@ -2065,11 +2052,26 @@ int kvm_handle_invalid_op(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_handle_invalid_op);
-int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
+
+static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn)
{
- pr_warn_once("kvm: MONITOR instruction emulated as NOP!\n");
+ if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT))
+ return kvm_handle_invalid_op(vcpu);
+
+ pr_warn_once("kvm: %s instruction emulated as NOP!\n", insn);
return kvm_emulate_as_nop(vcpu);
}
+int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_monitor_mwait(vcpu, "MWAIT");
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_mwait);
+
+int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_monitor_mwait(vcpu, "MONITOR");
+}
EXPORT_SYMBOL_GPL(kvm_emulate_monitor);
static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
@@ -2349,12 +2351,12 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
/* Guest TSC same frequency as host TSC? */
if (!scale) {
- kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
+ kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio);
return 0;
}
/* TSC scaling supported? */
- if (!kvm_has_tsc_control) {
+ if (!kvm_caps.has_tsc_control) {
if (user_tsc_khz > tsc_khz) {
vcpu->arch.tsc_catchup = 1;
vcpu->arch.tsc_always_catchup = 1;
@@ -2366,10 +2368,10 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
}
/* TSC scaling required - calculate ratio */
- ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
+ ratio = mul_u64_u32_div(1ULL << kvm_caps.tsc_scaling_ratio_frac_bits,
user_tsc_khz, tsc_khz);
- if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
+ if (ratio == 0 || ratio >= kvm_caps.max_tsc_scaling_ratio) {
pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
user_tsc_khz);
return -1;
@@ -2387,7 +2389,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
/* tsc_khz can be zero if TSC calibration fails */
if (user_tsc_khz == 0) {
/* set tsc_scaling_ratio to a safe value */
- kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
+ kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio);
return -1;
}
@@ -2464,18 +2466,18 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
* (frac) represent the fractional part, ie. ratio represents a fixed
* point number (mult + frac * 2^(-N)).
*
- * N equals to kvm_tsc_scaling_ratio_frac_bits.
+ * N equals to kvm_caps.tsc_scaling_ratio_frac_bits.
*/
static inline u64 __scale_tsc(u64 ratio, u64 tsc)
{
- return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
+ return mul_u64_u64_shr(tsc, ratio, kvm_caps.tsc_scaling_ratio_frac_bits);
}
u64 kvm_scale_tsc(u64 tsc, u64 ratio)
{
u64 _tsc = tsc;
- if (ratio != kvm_default_tsc_scaling_ratio)
+ if (ratio != kvm_caps.default_tsc_scaling_ratio)
_tsc = __scale_tsc(ratio, tsc);
return _tsc;
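As a worked example of the 48-bit fixed-point scaling used here (a standalone sketch with made-up frequencies; unsigned __int128 is a GCC/Clang extension standing in for the kernel's mul_u64_u64_shr()):

#include <stdio.h>
#include <stdint.h>

#define FRAC_BITS 48    /* kvm_caps.tsc_scaling_ratio_frac_bits on VMX */

int main(void)
{
        uint64_t host_khz = 3000000, guest_khz = 1500000;

        /* ratio = guest_freq / host_freq as a fixed-point value, 48 fractional bits */
        uint64_t ratio = (uint64_t)(((unsigned __int128)guest_khz << FRAC_BITS) / host_khz);

        /* Scale a host TSC value into guest time: (tsc * ratio) >> 48. */
        uint64_t host_tsc = 6000000000ull;
        uint64_t guest_tsc = (uint64_t)(((unsigned __int128)host_tsc * ratio) >> FRAC_BITS);

        /* Prints ratio=0x800000000000 guest_tsc=3000000000. */
        printf("ratio=%#llx guest_tsc=%llu\n",
               (unsigned long long)ratio, (unsigned long long)guest_tsc);
        return 0;
}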
@@ -2502,11 +2504,11 @@ u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier)
{
u64 nested_offset;
- if (l2_multiplier == kvm_default_tsc_scaling_ratio)
+ if (l2_multiplier == kvm_caps.default_tsc_scaling_ratio)
nested_offset = l1_offset;
else
nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier,
- kvm_tsc_scaling_ratio_frac_bits);
+ kvm_caps.tsc_scaling_ratio_frac_bits);
nested_offset += l2_offset;
return nested_offset;
@@ -2515,9 +2517,9 @@ EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset);
u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier)
{
- if (l2_multiplier != kvm_default_tsc_scaling_ratio)
+ if (l2_multiplier != kvm_caps.default_tsc_scaling_ratio)
return mul_u64_u64_shr(l1_multiplier, l2_multiplier,
- kvm_tsc_scaling_ratio_frac_bits);
+ kvm_caps.tsc_scaling_ratio_frac_bits);
return l1_multiplier;
}
@@ -2559,7 +2561,7 @@ static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multipli
else
vcpu->arch.tsc_scaling_ratio = l1_multiplier;
- if (kvm_has_tsc_control)
+ if (kvm_caps.has_tsc_control)
static_call(kvm_x86_write_tsc_multiplier)(
vcpu, vcpu->arch.tsc_scaling_ratio);
}
@@ -2695,7 +2697,7 @@ static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
{
- if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
+ if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio)
WARN_ON(adjustment < 0);
adjustment = kvm_scale_tsc((u64) adjustment,
vcpu->arch.l1_tsc_scaling_ratio);
@@ -3108,7 +3110,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
/* With all the info we got, fill in the values */
- if (kvm_has_tsc_control)
+ if (kvm_caps.has_tsc_control)
tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz,
v->arch.l1_tsc_scaling_ratio);
@@ -3198,6 +3200,16 @@ static void kvmclock_sync_fn(struct work_struct *work)
KVMCLOCK_SYNC_PERIOD);
}
+/* These helpers are safe iff @msr is known to be an MCx bank MSR. */
+static bool is_mci_control_msr(u32 msr)
+{
+ return (msr & 3) == 0;
+}
+static bool is_mci_status_msr(u32 msr)
+{
+ return (msr & 3) == 1;
+}
+
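The (msr & 3) checks work because each machine-check bank owns four consecutive MSRs starting at MSR_IA32_MC0_CTL (0x400): CTL, STATUS, ADDR, MISC. A standalone sketch of the decode:

#include <stdio.h>
#include <stdint.h>

#define MSR_IA32_MC0_CTL 0x400u         /* bank 0, CTL register */

static const char *mci_reg_name(uint32_t msr)
{
        /* Only valid for MSRs already known to be MCi bank MSRs. */
        static const char * const names[4] = { "CTL", "STATUS", "ADDR", "MISC" };

        return names[msr & 3];
}

int main(void)
{
        uint32_t msr = MSR_IA32_MC0_CTL + 4 * 2 + 1;    /* bank 2, STATUS */

        printf("bank %u, %s\n", (msr - MSR_IA32_MC0_CTL) / 4, mci_reg_name(msr));
        return 0;
}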
/*
* On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
*/
@@ -3216,6 +3228,7 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
unsigned bank_num = mcg_cap & 0xff;
u32 msr = msr_info->index;
u64 data = msr_info->data;
+ u32 offset, last_msr;
switch (msr) {
case MSR_IA32_MCG_STATUS:
@@ -3229,32 +3242,53 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
vcpu->arch.mcg_ctl = data;
break;
- default:
- if (msr >= MSR_IA32_MC0_CTL &&
- msr < MSR_IA32_MCx_CTL(bank_num)) {
- u32 offset = array_index_nospec(
- msr - MSR_IA32_MC0_CTL,
- MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
-
- /* only 0 or all 1s can be written to IA32_MCi_CTL
- * some Linux kernels though clear bit 10 in bank 4 to
- * workaround a BIOS/GART TBL issue on AMD K8s, ignore
- * this to avoid an uncatched #GP in the guest
- */
- if ((offset & 0x3) == 0 &&
- data != 0 && (data | (1 << 10)) != ~(u64)0)
- return -1;
-
- /* MCi_STATUS */
- if (!msr_info->host_initiated &&
- (offset & 0x3) == 1 && data != 0) {
- if (!can_set_mci_status(vcpu))
- return -1;
- }
+ case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
+ last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
+ if (msr > last_msr)
+ return 1;
- vcpu->arch.mce_banks[offset] = data;
- break;
- }
+ if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated))
+ return 1;
+ /* An attempt to write a 1 to a reserved bit raises #GP */
+ if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK))
+ return 1;
+ offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
+ last_msr + 1 - MSR_IA32_MC0_CTL2);
+ vcpu->arch.mci_ctl2_banks[offset] = data;
+ break;
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+ last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
+ if (msr > last_msr)
+ return 1;
+
+ /*
+ * Only 0 or all 1s can be written to IA32_MCi_CTL, all other
+ * values are architecturally undefined. But, some Linux
+ * kernels clear bit 10 in bank 4 to workaround a BIOS/GART TLB
+ * issue on AMD K8s, allow bit 10 to be clear when setting all
+ * other bits in order to avoid an uncaught #GP in the guest.
+ *
+ * UNIXWARE clears bit 0 of MC1_CTL to ignore correctable,
+ * single-bit ECC data errors.
+ */
+ if (is_mci_control_msr(msr) &&
+ data != 0 && (data | (1 << 10) | 1) != ~(u64)0)
+ return 1;
+
+ /*
+ * All CPUs allow writing 0 to MCi_STATUS MSRs to clear the MSR.
+ * AMD-based CPUs allow non-zero values, but if and only if
+ * HWCR[McStatusWrEn] is set.
+ */
+ if (!msr_info->host_initiated && is_mci_status_msr(msr) &&
+ data != 0 && !can_set_mci_status(vcpu))
+ return 1;
+
+ offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
+ last_msr + 1 - MSR_IA32_MC0_CTL);
+ vcpu->arch.mce_banks[offset] = data;
+ break;
+ default:
return 1;
}
return 0;
@@ -3538,7 +3572,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
}
break;
- case 0x200 ... 0x2ff:
+ case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
+ case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
return kvm_mtrr_set_msr(vcpu, msr, data);
case MSR_IA32_APICBASE:
return kvm_set_apic_base(vcpu, msr_info);
@@ -3560,9 +3595,21 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.ia32_tsc_adjust_msr = data;
}
break;
- case MSR_IA32_MISC_ENABLE:
+ case MSR_IA32_MISC_ENABLE: {
+ u64 old_val = vcpu->arch.ia32_misc_enable_msr;
+
+ if (!msr_info->host_initiated) {
+ /* RO bits */
+ if ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PMU_RO_MASK)
+ return 1;
+
+ /* R bits, i.e. writes are ignored, but don't fault. */
+ data = data & ~MSR_IA32_MISC_ENABLE_EMON;
+ data |= old_val & MSR_IA32_MISC_ENABLE_EMON;
+ }
+
if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
- ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
+ ((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
return 1;
vcpu->arch.ia32_misc_enable_msr = data;
@@ -3571,6 +3618,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.ia32_misc_enable_msr = data;
}
break;
+ }
case MSR_IA32_SMBASE:
if (!msr_info->host_initiated)
return 1;
@@ -3597,7 +3645,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
* IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
* XSAVES/XRSTORS to save/restore PT MSRs.
*/
- if (data & ~supported_xss)
+ if (data & ~kvm_caps.supported_xss)
return 1;
vcpu->arch.ia32_xss = data;
kvm_update_cpuid_runtime(vcpu);
@@ -3695,6 +3743,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+ case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
return set_msr_mce(vcpu, msr_info);
case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
@@ -3785,6 +3834,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.guest_fpu.xfd_err = data;
break;
#endif
+ case MSR_IA32_PEBS_ENABLE:
+ case MSR_IA32_DS_AREA:
+ case MSR_PEBS_DATA_CFG:
+ case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
+ if (kvm_pmu_is_valid_msr(vcpu, msr))
+ return kvm_pmu_set_msr(vcpu, msr_info);
+ /*
+ * Userspace is allowed to write '0' to MSRs that KVM reports
+ * as to-be-saved, even if an MSR isn't fully supported.
+ */
+ return !msr_info->host_initiated || data;
default:
if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
@@ -3799,6 +3859,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
u64 data;
u64 mcg_cap = vcpu->arch.mcg_cap;
unsigned bank_num = mcg_cap & 0xff;
+ u32 offset, last_msr;
switch (msr) {
case MSR_IA32_P5_MC_ADDR:
@@ -3816,16 +3877,27 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
case MSR_IA32_MCG_STATUS:
data = vcpu->arch.mcg_status;
break;
- default:
- if (msr >= MSR_IA32_MC0_CTL &&
- msr < MSR_IA32_MCx_CTL(bank_num)) {
- u32 offset = array_index_nospec(
- msr - MSR_IA32_MC0_CTL,
- MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
+ case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
+ last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
+ if (msr > last_msr)
+ return 1;
- data = vcpu->arch.mce_banks[offset];
- break;
- }
+ if (!(mcg_cap & MCG_CMCI_P) && !host)
+ return 1;
+ offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
+ last_msr + 1 - MSR_IA32_MC0_CTL2);
+ data = vcpu->arch.mci_ctl2_banks[offset];
+ break;
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+ last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
+ if (msr > last_msr)
+ return 1;
+
+ offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
+ last_msr + 1 - MSR_IA32_MC0_CTL);
+ data = vcpu->arch.mce_banks[offset];
+ break;
+ default:
return 1;
}
*pdata = data;
@@ -3865,9 +3937,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */
msr_info->data = 0;
break;
+ case MSR_IA32_PEBS_ENABLE:
+ case MSR_IA32_DS_AREA:
+ case MSR_PEBS_DATA_CFG:
case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
return kvm_pmu_get_msr(vcpu, msr_info);
+ /*
+ * Userspace is allowed to read MSRs that KVM reports as
+ * to-be-saved, even if an MSR isn't fully supported.
+ */
if (!msr_info->host_initiated)
return 1;
msr_info->data = 0;
@@ -3922,7 +4001,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
}
case MSR_MTRRcap:
- case 0x200 ... 0x2ff:
+ case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
+ case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
case 0xcd: /* fsb frequency */
msr_info->data = 3;
@@ -4038,6 +4118,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+ case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
msr_info->host_initiated);
case MSR_IA32_XSS:
@@ -4280,6 +4361,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_GET_MSR_FEATURES:
case KVM_CAP_MSR_PLATFORM_INFO:
case KVM_CAP_EXCEPTION_PAYLOAD:
+ case KVM_CAP_X86_TRIPLE_FAULT_EVENT:
case KVM_CAP_SET_GUEST_DEBUG:
case KVM_CAP_LAST_CPU:
case KVM_CAP_X86_USER_SPACE_MSR:
@@ -4296,6 +4378,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_SYS_ATTRIBUTES:
case KVM_CAP_VAPIC:
case KVM_CAP_ENABLE_CAP:
+ case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
r = 1;
break;
case KVM_CAP_EXIT_HYPERCALL:
@@ -4357,7 +4440,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
break;
case KVM_CAP_TSC_CONTROL:
case KVM_CAP_VM_TSC_CONTROL:
- r = kvm_has_tsc_control;
+ r = kvm_caps.has_tsc_control;
break;
case KVM_CAP_X2APIC_API:
r = KVM_X2APIC_API_VALID_FLAGS;
@@ -4379,7 +4462,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = sched_info_on();
break;
case KVM_CAP_X86_BUS_LOCK_EXIT:
- if (kvm_has_bus_lock_exit)
+ if (kvm_caps.has_bus_lock_exit)
r = KVM_BUS_LOCK_DETECTION_OFF |
KVM_BUS_LOCK_DETECTION_EXIT;
else
@@ -4388,17 +4471,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_XSAVE2: {
u64 guest_perm = xstate_get_guest_group_perm();
- r = xstate_required_size(supported_xcr0 & guest_perm, false);
+ r = xstate_required_size(kvm_caps.supported_xcr0 & guest_perm, false);
if (r < sizeof(struct kvm_xsave))
r = sizeof(struct kvm_xsave);
break;
+ }
case KVM_CAP_PMU_CAPABILITY:
r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0;
break;
- }
case KVM_CAP_DISABLE_QUIRKS2:
r = KVM_X86_VALID_QUIRKS;
break;
+ case KVM_CAP_X86_NOTIFY_VMEXIT:
+ r = kvm_caps.has_notify_vmexit;
+ break;
default:
break;
}
@@ -4426,7 +4512,7 @@ static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
switch (attr->attr) {
case KVM_X86_XCOMP_GUEST_SUPP:
- if (put_user(supported_xcr0, uaddr))
+ if (put_user(kvm_caps.supported_xcr0, uaddr))
return -EFAULT;
return 0;
default:
@@ -4503,8 +4589,8 @@ long kvm_arch_dev_ioctl(struct file *filp,
}
case KVM_X86_GET_MCE_CAP_SUPPORTED:
r = -EFAULT;
- if (copy_to_user(argp, &kvm_mce_cap_supported,
- sizeof(kvm_mce_cap_supported)))
+ if (copy_to_user(argp, &kvm_caps.supported_mce_cap,
+ sizeof(kvm_caps.supported_mce_cap)))
goto out;
r = 0;
break;
@@ -4803,22 +4889,63 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
r = -EINVAL;
if (!bank_num || bank_num > KVM_MAX_MCE_BANKS)
goto out;
- if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
+ if (mcg_cap & ~(kvm_caps.supported_mce_cap | 0xff | 0xff0000))
goto out;
r = 0;
vcpu->arch.mcg_cap = mcg_cap;
/* Init IA32_MCG_CTL to all 1s */
if (mcg_cap & MCG_CTL_P)
vcpu->arch.mcg_ctl = ~(u64)0;
- /* Init IA32_MCi_CTL to all 1s */
- for (bank = 0; bank < bank_num; bank++)
+ /* Init IA32_MCi_CTL to all 1s, IA32_MCi_CTL2 to all 0s */
+ for (bank = 0; bank < bank_num; bank++) {
vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+ if (mcg_cap & MCG_CMCI_P)
+ vcpu->arch.mci_ctl2_banks[bank] = 0;
+ }
+
+ kvm_apic_after_set_mcg_cap(vcpu);
static_call(kvm_x86_setup_mce)(vcpu);
out:
return r;
}
+/*
+ * Validate this is a UCNA (uncorrectable no action) error by checking the
+ * MCG_STATUS and MCi_STATUS registers:
+ * - none of the bits for Machine Check Exceptions are set
+ * - both the VAL (valid) and UC (uncorrectable) bits are set
+ * MCI_STATUS_PCC - Processor Context Corrupted
+ * MCI_STATUS_S - Signaled as a Machine Check Exception
+ * MCI_STATUS_AR - Software recoverable Action Required
+ */
+static bool is_ucna(struct kvm_x86_mce *mce)
+{
+ return !mce->mcg_status &&
+ !(mce->status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR)) &&
+ (mce->status & MCI_STATUS_VAL) &&
+ (mce->status & MCI_STATUS_UC);
+}
+
+static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce, u64* banks)
+{
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+
+ banks[1] = mce->status;
+ banks[2] = mce->addr;
+ banks[3] = mce->misc;
+ vcpu->arch.mcg_status = mce->mcg_status;
+
+ if (!(mcg_cap & MCG_CMCI_P) ||
+ !(vcpu->arch.mci_ctl2_banks[mce->bank] & MCI_CTL2_CMCI_EN))
+ return 0;
+
+ if (lapic_in_kernel(vcpu))
+ kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI);
+
+ return 0;
+}
+
static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
struct kvm_x86_mce *mce)
{
@@ -4828,6 +4955,12 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
return -EINVAL;
+
+ banks += array_index_nospec(4 * mce->bank, 4 * bank_num);
+
+ if (is_ucna(mce))
+ return kvm_vcpu_x86_set_ucna(vcpu, mce, banks);
+
/*
* if IA32_MCG_CTL is not all 1s, the uncorrected error
* reporting is disabled
@@ -4835,7 +4968,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
vcpu->arch.mcg_ctl != ~(u64)0)
return 0;
- banks += 4 * mce->bank;
/*
* if IA32_MCi_CTL is not all 1s, the uncorrected error
* reporting is disabled for the bank
@@ -4941,6 +5073,10 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
| KVM_VCPUEVENT_VALID_SMM);
if (vcpu->kvm->arch.exception_payload_enabled)
events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
+ if (vcpu->kvm->arch.triple_fault_event) {
+ events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
+ }
memset(&events->reserved, 0, sizeof(events->reserved));
}
@@ -4954,7 +5090,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
| KVM_VCPUEVENT_VALID_SIPI_VECTOR
| KVM_VCPUEVENT_VALID_SHADOW
| KVM_VCPUEVENT_VALID_SMM
- | KVM_VCPUEVENT_VALID_PAYLOAD))
+ | KVM_VCPUEVENT_VALID_PAYLOAD
+ | KVM_VCPUEVENT_VALID_TRIPLE_FAULT))
return -EINVAL;
if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
@@ -5027,6 +5164,15 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
}
}
+ if (events->flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) {
+ if (!vcpu->kvm->arch.triple_fault_event)
+ return -EINVAL;
+ if (events->triple_fault.pending)
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ else
+ kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ }
+
kvm_make_request(KVM_REQ_EVENT, vcpu);
return 0;
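From userspace, a pending triple fault can now be saved and restored through KVM_GET/SET_VCPU_EVENTS once the capability is enabled on the VM. A rough sketch, error handling aside, assuming headers with the KVM_CAP_X86_TRIPLE_FAULT_EVENT and KVM_VCPUEVENT_VALID_TRIPLE_FAULT definitions from this series and already-open vm_fd/vcpu_fd descriptors:

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_pending_triple_fault(int vm_fd, int vcpu_fd)
{
        struct kvm_enable_cap cap = {
                .cap = KVM_CAP_X86_TRIPLE_FAULT_EVENT,
                .args = { 1 },
        };
        struct kvm_vcpu_events events;

        if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
                return -1;

        /* Round-trip the current events, adding a pending triple fault. */
        if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &events))
                return -1;

        events.flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
        events.triple_fault.pending = 1;

        return ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &events);
}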
@@ -5095,7 +5241,8 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
guest_xsave->region,
- supported_xcr0, &vcpu->arch.pkru);
+ kvm_caps.supported_xcr0,
+ &vcpu->arch.pkru);
}
static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
@@ -5600,8 +5747,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EINVAL;
user_tsc_khz = (u32)arg;
- if (kvm_has_tsc_control &&
- user_tsc_khz >= kvm_max_guest_tsc_khz)
+ if (kvm_caps.has_tsc_control &&
+ user_tsc_khz >= kvm_caps.max_guest_tsc_khz)
goto out;
if (user_tsc_khz == 0)
@@ -6028,6 +6175,10 @@ split_irqchip_unlock:
kvm->arch.exception_payload_enabled = cap->args[0];
r = 0;
break;
+ case KVM_CAP_X86_TRIPLE_FAULT_EVENT:
+ kvm->arch.triple_fault_event = cap->args[0];
+ r = 0;
+ break;
case KVM_CAP_X86_USER_SPACE_MSR:
r = -EINVAL;
if (cap->args[0] & ~(KVM_MSR_EXIT_REASON_INVAL |
@@ -6046,7 +6197,7 @@ split_irqchip_unlock:
(cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT))
break;
- if (kvm_has_bus_lock_exit &&
+ if (kvm_caps.has_bus_lock_exit &&
cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)
kvm->arch.bus_lock_detection_enabled = true;
r = 0;
@@ -6109,6 +6260,65 @@ split_irqchip_unlock:
}
mutex_unlock(&kvm->lock);
break;
+ case KVM_CAP_MAX_VCPU_ID:
+ r = -EINVAL;
+ if (cap->args[0] > KVM_MAX_VCPU_IDS)
+ break;
+
+ mutex_lock(&kvm->lock);
+ if (kvm->arch.max_vcpu_ids == cap->args[0]) {
+ r = 0;
+ } else if (!kvm->arch.max_vcpu_ids) {
+ kvm->arch.max_vcpu_ids = cap->args[0];
+ r = 0;
+ }
+ mutex_unlock(&kvm->lock);
+ break;
+ case KVM_CAP_X86_NOTIFY_VMEXIT:
+ r = -EINVAL;
+ if ((u32)cap->args[0] & ~KVM_X86_NOTIFY_VMEXIT_VALID_BITS)
+ break;
+ if (!kvm_caps.has_notify_vmexit)
+ break;
+ if (!((u32)cap->args[0] & KVM_X86_NOTIFY_VMEXIT_ENABLED))
+ break;
+ mutex_lock(&kvm->lock);
+ if (!kvm->created_vcpus) {
+ kvm->arch.notify_window = cap->args[0] >> 32;
+ kvm->arch.notify_vmexit_flags = (u32)cap->args[0];
+ r = 0;
+ }
+ mutex_unlock(&kvm->lock);
+ break;
+ case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
+ r = -EINVAL;
+
+ /*
+ * Since the risk of disabling NX hugepages is a guest crashing
+ * the system, ensure the userspace process has permission to
+ * reboot the system.
+ *
+ * Note that unlike the reboot() syscall, the process must have
+ * this capability in the root namespace because exposing
+ * /dev/kvm into a container does not limit the scope of the
+ * iTLB multihit bug to that container. In other words,
+ * this must use capable(), not ns_capable().
+ */
+ if (!capable(CAP_SYS_BOOT)) {
+ r = -EPERM;
+ break;
+ }
+
+ if (cap->args[0])
+ break;
+
+ mutex_lock(&kvm->lock);
+ if (!kvm->created_vcpus) {
+ kvm->arch.disable_nx_huge_pages = true;
+ r = 0;
+ }
+ mutex_unlock(&kvm->lock);
+ break;
default:
r = -EINVAL;
break;
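The notify-VM-exit capability packs the notify window into the upper 32 bits of args[0] and the flag bits into the lower 32, and must be enabled before any vCPU is created. A minimal userspace sketch, again assuming headers with the new KVM_CAP_X86_NOTIFY_VMEXIT definitions and an existing vm_fd:

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int enable_notify_vmexit(int vm_fd, unsigned int window)
{
        struct kvm_enable_cap cap = {
                .cap = KVM_CAP_X86_NOTIFY_VMEXIT,
                /* window in bits 63:32, flags in bits 31:0 */
                .args = { ((__u64)window << 32) |
                          KVM_X86_NOTIFY_VMEXIT_ENABLED |
                          KVM_X86_NOTIFY_VMEXIT_USER },
        };

        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}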
@@ -6584,8 +6794,8 @@ set_pit2_out:
r = -EINVAL;
user_tsc_khz = (u32)arg;
- if (kvm_has_tsc_control &&
- user_tsc_khz >= kvm_max_guest_tsc_khz)
+ if (kvm_caps.has_tsc_control &&
+ user_tsc_khz >= kvm_caps.max_guest_tsc_khz)
goto out;
if (user_tsc_khz == 0)
@@ -6660,15 +6870,12 @@ out:
static void kvm_init_msr_list(void)
{
- struct x86_pmu_capability x86_pmu;
u32 dummy[2];
unsigned i;
BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3,
"Please update the fixed PMCs in msrs_to_saved_all[]");
- perf_get_x86_pmu_capability(&x86_pmu);
-
num_msrs_to_save = 0;
num_emulated_msrs = 0;
num_msr_based_features = 0;
@@ -6720,12 +6927,12 @@ static void kvm_init_msr_list(void)
break;
case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
- min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
+ min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
continue;
break;
case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
- min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
+ min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
continue;
break;
case MSR_IA32_XFD:
@@ -6882,7 +7089,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
- if (gpa == UNMAPPED_GVA)
+ if (gpa == INVALID_GPA)
return X86EMUL_PROPAGATE_FAULT;
ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data,
offset, toread);
@@ -6913,7 +7120,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
/* Inline kvm_read_guest_virt_helper for speed. */
gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK,
exception);
- if (unlikely(gpa == UNMAPPED_GVA))
+ if (unlikely(gpa == INVALID_GPA))
return X86EMUL_PROPAGATE_FAULT;
offset = addr & (PAGE_SIZE-1);
@@ -6983,7 +7190,7 @@ static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes
unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
- if (gpa == UNMAPPED_GVA)
+ if (gpa == INVALID_GPA)
return X86EMUL_PROPAGATE_FAULT;
ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite);
if (ret < 0) {
@@ -7094,7 +7301,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
*gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
- if (*gpa == UNMAPPED_GVA)
+ if (*gpa == INVALID_GPA)
return -1;
return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write);
@@ -7331,7 +7538,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
- if (gpa == UNMAPPED_GVA ||
+ if (gpa == INVALID_GPA ||
(gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
goto emul_write;
@@ -7385,36 +7592,47 @@ emul_write:
return emulator_write_emulated(ctxt, addr, new, bytes, exception);
}
-static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
+static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
+ unsigned short port, void *data,
+ unsigned int count, bool in)
{
- int r = 0, i;
+ unsigned i;
+ int r;
- for (i = 0; i < vcpu->arch.pio.count; i++) {
- if (vcpu->arch.pio.in)
- r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
- vcpu->arch.pio.size, pd);
+ WARN_ON_ONCE(vcpu->arch.pio.count);
+ for (i = 0; i < count; i++) {
+ if (in)
+ r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, port, size, data);
else
- r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
- vcpu->arch.pio.port, vcpu->arch.pio.size,
- pd);
- if (r)
+ r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, port, size, data);
+
+ if (r) {
+ if (i == 0)
+ goto userspace_io;
+
+ /*
+ * Userspace must have unregistered the device while PIO
+ * was running. Drop writes / read as 0.
+ */
+ if (in)
+ memset(data, 0, size * (count - i));
break;
- pd += vcpu->arch.pio.size;
+ }
+
+ data += size;
}
- return r;
-}
+ return 1;
-static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
- unsigned short port,
- unsigned int count, bool in)
-{
+userspace_io:
vcpu->arch.pio.port = port;
vcpu->arch.pio.in = in;
- vcpu->arch.pio.count = count;
+ vcpu->arch.pio.count = count;
vcpu->arch.pio.size = size;
- if (!kernel_pio(vcpu, vcpu->arch.pio_data))
- return 1;
+ if (in)
+ memset(vcpu->arch.pio_data, 0, size * count);
+ else
+ memcpy(vcpu->arch.pio_data, data, size * count);
vcpu->run->exit_reason = KVM_EXIT_IO;
vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
@@ -7422,30 +7640,33 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
vcpu->run->io.count = count;
vcpu->run->io.port = port;
-
return 0;
}
-static int __emulator_pio_in(struct kvm_vcpu *vcpu, int size,
- unsigned short port, unsigned int count)
+static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
+ unsigned short port, void *val, unsigned int count)
{
- WARN_ON(vcpu->arch.pio.count);
- memset(vcpu->arch.pio_data, 0, size * count);
- return emulator_pio_in_out(vcpu, size, port, count, true);
+ int r = emulator_pio_in_out(vcpu, size, port, val, count, true);
+ if (r)
+ trace_kvm_pio(KVM_PIO_IN, port, size, count, val);
+
+ return r;
}
static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val)
{
int size = vcpu->arch.pio.size;
- unsigned count = vcpu->arch.pio.count;
+ unsigned int count = vcpu->arch.pio.count;
memcpy(val, vcpu->arch.pio_data, size * count);
trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count, vcpu->arch.pio_data);
vcpu->arch.pio.count = 0;
}
-static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
- unsigned short port, void *val, unsigned int count)
+static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port, void *val,
+ unsigned int count)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
if (vcpu->arch.pio.count) {
/*
* Complete a previous iteration that required userspace I/O.
@@ -7454,39 +7675,19 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
* shenanigans as KVM doesn't support modifying the rep count,
* and the emulator ensures @count doesn't overflow the buffer.
*/
- } else {
- int r = __emulator_pio_in(vcpu, size, port, count);
- if (!r)
- return r;
-
- /* Results already available, fall through. */
+ complete_emulator_pio_in(vcpu, val);
+ return 1;
}
- complete_emulator_pio_in(vcpu, val);
- return 1;
-}
-
-static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
- int size, unsigned short port, void *val,
- unsigned int count)
-{
- return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count);
-
+ return emulator_pio_in(vcpu, size, port, val, count);
}
static int emulator_pio_out(struct kvm_vcpu *vcpu, int size,
unsigned short port, const void *val,
unsigned int count)
{
- int ret;
-
- memcpy(vcpu->arch.pio_data, val, size * count);
- trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
- ret = emulator_pio_in_out(vcpu, size, port, count, false);
- if (ret)
- vcpu->arch.pio.count = 0;
-
- return ret;
+ trace_kvm_pio(KVM_PIO_OUT, port, size, count, val);
+ return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
}
static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
@@ -7868,7 +8069,16 @@ static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr);
}
+static void emulator_vm_bugged(struct x86_emulate_ctxt *ctxt)
+{
+ struct kvm *kvm = emul_to_vcpu(ctxt)->kvm;
+
+ if (!kvm->vm_bugged)
+ kvm_vm_bugged(kvm);
+}
+
static const struct x86_emulate_ops emulate_ops = {
+ .vm_bugged = emulator_vm_bugged,
.read_gpr = emulator_read_gpr,
.write_gpr = emulator_write_gpr,
.read_std = emulator_read_std,
@@ -8145,7 +8355,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
* If the mapping is invalid in guest, let cpu retry
* it to generate fault.
*/
- if (gpa == UNMAPPED_GVA)
+ if (gpa == INVALID_GPA)
return true;
}
@@ -8672,11 +8882,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
/* For size less than 4 we merge, else we zero extend */
val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
- /*
- * Since vcpu->arch.pio.count == 1 let emulator_pio_in perform
- * the copy and tracing
- */
- emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1);
+ complete_emulator_pio_in(vcpu, &val);
kvm_rax_write(vcpu, val);
return kvm_skip_emulated_instruction(vcpu);
@@ -8751,7 +8957,7 @@ static void kvm_hyperv_tsc_notifier(void)
/* TSC frequency always matches when on Hyper-V */
for_each_present_cpu(cpu)
per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
- kvm_max_guest_tsc_khz = tsc_khz;
+ kvm_caps.max_guest_tsc_khz = tsc_khz;
list_for_each_entry(kvm, &vm_list, vm_list) {
__kvm_start_pvclock_update(kvm);
@@ -8952,25 +9158,23 @@ static struct notifier_block pvclock_gtod_notifier = {
int kvm_arch_init(void *opaque)
{
struct kvm_x86_init_ops *ops = opaque;
+ u64 host_pat;
int r;
if (kvm_x86_ops.hardware_enable) {
pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name);
- r = -EEXIST;
- goto out;
+ return -EEXIST;
}
if (!ops->cpu_has_kvm_support()) {
pr_err_ratelimited("kvm: no hardware support for '%s'\n",
ops->runtime_ops->name);
- r = -EOPNOTSUPP;
- goto out;
+ return -EOPNOTSUPP;
}
if (ops->disabled_by_bios()) {
pr_err_ratelimited("kvm: support for '%s' disabled by bios\n",
ops->runtime_ops->name);
- r = -EOPNOTSUPP;
- goto out;
+ return -EOPNOTSUPP;
}
/*
@@ -8980,27 +9184,37 @@ int kvm_arch_init(void *opaque)
*/
if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
printk(KERN_ERR "kvm: inadequate fpu\n");
- r = -EOPNOTSUPP;
- goto out;
+ return -EOPNOTSUPP;
}
if (IS_ENABLED(CONFIG_PREEMPT_RT) && !boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n");
- r = -EOPNOTSUPP;
- goto out;
+ return -EOPNOTSUPP;
}
- r = -ENOMEM;
+ /*
+ * KVM assumes that PAT entry '0' encodes WB memtype and simply zeroes
+ * the PAT bits in SPTEs. Bail if PAT[0] is programmed to something
+ * other than WB. Note, EPT doesn't utilize the PAT, but don't bother
+ * with an exception. PAT[0] is set to WB on RESET and also by the
+ * kernel, i.e. failure indicates a kernel bug or broken firmware.
+ */
+ if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) ||
+ (host_pat & GENMASK(2, 0)) != 6) {
+ pr_err("kvm: host PAT[0] is not WB\n");
+ return -EIO;
+ }
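
The new PAT sanity check relies on PAT entry 0 occupying the low byte of IA32_PAT with the memory type in bits 2:0, where 6 encodes write-back; hence the GENMASK(2, 0) mask and the comparison against 6. A standalone restatement of the predicate (illustrative only; pat0_is_wb() is not a kernel symbol):

	#include <stdbool.h>
	#include <stdint.h>

	/* True if PAT entry 0 of the given IA32_PAT value encodes WB (6). */
	static bool pat0_is_wb(uint64_t host_pat)
	{
		return (host_pat & 0x7) == 6;
	}
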
x86_emulator_cache = kvm_alloc_emulator_cache();
if (!x86_emulator_cache) {
pr_err("kvm: failed to allocate cache for x86 emulator\n");
- goto out;
+ return -ENOMEM;
}
user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
if (!user_return_msrs) {
printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
+ r = -ENOMEM;
goto out_free_x86_emulator_cache;
}
kvm_nr_uret_msrs = 0;
@@ -9013,7 +9227,7 @@ int kvm_arch_init(void *opaque)
if (boot_cpu_has(X86_FEATURE_XSAVE)) {
host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
- supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
+ kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
}
if (pi_inject_timer == -1)
@@ -9031,7 +9245,6 @@ out_free_percpu:
free_percpu(user_return_msrs);
out_free_x86_emulator_cache:
kmem_cache_destroy(x86_emulator_cache);
-out:
return r;
}
@@ -9406,7 +9619,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
if (!lapic_in_kernel(vcpu))
return;
- if (vcpu->arch.apicv_active)
+ if (vcpu->arch.apic->apicv_active)
return;
if (!vcpu->arch.apic->vapic_addr)
@@ -9435,6 +9648,11 @@ int kvm_check_nested_events(struct kvm_vcpu *vcpu)
static void kvm_inject_exception(struct kvm_vcpu *vcpu)
{
+ trace_kvm_inj_exception(vcpu->arch.exception.nr,
+ vcpu->arch.exception.has_error_code,
+ vcpu->arch.exception.error_code,
+ vcpu->arch.exception.injected);
+
if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
vcpu->arch.exception.error_code = false;
static_call(kvm_x86_queue_exception)(vcpu);
@@ -9470,7 +9688,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
static_call(kvm_x86_inject_nmi)(vcpu);
can_inject = false;
} else if (vcpu->arch.interrupt.injected) {
- static_call(kvm_x86_inject_irq)(vcpu);
+ static_call(kvm_x86_inject_irq)(vcpu, true);
can_inject = false;
}
}
@@ -9492,13 +9710,6 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
/* try to inject new event if pending */
if (vcpu->arch.exception.pending) {
- trace_kvm_inj_exception(vcpu->arch.exception.nr,
- vcpu->arch.exception.has_error_code,
- vcpu->arch.exception.error_code);
-
- vcpu->arch.exception.pending = false;
- vcpu->arch.exception.injected = true;
-
if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
__kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
X86_EFLAGS_RF);
@@ -9512,6 +9723,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
}
kvm_inject_exception(vcpu);
+
+ vcpu->arch.exception.pending = false;
+ vcpu->arch.exception.injected = true;
+
can_inject = false;
}
@@ -9564,7 +9779,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
goto out;
if (r) {
kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
- static_call(kvm_x86_inject_irq)(vcpu);
+ static_call(kvm_x86_inject_irq)(vcpu, false);
WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
}
if (kvm_cpu_has_injectable_intr(vcpu))
@@ -9857,6 +10072,7 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm)
void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
{
+ struct kvm_lapic *apic = vcpu->arch.apic;
bool activate;
if (!lapic_in_kernel(vcpu))
@@ -9865,12 +10081,14 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
down_read(&vcpu->kvm->arch.apicv_update_lock);
preempt_disable();
- activate = kvm_vcpu_apicv_activated(vcpu);
+ /* Do not activate APICV when APIC is disabled */
+ activate = kvm_vcpu_apicv_activated(vcpu) &&
+ (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED);
- if (vcpu->arch.apicv_active == activate)
+ if (apic->apicv_active == activate)
goto out;
- vcpu->arch.apicv_active = activate;
+ apic->apicv_active = activate;
kvm_apic_update_apicv(vcpu);
static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu);
@@ -9880,7 +10098,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
* still active when the interrupt got accepted. Make sure
* inject_pending_event() is called to check for that.
*/
- if (!vcpu->arch.apicv_active)
+ if (!apic->apicv_active)
kvm_make_request(KVM_REQ_EVENT, vcpu);
out:
@@ -10275,7 +10493,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
* per-VM state, and responding vCPUs must wait for the update
* to complete before servicing KVM_REQ_APICV_UPDATE.
*/
- WARN_ON_ONCE(kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu));
+ WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
+ (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
@@ -10654,8 +10873,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
r = cui(vcpu);
if (r <= 0)
goto out;
- } else
- WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
+ } else {
+ WARN_ON_ONCE(vcpu->arch.pio.count);
+ WARN_ON_ONCE(vcpu->mmio_needed);
+ }
if (kvm_run->immediate_exit) {
r = -EINTR;
@@ -11183,7 +11404,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
tr->physical_address = gpa;
- tr->valid = gpa != UNMAPPED_GVA;
+ tr->valid = gpa != INVALID_GPA;
tr->writeable = 1;
tr->usermode = 0;
@@ -11276,11 +11497,17 @@ static int sync_regs(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
{
- if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
+ if (kvm_check_tsc_unstable() && kvm->created_vcpus)
pr_warn_once("kvm: SMP vm created on host with unstable TSC; "
"guest TSC will not be reliable\n");
- return 0;
+ if (!kvm->arch.max_vcpu_ids)
+ kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_IDS;
+
+ if (id >= kvm->arch.max_vcpu_ids)
+ return -EINVAL;
+
+ return static_call(kvm_x86_vcpu_precreate)(kvm);
}
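
kvm_arch_vcpu_precreate() now bounds vCPU IDs by a per-VM limit that defaults to KVM_MAX_VCPU_IDS and defers the rest to the vendor hook. A userspace sketch of lowering that limit before creating vCPUs, assuming the per-VM KVM_CAP_MAX_VCPU_ID enable-cap from this cycle (limit_vcpu_ids() is illustrative):

	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Shrink the vCPU ID space so KVM can size APIC-related tables tightly.
	 * Must be called before the first KVM_CREATE_VCPU. */
	static int limit_vcpu_ids(int vm_fd, unsigned long long max_ids)
	{
		struct kvm_enable_cap cap = {
			.cap = KVM_CAP_MAX_VCPU_ID,
			.args = { max_ids },
		};

		return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
	}
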
int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
@@ -11317,7 +11544,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
* will ensure the vCPU gets the correct state before VM-Entry.
*/
if (enable_apicv) {
- vcpu->arch.apicv_active = true;
+ vcpu->arch.apic->apicv_active = true;
kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
}
} else
@@ -11330,9 +11557,11 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
goto fail_free_lapic;
vcpu->arch.pio_data = page_address(page);
- vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
+ vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64),
GFP_KERNEL_ACCOUNT);
- if (!vcpu->arch.mce_banks)
+ vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64),
+ GFP_KERNEL_ACCOUNT);
+ if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks)
goto fail_free_pio_data;
vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
@@ -11386,6 +11615,7 @@ free_wbinvd_dirty_mask:
free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
fail_free_mce_banks:
kfree(vcpu->arch.mce_banks);
+ kfree(vcpu->arch.mci_ctl2_banks);
fail_free_pio_data:
free_page((unsigned long)vcpu->arch.pio_data);
fail_free_lapic:
@@ -11431,6 +11661,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
kvm_hv_vcpu_uninit(vcpu);
kvm_pmu_destroy(vcpu);
kfree(vcpu->arch.mce_banks);
+ kfree(vcpu->arch.mci_ctl2_banks);
kvm_free_lapic(vcpu);
idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_mmu_destroy(vcpu);
@@ -11510,6 +11741,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.smbase = 0x30000;
vcpu->arch.msr_misc_features_enables = 0;
+ vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
+ MSR_IA32_MISC_ENABLE_BTS_UNAVAIL;
__kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP);
__kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true);
@@ -11526,7 +11759,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
* i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry
* on RESET. But, go through the motions in case that's ever remedied.
*/
- cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1, 0);
+ cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1);
kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600);
static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
@@ -11717,6 +11950,8 @@ int kvm_arch_hardware_setup(void *opaque)
if (boot_cpu_has(X86_FEATURE_XSAVES))
rdmsrl(MSR_IA32_XSS, host_xss);
+ kvm_init_pmu_capability();
+
r = ops->hardware_setup();
if (r != 0)
return r;
@@ -11726,13 +11961,13 @@ int kvm_arch_hardware_setup(void *opaque)
kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
- supported_xss = 0;
+ kvm_caps.supported_xss = 0;
#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
#undef __kvm_cpu_cap_has
- if (kvm_has_tsc_control) {
+ if (kvm_caps.has_tsc_control) {
/*
* Make sure the user can only configure tsc_khz values that
* fit into a signed integer.
@@ -11740,10 +11975,10 @@ int kvm_arch_hardware_setup(void *opaque)
* be 1 on all machines.
*/
u64 max = min(0x7fffffffULL,
- __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz));
- kvm_max_guest_tsc_khz = max;
+ __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz));
+ kvm_caps.max_guest_tsc_khz = max;
}
- kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
+ kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
kvm_init_msr_list();
return 0;
}
@@ -12331,7 +12566,8 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
{
- if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
+ if (kvm_vcpu_apicv_active(vcpu) &&
+ static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
return true;
return false;
@@ -12773,7 +13009,7 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c
(PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);
if (!(error_code & PFERR_PRESENT_MASK) ||
- mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != UNMAPPED_GVA) {
+ mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != INVALID_GPA) {
/*
* If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
* tables probably do not match the TLB. Just proceed
@@ -12998,6 +13234,12 @@ int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
}
EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read);
+static void advance_sev_es_emulated_pio(struct kvm_vcpu *vcpu, unsigned count, int size)
+{
+ vcpu->arch.sev_pio_count -= count;
+ vcpu->arch.sev_pio_data += count * size;
+}
+
static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
unsigned int port);
@@ -13021,8 +13263,7 @@ static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count);
/* memcpy done already by emulator_pio_out. */
- vcpu->arch.sev_pio_count -= count;
- vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size;
+ advance_sev_es_emulated_pio(vcpu, count, size);
if (!ret)
break;
@@ -13038,20 +13279,14 @@ static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
unsigned int port);
-static void advance_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
-{
- unsigned count = vcpu->arch.pio.count;
- complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data);
- vcpu->arch.sev_pio_count -= count;
- vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size;
-}
-
static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
{
+ unsigned count = vcpu->arch.pio.count;
int size = vcpu->arch.pio.size;
int port = vcpu->arch.pio.port;
- advance_sev_es_emulated_ins(vcpu);
+ complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data);
+ advance_sev_es_emulated_pio(vcpu, count, size);
if (vcpu->arch.sev_pio_count)
return kvm_sev_es_ins(vcpu, size, port);
return 1;
@@ -13063,11 +13298,11 @@ static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
for (;;) {
unsigned int count =
min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
- if (!__emulator_pio_in(vcpu, size, port, count))
+ if (!emulator_pio_in(vcpu, size, port, vcpu->arch.sev_pio_data, count))
break;
/* Emulation done by the kernel. */
- advance_sev_es_emulated_ins(vcpu);
+ advance_sev_es_emulated_pio(vcpu, count, size);
if (!vcpu->arch.sev_pio_count)
return 1;
}
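
The rewritten loop chunks a long SEV-ES REP INS into min(PAGE_SIZE / size, sev_pio_count) units per pass and lets advance_sev_es_emulated_pio() move the data pointer and remaining count forward. A small worked example of that arithmetic (illustrative, userspace only):

	#include <stdio.h>

	#define PAGE_SIZE 4096u

	int main(void)
	{
		unsigned int size = 2, sev_pio_count = 3000;

		/* Pass 1: min(4096 / 2, 3000) = 2048 units (4096 bytes);
		 * pass 2: the remaining 952 units. */
		while (sev_pio_count) {
			unsigned int count = PAGE_SIZE / size;

			if (count > sev_pio_count)
				count = sev_pio_count;
			printf("chunk: %u units (%u bytes)\n", count, count * size);
			sev_pio_count -= count;
		}
		return 0;
	}
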
@@ -13110,6 +13345,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_kick_vcpu_slowpath);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_doorbell);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 588792f00334..1926d2cb8e79 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -8,6 +8,27 @@
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
+struct kvm_caps {
+ /* control of guest tsc rate supported? */
+ bool has_tsc_control;
+ /* maximum supported tsc_khz for guests */
+ u32 max_guest_tsc_khz;
+ /* number of bits of the fractional part of the TSC scaling ratio */
+ u8 tsc_scaling_ratio_frac_bits;
+ /* maximum allowed value of TSC scaling ratio */
+ u64 max_tsc_scaling_ratio;
+ /* 1ull << kvm_caps.tsc_scaling_ratio_frac_bits */
+ u64 default_tsc_scaling_ratio;
+ /* bus lock detection supported? */
+ bool has_bus_lock_exit;
+ /* notify VM exit supported? */
+ bool has_notify_vmexit;
+
+ u64 supported_mce_cap;
+ u64 supported_xcr0;
+ u64 supported_xss;
+};
+
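
struct kvm_caps gathers what used to be loose globals (kvm_has_tsc_control, kvm_max_guest_tsc_khz, supported_xcr0, supported_xss, ...), so vendor modules and common code share one structure. A minimal sketch of the fill-in pattern, mirroring the hardware-setup hunks above (init_tsc_caps_sketch() is illustrative, not a symbol added by the patch):

	/* Vendor code publishes its TSC-scaling parameters; the default ratio is
	 * derived exactly as kvm_arch_hardware_setup() does above. */
	static void init_tsc_caps_sketch(u8 frac_bits, u64 max_ratio)
	{
		kvm_caps.has_tsc_control = true;
		kvm_caps.tsc_scaling_ratio_frac_bits = frac_bits;
		kvm_caps.max_tsc_scaling_ratio = max_ratio;
		kvm_caps.default_tsc_scaling_ratio = 1ULL << frac_bits;
	}
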
void kvm_spurious_fault(void);
#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \
@@ -283,14 +304,15 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
extern u64 host_xcr0;
-extern u64 supported_xcr0;
extern u64 host_xss;
-extern u64 supported_xss;
+
+extern struct kvm_caps kvm_caps;
+
extern bool enable_pmu;
static inline bool kvm_mpx_supported(void)
{
- return (supported_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR))
+ return (kvm_caps.supported_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR))
== (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
}
@@ -344,6 +366,11 @@ static inline bool kvm_cstate_in_guest(struct kvm *kvm)
return kvm->arch.cstate_in_guest;
}
+static inline bool kvm_notify_vmexit_enabled(struct kvm *kvm)
+{
+ return kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_ENABLED;
+}
+
enum kvm_intr_type {
/* Values are arbitrary, but must be non-zero. */
KVM_HANDLING_IRQ = 1,
@@ -407,7 +434,7 @@ static inline void kvm_machine_check(void)
void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu);
void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu);
int kvm_spec_ctrl_test_value(u64 value);
-bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
struct x86_exception *e);
int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva);
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 610beba35907..a0c05ccbf4b1 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -1049,7 +1049,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
else
vcpu->arch.xen.poll_evtchn = -1;
- set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask);
+ set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask);
if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) {
vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
@@ -1071,7 +1071,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
*r = 0;
out:
/* Really, this is only needed in case of timeout */
- clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask);
+ clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask);
if (unlikely(sched_poll.nr_ports > 1))
kfree(ports);
@@ -1311,7 +1311,7 @@ static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port)
int poll_evtchn = vcpu->arch.xen.poll_evtchn;
if ((poll_evtchn == port || poll_evtchn == -1) &&
- test_and_clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask)) {
+ test_and_clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask)) {
kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
kvm_vcpu_kick(vcpu);
}
@@ -1344,7 +1344,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
vcpu = kvm_get_vcpu_by_id(kvm, xe->vcpu_id);
if (!vcpu)
return -EINVAL;
- WRITE_ONCE(xe->vcpu_idx, kvm_vcpu_get_idx(vcpu));
+ WRITE_ONCE(xe->vcpu_idx, vcpu->vcpu_idx);
}
if (!vcpu->arch.xen.vcpu_info_cache.active)
@@ -1540,7 +1540,7 @@ int kvm_xen_setup_evtchn(struct kvm *kvm,
*/
vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu);
if (vcpu)
- e->xen_evtchn.vcpu_idx = kvm_vcpu_get_idx(vcpu);
+ e->xen_evtchn.vcpu_idx = vcpu->vcpu_idx;
else
e->xen_evtchn.vcpu_idx = -1;