author     Paolo Bonzini <pbonzini@redhat.com>    2022-07-29 09:46:01 -0400
committer  Paolo Bonzini <pbonzini@redhat.com>    2022-08-01 03:21:00 -0400
commit     63f4b210414b65aa3103c54369cacbd0b1bdf02f
tree       2dc7b490d3a89306669c70256a41764ca52ab3b3 /arch/x86/kvm
parent     2e2e91158febfeb73b5d4f249440218304f34101
parent     7edc3a68038ab151a8791ddb6217755a5e4a5809
Merge remote-tracking branch 'kvm/next' into kvm-next-5.20
KVM/s390, KVM/x86 and common infrastructure changes for 5.20
x86:
* Permit guests to ignore single-bit ECC errors
* Fix races in gfn->pfn cache refresh; do not pin pages tracked by the cache
* Intel IPI virtualization
* Allow getting/setting pending triple fault with KVM_GET/SET_VCPU_EVENTS (see the sketch after this list)
* PEBS virtualization
* Simplify PMU emulation by just using PERF_TYPE_RAW events
* More accurate event reinjection on SVM (avoid retrying instructions)
* Allow getting/setting the state of the speaker port data bit (see the KVM_GET_PIT2 sketch at the end of this log)
* Refuse to load the kvm-intel module if VM-Entry/VM-Exit controls are inconsistent
* "Notify" VM exit (detect microarchitectural hangs) for Intel
* Cleanups for MCE MSR emulation
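The triple-fault item above is a small uAPI extension; below is a minimal userspace sketch of saving and clearing the pending triple fault through KVM_GET/SET_VCPU_EVENTS. The capability and flag names (KVM_CAP_X86_TRIPLE_FAULT_EVENT, KVM_VCPUEVENT_VALID_TRIPLE_FAULT) and the events.triple_fault.pending field reflect my reading of the 5.20 series and should be checked against the merged headers.

/*
 * Sketch: snapshot and clear a pending triple fault, e.g. around a
 * checkpoint. The names used here are assumptions (see lead-in above).
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int save_and_clear_triple_fault(int vm_fd, int vcpu_fd, __u8 *pending)
{
    struct kvm_enable_cap cap = { .cap = KVM_CAP_X86_TRIPLE_FAULT_EVENT };
    struct kvm_vcpu_events events;

    /* The triple-fault state is only reported once the VM cap is enabled. */
    if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
        return -1;

    if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &events) < 0)
        return -1;

    *pending = events.triple_fault.pending;

    /* The flag tells KVM_SET_VCPU_EVENTS that the field is valid. */
    events.flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
    events.triple_fault.pending = 0;
    return ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &events);
}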
s390:
* Add an interface to provide a hypervisor dump for secure guests
* Improve selftests to use the TAP interface
* Enable interpretive execution of zPCI instructions (for PCI passthrough)
* First part of deferred teardown
* CPU Topology
* PV attestation
* Minor fixes
Generic:
* New selftests API using struct kvm_vcpu instead of a (vm, id) tuple (see the sketch below)
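To illustrate the Generic item above, here is a minimal sketch of a selftest written against the reworked API, where vCPU helpers take a struct kvm_vcpu pointer instead of a (vm, vcpu_id) pair. The helper names (vm_create_with_one_vcpu, vcpu_run, kvm_vm_free, GUEST_DONE) are assumptions based on the new API and should be checked against tools/testing/selftests/kvm.

/* Minimal selftest sketch in the post-rework style. */
#include "kvm_util.h"
#include "processor.h"

static void guest_code(void)
{
    GUEST_DONE();
}

int main(void)
{
    struct kvm_vcpu *vcpu;
    struct kvm_vm *vm;

    /* Creates the VM and one vCPU; the vCPU is returned via the out-param. */
    vm = vm_create_with_one_vcpu(&vcpu, guest_code);

    /* Helpers now take the vCPU object, not (vm, vcpu_id). */
    vcpu_run(vcpu);

    kvm_vm_free(vm);
    return 0;
}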
x86:
* Use try_cmpxchg64 instead of cmpxchg64
* Bugfixes
* Ignore benign host accesses to PMU MSRs when PMU is disabled
* Allow disabling KVM's "MONITOR/MWAIT are NOPs!" behavior
* x86/MMU: Allow NX huge pages to be disabled on a per-VM basis (see the sketch after this list)
* Port eager page splitting to shadow MMU as well
* Enable CMCI capability by default and handle injected UCNA errors
* Expose pid of vcpu threads in debugfs
* x2AVIC support for AMD
* Clean up PIO emulation
* Fixes for LLDT/LTR emulation
* Don't require refcounted "struct page" to create huge SPTEs
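The per-VM NX huge page item above is also a uAPI change; here is a sketch of how a VMM might opt a trusted VM out of the iTLB-multihit NX huge page mitigation, assuming the capability is named KVM_CAP_VM_DISABLE_NX_HUGE_PAGES and must be enabled on the VM file descriptor before any vCPU is created.

/*
 * Sketch: disable the NX huge page mitigation for this VM only.
 * KVM_CAP_VM_DISABLE_NX_HUGE_PAGES is an assumed name from the series;
 * per my reading it also requires extra privilege (e.g. CAP_SYS_BOOT).
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int vm_disable_nx_huge_pages(int vm_fd)
{
    struct kvm_enable_cap cap = {
        .cap = KVM_CAP_VM_DISABLE_NX_HUGE_PAGES,
    };

    /* Must be called before the first KVM_CREATE_VCPU on this VM. */
    return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}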
x86 cleanups:
* Use separate namespaces for guest PTE and shadow PTE bitmasks
* PIO emulation
* Reorganize rmap API, mostly around rmap destruction
* Do not work around very old KVM bugs for an L0 that runs with nesting enabled
* New selftests API for CPUID
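Finally, the speaker-port item in the first x86 section maps onto the new KVM_PIT_FLAGS_SPEAKER_DATA_ON flag visible in the i8254.c hunk of the diff below: the bit moves from a private speaker_data_on field into kvm_pit_state2.flags, so userspace can read and restore it via KVM_GET_PIT2/KVM_SET_PIT2. A minimal sketch, assuming the in-kernel PIT was created with KVM_CREATE_PIT2:

/* Sketch: read the speaker data bit from the in-kernel PIT state. */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdbool.h>

static int pit_speaker_data_on(int vm_fd, bool *data_on)
{
    struct kvm_pit_state2 state;

    if (ioctl(vm_fd, KVM_GET_PIT2, &state) < 0)
        return -1;

    /* Bit 1 of flags, per the KVM_PIT_FLAGS_SPEAKER_DATA_ON change. */
    *data_on = state.flags & KVM_PIT_FLAGS_SPEAKER_DATA_ON;
    return 0;
}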
Diffstat (limited to 'arch/x86/kvm')
44 files changed, 2994 insertions, 1440 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index de6d44e07e34..75dcf7a72605 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -67,9 +67,17 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted) #define F feature_bit #define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0) +/* + * Magic value used by KVM when querying userspace-provided CPUID entries and + * doesn't care about the CPIUD index because the index of the function in + * question is not significant. Note, this magic value must have at least one + * bit set in bits[63:32] and must be consumed as a u64 by cpuid_entry2_find() + * to avoid false positives when processing guest CPUID input. + */ +#define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( - struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index) + struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index) { struct kvm_cpuid_entry2 *e; int i; @@ -77,9 +85,31 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( for (i = 0; i < nent; i++) { e = &entries[i]; - if (e->function == function && - (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index)) + if (e->function != function) + continue; + + /* + * If the index isn't significant, use the first entry with a + * matching function. It's userspace's responsibilty to not + * provide "duplicate" entries in all cases. + */ + if (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index) + return e; + + + /* + * Similarly, use the first matching entry if KVM is doing a + * lookup (as opposed to emulating CPUID) for a function that's + * architecturally defined as not having a significant index. + */ + if (index == KVM_CPUID_INDEX_NOT_SIGNIFICANT) { + /* + * Direct lookups from KVM should not diverge from what + * KVM defines internally (the architectural behavior). + */ + WARN_ON_ONCE(cpuid_function_is_indexed(function)); return e; + } } return NULL; @@ -96,7 +126,8 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu, * The existing code assumes virtual address is 48-bit or 57-bit in the * canonical address checks; exit if it is ever changed. */ - best = cpuid_entry2_find(entries, nent, 0x80000008, 0); + best = cpuid_entry2_find(entries, nent, 0x80000008, + KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) { int vaddr_bits = (best->eax & 0xff00) >> 8; @@ -151,7 +182,7 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) vcpu->arch.kvm_cpuid_base = 0; for_each_possible_hypervisor_cpuid_base(function) { - entry = kvm_find_cpuid_entry(vcpu, function, 0); + entry = kvm_find_cpuid_entry(vcpu, function); if (entry) { u32 signature[3]; @@ -177,7 +208,8 @@ static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *v if (!base) return NULL; - return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, 0); + return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, + KVM_CPUID_INDEX_NOT_SIGNIFICANT); } static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) @@ -200,7 +232,7 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) /* * Calculate guest's supported XCR0 taking into account guest CPUID data and - * supported_xcr0 (comprised of host configuration and KVM_SUPPORTED_XCR0). + * KVM's supported XCR0 (comprised of host's XCR0 and KVM_SUPPORTED_XCR0). 
*/ static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent) { @@ -210,7 +242,7 @@ static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent) if (!best) return 0; - return (best->eax | ((u64)best->edx << 32)) & supported_xcr0; + return (best->eax | ((u64)best->edx << 32)) & kvm_caps.supported_xcr0; } static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries, @@ -219,7 +251,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e struct kvm_cpuid_entry2 *best; u64 guest_supported_xcr0 = cpuid_get_supported_xcr0(entries, nent); - best = cpuid_entry2_find(entries, nent, 1, 0); + best = cpuid_entry2_find(entries, nent, 1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) { /* Update OSXSAVE bit */ if (boot_cpu_has(X86_FEATURE_XSAVE)) @@ -250,7 +282,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { - best = cpuid_entry2_find(entries, nent, 0x1, 0); + best = cpuid_entry2_find(entries, nent, 0x1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) cpuid_entry_change(best, X86_FEATURE_MWAIT, vcpu->arch.ia32_misc_enable_msr & @@ -285,7 +317,7 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *best; u64 guest_supported_xcr0; - best = kvm_find_cpuid_entry(vcpu, 1, 0); + best = kvm_find_cpuid_entry(vcpu, 1); if (best && apic) { if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) apic->lapic_timer.timer_mode_mask = 3 << 17; @@ -325,10 +357,10 @@ int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); + best = kvm_find_cpuid_entry(vcpu, 0x80000000); if (!best || best->eax < 0x80000008) goto not_found; - best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); + best = kvm_find_cpuid_entry(vcpu, 0x80000008); if (best) return best->eax & 0xff; not_found: @@ -868,7 +900,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) case 9: break; case 0xa: { /* Architectural Performance Monitoring */ - struct x86_pmu_capability cap; union cpuid10_eax eax; union cpuid10_edx edx; @@ -877,30 +908,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; } - perf_get_x86_pmu_capability(&cap); + eax.split.version_id = kvm_pmu_cap.version; + eax.split.num_counters = kvm_pmu_cap.num_counters_gp; + eax.split.bit_width = kvm_pmu_cap.bit_width_gp; + eax.split.mask_length = kvm_pmu_cap.events_mask_len; + edx.split.num_counters_fixed = kvm_pmu_cap.num_counters_fixed; + edx.split.bit_width_fixed = kvm_pmu_cap.bit_width_fixed; - /* - * The guest architecture pmu is only supported if the architecture - * pmu exists on the host and the module parameters allow it. 
- */ - if (!cap.version || !enable_pmu) - memset(&cap, 0, sizeof(cap)); - - eax.split.version_id = min(cap.version, 2); - eax.split.num_counters = cap.num_counters_gp; - eax.split.bit_width = cap.bit_width_gp; - eax.split.mask_length = cap.events_mask_len; - - edx.split.num_counters_fixed = - min(cap.num_counters_fixed, KVM_PMC_MAX_FIXED); - edx.split.bit_width_fixed = cap.bit_width_fixed; - if (cap.version) + if (kvm_pmu_cap.version) edx.split.anythread_deprecated = 1; edx.split.reserved1 = 0; edx.split.reserved2 = 0; entry->eax = eax.full; - entry->ebx = cap.events_mask; + entry->ebx = kvm_pmu_cap.events_mask; entry->ecx = 0; entry->edx = edx.full; break; @@ -923,8 +944,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) } break; case 0xd: { - u64 permitted_xcr0 = supported_xcr0 & xstate_get_guest_group_perm(); - u64 permitted_xss = supported_xss; + u64 permitted_xcr0 = kvm_caps.supported_xcr0 & xstate_get_guest_group_perm(); + u64 permitted_xss = kvm_caps.supported_xss; entry->eax &= permitted_xcr0; entry->ebx = xstate_required_size(permitted_xcr0, false); @@ -1313,12 +1334,20 @@ out_free: return r; } -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function, u32 index) +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, + u32 function, u32 index) { return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, function, index); } +EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry_index); + +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function) +{ + return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, + function, KVM_CPUID_INDEX_NOT_SIGNIFICANT); +} EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); /* @@ -1355,7 +1384,7 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index) struct kvm_cpuid_entry2 *basic, *class; u32 function = *fn_ptr; - basic = kvm_find_cpuid_entry(vcpu, 0, 0); + basic = kvm_find_cpuid_entry(vcpu, 0); if (!basic) return NULL; @@ -1364,11 +1393,11 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index) return NULL; if (function >= 0x40000000 && function <= 0x4fffffff) - class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00, 0); + class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00); else if (function >= 0xc0000000) - class = kvm_find_cpuid_entry(vcpu, 0xc0000000, 0); + class = kvm_find_cpuid_entry(vcpu, 0xc0000000); else - class = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); + class = kvm_find_cpuid_entry(vcpu, function & 0x80000000); if (class && function <= class->eax) return NULL; @@ -1386,7 +1415,7 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index) * the effective CPUID entry is the max basic leaf. Note, the index of * the original requested leaf is observed! */ - return kvm_find_cpuid_entry(vcpu, basic->eax, index); + return kvm_find_cpuid_entry_index(vcpu, basic->eax, index); } bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, @@ -1396,7 +1425,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, struct kvm_cpuid_entry2 *entry; bool exact, used_max_basic = false; - entry = kvm_find_cpuid_entry(vcpu, function, index); + entry = kvm_find_cpuid_entry_index(vcpu, function, index); exact = !!entry; if (!entry && !exact_only) { @@ -1425,7 +1454,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, * exists. EDX can be copied from any existing index. 
*/ if (function == 0xb || function == 0x1f) { - entry = kvm_find_cpuid_entry(vcpu, function, 1); + entry = kvm_find_cpuid_entry_index(vcpu, function, 1); if (entry) { *ecx = index & 0xff; *edx = entry->edx; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 8a770b481d9d..b1658c0de847 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -13,8 +13,10 @@ void kvm_set_cpu_caps(void); void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); void kvm_update_pv_runtime(struct kvm_vcpu *vcpu); +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, + u32 function, u32 index); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function, u32 index); + u32 function); int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries, unsigned int type); @@ -76,7 +78,7 @@ static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu, const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature); struct kvm_cpuid_entry2 *entry; - entry = kvm_find_cpuid_entry(vcpu, cpuid.function, cpuid.index); + entry = kvm_find_cpuid_entry_index(vcpu, cpuid.function, cpuid.index); if (!entry) return NULL; @@ -109,7 +111,7 @@ static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0, 0); + best = kvm_find_cpuid_entry(vcpu, 0); return best && (is_guest_vendor_amd(best->ebx, best->ecx, best->edx) || is_guest_vendor_hygon(best->ebx, best->ecx, best->edx)); @@ -119,7 +121,7 @@ static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0, 0); + best = kvm_find_cpuid_entry(vcpu, 0); return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx); } @@ -127,7 +129,7 @@ static inline int guest_cpuid_family(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + best = kvm_find_cpuid_entry(vcpu, 0x1); if (!best) return -1; @@ -138,18 +140,23 @@ static inline int guest_cpuid_model(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + best = kvm_find_cpuid_entry(vcpu, 0x1); if (!best) return -1; return x86_model(best->eax); } +static inline bool cpuid_model_is_consistent(struct kvm_vcpu *vcpu) +{ + return boot_cpu_data.x86_model == guest_cpuid_model(vcpu); +} + static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + best = kvm_find_cpuid_entry(vcpu, 0x1); if (!best) return -1; diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 9240b3b7f8dd..cfed36aba2f7 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -48,7 +48,7 @@ DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_fops, vcpu_get_tsc_scaling_ratio, NULL, static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val) { - *val = kvm_tsc_scaling_ratio_frac_bits; + *val = kvm_caps.tsc_scaling_ratio_frac_bits; return 0; } @@ -66,7 +66,7 @@ void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_ debugfs_dentry, vcpu, &vcpu_timer_advance_ns_fops); - if (kvm_has_tsc_control) { + if (kvm_caps.has_tsc_control) { debugfs_create_file("tsc-scaling-ratio", 0444, debugfs_dentry, vcpu, &vcpu_tsc_scaling_fops); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f8382abe22ff..047c583596bb 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -244,6 +244,9 @@ enum 
x86_transfer_type { static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) { + if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt)) + nr &= NR_EMULATOR_GPRS - 1; + if (!(ctxt->regs_valid & (1 << nr))) { ctxt->regs_valid |= 1 << nr; ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr); @@ -253,6 +256,12 @@ static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr) { + if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt)) + nr &= NR_EMULATOR_GPRS - 1; + + BUILD_BUG_ON(sizeof(ctxt->regs_dirty) * BITS_PER_BYTE < NR_EMULATOR_GPRS); + BUILD_BUG_ON(sizeof(ctxt->regs_valid) * BITS_PER_BYTE < NR_EMULATOR_GPRS); + ctxt->regs_valid |= 1 << nr; ctxt->regs_dirty |= 1 << nr; return &ctxt->_regs[nr]; @@ -266,9 +275,10 @@ static ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr) static void writeback_registers(struct x86_emulate_ctxt *ctxt) { + unsigned long dirty = ctxt->regs_dirty; unsigned reg; - for_each_set_bit(reg, (ulong *)&ctxt->regs_dirty, 16) + for_each_set_bit(reg, &dirty, NR_EMULATOR_GPRS) ctxt->ops->write_gpr(ctxt, reg, ctxt->_regs[reg]); } @@ -615,7 +625,9 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, u32 error, bool valid) { - WARN_ON(vec > 0x1f); + if (KVM_EMULATOR_BUG_ON(vec > 0x1f, ctxt)) + return X86EMUL_UNHANDLEABLE; + ctxt->exception.vector = vec; ctxt->exception.error_code = error; ctxt->exception.error_code_valid = valid; @@ -1362,7 +1374,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, if (mc->pos < mc->end) goto read_cached; - WARN_ON((mc->end + size) >= sizeof(mc->data)); + if (KVM_EMULATOR_BUG_ON((mc->end + size) >= sizeof(mc->data), ctxt)) + return X86EMUL_UNHANDLEABLE; rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, size, &ctxt->exception); @@ -1687,16 +1700,6 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, case VCPU_SREG_TR: if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) goto exception; - if (!seg_desc.p) { - err_vec = NP_VECTOR; - goto exception; - } - old_desc = seg_desc; - seg_desc.type |= 2; /* busy */ - ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc, - sizeof(seg_desc), &ctxt->exception); - if (ret != X86EMUL_CONTINUE) - return ret; break; case VCPU_SREG_LDTR: if (seg_desc.s || seg_desc.type != 2) @@ -1734,8 +1737,17 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (ret != X86EMUL_CONTINUE) return ret; if (emul_is_noncanonical_address(get_desc_base(&seg_desc) | - ((u64)base3 << 32), ctxt)) - return emulate_gp(ctxt, 0); + ((u64)base3 << 32), ctxt)) + return emulate_gp(ctxt, err_code); + } + + if (seg == VCPU_SREG_TR) { + old_desc = seg_desc; + seg_desc.type |= 2; /* busy */ + ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc, + sizeof(seg_desc), &ctxt->exception); + if (ret != X86EMUL_CONTINUE) + return ret; } load: ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg); @@ -2432,7 +2444,7 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7ff4) | X86_EFLAGS_FIXED; ctxt->_eip = GET_SMSTATE(u32, smstate, 0x7ff0); - for (i = 0; i < 8; i++) + for (i = 0; i < NR_EMULATOR_GPRS; i++) *reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4); val = GET_SMSTATE(u32, smstate, 0x7fcc); @@ -2489,7 +2501,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u16 selector; int i, r; - for 
(i = 0; i < 16; i++) + for (i = 0; i < NR_EMULATOR_GPRS; i++) *reg_write(ctxt, i) = GET_SMSTATE(u64, smstate, 0x7ff8 - i * 8); ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78); @@ -5719,7 +5731,8 @@ writeback: done: if (rc == X86EMUL_PROPAGATE_FAULT) { - WARN_ON(ctxt->exception.vector > 0x1f); + if (KVM_EMULATOR_BUG_ON(ctxt->exception.vector > 0x1f, ctxt)) + return EMULATION_FAILED; ctxt->have_exception = true; } if (rc == X86EMUL_INTERCEPTED) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index e2e95a6fccfd..ed804447589c 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1992,7 +1992,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *entry; struct kvm_vcpu_hv *hv_vcpu; - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE); if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX) { vcpu->arch.hyperv_enabled = true; } else { @@ -2005,7 +2005,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) hv_vcpu = to_hv_vcpu(vcpu); - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES); if (entry) { hv_vcpu->cpuid_cache.features_eax = entry->eax; hv_vcpu->cpuid_cache.features_ebx = entry->ebx; @@ -2016,7 +2016,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) hv_vcpu->cpuid_cache.features_edx = 0; } - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO); if (entry) { hv_vcpu->cpuid_cache.enlightenments_eax = entry->eax; hv_vcpu->cpuid_cache.enlightenments_ebx = entry->ebx; @@ -2025,7 +2025,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) hv_vcpu->cpuid_cache.enlightenments_ebx = 0; } - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES); if (entry) hv_vcpu->cpuid_cache.syndbg_cap_eax = entry->eax; else diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 1c83076091af..e0a7a0e7a73c 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -591,7 +591,10 @@ static int speaker_ioport_write(struct kvm_vcpu *vcpu, return -EOPNOTSUPP; mutex_lock(&pit_state->lock); - pit_state->speaker_data_on = (val >> 1) & 1; + if (val & (1 << 1)) + pit_state->flags |= KVM_PIT_FLAGS_SPEAKER_DATA_ON; + else + pit_state->flags &= ~KVM_PIT_FLAGS_SPEAKER_DATA_ON; pit_set_gate(pit, 2, val & 1); mutex_unlock(&pit_state->lock); return 0; @@ -612,8 +615,9 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu, refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1; mutex_lock(&pit_state->lock); - ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(pit, 2) | - (pit_get_out(pit, 2) << 5) | (refresh_clock << 4)); + ret = (!!(pit_state->flags & KVM_PIT_FLAGS_SPEAKER_DATA_ON) << 1) | + pit_get_gate(pit, 2) | (pit_get_out(pit, 2) << 5) | + (refresh_clock << 4); if (len > sizeof(ret)) len = sizeof(ret); memcpy(data, (char *)&ret, len); diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 394d9527da7e..a768212ba821 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -29,7 +29,6 @@ struct kvm_kpit_state { bool is_periodic; s64 period; /* unit: ns */ struct hrtimer timer; - u32 speaker_data_on; struct mutex lock; atomic_t reinject; diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 8dff25d267b7..89246446d6aa 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -89,6 
+89,7 @@ struct x86_instruction_info { #define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */ struct x86_emulate_ops { + void (*vm_bugged)(struct x86_emulate_ctxt *ctxt); /* * read_gpr: read a general purpose register (rax - r15) * @@ -301,6 +302,18 @@ struct fastop; typedef void (*fastop_t)(struct fastop *); +/* + * The emulator's _regs array tracks only the GPRs, i.e. excludes RIP. RIP is + * tracked/accessed via _eip, and except for RIP relative addressing, which + * also uses _eip, RIP cannot be a register operand nor can it be an operand in + * a ModRM or SIB byte. + */ +#ifdef CONFIG_X86_64 +#define NR_EMULATOR_GPRS 16 +#else +#define NR_EMULATOR_GPRS 8 +#endif + struct x86_emulate_ctxt { void *vcpu; const struct x86_emulate_ops *ops; @@ -345,9 +358,9 @@ struct x86_emulate_ctxt { u8 lock_prefix; u8 rep_prefix; /* bitmaps of registers in _regs[] that can be read */ - u32 regs_valid; + u16 regs_valid; /* bitmaps of registers in _regs[] that have been written */ - u32 regs_dirty; + u16 regs_dirty; /* modrm */ u8 modrm; u8 modrm_mod; @@ -363,7 +376,7 @@ struct x86_emulate_ctxt { struct operand src2; struct operand dst; struct operand memop; - unsigned long _regs[NR_VCPU_REGS]; + unsigned long _regs[NR_EMULATOR_GPRS]; struct operand *memopp; struct fetch_cache fetch; struct read_cache io_read; @@ -371,6 +384,15 @@ struct x86_emulate_ctxt { bool is_branch; }; +#define KVM_EMULATOR_BUG_ON(cond, ctxt) \ +({ \ + int __ret = (cond); \ + \ + if (WARN_ON_ONCE(__ret)) \ + ctxt->ops->vm_bugged(ctxt); \ + unlikely(__ret); \ +}) + /* Repeat String Operation Prefix */ #define REPE_PREFIX 0xf3 #define REPNE_PREFIX 0xf2 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0e68b4c937fc..e2ce3556915e 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -27,6 +27,7 @@ #include <linux/math64.h> #include <linux/slab.h> #include <asm/processor.h> +#include <asm/mce.h> #include <asm/msr.h> #include <asm/page.h> #include <asm/current.h> @@ -54,7 +55,7 @@ #define PRIo64 "o" /* 14 is the version for Xeon and Pentium 8.4.8*/ -#define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16)) +#define APIC_VERSION 0x14UL #define LAPIC_MMIO_LENGTH (1 << 12) /* followed define is not in apicdef.h */ #define MAX_APIC_VECTOR 256 @@ -67,6 +68,8 @@ static bool lapic_timer_advance_dynamic __read_mostly; #define LAPIC_TIMER_ADVANCE_NS_MAX 5000 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 +static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data); +static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data); static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val) { @@ -398,14 +401,26 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val) return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI; } +static inline bool kvm_lapic_lvt_supported(struct kvm_lapic *apic, int lvt_index) +{ + return apic->nr_lvt_entries > lvt_index; +} + +static inline int kvm_apic_calc_nr_lvt_entries(struct kvm_vcpu *vcpu) +{ + return KVM_APIC_MAX_NR_LVT_ENTRIES - !(vcpu->arch.mcg_cap & MCG_CMCI_P); +} + void kvm_apic_set_version(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - u32 v = APIC_VERSION; + u32 v = 0; if (!lapic_in_kernel(vcpu)) return; + v = APIC_VERSION | ((apic->nr_lvt_entries - 1) << 16); + /* * KVM emulates 82093AA datasheet (with in-kernel IOAPIC implementation) * which doesn't have EOI register; Some buggy OSes (e.g. 
Windows with @@ -419,12 +434,33 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) kvm_lapic_set_reg(apic, APIC_LVR, v); } -static const unsigned int apic_lvt_mask[KVM_APIC_LVT_NUM] = { - LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */ - LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ - LVT_MASK | APIC_MODE_MASK, /* LVTPC */ - LINT_MASK, LINT_MASK, /* LVT0-1 */ - LVT_MASK /* LVTERR */ +void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu) +{ + int nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu); + struct kvm_lapic *apic = vcpu->arch.apic; + int i; + + if (!lapic_in_kernel(vcpu) || nr_lvt_entries == apic->nr_lvt_entries) + return; + + /* Initialize/mask any "new" LVT entries. */ + for (i = apic->nr_lvt_entries; i < nr_lvt_entries; i++) + kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED); + + apic->nr_lvt_entries = nr_lvt_entries; + + /* The number of LVT entries is reflected in the version register. */ + kvm_apic_set_version(vcpu); +} + +static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = { + [LVT_TIMER] = LVT_MASK, /* timer mode mask added at runtime */ + [LVT_THERMAL_MONITOR] = LVT_MASK | APIC_MODE_MASK, + [LVT_PERFORMANCE_COUNTER] = LVT_MASK | APIC_MODE_MASK, + [LVT_LINT0] = LINT_MASK, + [LVT_LINT1] = LINT_MASK, + [LVT_ERROR] = LVT_MASK, + [LVT_CMCI] = LVT_MASK | APIC_MODE_MASK }; static int find_highest_vector(void *bitmap) @@ -518,14 +554,11 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) { - struct kvm_vcpu *vcpu; - - vcpu = apic->vcpu; - - if (unlikely(vcpu->arch.apicv_active)) { + if (unlikely(apic->apicv_active)) { /* need to update RVI */ kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); - static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); + static_call_cond(kvm_x86_hwapic_irr_update)(apic->vcpu, + apic_find_highest_irr(apic)); } else { apic->irr_pending = false; kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); @@ -542,20 +575,16 @@ EXPORT_SYMBOL_GPL(kvm_apic_clear_irr); static inline void apic_set_isr(int vec, struct kvm_lapic *apic) { - struct kvm_vcpu *vcpu; - if (__apic_test_and_set_vector(vec, apic->regs + APIC_ISR)) return; - vcpu = apic->vcpu; - /* * With APIC virtualization enabled, all caching is disabled * because the processor can modify ISR under the hood. Instead * just set SVI. */ - if (unlikely(vcpu->arch.apicv_active)) - static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, vec); + if (unlikely(apic->apicv_active)) + static_call_cond(kvm_x86_hwapic_isr_update)(vec); else { ++apic->isr_count; BUG_ON(apic->isr_count > MAX_APIC_VECTOR); @@ -589,12 +618,9 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic) static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) { - struct kvm_vcpu *vcpu; if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) return; - vcpu = apic->vcpu; - /* * We do get here for APIC virtualization enabled if the guest * uses the Hyper-V APIC enlightenment. In this case we may need @@ -602,8 +628,8 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) * on the other hand isr_count and highest_isr_cache are unused * and must be left alone. 
*/ - if (unlikely(vcpu->arch.apicv_active)) - static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); + if (unlikely(apic->apicv_active)) + static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic)); else { --apic->isr_count; BUG_ON(apic->isr_count < 0); @@ -801,17 +827,17 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) if (kvm_apic_broadcast(apic, mda)) return true; - if (apic_x2apic_mode(apic)) - return mda == kvm_x2apic_id(apic); - /* - * Hotplug hack: Make LAPIC in xAPIC mode also accept interrupts as if - * it were in x2APIC mode. Hotplugged VCPUs start in xAPIC mode and - * this allows unique addressing of VCPUs with APIC ID over 0xff. - * The 0xff condition is needed because writeable xAPIC ID. + * Hotplug hack: Accept interrupts for vCPUs in xAPIC mode as if they + * were in x2APIC mode if the target APIC ID can't be encoded as an + * xAPIC ID. This allows unique addressing of hotplugged vCPUs (which + * start in xAPIC mode) with an APIC ID that is unaddressable in xAPIC + * mode. Match the x2APIC ID if and only if the target APIC ID can't + * be encoded in xAPIC to avoid spurious matches against a vCPU that + * changed its (addressable) xAPIC ID (which is writable). */ - if (kvm_x2apic_id(apic) > 0xff && mda == kvm_x2apic_id(apic)) - return true; + if (apic_x2apic_mode(apic) || mda > 0xff) + return mda == kvm_x2apic_id(apic); return mda == kvm_xapic_id(apic); } @@ -1325,7 +1351,7 @@ void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) if (apic_x2apic_mode(apic)) irq.dest_id = icr_high; else - irq.dest_id = GET_APIC_DEST_FIELD(icr_high); + irq.dest_id = GET_XAPIC_DEST_FIELD(icr_high); trace_kvm_apic_ipi(icr_low, irq.dest_id); @@ -1444,6 +1470,9 @@ static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, APIC_REG_MASK(APIC_TMCCT) | APIC_REG_MASK(APIC_TDCR); + if (kvm_lapic_lvt_supported(apic, LVT_CMCI)) + valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI); + /* * ARBPRI and ICR2 are not valid in x2APIC mode. WARN if KVM reads ICR * in x2APIC mode as it's an 8-byte register in x2APIC and needs to be @@ -1583,7 +1612,7 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) int vec = reg & APIC_VECTOR_MASK; void *bitmap = apic->regs + APIC_ISR; - if (vcpu->arch.apicv_active) + if (apic->apicv_active) bitmap = apic->regs + APIC_IRR; if (apic_test_vector(vec, bitmap)) @@ -1602,7 +1631,7 @@ static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles) * that __delay() uses delay_tsc whenever the hardware has TSC, thus * always for VMX enabled hardware. 
*/ - if (vcpu->arch.tsc_scaling_ratio == kvm_default_tsc_scaling_ratio) { + if (vcpu->arch.tsc_scaling_ratio == kvm_caps.default_tsc_scaling_ratio) { __delay(min(guest_cycles, nsec_to_cycles(vcpu, timer_advance_ns))); } else { @@ -1700,7 +1729,7 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use) ktimer->expired_tscdeadline = ktimer->tscdeadline; - if (!from_timer_fn && vcpu->arch.apicv_active) { + if (!from_timer_fn && apic->apicv_active) { WARN_ON(kvm_get_running_vcpu() != vcpu); kvm_apic_inject_pending_timer_irqs(apic); return; @@ -2052,6 +2081,16 @@ static void kvm_lapic_xapic_id_updated(struct kvm_lapic *apic) kvm_set_apicv_inhibit(apic->vcpu->kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED); } +static int get_lvt_index(u32 reg) +{ + if (reg == APIC_LVTCMCI) + return LVT_CMCI; + if (reg < APIC_LVTT || reg > APIC_LVTERR) + return -1; + return array_index_nospec( + (reg - APIC_LVTT) >> 4, KVM_APIC_MAX_NR_LVT_ENTRIES); +} + static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) { int ret = 0; @@ -2098,13 +2137,10 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) apic_set_spiv(apic, val & mask); if (!(val & APIC_SPIV_APIC_ENABLED)) { int i; - u32 lvt_val; - for (i = 0; i < KVM_APIC_LVT_NUM; i++) { - lvt_val = kvm_lapic_get_reg(apic, - APIC_LVTT + 0x10 * i); - kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, - lvt_val | APIC_LVT_MASKED); + for (i = 0; i < apic->nr_lvt_entries; i++) { + kvm_lapic_set_reg(apic, APIC_LVTx(i), + kvm_lapic_get_reg(apic, APIC_LVTx(i)) | APIC_LVT_MASKED); } apic_update_lvtt(apic); atomic_set(&apic->lapic_timer.pending, 0); @@ -2133,16 +2169,15 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_LVTTHMR: case APIC_LVTPC: case APIC_LVT1: - case APIC_LVTERR: { - /* TODO: Check vector */ - size_t size; - u32 index; - + case APIC_LVTERR: + case APIC_LVTCMCI: { + u32 index = get_lvt_index(reg); + if (!kvm_lapic_lvt_supported(apic, index)) { + ret = 1; + break; + } if (!kvm_apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; - size = ARRAY_SIZE(apic_lvt_mask); - index = array_index_nospec( - (reg - APIC_LVTT) >> 4, size); val &= apic_lvt_mask[index]; kvm_lapic_set_reg(apic, reg, val); break; @@ -2246,10 +2281,26 @@ EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); /* emulate APIC access in a trap manner */ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) { - u32 val = kvm_lapic_get_reg(vcpu->arch.apic, offset); + struct kvm_lapic *apic = vcpu->arch.apic; + u64 val; + + if (apic_x2apic_mode(apic)) + kvm_lapic_msr_read(apic, offset, &val); + else + val = kvm_lapic_get_reg(apic, offset); - /* TODO: optimize to just emulate side effect w/o one more write */ - kvm_lapic_reg_write(vcpu->arch.apic, offset, val); + /* + * ICR is a single 64-bit register when x2APIC is enabled. For legacy + * xAPIC, ICR writes need to go down the common (slightly slower) path + * to get the upper half from ICR2. 
+ */ + if (apic_x2apic_mode(apic) && offset == APIC_ICR) { + kvm_apic_send_ipi(apic, (u32)val, (u32)(val >> 32)); + trace_kvm_apic_write(APIC_ICR, val); + } else { + /* TODO: optimize to just emulate side effect w/o one more write */ + kvm_lapic_reg_write(apic, offset, (u32)val); + } } EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode); @@ -2344,8 +2395,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE)) kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id); - if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) + if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) { + kvm_vcpu_update_apicv(vcpu); static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu); + } apic->base_address = apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; @@ -2361,7 +2414,7 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (vcpu->arch.apicv_active) { + if (apic->apicv_active) { /* irr_pending is always true when apicv is activated. */ apic->irr_pending = true; apic->isr_count = 1; @@ -2401,8 +2454,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); kvm_apic_set_version(apic->vcpu); - for (i = 0; i < KVM_APIC_LVT_NUM; i++) - kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); + for (i = 0; i < apic->nr_lvt_entries; i++) + kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED); apic_update_lvtt(apic); if (kvm_vcpu_is_reset_bsp(vcpu) && kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED)) @@ -2436,10 +2489,10 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); - if (vcpu->arch.apicv_active) { + if (apic->apicv_active) { static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1); - static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, -1); + static_call_cond(kvm_x86_hwapic_isr_update)(-1); } vcpu->arch.apic_arb_prio = 0; @@ -2532,6 +2585,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) } apic->vcpu = vcpu; + apic->nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu); + hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); apic->lapic_timer.timer.function = apic_timer_fn; @@ -2716,10 +2771,10 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) kvm_lapic_set_reg(apic, APIC_TMCCT, 0); kvm_apic_update_apicv(vcpu); apic->highest_isr_cache = -1; - if (vcpu->arch.apicv_active) { + if (apic->apicv_active) { static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); - static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); + static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic)); } kvm_make_request(KVM_REQ_EVENT, vcpu); if (ioapic_in_kernel(vcpu->kvm)) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 65bb2a8cf145..117a46df5cc1 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -10,7 +10,6 @@ #define KVM_APIC_INIT 0 #define KVM_APIC_SIPI 1 -#define KVM_APIC_LVT_NUM 6 #define APIC_SHORT_MASK 0xc0000 #define APIC_DEST_NOSHORT 0x0 @@ -29,6 +28,20 @@ enum lapic_mode { LAPIC_MODE_X2APIC = MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE, }; +enum lapic_lvt_entry { + LVT_TIMER, + LVT_THERMAL_MONITOR, + LVT_PERFORMANCE_COUNTER, + LVT_LINT0, + LVT_LINT1, + LVT_ERROR, + LVT_CMCI, + + 
KVM_APIC_MAX_NR_LVT_ENTRIES, +}; + +#define APIC_LVTx(x) ((x) == LVT_CMCI ? APIC_LVTCMCI : APIC_LVTT + 0x10 * (x)) + struct kvm_timer { struct hrtimer timer; s64 period; /* unit: ns */ @@ -48,6 +61,7 @@ struct kvm_lapic { struct kvm_timer lapic_timer; u32 divide_count; struct kvm_vcpu *vcpu; + bool apicv_active; bool sw_enabled; bool irr_pending; bool lvt0_in_nmi_mode; @@ -65,6 +79,7 @@ struct kvm_lapic { struct gfn_to_hva_cache vapic_cache; unsigned long pending_events; unsigned int sipi_vector; + int nr_lvt_entries; }; struct dest_map; @@ -84,6 +99,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_recalculate_apic_map(struct kvm *kvm); void kvm_apic_set_version(struct kvm_vcpu *vcpu); +void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int shorthand, unsigned int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); @@ -204,7 +220,7 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic) static inline bool kvm_vcpu_apicv_active(struct kvm_vcpu *vcpu) { - return vcpu->arch.apic && vcpu->arch.apicv_active; + return lapic_in_kernel(vcpu) && vcpu->arch.apic->apicv_active; } static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index f8192864b496..a99acec925eb 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -6,11 +6,6 @@ #include "kvm_cache_regs.h" #include "cpuid.h" -#define PT64_PT_BITS 9 -#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) -#define PT32_PT_BITS 10 -#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) - #define PT_WRITABLE_SHIFT 1 #define PT_USER_SHIFT 2 @@ -34,11 +29,6 @@ #define PT_DIR_PAT_SHIFT 12 #define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) -#define PT32_DIR_PSE36_SIZE 4 -#define PT32_DIR_PSE36_SHIFT 13 -#define PT32_DIR_PSE36_MASK \ - (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) - #define PT64_ROOT_5LEVEL 5 #define PT64_ROOT_4LEVEL 4 #define PT32_ROOT_LEVEL 2 diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 17252f39bd7c..3e1317325e1f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -53,8 +53,6 @@ #include <asm/kvm_page_track.h> #include "trace.h" -#include "paging.h" - extern bool itlb_multihit_kvm_mitigation; int __read_mostly nx_huge_pages = -1; @@ -111,26 +109,6 @@ module_param(dbg, bool, 0644); #define PTE_PREFETCH_NUM 8 -#define PT32_LEVEL_BITS 10 - -#define PT32_LEVEL_SHIFT(level) \ - (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) - -#define PT32_LVL_OFFSET_MASK(level) \ - (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT32_LEVEL_BITS))) - 1)) - -#define PT32_INDEX(address, level)\ - (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) - - -#define PT32_BASE_ADDR_MASK PAGE_MASK -#define PT32_DIR_BASE_ADDR_MASK \ - (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) -#define PT32_LVL_ADDR_MASK(level) \ - (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT32_LEVEL_BITS))) - 1)) - #include <trace/events/kvm.h> /* make pte_list_desc fit well in cache lines */ @@ -326,13 +304,6 @@ static int is_cpuid_PSE36(void) return 1; } -static gfn_t pse36_gfn_delta(u32 gpte) -{ - int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; - - return (gpte & PT32_DIR_PSE36_MASK) << shift; -} - #ifdef CONFIG_X86_64 static void __set_spte(u64 *sptep, u64 spte) { @@ -432,7 +403,7 @@ static u64 __update_clear_spte_slow(u64 
*sptep, u64 spte) * The idea using the light way get the spte on x86_32 guest is from * gup_get_pte (mm/gup.c). * - * An spte tlb flush may be pending, because kvm_set_pte_rmapp + * An spte tlb flush may be pending, because kvm_set_pte_rmap * coalesces them and we are running out of the MMU lock. Therefore * we need to protect against in-progress updates of the spte. * @@ -558,11 +529,12 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) * state bits, it is used to clear the last level sptep. * Returns the old PTE. */ -static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) +static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) { kvm_pfn_t pfn; u64 old_spte = *sptep; int level = sptep_to_sp(sptep)->role.level; + struct page *page; if (!is_shadow_present_pte(old_spte) || !spte_has_volatile_bits(old_spte)) @@ -578,11 +550,13 @@ static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) pfn = spte_to_pfn(old_spte); /* - * KVM does not hold the refcount of the page used by - * kvm mmu, before reclaiming the page, we should - * unmap it from mmu first. + * KVM doesn't hold a reference to any pages mapped into the guest, and + * instead uses the mmu_notifier to ensure that KVM unmaps any pages + * before they are reclaimed. Sanity check that, if the pfn is backed + * by a refcounted page, the refcount is elevated. */ - WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn))); + page = kvm_pfn_to_refcounted_page(pfn); + WARN_ON(page && !page_count(page)); if (is_accessed_spte(old_spte)) kvm_set_pfn_accessed(pfn); @@ -682,7 +656,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) if (r) return r; if (maybe_indirect) { - r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache, + r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache, PT64_ROOT_MAX_LEVEL); if (r) return r; @@ -695,48 +669,79 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache); - kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); } -static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) -{ - return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); -} - static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) { kmem_cache_free(pte_list_desc_cache, pte_list_desc); } +static bool sp_has_gptes(struct kvm_mmu_page *sp); + static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) { if (sp->role.passthrough) return sp->gfn; if (!sp->role.direct) - return sp->gfns[index]; + return sp->shadowed_translation[index] >> PAGE_SHIFT; - return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS)); + return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS)); } -static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) +/* + * For leaf SPTEs, fetch the *guest* access permissions being shadowed. Note + * that the SPTE itself may have a more constrained access permissions that + * what the guest enforces. For example, a guest may create an executable + * huge PTE but KVM may disallow execution to mitigate iTLB multihit. 
+ */ +static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index) { - if (sp->role.passthrough) { - WARN_ON_ONCE(gfn != sp->gfn); - return; - } + if (sp_has_gptes(sp)) + return sp->shadowed_translation[index] & ACC_ALL; - if (!sp->role.direct) { - sp->gfns[index] = gfn; + /* + * For direct MMUs (e.g. TDP or non-paging guests) or passthrough SPs, + * KVM is not shadowing any guest page tables, so the "guest access + * permissions" are just ACC_ALL. + * + * For direct SPs in indirect MMUs (shadow paging), i.e. when KVM + * is shadowing a guest huge page with small pages, the guest access + * permissions being shadowed are the access permissions of the huge + * page. + * + * In both cases, sp->role.access contains the correct access bits. + */ + return sp->role.access; +} + +static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index, + gfn_t gfn, unsigned int access) +{ + if (sp_has_gptes(sp)) { + sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access; return; } - if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index))) - pr_err_ratelimited("gfn mismatch under direct page %llx " - "(expected %llx, got %llx)\n", - sp->gfn, - kvm_mmu_page_get_gfn(sp, index), gfn); + WARN_ONCE(access != kvm_mmu_page_get_access(sp, index), + "access mismatch under %s page %llx (expected %u, got %u)\n", + sp->role.passthrough ? "passthrough" : "direct", + sp->gfn, kvm_mmu_page_get_access(sp, index), access); + + WARN_ONCE(gfn != kvm_mmu_page_get_gfn(sp, index), + "gfn mismatch under %s page %llx (expected %llx, got %llx)\n", + sp->role.passthrough ? "passthrough" : "direct", + sp->gfn, kvm_mmu_page_get_gfn(sp, index), gfn); +} + +static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index, + unsigned int access) +{ + gfn_t gfn = kvm_mmu_page_get_gfn(sp, index); + + kvm_mmu_page_set_translation(sp, index, gfn, access); } /* @@ -792,6 +797,9 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) KVM_PAGE_TRACK_WRITE); kvm_mmu_gfn_disallow_lpage(slot, gfn); + + if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K)) + kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); } void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -855,7 +863,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, /* * Returns the number of pointers in the rmap chain, not counting the new one. 
*/ -static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, +static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; @@ -866,7 +874,7 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, rmap_head->val = (unsigned long)spte; } else if (!(rmap_head->val & 1)) { rmap_printk("%p %llx 1->many\n", spte, *spte); - desc = mmu_alloc_pte_list_desc(vcpu); + desc = kvm_mmu_memory_cache_alloc(cache); desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; desc->spte_count = 2; @@ -878,7 +886,7 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, while (desc->spte_count == PTE_LIST_EXT) { count += PTE_LIST_EXT; if (!desc->more) { - desc->more = mmu_alloc_pte_list_desc(vcpu); + desc->more = kvm_mmu_memory_cache_alloc(cache); desc = desc->more; desc->spte_count = 0; break; @@ -913,7 +921,7 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, mmu_free_pte_list_desc(desc); } -static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) +static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; struct pte_list_desc *prev_desc; @@ -949,15 +957,16 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) } } -static void pte_list_remove(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - u64 *sptep) +static void kvm_zap_one_rmap_spte(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, u64 *sptep) { mmu_spte_clear_track_bits(kvm, sptep); - __pte_list_remove(sptep, rmap_head); + pte_list_remove(sptep, rmap_head); } -/* Return true if rmap existed, false otherwise */ -static bool pte_list_destroy(struct kvm *kvm, struct kvm_rmap_head *rmap_head) +/* Return true if at least one SPTE was zapped, false otherwise */ +static bool kvm_zap_all_rmap_sptes(struct kvm *kvm, + struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc, *next; int i; @@ -1030,7 +1039,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) struct kvm_rmap_head *rmap_head; sp = sptep_to_sp(spte); - gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); + gfn = kvm_mmu_page_get_gfn(sp, spte_index(spte)); /* * Unlike rmap_add, rmap_remove does not run in the context of a vCPU @@ -1042,7 +1051,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) slot = __gfn_to_memslot(slots, gfn); rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); - __pte_list_remove(spte, rmap_head); + pte_list_remove(spte, rmap_head); } /* @@ -1129,26 +1138,18 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } - -static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) +static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush) { - if (is_large_pte(*sptep)) { - WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K); - drop_spte(kvm, sptep); - return true; - } + struct kvm_mmu_page *sp; - return false; -} + sp = sptep_to_sp(sptep); + WARN_ON(sp->role.level == PG_LEVEL_4K); -static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) -{ - if (__drop_large_spte(vcpu->kvm, sptep)) { - struct kvm_mmu_page *sp = sptep_to_sp(sptep); + drop_spte(kvm, sptep); - kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, + if (flush) + kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); - } } /* @@ -1383,22 +1384,22 @@ static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn) return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K); } -static bool kvm_zap_rmapp(struct kvm *kvm, struct 
kvm_rmap_head *rmap_head, - const struct kvm_memory_slot *slot) +static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + const struct kvm_memory_slot *slot) { - return pte_list_destroy(kvm, rmap_head); + return kvm_zap_all_rmap_sptes(kvm, rmap_head); } -static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - pte_t unused) +static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + pte_t unused) { - return kvm_zap_rmapp(kvm, rmap_head, slot); + return __kvm_zap_rmap(kvm, rmap_head, slot); } -static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - pte_t pte) +static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + pte_t pte) { u64 *sptep; struct rmap_iterator iter; @@ -1417,7 +1418,7 @@ restart: need_flush = true; if (pte_write(pte)) { - pte_list_remove(kvm, rmap_head, sptep); + kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); goto restart; } else { new_spte = kvm_mmu_changed_pte_notifier_make_spte( @@ -1529,7 +1530,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) bool flush = false; if (kvm_memslots_have_rmaps(kvm)) - flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp); + flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap); if (is_tdp_mmu_enabled(kvm)) flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); @@ -1542,7 +1543,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) bool flush = false; if (kvm_memslots_have_rmaps(kvm)) - flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp); + flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap); if (is_tdp_mmu_enabled(kvm)) flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range); @@ -1550,9 +1551,9 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) return flush; } -static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - pte_t unused) +static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + pte_t unused) { u64 *sptep; struct rmap_iterator iter; @@ -1564,9 +1565,9 @@ static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, return young; } -static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, - int level, pte_t unused) +static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, + int level, pte_t unused) { u64 *sptep; struct rmap_iterator iter; @@ -1579,31 +1580,43 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, #define RMAP_RECYCLE_THRESHOLD 1000 -static void rmap_add(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, - u64 *spte, gfn_t gfn) +static void __rmap_add(struct kvm *kvm, + struct kvm_mmu_memory_cache *cache, + const struct kvm_memory_slot *slot, + u64 *spte, gfn_t gfn, unsigned int access) { struct kvm_mmu_page *sp; struct kvm_rmap_head *rmap_head; int rmap_count; sp = sptep_to_sp(spte); - kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); + kvm_mmu_page_set_translation(sp, spte_index(spte), gfn, access); + kvm_update_page_stats(kvm, sp->role.level, 1); + rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); 
- rmap_count = pte_list_add(vcpu, spte, rmap_head); + rmap_count = pte_list_add(cache, spte, rmap_head); if (rmap_count > RMAP_RECYCLE_THRESHOLD) { - kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0)); + kvm_zap_all_rmap_sptes(kvm, rmap_head); kvm_flush_remote_tlbs_with_address( - vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); + kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); } } +static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot, + u64 *spte, gfn_t gfn, unsigned int access) +{ + struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_pte_list_desc_cache; + + __rmap_add(vcpu->kvm, cache, slot, spte, gfn, access); +} + bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { bool young = false; if (kvm_memslots_have_rmaps(kvm)) - young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp); + young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap); if (is_tdp_mmu_enabled(kvm)) young |= kvm_tdp_mmu_age_gfn_range(kvm, range); @@ -1616,7 +1629,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) bool young = false; if (kvm_memslots_have_rmaps(kvm)) - young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp); + young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap); if (is_tdp_mmu_enabled(kvm)) young |= kvm_tdp_mmu_test_age_gfn(kvm, range); @@ -1652,14 +1665,14 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr) percpu_counter_add(&kvm_total_used_mmu_pages, nr); } -static void kvm_mmu_free_page(struct kvm_mmu_page *sp) +static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp) { MMU_WARN_ON(!is_empty_shadow_page(sp->spt)); hlist_del(&sp->hash_link); list_del(&sp->link); free_page((unsigned long)sp->spt); if (!sp->role.direct) - free_page((unsigned long)sp->gfns); + free_page((unsigned long)sp->shadowed_translation); kmem_cache_free(mmu_page_header_cache, sp); } @@ -1668,19 +1681,19 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn) return hash_64(gfn, KVM_MMU_HASH_SHIFT); } -static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, +static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache, struct kvm_mmu_page *sp, u64 *parent_pte) { if (!parent_pte) return; - pte_list_add(vcpu, parent_pte, &sp->parent_ptes); + pte_list_add(cache, parent_pte, &sp->parent_ptes); } static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, u64 *parent_pte) { - __pte_list_remove(parent_pte, &sp->parent_ptes); + pte_list_remove(parent_pte, &sp->parent_ptes); } static void drop_parent_pte(struct kvm_mmu_page *sp, @@ -1690,27 +1703,6 @@ static void drop_parent_pte(struct kvm_mmu_page *sp, mmu_spte_clear_no_track(parent_pte); } -static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct) -{ - struct kvm_mmu_page *sp; - - sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); - sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); - if (!direct) - sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache); - set_page_private(virt_to_page(sp->spt), (unsigned long)sp); - - /* - * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() - * depends on valid pages being added to the head of the list. See - * comments in kvm_zap_obsolete_pages(). 
- */ - sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; - list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); - kvm_mod_used_mmu_pages(vcpu->kvm, +1); - return sp; -} - static void mark_unsync(u64 *spte); static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) { @@ -1725,11 +1717,9 @@ static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) static void mark_unsync(u64 *spte) { struct kvm_mmu_page *sp; - unsigned int index; sp = sptep_to_sp(spte); - index = spte - sp->spt; - if (__test_and_set_bit(index, sp->unsync_child_bitmap)) + if (__test_and_set_bit(spte_index(spte), sp->unsync_child_bitmap)) return; if (sp->unsync_children++) return; @@ -1789,7 +1779,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, continue; } - child = to_shadow_page(ent & PT64_BASE_ADDR_MASK); + child = to_shadow_page(ent & SPTE_BASE_ADDR_MASK); if (child->unsync_children) { if (mmu_pages_add(pvec, child, i)) @@ -2019,36 +2009,24 @@ static void clear_sp_write_flooding_count(u64 *spte) __clear_sp_write_flooding_count(sptep_to_sp(spte)); } -static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, - gfn_t gfn, - gva_t gaddr, - unsigned level, - int direct, - unsigned int access) +/* + * The vCPU is required when finding indirect shadow pages; the shadow + * page may already exist and syncing it needs the vCPU pointer in + * order to read guest page tables. Direct shadow pages are never + * unsync, thus @vcpu can be NULL if @role.direct is true. + */ +static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm, + struct kvm_vcpu *vcpu, + gfn_t gfn, + struct hlist_head *sp_list, + union kvm_mmu_page_role role) { - bool direct_mmu = vcpu->arch.mmu->root_role.direct; - union kvm_mmu_page_role role; - struct hlist_head *sp_list; - unsigned quadrant; struct kvm_mmu_page *sp; int ret; int collisions = 0; LIST_HEAD(invalid_list); - role = vcpu->arch.mmu->root_role; - role.level = level; - role.direct = direct; - role.access = access; - if (role.has_4_byte_gpte) { - quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); - quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; - role.quadrant = quadrant; - } - if (level <= vcpu->arch.mmu->cpu_role.base.level) - role.passthrough = 0; - - sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; - for_each_valid_sp(vcpu->kvm, sp, sp_list) { + for_each_valid_sp(kvm, sp, sp_list) { if (sp->gfn != gfn) { collisions++; continue; @@ -2064,16 +2042,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, * Unsync pages must not be left as is, because the new * upper-level page will be write-protected. */ - if (level > PG_LEVEL_4K && sp->unsync) - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + if (role.level > PG_LEVEL_4K && sp->unsync) + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); continue; } - if (direct_mmu) - goto trace_get_page; + /* unsync and write-flooding only apply to indirect SPs. */ + if (sp->role.direct) + goto out; if (sp->unsync) { + if (KVM_BUG_ON(!vcpu, kvm)) + break; + /* * The page is good, but is stale. 
kvm_sync_page does * get the latest guest state, but (unlike mmu_unsync_children) @@ -2092,37 +2074,160 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, WARN_ON(!list_empty(&invalid_list)); if (ret > 0) - kvm_flush_remote_tlbs(vcpu->kvm); + kvm_flush_remote_tlbs(kvm); } __clear_sp_write_flooding_count(sp); -trace_get_page: - trace_kvm_mmu_get_page(sp, false); goto out; } - ++vcpu->kvm->stat.mmu_cache_miss; + sp = NULL; + ++kvm->stat.mmu_cache_miss; + +out: + kvm_mmu_commit_zap_page(kvm, &invalid_list); + + if (collisions > kvm->stat.max_mmu_page_hash_collisions) + kvm->stat.max_mmu_page_hash_collisions = collisions; + return sp; +} + +/* Caches used when allocating a new shadow page. */ +struct shadow_page_caches { + struct kvm_mmu_memory_cache *page_header_cache; + struct kvm_mmu_memory_cache *shadow_page_cache; + struct kvm_mmu_memory_cache *shadowed_info_cache; +}; + +static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm, + struct shadow_page_caches *caches, + gfn_t gfn, + struct hlist_head *sp_list, + union kvm_mmu_page_role role) +{ + struct kvm_mmu_page *sp; + + sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache); + sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache); + if (!role.direct) + sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache); + + set_page_private(virt_to_page(sp->spt), (unsigned long)sp); - sp = kvm_mmu_alloc_page(vcpu, direct); + /* + * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() + * depends on valid pages being added to the head of the list. See + * comments in kvm_zap_obsolete_pages(). + */ + sp->mmu_valid_gen = kvm->arch.mmu_valid_gen; + list_add(&sp->link, &kvm->arch.active_mmu_pages); + kvm_mod_used_mmu_pages(kvm, +1); sp->gfn = gfn; sp->role = role; hlist_add_head(&sp->hash_link, sp_list); - if (sp_has_gptes(sp)) { - account_shadowed(vcpu->kvm, sp); - if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn)) - kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1); + if (sp_has_gptes(sp)) + account_shadowed(kvm, sp); + + return sp; +} + +/* Note, @vcpu may be NULL if @role.direct is true; see kvm_mmu_find_shadow_page. 
*/ +static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm, + struct kvm_vcpu *vcpu, + struct shadow_page_caches *caches, + gfn_t gfn, + union kvm_mmu_page_role role) +{ + struct hlist_head *sp_list; + struct kvm_mmu_page *sp; + bool created = false; + + sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; + + sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role); + if (!sp) { + created = true; + sp = kvm_mmu_alloc_shadow_page(kvm, caches, gfn, sp_list, role); } - trace_kvm_mmu_get_page(sp, true); -out: - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions) - vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions; + trace_kvm_mmu_get_page(sp, created); return sp; } +static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu, + gfn_t gfn, + union kvm_mmu_page_role role) +{ + struct shadow_page_caches caches = { + .page_header_cache = &vcpu->arch.mmu_page_header_cache, + .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache, + .shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache, + }; + + return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role); +} + +static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct, + unsigned int access) +{ + struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep); + union kvm_mmu_page_role role; + + role = parent_sp->role; + role.level--; + role.access = access; + role.direct = direct; + role.passthrough = 0; + + /* + * If the guest has 4-byte PTEs then that means it's using 32-bit, + * 2-level, non-PAE paging. KVM shadows such guests with PAE paging + * (i.e. 8-byte PTEs). The difference in PTE size means that KVM must + * shadow each guest page table with multiple shadow page tables, which + * requires extra bookkeeping in the role. + * + * Specifically, to shadow the guest's page directory (which covers a + * 4GiB address space), KVM uses 4 PAE page directories, each mapping + * 1GiB of the address space. @role.quadrant encodes which quarter of + * the address space each maps. + * + * To shadow the guest's page tables (which each map a 4MiB region), KVM + * uses 2 PAE page tables, each mapping a 2MiB region. For these, + * @role.quadrant encodes which half of the region they map. + * + * Concretely, a 4-byte PDE consumes bits 31:22, while an 8-byte PDE + * consumes bits 29:21. To consume bits 31:30, KVM's uses 4 shadow + * PDPTEs; those 4 PAE page directories are pre-allocated and their + * quadrant is assigned in mmu_alloc_root(). A 4-byte PTE consumes + * bits 21:12, while an 8-byte PTE consumes bits 20:12. To consume + * bit 21 in the PTE (the child here), KVM propagates that bit to the + * quadrant, i.e. sets quadrant to '0' or '1'. The parent 8-byte PDE + * covers bit 21 (see above), thus the quadrant is calculated from the + * _least_ significant bit of the PDE index. 
+ */ + if (role.has_4_byte_gpte) { + WARN_ON_ONCE(role.level != PG_LEVEL_4K); + role.quadrant = spte_index(sptep) & 1; + } + + return role; +} + +static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu, + u64 *sptep, gfn_t gfn, + bool direct, unsigned int access) +{ + union kvm_mmu_page_role role; + + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) + return ERR_PTR(-EEXIST); + + role = kvm_mmu_child_role(sptep, direct, access); + return kvm_mmu_get_shadow_page(vcpu, gfn, role); +} + static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator, struct kvm_vcpu *vcpu, hpa_t root, u64 addr) @@ -2145,7 +2250,7 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato iterator->shadow_addr = vcpu->arch.mmu->pae_root[(addr >> 30) & 3]; - iterator->shadow_addr &= PT64_BASE_ADDR_MASK; + iterator->shadow_addr &= SPTE_BASE_ADDR_MASK; --iterator->level; if (!iterator->shadow_addr) iterator->level = 0; @@ -2164,7 +2269,7 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) if (iterator->level < PG_LEVEL_4K) return false; - iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); + iterator->index = SPTE_INDEX(iterator->addr, iterator->level); iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; return true; } @@ -2177,7 +2282,7 @@ static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator, return; } - iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK; + iterator->shadow_addr = spte & SPTE_BASE_ADDR_MASK; --iterator->level; } @@ -2186,23 +2291,38 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) __shadow_walk_next(iterator, *iterator->sptep); } -static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, - struct kvm_mmu_page *sp) +static void __link_shadow_page(struct kvm *kvm, + struct kvm_mmu_memory_cache *cache, u64 *sptep, + struct kvm_mmu_page *sp, bool flush) { u64 spte; BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); + /* + * If an SPTE is present already, it must be a leaf and therefore + * a large one. Drop it, and flush the TLB if needed, before + * installing sp. + */ + if (is_shadow_present_pte(*sptep)) + drop_large_spte(kvm, sptep, flush); + spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp)); mmu_spte_set(sptep, spte); - mmu_page_add_parent_pte(vcpu, sp, sptep); + mmu_page_add_parent_pte(cache, sp, sptep); if (sp->unsync_children || sp->unsync) mark_unsync(sptep); } +static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, + struct kvm_mmu_page *sp) +{ + __link_shadow_page(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, sptep, sp, true); +} + static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned direct_access) { @@ -2216,7 +2336,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, * so we should update the spte at this point to get * a new sp with the correct access. 
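
As an illustrative aside on the kvm_mmu_child_role() comment above: the quadrant is simply the one virtual-address bit that a 10-bit (4-byte GPTE) index can hold but a 9-bit PAE shadow index cannot. A minimal standalone sketch of that arithmetic, with invented helper names; the patch itself derives the same bit from the low bit of the parent PDE's index rather than from the address directly.

/* Illustrative sketch, not part of the commit. */
#include <stdio.h>
#include <stdint.h>

static unsigned int guest_pte_index(uint32_t va)  { return (va >> 12) & 0x3ff; } /* bits 21:12, 10 bits */
static unsigned int shadow_pte_index(uint32_t va) { return (va >> 12) & 0x1ff; } /* bits 20:12,  9 bits */
static unsigned int quadrant(uint32_t va)         { return (va >> 21) & 1; }     /* the leftover bit    */

int main(void)
{
    uint32_t va = 0x00345678;

    /* guest index == quadrant * 512 + shadow index */
    printf("guest index %u -> shadow index %u in quadrant %u\n",
           guest_pte_index(va), shadow_pte_index(va), quadrant(va));
    return 0;
}
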
*/ - child = to_shadow_page(*sptep & PT64_BASE_ADDR_MASK); + child = to_shadow_page(*sptep & SPTE_BASE_ADDR_MASK); if (child->role.access == direct_access) return; @@ -2237,7 +2357,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, if (is_last_spte(pte, sp->role.level)) { drop_spte(kvm, spte); } else { - child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); + child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK); drop_parent_pte(child, spte); /* @@ -2263,7 +2383,7 @@ static int kvm_mmu_page_unlink_children(struct kvm *kvm, int zapped = 0; unsigned i; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + for (i = 0; i < SPTE_ENT_PER_PAGE; ++i) zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list); return zapped; @@ -2396,7 +2516,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, list_for_each_entry_safe(sp, nsp, invalid_list, link) { WARN_ON(!sp->role.invalid || sp->root_count); - kvm_mmu_free_page(sp); + kvm_mmu_free_shadow_page(sp); } } @@ -2676,7 +2796,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, struct kvm_mmu_page *child; u64 pte = *sptep; - child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); + child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK); drop_parent_pte(child, sptep); flush = true; } else if (pfn != spte_to_pfn(*sptep)) { @@ -2711,8 +2831,10 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, if (!was_rmapped) { WARN_ON_ONCE(ret == RET_PF_SPURIOUS); - kvm_update_page_stats(vcpu->kvm, level, 1); - rmap_add(vcpu, slot, sptep, gfn); + rmap_add(vcpu, slot, sptep, gfn, pte_access); + } else { + /* Already rmapped but the pte_access bits may have changed. */ + kvm_mmu_page_set_access(sp, spte_index(sptep), pte_access); } return ret; @@ -2728,7 +2850,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, int i, ret; gfn_t gfn; - gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); + gfn = kvm_mmu_page_get_gfn(sp, spte_index(start)); slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK); if (!slot) return -1; @@ -2754,7 +2876,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, WARN_ON(!sp->role.direct); - i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); + i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1); spte = sp->spt + i; for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { @@ -2798,20 +2920,42 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } -static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, +/* + * Lookup the mapping level for @gfn in the current mm. + * + * WARNING! Use of host_pfn_mapping_level() requires the caller and the end + * consumer to be tied into KVM's handlers for MMU notifier events! + * + * There are several ways to safely use this helper: + * + * - Check mmu_notifier_retry_hva() after grabbing the mapping level, before + * consuming it. In this case, mmu_lock doesn't need to be held during the + * lookup, but it does need to be held while checking the MMU notifier. + * + * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation + * event for the hva. This can be done by explicit checking the MMU notifier + * or by ensuring that KVM already has a valid mapping that covers the hva. + * + * - Do not use the result to install new mappings, e.g. use the host mapping + * level only to decide whether or not to zap an entry. 
In this case, it's + * not required to hold mmu_lock (though it's highly likely the caller will + * want to hold mmu_lock anyways, e.g. to modify SPTEs). + * + * Note! The lookup can still race with modifications to host page tables, but + * the above "rules" ensure KVM will not _consume_ the result of the walk if a + * race with the primary MMU occurs. + */ +static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, const struct kvm_memory_slot *slot) { + int level = PG_LEVEL_4K; unsigned long hva; unsigned long flags; - int level = PG_LEVEL_4K; pgd_t pgd; p4d_t p4d; pud_t pud; pmd_t pmd; - if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn)) - return PG_LEVEL_4K; - /* * Note, using the already-retrieved memslot and __gfn_to_hva_memslot() * is not solely for performance, it's also necessary to avoid the @@ -2823,16 +2967,19 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, hva = __gfn_to_hva_memslot(slot, gfn); /* - * Lookup the mapping level in the current mm. The information - * may become stale soon, but it is safe to use as long as - * 1) mmu_notifier_retry was checked after taking mmu_lock, and - * 2) mmu_lock is taken now. - * - * We still need to disable IRQs to prevent concurrent tear down - * of page tables. + * Disable IRQs to prevent concurrent tear down of host page tables, + * e.g. if the primary MMU promotes a P*D to a huge page and then frees + * the original page table. */ local_irq_save(flags); + /* + * Read each entry once. As above, a non-leaf entry can be promoted to + * a huge page _during_ this walk. Re-reading the entry could send the + * walk into the weeks, e.g. p*d_large() returns false (sees the old + * value) and then p*d_offset() walks into the target huge page instead + * of the old page table (sees the new value). + */ pgd = READ_ONCE(*pgd_offset(kvm->mm, hva)); if (pgd_none(pgd)) goto out; @@ -2864,7 +3011,7 @@ out: int kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, - kvm_pfn_t pfn, int max_level) + int max_level) { struct kvm_lpage_info *linfo; int host_level; @@ -2879,7 +3026,7 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; - host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot); + host_level = host_pfn_mapping_level(kvm, gfn, slot); return min(host_level, max_level); } @@ -2893,7 +3040,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (unlikely(fault->max_level == PG_LEVEL_4K)) return; - if (is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn)) + if (is_error_noslot_pfn(fault->pfn)) return; if (kvm_slot_dirty_track_enabled(slot)) @@ -2904,8 +3051,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * level, which will be used to do precise, accurate accounting. 
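
The "read each entry once" comment in host_pfn_mapping_level() above boils down to snapshotting a racy page-table entry into a local and making every decision from that snapshot. A minimal stand-in, assuming a GCC-style volatile READ_ONCE; none of these names are from the patch and the real helper is considerably more involved.

/* Illustrative sketch, not part of the commit. */
#include <stdio.h>
#include <stdint.h>

#define READ_ONCE(x) (*(const volatile __typeof__(x) *)&(x))

static uint64_t pmd_entry;                                   /* may change under us */
static int is_huge(uint64_t e) { return e & (1ull << 7); }   /* PSE-style size bit  */

int main(void)
{
    pmd_entry = (1ull << 7) | 0x200000;   /* pretend the primary MMU installed a 2MiB leaf */

    /*
     * Snapshot once; both the "is it a leaf?" test and the later use see the
     * same value, so a concurrent promotion/teardown cannot send the walk into
     * a table that no longer matches the entry that was tested.
     */
    uint64_t e = READ_ONCE(pmd_entry);
    if (is_huge(e))
        printf("2MiB leaf, base 0x%llx\n", (unsigned long long)(e & ~0x1fffffull));
    else
        printf("not a leaf, would descend\n");
    return 0;
}
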
*/ fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, - fault->gfn, fault->pfn, - fault->max_level); + fault->gfn, fault->max_level); if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) return; @@ -2961,13 +3107,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (it.level == fault->goal_level) break; - drop_large_spte(vcpu, it.sptep); - if (is_shadow_present_pte(*it.sptep)) + sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL); + if (sp == ERR_PTR(-EEXIST)) continue; - sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, - it.level - 1, true, ACC_ALL); - link_shadow_page(vcpu, it.sptep, sp); if (fault->is_tdp && fault->huge_page_disallowed && fault->req_level >= it.level) @@ -3095,7 +3238,7 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, * * Compare with set_spte where instead shadow_dirty_mask is set. */ - if (cmpxchg64(sptep, old_spte, new_spte) != old_spte) + if (!try_cmpxchg64(sptep, &old_spte, new_spte)) return false; if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) @@ -3265,7 +3408,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, if (!VALID_PAGE(*root_hpa)) return; - sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK); + sp = to_shadow_page(*root_hpa & SPTE_BASE_ADDR_MASK); if (WARN_ON(!sp)) return; @@ -3369,12 +3512,19 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) return ret; } -static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva, - u8 level, bool direct) +static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant, + u8 level) { + union kvm_mmu_page_role role = vcpu->arch.mmu->root_role; struct kvm_mmu_page *sp; - sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL); + role.level = level; + role.quadrant = quadrant; + + WARN_ON_ONCE(quadrant && !role.has_4_byte_gpte); + WARN_ON_ONCE(role.direct && role.has_4_byte_gpte); + + sp = kvm_mmu_get_shadow_page(vcpu, gfn, role); ++sp->root_count; return __pa(sp->spt); @@ -3397,7 +3547,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); mmu->root.hpa = root; } else if (shadow_root_level >= PT64_ROOT_4LEVEL) { - root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true); + root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level); mmu->root.hpa = root; } else if (shadow_root_level == PT32E_ROOT_LEVEL) { if (WARN_ON_ONCE(!mmu->pae_root)) { @@ -3408,8 +3558,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); - root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), - i << 30, PT32_ROOT_LEVEL, true); + root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), 0, + PT32_ROOT_LEVEL); mmu->pae_root[i] = root | PT_PRESENT_MASK | shadow_me_value; } @@ -3493,9 +3643,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) struct kvm_mmu *mmu = vcpu->arch.mmu; u64 pdptrs[4], pm_mask; gfn_t root_gfn, root_pgd; + int quadrant, i, r; hpa_t root; - unsigned i; - int r; root_pgd = mmu->get_guest_pgd(vcpu); root_gfn = root_pgd >> PAGE_SHIFT; @@ -3533,7 +3682,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) */ if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) { root = mmu_alloc_root(vcpu, root_gfn, 0, - mmu->root_role.level, false); + mmu->root_role.level); mmu->root.hpa = root; goto set_root_pgd; } @@ -3578,8 +3727,15 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) root_gfn = pdptrs[i] >> PAGE_SHIFT; } - 
root = mmu_alloc_root(vcpu, root_gfn, i << 30, - PT32_ROOT_LEVEL, false); + /* + * If shadowing 32-bit non-PAE page tables, each PAE page + * directory maps one quarter of the guest's non-PAE page + * directory. Othwerise each PAE page direct shadows one guest + * PAE page directory so that quadrant should be 0. + */ + quadrant = (mmu->cpu_role.base.level == PT32_ROOT_LEVEL) ? i : 0; + + root = mmu_alloc_root(vcpu, root_gfn, quadrant, PT32_ROOT_LEVEL); mmu->pae_root[i] = root | pm_mask; } @@ -3737,7 +3893,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) hpa_t root = vcpu->arch.mmu->pae_root[i]; if (IS_VALID_PAE_ROOT(root)) { - root &= PT64_BASE_ADDR_MASK; + root &= SPTE_BASE_ADDR_MASK; sp = to_shadow_page(root); mmu_sync_children(vcpu, sp, true); } @@ -4155,14 +4311,26 @@ EXPORT_SYMBOL_GPL(kvm_handle_page_fault); int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - while (fault->max_level > PG_LEVEL_4K) { - int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); - gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1); - - if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) - break; + /* + * If the guest's MTRRs may be used to compute the "real" memtype, + * restrict the mapping level to ensure KVM uses a consistent memtype + * across the entire mapping. If the host MTRRs are ignored by TDP + * (shadow_memtype_mask is non-zero), and the VM has non-coherent DMA + * (DMA doesn't snoop CPU caches), KVM's ABI is to honor the memtype + * from the guest's MTRRs so that guest accesses to memory that is + * DMA'd aren't cached against the guest's wishes. + * + * Note, KVM may still ultimately ignore guest MTRRs for certain PFNs, + * e.g. KVM will force UC memtype for host MMIO. + */ + if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) { + for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) { + int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); + gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1); - --fault->max_level; + if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) + break; + } } return direct_page_fault(vcpu, fault); @@ -4567,7 +4735,7 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context) if (boot_cpu_is_amd()) __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), - context->root_role.level, false, + context->root_role.level, true, boot_cpu_has(X86_FEATURE_GBPAGES), false, true); else @@ -5199,11 +5367,11 @@ static bool need_remote_flush(u64 old, u64 new) return false; if (!is_shadow_present_pte(new)) return true; - if ((old ^ new) & PT64_BASE_ADDR_MASK) + if ((old ^ new) & SPTE_BASE_ADDR_MASK) return true; old ^= shadow_nx_mask; new ^= shadow_nx_mask; - return (old & ~new & PT64_PERM_MASK) != 0; + return (old & ~new & SPTE_PERM_MASK) != 0; } static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, @@ -5328,13 +5496,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); - /* - * No need to care whether allocation memory is successful - * or not since pte prefetch is skipped if it does not have - * enough objects in the cache. 
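
For the MTRR-consistency loop added to kvm_tdp_page_fault() above, the only arithmetic is computing the naturally aligned gfn range that a mapping at each level would cover. A standalone rendering of just that math, assuming x86's 9-bits-per-level layout for the hugepage macro; the names here are simplified stand-ins.

/* Illustrative sketch, not part of the commit. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PAGES_PER_HPAGE(level) (1ull << (((level) - 1) * 9))   /* 1, 512, 262144 */

int main(void)
{
    uint64_t gpa = 0x12345678000ull;    /* arbitrary faulting guest physical address */

    for (int level = 3; level > 1; level--) {
        uint64_t page_num = PAGES_PER_HPAGE(level);
        uint64_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1);

        printf("level %d: gfns [0x%llx, 0x%llx) must share one consistent memtype\n",
               level, (unsigned long long)base,
               (unsigned long long)(base + page_num));
    }
    return 0;
}
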
- */ - mmu_topup_memory_caches(vcpu, true); - write_lock(&vcpu->kvm->mmu_lock); gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes); @@ -5819,9 +5980,25 @@ int kvm_mmu_init_vm(struct kvm *kvm) node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; kvm_page_track_register_notifier(kvm, node); + + kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache; + kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO; + + kvm->arch.split_shadow_page_cache.gfp_zero = __GFP_ZERO; + + kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache; + kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO; + return 0; } +static void mmu_free_vm_memory_caches(struct kvm *kvm) +{ + kvm_mmu_free_memory_cache(&kvm->arch.split_desc_cache); + kvm_mmu_free_memory_cache(&kvm->arch.split_page_header_cache); + kvm_mmu_free_memory_cache(&kvm->arch.split_shadow_page_cache); +} + void kvm_mmu_uninit_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; @@ -5829,9 +6006,11 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) kvm_page_track_unregister_notifier(kvm, node); kvm_mmu_uninit_tdp_mmu(kvm); + + mmu_free_vm_memory_caches(kvm); } -static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) +static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { const struct kvm_memory_slot *memslot; struct kvm_memslots *slots; @@ -5853,8 +6032,7 @@ static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) if (WARN_ON_ONCE(start >= end)) continue; - flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, - + flush = slot_handle_level_range(kvm, memslot, __kvm_zap_rmap, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, start, end - 1, true, flush); } @@ -5879,7 +6057,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) kvm_inc_notifier_count(kvm, gfn_start, gfn_end); - flush = __kvm_zap_rmaps(kvm, gfn_start, gfn_end); + flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end); if (is_tdp_mmu_enabled(kvm)) { for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) @@ -5950,15 +6128,249 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); } +static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min) +{ + return kvm_mmu_memory_cache_nr_free_objects(cache) < min; +} + +static bool need_topup_split_caches_or_resched(struct kvm *kvm) +{ + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) + return true; + + /* + * In the worst case, SPLIT_DESC_CACHE_MIN_NR_OBJECTS descriptors are needed + * to split a single huge page. Calculating how many are actually needed + * is possible but not worth the complexity. + */ + return need_topup(&kvm->arch.split_desc_cache, SPLIT_DESC_CACHE_MIN_NR_OBJECTS) || + need_topup(&kvm->arch.split_page_header_cache, 1) || + need_topup(&kvm->arch.split_shadow_page_cache, 1); +} + +static int topup_split_caches(struct kvm *kvm) +{ + /* + * Allocating rmap list entries when splitting huge pages for nested + * MMUs is uncommon as KVM needs to use a list if and only if there is + * more than one rmap entry for a gfn, i.e. requires an L1 gfn to be + * aliased by multiple L2 gfns and/or from multiple nested roots with + * different roles. Aliasing gfns when using TDP is atypical for VMMs; + * a few gfns are often aliased during boot, e.g. when remapping BIOS, + * but aliasing rarely occurs post-boot or for many gfns. 
If there is + * only one rmap entry, rmap->val points directly at that one entry and + * doesn't need to allocate a list. Buffer the cache by the default + * capacity so that KVM doesn't have to drop mmu_lock to topup if KVM + * encounters an aliased gfn or two. + */ + const int capacity = SPLIT_DESC_CACHE_MIN_NR_OBJECTS + + KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE; + int r; + + lockdep_assert_held(&kvm->slots_lock); + + r = __kvm_mmu_topup_memory_cache(&kvm->arch.split_desc_cache, capacity, + SPLIT_DESC_CACHE_MIN_NR_OBJECTS); + if (r) + return r; + + r = kvm_mmu_topup_memory_cache(&kvm->arch.split_page_header_cache, 1); + if (r) + return r; + + return kvm_mmu_topup_memory_cache(&kvm->arch.split_shadow_page_cache, 1); +} + +static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *huge_sptep) +{ + struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep); + struct shadow_page_caches caches = {}; + union kvm_mmu_page_role role; + unsigned int access; + gfn_t gfn; + + gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep)); + access = kvm_mmu_page_get_access(huge_sp, spte_index(huge_sptep)); + + /* + * Note, huge page splitting always uses direct shadow pages, regardless + * of whether the huge page itself is mapped by a direct or indirect + * shadow page, since the huge page region itself is being directly + * mapped with smaller pages. + */ + role = kvm_mmu_child_role(huge_sptep, /*direct=*/true, access); + + /* Direct SPs do not require a shadowed_info_cache. */ + caches.page_header_cache = &kvm->arch.split_page_header_cache; + caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache; + + /* Safe to pass NULL for vCPU since requesting a direct SP. */ + return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role); +} + +static void shadow_mmu_split_huge_page(struct kvm *kvm, + const struct kvm_memory_slot *slot, + u64 *huge_sptep) + +{ + struct kvm_mmu_memory_cache *cache = &kvm->arch.split_desc_cache; + u64 huge_spte = READ_ONCE(*huge_sptep); + struct kvm_mmu_page *sp; + bool flush = false; + u64 *sptep, spte; + gfn_t gfn; + int index; + + sp = shadow_mmu_get_sp_for_split(kvm, huge_sptep); + + for (index = 0; index < SPTE_ENT_PER_PAGE; index++) { + sptep = &sp->spt[index]; + gfn = kvm_mmu_page_get_gfn(sp, index); + + /* + * The SP may already have populated SPTEs, e.g. if this huge + * page is aliased by multiple sptes with the same access + * permissions. These entries are guaranteed to map the same + * gfn-to-pfn translation since the SP is direct, so no need to + * modify them. + * + * However, if a given SPTE points to a lower level page table, + * that lower level page table may only be partially populated. + * Installing such SPTEs would effectively unmap a potion of the + * huge page. Unmapping guest memory always requires a TLB flush + * since a subsequent operation on the unmapped regions would + * fail to detect the need to flush. + */ + if (is_shadow_present_pte(*sptep)) { + flush |= !is_last_spte(*sptep, sp->role.level); + continue; + } + + spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index); + mmu_spte_set(sptep, spte); + __rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access); + } + + __link_shadow_page(kvm, cache, huge_sptep, sp, flush); +} + +static int shadow_mmu_try_split_huge_page(struct kvm *kvm, + const struct kvm_memory_slot *slot, + u64 *huge_sptep) +{ + struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep); + int level, r = 0; + gfn_t gfn; + u64 spte; + + /* Grab information for the tracepoint before dropping the MMU lock. 
*/ + gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep)); + level = huge_sp->role.level; + spte = *huge_sptep; + + if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES) { + r = -ENOSPC; + goto out; + } + + if (need_topup_split_caches_or_resched(kvm)) { + write_unlock(&kvm->mmu_lock); + cond_resched(); + /* + * If the topup succeeds, return -EAGAIN to indicate that the + * rmap iterator should be restarted because the MMU lock was + * dropped. + */ + r = topup_split_caches(kvm) ?: -EAGAIN; + write_lock(&kvm->mmu_lock); + goto out; + } + + shadow_mmu_split_huge_page(kvm, slot, huge_sptep); + +out: + trace_kvm_mmu_split_huge_page(gfn, spte, level, r); + return r; +} + +static bool shadow_mmu_try_split_huge_pages(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, + const struct kvm_memory_slot *slot) +{ + struct rmap_iterator iter; + struct kvm_mmu_page *sp; + u64 *huge_sptep; + int r; + +restart: + for_each_rmap_spte(rmap_head, &iter, huge_sptep) { + sp = sptep_to_sp(huge_sptep); + + /* TDP MMU is enabled, so rmap only contains nested MMU SPs. */ + if (WARN_ON_ONCE(!sp->role.guest_mode)) + continue; + + /* The rmaps should never contain non-leaf SPTEs. */ + if (WARN_ON_ONCE(!is_large_pte(*huge_sptep))) + continue; + + /* SPs with level >PG_LEVEL_4K should never by unsync. */ + if (WARN_ON_ONCE(sp->unsync)) + continue; + + /* Don't bother splitting huge pages on invalid SPs. */ + if (sp->role.invalid) + continue; + + r = shadow_mmu_try_split_huge_page(kvm, slot, huge_sptep); + + /* + * The split succeeded or needs to be retried because the MMU + * lock was dropped. Either way, restart the iterator to get it + * back into a consistent state. + */ + if (!r || r == -EAGAIN) + goto restart; + + /* The split failed and shouldn't be retried (e.g. -ENOMEM). */ + break; + } + + return false; +} + +static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot, + gfn_t start, gfn_t end, + int target_level) +{ + int level; + + /* + * Split huge pages starting with KVM_MAX_HUGEPAGE_LEVEL and working + * down to the target level. This ensures pages are recursively split + * all the way to the target level. There's no need to split pages + * already at the target level. + */ + for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) { + slot_handle_level_range(kvm, slot, shadow_mmu_try_split_huge_pages, + level, level, start, end - 1, true, false); + } +} + /* Must be called with the mmu_lock held in write-mode. 
*/ void kvm_mmu_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *memslot, u64 start, u64 end, int target_level) { - if (is_tdp_mmu_enabled(kvm)) - kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, - target_level, false); + if (!is_tdp_mmu_enabled(kvm)) + return; + + if (kvm_memslots_have_rmaps(kvm)) + kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level); + + kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, false); /* * A TLB flush is unnecessary at this point for the same resons as in @@ -5973,12 +6385,19 @@ void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm, u64 start = memslot->base_gfn; u64 end = start + memslot->npages; - if (is_tdp_mmu_enabled(kvm)) { - read_lock(&kvm->mmu_lock); - kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true); - read_unlock(&kvm->mmu_lock); + if (!is_tdp_mmu_enabled(kvm)) + return; + + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level); + write_unlock(&kvm->mmu_lock); } + read_lock(&kvm->mmu_lock); + kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true); + read_unlock(&kvm->mmu_lock); + /* * No TLB flush is necessary here. KVM will flush TLBs after * write-protecting and/or clearing dirty on the newly split SPTEs to @@ -6012,10 +6431,10 @@ restart: * the guest, and the guest page table is using 4K page size * mapping if the indirect sp has level = 1. */ - if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && + if (sp->role.direct && sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, - pfn, PG_LEVEL_NUM)) { - pte_list_remove(kvm, rmap_head, sptep); + PG_LEVEL_NUM)) { + kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, @@ -6030,18 +6449,24 @@ restart: return need_tlb_flush; } +static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot) +{ + /* + * Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap + * pages that are already mapped at the maximum hugepage level. + */ + if (slot_handle_level(kvm, slot, kvm_mmu_zap_collapsible_spte, + PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true)) + kvm_arch_flush_remote_tlbs_memslot(kvm, slot); +} + void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) { if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); - /* - * Zap only 4k SPTEs since the legacy MMU only supports dirty - * logging at a 4k granularity and never creates collapsible - * 2m SPTEs during dirty logging. - */ - if (slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true)) - kvm_arch_flush_remote_tlbs_memslot(kvm, slot); + kvm_rmap_zap_collapsible_sptes(kvm, slot); write_unlock(&kvm->mmu_lock); } diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index bd2a26897b97..582def531d4d 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -20,6 +20,20 @@ extern bool dbg; #define MMU_WARN_ON(x) do { } while (0) #endif +/* Page table builder macros common to shadow (host) PTEs and guest PTEs. 
*/ +#define __PT_LEVEL_SHIFT(level, bits_per_level) \ + (PAGE_SHIFT + ((level) - 1) * (bits_per_level)) +#define __PT_INDEX(address, level, bits_per_level) \ + (((address) >> __PT_LEVEL_SHIFT(level, bits_per_level)) & ((1 << (bits_per_level)) - 1)) + +#define __PT_LVL_ADDR_MASK(base_addr_mask, level, bits_per_level) \ + ((base_addr_mask) & ~((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1)) + +#define __PT_LVL_OFFSET_MASK(base_addr_mask, level, bits_per_level) \ + ((base_addr_mask) & ((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1)) + +#define __PT_ENT_PER_PAGE(bits_per_level) (1 << (bits_per_level)) + /* * Unlike regular MMU roots, PAE "roots", a.k.a. PDPTEs/PDPTRs, have a PRESENT * bit, and thus are guaranteed to be non-zero when valid. And, when a guest @@ -53,8 +67,21 @@ struct kvm_mmu_page { gfn_t gfn; u64 *spt; - /* hold the gfn of each spte inside spt */ - gfn_t *gfns; + + /* + * Stores the result of the guest translation being shadowed by each + * SPTE. KVM shadows two types of guest translations: nGPA -> GPA + * (shadow EPT/NPT) and GVA -> GPA (traditional shadow paging). In both + * cases the result of the translation is a GPA and a set of access + * constraints. + * + * The GFN is stored in the upper bits (PAGE_SHIFT) and the shadowed + * access permissions are stored in the lower bits. Note, for + * convenience and uniformity across guests, the access permissions are + * stored in KVM format (e.g. ACC_EXEC_MASK) not the raw guest format. + */ + u64 *shadowed_translation; + /* Currently serving as active root */ union { int root_count; @@ -141,9 +168,9 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, unsigned int pte_list_count(struct kvm_rmap_head *rmap_head); extern int nx_huge_pages; -static inline bool is_nx_huge_page_enabled(void) +static inline bool is_nx_huge_page_enabled(struct kvm *kvm) { - return READ_ONCE(nx_huge_pages); + return READ_ONCE(nx_huge_pages) && !kvm->arch.disable_nx_huge_pages; } struct kvm_page_fault { @@ -242,7 +269,8 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, .user = err & PFERR_USER_MASK, .prefetch = prefetch, .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault), - .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(), + .nx_huge_page_workaround_enabled = + is_nx_huge_page_enabled(vcpu->kvm), .max_level = KVM_MAX_HUGEPAGE_LEVEL, .req_level = PG_LEVEL_4K, @@ -281,7 +309,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, - kvm_pfn_t pfn, int max_level); + int max_level); void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); diff --git a/arch/x86/kvm/mmu/paging.h b/arch/x86/kvm/mmu/paging.h deleted file mode 100644 index de8ab323bb70..000000000000 --- a/arch/x86/kvm/mmu/paging.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* Shadow paging constants/helpers that don't need to be #undef'd. 
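
The new __PT_* builder macros above parameterize the usual shift-and-mask page-table math by bits-per-level, so one definition serves 64-bit/PAE/EPT entries (9 bits) and 4-byte entries (10 bits). A small standalone check of what the index macro yields; the addresses are arbitrary examples.

/* Illustrative sketch, not part of the commit. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define __PT_LEVEL_SHIFT(level, bits_per_level) \
    (PAGE_SHIFT + ((level) - 1) * (bits_per_level))
#define __PT_INDEX(address, level, bits_per_level) \
    (((address) >> __PT_LEVEL_SHIFT(level, bits_per_level)) & ((1 << (bits_per_level)) - 1))

int main(void)
{
    uint64_t addr64 = 0x00007f1234567000ull;   /* 8-byte PTEs: 9 bits per level  */
    uint32_t addr32 = 0x00345678;              /* 4-byte PTEs: 10 bits per level */
    int level;

    for (level = 1; level <= 4; level++)
        printf("9-bit  level %d index = %3u\n", level,
               (unsigned int)__PT_INDEX(addr64, level, 9));
    for (level = 1; level <= 2; level++)
        printf("10-bit level %d index = %4u\n", level,
               (unsigned int)__PT_INDEX(addr32, level, 10));
    return 0;
}
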
*/ -#ifndef __KVM_X86_PAGING_H -#define __KVM_X86_PAGING_H - -#define GUEST_PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) -#define PT64_LVL_ADDR_MASK(level) \ - (GUEST_PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) -#define PT64_LVL_OFFSET_MASK(level) \ - (GUEST_PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) -#endif /* __KVM_X86_PAGING_H */ - diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index db80f7ccaa4e..f5958071220c 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -16,25 +16,21 @@ */ /* - * We need the mmu code to access both 32-bit and 64-bit guest ptes, - * so the code in this file is compiled twice, once per pte size. + * The MMU needs to be able to access/walk 32-bit and 64-bit guest page tables, + * as well as guest EPT tables, so the code in this file is compiled thrice, + * once per guest PTE type. The per-type defines are #undef'd at the end. */ #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 #define FNAME(name) paging##64_##name - #define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK - #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) - #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) - #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_LEVEL_BITS 9 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define PT_HAVE_ACCESSED_DIRTY(mmu) true #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL - #define CMPXCHG "cmpxchgq" #else #define PT_MAX_FULL_LEVELS 2 #endif @@ -42,36 +38,35 @@ #define pt_element_t u32 #define guest_walker guest_walker32 #define FNAME(name) paging##32_##name - #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK - #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) - #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) - #define PT_INDEX(addr, level) PT32_INDEX(addr, level) - #define PT_LEVEL_BITS PT32_LEVEL_BITS + #define PT_LEVEL_BITS 10 #define PT_MAX_FULL_LEVELS 2 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define PT_HAVE_ACCESSED_DIRTY(mmu) true - #define CMPXCHG "cmpxchgl" + + #define PT32_DIR_PSE36_SIZE 4 + #define PT32_DIR_PSE36_SHIFT 13 + #define PT32_DIR_PSE36_MASK \ + (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) #elif PTTYPE == PTTYPE_EPT #define pt_element_t u64 #define guest_walker guest_walkerEPT #define FNAME(name) ept_##name - #define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK - #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) - #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) - #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_LEVEL_BITS 9 #define PT_GUEST_DIRTY_SHIFT 9 #define PT_GUEST_ACCESSED_SHIFT 8 #define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_role.base.ad_disabled) - #ifdef CONFIG_X86_64 - #define CMPXCHG "cmpxchgq" - #endif #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL #else #error Invalid PTTYPE value #endif +/* Common logic, but per-type values. These also need to be undefined. 
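
For the PSE-36 constants that move into the PTTYPE == 32 block above: bits 16:13 of a 4MiB PDE supply physical-address bits 35:32, and the pse36_gfn_delta() helper added just below turns them into a gfn offset by shifting them up by 32 - 13 - 12 = 7. A standalone check of that arithmetic; the sample PDE value is made up.

/* Illustrative sketch, not part of the commit. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT           12
#define PT32_DIR_PSE36_SIZE  4
#define PT32_DIR_PSE36_SHIFT 13
#define PT32_DIR_PSE36_MASK  (((1ull << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)

static uint64_t pse36_gfn_delta(uint32_t gpte)
{
    int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;      /* = 7 */

    return (uint64_t)(gpte & PT32_DIR_PSE36_MASK) << shift;
}

int main(void)
{
    /* 4MiB PDE (PS bit set) encoding physical bits 35:32 = 0x5 in PDE bits 16:13. */
    uint32_t gpte = (0x5u << PT32_DIR_PSE36_SHIFT) | (1u << 7);

    printf("gfn delta 0x%llx => physical base 0x%llx\n",
           (unsigned long long)pse36_gfn_delta(gpte),
           (unsigned long long)(pse36_gfn_delta(gpte) << PAGE_SHIFT));
    return 0;
}
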
*/ +#define PT_BASE_ADDR_MASK ((pt_element_t)(((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))) +#define PT_LVL_ADDR_MASK(lvl) __PT_LVL_ADDR_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS) +#define PT_LVL_OFFSET_MASK(lvl) __PT_LVL_OFFSET_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS) +#define PT_INDEX(addr, lvl) __PT_INDEX(addr, lvl, PT_LEVEL_BITS) + #define PT_GUEST_DIRTY_MASK (1 << PT_GUEST_DIRTY_SHIFT) #define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT) @@ -97,6 +92,15 @@ struct guest_walker { struct x86_exception fault; }; +#if PTTYPE == 32 +static inline gfn_t pse36_gfn_delta(u32 gpte) +{ + int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; + + return (gpte & PT32_DIR_PSE36_MASK) << shift; +} +#endif + static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) { return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; @@ -374,7 +378,7 @@ retry_walk: * information to fix the exit_qualification or exit_info_1 * fields. */ - if (unlikely(real_gpa == UNMAPPED_GVA)) + if (unlikely(real_gpa == INVALID_GPA)) return 0; host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gpa_to_gfn(real_gpa), @@ -421,11 +425,13 @@ retry_walk: gfn = gpte_to_gfn_lvl(pte, walker->level); gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT; - if (PTTYPE == 32 && walker->level > PG_LEVEL_4K && is_cpuid_PSE36()) +#if PTTYPE == 32 + if (walker->level > PG_LEVEL_4K && is_cpuid_PSE36()) gfn += pse36_gfn_delta(pte); +#endif real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault); - if (real_gpa == UNMAPPED_GVA) + if (real_gpa == INVALID_GPA) return 0; walker->gfn = real_gpa >> PAGE_SHIFT; @@ -589,7 +595,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, if (sp->role.direct) return __direct_pte_prefetch(vcpu, sp, sptep); - i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); + i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1); spte = sp->spt + i; for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { @@ -642,14 +648,13 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, gfn_t table_gfn; clear_sp_write_flooding_count(it.sptep); - drop_large_spte(vcpu, it.sptep); - - sp = NULL; - if (!is_shadow_present_pte(*it.sptep)) { - table_gfn = gw->table_gfn[it.level - 2]; - access = gw->pt_access[it.level - 2]; - sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr, - it.level-1, false, access); + + table_gfn = gw->table_gfn[it.level - 2]; + access = gw->pt_access[it.level - 2]; + sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn, + false, access); + + if (sp != ERR_PTR(-EEXIST)) { /* * We must synchronize the pagetable before linking it * because the guest doesn't need to flush tlb when @@ -678,7 +683,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) goto out_gpte_changed; - if (sp) + if (sp != ERR_PTR(-EEXIST)) link_shadow_page(vcpu, it.sptep, sp); } @@ -702,16 +707,15 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, validate_direct_spte(vcpu, it.sptep, direct_access); - drop_large_spte(vcpu, it.sptep); + sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, + true, direct_access); + if (sp == ERR_PTR(-EEXIST)) + continue; - if (!is_shadow_present_pte(*it.sptep)) { - sp = kvm_mmu_get_page(vcpu, base_gfn, fault->addr, - it.level - 1, true, direct_access); - link_shadow_page(vcpu, it.sptep, sp); - if (fault->huge_page_disallowed && - fault->req_level >= it.level) - account_huge_nx_page(vcpu->kvm, sp); - } + link_shadow_page(vcpu, it.sptep, sp); + if 
(fault->huge_page_disallowed && + fault->req_level >= it.level) + account_huge_nx_page(vcpu->kvm, sp); } if (WARN_ON_ONCE(it.level != fault->goal_level)) @@ -888,7 +892,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) WARN_ON(sp->role.level != PG_LEVEL_4K); if (PTTYPE == 32) - offset = sp->role.quadrant << PT64_LEVEL_BITS; + offset = sp->role.quadrant << SPTE_LEVEL_BITS; return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); } @@ -929,7 +933,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) break; pte_gpa = FNAME(get_level1_sp_gpa)(sp); - pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); + pte_gpa += spte_index(sptep) * sizeof(pt_element_t); mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL); if (is_shadow_present_pte(old_spte)) @@ -958,7 +962,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, struct x86_exception *exception) { struct guest_walker walker; - gpa_t gpa = UNMAPPED_GVA; + gpa_t gpa = INVALID_GPA; int r; #ifndef CONFIG_X86_64 @@ -978,7 +982,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } /* - * Using the cached information from sp->gfns is safe because: + * Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()) is + * safe because: * - The spte has a reference to the struct page, so the pfn for a given gfn * can't change unless all sptes pointing to it are nuked first. * @@ -1023,7 +1028,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); - for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { u64 *sptep, spte; struct kvm_memory_slot *slot; unsigned pte_access; @@ -1053,12 +1058,23 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access)) continue; - if (gfn != sp->gfns[i]) { + /* + * Drop the SPTE if the new protections would result in a RWX=0 + * SPTE or if the gfn is changing. The RWX=0 case only affects + * EPT with execute-only support, i.e. EPT without an effective + * "present" bit, as all other paging modes will create a + * read-only SPTE if pte_access is zero. + */ + if ((!pte_access && !shadow_present_mask) || + gfn != kvm_mmu_page_get_gfn(sp, i)) { drop_spte(vcpu->kvm, &sp->spt[i]); flush = true; continue; } + /* Update the shadowed access bits in case they changed. */ + kvm_mmu_page_set_access(sp, i, pte_access); + sptep = &sp->spt[i]; spte = *sptep; host_writable = spte & shadow_host_writable_mask; @@ -1070,6 +1086,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) flush |= mmu_spte_update(sptep, spte); } + /* + * Note, any flush is purely for KVM's correctness, e.g. when dropping + * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier + * unmap or dirty logging event doesn't fail to flush. The guest is + * responsible for flushing the TLB to ensure any changes in protection + * bits are recognized, i.e. until the guest flushes or page faults on + * a relevant address, KVM is architecturally allowed to let vCPUs use + * cached translations with the old protection bits. 
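
sp->shadowed_translation, which the sync_page() hunk above consumes through kvm_mmu_page_get_gfn() and kvm_mmu_page_set_access(), packs both halves of the shadowed translation into one u64 per SPTE: the gfn above PAGE_SHIFT and the ACC_* bits below it, per the mmu_internal.h comment. The accessor bodies are not in these hunks, so the pack/unpack below is only a guess at that stated layout, not the patch's code.

/* Illustrative sketch, not part of the commit. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT     12
#define ACC_EXEC_MASK  1u
#define ACC_WRITE_MASK (1u << 1)
#define ACC_USER_MASK  (1u << 2)

static uint64_t pack_translation(uint64_t gfn, unsigned int access)
{
    return (gfn << PAGE_SHIFT) | access;     /* gfn in the upper bits, access below */
}

int main(void)
{
    uint64_t entry = pack_translation(0xfeed0, ACC_WRITE_MASK | ACC_USER_MASK);

    printf("gfn 0x%llx, access 0x%x\n",
           (unsigned long long)(entry >> PAGE_SHIFT),
           (unsigned int)(entry & ((1u << PAGE_SHIFT) - 1)));
    return 0;
}
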
+ */ return flush; } @@ -1084,7 +1109,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef PT_MAX_FULL_LEVELS #undef gpte_to_gfn #undef gpte_to_gfn_lvl -#undef CMPXCHG #undef PT_GUEST_ACCESSED_MASK #undef PT_GUEST_DIRTY_MASK #undef PT_GUEST_DIRTY_SHIFT diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index b5960bbde7f7..7314d27d57a4 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -33,6 +33,7 @@ u64 __read_mostly shadow_mmio_value; u64 __read_mostly shadow_mmio_mask; u64 __read_mostly shadow_mmio_access_mask; u64 __read_mostly shadow_present_mask; +u64 __read_mostly shadow_memtype_mask; u64 __read_mostly shadow_me_value; u64 __read_mostly shadow_me_mask; u64 __read_mostly shadow_acc_track_mask; @@ -129,6 +130,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 spte = SPTE_MMU_PRESENT_MASK; bool wrprot = false; + WARN_ON_ONCE(!pte_access && !shadow_present_mask); + if (sp->role.ad_disabled) spte |= SPTE_TDP_AD_DISABLED_MASK; else if (kvm_mmu_page_ad_need_write_protect(sp)) @@ -145,7 +148,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, spte |= spte_shadow_accessed_mask(spte); if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && - is_nx_huge_page_enabled()) { + is_nx_huge_page_enabled(vcpu->kvm)) { pte_access &= ~ACC_EXEC_MASK; } @@ -159,10 +162,10 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (level > PG_LEVEL_4K) spte |= PT_PAGE_SIZE_MASK; - if (tdp_enabled) - spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn, - kvm_is_mmio_pfn(pfn)); + if (shadow_memtype_mask) + spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn, + kvm_is_mmio_pfn(pfn)); if (host_writable) spte |= shadow_host_writable_mask; else @@ -244,10 +247,10 @@ static u64 make_spte_executable(u64 spte) * This is used during huge page splitting to build the SPTEs that make up the * new page table. */ -u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index) +u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, union kvm_mmu_page_role role, + int index) { u64 child_spte; - int child_level; if (WARN_ON_ONCE(!is_shadow_present_pte(huge_spte))) return 0; @@ -256,23 +259,23 @@ u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index) return 0; child_spte = huge_spte; - child_level = huge_level - 1; /* * The child_spte already has the base address of the huge page being * split. So we just have to OR in the offset to the page at the next * lower level for the given index. */ - child_spte |= (index * KVM_PAGES_PER_HPAGE(child_level)) << PAGE_SHIFT; + child_spte |= (index * KVM_PAGES_PER_HPAGE(role.level)) << PAGE_SHIFT; - if (child_level == PG_LEVEL_4K) { + if (role.level == PG_LEVEL_4K) { child_spte &= ~PT_PAGE_SIZE_MASK; /* - * When splitting to a 4K page, mark the page executable as the - * NX hugepage mitigation no longer applies. + * When splitting to a 4K page where execution is allowed, mark + * the page executable as the NX hugepage mitigation no longer + * applies. 
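
In make_huge_page_split_spte() above, every child SPTE starts as a copy of the huge SPTE, so only the offset of child #index needs to be OR'd into the address bits; for a 1GiB-to-2MiB split that offset is index * 512 small pages. A standalone rendering of just the offset math; the SPTE layout and flag bits here are faked.

/* Illustrative sketch, not part of the commit. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PAGES_PER_HPAGE(level) (1ull << (((level) - 1) * 9))

int main(void)
{
    /* Pretend 1GiB huge SPTE: pfn 0x100000 (4GiB) plus fake permission bits. */
    uint64_t huge_spte = (0x100000ull << PAGE_SHIFT) | 0x7;
    int child_level = 2;                      /* the children are 2MiB mappings */

    for (int index = 0; index < 512; index += 170) {
        uint64_t child = huge_spte;

        child |= ((uint64_t)index * PAGES_PER_HPAGE(child_level)) << PAGE_SHIFT;
        printf("child %3d maps pfn 0x%llx\n", index,
               (unsigned long long)(child >> PAGE_SHIFT));
    }
    return 0;
}
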
*/ - if (is_nx_huge_page_enabled()) + if ((role.access & ACC_EXEC_MASK) && is_nx_huge_page_enabled(kvm)) child_spte = make_spte_executable(child_spte); } @@ -299,7 +302,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn) { u64 new_spte; - new_spte = old_spte & ~PT64_BASE_ADDR_MASK; + new_spte = old_spte & ~SPTE_BASE_ADDR_MASK; new_spte |= (u64)new_pfn << PAGE_SHIFT; new_spte &= ~PT_WRITABLE_MASK; @@ -389,6 +392,13 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) shadow_nx_mask = 0ull; shadow_x_mask = VMX_EPT_EXECUTABLE_MASK; shadow_present_mask = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK; + /* + * EPT overrides the host MTRRs, and so KVM must program the desired + * memtype directly into the SPTEs. Note, this mask is just the mask + * of all bits that factor into the memtype, the actual memtype must be + * dynamically calculated, e.g. to ensure host MMIO is mapped UC. + */ + shadow_memtype_mask = VMX_EPT_MT_MASK | VMX_EPT_IPAT_BIT; shadow_acc_track_mask = VMX_EPT_RWX_MASK; shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE; shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE; @@ -439,6 +449,13 @@ void kvm_mmu_reset_all_pte_masks(void) shadow_nx_mask = PT64_NX_MASK; shadow_x_mask = 0; shadow_present_mask = PT_PRESENT_MASK; + + /* + * For shadow paging and NPT, KVM uses PAT entry '0' to encode WB + * memtype in the SPTEs, i.e. relies on host MTRRs to provide the + * correct memtype (WB is the "weakest" memtype). + */ + shadow_memtype_mask = 0; shadow_acc_track_mask = 0; shadow_me_mask = 0; shadow_me_value = 0; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 0127bb6e3c7d..cabe3fbb4f39 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -36,12 +36,12 @@ extern bool __read_mostly enable_mmio_caching; static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK -#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) +#define SPTE_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) #else -#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) +#define SPTE_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) #endif -#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ +#define SPTE_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ | shadow_x_mask | shadow_nx_mask | shadow_me_mask) #define ACC_EXEC_MASK 1 @@ -50,17 +50,13 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) /* The mask for the R/X bits in EPT PTEs */ -#define PT64_EPT_READABLE_MASK 0x1ull -#define PT64_EPT_EXECUTABLE_MASK 0x4ull +#define SPTE_EPT_READABLE_MASK 0x1ull +#define SPTE_EPT_EXECUTABLE_MASK 0x4ull -#define PT64_LEVEL_BITS 9 - -#define PT64_LEVEL_SHIFT(level) \ - (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) - -#define PT64_INDEX(address, level)\ - (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) -#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) +#define SPTE_LEVEL_BITS 9 +#define SPTE_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, SPTE_LEVEL_BITS) +#define SPTE_INDEX(address, level) __PT_INDEX(address, level, SPTE_LEVEL_BITS) +#define SPTE_ENT_PER_PAGE __PT_ENT_PER_PAGE(SPTE_LEVEL_BITS) /* * The mask/shift to use for saving the original R/X bits when marking the PTE @@ -69,8 +65,8 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); * restored only when a write is attempted to the page. This mask obviously * must not overlap the A/D type mask. 
*/ -#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \ - PT64_EPT_EXECUTABLE_MASK) +#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (SPTE_EPT_READABLE_MASK | \ + SPTE_EPT_EXECUTABLE_MASK) #define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54 #define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) @@ -151,6 +147,7 @@ extern u64 __read_mostly shadow_mmio_value; extern u64 __read_mostly shadow_mmio_mask; extern u64 __read_mostly shadow_mmio_access_mask; extern u64 __read_mostly shadow_present_mask; +extern u64 __read_mostly shadow_memtype_mask; extern u64 __read_mostly shadow_me_value; extern u64 __read_mostly shadow_me_mask; @@ -194,6 +191,12 @@ static inline bool is_removed_spte(u64 spte) return spte == REMOVED_SPTE; } +/* Get an SPTE's index into its parent's page table (and the spt array). */ +static inline int spte_index(u64 *sptep) +{ + return ((unsigned long)sptep / sizeof(*sptep)) & (SPTE_ENT_PER_PAGE - 1); +} + /* * In some cases, we need to preserve the GFN of a non-present or reserved * SPTE when we usurp the upper five bits of the physical address space to @@ -282,7 +285,7 @@ static inline bool is_executable_pte(u64 spte) static inline kvm_pfn_t spte_to_pfn(u64 pte) { - return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; + return (pte & SPTE_BASE_ADDR_MASK) >> PAGE_SHIFT; } static inline bool is_accessed_spte(u64 spte) @@ -425,7 +428,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool prefetch, bool can_unsync, bool host_writable, u64 *new_spte); -u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index); +u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, + union kvm_mmu_page_role role, int index); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); u64 mark_spte_for_access_track(u64 spte); diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index ee4802d7b36c..39b48e7d7d1a 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -11,7 +11,7 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter) { iter->sptep = iter->pt_path[iter->level - 1] + - SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level); + SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level); iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep); } @@ -116,8 +116,8 @@ static bool try_step_side(struct tdp_iter *iter) * Check if the iterator is already at the end of the current page * table. */ - if (SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level) == - (PT64_ENT_PER_PAGE - 1)) + if (SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level) == + (SPTE_ENT_PER_PAGE - 1)) return false; iter->gfn += KVM_PAGES_PER_HPAGE(iter->level); @@ -146,15 +146,6 @@ static bool try_step_up(struct tdp_iter *iter) } /* - * Step the iterator back up a level in the paging structure. Should only be - * used when the iterator is below the root level. - */ -void tdp_iter_step_up(struct tdp_iter *iter) -{ - WARN_ON(!try_step_up(iter)); -} - -/* * Step to the next SPTE in a pre-order traversal of the paging structure. 
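
The new spte_index() helper above recovers an SPTE's slot from nothing but its pointer: shadow page tables are page-sized, page-aligned arrays of u64, so dividing the address by sizeof(u64) and masking with SPTE_ENT_PER_PAGE - 1 yields the index without consulting sp->spt. A quick standalone check, using a userspace page-aligned allocation as a stand-in for the table.

/* Illustrative sketch, not part of the commit. */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define SPTE_ENT_PER_PAGE 512

static int spte_index(uint64_t *sptep)
{
    return ((uintptr_t)sptep / sizeof(*sptep)) & (SPTE_ENT_PER_PAGE - 1);
}

int main(void)
{
    /* A shadow page table is one page-aligned page holding 512 u64 entries. */
    uint64_t *spt = aligned_alloc(4096, 4096);

    if (!spt)
        return 1;
    printf("index of &spt[0]   = %d\n", spte_index(&spt[0]));
    printf("index of &spt[137] = %d\n", spte_index(&spt[137]));
    free(spt);
    return 0;
}
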
* To get to the next SPTE, the iterator either steps down towards the goal * GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index adfca0cf94d3..f0af385c56e0 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -114,6 +114,5 @@ void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root, int min_level, gfn_t next_last_level_gfn); void tdp_iter_next(struct tdp_iter *iter); void tdp_iter_restart(struct tdp_iter *iter); -void tdp_iter_step_up(struct tdp_iter *iter); #endif /* __KVM_X86_MMU_TDP_ITER_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7b9265d67131..bf2ccf9debca 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -425,7 +425,7 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) tdp_mmu_unlink_sp(kvm, sp, shared); - for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { tdp_ptep_t sptep = pt + i; gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); u64 old_spte; @@ -633,7 +633,6 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, u64 new_spte) { u64 *sptep = rcu_dereference(iter->sptep); - u64 old_spte; /* * The caller is responsible for ensuring the old SPTE is not a REMOVED @@ -649,17 +648,8 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and * does not hold the mmu_lock. */ - old_spte = cmpxchg64(sptep, iter->old_spte, new_spte); - if (old_spte != iter->old_spte) { - /* - * The page table entry was modified by a different logical - * CPU. Refresh iter->old_spte with the current value so the - * caller operates on fresh data, e.g. if it retries - * tdp_mmu_set_spte_atomic(). - */ - iter->old_spte = old_spte; + if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte)) return -EBUSY; - } __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, new_spte, iter->level, true); @@ -934,9 +924,6 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) } /* - * Zap leafs SPTEs for the range of gfns, [start, end). Returns true if SPTEs - * have been cleared and a TLB flush is needed before releasing the MMU lock. - * * If can_yield is true, will release the MMU lock and reschedule if the * scheduler needs the CPU or there is contention on the MMU lock. If this * function cannot yield, it will not release the MMU lock or reschedule and @@ -979,10 +966,9 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, } /* - * Tears down the mappings for the range of gfns, [start, end), and frees the - * non-root pages mapping GFNs strictly within that range. Returns true if - * SPTEs have been cleared and a TLB flush is needed before releasing the - * MMU lock. + * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns + * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or + * more SPTEs were zapped since the MMU lock was last acquired. */ bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, bool can_yield, bool flush) @@ -1487,8 +1473,8 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, * No need for atomics when writing to sp->spt since the page table has * not been linked in yet and thus is not reachable from any other CPU. 
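The tdp_mmu_set_spte_atomic() change above leans on try_cmpxchg64() writing the observed value back into iter->old_spte when the exchange fails, which is why the explicit refresh of iter->old_spte could be deleted. C11's atomic_compare_exchange_strong() has the same contract; a hedged sketch of that retry pattern:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

/* Returns false and refreshes *expected if another CPU changed the SPTE first. */
static bool set_spte_atomic(_Atomic uint64_t *sptep, uint64_t *expected,
                            uint64_t new_spte)
{
        /* On failure, *expected is overwritten with the current value. */
        return atomic_compare_exchange_strong(sptep, expected, new_spte);
}

int main(void)
{
        _Atomic uint64_t spte = 0x1000;
        uint64_t old = 0x1000;

        while (!set_spte_atomic(&spte, &old, 0x2000))
                ;       /* 'old' already holds fresh data for the retry */

        printf("spte is now 0x%" PRIx64 "\n", atomic_load(&spte));
        return 0;
}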
*/ - for (i = 0; i < PT64_ENT_PER_PAGE; i++) - sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i); + for (i = 0; i < SPTE_ENT_PER_PAGE; i++) + sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i); /* * Replace the huge spte with a pointer to the populated lower level @@ -1507,7 +1493,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, * are overwriting from the page stats. But we have to manually update * the page stats with the new present child pages. */ - kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE); + kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE); out: trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret); @@ -1731,10 +1717,6 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); } -/* - * Clear leaf entries which could be replaced by large mappings, for - * GFNs within the slot. - */ static void zap_collapsible_spte_range(struct kvm *kvm, struct kvm_mmu_page *root, const struct kvm_memory_slot *slot) @@ -1743,61 +1725,52 @@ static void zap_collapsible_spte_range(struct kvm *kvm, gfn_t end = start + slot->npages; struct tdp_iter iter; int max_mapping_level; - kvm_pfn_t pfn; rcu_read_lock(); - tdp_root_for_each_pte(iter, root, start, end) { + for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) { +retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; - if (!is_shadow_present_pte(iter.old_spte) || - !is_last_spte(iter.old_spte, iter.level)) + if (iter.level > KVM_MAX_HUGEPAGE_LEVEL || + !is_shadow_present_pte(iter.old_spte)) continue; /* - * This is a leaf SPTE. Check if the PFN it maps can - * be mapped at a higher level. + * Don't zap leaf SPTEs, if a leaf SPTE could be replaced with + * a large page size, then its parent would have been zapped + * instead of stepping down. */ - pfn = spte_to_pfn(iter.old_spte); - - if (kvm_is_reserved_pfn(pfn)) + if (is_last_spte(iter.old_spte, iter.level)) continue; - max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, - iter.gfn, pfn, PG_LEVEL_NUM); - - WARN_ON(max_mapping_level < iter.level); - /* - * If this page is already mapped at the highest - * viable level, there's nothing more to do. + * If iter.gfn resides outside of the slot, i.e. the page for + * the current level overlaps but is not contained by the slot, + * then the SPTE can't be made huge. More importantly, trying + * to query that info from slot->arch.lpage_info will cause an + * out-of-bounds access. */ - if (max_mapping_level == iter.level) + if (iter.gfn < start || iter.gfn >= end) continue; - /* - * The page can be remapped at a higher level, so step - * up to zap the parent SPTE. - */ - while (max_mapping_level > iter.level) - tdp_iter_step_up(&iter); + max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, + iter.gfn, PG_LEVEL_NUM); + if (max_mapping_level < iter.level) + continue; /* Note, a successful atomic zap also does a remote TLB flush. */ - tdp_mmu_zap_spte_atomic(kvm, &iter); - - /* - * If the atomic zap fails, the iter will recurse back into - * the same subtree to retry. - */ + if (tdp_mmu_zap_spte_atomic(kvm, &iter)) + goto retry; } rcu_read_unlock(); } /* - * Clear non-leaf entries (and free associated page tables) which could - * be replaced by large mappings, for GFNs within the slot. + * Zap non-leaf SPTEs (and free their associated page tables) which could + * be replaced by huge pages, for GFNs within the slot. 
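The reworked zap_collapsible_spte_range() walks at PG_LEVEL_2M and above and skips entries whose gfn falls outside [start, end): a 2M- or 1G-level entry can overlap an unaligned memslot without its base gfn being contained in it, and looking that gfn up in slot->arch.lpage_info would read out of bounds. A small sketch of that containment check, with illustrative page counts:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define PAGES_PER_2M 512ull             /* 4KiB pages per 2MiB entry */

/* Base gfn of the 2M-level entry that covers 'gfn'. */
static uint64_t huge_base_gfn(uint64_t gfn)
{
        return gfn & ~(PAGES_PER_2M - 1);
}

/* Mirrors the 'iter.gfn < start || iter.gfn >= end' test from the hunk above. */
static bool entry_gfn_in_slot(uint64_t gfn, uint64_t start, uint64_t end)
{
        uint64_t base = huge_base_gfn(gfn);

        return base >= start && base < end;
}

int main(void)
{
        /* A slot that starts 256 pages into a 2M-aligned region. */
        uint64_t start = 256, end = 256 + 2048;

        printf("gfn 300 -> entry base %4llu, in slot? %d\n",
               (unsigned long long)huge_base_gfn(300),
               entry_gfn_in_slot(300, start, end));
        printf("gfn 600 -> entry base %4llu, in slot? %d\n",
               (unsigned long long)huge_base_gfn(600),
               entry_gfn_in_slot(600, start, end));
        return 0;
}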
*/ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 3f868fed9114..02f9e4f245bd 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -16,6 +16,7 @@ #include <linux/bsearch.h> #include <linux/sort.h> #include <asm/perf_event.h> +#include <asm/cpu_device_id.h> #include "x86.h" #include "cpuid.h" #include "lapic.h" @@ -24,6 +25,15 @@ /* This is enough to filter the vast majority of currently defined events. */ #define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300 +struct x86_pmu_capability __read_mostly kvm_pmu_cap; +EXPORT_SYMBOL_GPL(kvm_pmu_cap); + +static const struct x86_cpu_id vmx_icl_pebs_cpu[] = { + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL), + {} +}; + /* NOTE: * - Each perf counter is defined as "struct kvm_pmc"; * - There are two types of perf counters: general purpose (gp) and fixed. @@ -34,7 +44,9 @@ * However AMD doesn't support fixed-counters; * - There are three types of index to access perf counters (PMC): * 1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD - * has MSR_K7_PERFCTRn. + * has MSR_K7_PERFCTRn and, for families 15H and later, + * MSR_F15H_PERF_CTRn, where MSR_F15H_PERF_CTR[0-3] are + * aliased to MSR_K7_PERFCTRn. * 2. MSR Index (named idx): This normally is used by RDPMC instruction. * For instance AMD RDPMC instruction uses 0000_0003h in ECX to access * C001_0007h (MSR_K7_PERCTR3). Intel has a similar mechanism, except @@ -46,7 +58,8 @@ * between pmc and perf counters is as the following: * * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters * [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed - * * AMD: [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters + * * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H + * and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters */ static struct kvm_pmu_ops kvm_pmu_ops __read_mostly; @@ -86,15 +99,22 @@ static void kvm_pmi_trigger_fn(struct irq_work *irq_work) static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); + bool skip_pmi = false; /* Ignore counters that have been reprogrammed already. */ if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) return; - __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); + if (pmc->perf_event && pmc->perf_event->attr.precise_ip) { + /* Indicate PEBS overflow PMI to guest. 
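The NOTE block above distinguishes three ways of naming a counter; the global index kept in pmc->idx is the simplest: general-purpose counters occupy the low indices and fixed counters start at INTEL_PMC_IDX_FIXED. A tiny sketch of that mapping, assuming the usual INTEL_PMC_IDX_FIXED value of 32; idx 32 is then fixed counter 0, the counter singled out for precise_ip = 3 later in this hunk:

#include <stdio.h>

#define PMC_IDX_FIXED 32        /* assumed to match INTEL_PMC_IDX_FIXED */

static void describe_idx(int idx)
{
        if (idx < PMC_IDX_FIXED)
                printf("idx %2d -> general-purpose counter %d\n", idx, idx);
        else
                printf("idx %2d -> fixed counter %d\n", idx, idx - PMC_IDX_FIXED);
}

int main(void)
{
        describe_idx(0);
        describe_idx(7);
        describe_idx(32);       /* fixed counter 0 (INST_RETIRED.ANY) */
        describe_idx(33);
        return 0;
}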
*/ + skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT, + (unsigned long *)&pmu->global_status); + } else { + __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); + } kvm_make_request(KVM_REQ_PMU, pmc->vcpu); - if (!pmc->intr) + if (!pmc->intr || skip_pmi) return; /* @@ -124,6 +144,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, bool exclude_user, bool exclude_kernel, bool intr) { + struct kvm_pmu *pmu = pmc_to_pmu(pmc); struct perf_event *event; struct perf_event_attr attr = { .type = type, @@ -135,9 +156,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, .exclude_kernel = exclude_kernel, .config = config, }; - - if (type == PERF_TYPE_HARDWARE && config >= PERF_COUNT_HW_MAX) - return; + bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable); attr.sample_period = get_sample_period(pmc, pmc->counter); @@ -150,6 +169,25 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, */ attr.sample_period = 0; } + if (pebs) { + /* + * The non-zero precision level of guest event makes the ordinary + * guest event becomes a guest PEBS event and triggers the host + * PEBS PMI handler to determine whether the PEBS overflow PMI + * comes from the host counters or the guest. + * + * For most PEBS hardware events, the difference in the software + * precision levels of guest and host PEBS events will not affect + * the accuracy of the PEBS profiling result, because the "event IP" + * in the PEBS record is calibrated on the guest side. + * + * On Icelake everything is fine. Other hardware (GLC+, TNT+) that + * could possibly care here is unsupported and needs changes. + */ + attr.precise_ip = 1; + if (x86_match_cpu(vmx_icl_pebs_cpu) && pmc->idx == 32) + attr.precise_ip = 3; + } event = perf_event_create_kernel_counter(&attr, -1, current, kvm_perf_overflow, pmc); @@ -163,7 +201,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, pmc_to_pmu(pmc)->event_count++; clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); pmc->is_paused = false; - pmc->intr = intr; + pmc->intr = intr || pebs; } static void pmc_pause_counter(struct kvm_pmc *pmc) @@ -189,6 +227,10 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) get_sample_period(pmc, pmc->counter))) return false; + if (!test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) && + pmc->perf_event->attr.precise_ip) + return false; + /* reuse perf_event to serve as pmc_reprogram_counter() does*/ perf_event_enable(pmc->perf_event); pmc->is_paused = false; @@ -205,115 +247,83 @@ static int cmp_u64(const void *pa, const void *pb) return (a > b) - (a < b); } -void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) +static bool check_pmu_event_filter(struct kvm_pmc *pmc) { - u64 config; - u32 type = PERF_TYPE_RAW; - struct kvm *kvm = pmc->vcpu->kvm; struct kvm_pmu_event_filter *filter; - struct kvm_pmu *pmu = vcpu_to_pmu(pmc->vcpu); + struct kvm *kvm = pmc->vcpu->kvm; bool allow_event = true; + __u64 key; + int idx; - if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) - printk_once("kvm pmu: pin control bit is ignored\n"); - - pmc->eventsel = eventsel; - - pmc_pause_counter(pmc); - - if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc)) - return; + if (!static_call(kvm_x86_pmu_hw_event_available)(pmc)) + return false; filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); - if (filter) { - __u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB; + if (!filter) + goto out; + if (pmc_is_gp(pmc)) { + key = pmc->eventsel & 
AMD64_RAW_EVENT_MASK_NB; if (bsearch(&key, filter->events, filter->nevents, sizeof(__u64), cmp_u64)) allow_event = filter->action == KVM_PMU_EVENT_ALLOW; else allow_event = filter->action == KVM_PMU_EVENT_DENY; + } else { + idx = pmc->idx - INTEL_PMC_IDX_FIXED; + if (filter->action == KVM_PMU_EVENT_DENY && + test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) + allow_event = false; + if (filter->action == KVM_PMU_EVENT_ALLOW && + !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) + allow_event = false; } - if (!allow_event) - return; - - if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | - ARCH_PERFMON_EVENTSEL_INV | - ARCH_PERFMON_EVENTSEL_CMASK | - HSW_IN_TX | - HSW_IN_TX_CHECKPOINTED))) { - config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc); - if (config != PERF_COUNT_HW_MAX) - type = PERF_TYPE_HARDWARE; - } - - if (type == PERF_TYPE_RAW) - config = eventsel & pmu->raw_event_mask; - - if (pmc->current_config == eventsel && pmc_resume_counter(pmc)) - return; - - pmc_release_perf_event(pmc); - pmc->current_config = eventsel; - pmc_reprogram_counter(pmc, type, config, - !(eventsel & ARCH_PERFMON_EVENTSEL_USR), - !(eventsel & ARCH_PERFMON_EVENTSEL_OS), - eventsel & ARCH_PERFMON_EVENTSEL_INT); +out: + return allow_event; } -EXPORT_SYMBOL_GPL(reprogram_gp_counter); -void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) +void reprogram_counter(struct kvm_pmc *pmc) { - unsigned en_field = ctrl & 0x3; - bool pmi = ctrl & 0x8; - struct kvm_pmu_event_filter *filter; - struct kvm *kvm = pmc->vcpu->kvm; + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + u64 eventsel = pmc->eventsel; + u64 new_config = eventsel; + u8 fixed_ctr_ctrl; pmc_pause_counter(pmc); - if (!en_field || !pmc_is_enabled(pmc)) + if (!pmc_speculative_in_use(pmc) || !pmc_is_enabled(pmc)) return; - filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); - if (filter) { - if (filter->action == KVM_PMU_EVENT_DENY && - test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) - return; - if (filter->action == KVM_PMU_EVENT_ALLOW && - !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) - return; - } - - if (pmc->current_config == (u64)ctrl && pmc_resume_counter(pmc)) + if (!check_pmu_event_filter(pmc)) return; - pmc_release_perf_event(pmc); - - pmc->current_config = (u64)ctrl; - pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE, - static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc), - !(en_field & 0x2), /* exclude user */ - !(en_field & 0x1), /* exclude kernel */ - pmi); -} -EXPORT_SYMBOL_GPL(reprogram_fixed_counter); + if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) + printk_once("kvm pmu: pin control bit is ignored\n"); -void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx) -{ - struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, pmc_idx); + if (pmc_is_fixed(pmc)) { + fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, + pmc->idx - INTEL_PMC_IDX_FIXED); + if (fixed_ctr_ctrl & 0x1) + eventsel |= ARCH_PERFMON_EVENTSEL_OS; + if (fixed_ctr_ctrl & 0x2) + eventsel |= ARCH_PERFMON_EVENTSEL_USR; + if (fixed_ctr_ctrl & 0x8) + eventsel |= ARCH_PERFMON_EVENTSEL_INT; + new_config = (u64)fixed_ctr_ctrl; + } - if (!pmc) + if (pmc->current_config == new_config && pmc_resume_counter(pmc)) return; - if (pmc_is_gp(pmc)) - reprogram_gp_counter(pmc, pmc->eventsel); - else { - int idx = pmc_idx - INTEL_PMC_IDX_FIXED; - u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx); + pmc_release_perf_event(pmc); - reprogram_fixed_counter(pmc, ctrl, idx); - } + pmc->current_config = new_config; + pmc_reprogram_counter(pmc, 
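check_pmu_event_filter() reduces the guest's event select to a key with AMD64_RAW_EVENT_MASK_NB (event-select plus unit-mask bits) and binary-searches the sorted filter->events list with cmp_u64(). A hedged userspace sketch of that lookup; the 16-bit key mask below is a simplification, since the real mask also covers AMD's extended event-select bits:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>

/* Simplified key: low event-select byte plus unit mask. */
#define EVENT_KEY_MASK 0xffffull

static int cmp_u64(const void *pa, const void *pb)
{
        uint64_t a = *(const uint64_t *)pa, b = *(const uint64_t *)pb;

        return (a > b) - (a < b);
}

static bool event_in_filter(uint64_t eventsel, const uint64_t *events, size_t nevents)
{
        uint64_t key = eventsel & EVENT_KEY_MASK;

        return bsearch(&key, events, nevents, sizeof(key), cmp_u64) != NULL;
}

int main(void)
{
        /* Raw event codes in an allow list; bsearch() needs them sorted. */
        uint64_t allowed[] = { 0x01c2, 0x003c, 0x00c0, 0x4f2e };
        size_t n = sizeof(allowed) / sizeof(allowed[0]);

        qsort(allowed, n, sizeof(allowed[0]), cmp_u64);

        printf("eventsel 0x4100c0 allowed? %d\n", event_in_filter(0x4100c0, allowed, n));
        printf("eventsel 0x4100c1 allowed? %d\n", event_in_filter(0x4100c1, allowed, n));
        return 0;
}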
PERF_TYPE_RAW, + (eventsel & pmu->raw_event_mask), + !(eventsel & ARCH_PERFMON_EVENTSEL_USR), + !(eventsel & ARCH_PERFMON_EVENTSEL_OS), + eventsel & ARCH_PERFMON_EVENTSEL_INT); } EXPORT_SYMBOL_GPL(reprogram_counter); @@ -329,8 +339,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) clear_bit(bit, pmu->reprogram_pmi); continue; } - - reprogram_counter(pmu, bit); + reprogram_counter(pmc); } /* @@ -471,17 +480,6 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) kvm_pmu_refresh(vcpu); } -static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu = pmc_to_pmu(pmc); - - if (pmc_is_fixed(pmc)) - return fixed_ctrl_field(pmu->fixed_ctr_ctrl, - pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3; - - return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; -} - /* Release perf_events for vPMCs that have been unused for a full time slice. */ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu) { @@ -514,13 +512,12 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) { - struct kvm_pmu *pmu = pmc_to_pmu(pmc); u64 prev_count; prev_count = pmc->counter; pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc); - reprogram_counter(pmu, pmc->idx); + reprogram_counter(pmc); if (pmc->counter < prev_count) __kvm_perf_overflow(pmc, false); } @@ -528,13 +525,8 @@ static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc, unsigned int perf_hw_id) { - u64 old_eventsel = pmc->eventsel; - unsigned int config; - - pmc->eventsel &= (ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK); - config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc); - pmc->eventsel = old_eventsel; - return config == perf_hw_id; + return !((pmc->eventsel ^ perf_get_hw_event_config(perf_hw_id)) & + AMD64_RAW_EVENT_MASK_NB); } static inline bool cpl_is_matched(struct kvm_pmc *pmc) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index e745f443b6a8..5cc5721f260b 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -8,6 +8,9 @@ #define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu)) #define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu) +#define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | \ + MSR_IA32_MISC_ENABLE_BTS_UNAVAIL) + /* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */ #define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf) @@ -22,7 +25,7 @@ struct kvm_event_hw_type_mapping { }; struct kvm_pmu_ops { - unsigned int (*pmc_perf_hw_id)(struct kvm_pmc *pmc); + bool (*hw_event_available)(struct kvm_pmc *pmc); bool (*pmc_is_enabled)(struct kvm_pmc *pmc); struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx); struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu, @@ -144,9 +147,43 @@ static inline void pmc_update_sample_period(struct kvm_pmc *pmc) get_sample_period(pmc, pmc->counter)); } -void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel); -void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx); -void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx); +static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + if (pmc_is_fixed(pmc)) + return fixed_ctrl_field(pmu->fixed_ctr_ctrl, + pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3; + + return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; +} + +extern struct x86_pmu_capability kvm_pmu_cap; + +static inline void kvm_init_pmu_capability(void) +{ + bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; + + 
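With reprogram_counter() now handling fixed counters through the common path, the 4-bit field that fixed_ctrl_field() extracts from IA32_FIXED_CTR_CTRL is translated into the equivalent EVENTSEL flags: bit 0 counts ring 0, bit 1 counts ring 3, bit 3 enables the PMI. A sketch of that translation using the architectural EVENTSEL bit positions:

#include <stdio.h>
#include <stdint.h>

#define EVENTSEL_USR (1u << 16)         /* count in ring 3 */
#define EVENTSEL_OS  (1u << 17)         /* count in ring 0 */
#define EVENTSEL_INT (1u << 20)         /* interrupt on overflow */

/* The 4-bit EN/PMI field for fixed counter 'idx', as fixed_ctrl_field() does. */
static unsigned int fixed_ctrl_field(uint64_t ctrl_reg, int idx)
{
        return (ctrl_reg >> (idx * 4)) & 0xf;
}

static uint32_t fixed_ctrl_to_eventsel(unsigned int ctrl)
{
        uint32_t eventsel = 0;

        if (ctrl & 0x1)
                eventsel |= EVENTSEL_OS;
        if (ctrl & 0x2)
                eventsel |= EVENTSEL_USR;
        if (ctrl & 0x8)
                eventsel |= EVENTSEL_INT;
        return eventsel;
}

int main(void)
{
        uint64_t fixed_ctr_ctrl = 0xb30;        /* fixed0 off, fixed1 = 0x3, fixed2 = 0xb */
        int idx;

        for (idx = 0; idx < 3; idx++) {
                unsigned int ctrl = fixed_ctrl_field(fixed_ctr_ctrl, idx);

                printf("fixed %d: ctrl 0x%x -> eventsel flags 0x%x\n",
                       idx, ctrl, fixed_ctrl_to_eventsel(ctrl));
        }
        return 0;
}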
perf_get_x86_pmu_capability(&kvm_pmu_cap); + + /* + * For Intel, only support guest architectural pmu + * on a host with architectural pmu. + */ + if ((is_intel && !kvm_pmu_cap.version) || !kvm_pmu_cap.num_counters_gp) + enable_pmu = false; + + if (!enable_pmu) { + memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); + return; + } + + kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); + kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, + KVM_PMC_MAX_FIXED); +} + +void reprogram_counter(struct kvm_pmc *pmc); void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index d1bc5820ea46..6919dee69f18 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -40,6 +40,9 @@ #define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK) #define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK) +static bool force_avic; +module_param_unsafe(force_avic, bool, 0444); + /* Note: * This hash table is used to map VM_ID to a struct kvm_svm, * when handling AMD IOMMU GALOG notification to schedule in @@ -50,6 +53,7 @@ static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); static u32 next_vm_id = 0; static bool next_vm_id_wrapped = 0; static DEFINE_SPINLOCK(svm_vm_data_hash_lock); +enum avic_modes avic_mode; /* * This is a wrapper of struct amd_iommu_ir_data. @@ -59,6 +63,54 @@ struct amd_svm_iommu_ir { void *data; /* Storing pointer to struct amd_ir_data */ }; +static void avic_activate_vmcb(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = svm->vmcb01.ptr; + + vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK); + vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK; + + vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + + /* Note: + * KVM can support hybrid-AVIC mode, where KVM emulates x2APIC + * MSR accesses, while interrupt injection to a running vCPU + * can be achieved using AVIC doorbell. The AVIC hardware still + * accelerate MMIO accesses, but this does not cause any harm + * as the guest is not supposed to access xAPIC mmio when uses x2APIC. 
+ */ + if (apic_x2apic_mode(svm->vcpu.arch.apic) && + avic_mode == AVIC_MODE_X2) { + vmcb->control.int_ctl |= X2APIC_MODE_MASK; + vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID; + /* Disabling MSR intercept for x2APIC registers */ + svm_set_x2apic_msr_interception(svm, false); + } else { + /* For xAVIC and hybrid-xAVIC modes */ + vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID; + /* Enabling MSR intercept for x2APIC registers */ + svm_set_x2apic_msr_interception(svm, true); + } +} + +static void avic_deactivate_vmcb(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = svm->vmcb01.ptr; + + vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK); + vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK; + + /* + * If running nested and the guest uses its own MSR bitmap, there + * is no need to update L0's msr bitmap + */ + if (is_guest_mode(&svm->vcpu) && + vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)) + return; + + /* Enabling MSR intercept for x2APIC registers */ + svm_set_x2apic_msr_interception(svm, true); +} /* Note: * This function is called from IOMMU driver to notify @@ -175,13 +227,12 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb) vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; - vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT; vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK; if (kvm_apicv_activated(svm->vcpu.kvm)) - vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + avic_activate_vmcb(svm); else - vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; + avic_deactivate_vmcb(svm); } static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, @@ -190,7 +241,8 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, u64 *avic_physical_id_table; struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); - if (index >= AVIC_MAX_PHYSICAL_ID_COUNT) + if ((avic_mode == AVIC_MODE_X1 && index > AVIC_MAX_PHYSICAL_ID) || + (avic_mode == AVIC_MODE_X2 && index > X2AVIC_MAX_PHYSICAL_ID)) return NULL; avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page); @@ -237,7 +289,8 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) int id = vcpu->vcpu_id; struct vcpu_svm *svm = to_svm(vcpu); - if (id >= AVIC_MAX_PHYSICAL_ID_COUNT) + if ((avic_mode == AVIC_MODE_X1 && id > AVIC_MAX_PHYSICAL_ID) || + (avic_mode == AVIC_MODE_X2 && id > X2AVIC_MAX_PHYSICAL_ID)) return -EINVAL; if (!vcpu->arch.apic->regs) @@ -279,8 +332,10 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu) */ int cpu = READ_ONCE(vcpu->cpu); - if (cpu != get_cpu()) + if (cpu != get_cpu()) { wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); + trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu)); + } put_cpu(); } @@ -303,7 +358,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source if (apic_x2apic_mode(source)) dest = icrh; else - dest = GET_APIC_DEST_FIELD(icrh); + dest = GET_XAPIC_DEST_FIELD(icrh); if (dest_mode == APIC_DEST_PHYSICAL) { /* broadcast destination, use slow path */ @@ -345,9 +400,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source logid_index = cluster + __ffs(bitmap); - if (apic_x2apic_mode(source)) { - l1_physical_id = logid_index; - } else { + if (!apic_x2apic_mode(source)) { u32 *avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page); @@ -362,6 +415,23 @@ static int 
avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source l1_physical_id = logid_entry & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; + } else { + /* + * For x2APIC logical mode, cannot leverage the index. + * Instead, calculate physical ID from logical ID in ICRH. + */ + int cluster = (icrh & 0xffff0000) >> 16; + int apic = ffs(icrh & 0xffff) - 1; + + /* + * If the x2APIC logical ID sub-field (i.e. icrh[15:0]) + * contains anything but a single bit, we cannot use the + * fast path, because it is limited to a single vCPU. + */ + if (apic < 0 || icrh != (1 << apic)) + return -EINVAL; + + l1_physical_id = (cluster << 4) + apic; } } @@ -396,9 +466,15 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, * since entered the guest will have processed pending IRQs at VMRUN. */ kvm_for_each_vcpu(i, vcpu, kvm) { + u32 dest; + + if (apic_x2apic_mode(vcpu->arch.apic)) + dest = icrh; + else + dest = GET_XAPIC_DEST_FIELD(icrh); + if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK, - GET_APIC_DEST_FIELD(icrh), - icrl & APIC_DEST_MASK)) { + dest, icrl & APIC_DEST_MASK)) { vcpu->arch.apic->irr_pending = true; svm_complete_interrupt_delivery(vcpu, icrl & APIC_MODE_MASK, @@ -514,8 +590,13 @@ static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); bool flat = svm->dfr_reg == APIC_DFR_FLAT; - u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat); + u32 *entry; + + /* Note: x2AVIC does not use logical APIC ID table */ + if (apic_x2apic_mode(vcpu->arch.apic)) + return; + entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat); if (entry) clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry); } @@ -527,6 +608,10 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); u32 id = kvm_xapic_id(vcpu->arch.apic); + /* AVIC does not support LDR update for x2APIC */ + if (apic_x2apic_mode(vcpu->arch.apic)) + return 0; + if (ldr == svm->ldr_reg) return 0; @@ -654,6 +739,18 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } +void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu) +{ + if (!lapic_in_kernel(vcpu) || avic_mode == AVIC_MODE_NONE) + return; + + if (kvm_get_apic_mode(vcpu) == LAPIC_MODE_INVALID) { + WARN_ONCE(true, "Invalid local APIC state (vcpu_id=%d)", vcpu->vcpu_id); + return; + } + avic_refresh_apicv_exec_ctrl(vcpu); +} + static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) { int ret = 0; @@ -906,7 +1003,6 @@ bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) BIT(APICV_INHIBIT_REASON_NESTED) | BIT(APICV_INHIBIT_REASON_IRQWIN) | BIT(APICV_INHIBIT_REASON_PIT_REINJ) | - BIT(APICV_INHIBIT_REASON_X2APIC) | BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | BIT(APICV_INHIBIT_REASON_SEV) | BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | @@ -968,7 +1064,6 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) return; entry = READ_ONCE(*(svm->avic_physical_id_cache)); - WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); @@ -1016,9 +1111,9 @@ void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) * accordingly before re-activating. 
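For x2APIC logical destinations, the fast path above reconstructs the physical APIC ID directly from ICRH: bits 31:16 carry the cluster ID and bits 15:0 a one-hot member bit, so the kick only works when exactly one destination bit is set. A standalone sketch of that decode, following the intent spelled out in the comment (only the low 16 bits are checked for the single-bit condition):

#include <stdio.h>
#include <stdint.h>
#include <strings.h>            /* ffs() */

/* Returns the physical APIC ID, or -1 if the single-vCPU fast path can't be used. */
static int x2apic_logical_to_physical(uint32_t icrh)
{
        int cluster = (icrh & 0xffff0000) >> 16;
        int apic = ffs(icrh & 0xffff) - 1;

        /* Zero or more than one destination bit: not a single-vCPU kick. */
        if (apic < 0 || (icrh & 0xffff) != (1u << apic))
                return -1;

        return (cluster << 4) + apic;
}

int main(void)
{
        printf("%d\n", x2apic_logical_to_physical(0x00030008));  /* cluster 3, bit 3 -> ID 51 */
        printf("%d\n", x2apic_logical_to_physical(0x00030009));  /* two bits set -> refused */
        return 0;
}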
*/ avic_apicv_post_state_restore(vcpu); - vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + avic_activate_vmcb(svm); } else { - vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; + avic_deactivate_vmcb(svm); } vmcb_mark_dirty(vmcb, VMCB_AVIC); @@ -1058,3 +1153,44 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) avic_vcpu_load(vcpu, vcpu->cpu); } + +/* + * Note: + * - The module param avic enable both xAPIC and x2APIC mode. + * - Hypervisor can support both xAVIC and x2AVIC in the same guest. + * - The mode can be switched at run-time. + */ +bool avic_hardware_setup(struct kvm_x86_ops *x86_ops) +{ + if (!npt_enabled) + return false; + + if (boot_cpu_has(X86_FEATURE_AVIC)) { + avic_mode = AVIC_MODE_X1; + pr_info("AVIC enabled\n"); + } else if (force_avic) { + /* + * Some older systems does not advertise AVIC support. + * See Revision Guide for specific AMD processor for more detail. + */ + avic_mode = AVIC_MODE_X1; + pr_warn("AVIC is not supported in CPUID but force enabled"); + pr_warn("Your system might crash and burn"); + } + + /* AVIC is a prerequisite for x2AVIC. */ + if (boot_cpu_has(X86_FEATURE_X2AVIC)) { + if (avic_mode == AVIC_MODE_X1) { + avic_mode = AVIC_MODE_X2; + pr_info("x2AVIC enabled\n"); + } else { + pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled"); + pr_warn(FW_BUG "Try enable AVIC using force_avic option"); + } + } + + if (avic_mode != AVIC_MODE_NONE) + amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); + + return !!avic_mode; +} diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index ba7cd26f438f..76dcc8a3e849 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -230,6 +230,11 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) break; p = msrpm_offsets[i]; + + /* x2apic msrs are intercepted always for the nested guest */ + if (is_x2apic_msrpm_offset(p)) + continue; + offset = svm->nested.ctl.msrpm_base_pa + (p * 4); if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4)) @@ -320,7 +325,8 @@ static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu, return false; } - if (CC(!kvm_is_valid_cr4(vcpu, save->cr4))) + /* Note, SVM doesn't have any additional restrictions on CR4. 
*/ + if (CC(!__kvm_is_valid_cr4(vcpu, save->cr4))) return false; if (CC(!kvm_valid_efer(vcpu, save->efer))) @@ -371,6 +377,7 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu, to->nested_ctl = from->nested_ctl; to->event_inj = from->event_inj; to->event_inj_err = from->event_inj_err; + to->next_rip = from->next_rip; to->nested_cr3 = from->nested_cr3; to->virt_ext = from->virt_ext; to->pause_filter_count = from->pause_filter_count; @@ -608,7 +615,33 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 } } -static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) +static inline bool is_evtinj_soft(u32 evtinj) +{ + u32 type = evtinj & SVM_EVTINJ_TYPE_MASK; + u8 vector = evtinj & SVM_EVTINJ_VEC_MASK; + + if (!(evtinj & SVM_EVTINJ_VALID)) + return false; + + if (type == SVM_EVTINJ_TYPE_SOFT) + return true; + + return type == SVM_EVTINJ_TYPE_EXEPT && kvm_exception_is_soft(vector); +} + +static bool is_evtinj_nmi(u32 evtinj) +{ + u32 type = evtinj & SVM_EVTINJ_TYPE_MASK; + + if (!(evtinj & SVM_EVTINJ_VALID)) + return false; + + return type == SVM_EVTINJ_TYPE_NMI; +} + +static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, + unsigned long vmcb12_rip, + unsigned long vmcb12_csbase) { u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK; u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK; @@ -650,7 +683,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) vmcb02->control.tsc_offset = vcpu->arch.tsc_offset; - if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { + if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) { WARN_ON(!svm->tsc_scaling_enabled); nested_svm_update_tsc_ratio_msr(vcpu); } @@ -664,6 +697,30 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) vmcb02->control.event_inj = svm->nested.ctl.event_inj; vmcb02->control.event_inj_err = svm->nested.ctl.event_inj_err; + /* + * next_rip is consumed on VMRUN as the return address pushed on the + * stack for injected soft exceptions/interrupts. If nrips is exposed + * to L1, take it verbatim from vmcb12. If nrips is supported in + * hardware but not exposed to L1, stuff the actual L2 RIP to emulate + * what a nrips=0 CPU would do (L1 is responsible for advancing RIP + * prior to injecting the event). 
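is_evtinj_soft() and is_evtinj_nmi() classify the event L1 asked to inject from the EVTINJ encoding: vector in bits 7:0, type in bits 10:8, valid in bit 31, where type 4 is a software interrupt and an exception (type 3) only counts as soft for the INT3/INTO vectors. A hedged decode sketch, with field values per the SVM EVTINJ layout:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* EVTINJ layout: vector[7:0], type[10:8], valid[31]. */
#define EVTINJ_VEC_MASK   0xffu
#define EVTINJ_TYPE_MASK  (7u << 8)
#define EVTINJ_TYPE_NMI   (2u << 8)
#define EVTINJ_TYPE_EXEPT (3u << 8)
#define EVTINJ_TYPE_SOFT  (4u << 8)
#define EVTINJ_VALID      (1u << 31)

#define BP_VECTOR 3
#define OF_VECTOR 4

static bool exception_is_soft(uint8_t vector)
{
        return vector == BP_VECTOR || vector == OF_VECTOR;      /* INT3, INTO */
}

static bool evtinj_is_soft(uint32_t evtinj)
{
        uint32_t type = evtinj & EVTINJ_TYPE_MASK;
        uint8_t vector = evtinj & EVTINJ_VEC_MASK;

        if (!(evtinj & EVTINJ_VALID))
                return false;
        if (type == EVTINJ_TYPE_SOFT)
                return true;
        return type == EVTINJ_TYPE_EXEPT && exception_is_soft(vector);
}

int main(void)
{
        printf("%d\n", evtinj_is_soft(EVTINJ_VALID | EVTINJ_TYPE_SOFT | 0x80)); /* INT 0x80 */
        printf("%d\n", evtinj_is_soft(EVTINJ_VALID | EVTINJ_TYPE_EXEPT | 3));   /* #BP */
        printf("%d\n", evtinj_is_soft(EVTINJ_VALID | EVTINJ_TYPE_NMI));         /* NMI */
        return 0;
}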
+ */ + if (svm->nrips_enabled) + vmcb02->control.next_rip = svm->nested.ctl.next_rip; + else if (boot_cpu_has(X86_FEATURE_NRIPS)) + vmcb02->control.next_rip = vmcb12_rip; + + svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj); + if (is_evtinj_soft(vmcb02->control.event_inj)) { + svm->soft_int_injected = true; + svm->soft_int_csbase = vmcb12_csbase; + svm->soft_int_old_rip = vmcb12_rip; + if (svm->nrips_enabled) + svm->soft_int_next_rip = svm->nested.ctl.next_rip; + else + svm->soft_int_next_rip = vmcb12_rip; + } + vmcb02->control.virt_ext = vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK; if (svm->lbrv_enabled) @@ -745,7 +802,7 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr); svm_switch_vmcb(svm, &svm->nested.vmcb02); - nested_vmcb02_prepare_control(svm); + nested_vmcb02_prepare_control(svm, vmcb12->save.rip, vmcb12->save.cs.base); nested_vmcb02_prepare_save(svm, vmcb12); ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3, @@ -834,6 +891,8 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) out_exit_err: svm->nested.nested_run_pending = 0; + svm->nmi_l1_to_l2 = false; + svm->soft_int_injected = false; svm->vmcb->control.exit_code = SVM_EXIT_ERR; svm->vmcb->control.exit_code_hi = 0; @@ -982,7 +1041,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS); } - if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { + if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) { WARN_ON(!svm->tsc_scaling_enabled); vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); @@ -1421,6 +1480,7 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst, dst->nested_ctl = from->nested_ctl; dst->event_inj = from->event_inj; dst->event_inj_err = from->event_inj_err; + dst->next_rip = from->next_rip; dst->nested_cr3 = from->nested_cr3; dst->virt_ext = from->virt_ext; dst->pause_filter_count = from->pause_filter_count; @@ -1605,7 +1665,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, nested_copy_vmcb_control_to_cache(svm, ctl); svm_switch_vmcb(svm, &svm->nested.vmcb02); - nested_vmcb02_prepare_control(svm); + nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip, svm->vmcb->save.cs.base); /* * While the nested guest CR3 is already checked and set by diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 136039fc6d01..f24613a108c5 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -33,34 +33,6 @@ enum index { INDEX_ERROR, }; -/* duplicated from amd_perfmon_event_map, K7 and above should work. */ -static struct kvm_event_hw_type_mapping amd_event_mapping[] = { - [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES }, - [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, - [2] = { 0x7d, 0x07, PERF_COUNT_HW_CACHE_REFERENCES }, - [3] = { 0x7e, 0x07, PERF_COUNT_HW_CACHE_MISSES }, - [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, - [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, - [6] = { 0xd0, 0x00, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, - [7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, -}; - -/* duplicated from amd_f17h_perfmon_event_map. 
*/ -static struct kvm_event_hw_type_mapping amd_f17h_event_mapping[] = { - [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES }, - [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, - [2] = { 0x60, 0xff, PERF_COUNT_HW_CACHE_REFERENCES }, - [3] = { 0x64, 0x09, PERF_COUNT_HW_CACHE_MISSES }, - [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, - [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, - [6] = { 0x87, 0x02, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, - [7] = { 0x87, 0x01, PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, -}; - -/* amd_pmc_perf_hw_id depends on these being the same size */ -static_assert(ARRAY_SIZE(amd_event_mapping) == - ARRAY_SIZE(amd_f17h_event_mapping)); - static unsigned int get_msr_base(struct kvm_pmu *pmu, enum pmu_type type) { struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); @@ -154,31 +126,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, return &pmu->gp_counters[msr_to_index(msr)]; } -static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc) +static bool amd_hw_event_available(struct kvm_pmc *pmc) { - struct kvm_event_hw_type_mapping *event_mapping; - u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT; - u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; - int i; - - /* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */ - if (WARN_ON(pmc_is_fixed(pmc))) - return PERF_COUNT_HW_MAX; - - if (guest_cpuid_family(pmc->vcpu) >= 0x17) - event_mapping = amd_f17h_event_mapping; - else - event_mapping = amd_event_mapping; - - for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++) - if (event_mapping[i].eventsel == event_select - && event_mapping[i].unit_mask == unit_mask) - break; - - if (i == ARRAY_SIZE(amd_event_mapping)) - return PERF_COUNT_HW_MAX; - - return event_mapping[i].event_type; + return true; } /* check if a PMC is enabled by comparing it against global_ctrl bits. Because @@ -286,8 +236,10 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL); if (pmc) { data &= ~pmu->reserved_bits; - if (data != pmc->eventsel) - reprogram_gp_counter(pmc, data); + if (data != pmc->eventsel) { + pmc->eventsel = data; + reprogram_counter(pmc); + } return 0; } @@ -343,7 +295,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) } struct kvm_pmu_ops amd_pmu_ops __initdata = { - .pmc_perf_hw_id = amd_pmc_perf_hw_id, + .hw_event_available = amd_hw_event_available, .pmc_is_enabled = amd_pmc_is_enabled, .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0c240ed04f96..b0e793e7d85c 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -603,6 +603,9 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) save->xss = svm->vcpu.arch.ia32_xss; save->dr6 = svm->vcpu.arch.dr6; + pr_debug("Virtual Machine Save Area (VMSA):\n"); + print_hex_dump(KERN_CONT, "", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false); + return 0; } @@ -1606,38 +1609,35 @@ static int sev_lock_vcpus_for_migration(struct kvm *kvm, { struct kvm_vcpu *vcpu; unsigned long i, j; - bool first = true; kvm_for_each_vcpu(i, vcpu, kvm) { if (mutex_lock_killable_nested(&vcpu->mutex, role)) goto out_unlock; - if (first) { +#ifdef CONFIG_PROVE_LOCKING + if (!i) /* * Reset the role to one that avoids colliding with * the role used for the first vcpu mutex. 
*/ role = SEV_NR_MIGRATION_ROLES; - first = false; - } else { + else mutex_release(&vcpu->mutex.dep_map, _THIS_IP_); - } +#endif } return 0; out_unlock: - first = true; kvm_for_each_vcpu(j, vcpu, kvm) { if (i == j) break; - if (first) - first = false; - else +#ifdef CONFIG_PROVE_LOCKING + if (j) mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_); - +#endif mutex_unlock(&vcpu->mutex); } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 44bbf25dfeb9..38f873cb6f2c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -74,6 +74,8 @@ static uint64_t osvw_len = 4, osvw_status; static DEFINE_PER_CPU(u64, current_tsc_ratio); +#define X2APIC_MSR(x) (APIC_BASE_MSR + (x >> 4)) + static const struct svm_direct_access_msrs { u32 index; /* Index of the MSR */ bool always; /* True if intercept is initially cleared */ @@ -100,6 +102,38 @@ static const struct svm_direct_access_msrs { { .index = MSR_IA32_CR_PAT, .always = false }, { .index = MSR_AMD64_SEV_ES_GHCB, .always = true }, { .index = MSR_TSC_AUX, .always = false }, + { .index = X2APIC_MSR(APIC_ID), .always = false }, + { .index = X2APIC_MSR(APIC_LVR), .always = false }, + { .index = X2APIC_MSR(APIC_TASKPRI), .always = false }, + { .index = X2APIC_MSR(APIC_ARBPRI), .always = false }, + { .index = X2APIC_MSR(APIC_PROCPRI), .always = false }, + { .index = X2APIC_MSR(APIC_EOI), .always = false }, + { .index = X2APIC_MSR(APIC_RRR), .always = false }, + { .index = X2APIC_MSR(APIC_LDR), .always = false }, + { .index = X2APIC_MSR(APIC_DFR), .always = false }, + { .index = X2APIC_MSR(APIC_SPIV), .always = false }, + { .index = X2APIC_MSR(APIC_ISR), .always = false }, + { .index = X2APIC_MSR(APIC_TMR), .always = false }, + { .index = X2APIC_MSR(APIC_IRR), .always = false }, + { .index = X2APIC_MSR(APIC_ESR), .always = false }, + { .index = X2APIC_MSR(APIC_ICR), .always = false }, + { .index = X2APIC_MSR(APIC_ICR2), .always = false }, + + /* + * Note: + * AMD does not virtualize APIC TSC-deadline timer mode, but it is + * emulated by KVM. When setting APIC LVTT (0x832) register bit 18, + * the AVIC hardware would generate GP fault. Therefore, always + * intercept the MSR 0x832, and do not setup direct_access_msr. + */ + { .index = X2APIC_MSR(APIC_LVTTHMR), .always = false }, + { .index = X2APIC_MSR(APIC_LVTPC), .always = false }, + { .index = X2APIC_MSR(APIC_LVT0), .always = false }, + { .index = X2APIC_MSR(APIC_LVT1), .always = false }, + { .index = X2APIC_MSR(APIC_LVTERR), .always = false }, + { .index = X2APIC_MSR(APIC_TMICT), .always = false }, + { .index = X2APIC_MSR(APIC_TMCCT), .always = false }, + { .index = X2APIC_MSR(APIC_TDCR), .always = false }, { .index = MSR_INVALID, .always = false }, }; @@ -188,9 +222,6 @@ module_param(tsc_scaling, int, 0444); static bool avic; module_param(avic, bool, 0444); -static bool force_avic; -module_param_unsafe(force_avic, bool, 0444); - bool __read_mostly dump_invalid_vmcb; module_param(dump_invalid_vmcb, bool, 0644); @@ -342,9 +373,11 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) } -static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) +static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, + bool commit_side_effects) { struct vcpu_svm *svm = to_svm(vcpu); + unsigned long old_rflags; /* * SEV-ES does not expose the next RIP. 
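The new X2APIC_MSR() macro converts a xAPIC MMIO register offset into the corresponding x2APIC MSR: the MSR range starts at APIC_BASE_MSR (0x800) and each 16-byte MMIO register maps to one MSR, hence the shift by 4. A quick worked sketch using standard register offsets:

#include <stdio.h>

#define APIC_BASE_MSR 0x800
#define X2APIC_MSR(x) (APIC_BASE_MSR + ((x) >> 4))

/* Standard xAPIC MMIO register offsets. */
#define APIC_ID   0x20
#define APIC_ICR  0x300
#define APIC_LVTT 0x320

int main(void)
{
        printf("APIC_ID   -> MSR 0x%x\n", X2APIC_MSR(APIC_ID));    /* 0x802 */
        printf("APIC_ICR  -> MSR 0x%x\n", X2APIC_MSR(APIC_ICR));   /* 0x830 */
        printf("APIC_LVTT -> MSR 0x%x\n", X2APIC_MSR(APIC_LVTT));  /* 0x832, always intercepted */
        return 0;
}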
The RIP update is controlled by @@ -359,18 +392,75 @@ static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) } if (!svm->next_rip) { + if (unlikely(!commit_side_effects)) + old_rflags = svm->vmcb->save.rflags; + if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) return 0; + + if (unlikely(!commit_side_effects)) + svm->vmcb->save.rflags = old_rflags; } else { kvm_rip_write(vcpu, svm->next_rip); } done: - svm_set_interrupt_shadow(vcpu, 0); + if (likely(commit_side_effects)) + svm_set_interrupt_shadow(vcpu, 0); return 1; } +static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) +{ + return __svm_skip_emulated_instruction(vcpu, true); +} + +static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu) +{ + unsigned long rip, old_rip = kvm_rip_read(vcpu); + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * Due to architectural shortcomings, the CPU doesn't always provide + * NextRIP, e.g. if KVM intercepted an exception that occurred while + * the CPU was vectoring an INTO/INT3 in the guest. Temporarily skip + * the instruction even if NextRIP is supported to acquire the next + * RIP so that it can be shoved into the NextRIP field, otherwise + * hardware will fail to advance guest RIP during event injection. + * Drop the exception/interrupt if emulation fails and effectively + * retry the instruction, it's the least awful option. If NRIPS is + * in use, the skip must not commit any side effects such as clearing + * the interrupt shadow or RFLAGS.RF. + */ + if (!__svm_skip_emulated_instruction(vcpu, !nrips)) + return -EIO; + + rip = kvm_rip_read(vcpu); + + /* + * Save the injection information, even when using next_rip, as the + * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection + * doesn't complete due to a VM-Exit occurring while the CPU is + * vectoring the event. Decoding the instruction isn't guaranteed to + * work as there may be no backing instruction, e.g. if the event is + * being injected by L1 for L2, or if the guest is patching INT3 into + * a different instruction. + */ + svm->soft_int_injected = true; + svm->soft_int_csbase = svm->vmcb->save.cs.base; + svm->soft_int_old_rip = old_rip; + svm->soft_int_next_rip = rip; + + if (nrips) + kvm_rip_write(vcpu, old_rip); + + if (static_cpu_has(X86_FEATURE_NRIPS)) + svm->vmcb->control.next_rip = rip; + + return 0; +} + static void svm_queue_exception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -380,21 +470,9 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu) kvm_deliver_exception_payload(vcpu); - if (nr == BP_VECTOR && !nrips) { - unsigned long rip, old_rip = kvm_rip_read(vcpu); - - /* - * For guest debugging where we have to reinject #BP if some - * INT3 is guest-owned: - * Emulate nRIP by moving RIP forward. Will fail if injection - * raises a fault that is not intercepted. Still better than - * failing in all cases. 
- */ - (void)svm_skip_emulated_instruction(vcpu); - rip = kvm_rip_read(vcpu); - svm->int3_rip = rip + svm->vmcb->save.cs.base; - svm->int3_injected = rip - old_rip; - } + if (kvm_exception_is_soft(nr) && + svm_update_soft_interrupt_rip(vcpu)) + return; svm->vmcb->control.event_inj = nr | SVM_EVTINJ_VALID @@ -736,6 +814,29 @@ void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm) } } +void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) +{ + int i; + + if (intercept == svm->x2avic_msrs_intercepted) + return; + + if (avic_mode != AVIC_MODE_X2 || + !apic_x2apic_mode(svm->vcpu.arch.apic)) + return; + + for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) { + int index = direct_access_msrs[i].index; + + if ((index < APIC_BASE_MSR) || + (index > APIC_BASE_MSR + 0xff)) + continue; + set_msr_interception(&svm->vcpu, svm->msrpm, index, + !intercept, !intercept); + } + + svm->x2avic_msrs_intercepted = intercept; +} void svm_vcpu_free_msrpm(u32 *msrpm) { @@ -1231,7 +1332,7 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu) svm_init_osvw(vcpu); vcpu->arch.microcode_version = 0x01000065; - svm->tsc_ratio_msr = kvm_default_tsc_scaling_ratio; + svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio; if (sev_es_guest(vcpu->kvm)) sev_es_vcpu_reset(svm); @@ -1299,6 +1400,8 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) goto error_free_vmsa_page; } + svm->x2avic_msrs_intercepted = true; + svm->vmcb01.ptr = page_address(vmcb01_page); svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); svm_switch_vmcb(svm, &svm->vmcb01); @@ -2345,6 +2448,7 @@ static int task_switch_interception(struct kvm_vcpu *vcpu) kvm_clear_exception_queue(vcpu); break; case SVM_EXITINTINFO_TYPE_INTR: + case SVM_EXITINTINFO_TYPE_SOFT: kvm_clear_interrupt_queue(vcpu); break; default: @@ -3375,35 +3479,49 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; + + if (svm->nmi_l1_to_l2) + return; + vcpu->arch.hflags |= HF_NMI_MASK; if (!sev_es_guest(vcpu->kvm)) svm_set_intercept(svm, INTERCEPT_IRET); ++vcpu->stat.nmi_injections; } -static void svm_inject_irq(struct kvm_vcpu *vcpu) +static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) { struct vcpu_svm *svm = to_svm(vcpu); + u32 type; + + if (vcpu->arch.interrupt.soft) { + if (svm_update_soft_interrupt_rip(vcpu)) + return; - BUG_ON(!(gif_set(svm))); + type = SVM_EVTINJ_TYPE_SOFT; + } else { + type = SVM_EVTINJ_TYPE_INTR; + } - trace_kvm_inj_virq(vcpu->arch.interrupt.nr); + trace_kvm_inj_virq(vcpu->arch.interrupt.nr, + vcpu->arch.interrupt.soft, reinjected); ++vcpu->stat.irq_injections; svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | - SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; + SVM_EVTINJ_VALID | type; } void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, int trig_mode, int vector) { /* - * vcpu->arch.apicv_active must be read after vcpu->mode. + * apic->apicv_active must be read after vcpu->mode. * Pairs with smp_store_release in vcpu_enter_guest. */ bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE); - if (!READ_ONCE(vcpu->arch.apicv_active)) { + /* Note, this is called iff the local APIC is in-kernel. 
*/ + if (!READ_ONCE(vcpu->arch.apic->apicv_active)) { /* Process the interrupt via inject_pending_event */ kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); @@ -3668,15 +3786,49 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; } +static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector, + int type) +{ + bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT); + bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT); + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's + * associated with the original soft exception/interrupt. next_rip is + * cleared on all exits that can occur while vectoring an event, so KVM + * needs to manually set next_rip for re-injection. Unlike the !nrips + * case below, this needs to be done if and only if KVM is re-injecting + * the same event, i.e. if the event is a soft exception/interrupt, + * otherwise next_rip is unused on VMRUN. + */ + if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) && + kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase)) + svm->vmcb->control.next_rip = svm->soft_int_next_rip; + /* + * If NRIPS isn't enabled, KVM must manually advance RIP prior to + * injecting the soft exception/interrupt. That advancement needs to + * be unwound if vectoring didn't complete. Note, the new event may + * not be the injected event, e.g. if KVM injected an INTn, the INTn + * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will + * be the reported vectored event, but RIP still needs to be unwound. + */ + else if (!nrips && (is_soft || is_exception) && + kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase)) + kvm_rip_write(vcpu, svm->soft_int_old_rip); +} + static void svm_complete_interrupts(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); u8 vector; int type; u32 exitintinfo = svm->vmcb->control.exit_int_info; - unsigned int3_injected = svm->int3_injected; + bool nmi_l1_to_l2 = svm->nmi_l1_to_l2; + bool soft_int_injected = svm->soft_int_injected; - svm->int3_injected = 0; + svm->nmi_l1_to_l2 = false; + svm->soft_int_injected = false; /* * If we've made progress since setting HF_IRET_MASK, we've @@ -3701,9 +3853,13 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; + if (soft_int_injected) + svm_complete_soft_interrupt(vcpu, vector, type); + switch (type) { case SVM_EXITINTINFO_TYPE_NMI: vcpu->arch.nmi_injected = true; + svm->nmi_l1_to_l2 = nmi_l1_to_l2; break; case SVM_EXITINTINFO_TYPE_EXEPT: /* @@ -3712,18 +3868,6 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) if (vector == X86_TRAP_VC) break; - /* - * In case of software exceptions, do not reinject the vector, - * but re-execute the instruction instead. Rewind RIP first - * if we emulated INT3 before. 
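svm_complete_soft_interrupt() only fixes up RIP/next_rip when the event that got cut short is provably the one KVM queued, which it decides by comparing the current linear RIP (cs.base plus RIP) against the values saved at injection time. A condensed sketch of the two fixups, with the soft-exception type checks omitted; the struct and helper names here are local to the example:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct soft_int_state {
        uint64_t csbase;
        uint64_t old_rip;       /* RIP of the INTn/INT3 itself */
        uint64_t next_rip;      /* RIP of the following instruction */
};

/* Re-injection fixup: restore next_rip (nrips) or unwind the manual skip (!nrips). */
static void complete_soft_int(bool nrips, uint64_t cur_linear_rip,
                              const struct soft_int_state *s,
                              uint64_t *rip, uint64_t *vmcb_next_rip)
{
        if (nrips && cur_linear_rip == s->old_rip + s->csbase)
                *vmcb_next_rip = s->next_rip;
        else if (!nrips && cur_linear_rip == s->next_rip + s->csbase)
                *rip = s->old_rip;
}

int main(void)
{
        struct soft_int_state s = { .csbase = 0, .old_rip = 0x1000, .next_rip = 0x1002 };
        uint64_t rip = 0x1002, next_rip = 0;

        /* !nrips case: RIP was advanced past the INT3, so unwind it. */
        complete_soft_int(false, rip + s.csbase, &s, &rip, &next_rip);
        printf("rip unwound to 0x%llx\n", (unsigned long long)rip);

        /* nrips case: stuff the saved return address back into next_rip. */
        complete_soft_int(true, s.old_rip + s.csbase, &s, &rip, &next_rip);
        printf("next_rip restored to 0x%llx\n", (unsigned long long)next_rip);
        return 0;
}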
- */ - if (kvm_exception_is_soft(vector)) { - if (vector == BP_VECTOR && int3_injected && - kvm_is_linear_rip(vcpu, svm->int3_rip)) - kvm_rip_write(vcpu, - kvm_rip_read(vcpu) - int3_injected); - break; - } if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { u32 err = svm->vmcb->control.exit_int_info_err; kvm_requeue_exception_e(vcpu, vector, err); @@ -3734,9 +3878,13 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) case SVM_EXITINTINFO_TYPE_INTR: kvm_queue_interrupt(vcpu, vector, false); break; + case SVM_EXITINTINFO_TYPE_SOFT: + kvm_queue_interrupt(vcpu, vector, true); + break; default: break; } + } static void svm_cancel_injection(struct kvm_vcpu *vcpu) @@ -3952,7 +4100,7 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, hv_track_root_tdp(vcpu, root_hpa); cr3 = vcpu->arch.cr3; - } else if (vcpu->arch.mmu->root_role.level >= PT64_ROOT_4LEVEL) { + } else if (root_level >= PT64_ROOT_4LEVEL) { cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu); } else { /* PCID in the guest should be impossible with a 32-bit MMU. */ @@ -4013,16 +4161,10 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index) return true; } -static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) -{ - return 0; -} - static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_cpuid_entry2 *best; - struct kvm *kvm = vcpu->kvm; vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && boot_cpu_has(X86_FEATURE_XSAVE) && @@ -4049,19 +4191,11 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) /* For sev guests, the memory encryption bit is not reserved in CR3. */ if (sev_guest(vcpu->kvm)) { - best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0); + best = kvm_find_cpuid_entry(vcpu, 0x8000001F); if (best) vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); } - if (kvm_vcpu_apicv_active(vcpu)) { - /* - * AVIC does not work with an x2APIC mode guest. If the X2APIC feature - * is exposed to the guest, disable AVIC. 
- */ - if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC)) - kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_X2APIC); - } init_vmcb_after_set_cpuid(vcpu); } @@ -4673,11 +4807,11 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .enable_nmi_window = svm_enable_nmi_window, .enable_irq_window = svm_enable_irq_window, .update_cr8_intercept = svm_update_cr8_intercept, + .set_virtual_apic_mode = avic_set_virtual_apic_mode, .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons, .apicv_post_state_restore = avic_apicv_post_state_restore, - .get_mt_mask = svm_get_mt_mask, .get_exit_info = svm_get_exit_info, .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid, @@ -4773,7 +4907,7 @@ static __init void svm_set_cpu_caps(void) { kvm_set_cpu_caps(); - supported_xss = 0; + kvm_caps.supported_xss = 0; /* CPUID 0x80000001 and 0x8000000A (SVM features) */ if (nested) { @@ -4849,7 +4983,8 @@ static __init int svm_hardware_setup(void) init_msrpm_offsets(); - supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); + kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | + XFEATURE_MASK_BNDCSR); if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) kvm_enable_efer_bits(EFER_FFXSR); @@ -4859,11 +4994,11 @@ static __init int svm_hardware_setup(void) tsc_scaling = false; } else { pr_info("TSC scaling supported\n"); - kvm_has_tsc_control = true; + kvm_caps.has_tsc_control = true; } } - kvm_max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX; - kvm_tsc_scaling_ratio_frac_bits = 32; + kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX; + kvm_caps.tsc_scaling_ratio_frac_bits = 32; tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); @@ -4917,17 +5052,9 @@ static __init int svm_hardware_setup(void) nrips = false; } - enable_apicv = avic = avic && npt_enabled && (boot_cpu_has(X86_FEATURE_AVIC) || force_avic); + enable_apicv = avic = avic && avic_hardware_setup(&svm_x86_ops); - if (enable_apicv) { - if (!boot_cpu_has(X86_FEATURE_AVIC)) { - pr_warn("AVIC is not supported in CPUID but force enabled"); - pr_warn("Your system might crash and burn"); - } else - pr_info("AVIC enabled\n"); - - amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); - } else { + if (!enable_apicv) { svm_x86_ops.vcpu_blocking = NULL; svm_x86_ops.vcpu_unblocking = NULL; svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 9223ac100ef5..6a7686bf6900 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -29,13 +29,21 @@ #define IOPM_SIZE PAGE_SIZE * 3 #define MSRPM_SIZE PAGE_SIZE * 2 -#define MAX_DIRECT_ACCESS_MSRS 21 -#define MSRPM_OFFSETS 16 +#define MAX_DIRECT_ACCESS_MSRS 46 +#define MSRPM_OFFSETS 32 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; extern int vgif; extern bool intercept_smi; +enum avic_modes { + AVIC_MODE_NONE = 0, + AVIC_MODE_X1, + AVIC_MODE_X2, +}; + +extern enum avic_modes avic_mode; + /* * Clean bits in VMCB. 
* VMCB_ALL_CLEAN_MASK might also need to @@ -139,6 +147,7 @@ struct vmcb_ctrl_area_cached { u64 nested_ctl; u32 event_inj; u32 event_inj_err; + u64 next_rip; u64 nested_cr3; u64 virt_ext; u32 clean; @@ -228,9 +237,12 @@ struct vcpu_svm { bool nmi_singlestep; u64 nmi_singlestep_guest_rflags; + bool nmi_l1_to_l2; - unsigned int3_injected; - unsigned long int3_rip; + unsigned long soft_int_csbase; + unsigned long soft_int_old_rip; + unsigned long soft_int_next_rip; + bool soft_int_injected; /* optional nested SVM features that are enabled for this guest */ bool nrips_enabled : 1; @@ -264,6 +276,8 @@ struct vcpu_svm { struct vcpu_sev_es_state sev_es; bool guest_state_loaded; + + bool x2avic_msrs_intercepted; }; struct svm_cpu_data { @@ -509,6 +523,15 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE; } +static inline bool is_x2apic_msrpm_offset(u32 offset) +{ + /* 4 msrs per u8, and 4 u8 in u32 */ + u32 msr = offset * 16; + + return (msr >= APIC_BASE_MSR) && + (msr < (APIC_BASE_MSR + 0x100)); +} + /* svm.c */ #define MSR_INVALID 0xffffffffU @@ -534,6 +557,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value); int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code); void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, int read, int write); +void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool disable); void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, int trig_mode, int vec); @@ -603,6 +627,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops; /* avic.c */ +bool avic_hardware_setup(struct kvm_x86_ops *ops); int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); @@ -613,18 +638,16 @@ int avic_init_vcpu(struct vcpu_svm *svm); void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void avic_vcpu_put(struct kvm_vcpu *vcpu); void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu); -void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu); void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason); -void avic_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr); -void avic_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); -bool avic_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu); int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void avic_vcpu_blocking(struct kvm_vcpu *vcpu); void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); void avic_ring_doorbell(struct kvm_vcpu *vcpu); unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu); +void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu); + /* sev.c */ diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index de4762517569..2120d7c060a9 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -154,7 +154,7 @@ TRACE_EVENT(kvm_xen_hypercall, TRACE_EVENT(kvm_pio, TP_PROTO(unsigned int rw, unsigned int port, unsigned int size, - unsigned int count, void *data), + unsigned int count, const void *data), TP_ARGS(rw, port, size, count, data), TP_STRUCT__entry( @@ -333,18 +333,24 @@ TRACE_EVENT_KVM_EXIT(kvm_exit); * Tracepoint for kvm interrupt injection: */ TRACE_EVENT(kvm_inj_virq, - TP_PROTO(unsigned int irq), - TP_ARGS(irq), + TP_PROTO(unsigned int vector, bool soft, bool reinjected), + TP_ARGS(vector, soft, reinjected), TP_STRUCT__entry( - __field( unsigned int, irq ) + __field( unsigned int, vector ) + __field( 
bool, soft ) + __field( bool, reinjected ) ), TP_fast_assign( - __entry->irq = irq; + __entry->vector = vector; + __entry->soft = soft; + __entry->reinjected = reinjected; ), - TP_printk("irq %u", __entry->irq) + TP_printk("%s 0x%x%s", + __entry->soft ? "Soft/INTn" : "IRQ", __entry->vector, + __entry->reinjected ? " [reinjected]" : "") ); #define EXS(x) { x##_VECTOR, "#" #x } @@ -358,25 +364,30 @@ TRACE_EVENT(kvm_inj_virq, * Tracepoint for kvm interrupt injection: */ TRACE_EVENT(kvm_inj_exception, - TP_PROTO(unsigned exception, bool has_error, unsigned error_code), - TP_ARGS(exception, has_error, error_code), + TP_PROTO(unsigned exception, bool has_error, unsigned error_code, + bool reinjected), + TP_ARGS(exception, has_error, error_code, reinjected), TP_STRUCT__entry( __field( u8, exception ) __field( u8, has_error ) __field( u32, error_code ) + __field( bool, reinjected ) ), TP_fast_assign( __entry->exception = exception; __entry->has_error = has_error; __entry->error_code = error_code; + __entry->reinjected = reinjected; ), - TP_printk("%s (0x%x)", + TP_printk("%s%s%s%s%s", __print_symbolic(__entry->exception, kvm_trace_sym_exc), - /* FIXME: don't print error_code if not present */ - __entry->has_error ? __entry->error_code : 0) + !__entry->has_error ? "" : " (", + !__entry->has_error ? "" : __print_symbolic(__entry->error_code, { }), + !__entry->has_error ? "" : ")", + __entry->reinjected ? " [reinjected]" : "") ); /* @@ -1479,6 +1490,24 @@ TRACE_EVENT(kvm_avic_kick_vcpu_slowpath, __entry->icrh, __entry->icrl, __entry->index) ); +TRACE_EVENT(kvm_avic_doorbell, + TP_PROTO(u32 vcpuid, u32 apicid), + TP_ARGS(vcpuid, apicid), + + TP_STRUCT__entry( + __field(u32, vcpuid) + __field(u32, apicid) + ), + + TP_fast_assign( + __entry->vcpuid = vcpuid; + __entry->apicid = apicid; + ), + + TP_printk("vcpuid=%u, apicid=%u", + __entry->vcpuid, __entry->apicid) +); + TRACE_EVENT(kvm_hv_timer_state, TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use), TP_ARGS(vcpu_id, hv_timer_in_use), diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index c0e24826a86f..c5e5dfef69c7 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -6,6 +6,8 @@ #include "../lapic.h" #include "../x86.h" +#include "../pmu.h" +#include "../cpuid.h" extern bool __read_mostly enable_vpid; extern bool __read_mostly flexpriority_enabled; @@ -13,6 +15,7 @@ extern bool __read_mostly enable_ept; extern bool __read_mostly enable_unrestricted_guest; extern bool __read_mostly enable_ept_ad_bits; extern bool __read_mostly enable_pml; +extern bool __read_mostly enable_ipiv; extern int __read_mostly pt_mode; #define PT_MODE_SYSTEM 0 @@ -59,6 +62,7 @@ struct vmcs_config { u32 pin_based_exec_ctrl; u32 cpu_based_exec_ctrl; u32 cpu_based_2nd_exec_ctrl; + u64 cpu_based_3rd_exec_ctrl; u32 vmexit_ctrl; u32 vmentry_ctrl; struct nested_vmx_msrs nested; @@ -94,20 +98,17 @@ static inline bool cpu_has_vmx_posted_intr(void) static inline bool cpu_has_load_ia32_efer(void) { - return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_EFER) && - (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_EFER); + return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_EFER; } static inline bool cpu_has_load_perf_global_ctrl(void) { - return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && - (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); + return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; } static inline bool cpu_has_vmx_mpx(void) { - return 
(vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && - (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); + return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS; } static inline bool cpu_has_vmx_tpr_shadow(void) @@ -131,6 +132,12 @@ static inline bool cpu_has_secondary_exec_ctrls(void) CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; } +static inline bool cpu_has_tertiary_exec_ctrls(void) +{ + return vmcs_config.cpu_based_exec_ctrl & + CPU_BASED_ACTIVATE_TERTIARY_CONTROLS; +} + static inline bool cpu_has_vmx_virtualize_apic_accesses(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -276,6 +283,11 @@ static inline bool cpu_has_vmx_apicv(void) cpu_has_vmx_posted_intr(); } +static inline bool cpu_has_vmx_ipiv(void) +{ + return vmcs_config.cpu_based_3rd_exec_ctrl & TERTIARY_EXEC_IPI_VIRT; +} + static inline bool cpu_has_vmx_flexpriority(void) { return cpu_has_vmx_tpr_shadow() && @@ -363,7 +375,6 @@ static inline bool cpu_has_vmx_intel_pt(void) rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); return (vmx_msr & MSR_IA32_VMX_MISC_INTEL_PT) && (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) && - (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_IA32_RTIT_CTL) && (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL); } @@ -385,23 +396,31 @@ static inline bool vmx_pt_mode_is_host_guest(void) return pt_mode == PT_MODE_HOST_GUEST; } +static inline bool vmx_pebs_supported(void) +{ + return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept; +} + static inline u64 vmx_get_perf_capabilities(void) { - u64 perf_cap = 0; + u64 perf_cap = PMU_CAP_FW_WRITES; + u64 host_perf_cap = 0; if (!enable_pmu) - return perf_cap; + return 0; if (boot_cpu_has(X86_FEATURE_PDCM)) - rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap); + rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); - perf_cap &= PMU_CAP_LBR_FMT; + perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; - /* - * Since counters are virtualized, KVM would support full - * width counting unconditionally, even if the host lacks it. 
- */ - return PMU_CAP_FW_WRITES | perf_cap; + if (vmx_pebs_supported()) { + perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; + if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4) + perf_cap &= ~PERF_CAP_PEBS_BASELINE; + } + + return perf_cap; } static inline u64 vmx_supported_debugctl(void) @@ -417,4 +436,10 @@ static inline u64 vmx_supported_debugctl(void) return debugctl; } +static inline bool cpu_has_notify_vmexit(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_NOTIFY_VM_EXITING; +} + #endif /* __KVM_X86_VMX_CAPS_H */ diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 87e3dc10edf4..6a61b1ae7942 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -297,8 +297,10 @@ const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1); #if IS_ENABLED(CONFIG_HYPERV) __init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) { + vmcs_conf->cpu_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_EXEC_CTRL; vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL; vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC; + vmcs_conf->cpu_based_3rd_exec_ctrl = 0; vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 8d70f9aea94b..f886a8ff0342 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -50,6 +50,7 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs); */ #define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \ PIN_BASED_VMX_PREEMPTION_TIMER) +#define EVMCS1_UNSUPPORTED_EXEC_CTRL (CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) #define EVMCS1_UNSUPPORTED_2NDEXEC \ (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ab135f9ef52f..ddd4367d4826 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -311,11 +311,12 @@ static void free_nested(struct kvm_vcpu *vcpu) vmx->nested.cached_vmcs12 = NULL; kfree(vmx->nested.cached_shadow_vmcs12); vmx->nested.cached_shadow_vmcs12 = NULL; - /* Unpin physical memory we referred to in the vmcs02 */ - if (vmx->nested.apic_access_page) { - kvm_release_page_clean(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = NULL; - } + /* + * Unpin physical memory we referred to in the vmcs02. The APIC access + * page's backing page (yeah, confusing) shouldn't actually be accessed, + * and if it is written, the contents are irrelevant. 
+ */ + kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); vmx->nested.pi_desc = NULL; @@ -1223,7 +1224,7 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | /* reserved */ BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); - u64 vmx_basic = vmx->nested.msrs.basic; + u64 vmx_basic = vmcs_config.nested.basic; if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) return -EINVAL; @@ -1246,36 +1247,42 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) return 0; } -static int -vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index, + u32 **low, u32 **high) { - u64 supported; - u32 *lowp, *highp; - switch (msr_index) { case MSR_IA32_VMX_TRUE_PINBASED_CTLS: - lowp = &vmx->nested.msrs.pinbased_ctls_low; - highp = &vmx->nested.msrs.pinbased_ctls_high; + *low = &msrs->pinbased_ctls_low; + *high = &msrs->pinbased_ctls_high; break; case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - lowp = &vmx->nested.msrs.procbased_ctls_low; - highp = &vmx->nested.msrs.procbased_ctls_high; + *low = &msrs->procbased_ctls_low; + *high = &msrs->procbased_ctls_high; break; case MSR_IA32_VMX_TRUE_EXIT_CTLS: - lowp = &vmx->nested.msrs.exit_ctls_low; - highp = &vmx->nested.msrs.exit_ctls_high; + *low = &msrs->exit_ctls_low; + *high = &msrs->exit_ctls_high; break; case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - lowp = &vmx->nested.msrs.entry_ctls_low; - highp = &vmx->nested.msrs.entry_ctls_high; + *low = &msrs->entry_ctls_low; + *high = &msrs->entry_ctls_high; break; case MSR_IA32_VMX_PROCBASED_CTLS2: - lowp = &vmx->nested.msrs.secondary_ctls_low; - highp = &vmx->nested.msrs.secondary_ctls_high; + *low = &msrs->secondary_ctls_low; + *high = &msrs->secondary_ctls_high; break; default: BUG(); } +} + +static int +vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + u32 *lowp, *highp; + u64 supported; + + vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp); supported = vmx_control_msr(*lowp, *highp); @@ -1287,6 +1294,7 @@ vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) return -EINVAL; + vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp); *lowp = data; *highp = data >> 32; return 0; @@ -1300,10 +1308,8 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | /* reserved */ GENMASK_ULL(13, 9) | BIT_ULL(31); - u64 vmx_misc; - - vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, - vmx->nested.msrs.misc_high); + u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low, + vmcs_config.nested.misc_high); if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) return -EINVAL; @@ -1331,10 +1337,8 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) { - u64 vmx_ept_vpid_cap; - - vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps, - vmx->nested.msrs.vpid_caps); + u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps, + vmcs_config.nested.vpid_caps); /* Every bit is either reserved or a feature bit. 
*/ if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) @@ -1345,20 +1349,21 @@ static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) return 0; } -static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index) { - u64 *msr; - switch (msr_index) { case MSR_IA32_VMX_CR0_FIXED0: - msr = &vmx->nested.msrs.cr0_fixed0; - break; + return &msrs->cr0_fixed0; case MSR_IA32_VMX_CR4_FIXED0: - msr = &vmx->nested.msrs.cr4_fixed0; - break; + return &msrs->cr4_fixed0; default: BUG(); } +} + +static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index); /* * 1 bits (which indicates bits which "must-be-1" during VMX operation) @@ -1367,7 +1372,7 @@ static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) if (!is_bitwise_subset(data, *msr, -1ULL)) return -EINVAL; - *msr = data; + *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data; return 0; } @@ -1428,7 +1433,7 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vmx->nested.msrs.vmcs_enum = data; return 0; case MSR_IA32_VMX_VMFUNC: - if (data & ~vmx->nested.msrs.vmfunc_controls) + if (data & ~vmcs_config.nested.vmfunc_controls) return -EINVAL; vmx->nested.msrs.vmfunc_controls = data; return 0; @@ -2133,6 +2138,8 @@ static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) { + struct kvm *kvm = vmx->vcpu.kvm; + /* * If vmcs02 hasn't been initialized, set the constant vmcs02 state * according to L0's settings (vmcs12 is irrelevant here). Host @@ -2175,6 +2182,9 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) if (cpu_has_vmx_encls_vmexit()) vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); + if (kvm_notify_vmexit_enabled(kvm)) + vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); + /* * Set the MSR load/store lists to match L0's settings. 
Only the * addresses are constant (for vmcs02), the counts can change based @@ -2514,11 +2524,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); } else { kvm_set_dr(vcpu, 7, vcpu->arch.dr7); - vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); + vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl); } if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) - vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); + vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs); vmx_set_rflags(vcpu, vmcs12->guest_rflags); /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the @@ -2547,7 +2557,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmx_get_l2_tsc_multiplier(vcpu)); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); - if (kvm_has_tsc_control) + if (kvm_caps.has_tsc_control) vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); @@ -2613,6 +2623,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && + intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, vmcs12->guest_ia32_perf_global_ctrl))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; @@ -3158,8 +3169,6 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_host_map *map; - struct page *page; - u64 hpa; if (!vcpu->arch.pdptrs_from_userspace && !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { @@ -3174,23 +3183,12 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { - /* - * Translate L1 physical address to host physical - * address for vmcs02. Keep the page pinned, so this - * physical address remains valid. We keep a reference - * to it so we can release it later. 
- */ - if (vmx->nested.apic_access_page) { /* shouldn't happen */ - kvm_release_page_clean(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = NULL; - } - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); - if (!is_error_page(page)) { - vmx->nested.apic_access_page = page; - hpa = page_to_phys(vmx->nested.apic_access_page); - vmcs_write64(APIC_ACCESS_ADDR, hpa); + map = &vmx->nested.apic_access_page_map; + + if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) { + vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn)); } else { - pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n", + pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n", __func__); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = @@ -3373,11 +3371,13 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); - if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) - vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + if (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) + vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); if (kvm_mpx_supported() && - !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) - vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); + (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) + vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); /* * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* @@ -4096,8 +4096,6 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); vmcs12->guest_pending_dbg_exceptions = vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); - if (kvm_mpx_supported()) - vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; } @@ -4336,7 +4334,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); vcpu->arch.pat = vmcs12->host_ia32_pat; } - if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) + if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && + intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, vmcs12->host_ia32_perf_global_ctrl)); @@ -4609,7 +4608,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); - if (kvm_has_tsc_control) + if (kvm_caps.has_tsc_control) vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); if (vmx->nested.l1_tpr_threshold != -1) @@ -4626,10 +4625,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, } /* Unpin physical memory we referred to in vmcs02 */ - if (vmx->nested.apic_access_page) { - kvm_release_page_clean(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = NULL; - } + kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); vmx->nested.pi_desc = NULL; @@ -4828,28 +4824,6 @@ int get_vmx_mem_address(struct kvm_vcpu 
*vcpu, unsigned long exit_qualification, return 0; } -void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu, - bool vcpu_has_perf_global_ctrl) -{ - struct vcpu_vmx *vmx; - - if (!nested_vmx_allowed(vcpu)) - return; - - vmx = to_vmx(vcpu); - if (vcpu_has_perf_global_ctrl) { - vmx->nested.msrs.entry_ctls_high |= - VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; - vmx->nested.msrs.exit_ctls_high |= - VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; - } else { - vmx->nested.msrs.entry_ctls_high &= - ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; - vmx->nested.msrs.exit_ctls_high &= - ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; - } -} - static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, int *ret) { @@ -4952,7 +4926,7 @@ out_vmcs02: } /* Emulate the VMXON instruction. */ -static int handle_vmon(struct kvm_vcpu *vcpu) +static int handle_vmxon(struct kvm_vcpu *vcpu) { int ret; gpa_t vmptr; @@ -4962,20 +4936,25 @@ static int handle_vmon(struct kvm_vcpu *vcpu) | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; /* - * The Intel VMX Instruction Reference lists a bunch of bits that are - * prerequisite to running VMXON, most notably cr4.VMXE must be set to - * 1 (see vmx_is_valid_cr4() for when we allow the guest to set this). - * Otherwise, we should fail with #UD. But most faulting conditions - * have already been checked by hardware, prior to the VM-exit for - * VMXON. We do test guest cr4.VMXE because processor CR4 always has - * that bit set to 1 in non-root mode. + * Note, KVM cannot rely on hardware to perform the CR0/CR4 #UD checks + * that have higher priority than VM-Exit (see Intel SDM's pseudocode + * for VMXON), as KVM must load valid CR0/CR4 values into hardware while + * running the guest, i.e. KVM needs to check the _guest_ values. + * + * Rely on hardware for the other two pre-VM-Exit checks, !VM86 and + * !COMPATIBILITY modes. KVM may run the guest in VM86 to emulate Real + * Mode, but KVM will never take the guest out of those modes. */ - if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { + if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || + !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - /* CPL=0 must be checked manually. */ + /* + * CPL=0 and all other checks that are lower priority than VM-Exit must + * be checked manually. 
+ */ if (vmx_get_cpl(vcpu)) { kvm_inject_gp(vcpu, 0); return 1; @@ -5044,7 +5023,7 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) } /* Emulate the VMXOFF instruction */ -static int handle_vmoff(struct kvm_vcpu *vcpu) +static int handle_vmxoff(struct kvm_vcpu *vcpu) { if (!nested_vmx_check_permission(vcpu)) return 1; @@ -6111,6 +6090,9 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); case EXIT_REASON_ENCLS: return nested_vmx_exit_handled_encls(vcpu, vmcs12); + case EXIT_REASON_NOTIFY: + /* Notify VM exit is not exposed to L1 */ + return false; default: return true; } @@ -6775,6 +6757,9 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); + if (vmx_umip_emulated()) + msrs->cr4_fixed1 |= X86_CR4_UMIP; + msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); } @@ -6818,8 +6803,8 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; - exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff; - exit_handlers[EXIT_REASON_VMON] = handle_vmon; + exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff; + exit_handlers[EXIT_REASON_VMON] = handle_vmxon; exit_handlers[EXIT_REASON_INVEPT] = handle_invept; exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index c92cea0b8ccc..88b00a7359e4 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -32,8 +32,6 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, u32 vmx_instruction_info, bool wr, int len, gva_t *ret); -void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu, - bool vcpu_has_perf_global_ctrl); void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu); bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, int size); @@ -281,7 +279,8 @@ static inline bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0; u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1; - return fixed_bits_valid(val, fixed0, fixed1); + return fixed_bits_valid(val, fixed0, fixed1) && + __kvm_is_valid_cr4(vcpu, val); } /* No difference in the restrictions on guest and host CR4 in VMX operation. 
*/ diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 37e9eb32e3d9..862c1a4d971b 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -37,23 +37,35 @@ static int fixed_pmc_events[] = {1, 0, 7}; static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) { + struct kvm_pmc *pmc; + u8 old_fixed_ctr_ctrl = pmu->fixed_ctr_ctrl; int i; + pmu->fixed_ctr_ctrl = data; for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { u8 new_ctrl = fixed_ctrl_field(data, i); - u8 old_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, i); - struct kvm_pmc *pmc; - - pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); + u8 old_ctrl = fixed_ctrl_field(old_fixed_ctr_ctrl, i); if (old_ctrl == new_ctrl) continue; + pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); + __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use); - reprogram_fixed_counter(pmc, new_ctrl, i); + reprogram_counter(pmc); } +} - pmu->fixed_ctr_ctrl = data; +static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) +{ + if (pmc_idx < INTEL_PMC_IDX_FIXED) { + return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx, + MSR_P6_EVNTSEL0); + } else { + u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED; + + return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0); + } } /* function is called when global control register has been updated. */ @@ -61,14 +73,18 @@ static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) { int bit; u64 diff = pmu->global_ctrl ^ data; + struct kvm_pmc *pmc; pmu->global_ctrl = data; - for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) - reprogram_counter(pmu, bit); + for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) { + pmc = intel_pmc_idx_to_pmc(pmu, bit); + if (pmc) + reprogram_counter(pmc); + } } -static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) +static bool intel_hw_event_available(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT; @@ -82,15 +98,12 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) /* disable event that reported as not present by cpuid */ if ((i < 7) && !(pmu->available_event_types & (1 << i))) - return PERF_COUNT_HW_MAX + 1; + return false; break; } - if (i == ARRAY_SIZE(intel_arch_events)) - return PERF_COUNT_HW_MAX; - - return intel_arch_events[i].event_type; + return true; } /* check if a PMC is enabled by comparing it with globl_ctrl bits. 
*/ @@ -98,19 +111,10 @@ static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); - return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); -} - -static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) -{ - if (pmc_idx < INTEL_PMC_IDX_FIXED) - return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx, - MSR_P6_EVNTSEL0); - else { - u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED; + if (!intel_pmu_has_perf_global_ctrl(pmu)) + return true; - return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0); - } + return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); } static bool intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) @@ -167,16 +171,6 @@ static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) return get_gp_pmc(pmu, msr, MSR_IA32_PMC0); } -bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu) -{ - /* - * As a first step, a guest could only enable LBR feature if its - * cpu model is the same as the host because the LBR registers - * would be pass-through to the guest and they're model specific. - */ - return boot_cpu_data.x86_model == guest_cpuid_model(vcpu); -} - bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu) { struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu); @@ -205,6 +199,7 @@ static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index) static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u64 perf_capabilities; int ret; switch (msr) { @@ -212,7 +207,18 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) case MSR_CORE_PERF_GLOBAL_STATUS: case MSR_CORE_PERF_GLOBAL_CTRL: case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - ret = pmu->version > 1; + return intel_pmu_has_perf_global_ctrl(pmu); + break; + case MSR_IA32_PEBS_ENABLE: + ret = vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PEBS_FORMAT; + break; + case MSR_IA32_DS_AREA: + ret = guest_cpuid_has(vcpu, X86_FEATURE_DS); + break; + case MSR_PEBS_DATA_CFG: + perf_capabilities = vcpu_get_perf_capabilities(vcpu); + ret = (perf_capabilities & PERF_CAP_PEBS_BASELINE) && + ((perf_capabilities & PERF_CAP_PEBS_FORMAT) > 3); break; default: ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || @@ -361,6 +367,15 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CORE_PERF_GLOBAL_OVF_CTRL: msr_info->data = 0; return 0; + case MSR_IA32_PEBS_ENABLE: + msr_info->data = pmu->pebs_enable; + return 0; + case MSR_IA32_DS_AREA: + msr_info->data = pmu->ds_area; + return 0; + case MSR_PEBS_DATA_CFG: + msr_info->data = pmu->pebs_data_cfg; + return 0; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { @@ -395,7 +410,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CORE_PERF_FIXED_CTR_CTRL: if (pmu->fixed_ctr_ctrl == data) return 0; - if (!(data & 0xfffffffffffff444ull)) { + if (!(data & pmu->fixed_ctr_ctrl_mask)) { reprogram_fixed_counters(pmu, data); return 0; } @@ -421,6 +436,29 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } break; + case MSR_IA32_PEBS_ENABLE: + if (pmu->pebs_enable == data) + return 0; + if (!(data & pmu->pebs_enable_mask)) { + pmu->pebs_enable = data; + return 0; + } + break; + case MSR_IA32_DS_AREA: + if (msr_info->host_initiated && data && !guest_cpuid_has(vcpu, X86_FEATURE_DS)) + return 1; + if (is_noncanonical_address(data, vcpu)) + return 1; + pmu->ds_area = data; + return 0; + case 
MSR_PEBS_DATA_CFG: + if (pmu->pebs_data_cfg == data) + return 0; + if (!(data & pmu->pebs_data_cfg_mask)) { + pmu->pebs_data_cfg = data; + return 0; + } + break; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { @@ -445,7 +483,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) (pmu->raw_event_mask & HSW_IN_TX_CHECKPOINTED)) reserved_bits ^= HSW_IN_TX_CHECKPOINTED; if (!(data & reserved_bits)) { - reprogram_gp_counter(pmc, data); + pmc->eventsel = data; + reprogram_counter(pmc); return 0; } } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) @@ -474,11 +513,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); - - struct x86_pmu_capability x86_pmu; struct kvm_cpuid_entry2 *entry; union cpuid10_eax eax; union cpuid10_edx edx; + u64 perf_capabilities; + u64 counter_mask; + int i; pmu->nr_arch_gp_counters = 0; pmu->nr_arch_fixed_counters = 0; @@ -487,8 +527,13 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->version = 0; pmu->reserved_bits = 0xffffffff00200000ull; pmu->raw_event_mask = X86_RAW_EVENT_MASK; + pmu->global_ctrl_mask = ~0ull; + pmu->global_ovf_ctrl_mask = ~0ull; + pmu->fixed_ctr_ctrl_mask = ~0ull; + pmu->pebs_enable_mask = ~0ull; + pmu->pebs_data_cfg_mask = ~0ull; - entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); + entry = kvm_find_cpuid_entry(vcpu, 0xa); if (!entry || !vcpu->kvm->arch.enable_pmu) return; eax.full = entry->eax; @@ -498,13 +543,13 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (!pmu->version) return; - perf_get_x86_pmu_capability(&x86_pmu); - pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, - x86_pmu.num_counters_gp); - eax.split.bit_width = min_t(int, eax.split.bit_width, x86_pmu.bit_width_gp); + kvm_pmu_cap.num_counters_gp); + eax.split.bit_width = min_t(int, eax.split.bit_width, + kvm_pmu_cap.bit_width_gp); pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; - eax.split.mask_length = min_t(int, eax.split.mask_length, x86_pmu.events_mask_len); + eax.split.mask_length = min_t(int, eax.split.mask_length, + kvm_pmu_cap.events_mask_len); pmu->available_event_types = ~entry->ebx & ((1ull << eax.split.mask_length) - 1); @@ -514,17 +559,19 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->nr_arch_fixed_counters = min3(ARRAY_SIZE(fixed_pmc_events), (size_t) edx.split.num_counters_fixed, - (size_t) x86_pmu.num_counters_fixed); - edx.split.bit_width_fixed = min_t(int, - edx.split.bit_width_fixed, x86_pmu.bit_width_fixed); + (size_t)kvm_pmu_cap.num_counters_fixed); + edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed, + kvm_pmu_cap.bit_width_fixed); pmu->counter_bitmask[KVM_PMC_FIXED] = ((u64)1 << edx.split.bit_width_fixed) - 1; setup_fixed_pmc_eventsel(pmu); } - pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) | - (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); - pmu->global_ctrl_mask = ~pmu->global_ctrl; + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) + pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4)); + counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | + (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED)); + pmu->global_ctrl_mask = counter_mask; pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF | MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD); @@ -532,7 +579,7 @@ static void 
intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->global_ovf_ctrl_mask &= ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; - entry = kvm_find_cpuid_entry(vcpu, 7, 0); + entry = kvm_find_cpuid_entry_index(vcpu, 7, 0); if (entry && (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) { @@ -545,16 +592,29 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); - nested_vmx_pmu_refresh(vcpu, - intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)); - - if (intel_pmu_lbr_is_compatible(vcpu)) + if (cpuid_model_is_consistent(vcpu)) x86_perf_get_lbr(&lbr_desc->records); else lbr_desc->records.nr = 0; if (lbr_desc->records.nr) bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1); + + perf_capabilities = vcpu_get_perf_capabilities(vcpu); + if (perf_capabilities & PERF_CAP_PEBS_FORMAT) { + if (perf_capabilities & PERF_CAP_PEBS_BASELINE) { + pmu->pebs_enable_mask = counter_mask; + pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE; + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + pmu->fixed_ctr_ctrl_mask &= + ~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4)); + } + pmu->pebs_data_cfg_mask = ~0xff00000full; + } else { + pmu->pebs_enable_mask = + ~((1ull << pmu->nr_arch_gp_counters) - 1); + } + } } static void intel_pmu_init(struct kvm_vcpu *vcpu) @@ -719,8 +779,28 @@ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) intel_pmu_release_guest_lbr_event(vcpu); } +void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) +{ + struct kvm_pmc *pmc = NULL; + int bit; + + for_each_set_bit(bit, (unsigned long *)&pmu->global_ctrl, + X86_PMC_IDX_MAX) { + pmc = intel_pmc_idx_to_pmc(pmu, bit); + + if (!pmc || !pmc_speculative_in_use(pmc) || + !intel_pmc_is_enabled(pmc)) + continue; + + if (pmc->perf_event && pmc->idx != pmc->perf_event->hw.idx) { + pmu->host_cross_mapped_mask |= + BIT_ULL(pmc->perf_event->hw.idx); + } + } +} + struct kvm_pmu_ops intel_pmu_ops __initdata = { - .pmc_perf_hw_id = intel_pmc_perf_hw_id, + .hw_event_available = intel_hw_event_available, .pmc_is_enabled = intel_pmc_is_enabled, .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 07e5fcf5a5aa..1b56c5e5c9fb 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -34,7 +34,7 @@ static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) return &(to_vmx(vcpu)->pi_desc); } -static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new) +static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new) { /* * PID.ON can be set at any time by a different vCPU or by hardware, @@ -42,7 +42,7 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new) * update must be retried with a fresh snapshot an ON change causes * the cmpxchg to fail. */ - if (cmpxchg64(&pi_desc->control, old, new) != old) + if (!try_cmpxchg64(&pi_desc->control, pold, new)) return -EBUSY; return 0; @@ -96,8 +96,9 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) if (!x2apic_mode) dest = (dest << 8) & 0xFF00; + old.control = READ_ONCE(pi_desc->control); do { - old.control = new.control = READ_ONCE(pi_desc->control); + new.control = old.control; /* * Clear SN (as above) and refresh the destination APIC ID to @@ -111,7 +112,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) * descriptor was modified on "put" to use the wakeup vector. 
*/ new.nv = POSTED_INTR_VECTOR; - } while (pi_try_set_control(pi_desc, old.control, new.control)); + } while (pi_try_set_control(pi_desc, &old.control, new.control)); local_irq_restore(flags); @@ -156,12 +157,12 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) WARN(pi_desc->sn, "PI descriptor SN field set before blocking"); + old.control = READ_ONCE(pi_desc->control); do { - old.control = new.control = READ_ONCE(pi_desc->control); - /* set 'NV' to 'wakeup vector' */ + new.control = old.control; new.nv = POSTED_INTR_WAKEUP_VECTOR; - } while (pi_try_set_control(pi_desc, old.control, new.control)); + } while (pi_try_set_control(pi_desc, &old.control, new.control)); /* * Send a wakeup IPI to this CPU if an interrupt may have been posted @@ -177,11 +178,24 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) local_irq_restore(flags); } +static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu) +{ + /* + * The default posted interrupt vector does nothing when + * invoked outside guest mode. Return whether a blocked vCPU + * can be the target of posted interrupts, as is the case when + * using either IPI virtualization or VT-d PI, so that the + * notification vector is switched to the one that calls + * back to the pi_wakeup_handler() function. + */ + return vmx_can_use_ipiv(vcpu) || vmx_can_use_vtd_pi(vcpu->kvm); +} + void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - if (!vmx_can_use_vtd_pi(vcpu->kvm)) + if (!vmx_needs_pi_wakeup(vcpu)) return; if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu)) diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 9a45d5c9f116..26992076552e 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -5,6 +5,8 @@ #define POSTED_INTR_ON 0 #define POSTED_INTR_SN 1 +#define PID_TABLE_ENTRY_VALID 1 + /* Posted-Interrupt Descriptor */ struct pi_desc { u32 pir[8]; /* Posted interrupt requested */ diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 35e7ec91ae86..aba8cebdc587 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -79,7 +79,7 @@ static int sgx_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t gva, bool write, else *gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, &ex); - if (*gpa == UNMAPPED_GVA) { + if (*gpa == INVALID_GPA) { kvm_inject_emulated_page_fault(vcpu, &ex); return -EFAULT; } @@ -148,8 +148,8 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu, u8 max_size_log2; int trapnr, ret; - sgx_12_0 = kvm_find_cpuid_entry(vcpu, 0x12, 0); - sgx_12_1 = kvm_find_cpuid_entry(vcpu, 0x12, 1); + sgx_12_0 = kvm_find_cpuid_entry_index(vcpu, 0x12, 0); + sgx_12_1 = kvm_find_cpuid_entry_index(vcpu, 0x12, 1); if (!sgx_12_0 || !sgx_12_1) { kvm_prepare_emulation_failure_exit(vcpu); return 0; @@ -431,7 +431,7 @@ static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu) if (!vcpu->kvm->arch.sgx_provisioning_allowed) return true; - guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 0); + guest_cpuid = kvm_find_cpuid_entry_index(vcpu, 0x12, 0); if (!guest_cpuid) return true; @@ -439,7 +439,7 @@ static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu) if (guest_cpuid->ebx != ebx || guest_cpuid->edx != edx) return true; - guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 1); + guest_cpuid = kvm_find_cpuid_entry_index(vcpu, 0x12, 1); if (!guest_cpuid) return true; diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 2b9d7a7e83f7..ac290a44a693 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h 
@@ -50,6 +50,7 @@ struct vmcs_controls_shadow { u32 pin; u32 exec; u32 secondary_exec; + u64 tertiary_exec; }; /* diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index be7c19374fdd..d7f8331d6f7e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -105,6 +105,9 @@ module_param(fasteoi, bool, S_IRUGO); module_param(enable_apicv, bool, S_IRUGO); +bool __read_mostly enable_ipiv = true; +module_param(enable_ipiv, bool, 0444); + /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not @@ -116,6 +119,9 @@ module_param(nested, bool, S_IRUGO); bool __read_mostly enable_pml = 1; module_param_named(pml, enable_pml, bool, S_IRUGO); +static bool __read_mostly error_on_inconsistent_vmcs_config = true; +module_param(error_on_inconsistent_vmcs_config, bool, 0444); + static bool __read_mostly dump_invalid_vmcs = 0; module_param(dump_invalid_vmcs, bool, 0644); @@ -443,18 +449,20 @@ asmlinkage void vmread_error(unsigned long field, bool fault) noinline void vmwrite_error(unsigned long field, unsigned long value) { - vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n", + vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%u\n", field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr) { - vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr); + vmx_insn_failed("kvm: vmclear failed: %p/%llx err=%u\n", + vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr) { - vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr); + vmx_insn_failed("kvm: vmptrld failed: %p/%llx err=%u\n", + vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) @@ -1787,7 +1795,7 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) return vmcs12->tsc_multiplier; - return kvm_default_tsc_scaling_ratio; + return kvm_caps.default_tsc_scaling_ratio; } static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) @@ -2111,6 +2119,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (is_noncanonical_address(data & PAGE_MASK, vcpu) || (data & MSR_IA32_BNDCFGS_RSVD)) return 1; + + if (is_guest_mode(vcpu) && + ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) || + (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS))) + get_vmcs12(vcpu)->guest_bndcfgs = data; + vmcs_write64(GUEST_BNDCFGS, data); break; case MSR_IA32_UMWAIT_CONTROL: @@ -2312,7 +2326,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if ((data & PMU_CAP_LBR_FMT) != (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)) return 1; - if (!intel_pmu_lbr_is_compatible(vcpu)) + if (!cpuid_model_is_consistent(vcpu)) + return 1; + } + if (data & PERF_CAP_PEBS_FORMAT) { + if ((data & PERF_CAP_PEBS_MASK) != + (vmx_get_perf_capabilities() & PERF_CAP_PEBS_MASK)) + return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_DS)) + return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64)) + return 1; + if (!cpuid_model_is_consistent(vcpu)) return 1; } ret = kvm_set_msr_common(vcpu, msr_info); @@ -2489,6 +2514,15 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, return 0; } +static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) +{ + u64 allowed; + + rdmsrl(msr, allowed); + + return ctl_opt & allowed; +} + 
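[Editorial sketch, not part of the patch] The new adjust_vmx_controls64() above is deliberately simpler than the existing 32-bit adjust_vmx_controls() used elsewhere in setup_vmcs_config(): the pin-based, processor-based and secondary control capability MSRs split their 64 bits into "must be 1" settings (bits 31:0) and "allowed to be 1" settings (bits 63:32), whereas MSR_IA32_VMX_PROCBASED_CTLS3 is a plain 64-bit allowed-1 bitmap with no mandatory bits, so masking with the capability value is all that is needed. The following is a minimal, userspace-style illustration of the two schemes under those assumptions; the helper names adjust_ctls32()/adjust_ctls64() and the stand-alone form are invented for illustration and do not appear in the patch.

    #include <stdint.h>

    /*
     * Illustrative sketch only (not kernel code): clamp a requested set of
     * VMX controls against the raw value read from a capability MSR.
     */

    /* 32-bit controls: low half = must-be-1 bits, high half = may-be-1 bits. */
    static int adjust_ctls32(uint32_t min, uint32_t opt, uint64_t cap, uint32_t *out)
    {
            uint32_t must_be_1 = (uint32_t)cap;          /* allowed 0-settings  */
            uint32_t may_be_1  = (uint32_t)(cap >> 32);  /* allowed 1-settings  */
            uint32_t ctl = min | opt;

            ctl &= may_be_1;         /* drop bits the CPU cannot set to 1 */
            ctl |= must_be_1;        /* force bits the CPU requires to be 1 */

            if ((ctl & min) != min)  /* a mandatory feature is unsupported */
                    return -1;

            *out = ctl;
            return 0;
    }

    /* Tertiary controls: the whole 64-bit MSR is an allowed-1 bitmap. */
    static uint64_t adjust_ctls64(uint64_t opt, uint64_t cap)
    {
            return opt & cap;
    }

With that layout there is nothing to error-check for the tertiary controls: every bit defaults to 0 and an unsupported optional bit is simply dropped, which is why setup_vmcs_config() below only invokes the 64-bit helper (for TERTIARY_EXEC_IPI_VIRT) once CPU_BASED_ACTIVATE_TERTIARY_CONTROLS has been accepted in the primary controls.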
static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, struct vmx_capability *vmx_cap) { @@ -2497,8 +2531,26 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, u32 _pin_based_exec_control = 0; u32 _cpu_based_exec_control = 0; u32 _cpu_based_2nd_exec_control = 0; + u64 _cpu_based_3rd_exec_control = 0; u32 _vmexit_control = 0; u32 _vmentry_control = 0; + int i; + + /* + * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. + * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always + * intercepts writes to PAT and EFER, i.e. never enables those controls. + */ + struct { + u32 entry_control; + u32 exit_control; + } const vmcs_entry_exit_pairs[] = { + { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL }, + { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT }, + { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, + { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, + { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, + }; memset(vmcs_conf, 0, sizeof(*vmcs_conf)); min = CPU_BASED_HLT_EXITING | @@ -2518,7 +2570,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS | + CPU_BASED_ACTIVATE_TERTIARY_CONTROLS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, &_cpu_based_exec_control) < 0) return -EIO; @@ -2551,7 +2604,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX | SECONDARY_EXEC_ENABLE_VMFUNC | - SECONDARY_EXEC_BUS_LOCK_DETECTION; + SECONDARY_EXEC_BUS_LOCK_DETECTION | + SECONDARY_EXEC_NOTIFY_VM_EXITING; if (cpu_has_sgx()) opt2 |= SECONDARY_EXEC_ENCLS_EXITING; if (adjust_vmx_controls(min2, opt2, @@ -2581,15 +2635,30 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, CPU_BASED_CR3_STORE_EXITING | CPU_BASED_INVLPG_EXITING); } else if (vmx_cap->ept) { - vmx_cap->ept = 0; pr_warn_once("EPT CAP should not exist if not support " "1-setting enable EPT VM-execution control\n"); + + if (error_on_inconsistent_vmcs_config) + return -EIO; + + vmx_cap->ept = 0; } if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && - vmx_cap->vpid) { - vmx_cap->vpid = 0; + vmx_cap->vpid) { pr_warn_once("VPID CAP should not exist if not support " "1-setting enable VPID VM-execution control\n"); + + if (error_on_inconsistent_vmcs_config) + return -EIO; + + vmx_cap->vpid = 0; + } + + if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) { + u64 opt3 = TERTIARY_EXEC_IPI_VIRT; + + _cpu_based_3rd_exec_control = adjust_vmx_controls64(opt3, + MSR_IA32_VMX_PROCBASED_CTLS3); } min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; @@ -2630,6 +2699,23 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, &_vmentry_control) < 0) return -EIO; + for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) { + u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control; + u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control; + + if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl)) + continue; + + pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n", + _vmentry_control & n_ctrl, _vmexit_control & x_ctrl); + + if (error_on_inconsistent_vmcs_config) + return -EIO; + + _vmentry_control &= ~n_ctrl; + _vmexit_control &= ~x_ctrl; + } + /* * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they * can't be used due to an errata 
where VM Exit may incorrectly clear @@ -2678,6 +2764,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; + vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; vmcs_conf->vmexit_ctrl = _vmexit_control; vmcs_conf->vmentry_ctrl = _vmentry_control; @@ -3230,8 +3317,8 @@ static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { /* * We operate under the default treatment of SMM, so VMX cannot be - * enabled under SMM. Note, whether or not VMXE is allowed at all is - * handled by kvm_is_valid_cr4(). + * enabled under SMM. Note, whether or not VMXE is allowed at all, + * i.e. is a reserved bit, is handled by common x86 code. */ if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu)) return false; @@ -3702,7 +3789,7 @@ static int init_rmode_identity_map(struct kvm *kvm) } /* Set up identity-mapping pagetable for EPT in real mode */ - for (i = 0; i < PT32_ENT_PER_PAGE; i++) { + for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) { tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) { @@ -3932,6 +4019,8 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); + if (enable_ipiv) + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW); } } @@ -3977,20 +4066,26 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) u32 i; /* - * Set intercept permissions for all potentially passed through MSRs - * again. They will automatically get filtered through the MSR filter, - * so we are back in sync after this. + * Redo intercept permissions for MSRs that KVM is passing through to + * the guest. Disabling interception will check the new MSR filter and + * ensure that KVM enables interception if usersepace wants to filter + * the MSR. MSRs that KVM is already intercepting don't need to be + * refreshed since KVM is going to intercept them regardless of what + * userspace wants. */ for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { u32 msr = vmx_possible_passthrough_msrs[i]; - bool read = test_bit(i, vmx->shadow_msr_intercept.read); - bool write = test_bit(i, vmx->shadow_msr_intercept.write); - vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, read); - vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, write); + if (!test_bit(i, vmx->shadow_msr_intercept.read)) + vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R); + + if (!test_bit(i, vmx->shadow_msr_intercept.write)) + vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W); } - pt_update_intercept_for_msr(vcpu); + /* PT MSRs can be passed through iff PT is exposed to the guest. */ + if (vmx_pt_mode_is_host_guest()) + pt_update_intercept_for_msr(vcpu); } static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, @@ -4085,7 +4180,8 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (!r) return 0; - if (!vcpu->arch.apicv_active) + /* Note, this is called iff the local APIC is in-kernel. 
*/ + if (!vcpu->arch.apic->apicv_active) return -1; if (pi_test_and_set_pir(vector, &vmx->pi_desc)) @@ -4259,15 +4355,19 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) } pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); - if (cpu_has_secondary_exec_ctrls()) { - if (kvm_vcpu_apicv_active(vcpu)) - secondary_exec_controls_setbit(vmx, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - else - secondary_exec_controls_clearbit(vmx, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + + if (kvm_vcpu_apicv_active(vcpu)) { + secondary_exec_controls_setbit(vmx, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + if (enable_ipiv) + tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT); + } else { + secondary_exec_controls_clearbit(vmx, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + if (enable_ipiv) + tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT); } vmx_update_msr_bitmap_x2apic(vcpu); @@ -4299,6 +4399,20 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) return exec_control; } +static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx) +{ + u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl; + + /* + * IPI virtualization relies on APICv. Disable IPI virtualization if + * APICv is inhibited. + */ + if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu)) + exec_control &= ~TERTIARY_EXEC_IPI_VIRT; + + return exec_control; +} + /* * Adjust a single secondary execution control bit to intercept/allow an * instruction in the guest. This is usually done based on whether or not a @@ -4441,13 +4555,48 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) if (!vcpu->kvm->arch.bus_lock_detection_enabled) exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; + if (!kvm_notify_vmexit_enabled(vcpu->kvm)) + exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING; + return exec_control; } +static inline int vmx_get_pid_table_order(struct kvm *kvm) +{ + return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table)); +} + +static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) +{ + struct page *pages; + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + + if (!irqchip_in_kernel(kvm) || !enable_ipiv) + return 0; + + if (kvm_vmx->pid_table) + return 0; + + pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, vmx_get_pid_table_order(kvm)); + if (!pages) + return -ENOMEM; + + kvm_vmx->pid_table = (void *)page_address(pages); + return 0; +} + +static int vmx_vcpu_precreate(struct kvm *kvm) +{ + return vmx_alloc_ipiv_pid_table(kvm); +} + #define VMX_XSS_EXIT_BITMAP 0 static void init_vmcs(struct vcpu_vmx *vmx) { + struct kvm *kvm = vmx->vcpu.kvm; + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + if (nested) nested_vmx_set_vmcs_shadowing_bitmap(); @@ -4464,6 +4613,9 @@ static void init_vmcs(struct vcpu_vmx *vmx) if (cpu_has_secondary_exec_ctrls()) secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); + if (cpu_has_tertiary_exec_ctrls()) + tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); + if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); @@ -4476,12 +4628,20 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); } - if (!kvm_pause_in_guest(vmx->vcpu.kvm)) { + if (vmx_can_use_ipiv(&vmx->vcpu)) { + vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table)); + vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids 
- 1); + } + + if (!kvm_pause_in_guest(kvm)) { vmcs_write32(PLE_GAP, ple_gap); vmx->ple_window = ple_window; vmx->ple_window_dirty = true; } + if (kvm_notify_vmexit_enabled(kvm)) + vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ @@ -4652,13 +4812,13 @@ static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); } -static void vmx_inject_irq(struct kvm_vcpu *vcpu) +static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) { struct vcpu_vmx *vmx = to_vmx(vcpu); uint32_t intr; int irq = vcpu->arch.interrupt.nr; - trace_kvm_inj_virq(irq); + trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected); ++vcpu->stat.irq_injections; if (vmx->rmode.vm86_active) { @@ -5770,6 +5930,32 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) return 1; } +static int handle_notify(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qual = vmx_get_exit_qual(vcpu); + bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID; + + ++vcpu->stat.notify_window_exits; + + /* + * Notify VM exit happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. + */ + if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + + if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER || + context_invalid) { + vcpu->run->exit_reason = KVM_EXIT_NOTIFY; + vcpu->run->notify.flags = context_invalid ? + KVM_NOTIFY_CONTEXT_INVALID : 0; + return 0; + } + + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. 
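
The new handle_notify() above either swallows the exit or forwards it to userspace as KVM_EXIT_NOTIFY. A minimal userspace-side sketch of consuming that exit (illustrative only; handle_exit() and the surrounding run loop are hypothetical, the run-struct fields are the ones populated in the hunk above):

#include <linux/kvm.h>

/* Returns 0 to keep running the vCPU, -1 to stop the VM. */
static int handle_exit(struct kvm_run *run)
{
        if (run->exit_reason != KVM_EXIT_NOTIFY)
                return 0;

        if (run->notify.flags & KVM_NOTIFY_CONTEXT_INVALID) {
                /* VM context is corrupted and cannot be safely resumed. */
                return -1;
        }

        /* Informational exit (KVM_X86_NOTIFY_VMEXIT_USER was set); resume. */
        return 0;
}
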
Otherwise they set the kvm_run parameter to indicate what needs @@ -5827,6 +6013,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, [EXIT_REASON_ENCLS] = handle_encls, [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, + [EXIT_REASON_NOTIFY] = handle_notify, }; static const int kvm_vmx_max_exit_handlers = @@ -5924,6 +6111,7 @@ void dump_vmcs(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vmentry_ctl, vmexit_ctl; u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; + u64 tertiary_exec_control; unsigned long cr4; int efer_slot; @@ -5937,9 +6125,16 @@ void dump_vmcs(struct kvm_vcpu *vcpu) cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); cr4 = vmcs_readl(GUEST_CR4); - secondary_exec_control = 0; + if (cpu_has_secondary_exec_ctrls()) secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + else + secondary_exec_control = 0; + + if (cpu_has_tertiary_exec_ctrls()) + tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL); + else + tertiary_exec_control = 0; pr_err("VMCS %p, last attempted VM-entry on CPU %d\n", vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu); @@ -6039,9 +6234,10 @@ void dump_vmcs(struct kvm_vcpu *vcpu) vmx_dump_msrs("host autoload", &vmx->msr_autoload.host); pr_err("*** Control State ***\n"); - pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", - pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control); - pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl); + pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n", + cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control); + pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n", + pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl); pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", vmcs_read32(EXCEPTION_BITMAP), vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), @@ -6191,7 +6387,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) exit_reason.basic != EXIT_REASON_EPT_VIOLATION && exit_reason.basic != EXIT_REASON_PML_FULL && exit_reason.basic != EXIT_REASON_APIC_ACCESS && - exit_reason.basic != EXIT_REASON_TASK_SWITCH)) { + exit_reason.basic != EXIT_REASON_TASK_SWITCH && + exit_reason.basic != EXIT_REASON_NOTIFY)) { int ndata = 3; vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -6453,7 +6650,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) put_page(page); } -static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) +static void vmx_hwapic_isr_update(int max_isr) { u16 status; u8 old; @@ -6783,9 +6980,14 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) { int i, nr_msrs; struct perf_guest_switch_msr *msrs; + struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu); + + pmu->host_cross_mapped_mask = 0; + if (pmu->pebs_enable & pmu->global_ctrl) + intel_pmu_cross_mapped_check(pmu); /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. 
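
The PID-pointer table allocated by vmx_alloc_ipiv_pid_table() earlier in this file holds one 64-bit entry per possible vCPU ID; each entry is the physical address of that vCPU's posted-interrupt descriptor plus a valid bit, as written in the vmx_vcpu_create() hunk below. A rough sketch of the sizing arithmetic behind vmx_get_pid_table_order(), with a made-up max_vcpu_ids value (kernel context, u64 and get_order() as in the hunks above):

/* Sketch only: how many pages the IPI-virtualization PID table needs. */
static int pid_table_order(u32 max_vcpu_ids)
{
        /* One u64 entry per vCPU ID, rounded up to whole pages. */
        return get_order(max_vcpu_ids * sizeof(u64));
}

/* e.g. max_vcpu_ids == 1024 -> 8 KiB -> order 1 (two 4 KiB pages). */
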
*/ - msrs = perf_guest_get_msrs(&nr_msrs); + msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu); if (!msrs) return; @@ -7166,6 +7368,10 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu) goto free_vmcs; } + if (vmx_can_use_ipiv(vcpu)) + WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], + __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID); + return 0; free_vmcs: @@ -7234,7 +7440,7 @@ static int __init vmx_check_processor_compat(void) return 0; } -static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) +static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { u8 cache; @@ -7310,7 +7516,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ } while (0) - entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); + entry = kvm_find_cpuid_entry(vcpu, 0x1); cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME)); cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME)); cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC)); @@ -7326,7 +7532,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID)); cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE)); - entry = kvm_find_cpuid_entry(vcpu, 0x7, 0); + entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0); cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE)); cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP)); cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP)); @@ -7337,23 +7543,6 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) #undef cr4_fixed1_update } -static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (kvm_mpx_supported()) { - bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX); - - if (mpx_enabled) { - vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; - vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; - } else { - vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS; - vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS; - } - } -} - static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7361,7 +7550,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) int i; for (i = 0; i < PT_CPUID_LEAVES; i++) { - best = kvm_find_cpuid_entry(vcpu, 0x14, i); + best = kvm_find_cpuid_entry_index(vcpu, 0x14, i); if (!best) return; vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax; @@ -7445,10 +7634,8 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); - if (nested_vmx_allowed(vcpu)) { + if (nested_vmx_allowed(vcpu)) nested_vmx_cr_fixed1_bits_update(vcpu); - nested_vmx_entry_exit_ctls_update(vcpu); - } if (boot_cpu_has(X86_FEATURE_INTEL_PT) && guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) @@ -7502,6 +7689,13 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_clear(X86_FEATURE_INVPCID); if (vmx_pt_mode_is_host_guest()) kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); + if (vmx_pebs_supported()) { + kvm_cpu_cap_check_and_set(X86_FEATURE_DS); + kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64); + } + + if (!enable_pmu) + kvm_cpu_cap_clear(X86_FEATURE_PDCM); if (!enable_sgx) { kvm_cpu_cap_clear(X86_FEATURE_SGX); @@ -7514,7 +7708,7 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_UMIP); /* CPUID 0xD.1 */ - supported_xss = 0; + kvm_caps.supported_xss = 0; if (!cpu_has_vmx_xsaves()) 
kvm_cpu_cap_clear(X86_FEATURE_XSAVES); @@ -7655,9 +7849,9 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, delta_tsc = 0; /* Convert to host delta tsc if tsc scaling is enabled */ - if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && + if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio && delta_tsc && u64_shl_div_u64(delta_tsc, - kvm_tsc_scaling_ratio_frac_bits, + kvm_caps.tsc_scaling_ratio_frac_bits, vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc)) return -ERANGE; @@ -7729,6 +7923,13 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate) { struct vcpu_vmx *vmx = to_vmx(vcpu); + /* + * TODO: Implement custom flows for forcing the vCPU out/in of L2 on + * SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong + * SMI and RSM only modify state that is saved and restored via SMRAM. + * E.g. most MSRs are left untouched, but many are modified by VM-Exit + * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM. + */ vmx->nested.smm.guest_mode = is_guest_mode(vcpu); if (vmx->nested.smm.guest_mode) nested_vmx_vmexit(vcpu, -1, 0, 0); @@ -7802,6 +8003,13 @@ static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) return supported & BIT(reason); } +static void vmx_vm_destroy(struct kvm *kvm) +{ + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + + free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm)); +} + static struct kvm_x86_ops vmx_x86_ops __initdata = { .name = "kvm_intel", @@ -7813,7 +8021,9 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .vm_size = sizeof(struct kvm_vmx), .vm_init = vmx_vm_init, + .vm_destroy = vmx_vm_destroy, + .vcpu_precreate = vmx_vcpu_precreate, .vcpu_create = vmx_vcpu_create, .vcpu_free = vmx_vcpu_free, .vcpu_reset = vmx_vcpu_reset, @@ -8027,8 +8237,8 @@ static __init int hardware_setup(void) } if (!cpu_has_vmx_mpx()) - supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | - XFEATURE_MASK_BNDCSR); + kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | + XFEATURE_MASK_BNDCSR); if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) @@ -8091,12 +8301,16 @@ static __init int hardware_setup(void) if (!enable_apicv) vmx_x86_ops.sync_pir_to_irr = NULL; + if (!enable_apicv || !cpu_has_vmx_ipiv()) + enable_ipiv = false; + if (cpu_has_vmx_tsc_scaling()) - kvm_has_tsc_control = true; + kvm_caps.has_tsc_control = true; - kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; - kvm_tsc_scaling_ratio_frac_bits = 48; - kvm_has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); + kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; + kvm_caps.tsc_scaling_ratio_frac_bits = 48; + kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); + kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit(); set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ @@ -8153,11 +8367,12 @@ static __init int hardware_setup(void) vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit; } - kvm_mce_cap_supported |= MCG_LMCE_P; + kvm_caps.supported_mce_cap |= MCG_LMCE_P; + kvm_caps.supported_mce_cap |= MCG_CMCI_P; if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) return -EINVAL; - if (!enable_ept || !cpu_has_vmx_intel_pt()) + if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt()) pt_mode = PT_MODE_SYSTEM; if (pt_mode == PT_MODE_HOST_GUEST) vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 
1e7f9453894b..fb8e3480a9d7 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -92,10 +92,22 @@ union vmx_exit_reason { u32 full; }; +static inline bool intel_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) +{ + /* + * Architecturally, Intel's SDM states that IA32_PERF_GLOBAL_CTRL is + * supported if "CPUID.0AH: EAX[7:0] > 0", i.e. if the PMU version is + * greater than zero. However, KVM only exposes and emulates the MSR + * to/for the guest if the guest PMU supports at least "Architectural + * Performance Monitoring Version 2". + */ + return pmu->version > 1; +} + #define vcpu_to_lbr_desc(vcpu) (&to_vmx(vcpu)->lbr_desc) #define vcpu_to_lbr_records(vcpu) (&to_vmx(vcpu)->lbr_desc.records) -bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu); +void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu); bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu); int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); @@ -205,7 +217,7 @@ struct nested_vmx { * Guest pages referred to in the vmcs02 with host-physical * pointers, so we must keep them pinned while L2 runs. */ - struct page *apic_access_page; + struct kvm_host_map apic_access_page_map; struct kvm_host_map virtual_apic_map; struct kvm_host_map pi_desc_map; @@ -220,9 +232,18 @@ struct nested_vmx { bool has_preemption_timer_deadline; bool preemption_timer_expired; - /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ - u64 vmcs01_debugctl; - u64 vmcs01_guest_bndcfgs; + /* + * Used to snapshot MSRs that are conditionally loaded on VM-Enter in + * order to propagate the guest's pre-VM-Enter value into vmcs02. For + * emulation of VMLAUNCH/VMRESUME, the snapshot will be of L1's value. + * For KVM_SET_NESTED_STATE, the snapshot is of L2's value, _if_ + * userspace restores MSRs before nested state. If userspace restores + * MSRs after nested state, the snapshot holds garbage, but KVM can't + * detect that, and the garbage value in vmcs02 will be overwritten by + * MSR restoration in any case. 
+ */ + u64 pre_vmenter_debugctl; + u64 pre_vmenter_bndcfgs; /* to migrate it to L1 if L2 writes to L1's CR8 directly */ int l1_tpr_threshold; @@ -369,6 +390,8 @@ struct kvm_vmx { unsigned int tss_addr; bool ept_identity_pagetable_done; gpa_t ept_identity_map_addr; + /* Posted Interrupt Descriptor (PID) table for IPI virtualization */ + u64 *pid_table; }; bool nested_vmx_allowed(struct kvm_vcpu *vcpu); @@ -462,35 +485,36 @@ static inline u8 vmx_get_rvi(void) return vmcs_read16(GUEST_INTR_STATUS) & 0xff; } -#define BUILD_CONTROLS_SHADOW(lname, uname) \ -static inline void lname##_controls_set(struct vcpu_vmx *vmx, u32 val) \ -{ \ - if (vmx->loaded_vmcs->controls_shadow.lname != val) { \ - vmcs_write32(uname, val); \ - vmx->loaded_vmcs->controls_shadow.lname = val; \ - } \ -} \ -static inline u32 __##lname##_controls_get(struct loaded_vmcs *vmcs) \ -{ \ - return vmcs->controls_shadow.lname; \ -} \ -static inline u32 lname##_controls_get(struct vcpu_vmx *vmx) \ -{ \ - return __##lname##_controls_get(vmx->loaded_vmcs); \ -} \ -static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u32 val) \ -{ \ - lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \ -} \ -static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u32 val) \ -{ \ - lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \ +#define BUILD_CONTROLS_SHADOW(lname, uname, bits) \ +static inline void lname##_controls_set(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + if (vmx->loaded_vmcs->controls_shadow.lname != val) { \ + vmcs_write##bits(uname, val); \ + vmx->loaded_vmcs->controls_shadow.lname = val; \ + } \ +} \ +static inline u##bits __##lname##_controls_get(struct loaded_vmcs *vmcs) \ +{ \ + return vmcs->controls_shadow.lname; \ +} \ +static inline u##bits lname##_controls_get(struct vcpu_vmx *vmx) \ +{ \ + return __##lname##_controls_get(vmx->loaded_vmcs); \ +} \ +static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \ +} \ +static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \ } -BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS) -BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS) -BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL) -BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL) -BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL) +BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS, 32) +BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS, 32) +BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL, 32) +BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL, 32) +BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL, 32) +BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64) /* * VMX_REGS_LAZY_LOAD_SET - The set of registers that will be updated in the @@ -586,4 +610,9 @@ static inline int vmx_get_instr_info_reg2(u32 vmx_instr_info) return (vmx_instr_info >> 28) & 0xf; } +static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu) +{ + return lapic_in_kernel(vcpu) && enable_ipiv; +} + #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e5fa335a4ea7..33560bfa0cac 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -87,8 +87,11 @@ #define MAX_IO_MSRS 256 #define KVM_MAX_MCE_BANKS 32 -u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; -EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); + +struct kvm_caps kvm_caps __read_mostly = 
{ + .supported_mce_cap = MCG_CTL_P | MCG_SER_P, +}; +EXPORT_SYMBOL_GPL(kvm_caps); #define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e)) @@ -151,19 +154,6 @@ module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); static bool __read_mostly kvmclock_periodic_sync = true; module_param(kvmclock_periodic_sync, bool, S_IRUGO); -bool __read_mostly kvm_has_tsc_control; -EXPORT_SYMBOL_GPL(kvm_has_tsc_control); -u32 __read_mostly kvm_max_guest_tsc_khz; -EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); -u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits; -EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits); -u64 __read_mostly kvm_max_tsc_scaling_ratio; -EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio); -u64 __read_mostly kvm_default_tsc_scaling_ratio; -EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio); -bool __read_mostly kvm_has_bus_lock_exit; -EXPORT_SYMBOL_GPL(kvm_has_bus_lock_exit); - /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); @@ -235,8 +225,6 @@ EXPORT_SYMBOL_GPL(enable_apicv); u64 __read_mostly host_xss; EXPORT_SYMBOL_GPL(host_xss); -u64 __read_mostly supported_xss; -EXPORT_SYMBOL_GPL(supported_xss); const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS(), @@ -298,7 +286,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, directed_yield_successful), STATS_DESC_COUNTER(VCPU, preemption_reported), STATS_DESC_COUNTER(VCPU, preemption_other), - STATS_DESC_IBOOLEAN(VCPU, guest_mode) + STATS_DESC_IBOOLEAN(VCPU, guest_mode), + STATS_DESC_COUNTER(VCPU, notify_window_exits), }; const struct kvm_stats_header kvm_vcpu_stats_header = { @@ -311,8 +300,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = { }; u64 __read_mostly host_xcr0; -u64 __read_mostly supported_xcr0; -EXPORT_SYMBOL_GPL(supported_xcr0); static struct kmem_cache *x86_emulator_cache; @@ -862,7 +849,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) */ real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn), PFERR_USER_MASK | PFERR_WRITE_MASK, NULL); - if (real_gpa == UNMAPPED_GVA) + if (real_gpa == INVALID_GPA) return 0; /* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. 
*/ @@ -1094,7 +1081,7 @@ int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv); -bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { if (cr4 & cr4_reserved_bits) return false; @@ -1102,9 +1089,15 @@ bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (cr4 & vcpu->arch.cr4_guest_rsvd_bits) return false; - return static_call(kvm_x86_is_valid_cr4)(vcpu, cr4); + return true; +} +EXPORT_SYMBOL_GPL(__kvm_is_valid_cr4); + +static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + return __kvm_is_valid_cr4(vcpu, cr4) && + static_call(kvm_x86_is_valid_cr4)(vcpu, cr4); } -EXPORT_SYMBOL_GPL(kvm_is_valid_cr4); void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4) { @@ -1450,6 +1443,7 @@ static const u32 msrs_to_save_all[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, + MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, @@ -2051,13 +2045,6 @@ int kvm_emulate_invd(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_invd); -int kvm_emulate_mwait(struct kvm_vcpu *vcpu) -{ - pr_warn_once("kvm: MWAIT instruction emulated as NOP!\n"); - return kvm_emulate_as_nop(vcpu); -} -EXPORT_SYMBOL_GPL(kvm_emulate_mwait); - int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) { kvm_queue_exception(vcpu, UD_VECTOR); @@ -2065,11 +2052,26 @@ int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_handle_invalid_op); -int kvm_emulate_monitor(struct kvm_vcpu *vcpu) + +static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn) { - pr_warn_once("kvm: MONITOR instruction emulated as NOP!\n"); + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) && + !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT)) + return kvm_handle_invalid_op(vcpu); + + pr_warn_once("kvm: %s instruction emulated as NOP!\n", insn); return kvm_emulate_as_nop(vcpu); } +int kvm_emulate_mwait(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_monitor_mwait(vcpu, "MWAIT"); +} +EXPORT_SYMBOL_GPL(kvm_emulate_mwait); + +int kvm_emulate_monitor(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_monitor_mwait(vcpu, "MONITOR"); +} EXPORT_SYMBOL_GPL(kvm_emulate_monitor); static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) @@ -2349,12 +2351,12 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) /* Guest TSC same frequency as host TSC? */ if (!scale) { - kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio); + kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio); return 0; } /* TSC scaling supported? 
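
The scaling ratio used throughout these hunks is a fixed-point value with kvm_caps.tsc_scaling_ratio_frac_bits fractional bits (48 on VMX, set in the hardware_setup() hunk earlier in this diff). A worked sketch of the arithmetic with made-up frequencies; it mirrors what mul_u64_u32_div() and mul_u64_u64_shr() compute and is not the kernel code:

#include <stdint.h>

#define FRAC_BITS 48    /* kvm_caps.tsc_scaling_ratio_frac_bits on VMX */

/* ratio = guest_khz / host_khz as a fixed-point number. */
static uint64_t tsc_ratio(uint64_t guest_khz, uint64_t host_khz)
{
        /* 128-bit intermediate, as guest_khz << 48 overflows 64 bits. */
        return (uint64_t)(((__uint128_t)guest_khz << FRAC_BITS) / host_khz);
}

/* What __scale_tsc() does: scaled = host_tsc * ratio >> FRAC_BITS. */
static uint64_t scale_tsc(uint64_t host_tsc, uint64_t ratio)
{
        return (uint64_t)(((__uint128_t)host_tsc * ratio) >> FRAC_BITS);
}

/* e.g. a 2,000,000 kHz guest on a 3,000,000 kHz host: ratio is 2/3 << 48. */
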
*/ - if (!kvm_has_tsc_control) { + if (!kvm_caps.has_tsc_control) { if (user_tsc_khz > tsc_khz) { vcpu->arch.tsc_catchup = 1; vcpu->arch.tsc_always_catchup = 1; @@ -2366,10 +2368,10 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) } /* TSC scaling required - calculate ratio */ - ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits, + ratio = mul_u64_u32_div(1ULL << kvm_caps.tsc_scaling_ratio_frac_bits, user_tsc_khz, tsc_khz); - if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) { + if (ratio == 0 || ratio >= kvm_caps.max_tsc_scaling_ratio) { pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n", user_tsc_khz); return -1; @@ -2387,7 +2389,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) /* tsc_khz can be zero if TSC calibration fails */ if (user_tsc_khz == 0) { /* set tsc_scaling_ratio to a safe value */ - kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio); + kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio); return -1; } @@ -2464,18 +2466,18 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) * (frac) represent the fractional part, ie. ratio represents a fixed * point number (mult + frac * 2^(-N)). * - * N equals to kvm_tsc_scaling_ratio_frac_bits. + * N equals to kvm_caps.tsc_scaling_ratio_frac_bits. */ static inline u64 __scale_tsc(u64 ratio, u64 tsc) { - return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits); + return mul_u64_u64_shr(tsc, ratio, kvm_caps.tsc_scaling_ratio_frac_bits); } u64 kvm_scale_tsc(u64 tsc, u64 ratio) { u64 _tsc = tsc; - if (ratio != kvm_default_tsc_scaling_ratio) + if (ratio != kvm_caps.default_tsc_scaling_ratio) _tsc = __scale_tsc(ratio, tsc); return _tsc; @@ -2502,11 +2504,11 @@ u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier) { u64 nested_offset; - if (l2_multiplier == kvm_default_tsc_scaling_ratio) + if (l2_multiplier == kvm_caps.default_tsc_scaling_ratio) nested_offset = l1_offset; else nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier, - kvm_tsc_scaling_ratio_frac_bits); + kvm_caps.tsc_scaling_ratio_frac_bits); nested_offset += l2_offset; return nested_offset; @@ -2515,9 +2517,9 @@ EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset); u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier) { - if (l2_multiplier != kvm_default_tsc_scaling_ratio) + if (l2_multiplier != kvm_caps.default_tsc_scaling_ratio) return mul_u64_u64_shr(l1_multiplier, l2_multiplier, - kvm_tsc_scaling_ratio_frac_bits); + kvm_caps.tsc_scaling_ratio_frac_bits); return l1_multiplier; } @@ -2559,7 +2561,7 @@ static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multipli else vcpu->arch.tsc_scaling_ratio = l1_multiplier; - if (kvm_has_tsc_control) + if (kvm_caps.has_tsc_control) static_call(kvm_x86_write_tsc_multiplier)( vcpu, vcpu->arch.tsc_scaling_ratio); } @@ -2695,7 +2697,7 @@ static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) { - if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) + if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio) WARN_ON(adjustment < 0); adjustment = kvm_scale_tsc((u64) adjustment, vcpu->arch.l1_tsc_scaling_ratio); @@ -3108,7 +3110,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* With all the info we got, fill in the values */ - if (kvm_has_tsc_control) + if (kvm_caps.has_tsc_control) tgt_tsc_khz = 
kvm_scale_tsc(tgt_tsc_khz, v->arch.l1_tsc_scaling_ratio); @@ -3198,6 +3200,16 @@ static void kvmclock_sync_fn(struct work_struct *work) KVMCLOCK_SYNC_PERIOD); } +/* These helpers are safe iff @msr is known to be an MCx bank MSR. */ +static bool is_mci_control_msr(u32 msr) +{ + return (msr & 3) == 0; +} +static bool is_mci_status_msr(u32 msr) +{ + return (msr & 3) == 1; +} + /* * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP. */ @@ -3216,6 +3228,7 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) unsigned bank_num = mcg_cap & 0xff; u32 msr = msr_info->index; u64 data = msr_info->data; + u32 offset, last_msr; switch (msr) { case MSR_IA32_MCG_STATUS: @@ -3229,32 +3242,53 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.mcg_ctl = data; break; - default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = array_index_nospec( - msr - MSR_IA32_MC0_CTL, - MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); - - /* only 0 or all 1s can be written to IA32_MCi_CTL - * some Linux kernels though clear bit 10 in bank 4 to - * workaround a BIOS/GART TBL issue on AMD K8s, ignore - * this to avoid an uncatched #GP in the guest - */ - if ((offset & 0x3) == 0 && - data != 0 && (data | (1 << 10)) != ~(u64)0) - return -1; - - /* MCi_STATUS */ - if (!msr_info->host_initiated && - (offset & 0x3) == 1 && data != 0) { - if (!can_set_mci_status(vcpu)) - return -1; - } + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1; + if (msr > last_msr) + return 1; - vcpu->arch.mce_banks[offset] = data; - break; - } + if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated)) + return 1; + /* An attempt to write a 1 to a reserved bit raises #GP */ + if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK)) + return 1; + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2, + last_msr + 1 - MSR_IA32_MC0_CTL2); + vcpu->arch.mci_ctl2_banks[offset] = data; + break; + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; + if (msr > last_msr) + return 1; + + /* + * Only 0 or all 1s can be written to IA32_MCi_CTL, all other + * values are architecturally undefined. But, some Linux + * kernels clear bit 10 in bank 4 to workaround a BIOS/GART TLB + * issue on AMD K8s, allow bit 10 to be clear when setting all + * other bits in order to avoid an uncaught #GP in the guest. + * + * UNIXWARE clears bit 0 of MC1_CTL to ignore correctable, + * single-bit ECC data errors. + */ + if (is_mci_control_msr(msr) && + data != 0 && (data | (1 << 10) | 1) != ~(u64)0) + return 1; + + /* + * All CPUs allow writing 0 to MCi_STATUS MSRs to clear the MSR. + * AMD-based CPUs allow non-zero values, but if and only if + * HWCR[McStatusWrEn] is set. + */ + if (!msr_info->host_initiated && is_mci_status_msr(msr) && + data != 0 && !can_set_mci_status(vcpu)) + return 1; + + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, + last_msr + 1 - MSR_IA32_MC0_CTL); + vcpu->arch.mce_banks[offset] = data; + break; + default: return 1; } return 0; @@ -3538,7 +3572,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; } break; - case 0x200 ... 0x2ff: + case 0x200 ... MSR_IA32_MC0_CTL2 - 1: + case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 
0x2ff: return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: return kvm_set_apic_base(vcpu, msr_info); @@ -3560,9 +3595,21 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.ia32_tsc_adjust_msr = data; } break; - case MSR_IA32_MISC_ENABLE: + case MSR_IA32_MISC_ENABLE: { + u64 old_val = vcpu->arch.ia32_misc_enable_msr; + + if (!msr_info->host_initiated) { + /* RO bits */ + if ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PMU_RO_MASK) + return 1; + + /* R bits, i.e. writes are ignored, but don't fault. */ + data = data & ~MSR_IA32_MISC_ENABLE_EMON; + data |= old_val & MSR_IA32_MISC_ENABLE_EMON; + } + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) && - ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { + ((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3)) return 1; vcpu->arch.ia32_misc_enable_msr = data; @@ -3571,6 +3618,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.ia32_misc_enable_msr = data; } break; + } case MSR_IA32_SMBASE: if (!msr_info->host_initiated) return 1; @@ -3597,7 +3645,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than * XSAVES/XRSTORS to save/restore PT MSRs. */ - if (data & ~supported_xss) + if (data & ~kvm_caps.supported_xss) return 1; vcpu->arch.ia32_xss = data; kvm_update_cpuid_runtime(vcpu); @@ -3695,6 +3743,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: return set_msr_mce(vcpu, msr_info); case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: @@ -3785,6 +3834,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.guest_fpu.xfd_err = data; break; #endif + case MSR_IA32_PEBS_ENABLE: + case MSR_IA32_DS_AREA: + case MSR_PEBS_DATA_CFG: + case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: + if (kvm_pmu_is_valid_msr(vcpu, msr)) + return kvm_pmu_set_msr(vcpu, msr_info); + /* + * Userspace is allowed to write '0' to MSRs that KVM reports + * as to-be-saved, even if an MSRs isn't fully supported. + */ + return !msr_info->host_initiated || data; default: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); @@ -3799,6 +3859,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) u64 data; u64 mcg_cap = vcpu->arch.mcg_cap; unsigned bank_num = mcg_cap & 0xff; + u32 offset, last_msr; switch (msr) { case MSR_IA32_P5_MC_ADDR: @@ -3816,16 +3877,27 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) case MSR_IA32_MCG_STATUS: data = vcpu->arch.mcg_status; break; - default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = array_index_nospec( - msr - MSR_IA32_MC0_CTL, - MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1; + if (msr > last_msr) + return 1; - data = vcpu->arch.mce_banks[offset]; - break; - } + if (!(mcg_cap & MCG_CMCI_P) && !host) + return 1; + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2, + last_msr + 1 - MSR_IA32_MC0_CTL2); + data = vcpu->arch.mci_ctl2_banks[offset]; + break; + case MSR_IA32_MC0_CTL ... 
MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; + if (msr > last_msr) + return 1; + + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, + last_msr + 1 - MSR_IA32_MC0_CTL); + data = vcpu->arch.mce_banks[offset]; + break; + default: return 1; } *pdata = data; @@ -3865,9 +3937,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */ msr_info->data = 0; break; + case MSR_IA32_PEBS_ENABLE: + case MSR_IA32_DS_AREA: + case MSR_PEBS_DATA_CFG: case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); + /* + * Userspace is allowed to read MSRs that KVM reports as + * to-be-saved, even if an MSR isn't fully supported. + */ if (!msr_info->host_initiated) return 1; msr_info->data = 0; @@ -3922,7 +4001,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; } case MSR_MTRRcap: - case 0x200 ... 0x2ff: + case 0x200 ... MSR_IA32_MC0_CTL2 - 1: + case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff: return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); case 0xcd: /* fsb frequency */ msr_info->data = 3; @@ -4038,6 +4118,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: return get_msr_mce(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); case MSR_IA32_XSS: @@ -4280,6 +4361,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: + case KVM_CAP_X86_TRIPLE_FAULT_EVENT: case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_LAST_CPU: case KVM_CAP_X86_USER_SPACE_MSR: @@ -4296,6 +4378,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SYS_ATTRIBUTES: case KVM_CAP_VAPIC: case KVM_CAP_ENABLE_CAP: + case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: r = 1; break; case KVM_CAP_EXIT_HYPERCALL: @@ -4357,7 +4440,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_TSC_CONTROL: case KVM_CAP_VM_TSC_CONTROL: - r = kvm_has_tsc_control; + r = kvm_caps.has_tsc_control; break; case KVM_CAP_X2APIC_API: r = KVM_X2APIC_API_VALID_FLAGS; @@ -4379,7 +4462,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = sched_info_on(); break; case KVM_CAP_X86_BUS_LOCK_EXIT: - if (kvm_has_bus_lock_exit) + if (kvm_caps.has_bus_lock_exit) r = KVM_BUS_LOCK_DETECTION_OFF | KVM_BUS_LOCK_DETECTION_EXIT; else @@ -4388,17 +4471,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_XSAVE2: { u64 guest_perm = xstate_get_guest_group_perm(); - r = xstate_required_size(supported_xcr0 & guest_perm, false); + r = xstate_required_size(kvm_caps.supported_xcr0 & guest_perm, false); if (r < sizeof(struct kvm_xsave)) r = sizeof(struct kvm_xsave); break; + } case KVM_CAP_PMU_CAPABILITY: r = enable_pmu ? 
KVM_CAP_PMU_VALID_MASK : 0; break; - } case KVM_CAP_DISABLE_QUIRKS2: r = KVM_X86_VALID_QUIRKS; break; + case KVM_CAP_X86_NOTIFY_VMEXIT: + r = kvm_caps.has_notify_vmexit; + break; default: break; } @@ -4426,7 +4512,7 @@ static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr) switch (attr->attr) { case KVM_X86_XCOMP_GUEST_SUPP: - if (put_user(supported_xcr0, uaddr)) + if (put_user(kvm_caps.supported_xcr0, uaddr)) return -EFAULT; return 0; default: @@ -4503,8 +4589,8 @@ long kvm_arch_dev_ioctl(struct file *filp, } case KVM_X86_GET_MCE_CAP_SUPPORTED: r = -EFAULT; - if (copy_to_user(argp, &kvm_mce_cap_supported, - sizeof(kvm_mce_cap_supported))) + if (copy_to_user(argp, &kvm_caps.supported_mce_cap, + sizeof(kvm_caps.supported_mce_cap))) goto out; r = 0; break; @@ -4803,22 +4889,63 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, r = -EINVAL; if (!bank_num || bank_num > KVM_MAX_MCE_BANKS) goto out; - if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000)) + if (mcg_cap & ~(kvm_caps.supported_mce_cap | 0xff | 0xff0000)) goto out; r = 0; vcpu->arch.mcg_cap = mcg_cap; /* Init IA32_MCG_CTL to all 1s */ if (mcg_cap & MCG_CTL_P) vcpu->arch.mcg_ctl = ~(u64)0; - /* Init IA32_MCi_CTL to all 1s */ - for (bank = 0; bank < bank_num; bank++) + /* Init IA32_MCi_CTL to all 1s, IA32_MCi_CTL2 to all 0s */ + for (bank = 0; bank < bank_num; bank++) { vcpu->arch.mce_banks[bank*4] = ~(u64)0; + if (mcg_cap & MCG_CMCI_P) + vcpu->arch.mci_ctl2_banks[bank] = 0; + } + + kvm_apic_after_set_mcg_cap(vcpu); static_call(kvm_x86_setup_mce)(vcpu); out: return r; } +/* + * Validate this is an UCNA (uncorrectable no action) error by checking the + * MCG_STATUS and MCi_STATUS registers: + * - none of the bits for Machine Check Exceptions are set + * - both the VAL (valid) and UC (uncorrectable) bits are set + * MCI_STATUS_PCC - Processor Context Corrupted + * MCI_STATUS_S - Signaled as a Machine Check Exception + * MCI_STATUS_AR - Software recoverable Action Required + */ +static bool is_ucna(struct kvm_x86_mce *mce) +{ + return !mce->mcg_status && + !(mce->status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR)) && + (mce->status & MCI_STATUS_VAL) && + (mce->status & MCI_STATUS_UC); +} + +static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce, u64* banks) +{ + u64 mcg_cap = vcpu->arch.mcg_cap; + + banks[1] = mce->status; + banks[2] = mce->addr; + banks[3] = mce->misc; + vcpu->arch.mcg_status = mce->mcg_status; + + if (!(mcg_cap & MCG_CMCI_P) || + !(vcpu->arch.mci_ctl2_banks[mce->bank] & MCI_CTL2_CMCI_EN)) + return 0; + + if (lapic_in_kernel(vcpu)) + kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI); + + return 0; +} + static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce) { @@ -4828,6 +4955,12 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) return -EINVAL; + + banks += array_index_nospec(4 * mce->bank, 4 * bank_num); + + if (is_ucna(mce)) + return kvm_vcpu_x86_set_ucna(vcpu, mce, banks); + /* * if IA32_MCG_CTL is not all 1s, the uncorrected error * reporting is disabled @@ -4835,7 +4968,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && vcpu->arch.mcg_ctl != ~(u64)0) return 0; - banks += 4 * mce->bank; /* * if IA32_MCi_CTL is not all 1s, the uncorrected error * reporting is disabled for the bank @@ -4941,6 +5073,10 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, | 
KVM_VCPUEVENT_VALID_SMM); if (vcpu->kvm->arch.exception_payload_enabled) events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD; + if (vcpu->kvm->arch.triple_fault_event) { + events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu); + events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT; + } memset(&events->reserved, 0, sizeof(events->reserved)); } @@ -4954,7 +5090,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SIPI_VECTOR | KVM_VCPUEVENT_VALID_SHADOW | KVM_VCPUEVENT_VALID_SMM - | KVM_VCPUEVENT_VALID_PAYLOAD)) + | KVM_VCPUEVENT_VALID_PAYLOAD + | KVM_VCPUEVENT_VALID_TRIPLE_FAULT)) return -EINVAL; if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) { @@ -5027,6 +5164,15 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, } } + if (events->flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) { + if (!vcpu->kvm->arch.triple_fault_event) + return -EINVAL; + if (events->triple_fault.pending) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + else + kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); + } + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; @@ -5095,7 +5241,8 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu, guest_xsave->region, - supported_xcr0, &vcpu->arch.pkru); + kvm_caps.supported_xcr0, + &vcpu->arch.pkru); } static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, @@ -5600,8 +5747,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; user_tsc_khz = (u32)arg; - if (kvm_has_tsc_control && - user_tsc_khz >= kvm_max_guest_tsc_khz) + if (kvm_caps.has_tsc_control && + user_tsc_khz >= kvm_caps.max_guest_tsc_khz) goto out; if (user_tsc_khz == 0) @@ -6028,6 +6175,10 @@ split_irqchip_unlock: kvm->arch.exception_payload_enabled = cap->args[0]; r = 0; break; + case KVM_CAP_X86_TRIPLE_FAULT_EVENT: + kvm->arch.triple_fault_event = cap->args[0]; + r = 0; + break; case KVM_CAP_X86_USER_SPACE_MSR: r = -EINVAL; if (cap->args[0] & ~(KVM_MSR_EXIT_REASON_INVAL | @@ -6046,7 +6197,7 @@ split_irqchip_unlock: (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)) break; - if (kvm_has_bus_lock_exit && + if (kvm_caps.has_bus_lock_exit && cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT) kvm->arch.bus_lock_detection_enabled = true; r = 0; @@ -6109,6 +6260,65 @@ split_irqchip_unlock: } mutex_unlock(&kvm->lock); break; + case KVM_CAP_MAX_VCPU_ID: + r = -EINVAL; + if (cap->args[0] > KVM_MAX_VCPU_IDS) + break; + + mutex_lock(&kvm->lock); + if (kvm->arch.max_vcpu_ids == cap->args[0]) { + r = 0; + } else if (!kvm->arch.max_vcpu_ids) { + kvm->arch.max_vcpu_ids = cap->args[0]; + r = 0; + } + mutex_unlock(&kvm->lock); + break; + case KVM_CAP_X86_NOTIFY_VMEXIT: + r = -EINVAL; + if ((u32)cap->args[0] & ~KVM_X86_NOTIFY_VMEXIT_VALID_BITS) + break; + if (!kvm_caps.has_notify_vmexit) + break; + if (!((u32)cap->args[0] & KVM_X86_NOTIFY_VMEXIT_ENABLED)) + break; + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + kvm->arch.notify_window = cap->args[0] >> 32; + kvm->arch.notify_vmexit_flags = (u32)cap->args[0]; + r = 0; + } + mutex_unlock(&kvm->lock); + break; + case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: + r = -EINVAL; + + /* + * Since the risk of disabling NX hugepages is a guest crashing + * the system, ensure the userspace process has permission to + * reboot the system. 
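
The KVM_CAP_X86_NOTIFY_VMEXIT handler above packs the enable flags into the low 32 bits of args[0] and the notify window into the high 32 bits, and only takes effect before any vCPU exists. A sketch of how a VMM might turn the feature on; vm_fd and the window value are hypothetical:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int enable_notify_vmexit(int vm_fd, uint32_t window)
{
        struct kvm_enable_cap cap = {
                .cap = KVM_CAP_X86_NOTIFY_VMEXIT,
                /* Low 32 bits: flags, high 32 bits: notify window. */
                .args[0] = ((uint64_t)window << 32) |
                           KVM_X86_NOTIFY_VMEXIT_ENABLED |
                           KVM_X86_NOTIFY_VMEXIT_USER,
        };

        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}
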
+ * + * Note that unlike the reboot() syscall, the process must have + * this capability in the root namespace because exposing + * /dev/kvm into a container does not limit the scope of the + * iTLB multihit bug to that container. In other words, + * this must use capable(), not ns_capable(). + */ + if (!capable(CAP_SYS_BOOT)) { + r = -EPERM; + break; + } + + if (cap->args[0]) + break; + + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + kvm->arch.disable_nx_huge_pages = true; + r = 0; + } + mutex_unlock(&kvm->lock); + break; default: r = -EINVAL; break; @@ -6584,8 +6794,8 @@ set_pit2_out: r = -EINVAL; user_tsc_khz = (u32)arg; - if (kvm_has_tsc_control && - user_tsc_khz >= kvm_max_guest_tsc_khz) + if (kvm_caps.has_tsc_control && + user_tsc_khz >= kvm_caps.max_guest_tsc_khz) goto out; if (user_tsc_khz == 0) @@ -6660,15 +6870,12 @@ out: static void kvm_init_msr_list(void) { - struct x86_pmu_capability x86_pmu; u32 dummy[2]; unsigned i; BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3, "Please update the fixed PMCs in msrs_to_saved_all[]"); - perf_get_x86_pmu_capability(&x86_pmu); - num_msrs_to_save = 0; num_emulated_msrs = 0; num_msr_based_features = 0; @@ -6720,12 +6927,12 @@ static void kvm_init_msr_list(void) break; case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= - min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) continue; break; case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= - min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) continue; break; case MSR_IA32_XFD: @@ -6882,7 +7089,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; - if (gpa == UNMAPPED_GVA) + if (gpa == INVALID_GPA) return X86EMUL_PROPAGATE_FAULT; ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data, offset, toread); @@ -6913,7 +7120,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, /* Inline kvm_read_guest_virt_helper for speed. 
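
A companion sketch for KVM_CAP_VM_DISABLE_NX_HUGE_PAGES as restricted above: args[0] must be zero, no vCPUs may exist yet, and the caller needs CAP_SYS_BOOT in the initial namespace (vm_fd is a hypothetical VM file descriptor):

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int disable_nx_huge_pages(int vm_fd)
{
        struct kvm_enable_cap cap = {
                .cap = KVM_CAP_VM_DISABLE_NX_HUGE_PAGES,
                /* args[0] is required to be 0. */
        };

        /* Fails with EPERM if the process lacks CAP_SYS_BOOT. */
        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}
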
*/ gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK, exception); - if (unlikely(gpa == UNMAPPED_GVA)) + if (unlikely(gpa == INVALID_GPA)) return X86EMUL_PROPAGATE_FAULT; offset = addr & (PAGE_SIZE-1); @@ -6983,7 +7190,7 @@ static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; - if (gpa == UNMAPPED_GVA) + if (gpa == INVALID_GPA) return X86EMUL_PROPAGATE_FAULT; ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite); if (ret < 0) { @@ -7094,7 +7301,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, *gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); - if (*gpa == UNMAPPED_GVA) + if (*gpa == INVALID_GPA) return -1; return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write); @@ -7331,7 +7538,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); - if (gpa == UNMAPPED_GVA || + if (gpa == INVALID_GPA || (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto emul_write; @@ -7385,36 +7592,47 @@ emul_write: return emulator_write_emulated(ctxt, addr, new, bytes, exception); } -static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) +static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, + unsigned short port, void *data, + unsigned int count, bool in) { - int r = 0, i; + unsigned i; + int r; - for (i = 0; i < vcpu->arch.pio.count; i++) { - if (vcpu->arch.pio.in) - r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port, - vcpu->arch.pio.size, pd); + WARN_ON_ONCE(vcpu->arch.pio.count); + for (i = 0; i < count; i++) { + if (in) + r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, port, size, data); else - r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, - vcpu->arch.pio.port, vcpu->arch.pio.size, - pd); - if (r) + r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, port, size, data); + + if (r) { + if (i == 0) + goto userspace_io; + + /* + * Userspace must have unregistered the device while PIO + * was running. Drop writes / read as 0. + */ + if (in) + memset(data, 0, size * (count - i)); break; - pd += vcpu->arch.pio.size; + } + + data += size; } - return r; -} + return 1; -static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, - unsigned short port, - unsigned int count, bool in) -{ +userspace_io: vcpu->arch.pio.port = port; vcpu->arch.pio.in = in; - vcpu->arch.pio.count = count; + vcpu->arch.pio.count = count; vcpu->arch.pio.size = size; - if (!kernel_pio(vcpu, vcpu->arch.pio_data)) - return 1; + if (in) + memset(vcpu->arch.pio_data, 0, size * count); + else + memcpy(vcpu->arch.pio_data, data, size * count); vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? 
KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; @@ -7422,30 +7640,33 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; vcpu->run->io.count = count; vcpu->run->io.port = port; - return 0; } -static int __emulator_pio_in(struct kvm_vcpu *vcpu, int size, - unsigned short port, unsigned int count) +static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, + unsigned short port, void *val, unsigned int count) { - WARN_ON(vcpu->arch.pio.count); - memset(vcpu->arch.pio_data, 0, size * count); - return emulator_pio_in_out(vcpu, size, port, count, true); + int r = emulator_pio_in_out(vcpu, size, port, val, count, true); + if (r) + trace_kvm_pio(KVM_PIO_IN, port, size, count, val); + + return r; } static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val) { int size = vcpu->arch.pio.size; - unsigned count = vcpu->arch.pio.count; + unsigned int count = vcpu->arch.pio.count; memcpy(val, vcpu->arch.pio_data, size * count); trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count, vcpu->arch.pio_data); vcpu->arch.pio.count = 0; } -static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, - unsigned short port, void *val, unsigned int count) +static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, void *val, + unsigned int count) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); if (vcpu->arch.pio.count) { /* * Complete a previous iteration that required userspace I/O. @@ -7454,39 +7675,19 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, * shenanigans as KVM doesn't support modifying the rep count, * and the emulator ensures @count doesn't overflow the buffer. */ - } else { - int r = __emulator_pio_in(vcpu, size, port, count); - if (!r) - return r; - - /* Results already available, fall through. */ + complete_emulator_pio_in(vcpu, val); + return 1; } - complete_emulator_pio_in(vcpu, val); - return 1; -} - -static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, - int size, unsigned short port, void *val, - unsigned int count) -{ - return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count); - + return emulator_pio_in(vcpu, size, port, val, count); } static int emulator_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port, const void *val, unsigned int count) { - int ret; - - memcpy(vcpu->arch.pio_data, val, size * count); - trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data); - ret = emulator_pio_in_out(vcpu, size, port, count, false); - if (ret) - vcpu->arch.pio.count = 0; - - return ret; + trace_kvm_pio(KVM_PIO_OUT, port, size, count, val); + return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); } static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, @@ -7868,7 +8069,16 @@ static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr) return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr); } +static void emulator_vm_bugged(struct x86_emulate_ctxt *ctxt) +{ + struct kvm *kvm = emul_to_vcpu(ctxt)->kvm; + + if (!kvm->vm_bugged) + kvm_vm_bugged(kvm); +} + static const struct x86_emulate_ops emulate_ops = { + .vm_bugged = emulator_vm_bugged, .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, .read_std = emulator_read_std, @@ -8145,7 +8355,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * If the mapping is invalid in guest, let cpu retry * it to generate fault. 
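
With the reworked emulator_pio_in_out() above, in-kernel devices are tried first and only unhandled ports are punted to userspace via KVM_EXIT_IO, with the data staged in the shared kvm_run page at run->io.data_offset. A sketch of the userspace side; emulate_in() and emulate_out() are hypothetical VMM helpers:

#include <stdint.h>
#include <linux/kvm.h>

void emulate_out(uint16_t port, const uint8_t *data, uint32_t len);  /* hypothetical */
void emulate_in(uint16_t port, uint8_t *data, uint32_t len);         /* hypothetical */

static void handle_pio_exit(struct kvm_run *run)
{
        uint8_t *data = (uint8_t *)run + run->io.data_offset;
        uint32_t len = run->io.size * run->io.count;

        if (run->io.direction == KVM_EXIT_IO_OUT)
                emulate_out(run->io.port, data, len);  /* consume guest writes */
        else
                emulate_in(run->io.port, data, len);   /* fill data for guest reads */
}
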
*/ - if (gpa == UNMAPPED_GVA) + if (gpa == INVALID_GPA) return true; } @@ -8672,11 +8882,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu) /* For size less than 4 we merge, else we zero extend */ val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0; - /* - * Since vcpu->arch.pio.count == 1 let emulator_pio_in perform - * the copy and tracing - */ - emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1); + complete_emulator_pio_in(vcpu, &val); kvm_rax_write(vcpu, val); return kvm_skip_emulated_instruction(vcpu); @@ -8751,7 +8957,7 @@ static void kvm_hyperv_tsc_notifier(void) /* TSC frequency always matches when on Hyper-V */ for_each_present_cpu(cpu) per_cpu(cpu_tsc_khz, cpu) = tsc_khz; - kvm_max_guest_tsc_khz = tsc_khz; + kvm_caps.max_guest_tsc_khz = tsc_khz; list_for_each_entry(kvm, &vm_list, vm_list) { __kvm_start_pvclock_update(kvm); @@ -8952,25 +9158,23 @@ static struct notifier_block pvclock_gtod_notifier = { int kvm_arch_init(void *opaque) { struct kvm_x86_init_ops *ops = opaque; + u64 host_pat; int r; if (kvm_x86_ops.hardware_enable) { pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name); - r = -EEXIST; - goto out; + return -EEXIST; } if (!ops->cpu_has_kvm_support()) { pr_err_ratelimited("kvm: no hardware support for '%s'\n", ops->runtime_ops->name); - r = -EOPNOTSUPP; - goto out; + return -EOPNOTSUPP; } if (ops->disabled_by_bios()) { pr_err_ratelimited("kvm: support for '%s' disabled by bios\n", ops->runtime_ops->name); - r = -EOPNOTSUPP; - goto out; + return -EOPNOTSUPP; } /* @@ -8980,27 +9184,37 @@ int kvm_arch_init(void *opaque) */ if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) { printk(KERN_ERR "kvm: inadequate fpu\n"); - r = -EOPNOTSUPP; - goto out; + return -EOPNOTSUPP; } if (IS_ENABLED(CONFIG_PREEMPT_RT) && !boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n"); - r = -EOPNOTSUPP; - goto out; + return -EOPNOTSUPP; } - r = -ENOMEM; + /* + * KVM assumes that PAT entry '0' encodes WB memtype and simply zeroes + * the PAT bits in SPTEs. Bail if PAT[0] is programmed to something + * other than WB. Note, EPT doesn't utilize the PAT, but don't bother + * with an exception. PAT[0] is set to WB on RESET and also by the + * kernel, i.e. failure indicates a kernel bug or broken firmware. 
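
The PAT sanity check that follows relies on the architectural IA32_PAT layout: eight memory-type fields, with entry 0 in bits 2:0 and the encoding 6 meaning write-back. A tiny sketch of the decode, stated as an illustration of the SDM encoding rather than as kernel code:

#include <stdint.h>
#include <stdbool.h>

/* PAT memory-type encodings per the SDM: 0=UC, 1=WC, 4=WT, 5=WP, 6=WB, 7=UC-. */
static bool pat0_is_wb(uint64_t ia32_pat)
{
        return (ia32_pat & 0x7) == 0x6;
}
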
+	 */
+	if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) ||
+	    (host_pat & GENMASK(2, 0)) != 6) {
+		pr_err("kvm: host PAT[0] is not WB\n");
+		return -EIO;
+	}
 	x86_emulator_cache = kvm_alloc_emulator_cache();
 	if (!x86_emulator_cache) {
 		pr_err("kvm: failed to allocate cache for x86 emulator\n");
-		goto out;
+		return -ENOMEM;
 	}
 
 	user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
 	if (!user_return_msrs) {
 		printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
+		r = -ENOMEM;
 		goto out_free_x86_emulator_cache;
 	}
 	kvm_nr_uret_msrs = 0;
@@ -9013,7 +9227,7 @@ int kvm_arch_init(void *opaque)
 
 	if (boot_cpu_has(X86_FEATURE_XSAVE)) {
 		host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-		supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
+		kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
 	}
 
 	if (pi_inject_timer == -1)
@@ -9031,7 +9245,6 @@ out_free_percpu:
 	free_percpu(user_return_msrs);
 out_free_x86_emulator_cache:
 	kmem_cache_destroy(x86_emulator_cache);
-out:
 	return r;
 }
 
@@ -9406,7 +9619,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
 	if (!lapic_in_kernel(vcpu))
 		return;
 
-	if (vcpu->arch.apicv_active)
+	if (vcpu->arch.apic->apicv_active)
 		return;
 
 	if (!vcpu->arch.apic->vapic_addr)
@@ -9435,6 +9648,11 @@ int kvm_check_nested_events(struct kvm_vcpu *vcpu)
 
 static void kvm_inject_exception(struct kvm_vcpu *vcpu)
 {
+	trace_kvm_inj_exception(vcpu->arch.exception.nr,
+				vcpu->arch.exception.has_error_code,
+				vcpu->arch.exception.error_code,
+				vcpu->arch.exception.injected);
+
 	if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
 		vcpu->arch.exception.error_code = false;
 	static_call(kvm_x86_queue_exception)(vcpu);
@@ -9470,7 +9688,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
 			static_call(kvm_x86_inject_nmi)(vcpu);
 			can_inject = false;
 		} else if (vcpu->arch.interrupt.injected) {
-			static_call(kvm_x86_inject_irq)(vcpu);
+			static_call(kvm_x86_inject_irq)(vcpu, true);
 			can_inject = false;
 		}
 	}
@@ -9492,13 +9710,6 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
 
 	/* try to inject new event if pending */
 	if (vcpu->arch.exception.pending) {
-		trace_kvm_inj_exception(vcpu->arch.exception.nr,
-					vcpu->arch.exception.has_error_code,
-					vcpu->arch.exception.error_code);
-
-		vcpu->arch.exception.pending = false;
-		vcpu->arch.exception.injected = true;
-
 		if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
 			__kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
 					     X86_EFLAGS_RF);
@@ -9512,6 +9723,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
 		}
 
 		kvm_inject_exception(vcpu);
+
+		vcpu->arch.exception.pending = false;
+		vcpu->arch.exception.injected = true;
+
 		can_inject = false;
 	}
 
@@ -9564,7 +9779,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
 			goto out;
 		if (r) {
 			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
-			static_call(kvm_x86_inject_irq)(vcpu);
+			static_call(kvm_x86_inject_irq)(vcpu, false);
 			WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
 		}
 		if (kvm_cpu_has_injectable_intr(vcpu))
@@ -9857,6 +10072,7 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm)
 
 void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
 {
+	struct kvm_lapic *apic = vcpu->arch.apic;
 	bool activate;
 
 	if (!lapic_in_kernel(vcpu))
@@ -9865,12 +10081,14 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
 	down_read(&vcpu->kvm->arch.apicv_update_lock);
 	preempt_disable();
 
-	activate = kvm_vcpu_apicv_activated(vcpu);
+	/* Do not activate APICV when APIC is disabled */
+	activate = kvm_vcpu_apicv_activated(vcpu) &&
+		   (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED);
 
-	if (vcpu->arch.apicv_active == activate)
+	if (apic->apicv_active == activate)
 		goto out;
 
-	vcpu->arch.apicv_active = activate;
+	apic->apicv_active = activate;
 	kvm_apic_update_apicv(vcpu);
 	static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu);
@@ -9880,7 +10098,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
 	 * still active when the interrupt got accepted. Make sure
 	 * inject_pending_event() is called to check for that.
 	 */
-	if (!vcpu->arch.apicv_active)
+	if (!apic->apicv_active)
 		kvm_make_request(KVM_REQ_EVENT, vcpu);
 
 out:
@@ -10275,7 +10493,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	 * per-VM state, and responsing vCPUs must wait for the update
 	 * to complete before servicing KVM_REQ_APICV_UPDATE.
 	 */
-	WARN_ON_ONCE(kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu));
+	WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
+		     (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
 
 	exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
 	if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
@@ -10654,8 +10873,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 		r = cui(vcpu);
 		if (r <= 0)
 			goto out;
-	} else
-		WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
+	} else {
+		WARN_ON_ONCE(vcpu->arch.pio.count);
+		WARN_ON_ONCE(vcpu->mmio_needed);
+	}
 
 	if (kvm_run->immediate_exit) {
 		r = -EINTR;
@@ -11183,7 +11404,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 	gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 	tr->physical_address = gpa;
-	tr->valid = gpa != UNMAPPED_GVA;
+	tr->valid = gpa != INVALID_GPA;
 	tr->writeable = 1;
 	tr->usermode = 0;
 
@@ -11276,11 +11497,17 @@ static int sync_regs(struct kvm_vcpu *vcpu)
 
 int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
 {
-	if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
+	if (kvm_check_tsc_unstable() && kvm->created_vcpus)
 		pr_warn_once("kvm: SMP vm created on host with unstable TSC; "
 			     "guest TSC will not be reliable\n");
 
-	return 0;
+	if (!kvm->arch.max_vcpu_ids)
+		kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_IDS;
+
+	if (id >= kvm->arch.max_vcpu_ids)
+		return -EINVAL;
+
+	return static_call(kvm_x86_vcpu_precreate)(kvm);
 }
 
 int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
@@ -11317,7 +11544,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 		 * will ensure the vCPU gets the correct state before VM-Entry.
 		 */
 		if (enable_apicv) {
-			vcpu->arch.apicv_active = true;
+			vcpu->arch.apic->apicv_active = true;
 			kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
 		}
 	} else
@@ -11330,9 +11557,11 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 		goto fail_free_lapic;
 	vcpu->arch.pio_data = page_address(page);
 
-	vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
+	vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64),
 				       GFP_KERNEL_ACCOUNT);
-	if (!vcpu->arch.mce_banks)
+	vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64),
+					    GFP_KERNEL_ACCOUNT);
+	if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks)
 		goto fail_free_pio_data;
 	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
 
@@ -11386,6 +11615,7 @@ free_wbinvd_dirty_mask:
 	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
 fail_free_mce_banks:
 	kfree(vcpu->arch.mce_banks);
+	kfree(vcpu->arch.mci_ctl2_banks);
 fail_free_pio_data:
 	free_page((unsigned long)vcpu->arch.pio_data);
 fail_free_lapic:
@@ -11431,6 +11661,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	kvm_hv_vcpu_uninit(vcpu);
 	kvm_pmu_destroy(vcpu);
 	kfree(vcpu->arch.mce_banks);
+	kfree(vcpu->arch.mci_ctl2_banks);
 	kvm_free_lapic(vcpu);
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
 	kvm_mmu_destroy(vcpu);
@@ -11510,6 +11741,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 	vcpu->arch.smbase = 0x30000;
 
 	vcpu->arch.msr_misc_features_enables = 0;
+	vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
+					  MSR_IA32_MISC_ENABLE_BTS_UNAVAIL;
 
 	__kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP);
 	__kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true);
@@ -11526,7 +11759,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 	 * i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry
 	 * on RESET. But, go through the motions in case that's ever remedied.
 	 */
-	cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1, 0);
+	cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1);
 	kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600);
 
 	static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
@@ -11717,6 +11950,8 @@ int kvm_arch_hardware_setup(void *opaque)
 	if (boot_cpu_has(X86_FEATURE_XSAVES))
 		rdmsrl(MSR_IA32_XSS, host_xss);
 
+	kvm_init_pmu_capability();
+
 	r = ops->hardware_setup();
 	if (r != 0)
 		return r;
@@ -11726,13 +11961,13 @@ int kvm_arch_hardware_setup(void *opaque)
 	kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
 
 	if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
-		supported_xss = 0;
+		kvm_caps.supported_xss = 0;
 
 #define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
 	cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
 #undef __kvm_cpu_cap_has
 
-	if (kvm_has_tsc_control) {
+	if (kvm_caps.has_tsc_control) {
 		/*
 		 * Make sure the user can only configure tsc_khz values that
 		 * fit into a signed integer.
@@ -11740,10 +11975,10 @@ int kvm_arch_hardware_setup(void *opaque)
 		 * be 1 on all machines.
 		 */
 		u64 max = min(0x7fffffffULL,
-			      __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz));
-		kvm_max_guest_tsc_khz = max;
+			      __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz));
+		kvm_caps.max_guest_tsc_khz = max;
 	}
-	kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
+	kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
 	kvm_init_msr_list();
 	return 0;
 }
@@ -12331,7 +12566,8 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 
 bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
 {
-	if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
+	if (kvm_vcpu_apicv_active(vcpu) &&
+	    static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
 		return true;
 
 	return false;
@@ -12773,7 +13009,7 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c
 		      (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);
 
 	if (!(error_code & PFERR_PRESENT_MASK) ||
-	    mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != UNMAPPED_GVA) {
+	    mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != INVALID_GPA) {
 		/*
 		 * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
 		 * tables probably do not match the TLB. Just proceed
@@ -12998,6 +13234,12 @@ int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
 }
 EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read);
 
+static void advance_sev_es_emulated_pio(struct kvm_vcpu *vcpu, unsigned count, int size)
+{
+	vcpu->arch.sev_pio_count -= count;
+	vcpu->arch.sev_pio_data += count * size;
+}
+
 static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
 			   unsigned int port);
 
@@ -13021,8 +13263,7 @@ static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
 		int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count);
 
 		/* memcpy done already by emulator_pio_out. */
-		vcpu->arch.sev_pio_count -= count;
-		vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size;
+		advance_sev_es_emulated_pio(vcpu, count, size);
 		if (!ret)
 			break;
 
@@ -13038,20 +13279,14 @@ static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
 static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
 			  unsigned int port);
 
-static void advance_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
-{
-	unsigned count = vcpu->arch.pio.count;
-	complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data);
-	vcpu->arch.sev_pio_count -= count;
-	vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size;
-}
-
 static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
 {
+	unsigned count = vcpu->arch.pio.count;
 	int size = vcpu->arch.pio.size;
 	int port = vcpu->arch.pio.port;
 
-	advance_sev_es_emulated_ins(vcpu);
+	complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data);
+	advance_sev_es_emulated_pio(vcpu, count, size);
 	if (vcpu->arch.sev_pio_count)
 		return kvm_sev_es_ins(vcpu, size, port);
 	return 1;
@@ -13063,11 +13298,11 @@ static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
 	for (;;) {
 		unsigned int count =
 			min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
-		if (!__emulator_pio_in(vcpu, size, port, count))
+		if (!emulator_pio_in(vcpu, size, port, vcpu->arch.sev_pio_data, count))
 			break;
 
 		/* Emulation done by the kernel. */
-		advance_sev_es_emulated_ins(vcpu);
+		advance_sev_es_emulated_pio(vcpu, count, size);
 		if (!vcpu->arch.sev_pio_count)
 			return 1;
 	}
@@ -13110,6 +13345,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_kick_vcpu_slowpath);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_doorbell);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 588792f00334..1926d2cb8e79 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -8,6 +8,27 @@
 #include "kvm_cache_regs.h"
 #include "kvm_emulate.h"
 
+struct kvm_caps {
+	/* control of guest tsc rate supported? */
+	bool has_tsc_control;
+	/* maximum supported tsc_khz for guests */
+	u32 max_guest_tsc_khz;
+	/* number of bits of the fractional part of the TSC scaling ratio */
+	u8 tsc_scaling_ratio_frac_bits;
+	/* maximum allowed value of TSC scaling ratio */
+	u64 max_tsc_scaling_ratio;
+	/* 1ull << kvm_caps.tsc_scaling_ratio_frac_bits */
+	u64 default_tsc_scaling_ratio;
+	/* bus lock detection supported? */
+	bool has_bus_lock_exit;
+	/* notify VM exit supported? */
+	bool has_notify_vmexit;
+
+	u64 supported_mce_cap;
+	u64 supported_xcr0;
+	u64 supported_xss;
+};
+
 void kvm_spurious_fault(void);
 
 #define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \
@@ -283,14 +304,15 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
 
 extern u64 host_xcr0;
-extern u64 supported_xcr0;
 extern u64 host_xss;
-extern u64 supported_xss;
+
+extern struct kvm_caps kvm_caps;
+
 extern bool enable_pmu;
 
 static inline bool kvm_mpx_supported(void)
 {
-	return (supported_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR))
+	return (kvm_caps.supported_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR))
 		== (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
 }
 
@@ -344,6 +366,11 @@ static inline bool kvm_cstate_in_guest(struct kvm *kvm)
 	return kvm->arch.cstate_in_guest;
 }
 
+static inline bool kvm_notify_vmexit_enabled(struct kvm *kvm)
+{
+	return kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_ENABLED;
+}
+
 enum kvm_intr_type {
 	/* Values are arbitrary, but must be non-zero. */
 	KVM_HANDLING_IRQ = 1,
@@ -407,7 +434,7 @@ static inline void kvm_machine_check(void)
 void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu);
 void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu);
 int kvm_spec_ctrl_test_value(u64 value);
-bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
 int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
 			      struct x86_exception *e);
 int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva);
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 610beba35907..a0c05ccbf4b1 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -1049,7 +1049,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
 	else
 		vcpu->arch.xen.poll_evtchn = -1;
 
-	set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask);
+	set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask);
 
 	if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) {
 		vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
@@ -1071,7 +1071,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
 	*r = 0;
 out:
 	/* Really, this is only needed in case of timeout */
-	clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask);
+	clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask);
 
 	if (unlikely(sched_poll.nr_ports > 1))
 		kfree(ports);
@@ -1311,7 +1311,7 @@ static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port)
 	int poll_evtchn = vcpu->arch.xen.poll_evtchn;
 
 	if ((poll_evtchn == port || poll_evtchn == -1) &&
-	    test_and_clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask)) {
+	    test_and_clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask)) {
 		kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
 		kvm_vcpu_kick(vcpu);
 	}
@@ -1344,7 +1344,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
 		vcpu = kvm_get_vcpu_by_id(kvm, xe->vcpu_id);
 		if (!vcpu)
 			return -EINVAL;
-		WRITE_ONCE(xe->vcpu_idx, kvm_vcpu_get_idx(vcpu));
+		WRITE_ONCE(xe->vcpu_idx, vcpu->vcpu_idx);
 	}
 
 	if (!vcpu->arch.xen.vcpu_info_cache.active)
@@ -1540,7 +1540,7 @@ int kvm_xen_setup_evtchn(struct kvm *kvm,
 	 */
 	vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu);
 	if (vcpu)
-		e->xen_evtchn.vcpu_idx = kvm_vcpu_get_idx(vcpu);
+		e->xen_evtchn.vcpu_idx = vcpu->vcpu_idx;
 	else
 		e->xen_evtchn.vcpu_idx = -1;