summaryrefslogtreecommitdiff
path: root/arch/x86/kvm/mmu.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-08-19 10:38:36 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2018-08-19 10:38:36 -0700
commite61cf2e3a5b452cfefcb145021f5a8ea88735cc1 (patch)
treebbabaf0d4753d6880ecbaddd8daa0164d49c1c61 /arch/x86/kvm/mmu.c
parent1009aa1205c2c5e9101437dcadfa195708d863bf (diff)
parent28a1f3ac1d0c8558ee4453d9634dad891a6e922e (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull first set of KVM updates from Paolo Bonzini: "PPC: - minor code cleanups x86: - PCID emulation and CR3 caching for shadow page tables - nested VMX live migration - nested VMCS shadowing - optimized IPI hypercall - some optimizations ARM will come next week" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (85 commits) kvm: x86: Set highest physical address bits in non-present/reserved SPTEs KVM/x86: Use CC_SET()/CC_OUT in arch/x86/kvm/vmx.c KVM: X86: Implement PV IPIs in linux guest KVM: X86: Add kvm hypervisor init time platform setup callback KVM: X86: Implement "send IPI" hypercall KVM/x86: Move X86_CR4_OSXSAVE check into kvm_valid_sregs() KVM: x86: Skip pae_root shadow allocation if tdp enabled KVM/MMU: Combine flushing remote tlb in mmu_set_spte() KVM: vmx: skip VMWRITE of HOST_{FS,GS}_BASE when possible KVM: vmx: skip VMWRITE of HOST_{FS,GS}_SEL when possible KVM: vmx: always initialize HOST_{FS,GS}_BASE to zero during setup KVM: vmx: move struct host_state usage to struct loaded_vmcs KVM: vmx: compute need to reload FS/GS/LDT on demand KVM: nVMX: remove a misleading comment regarding vmcs02 fields KVM: vmx: rename __vmx_load_host_state() and vmx_save_host_state() KVM: vmx: add dedicated utility to access guest's kernel_gs_base KVM: vmx: track host_state.loaded using a loaded_vmcs pointer KVM: vmx: refactor segmentation code in vmx_save_host_state() kvm: nVMX: Fix fault priority for VMX operations kvm: nVMX: Fix fault vector for VMX operation at CPL > 0 ...
Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r--arch/x86/kvm/mmu.c531
1 files changed, 442 insertions, 89 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a44e568363a4..a282321329b5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -178,7 +178,24 @@ struct kvm_shadow_walk_iterator {
unsigned index;
};
-#define for_each_shadow_entry(_vcpu, _addr, _walker) \
+static const union kvm_mmu_page_role mmu_base_role_mask = {
+ .cr0_wp = 1,
+ .cr4_pae = 1,
+ .nxe = 1,
+ .smep_andnot_wp = 1,
+ .smap_andnot_wp = 1,
+ .smm = 1,
+ .guest_mode = 1,
+ .ad_disabled = 1,
+};
+
+#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker) \
+ for (shadow_walk_init_using_root(&(_walker), (_vcpu), \
+ (_root), (_addr)); \
+ shadow_walk_okay(&(_walker)); \
+ shadow_walk_next(&(_walker)))
+
+#define for_each_shadow_entry(_vcpu, _addr, _walker) \
for (shadow_walk_init(&(_walker), _vcpu, _addr); \
shadow_walk_okay(&(_walker)); \
shadow_walk_next(&(_walker)))
@@ -221,7 +238,20 @@ static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
PT64_EPT_EXECUTABLE_MASK;
static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
+/*
+ * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
+ * to guard against L1TF attacks.
+ */
+static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
+
+/*
+ * The number of high-order 1 bits to use in the mask above.
+ */
+static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
+
static void mmu_spte_set(u64 *sptep, u64 spte);
+static union kvm_mmu_page_role
+kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
{
@@ -308,9 +338,13 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
{
unsigned int gen = kvm_current_mmio_generation(vcpu);
u64 mask = generation_mmio_spte_mask(gen);
+ u64 gpa = gfn << PAGE_SHIFT;
access &= ACC_WRITE_MASK | ACC_USER_MASK;
- mask |= shadow_mmio_value | access | gfn << PAGE_SHIFT;
+ mask |= shadow_mmio_value | access;
+ mask |= gpa | shadow_nonpresent_or_rsvd_mask;
+ mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
+ << shadow_nonpresent_or_rsvd_mask_len;
trace_mark_mmio_spte(sptep, gfn, access, gen);
mmu_spte_set(sptep, mask);
@@ -323,8 +357,14 @@ static bool is_mmio_spte(u64 spte)
static gfn_t get_mmio_spte_gfn(u64 spte)
{
- u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask;
- return (spte & ~mask) >> PAGE_SHIFT;
+ u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask |
+ shadow_nonpresent_or_rsvd_mask;
+ u64 gpa = spte & ~mask;
+
+ gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
+ & shadow_nonpresent_or_rsvd_mask;
+
+ return gpa >> PAGE_SHIFT;
}
static unsigned get_mmio_spte_access(u64 spte)
@@ -381,7 +421,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
-static void kvm_mmu_clear_all_pte_masks(void)
+static void kvm_mmu_reset_all_pte_masks(void)
{
shadow_user_mask = 0;
shadow_accessed_mask = 0;
@@ -391,6 +431,18 @@ static void kvm_mmu_clear_all_pte_masks(void)
shadow_mmio_mask = 0;
shadow_present_mask = 0;
shadow_acc_track_mask = 0;
+
+ /*
+ * If the CPU has 46 or less physical address bits, then set an
+ * appropriate mask to guard against L1TF attacks. Otherwise, it is
+ * assumed that the CPU is not vulnerable to L1TF.
+ */
+ if (boot_cpu_data.x86_phys_bits <
+ 52 - shadow_nonpresent_or_rsvd_mask_len)
+ shadow_nonpresent_or_rsvd_mask =
+ rsvd_bits(boot_cpu_data.x86_phys_bits -
+ shadow_nonpresent_or_rsvd_mask_len,
+ boot_cpu_data.x86_phys_bits - 1);
}
static int is_cpuid_PSE36(void)
@@ -1986,7 +2038,7 @@ static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
return 0;
}
-static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root)
{
}
@@ -2117,12 +2169,8 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
- if (sp->role.cr4_pae != !!is_pae(vcpu)) {
- kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
- return false;
- }
-
- if (vcpu->arch.mmu.sync_page(vcpu, sp) == 0) {
+ if (sp->role.cr4_pae != !!is_pae(vcpu)
+ || vcpu->arch.mmu.sync_page(vcpu, sp) == 0) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
return false;
}
@@ -2392,11 +2440,12 @@ out:
return sp;
}
-static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
- struct kvm_vcpu *vcpu, u64 addr)
+static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
+ struct kvm_vcpu *vcpu, hpa_t root,
+ u64 addr)
{
iterator->addr = addr;
- iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
+ iterator->shadow_addr = root;
iterator->level = vcpu->arch.mmu.shadow_root_level;
if (iterator->level == PT64_ROOT_4LEVEL &&
@@ -2405,6 +2454,12 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
--iterator->level;
if (iterator->level == PT32E_ROOT_LEVEL) {
+ /*
+ * prev_root is currently only used for 64-bit hosts. So only
+ * the active root_hpa is valid here.
+ */
+ BUG_ON(root != vcpu->arch.mmu.root_hpa);
+
iterator->shadow_addr
= vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
@@ -2414,6 +2469,13 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
}
}
+static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
+ struct kvm_vcpu *vcpu, u64 addr)
+{
+ shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu.root_hpa,
+ addr);
+}
+
static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
{
if (iterator->level < PT_PAGE_TABLE_LEVEL)
@@ -2702,6 +2764,45 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
kvm_unsync_page(vcpu, sp);
}
+ /*
+ * We need to ensure that the marking of unsync pages is visible
+ * before the SPTE is updated to allow writes because
+ * kvm_mmu_sync_roots() checks the unsync flags without holding
+ * the MMU lock and so can race with this. If the SPTE was updated
+ * before the page had been marked as unsync-ed, something like the
+ * following could happen:
+ *
+ * CPU 1 CPU 2
+ * ---------------------------------------------------------------------
+ * 1.2 Host updates SPTE
+ * to be writable
+ * 2.1 Guest writes a GPTE for GVA X.
+ * (GPTE being in the guest page table shadowed
+ * by the SP from CPU 1.)
+ * This reads SPTE during the page table walk.
+ * Since SPTE.W is read as 1, there is no
+ * fault.
+ *
+ * 2.2 Guest issues TLB flush.
+ * That causes a VM Exit.
+ *
+ * 2.3 kvm_mmu_sync_pages() reads sp->unsync.
+ * Since it is false, so it just returns.
+ *
+ * 2.4 Guest accesses GVA X.
+ * Since the mapping in the SP was not updated,
+ * so the old mapping for GVA X incorrectly
+ * gets used.
+ * 1.1 Host marks SP
+ * as unsync
+ * (sp->unsync = true)
+ *
+ * The write barrier below ensures that 1.1 happens before 1.2 and thus
+ * the situation in 2.4 does not arise. The implicit barrier in 2.2
+ * pairs with this write barrier.
+ */
+ smp_wmb();
+
return false;
}
@@ -2724,6 +2825,10 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
return true;
}
+/* Bits which may be returned by set_spte() */
+#define SET_SPTE_WRITE_PROTECTED_PT BIT(0)
+#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
+
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int level,
gfn_t gfn, kvm_pfn_t pfn, bool speculative,
@@ -2800,7 +2905,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
pgprintk("%s: found shadow page for %llx, marking ro\n",
__func__, gfn);
- ret = 1;
+ ret |= SET_SPTE_WRITE_PROTECTED_PT;
pte_access &= ~ACC_WRITE_MASK;
spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
}
@@ -2816,7 +2921,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
set_pte:
if (mmu_spte_update(sptep, spte))
- kvm_flush_remote_tlbs(vcpu->kvm);
+ ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
done:
return ret;
}
@@ -2827,7 +2932,9 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
{
int was_rmapped = 0;
int rmap_count;
+ int set_spte_ret;
int ret = RET_PF_RETRY;
+ bool flush = false;
pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
*sptep, write_fault, gfn);
@@ -2844,22 +2951,25 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
child = page_header(pte & PT64_BASE_ADDR_MASK);
drop_parent_pte(child, sptep);
- kvm_flush_remote_tlbs(vcpu->kvm);
+ flush = true;
} else if (pfn != spte_to_pfn(*sptep)) {
pgprintk("hfn old %llx new %llx\n",
spte_to_pfn(*sptep), pfn);
drop_spte(vcpu->kvm, sptep);
- kvm_flush_remote_tlbs(vcpu->kvm);
+ flush = true;
} else
was_rmapped = 1;
}
- if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
- true, host_writable)) {
+ set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
+ speculative, true, host_writable);
+ if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
if (write_fault)
ret = RET_PF_EMULATE;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
+ if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
+ kvm_flush_remote_tlbs(vcpu->kvm);
if (unlikely(is_mmio_spte(*sptep)))
ret = RET_PF_EMULATE;
@@ -3358,26 +3468,47 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
*root_hpa = INVALID_PAGE;
}
-void kvm_mmu_free_roots(struct kvm_vcpu *vcpu)
+/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
+void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free)
{
int i;
LIST_HEAD(invalid_list);
struct kvm_mmu *mmu = &vcpu->arch.mmu;
+ bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
- if (!VALID_PAGE(mmu->root_hpa))
- return;
+ BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
+
+ /* Before acquiring the MMU lock, see if we need to do any real work. */
+ if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) {
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
+ VALID_PAGE(mmu->prev_roots[i].hpa))
+ break;
+
+ if (i == KVM_MMU_NUM_PREV_ROOTS)
+ return;
+ }
spin_lock(&vcpu->kvm->mmu_lock);
- if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
- (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
- mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, &invalid_list);
- } else {
- for (i = 0; i < 4; ++i)
- if (mmu->pae_root[i] != 0)
- mmu_free_root_page(vcpu->kvm, &mmu->pae_root[i],
- &invalid_list);
- mmu->root_hpa = INVALID_PAGE;
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
+ mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa,
+ &invalid_list);
+
+ if (free_active_root) {
+ if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
+ (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
+ mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
+ &invalid_list);
+ } else {
+ for (i = 0; i < 4; ++i)
+ if (mmu->pae_root[i] != 0)
+ mmu_free_root_page(vcpu->kvm,
+ &mmu->pae_root[i],
+ &invalid_list);
+ mmu->root_hpa = INVALID_PAGE;
+ }
}
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
@@ -3546,7 +3677,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
return mmu_alloc_shadow_roots(vcpu);
}
-static void mmu_sync_roots(struct kvm_vcpu *vcpu)
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
{
int i;
struct kvm_mmu_page *sp;
@@ -3558,14 +3689,39 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
return;
vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
- kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
+
if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
+
sp = page_header(root);
+
+ /*
+ * Even if another CPU was marking the SP as unsync-ed
+ * simultaneously, any guest page table changes are not
+ * guaranteed to be visible anyway until this VCPU issues a TLB
+ * flush strictly after those changes are made. We only need to
+ * ensure that the other CPU sets these flags before any actual
+ * changes to the page tables are made. The comments in
+ * mmu_need_write_protect() describe what could go wrong if this
+ * requirement isn't satisfied.
+ */
+ if (!smp_load_acquire(&sp->unsync) &&
+ !smp_load_acquire(&sp->unsync_children))
+ return;
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
+
mmu_sync_children(vcpu, sp);
+
kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
+ spin_unlock(&vcpu->kvm->mmu_lock);
return;
}
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
+
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu.pae_root[i];
@@ -3575,13 +3731,8 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
mmu_sync_children(vcpu, sp);
}
}
- kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
-}
-void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
-{
- spin_lock(&vcpu->kvm->mmu_lock);
- mmu_sync_roots(vcpu);
+ kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
spin_unlock(&vcpu->kvm->mmu_lock);
}
EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
@@ -3948,16 +4099,107 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu,
context->update_pte = nonpaging_update_pte;
context->root_level = 0;
context->shadow_root_level = PT32E_ROOT_LEVEL;
- context->root_hpa = INVALID_PAGE;
context->direct_map = true;
context->nx = false;
}
-void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu)
+/*
+ * Find out if a previously cached root matching the new CR3/role is available.
+ * The current root is also inserted into the cache.
+ * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
+ * returned.
+ * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
+ * false is returned. This root should now be freed by the caller.
+ */
+static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
+ union kvm_mmu_page_role new_role)
+{
+ uint i;
+ struct kvm_mmu_root_info root;
+ struct kvm_mmu *mmu = &vcpu->arch.mmu;
+
+ root.cr3 = mmu->get_cr3(vcpu);
+ root.hpa = mmu->root_hpa;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ swap(root, mmu->prev_roots[i]);
+
+ if (new_cr3 == root.cr3 && VALID_PAGE(root.hpa) &&
+ page_header(root.hpa) != NULL &&
+ new_role.word == page_header(root.hpa)->role.word)
+ break;
+ }
+
+ mmu->root_hpa = root.hpa;
+
+ return i < KVM_MMU_NUM_PREV_ROOTS;
+}
+
+static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
+ union kvm_mmu_page_role new_role,
+ bool skip_tlb_flush)
{
- kvm_mmu_free_roots(vcpu);
+ struct kvm_mmu *mmu = &vcpu->arch.mmu;
+
+ /*
+ * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
+ * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
+ * later if necessary.
+ */
+ if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
+ mmu->root_level >= PT64_ROOT_4LEVEL) {
+ if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
+ return false;
+
+ if (cached_root_available(vcpu, new_cr3, new_role)) {
+ /*
+ * It is possible that the cached previous root page is
+ * obsolete because of a change in the MMU
+ * generation number. However, that is accompanied by
+ * KVM_REQ_MMU_RELOAD, which will free the root that we
+ * have set here and allocate a new one.
+ */
+
+ kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
+ if (!skip_tlb_flush) {
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ kvm_x86_ops->tlb_flush(vcpu, true);
+ }
+
+ /*
+ * The last MMIO access's GVA and GPA are cached in the
+ * VCPU. When switching to a new CR3, that GVA->GPA
+ * mapping may no longer be valid. So clear any cached
+ * MMIO info even when we don't need to sync the shadow
+ * page tables.
+ */
+ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
+
+ __clear_sp_write_flooding_count(
+ page_header(mmu->root_hpa));
+
+ return true;
+ }
+ }
+
+ return false;
}
+static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
+ union kvm_mmu_page_role new_role,
+ bool skip_tlb_flush)
+{
+ if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush))
+ kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT);
+}
+
+void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
+{
+ __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu),
+ skip_tlb_flush);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3);
+
static unsigned long get_cr3(struct kvm_vcpu *vcpu)
{
return kvm_read_cr3(vcpu);
@@ -4432,7 +4674,6 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
context->invlpg = paging64_invlpg;
context->update_pte = paging64_update_pte;
context->shadow_root_level = level;
- context->root_hpa = INVALID_PAGE;
context->direct_map = false;
}
@@ -4462,7 +4703,6 @@ static void paging32_init_context(struct kvm_vcpu *vcpu,
context->invlpg = paging32_invlpg;
context->update_pte = paging32_update_pte;
context->shadow_root_level = PT32E_ROOT_LEVEL;
- context->root_hpa = INVALID_PAGE;
context->direct_map = false;
}
@@ -4472,20 +4712,32 @@ static void paging32E_init_context(struct kvm_vcpu *vcpu,
paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
}
+static union kvm_mmu_page_role
+kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu)
+{
+ union kvm_mmu_page_role role = {0};
+
+ role.guest_mode = is_guest_mode(vcpu);
+ role.smm = is_smm(vcpu);
+ role.ad_disabled = (shadow_accessed_mask == 0);
+ role.level = kvm_x86_ops->get_tdp_level(vcpu);
+ role.direct = true;
+ role.access = ACC_ALL;
+
+ return role;
+}
+
static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = &vcpu->arch.mmu;
- context->base_role.word = 0;
- context->base_role.guest_mode = is_guest_mode(vcpu);
- context->base_role.smm = is_smm(vcpu);
- context->base_role.ad_disabled = (shadow_accessed_mask == 0);
+ context->base_role.word = mmu_base_role_mask.word &
+ kvm_calc_tdp_mmu_root_page_role(vcpu).word;
context->page_fault = tdp_page_fault;
context->sync_page = nonpaging_sync_page;
context->invlpg = nonpaging_invlpg;
context->update_pte = nonpaging_update_pte;
context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
- context->root_hpa = INVALID_PAGE;
context->direct_map = true;
context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
context->get_cr3 = get_cr3;
@@ -4520,13 +4772,36 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
reset_tdp_shadow_zero_bits_mask(vcpu, context);
}
-void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
+static union kvm_mmu_page_role
+kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu)
{
+ union kvm_mmu_page_role role = {0};
bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
- struct kvm_mmu *context = &vcpu->arch.mmu;
- MMU_WARN_ON(VALID_PAGE(context->root_hpa));
+ role.nxe = is_nx(vcpu);
+ role.cr4_pae = !!is_pae(vcpu);
+ role.cr0_wp = is_write_protection(vcpu);
+ role.smep_andnot_wp = smep && !is_write_protection(vcpu);
+ role.smap_andnot_wp = smap && !is_write_protection(vcpu);
+ role.guest_mode = is_guest_mode(vcpu);
+ role.smm = is_smm(vcpu);
+ role.direct = !is_paging(vcpu);
+ role.access = ACC_ALL;
+
+ if (!is_long_mode(vcpu))
+ role.level = PT32E_ROOT_LEVEL;
+ else if (is_la57_mode(vcpu))
+ role.level = PT64_ROOT_5LEVEL;
+ else
+ role.level = PT64_ROOT_4LEVEL;
+
+ return role;
+}
+
+void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *context = &vcpu->arch.mmu;
if (!is_paging(vcpu))
nonpaging_init_context(vcpu, context);
@@ -4537,26 +4812,34 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
else
paging32_init_context(vcpu, context);
- context->base_role.nxe = is_nx(vcpu);
- context->base_role.cr4_pae = !!is_pae(vcpu);
- context->base_role.cr0_wp = is_write_protection(vcpu);
- context->base_role.smep_andnot_wp
- = smep && !is_write_protection(vcpu);
- context->base_role.smap_andnot_wp
- = smap && !is_write_protection(vcpu);
- context->base_role.guest_mode = is_guest_mode(vcpu);
- context->base_role.smm = is_smm(vcpu);
+ context->base_role.word = mmu_base_role_mask.word &
+ kvm_calc_shadow_mmu_root_page_role(vcpu).word;
reset_shadow_zero_bits_mask(vcpu, context);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
+static union kvm_mmu_page_role
+kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty)
+{
+ union kvm_mmu_page_role role = vcpu->arch.mmu.base_role;
+
+ role.level = PT64_ROOT_4LEVEL;
+ role.direct = false;
+ role.ad_disabled = !accessed_dirty;
+ role.guest_mode = true;
+ role.access = ACC_ALL;
+
+ return role;
+}
+
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
- bool accessed_dirty)
+ bool accessed_dirty, gpa_t new_eptp)
{
struct kvm_mmu *context = &vcpu->arch.mmu;
+ union kvm_mmu_page_role root_page_role =
+ kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty);
- MMU_WARN_ON(VALID_PAGE(context->root_hpa));
-
+ __kvm_mmu_new_cr3(vcpu, new_eptp, root_page_role, false);
context->shadow_root_level = PT64_ROOT_4LEVEL;
context->nx = true;
@@ -4567,10 +4850,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
context->invlpg = ept_invlpg;
context->update_pte = ept_update_pte;
context->root_level = PT64_ROOT_4LEVEL;
- context->root_hpa = INVALID_PAGE;
context->direct_map = false;
- context->base_role.ad_disabled = !accessed_dirty;
- context->base_role.guest_mode = 1;
+ context->base_role.word = root_page_role.word & mmu_base_role_mask.word;
update_permission_bitmask(vcpu, context, true);
update_pkru_bitmask(vcpu, context, true);
update_last_nonleaf_level(vcpu, context);
@@ -4633,8 +4914,17 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
update_last_nonleaf_level(vcpu, g_context);
}
-static void init_kvm_mmu(struct kvm_vcpu *vcpu)
+void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
{
+ if (reset_roots) {
+ uint i;
+
+ vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
+ }
+
if (mmu_is_nested(vcpu))
init_kvm_nested_mmu(vcpu);
else if (tdp_enabled)
@@ -4642,11 +4932,21 @@ static void init_kvm_mmu(struct kvm_vcpu *vcpu)
else
init_kvm_softmmu(vcpu);
}
+EXPORT_SYMBOL_GPL(kvm_init_mmu);
+
+static union kvm_mmu_page_role
+kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
+{
+ if (tdp_enabled)
+ return kvm_calc_tdp_mmu_root_page_role(vcpu);
+ else
+ return kvm_calc_shadow_mmu_root_page_role(vcpu);
+}
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
kvm_mmu_unload(vcpu);
- init_kvm_mmu(vcpu);
+ kvm_init_mmu(vcpu, true);
}
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
@@ -4661,8 +4961,8 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
kvm_mmu_sync_roots(vcpu);
if (r)
goto out;
- /* set_cr3() should ensure TLB has been flushed */
- vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
+ kvm_mmu_load_cr3(vcpu);
+ kvm_x86_ops->tlb_flush(vcpu, true);
out:
return r;
}
@@ -4670,7 +4970,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load);
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
- kvm_mmu_free_roots(vcpu);
+ kvm_mmu_free_roots(vcpu, KVM_MMU_ROOTS_ALL);
WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
}
EXPORT_SYMBOL_GPL(kvm_mmu_unload);
@@ -4823,16 +5123,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
u64 entry, gentry, *spte;
int npte;
bool remote_flush, local_flush;
- union kvm_mmu_page_role mask = { };
-
- mask.cr0_wp = 1;
- mask.cr4_pae = 1;
- mask.nxe = 1;
- mask.smep_andnot_wp = 1;
- mask.smap_andnot_wp = 1;
- mask.smm = 1;
- mask.guest_mode = 1;
- mask.ad_disabled = 1;
/*
* If we don't have indirect shadow pages, it means no page is
@@ -4876,7 +5166,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
mmu_page_zap_pte(vcpu->kvm, sp, spte);
if (gentry &&
!((sp->role.word ^ vcpu->arch.mmu.base_role.word)
- & mask.word) && rmap_can_add(vcpu))
+ & mmu_base_role_mask.word) && rmap_can_add(vcpu))
mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
if (need_remote_flush(entry, *spte))
remote_flush = true;
@@ -5001,12 +5291,67 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
- vcpu->arch.mmu.invlpg(vcpu, gva);
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ struct kvm_mmu *mmu = &vcpu->arch.mmu;
+ int i;
+
+ /* INVLPG on a * non-canonical address is a NOP according to the SDM. */
+ if (is_noncanonical_address(gva, vcpu))
+ return;
+
+ mmu->invlpg(vcpu, gva, mmu->root_hpa);
+
+ /*
+ * INVLPG is required to invalidate any global mappings for the VA,
+ * irrespective of PCID. Since it would take us roughly similar amount
+ * of work to determine whether any of the prev_root mappings of the VA
+ * is marked global, or to just sync it blindly, so we might as well
+ * just always sync it.
+ *
+ * Mappings not reachable via the current cr3 or the prev_roots will be
+ * synced when switching to that cr3, so nothing needs to be done here
+ * for them.
+ */
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (VALID_PAGE(mmu->prev_roots[i].hpa))
+ mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
+
+ kvm_x86_ops->tlb_flush_gva(vcpu, gva);
++vcpu->stat.invlpg;
}
EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
+void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
+{
+ struct kvm_mmu *mmu = &vcpu->arch.mmu;
+ bool tlb_flush = false;
+ uint i;
+
+ if (pcid == kvm_get_active_pcid(vcpu)) {
+ mmu->invlpg(vcpu, gva, mmu->root_hpa);
+ tlb_flush = true;
+ }
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
+ pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) {
+ mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
+ tlb_flush = true;
+ }
+ }
+
+ if (tlb_flush)
+ kvm_x86_ops->tlb_flush_gva(vcpu, gva);
+
+ ++vcpu->stat.invlpg;
+
+ /*
+ * Mappings not reachable via the current cr3 or the prev_roots will be
+ * synced when switching to that cr3, so nothing needs to be done here
+ * for them.
+ */
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
+
void kvm_enable_tdp(void)
{
tdp_enabled = true;
@@ -5030,6 +5375,9 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
struct page *page;
int i;
+ if (tdp_enabled)
+ return 0;
+
/*
* When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
* Therefore we need to allocate shadow page tables in the first
@@ -5048,11 +5396,16 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
+ uint i;
+
vcpu->arch.walk_mmu = &vcpu->arch.mmu;
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
vcpu->arch.mmu.translate_gpa = translate_gpa;
vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
+
return alloc_mmu_pages(vcpu);
}
@@ -5060,7 +5413,7 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu)
{
MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
- init_kvm_mmu(vcpu);
+ kvm_init_mmu(vcpu, true);
}
static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
@@ -5500,7 +5853,7 @@ int kvm_mmu_module_init(void)
{
int ret = -ENOMEM;
- kvm_mmu_clear_all_pte_masks();
+ kvm_mmu_reset_all_pte_masks();
pte_list_desc_cache = kmem_cache_create("pte_list_desc",
sizeof(struct pte_list_desc),