Diffstat (limited to 'arch/x86/kvm/mmu/paging_tmpl.h')
-rw-r--r--  arch/x86/kvm/mmu/paging_tmpl.h | 275
1 file changed, 68 insertions(+), 207 deletions(-)
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 57f0b75c80f9..0662e0278e70 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -324,7 +324,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
trace_kvm_mmu_pagetable_walk(addr, access);
retry_walk:
walker->level = mmu->cpu_role.base.level;
- pte = mmu->get_guest_pgd(vcpu);
+ pte = kvm_mmu_get_guest_pgd(vcpu, mmu);
have_ad = PT_HAVE_ACCESSED_DIRTY(mmu);
#if PTTYPE == 64
@@ -519,7 +519,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
static bool
FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- u64 *spte, pt_element_t gpte, bool no_dirty_log)
+ u64 *spte, pt_element_t gpte)
{
struct kvm_memory_slot *slot;
unsigned pte_access;
@@ -535,8 +535,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
pte_access = sp->role.access & FNAME(gpte_access)(gpte);
FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
- slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn,
- no_dirty_log && (pte_access & ACC_WRITE_MASK));
+ slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, pte_access & ACC_WRITE_MASK);
if (!slot)
return false;
@@ -605,7 +604,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
if (is_shadow_present_pte(*spte))
continue;
- if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true))
+ if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i]))
break;
}
}
@@ -685,8 +684,17 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
if (sp != ERR_PTR(-EEXIST))
link_shadow_page(vcpu, it.sptep, sp);
+
+ if (fault->write && table_gfn == fault->gfn)
+ fault->write_fault_to_shadow_pgtable = true;
}
+ /*
+ * Adjust the hugepage size _after_ resolving indirect shadow pages.
+ * KVM doesn't support mapping hugepages into the guest for gfns that
+ * are being shadowed by KVM, i.e. allocating a new shadow page may
+ * affect the allowed hugepage size.
+ */
kvm_mmu_hugepage_adjust(vcpu, fault);
trace_kvm_mmu_spte_requested(fault);
@@ -731,46 +739,6 @@ out_gpte_changed:
return RET_PF_RETRY;
}
- /*
- * To see whether the mapped gfn can write its page table in the current
- * mapping.
- *
- * It is the helper function of FNAME(page_fault). When guest uses large page
- * size to map the writable gfn which is used as current page table, we should
- * force kvm to use small page size to map it because new shadow page will be
- * created when kvm establishes shadow page table that stop kvm using large
- * page size. Do it early can avoid unnecessary #PF and emulation.
- *
- * @write_fault_to_shadow_pgtable will return true if the fault gfn is
- * currently used as its page table.
- *
- * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
- * since the PDPT is always shadowed, that means, we can not use large page
- * size to map the gfn which is used as PDPT.
- */
-static bool
-FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
- struct guest_walker *walker, bool user_fault,
- bool *write_fault_to_shadow_pgtable)
-{
- int level;
- gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
- bool self_changed = false;
-
- if (!(walker->pte_access & ACC_WRITE_MASK ||
- (!is_cr0_wp(vcpu->arch.mmu) && !user_fault)))
- return false;
-
- for (level = walker->level; level <= walker->max_level; level++) {
- gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];
-
- self_changed |= !(gfn & mask);
- *write_fault_to_shadow_pgtable |= !gfn;
- }
-
- return self_changed;
-}
-
/*
* Page fault handler. There are several causes for a page fault:
* - there is no shadow pte for the guest pte
@@ -789,7 +757,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
{
struct guest_walker walker;
int r;
- bool is_self_change_mapping;
pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code);
WARN_ON_ONCE(fault->is_tdp);
@@ -814,6 +781,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
}
fault->gfn = walker.gfn;
+ fault->max_level = walker.level;
fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
if (page_fault_handle_page_track(vcpu, fault)) {
@@ -825,16 +793,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (r)
return r;
- vcpu->arch.write_fault_to_shadow_pgtable = false;
-
- is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
- &walker, fault->user, &vcpu->arch.write_fault_to_shadow_pgtable);
-
- if (is_self_change_mapping)
- fault->max_level = PG_LEVEL_4K;
- else
- fault->max_level = walker.level;
-
r = kvm_faultin_pfn(vcpu, fault, walker.pte_access);
if (r != RET_PF_CONTINUE)
return r;
@@ -887,64 +845,6 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
}
-static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
-{
- struct kvm_shadow_walk_iterator iterator;
- struct kvm_mmu_page *sp;
- u64 old_spte;
- int level;
- u64 *sptep;
-
- vcpu_clear_mmio_info(vcpu, gva);
-
- /*
- * No need to check return value here, rmap_can_add() can
- * help us to skip pte prefetch later.
- */
- mmu_topup_memory_caches(vcpu, true);
-
- if (!VALID_PAGE(root_hpa)) {
- WARN_ON(1);
- return;
- }
-
- write_lock(&vcpu->kvm->mmu_lock);
- for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) {
- level = iterator.level;
- sptep = iterator.sptep;
-
- sp = sptep_to_sp(sptep);
- old_spte = *sptep;
- if (is_last_spte(old_spte, level)) {
- pt_element_t gpte;
- gpa_t pte_gpa;
-
- if (!sp->unsync)
- break;
-
- pte_gpa = FNAME(get_level1_sp_gpa)(sp);
- pte_gpa += spte_index(sptep) * sizeof(pt_element_t);
-
- mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL);
- if (is_shadow_present_pte(old_spte))
- kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep);
-
- if (!rmap_can_add(vcpu))
- break;
-
- if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
- sizeof(pt_element_t)))
- break;
-
- FNAME(prefetch_gpte)(vcpu, sp, sptep, gpte, false);
- }
-
- if (!sp->unsync_children)
- break;
- }
- write_unlock(&vcpu->kvm->mmu_lock);
-}
-
/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gpa_t addr, u64 access,
@@ -977,114 +877,75 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
* can't change unless all sptes pointing to it are nuked first.
*
* Returns
- * < 0: the sp should be zapped
- * 0: the sp is synced and no tlb flushing is required
- * > 0: the sp is synced and tlb flushing is required
+ * < 0: failed to sync spte
+ * 0: the spte is synced and no tlb flushing is required
+ * > 0: the spte is synced and tlb flushing is required
*/
-static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
{
- union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role;
- int i;
bool host_writable;
gpa_t first_pte_gpa;
- bool flush = false;
-
- /*
- * Ignore various flags when verifying that it's safe to sync a shadow
- * page using the current MMU context.
- *
- * - level: not part of the overall MMU role and will never match as the MMU's
- * level tracks the root level
- * - access: updated based on the new guest PTE
- * - quadrant: not part of the overall MMU role (similar to level)
- */
- const union kvm_mmu_page_role sync_role_ign = {
- .level = 0xf,
- .access = 0x7,
- .quadrant = 0x3,
- .passthrough = 0x1,
- };
+ u64 *sptep, spte;
+ struct kvm_memory_slot *slot;
+ unsigned pte_access;
+ pt_element_t gpte;
+ gpa_t pte_gpa;
+ gfn_t gfn;
- /*
- * Direct pages can never be unsync, and KVM should never attempt to
- * sync a shadow page for a different MMU context, e.g. if the role
- * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the
- * reserved bits checks will be wrong, etc...
- */
- if (WARN_ON_ONCE(sp->role.direct ||
- (sp->role.word ^ root_role.word) & ~sync_role_ign.word))
- return -1;
+ if (WARN_ON_ONCE(!sp->spt[i]))
+ return 0;
first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
+ pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
- for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
- u64 *sptep, spte;
- struct kvm_memory_slot *slot;
- unsigned pte_access;
- pt_element_t gpte;
- gpa_t pte_gpa;
- gfn_t gfn;
-
- if (!sp->spt[i])
- continue;
-
- pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
-
- if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
- sizeof(pt_element_t)))
- return -1;
-
- if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
- flush = true;
- continue;
- }
-
- gfn = gpte_to_gfn(gpte);
- pte_access = sp->role.access;
- pte_access &= FNAME(gpte_access)(gpte);
- FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
-
- if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
- continue;
+ if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
+ sizeof(pt_element_t)))
+ return -1;
- /*
- * Drop the SPTE if the new protections would result in a RWX=0
- * SPTE or if the gfn is changing. The RWX=0 case only affects
- * EPT with execute-only support, i.e. EPT without an effective
- * "present" bit, as all other paging modes will create a
- * read-only SPTE if pte_access is zero.
- */
- if ((!pte_access && !shadow_present_mask) ||
- gfn != kvm_mmu_page_get_gfn(sp, i)) {
- drop_spte(vcpu->kvm, &sp->spt[i]);
- flush = true;
- continue;
- }
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte))
+ return 1;
- /* Update the shadowed access bits in case they changed. */
- kvm_mmu_page_set_access(sp, i, pte_access);
+ gfn = gpte_to_gfn(gpte);
+ pte_access = sp->role.access;
+ pte_access &= FNAME(gpte_access)(gpte);
+ FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
- sptep = &sp->spt[i];
- spte = *sptep;
- host_writable = spte & shadow_host_writable_mask;
- slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- make_spte(vcpu, sp, slot, pte_access, gfn,
- spte_to_pfn(spte), spte, true, false,
- host_writable, &spte);
+ if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
+ return 0;
- flush |= mmu_spte_update(sptep, spte);
+ /*
+ * Drop the SPTE if the new protections would result in a RWX=0
+ * SPTE or if the gfn is changing. The RWX=0 case only affects
+ * EPT with execute-only support, i.e. EPT without an effective
+ * "present" bit, as all other paging modes will create a
+ * read-only SPTE if pte_access is zero.
+ */
+ if ((!pte_access && !shadow_present_mask) ||
+ gfn != kvm_mmu_page_get_gfn(sp, i)) {
+ drop_spte(vcpu->kvm, &sp->spt[i]);
+ return 1;
}
-
/*
- * Note, any flush is purely for KVM's correctness, e.g. when dropping
- * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier
- * unmap or dirty logging event doesn't fail to flush. The guest is
- * responsible for flushing the TLB to ensure any changes in protection
- * bits are recognized, i.e. until the guest flushes or page faults on
- * a relevant address, KVM is architecturally allowed to let vCPUs use
- * cached translations with the old protection bits.
+ * Do nothing if the permissions are unchanged. The existing SPTE is
+ * still valid, and prefetch_invalid_gpte() has verified that the A/D bits
+ * are set in the "new" gPTE, i.e. there is no danger of missing an A/D
+ * update due to A/D bits being set in the SPTE but not the gPTE.
*/
- return flush;
+ if (kvm_mmu_page_get_access(sp, i) == pte_access)
+ return 0;
+
+ /* Update the shadowed access bits in case they changed. */
+ kvm_mmu_page_set_access(sp, i, pte_access);
+
+ sptep = &sp->spt[i];
+ spte = *sptep;
+ host_writable = spte & shadow_host_writable_mask;
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ make_spte(vcpu, sp, slot, pte_access, gfn,
+ spte_to_pfn(spte), spte, true, false,
+ host_writable, &spte);
+
+ return mmu_spte_update(sptep, spte);
}
#undef pt_element_t