diff options
Diffstat (limited to 'arch/x86/kvm/mmu/paging_tmpl.h')
-rw-r--r-- | arch/x86/kvm/mmu/paging_tmpl.h | 275 |
1 files changed, 68 insertions, 207 deletions
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 57f0b75c80f9..0662e0278e70 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -324,7 +324,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, trace_kvm_mmu_pagetable_walk(addr, access); retry_walk: walker->level = mmu->cpu_role.base.level; - pte = mmu->get_guest_pgd(vcpu); + pte = kvm_mmu_get_guest_pgd(vcpu, mmu); have_ad = PT_HAVE_ACCESSED_DIRTY(mmu); #if PTTYPE == 64 @@ -519,7 +519,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker, static bool FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *spte, pt_element_t gpte, bool no_dirty_log) + u64 *spte, pt_element_t gpte) { struct kvm_memory_slot *slot; unsigned pte_access; @@ -535,8 +535,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, pte_access = sp->role.access & FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); - slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, - no_dirty_log && (pte_access & ACC_WRITE_MASK)); + slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, pte_access & ACC_WRITE_MASK); if (!slot) return false; @@ -605,7 +604,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, if (is_shadow_present_pte(*spte)) continue; - if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true)) + if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i])) break; } } @@ -685,8 +684,17 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (sp != ERR_PTR(-EEXIST)) link_shadow_page(vcpu, it.sptep, sp); + + if (fault->write && table_gfn == fault->gfn) + fault->write_fault_to_shadow_pgtable = true; } + /* + * Adjust the hugepage size _after_ resolving indirect shadow pages. + * KVM doesn't support mapping hugepages into the guest for gfns that + * are being shadowed by KVM, i.e. allocating a new shadow page may + * affect the allowed hugepage size. + */ kvm_mmu_hugepage_adjust(vcpu, fault); trace_kvm_mmu_spte_requested(fault); @@ -731,46 +739,6 @@ out_gpte_changed: return RET_PF_RETRY; } - /* - * To see whether the mapped gfn can write its page table in the current - * mapping. - * - * It is the helper function of FNAME(page_fault). When guest uses large page - * size to map the writable gfn which is used as current page table, we should - * force kvm to use small page size to map it because new shadow page will be - * created when kvm establishes shadow page table that stop kvm using large - * page size. Do it early can avoid unnecessary #PF and emulation. - * - * @write_fault_to_shadow_pgtable will return true if the fault gfn is - * currently used as its page table. - * - * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok - * since the PDPT is always shadowed, that means, we can not use large page - * size to map the gfn which is used as PDPT. - */ -static bool -FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, - struct guest_walker *walker, bool user_fault, - bool *write_fault_to_shadow_pgtable) -{ - int level; - gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1); - bool self_changed = false; - - if (!(walker->pte_access & ACC_WRITE_MASK || - (!is_cr0_wp(vcpu->arch.mmu) && !user_fault))) - return false; - - for (level = walker->level; level <= walker->max_level; level++) { - gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1]; - - self_changed |= !(gfn & mask); - *write_fault_to_shadow_pgtable |= !gfn; - } - - return self_changed; -} - /* * Page fault handler. There are several causes for a page fault: * - there is no shadow pte for the guest pte @@ -789,7 +757,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault { struct guest_walker walker; int r; - bool is_self_change_mapping; pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code); WARN_ON_ONCE(fault->is_tdp); @@ -814,6 +781,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault } fault->gfn = walker.gfn; + fault->max_level = walker.level; fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn); if (page_fault_handle_page_track(vcpu, fault)) { @@ -825,16 +793,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r) return r; - vcpu->arch.write_fault_to_shadow_pgtable = false; - - is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, - &walker, fault->user, &vcpu->arch.write_fault_to_shadow_pgtable); - - if (is_self_change_mapping) - fault->max_level = PG_LEVEL_4K; - else - fault->max_level = walker.level; - r = kvm_faultin_pfn(vcpu, fault, walker.pte_access); if (r != RET_PF_CONTINUE) return r; @@ -887,64 +845,6 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); } -static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) -{ - struct kvm_shadow_walk_iterator iterator; - struct kvm_mmu_page *sp; - u64 old_spte; - int level; - u64 *sptep; - - vcpu_clear_mmio_info(vcpu, gva); - - /* - * No need to check return value here, rmap_can_add() can - * help us to skip pte prefetch later. - */ - mmu_topup_memory_caches(vcpu, true); - - if (!VALID_PAGE(root_hpa)) { - WARN_ON(1); - return; - } - - write_lock(&vcpu->kvm->mmu_lock); - for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) { - level = iterator.level; - sptep = iterator.sptep; - - sp = sptep_to_sp(sptep); - old_spte = *sptep; - if (is_last_spte(old_spte, level)) { - pt_element_t gpte; - gpa_t pte_gpa; - - if (!sp->unsync) - break; - - pte_gpa = FNAME(get_level1_sp_gpa)(sp); - pte_gpa += spte_index(sptep) * sizeof(pt_element_t); - - mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL); - if (is_shadow_present_pte(old_spte)) - kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep); - - if (!rmap_can_add(vcpu)) - break; - - if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, - sizeof(pt_element_t))) - break; - - FNAME(prefetch_gpte)(vcpu, sp, sptep, gpte, false); - } - - if (!sp->unsync_children) - break; - } - write_unlock(&vcpu->kvm->mmu_lock); -} - /* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gpa_t addr, u64 access, @@ -977,114 +877,75 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, * can't change unless all sptes pointing to it are nuked first. * * Returns - * < 0: the sp should be zapped - * 0: the sp is synced and no tlb flushing is required - * > 0: the sp is synced and tlb flushing is required + * < 0: failed to sync spte + * 0: the spte is synced and no tlb flushing is required + * > 0: the spte is synced and tlb flushing is required */ -static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i) { - union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role; - int i; bool host_writable; gpa_t first_pte_gpa; - bool flush = false; - - /* - * Ignore various flags when verifying that it's safe to sync a shadow - * page using the current MMU context. - * - * - level: not part of the overall MMU role and will never match as the MMU's - * level tracks the root level - * - access: updated based on the new guest PTE - * - quadrant: not part of the overall MMU role (similar to level) - */ - const union kvm_mmu_page_role sync_role_ign = { - .level = 0xf, - .access = 0x7, - .quadrant = 0x3, - .passthrough = 0x1, - }; + u64 *sptep, spte; + struct kvm_memory_slot *slot; + unsigned pte_access; + pt_element_t gpte; + gpa_t pte_gpa; + gfn_t gfn; - /* - * Direct pages can never be unsync, and KVM should never attempt to - * sync a shadow page for a different MMU context, e.g. if the role - * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the - * reserved bits checks will be wrong, etc... - */ - if (WARN_ON_ONCE(sp->role.direct || - (sp->role.word ^ root_role.word) & ~sync_role_ign.word)) - return -1; + if (WARN_ON_ONCE(!sp->spt[i])) + return 0; first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); + pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); - for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { - u64 *sptep, spte; - struct kvm_memory_slot *slot; - unsigned pte_access; - pt_element_t gpte; - gpa_t pte_gpa; - gfn_t gfn; - - if (!sp->spt[i]) - continue; - - pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); - - if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, - sizeof(pt_element_t))) - return -1; - - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { - flush = true; - continue; - } - - gfn = gpte_to_gfn(gpte); - pte_access = sp->role.access; - pte_access &= FNAME(gpte_access)(gpte); - FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); - - if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access)) - continue; + if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, + sizeof(pt_element_t))) + return -1; - /* - * Drop the SPTE if the new protections would result in a RWX=0 - * SPTE or if the gfn is changing. The RWX=0 case only affects - * EPT with execute-only support, i.e. EPT without an effective - * "present" bit, as all other paging modes will create a - * read-only SPTE if pte_access is zero. - */ - if ((!pte_access && !shadow_present_mask) || - gfn != kvm_mmu_page_get_gfn(sp, i)) { - drop_spte(vcpu->kvm, &sp->spt[i]); - flush = true; - continue; - } + if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) + return 1; - /* Update the shadowed access bits in case they changed. */ - kvm_mmu_page_set_access(sp, i, pte_access); + gfn = gpte_to_gfn(gpte); + pte_access = sp->role.access; + pte_access &= FNAME(gpte_access)(gpte); + FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); - sptep = &sp->spt[i]; - spte = *sptep; - host_writable = spte & shadow_host_writable_mask; - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - make_spte(vcpu, sp, slot, pte_access, gfn, - spte_to_pfn(spte), spte, true, false, - host_writable, &spte); + if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access)) + return 0; - flush |= mmu_spte_update(sptep, spte); + /* + * Drop the SPTE if the new protections would result in a RWX=0 + * SPTE or if the gfn is changing. The RWX=0 case only affects + * EPT with execute-only support, i.e. EPT without an effective + * "present" bit, as all other paging modes will create a + * read-only SPTE if pte_access is zero. + */ + if ((!pte_access && !shadow_present_mask) || + gfn != kvm_mmu_page_get_gfn(sp, i)) { + drop_spte(vcpu->kvm, &sp->spt[i]); + return 1; } - /* - * Note, any flush is purely for KVM's correctness, e.g. when dropping - * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier - * unmap or dirty logging event doesn't fail to flush. The guest is - * responsible for flushing the TLB to ensure any changes in protection - * bits are recognized, i.e. until the guest flushes or page faults on - * a relevant address, KVM is architecturally allowed to let vCPUs use - * cached translations with the old protection bits. + * Do nothing if the permissions are unchanged. The existing SPTE is + * still, and prefetch_invalid_gpte() has verified that the A/D bits + * are set in the "new" gPTE, i.e. there is no danger of missing an A/D + * update due to A/D bits being set in the SPTE but not the gPTE. */ - return flush; + if (kvm_mmu_page_get_access(sp, i) == pte_access) + return 0; + + /* Update the shadowed access bits in case they changed. */ + kvm_mmu_page_set_access(sp, i, pte_access); + + sptep = &sp->spt[i]; + spte = *sptep; + host_writable = spte & shadow_host_writable_mask; + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + make_spte(vcpu, sp, slot, pte_access, gfn, + spte_to_pfn(spte), spte, true, false, + host_writable, &spte); + + return mmu_spte_update(sptep, spte); } #undef pt_element_t |