diff options
Diffstat (limited to 'arch/arm64/kvm/mmu.c')
-rw-r--r-- | arch/arm64/kvm/mmu.c | 195 |
1 files changed, 133 insertions, 62 deletions
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 60ee3d9f01f8..31d7fa4c7c14 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -128,6 +128,25 @@ static void kvm_s2_free_pages_exact(void *virt, size_t size) free_pages_exact(virt, size); } +static struct kvm_pgtable_mm_ops kvm_s2_mm_ops; + +static void stage2_free_removed_table_rcu_cb(struct rcu_head *head) +{ + struct page *page = container_of(head, struct page, rcu_head); + void *pgtable = page_to_virt(page); + u32 level = page_private(page); + + kvm_pgtable_stage2_free_removed(&kvm_s2_mm_ops, pgtable, level); +} + +static void stage2_free_removed_table(void *addr, u32 level) +{ + struct page *page = virt_to_page(addr); + + set_page_private(page, (unsigned long)level); + call_rcu(&page->rcu_head, stage2_free_removed_table_rcu_cb); +} + static void kvm_host_get_page(void *addr) { get_page(virt_to_page(addr)); @@ -640,8 +659,8 @@ static struct kvm_pgtable_mm_ops kvm_user_mm_ops = { static int get_user_mapping_size(struct kvm *kvm, u64 addr) { struct kvm_pgtable pgt = { - .pgd = (kvm_pte_t *)kvm->mm->pgd, - .ia_bits = VA_BITS, + .pgd = (kvm_pteref_t)kvm->mm->pgd, + .ia_bits = vabits_actual, .start_level = (KVM_PGTABLE_MAX_LEVELS - CONFIG_PGTABLE_LEVELS), .mm_ops = &kvm_user_mm_ops, @@ -662,6 +681,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { .zalloc_page = stage2_memcache_zalloc_page, .zalloc_pages_exact = kvm_s2_zalloc_pages_exact, .free_pages_exact = kvm_s2_free_pages_exact, + .free_removed_table = stage2_free_removed_table, .get_page = kvm_host_get_page, .put_page = kvm_s2_put_page, .page_count = kvm_host_page_count, @@ -675,15 +695,42 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { * kvm_init_stage2_mmu - Initialise a S2 MMU structure * @kvm: The pointer to the KVM structure * @mmu: The pointer to the s2 MMU structure + * @type: The machine type of the virtual machine * * Allocates only the stage-2 HW PGD level table(s). * Note we don't need locking here as this is only called when the VM is * created, which can only be done once. */ -int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) +int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) { + u32 kvm_ipa_limit = get_kvm_ipa_limit(); int cpu, err; struct kvm_pgtable *pgt; + u64 mmfr0, mmfr1; + u32 phys_shift; + + if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) + return -EINVAL; + + phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); + if (is_protected_kvm_enabled()) { + phys_shift = kvm_ipa_limit; + } else if (phys_shift) { + if (phys_shift > kvm_ipa_limit || + phys_shift < ARM64_MIN_PARANGE_BITS) + return -EINVAL; + } else { + phys_shift = KVM_PHYS_SHIFT; + if (phys_shift > kvm_ipa_limit) { + pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", + current->comm); + return -EINVAL; + } + } + + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); if (mmu->pgt != NULL) { kvm_err("kvm_arch already initialized?\n"); @@ -807,6 +854,32 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) } } +static void hyp_mc_free_fn(void *addr, void *unused) +{ + free_page((unsigned long)addr); +} + +static void *hyp_mc_alloc_fn(void *unused) +{ + return (void *)__get_free_page(GFP_KERNEL_ACCOUNT); +} + +void free_hyp_memcache(struct kvm_hyp_memcache *mc) +{ + if (is_protected_kvm_enabled()) + __free_hyp_memcache(mc, hyp_mc_free_fn, + kvm_host_va, NULL); +} + +int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages) +{ + if (!is_protected_kvm_enabled()) + return 0; + + return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn, + kvm_host_pa, NULL); +} + /** * kvm_phys_addr_ioremap - map a device range to guest IPA * @@ -841,7 +914,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, write_lock(&kvm->mmu_lock); ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot, - &cache); + &cache, 0); write_unlock(&kvm->mmu_lock); if (ret) break; @@ -1091,32 +1164,26 @@ static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva) * - mmap_lock protects between a VM faulting a page in and the VMM performing * an mprotect() to add VM_MTE */ -static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, - unsigned long size) +static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, + unsigned long size) { unsigned long i, nr_pages = size >> PAGE_SHIFT; - struct page *page; + struct page *page = pfn_to_page(pfn); if (!kvm_has_mte(kvm)) - return 0; - - /* - * pfn_to_online_page() is used to reject ZONE_DEVICE pages - * that may not support tags. - */ - page = pfn_to_online_page(pfn); - - if (!page) - return -EFAULT; + return; for (i = 0; i < nr_pages; i++, page++) { - if (!test_bit(PG_mte_tagged, &page->flags)) { + if (try_page_mte_tagging(page)) { mte_clear_page_tags(page_address(page)); - set_bit(PG_mte_tagged, &page->flags); + set_page_mte_tagged(page); } } +} - return 0; +static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) +{ + return vma->vm_flags & VM_MTE_ALLOWED; } static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, @@ -1127,7 +1194,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, bool write_fault, writable, force_pte = false; bool exec_fault; bool device = false; - bool shared; unsigned long mmu_seq; struct kvm *kvm = vcpu->kvm; struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; @@ -1136,7 +1202,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, gfn_t gfn; kvm_pfn_t pfn; bool logging_active = memslot_is_logging(memslot); - bool use_read_lock = false; unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu); unsigned long vma_pagesize, fault_granule; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; @@ -1171,14 +1236,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (logging_active) { force_pte = true; vma_shift = PAGE_SHIFT; - use_read_lock = (fault_status == FSC_PERM && write_fault && - fault_granule == PAGE_SIZE); } else { vma_shift = get_vma_page_shift(vma, hva); } - shared = (vma->vm_flags & VM_SHARED); - switch (vma_shift) { #ifndef __PAGETABLE_PMD_FOLDED case PUD_SHIFT: @@ -1239,7 +1300,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, */ smp_rmb(); - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, + pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL, write_fault, &writable, NULL); if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(hva, vma_shift); @@ -1271,15 +1332,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (exec_fault && device) return -ENOEXEC; - /* - * To reduce MMU contentions and enhance concurrency during dirty - * logging dirty logging, only acquire read lock for permission - * relaxation. - */ - if (use_read_lock) - read_lock(&kvm->mmu_lock); - else - write_lock(&kvm->mmu_lock); + read_lock(&kvm->mmu_lock); pgt = vcpu->arch.hw_mmu->pgt; if (mmu_invalidate_retry(kvm, mmu_seq)) goto out_unlock; @@ -1298,13 +1351,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) { - /* Check the VMM hasn't introduced a new VM_SHARED VMA */ - if (!shared) - ret = sanitise_mte_tags(kvm, pfn, vma_pagesize); - else + /* Check the VMM hasn't introduced a new disallowed VMA */ + if (kvm_vma_mte_allowed(vma)) { + sanitise_mte_tags(kvm, pfn, vma_pagesize); + } else { ret = -EFAULT; - if (ret) goto out_unlock; + } } if (writable) @@ -1323,15 +1376,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * permissions only if vma_pagesize equals fault_granule. Otherwise, * kvm_pgtable_stage2_map() should be called to change block size. */ - if (fault_status == FSC_PERM && vma_pagesize == fault_granule) { + if (fault_status == FSC_PERM && vma_pagesize == fault_granule) ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot); - } else { - WARN_ONCE(use_read_lock, "Attempted stage-2 map outside of write lock\n"); - + else ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize, __pfn_to_phys(pfn), prot, - memcache); - } + memcache, KVM_PGTABLE_WALK_SHARED); /* Mark the page dirty only if the fault is handled successfully */ if (writable && !ret) { @@ -1340,10 +1390,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } out_unlock: - if (use_read_lock) - read_unlock(&kvm->mmu_lock); - else - write_unlock(&kvm->mmu_lock); + read_unlock(&kvm->mmu_lock); kvm_set_pfn_accessed(pfn); kvm_release_pfn_clean(pfn); return ret != -EAGAIN ? ret : 0; @@ -1526,15 +1573,18 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { kvm_pfn_t pfn = pte_pfn(range->pte); - int ret; if (!kvm->arch.mmu.pgt) return false; WARN_ON(range->end - range->start != 1); - ret = sanitise_mte_tags(kvm, pfn, PAGE_SIZE); - if (ret) + /* + * If the page isn't tagged, defer to user_mem_abort() for sanitising + * the MTE tags. The S2 pte should have been unmapped by + * mmu_notifier_invalidate_range_end(). + */ + if (kvm_has_mte(kvm) && !page_mte_tagged(pfn_to_page(pfn))) return false; /* @@ -1549,7 +1599,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) */ kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT, PAGE_SIZE, __pfn_to_phys(pfn), - KVM_PGTABLE_PROT_R, NULL); + KVM_PGTABLE_PROT_R, NULL, 0); return false; } @@ -1618,6 +1668,8 @@ static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = { int kvm_mmu_init(u32 *hyp_va_bits) { int err; + u32 idmap_bits; + u32 kernel_bits; hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start); hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE); @@ -1631,7 +1683,31 @@ int kvm_mmu_init(u32 *hyp_va_bits) */ BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); - *hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET); + /* + * The ID map may be configured to use an extended virtual address + * range. This is only the case if system RAM is out of range for the + * currently configured page size and VA_BITS_MIN, in which case we will + * also need the extended virtual range for the HYP ID map, or we won't + * be able to enable the EL2 MMU. + * + * However, in some cases the ID map may be configured for fewer than + * the number of VA bits used by the regular kernel stage 1. This + * happens when VA_BITS=52 and the kernel image is placed in PA space + * below 48 bits. + * + * At EL2, there is only one TTBR register, and we can't switch between + * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom + * line: we need to use the extended range with *both* our translation + * tables. + * + * So use the maximum of the idmap VA bits and the regular kernel stage + * 1 VA bits to assure that the hypervisor can both ID map its code page + * and map any kernel memory. + */ + idmap_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET); + kernel_bits = vabits_actual; + *hyp_va_bits = max(idmap_bits, kernel_bits); + kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits); kvm_debug("IDMAP page: %lx\n", hyp_idmap_start); kvm_debug("HYP VA range: %lx:%lx\n", @@ -1740,12 +1816,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, if (!vma) break; - /* - * VM_SHARED mappings are not allowed with MTE to avoid races - * when updating the PG_mte_tagged page flag, see - * sanitise_mte_tags for more details. - */ - if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED) { + if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) { ret = -EINVAL; break; } |