diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-03-27 12:22:45 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-03-27 12:22:45 -0700 |
commit | 3a93e40326c8f470e71d20b4c42d36767450f38f (patch) | |
tree | 1ca4c46656fcd8ef039538d9d6881a3c75b0e790 | |
parent | 91fe204578ba3183cc32984a613bd539b3670507 (diff) | |
parent | 9e347ba03029e10e6405f8c3a7a91a5597943ed9 (diff) |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm fixes from Paolo Bonzini:
"RISC-V:
- Fix VM hang in case of timer delta being zero
ARM:
- MMU fixes:
- Read the MMU notifier seq before dropping the mmap lock to guard
against reading a potentially stale VMA
- Disable interrupts when walking user page tables to protect
against the page table being freed
- Read the MTE permissions for the VMA within the mmap lock
critical section, avoiding the use of a potentally stale VMA
pointer
- vPMU fixes:
- Return the sum of the current perf event value and PMC snapshot
for reads from userspace
- Don't save the value of guest writes to PMCR_EL0.{C,P}, which
could otherwise lead to userspace erroneously resetting the vPMU
during VM save/restore"
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
riscv/kvm: Fix VM hang in case of timer delta being zero.
KVM: arm64: Check for kvm_vma_mte_allowed in the critical section
KVM: arm64: Disable interrupts while walking userspace PTs
KVM: arm64: Retry fault if vma_lookup() results become invalid
KVM: arm64: PMU: Don't save PMCR_EL0.{C,P} for the vCPU
KVM: arm64: PMU: Fix GET_ONE_REG for vPMC regs to return the current value
-rw-r--r-- | arch/arm64/kvm/mmu.c | 99 | ||||
-rw-r--r-- | arch/arm64/kvm/pmu-emul.c | 3 | ||||
-rw-r--r-- | arch/arm64/kvm/sys_regs.c | 21 | ||||
-rw-r--r-- | arch/riscv/kvm/vcpu_timer.c | 6 |
4 files changed, 87 insertions, 42 deletions
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 7113587222ff..3b9d4d24c361 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -666,14 +666,33 @@ static int get_user_mapping_size(struct kvm *kvm, u64 addr) CONFIG_PGTABLE_LEVELS), .mm_ops = &kvm_user_mm_ops, }; + unsigned long flags; kvm_pte_t pte = 0; /* Keep GCC quiet... */ u32 level = ~0; int ret; + /* + * Disable IRQs so that we hazard against a concurrent + * teardown of the userspace page tables (which relies on + * IPI-ing threads). + */ + local_irq_save(flags); ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level); - VM_BUG_ON(ret); - VM_BUG_ON(level >= KVM_PGTABLE_MAX_LEVELS); - VM_BUG_ON(!(pte & PTE_VALID)); + local_irq_restore(flags); + + if (ret) + return ret; + + /* + * Not seeing an error, but not updating level? Something went + * deeply wrong... + */ + if (WARN_ON(level >= KVM_PGTABLE_MAX_LEVELS)) + return -EFAULT; + + /* Oops, the userspace PTs are gone... Replay the fault */ + if (!kvm_pte_valid(pte)) + return -EAGAIN; return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)); } @@ -1079,7 +1098,7 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, * * Returns the size of the mapping. */ -static unsigned long +static long transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long hva, kvm_pfn_t *pfnp, phys_addr_t *ipap) @@ -1091,8 +1110,15 @@ transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, * sure that the HVA and IPA are sufficiently aligned and that the * block map is contained within the memslot. */ - if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) && - get_user_mapping_size(kvm, hva) >= PMD_SIZE) { + if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { + int sz = get_user_mapping_size(kvm, hva); + + if (sz < 0) + return sz; + + if (sz < PMD_SIZE) + return PAGE_SIZE; + /* * The address we faulted on is backed by a transparent huge * page. However, because we map the compound huge page and @@ -1192,7 +1218,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, { int ret = 0; bool write_fault, writable, force_pte = false; - bool exec_fault; + bool exec_fault, mte_allowed; bool device = false; unsigned long mmu_seq; struct kvm *kvm = vcpu->kvm; @@ -1203,7 +1229,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, kvm_pfn_t pfn; bool logging_active = memslot_is_logging(memslot); unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu); - unsigned long vma_pagesize, fault_granule; + long vma_pagesize, fault_granule; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; struct kvm_pgtable *pgt; @@ -1218,6 +1244,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } /* + * Permission faults just need to update the existing leaf entry, + * and so normally don't require allocations from the memcache. The + * only exception to this is when dirty logging is enabled at runtime + * and a write fault needs to collapse a block entry into a table. + */ + if (fault_status != ESR_ELx_FSC_PERM || + (logging_active && write_fault)) { + ret = kvm_mmu_topup_memory_cache(memcache, + kvm_mmu_cache_min_pages(kvm)); + if (ret) + return ret; + } + + /* * Let's check if we will get back a huge page backed by hugetlbfs, or * get block mapping for device MMIO region. */ @@ -1269,37 +1309,21 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, fault_ipa &= ~(vma_pagesize - 1); gfn = fault_ipa >> PAGE_SHIFT; - mmap_read_unlock(current->mm); + mte_allowed = kvm_vma_mte_allowed(vma); - /* - * Permission faults just need to update the existing leaf entry, - * and so normally don't require allocations from the memcache. The - * only exception to this is when dirty logging is enabled at runtime - * and a write fault needs to collapse a block entry into a table. - */ - if (fault_status != ESR_ELx_FSC_PERM || - (logging_active && write_fault)) { - ret = kvm_mmu_topup_memory_cache(memcache, - kvm_mmu_cache_min_pages(kvm)); - if (ret) - return ret; - } + /* Don't use the VMA after the unlock -- it may have vanished */ + vma = NULL; - mmu_seq = vcpu->kvm->mmu_invalidate_seq; /* - * Ensure the read of mmu_invalidate_seq happens before we call - * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk - * the page we just got a reference to gets unmapped before we have a - * chance to grab the mmu_lock, which ensure that if the page gets - * unmapped afterwards, the call to kvm_unmap_gfn will take it away - * from us again properly. This smp_rmb() interacts with the smp_wmb() - * in kvm_mmu_notifier_invalidate_<page|range_end>. + * Read mmu_invalidate_seq so that KVM can detect if the results of + * vma_lookup() or __gfn_to_pfn_memslot() become stale prior to + * acquiring kvm->mmu_lock. * - * Besides, __gfn_to_pfn_memslot() instead of gfn_to_pfn_prot() is - * used to avoid unnecessary overhead introduced to locate the memory - * slot because it's always fixed even @gfn is adjusted for huge pages. + * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs + * with the smp_wmb() in kvm_mmu_invalidate_end(). */ - smp_rmb(); + mmu_seq = vcpu->kvm->mmu_invalidate_seq; + mmap_read_unlock(current->mm); pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL, write_fault, &writable, NULL); @@ -1350,11 +1374,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, vma_pagesize = transparent_hugepage_adjust(kvm, memslot, hva, &pfn, &fault_ipa); + + if (vma_pagesize < 0) { + ret = vma_pagesize; + goto out_unlock; + } } if (fault_status != ESR_ELx_FSC_PERM && !device && kvm_has_mte(kvm)) { /* Check the VMM hasn't introduced a new disallowed VMA */ - if (kvm_vma_mte_allowed(vma)) { + if (mte_allowed) { sanitise_mte_tags(kvm, pfn, vma_pagesize); } else { ret = -EFAULT; diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 24908400e190..c243b10f3e15 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -538,7 +538,8 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) if (!kvm_pmu_is_3p5(vcpu)) val &= ~ARMV8_PMU_PMCR_LP; - __vcpu_sys_reg(vcpu, PMCR_EL0) = val; + /* The reset bits don't indicate any state, and shouldn't be saved. */ + __vcpu_sys_reg(vcpu, PMCR_EL0) = val & ~(ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_P); if (val & ARMV8_PMU_PMCR_E) { kvm_pmu_enable_counter_mask(vcpu, diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 53749d3a0996..1b2c161120be 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -856,6 +856,22 @@ static bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx) return true; } +static int get_pmu_evcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + u64 idx; + + if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 0) + /* PMCCNTR_EL0 */ + idx = ARMV8_PMU_CYCLE_IDX; + else + /* PMEVCNTRn_EL0 */ + idx = ((r->CRm & 3) << 3) | (r->Op2 & 7); + + *val = kvm_pmu_get_counter_value(vcpu, idx); + return 0; +} + static bool access_pmu_evcntr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) @@ -1072,7 +1088,7 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, /* Macro to expand the PMEVCNTRn_EL0 register */ #define PMU_PMEVCNTR_EL0(n) \ { PMU_SYS_REG(SYS_PMEVCNTRn_EL0(n)), \ - .reset = reset_pmevcntr, \ + .reset = reset_pmevcntr, .get_user = get_pmu_evcntr, \ .access = access_pmu_evcntr, .reg = (PMEVCNTR0_EL0 + n), } /* Macro to expand the PMEVTYPERn_EL0 register */ @@ -1982,7 +1998,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { PMU_SYS_REG(SYS_PMCEID1_EL0), .access = access_pmceid, .reset = NULL }, { PMU_SYS_REG(SYS_PMCCNTR_EL0), - .access = access_pmu_evcntr, .reset = reset_unknown, .reg = PMCCNTR_EL0 }, + .access = access_pmu_evcntr, .reset = reset_unknown, + .reg = PMCCNTR_EL0, .get_user = get_pmu_evcntr}, { PMU_SYS_REG(SYS_PMXEVTYPER_EL0), .access = access_pmu_evtyper, .reset = NULL }, { PMU_SYS_REG(SYS_PMXEVCNTR_EL0), diff --git a/arch/riscv/kvm/vcpu_timer.c b/arch/riscv/kvm/vcpu_timer.c index ad34519c8a13..3ac2ff6a65da 100644 --- a/arch/riscv/kvm/vcpu_timer.c +++ b/arch/riscv/kvm/vcpu_timer.c @@ -147,10 +147,8 @@ static void kvm_riscv_vcpu_timer_blocking(struct kvm_vcpu *vcpu) return; delta_ns = kvm_riscv_delta_cycles2ns(t->next_cycles, gt, t); - if (delta_ns) { - hrtimer_start(&t->hrt, ktime_set(0, delta_ns), HRTIMER_MODE_REL); - t->next_set = true; - } + hrtimer_start(&t->hrt, ktime_set(0, delta_ns), HRTIMER_MODE_REL); + t->next_set = true; } static void kvm_riscv_vcpu_timer_unblocking(struct kvm_vcpu *vcpu) |