diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-06-09 13:09:57 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-06-09 13:09:57 -0700 |
commit | 2f673816b2db30ce6122fe0e5e6a00de20e8d99a (patch) | |
tree | cbcfb92990e2e0339e0aeee5cb629a0f6a91a8bf | |
parent | 368094df48e680fa51cedb68537408cfa64b788e (diff) | |
parent | 4422829e8053068e0225e4d0ef42dc41ea7c9ef5 (diff) |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm fixes from Paolo Bonzini:
"Bugfixes, including a TLB flush fix that affects processors without
nested page tables"
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
kvm: fix previous commit for 32-bit builds
kvm: avoid speculation-based attacks from out-of-range memslot accesses
KVM: x86: Unload MMU on guest TLB flush if TDP disabled to force MMU sync
KVM: x86: Ensure liveliness of nested VM-Enter fail tracepoint message
selftests: kvm: Add support for customized slot0 memory size
KVM: selftests: introduce P47V64 for s390x
KVM: x86: Ensure PV TLB flush tracepoint reflects KVM behavior
KVM: X86: MMU: Use the correct inherited permissions to get shadow page
KVM: LAPIC: Write 0 to TMICT should also cancel vmx-preemption timer
KVM: SVM: Fix SEV SEND_START session length & SEND_UPDATE_DATA query length after commit 238eca821cee
-rw-r--r-- | Documentation/virt/kvm/mmu.rst | 4 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 17 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/paging_tmpl.h | 14 | ||||
-rw-r--r-- | arch/x86/kvm/svm/sev.c | 6 | ||||
-rw-r--r-- | arch/x86/kvm/trace.h | 6 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 19 | ||||
-rw-r--r-- | include/linux/kvm_host.h | 10 | ||||
-rw-r--r-- | tools/testing/selftests/kvm/include/kvm_util.h | 10 | ||||
-rw-r--r-- | tools/testing/selftests/kvm/kvm_page_table_test.c | 2 | ||||
-rw-r--r-- | tools/testing/selftests/kvm/lib/kvm_util.c | 52 | ||||
-rw-r--r-- | tools/testing/selftests/kvm/lib/perf_test_util.c | 2 | ||||
-rw-r--r-- | tools/testing/selftests/kvm/memslot_perf_test.c | 2 |
12 files changed, 105 insertions, 39 deletions
diff --git a/Documentation/virt/kvm/mmu.rst b/Documentation/virt/kvm/mmu.rst index 5bfe28b0728e..20d85daed395 100644 --- a/Documentation/virt/kvm/mmu.rst +++ b/Documentation/virt/kvm/mmu.rst @@ -171,8 +171,8 @@ Shadow pages contain the following information: shadow pages) so role.quadrant takes values in the range 0..3. Each quadrant maps 1GB virtual address space. role.access: - Inherited guest access permissions in the form uwx. Note execute - permission is positive, not negative. + Inherited guest access permissions from the parent ptes in the form uwx. + Note execute permission is positive, not negative. role.invalid: The page is invalid and should not be used. It is a root page that is currently pinned (by a cpu hardware register pointing to it); once it is diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 8120e8614b92..6d72d8f43310 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1494,6 +1494,15 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic) static void cancel_hv_timer(struct kvm_lapic *apic); +static void cancel_apic_timer(struct kvm_lapic *apic) +{ + hrtimer_cancel(&apic->lapic_timer.timer); + preempt_disable(); + if (apic->lapic_timer.hv_timer_in_use) + cancel_hv_timer(apic); + preempt_enable(); +} + static void apic_update_lvtt(struct kvm_lapic *apic) { u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) & @@ -1502,11 +1511,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic) if (apic->lapic_timer.timer_mode != timer_mode) { if (apic_lvtt_tscdeadline(apic) != (timer_mode == APIC_LVT_TIMER_TSCDEADLINE)) { - hrtimer_cancel(&apic->lapic_timer.timer); - preempt_disable(); - if (apic->lapic_timer.hv_timer_in_use) - cancel_hv_timer(apic); - preempt_enable(); + cancel_apic_timer(apic); kvm_lapic_set_reg(apic, APIC_TMICT, 0); apic->lapic_timer.period = 0; apic->lapic_timer.tscdeadline = 0; @@ -2092,7 +2097,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) if (apic_lvtt_tscdeadline(apic)) break; - hrtimer_cancel(&apic->lapic_timer.timer); + cancel_apic_timer(apic); kvm_lapic_set_reg(apic, APIC_TMICT, val); start_apic_timer(apic); break; diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 70b7e44e3035..823a5919f9fa 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -90,8 +90,8 @@ struct guest_walker { gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS]; bool pte_writable[PT_MAX_FULL_LEVELS]; - unsigned pt_access; - unsigned pte_access; + unsigned int pt_access[PT_MAX_FULL_LEVELS]; + unsigned int pte_access; gfn_t gfn; struct x86_exception fault; }; @@ -418,13 +418,15 @@ retry_walk: } walker->ptes[walker->level - 1] = pte; + + /* Convert to ACC_*_MASK flags for struct guest_walker. */ + walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask); } while (!is_last_gpte(mmu, walker->level, pte)); pte_pkey = FNAME(gpte_pkeys)(vcpu, pte); accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0; /* Convert to ACC_*_MASK flags for struct guest_walker. */ - walker->pt_access = FNAME(gpte_access)(pt_access ^ walk_nx_mask); walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask); errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access); if (unlikely(errcode)) @@ -463,7 +465,8 @@ retry_walk: } pgprintk("%s: pte %llx pte_access %x pt_access %x\n", - __func__, (u64)pte, walker->pte_access, walker->pt_access); + __func__, (u64)pte, walker->pte_access, + walker->pt_access[walker->level - 1]); return 1; error: @@ -643,7 +646,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; - unsigned direct_access, access = gw->pt_access; + unsigned int direct_access, access; int top_level, level, req_level, ret; gfn_t base_gfn = gw->gfn; @@ -675,6 +678,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, sp = NULL; if (!is_shadow_present_pte(*it.sptep)) { table_gfn = gw->table_gfn[it.level - 2]; + access = gw->pt_access[it.level - 2]; sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1, false, access); } diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 5bc887e9a986..e0ce5da97fc2 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1103,10 +1103,9 @@ __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp, struct sev_data_send_start data; int ret; + memset(&data, 0, sizeof(data)); data.handle = sev->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); - if (ret < 0) - return ret; params->session_len = data.session_len; if (copy_to_user((void __user *)(uintptr_t)argp->data, params, @@ -1215,10 +1214,9 @@ __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp, struct sev_data_send_update_data data; int ret; + memset(&data, 0, sizeof(data)); data.handle = sev->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); - if (ret < 0) - return ret; params->hdr_len = data.hdr_len; params->trans_len = data.trans_len; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index a61c015870e3..4f839148948b 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1550,16 +1550,16 @@ TRACE_EVENT(kvm_nested_vmenter_failed, TP_ARGS(msg, err), TP_STRUCT__entry( - __field(const char *, msg) + __string(msg, msg) __field(u32, err) ), TP_fast_assign( - __entry->msg = msg; + __assign_str(msg, msg); __entry->err = err; ), - TP_printk("%s%s", __entry->msg, !__entry->err ? "" : + TP_printk("%s%s", __get_str(msg), !__entry->err ? "" : __print_symbolic(__entry->err, VMX_VMENTER_INSTRUCTION_ERRORS)) ); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b594275d49b5..6d3955a6a763 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3072,6 +3072,19 @@ static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; + + if (!tdp_enabled) { + /* + * A TLB flush on behalf of the guest is equivalent to + * INVPCID(all), toggling CR4.PGE, etc., which requires + * a forced sync of the shadow page tables. Unload the + * entire MMU here and the subsequent load will sync the + * shadow page tables, and also flush the TLB. + */ + kvm_mmu_unload(vcpu); + return; + } + static_call(kvm_x86_tlb_flush_guest)(vcpu); } @@ -3101,9 +3114,11 @@ static void record_steal_time(struct kvm_vcpu *vcpu) * expensive IPIs. */ if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) { + u8 st_preempted = xchg(&st->preempted, 0); + trace_kvm_pv_tlb_flush(vcpu->vcpu_id, - st->preempted & KVM_VCPU_FLUSH_TLB); - if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) + st_preempted & KVM_VCPU_FLUSH_TLB); + if (st_preempted & KVM_VCPU_FLUSH_TLB) kvm_vcpu_flush_tlb_guest(vcpu); } else { st->preempted = 0; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 76102efbf079..8583ed3ff344 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1185,7 +1185,15 @@ __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn) static inline unsigned long __gfn_to_hva_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) { - return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE; + /* + * The index was checked originally in search_memslots. To avoid + * that a malicious guest builds a Spectre gadget out of e.g. page + * table walks, do not let the processor speculate loads outside + * the guest's registered memslots. + */ + unsigned long offset = gfn - slot->base_gfn; + offset = array_index_nospec(offset, slot->npages); + return slot->userspace_addr + offset * PAGE_SIZE; } static inline int memslot_id(struct kvm *kvm, gfn_t gfn) diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index fcd8e3855111..35739567189e 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -43,6 +43,7 @@ enum vm_guest_mode { VM_MODE_P40V48_4K, VM_MODE_P40V48_64K, VM_MODE_PXXV48_4K, /* For 48bits VA but ANY bits PA */ + VM_MODE_P47V64_4K, NUM_VM_MODES, }; @@ -60,7 +61,7 @@ enum vm_guest_mode { #elif defined(__s390x__) -#define VM_MODE_DEFAULT VM_MODE_P52V48_4K +#define VM_MODE_DEFAULT VM_MODE_P47V64_4K #define MIN_PAGE_SHIFT 12U #define ptes_per_page(page_size) ((page_size) / 16) @@ -285,10 +286,11 @@ struct kvm_vm *vm_create_default_with_vcpus(uint32_t nr_vcpus, uint64_t extra_me uint32_t num_percpu_pages, void *guest_code, uint32_t vcpuids[]); -/* Like vm_create_default_with_vcpus, but accepts mode as a parameter */ +/* Like vm_create_default_with_vcpus, but accepts mode and slot0 memory as a parameter */ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, - uint64_t extra_mem_pages, uint32_t num_percpu_pages, - void *guest_code, uint32_t vcpuids[]); + uint64_t slot0_mem_pages, uint64_t extra_mem_pages, + uint32_t num_percpu_pages, void *guest_code, + uint32_t vcpuids[]); /* * Adds a vCPU with reasonable defaults (e.g. a stack) diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index 1c4753fff19e..82171f17c1d7 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -268,7 +268,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg) /* Create a VM with enough guest pages */ guest_num_pages = test_mem_size / guest_page_size; - vm = vm_create_with_vcpus(mode, nr_vcpus, + vm = vm_create_with_vcpus(mode, nr_vcpus, DEFAULT_GUEST_PHY_PAGES, guest_num_pages, 0, guest_code, NULL); /* Align down GPA of the testing memslot */ diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 28e528c19d28..5c70596dd1b9 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -175,6 +175,7 @@ const char *vm_guest_mode_string(uint32_t i) [VM_MODE_P40V48_4K] = "PA-bits:40, VA-bits:48, 4K pages", [VM_MODE_P40V48_64K] = "PA-bits:40, VA-bits:48, 64K pages", [VM_MODE_PXXV48_4K] = "PA-bits:ANY, VA-bits:48, 4K pages", + [VM_MODE_P47V64_4K] = "PA-bits:47, VA-bits:64, 4K pages", }; _Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES, "Missing new mode strings?"); @@ -192,6 +193,7 @@ const struct vm_guest_mode_params vm_guest_mode_params[] = { { 40, 48, 0x1000, 12 }, { 40, 48, 0x10000, 16 }, { 0, 0, 0x1000, 12 }, + { 47, 64, 0x1000, 12 }, }; _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES, "Missing new mode params?"); @@ -277,6 +279,9 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) TEST_FAIL("VM_MODE_PXXV48_4K not supported on non-x86 platforms"); #endif break; + case VM_MODE_P47V64_4K: + vm->pgtable_levels = 5; + break; default: TEST_FAIL("Unknown guest mode, mode: 0x%x", mode); } @@ -308,21 +313,50 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) return vm; } +/* + * VM Create with customized parameters + * + * Input Args: + * mode - VM Mode (e.g. VM_MODE_P52V48_4K) + * nr_vcpus - VCPU count + * slot0_mem_pages - Slot0 physical memory size + * extra_mem_pages - Non-slot0 physical memory total size + * num_percpu_pages - Per-cpu physical memory pages + * guest_code - Guest entry point + * vcpuids - VCPU IDs + * + * Output Args: None + * + * Return: + * Pointer to opaque structure that describes the created VM. + * + * Creates a VM with the mode specified by mode (e.g. VM_MODE_P52V48_4K), + * with customized slot0 memory size, at least 512 pages currently. + * extra_mem_pages is only used to calculate the maximum page table size, + * no real memory allocation for non-slot0 memory in this function. + */ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, - uint64_t extra_mem_pages, uint32_t num_percpu_pages, - void *guest_code, uint32_t vcpuids[]) + uint64_t slot0_mem_pages, uint64_t extra_mem_pages, + uint32_t num_percpu_pages, void *guest_code, + uint32_t vcpuids[]) { + uint64_t vcpu_pages, extra_pg_pages, pages; + struct kvm_vm *vm; + int i; + + /* Force slot0 memory size not small than DEFAULT_GUEST_PHY_PAGES */ + if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES) + slot0_mem_pages = DEFAULT_GUEST_PHY_PAGES; + /* The maximum page table size for a memory region will be when the * smallest pages are used. Considering each page contains x page * table descriptors, the total extra size for page tables (for extra * N pages) will be: N/x+N/x^2+N/x^3+... which is definitely smaller * than N/x*2. */ - uint64_t vcpu_pages = (DEFAULT_STACK_PGS + num_percpu_pages) * nr_vcpus; - uint64_t extra_pg_pages = (extra_mem_pages + vcpu_pages) / PTES_PER_MIN_PAGE * 2; - uint64_t pages = DEFAULT_GUEST_PHY_PAGES + extra_mem_pages + vcpu_pages + extra_pg_pages; - struct kvm_vm *vm; - int i; + vcpu_pages = (DEFAULT_STACK_PGS + num_percpu_pages) * nr_vcpus; + extra_pg_pages = (slot0_mem_pages + extra_mem_pages + vcpu_pages) / PTES_PER_MIN_PAGE * 2; + pages = slot0_mem_pages + vcpu_pages + extra_pg_pages; TEST_ASSERT(nr_vcpus <= kvm_check_cap(KVM_CAP_MAX_VCPUS), "nr_vcpus = %d too large for host, max-vcpus = %d", @@ -354,8 +388,8 @@ struct kvm_vm *vm_create_default_with_vcpus(uint32_t nr_vcpus, uint64_t extra_me uint32_t num_percpu_pages, void *guest_code, uint32_t vcpuids[]) { - return vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, extra_mem_pages, - num_percpu_pages, guest_code, vcpuids); + return vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, DEFAULT_GUEST_PHY_PAGES, + extra_mem_pages, num_percpu_pages, guest_code, vcpuids); } struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index abf381800a59..7397ca299835 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -69,7 +69,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, TEST_ASSERT(vcpu_memory_bytes % perf_test_args.guest_page_size == 0, "Guest memory size is not guest page size aligned."); - vm = vm_create_with_vcpus(mode, vcpus, + vm = vm_create_with_vcpus(mode, vcpus, DEFAULT_GUEST_PHY_PAGES, (vcpus * vcpu_memory_bytes) / perf_test_args.guest_page_size, 0, guest_code, NULL); diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c index 9307f25d8130..11239652d805 100644 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -267,7 +267,7 @@ static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots, data->hva_slots = malloc(sizeof(*data->hva_slots) * data->nslots); TEST_ASSERT(data->hva_slots, "malloc() fail"); - data->vm = vm_create_default(VCPU_ID, 1024, guest_code); + data->vm = vm_create_default(VCPU_ID, mempages, guest_code); pr_info_v("Adding slots 1..%i, each slot with %"PRIu64" pages + %"PRIu64" extra pages last\n", max_mem_slots - 1, data->pages_per_slot, rempages); |