Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini: "ARM: - Fix the PMCR_EL0 reset value after the PMU rework - Correctly handle S2 fault triggered by a S1 page table walk by not always classifying it as a write, as this breaks on R/O memslots - Document why we cannot exit with KVM_EXIT_MMIO when taking a write fault from a S1 PTW on a R/O memslot - Put the Apple M2 on the naughty list for not being able to correctly implement the vgic SEIS feature, just like the M1 before it - Reviewer updates: Alex is stepping down, replaced by Zenghui x86: - Fix various rare locking issues in Xen emulation and teach lockdep to detect them - Documentation improvements - Do not return host topology information from KVM_GET_SUPPORTED_CPUID" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: KVM: x86/xen: Avoid deadlock by adding kvm->arch.xen.xen_lock leaf node lock KVM: Ensure lockdep knows about kvm->lock vs. vcpu->mutex ordering rule KVM: x86/xen: Fix potential deadlock in kvm_xen_update_runstate_guest() KVM: x86/xen: Fix lockdep warning on "recursive" gpc locking Documentation: kvm: fix SRCU locking order docs KVM: x86: Do not return host topology information from KVM_GET_SUPPORTED_CPUID KVM: nSVM: clarify recalc_intercepts() wrt CR8 MAINTAINERS: Remove myself as a KVM/arm64 reviewer MAINTAINERS: Add Zenghui Yu as a KVM/arm64 reviewer KVM: arm64: vgic: Add Apple M2 cpus to the list of broken SEIS implementations KVM: arm64: Convert FSC_* over to ESR_ELx_FSC_* KVM: arm64: Document the behaviour of S1PTW faults on RO memslots KVM: arm64: Fix S1PTW handling on RO memslots KVM: arm64: PMU: Fix PMCR_EL0 reset value
author: Linus Torvalds <torvalds@linux-foundation.org> 2023-01-13 14:41:50 -0600
committer: Linus Torvalds <torvalds@linux-foundation.org> 2023-01-13 14:41:50 -0600
commit: 92783a90bcbde8659dd4a160506c46c56db494d6 (patch)
tree: 080a6edd7cf645b404a7b0ea28abba60c2942d16 /arch/x86
parent: f5fe24ef17b5fbe6db49534163e77499fb10ae8c (diff)
parent: 310bc39546a435c83cc27a0eba878afac0d74714 (diff)
4 files changed, 72 insertions, 63 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f35f1ff4427b..6aaae18f1854 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1111,6 +1111,7 @@ struct msr_bitmap_range {
 
 /* Xen emulation context */
 struct kvm_xen {
+	struct mutex xen_lock;
 	u32 xen_version;
 	bool long_mode;
 	bool runstate_update_flag;
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index b14653b61470..596061c1610e 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -770,16 +770,22 @@ struct kvm_cpuid_array {
 	int nent;
 };
 
+static struct kvm_cpuid_entry2 *get_next_cpuid(struct kvm_cpuid_array *array)
+{
+	if (array->nent >= array->maxnent)
+		return NULL;
+
+	return &array->entries[array->nent++];
+}
+
 static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array,
 					      u32 function, u32 index)
 {
-	struct kvm_cpuid_entry2 *entry;
+	struct kvm_cpuid_entry2 *entry = get_next_cpuid(array);
 
-	if (array->nent >= array->maxnent)
+	if (!entry)
 		return NULL;
 
-	entry = &array->entries[array->nent++];
-
 	memset(entry, 0, sizeof(*entry));
 	entry->function = function;
 	entry->index = index;
@@ -956,22 +962,13 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 		entry->edx = edx.full;
 		break;
 	}
-	/*
-	 * Per Intel's SDM, the 0x1f is a superset of 0xb,
-	 * thus they can be handled by common code.
-	 */
 	case 0x1f:
 	case 0xb:
 		/*
-		 * Populate entries until the level type (ECX[15:8]) of the
-		 * previous entry is zero.  Note, CPUID EAX.{0x1f,0xb}.0 is
-		 * the starting entry, filled by the primary do_host_cpuid().
+		 * No topology; a valid topology is indicated by the presence
+		 * of subleaf 1.
 		 */
-		for (i = 1; entry->ecx & 0xff00; ++i) {
-			entry = do_host_cpuid(array, function, i);
-			if (!entry)
-				goto out;
-		}
+		entry->eax = entry->ebx = entry->ecx = 0;
 		break;
 	case 0xd: {
 		u64 permitted_xcr0 = kvm_caps.supported_xcr0 & xstate_get_guest_group_perm();
@@ -1202,6 +1199,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 		entry->ebx = entry->ecx = entry->edx = 0;
 		break;
 	case 0x8000001e:
+		/* Do not return host topology information.  */
+		entry->eax = entry->ebx = entry->ecx = 0;
+		entry->edx = 0; /* reserved */
 		break;
 	case 0x8000001F:
 		if (!kvm_cpu_cap_has(X86_FEATURE_SEV)) {
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index bc9cd7086fa9..add65dd59756 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -138,15 +138,13 @@ void recalc_intercepts(struct vcpu_svm *svm)
 		c->intercepts[i] = h->intercepts[i];
 
 	if (g->int_ctl & V_INTR_MASKING_MASK) {
-		/* We only want the cr8 intercept bits of L1 */
-		vmcb_clr_intercept(c, INTERCEPT_CR8_READ);
-		vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
-
 		/*
-		 * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not
-		 * affect any interrupt we may want to inject; therefore,
-		 * interrupt window vmexits are irrelevant to L0.
+		 * Once running L2 with HF_VINTR_MASK, EFLAGS.IF and CR8
+		 * does not affect any interrupt we may want to inject;
+		 * therefore, writes to CR8 are irrelevant to L0, as are
+		 * interrupt window vmexits.
 		 */
+		vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
 		vmcb_clr_intercept(c, INTERCEPT_VINTR);
 	}
 
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 2e29bdc2949c..8fd41f5deae3 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -271,7 +271,15 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
 	 * Attempt to obtain the GPC lock on *both* (if there are two)
 	 * gfn_to_pfn caches that cover the region.
 	 */
-	read_lock_irqsave(&gpc1->lock, flags);
+	if (atomic) {
+		local_irq_save(flags);
+		if (!read_trylock(&gpc1->lock)) {
+			local_irq_restore(flags);
+			return;
+		}
+	} else {
+		read_lock_irqsave(&gpc1->lock, flags);
+	}
 	while (!kvm_gpc_check(gpc1, user_len1)) {
 		read_unlock_irqrestore(&gpc1->lock, flags);
 
@@ -304,9 +312,18 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
 		 * The guest's runstate_info is split across two pages and we
 		 * need to hold and validate both GPCs simultaneously. We can
 		 * declare a lock ordering GPC1 > GPC2 because nothing else
-		 * takes them more than one at a time.
+		 * takes them more than one at a time. Set a subclass on the
+		 * gpc1 lock to make lockdep shut up about it.
 		 */
-		read_lock(&gpc2->lock);
+		lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_);
+		if (atomic) {
+			if (!read_trylock(&gpc2->lock)) {
+				read_unlock_irqrestore(&gpc1->lock, flags);
+				return;
+			}
+		} else {
+			read_lock(&gpc2->lock);
+		}
 
 		if (!kvm_gpc_check(gpc2, user_len2)) {
 			read_unlock(&gpc2->lock);
@@ -590,26 +607,26 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
 		if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) {
 			r = -EINVAL;
 		} else {
-			mutex_lock(&kvm->lock);
+			mutex_lock(&kvm->arch.xen.xen_lock);
 			kvm->arch.xen.long_mode = !!data->u.long_mode;
-			mutex_unlock(&kvm->lock);
+			mutex_unlock(&kvm->arch.xen.xen_lock);
 			r = 0;
 		}
 		break;
 
 	case KVM_XEN_ATTR_TYPE_SHARED_INFO:
-		mutex_lock(&kvm->lock);
+		mutex_lock(&kvm->arch.xen.xen_lock);
 		r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
-		mutex_unlock(&kvm->lock);
+		mutex_unlock(&kvm->arch.xen.xen_lock);
 		break;
 
 	case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
 		if (data->u.vector && data->u.vector < 0x10)
 			r = -EINVAL;
 		else {
-			mutex_lock(&kvm->lock);
+			mutex_lock(&kvm->arch.xen.xen_lock);
 			kvm->arch.xen.upcall_vector = data->u.vector;
-			mutex_unlock(&kvm->lock);
+			mutex_unlock(&kvm->arch.xen.xen_lock);
 			r = 0;
 		}
 		break;
@@ -619,9 +636,9 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
 		break;
 
 	case KVM_XEN_ATTR_TYPE_XEN_VERSION:
-		mutex_lock(&kvm->lock);
+		mutex_lock(&kvm->arch.xen.xen_lock);
 		kvm->arch.xen.xen_version = data->u.xen_version;
-		mutex_unlock(&kvm->lock);
+		mutex_unlock(&kvm->arch.xen.xen_lock);
 		r = 0;
 		break;
 
@@ -630,9 +647,9 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
 			r = -EOPNOTSUPP;
 			break;
 		}
-		mutex_lock(&kvm->lock);
+		mutex_lock(&kvm->arch.xen.xen_lock);
 		kvm->arch.xen.runstate_update_flag = !!data->u.runstate_update_flag;
-		mutex_unlock(&kvm->lock);
+		mutex_unlock(&kvm->arch.xen.xen_lock);
 		r = 0;
 		break;
 
@@ -647,7 +664,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
 {
 	int r = -ENOENT;
 
-	mutex_lock(&kvm->lock);
+	mutex_lock(&kvm->arch.xen.xen_lock);
 
 	switch (data->type) {
 	case KVM_XEN_ATTR_TYPE_LONG_MODE:
@@ -686,7 +703,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
 		break;
 	}
 
-	mutex_unlock(&kvm->lock);
+	mutex_unlock(&kvm->arch.xen.xen_lock);
 	return r;
 }
 
@@ -694,7 +711,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 {
 	int idx, r = -ENOENT;
 
-	mutex_lock(&vcpu->kvm->lock);
+	mutex_lock(&vcpu->kvm->arch.xen.xen_lock);
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
 
 	switch (data->type) {
@@ -922,7 +939,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 	}
 
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
-	mutex_unlock(&vcpu->kvm->lock);
+	mutex_unlock(&vcpu->kvm->arch.xen.xen_lock);
 	return r;
 }
 
@@ -930,7 +947,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 {
 	int r = -ENOENT;
 
-	mutex_lock(&vcpu->kvm->lock);
+	mutex_lock(&vcpu->kvm->arch.xen.xen_lock);
 
 	switch (data->type) {
 	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
@@ -1013,7 +1030,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 		break;
 	}
 
-	mutex_unlock(&vcpu->kvm->lock);
+	mutex_unlock(&vcpu->kvm->arch.xen.xen_lock);
 	return r;
 }
 
@@ -1106,7 +1123,7 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
 	     xhc->blob_size_32 || xhc->blob_size_64))
 		return -EINVAL;
 
-	mutex_lock(&kvm->lock);
+	mutex_lock(&kvm->arch.xen.xen_lock);
 
 	if (xhc->msr && !kvm->arch.xen_hvm_config.msr)
 		static_branch_inc(&kvm_xen_enabled.key);
@@ -1115,7 +1132,7 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
 
 	memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc));
 
-	mutex_unlock(&kvm->lock);
+	mutex_unlock(&kvm->arch.xen.xen_lock);
 	return 0;
 }
 
@@ -1658,15 +1675,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
 		mm_borrowed = true;
 	}
 
-	/*
-	 * For the irqfd workqueue, using the main kvm->lock mutex is
-	 * fine since this function is invoked from kvm_set_irq() with
-	 * no other lock held, no srcu. In future if it will be called
-	 * directly from a vCPU thread (e.g. on hypercall for an IPI)
-	 * then it may need to switch to using a leaf-node mutex for
-	 * serializing the shared_info mapping.
-	 */
-	mutex_lock(&kvm->lock);
+	mutex_lock(&kvm->arch.xen.xen_lock);
 
 	/*
 	 * It is theoretically possible for the page to be unmapped
@@ -1695,7 +1704,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
 		srcu_read_unlock(&kvm->srcu, idx);
 	} while(!rc);
 
-	mutex_unlock(&kvm->lock);
+	mutex_unlock(&kvm->arch.xen.xen_lock);
 
 	if (mm_borrowed)
 		kthread_unuse_mm(kvm->mm);
@@ -1811,7 +1820,7 @@ static int kvm_xen_eventfd_update(struct kvm *kvm,
 	int ret;
 
 	/* Protect writes to evtchnfd as well as the idr lookup.  */
-	mutex_lock(&kvm->lock);
+	mutex_lock(&kvm->arch.xen.xen_lock);
 	evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port);
 
 	ret = -ENOENT;
@@ -1842,7 +1851,7 @@ static int kvm_xen_eventfd_update(struct kvm *kvm,
 	}
 	ret = 0;
 out_unlock:
-	mutex_unlock(&kvm->lock);
+	mutex_unlock(&kvm->arch.xen.xen_lock);
 	return ret;
 }
 
@@ -1905,10 +1914,10 @@ static int kvm_xen_eventfd_assign(struct kvm *kvm,
 		evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
 	}
 
-	mutex_lock(&kvm->lock);
+	mutex_lock(&kvm->arch.xen.xen_lock);
 	ret = idr_alloc(&kvm->arch.xen.evtchn_ports, evtchnfd, port, port + 1,
 			GFP_KERNEL);
-	mutex_unlock(&kvm->lock);
+	mutex_unlock(&kvm->arch.xen.xen_lock);
 	if (ret >= 0)
 		return 0;
 
@@ -1926,9 +1935,9 @@ static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port)
 {
 	struct evtchnfd *evtchnfd;
 
-	mutex_lock(&kvm->lock);
+	mutex_lock(&kvm->arch.xen.xen_lock);
 	evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, port);
-	mutex_unlock(&kvm->lock);
+	mutex_unlock(&kvm->arch.xen.xen_lock);
 
 	if (!evtchnfd)
 		return -ENOENT;
@@ -1946,7 +1955,7 @@ static int kvm_xen_eventfd_reset(struct kvm *kvm)
 	int i;
 	int n = 0;
 
-	mutex_lock(&kvm->lock);
+	mutex_lock(&kvm->arch.xen.xen_lock);
 
 	/*
 	 * Because synchronize_srcu() cannot be called inside the
@@ -1958,7 +1967,7 @@ static int kvm_xen_eventfd_reset(struct kvm *kvm)
 
 	all_evtchnfds = kmalloc_array(n, sizeof(struct evtchnfd *), GFP_KERNEL);
 	if (!all_evtchnfds) {
-		mutex_unlock(&kvm->lock);
+		mutex_unlock(&kvm->arch.xen.xen_lock);
 		return -ENOMEM;
 	}
 
@@ -1967,7 +1976,7 @@ static int kvm_xen_eventfd_reset(struct kvm *kvm)
 		all_evtchnfds[n++] = evtchnfd;
 		idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port);
 	}
-	mutex_unlock(&kvm->lock);
+	mutex_unlock(&kvm->arch.xen.xen_lock);
 
 	synchronize_srcu(&kvm->srcu);
 
@@ -2069,6 +2078,7 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
 
 void kvm_xen_init_vm(struct kvm *kvm)
 {
+	mutex_init(&kvm->arch.xen.xen_lock);
 	idr_init(&kvm->arch.xen.evtchn_ports);
 	kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm, NULL, KVM_HOST_USES_PFN);
 }
author	Linus Torvalds <torvalds@linux-foundation.org>	2023-01-13 14:41:50 -0600
committer	Linus Torvalds <torvalds@linux-foundation.org>	2023-01-13 14:41:50 -0600
commit	92783a90bcbde8659dd4a160506c46c56db494d6 (patch)
tree	080a6edd7cf645b404a7b0ea28abba60c2942d16 /arch/x86
parent	f5fe24ef17b5fbe6db49534163e77499fb10ae8c (diff)
parent	310bc39546a435c83cc27a0eba878afac0d74714 (diff)