summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Paolo Bonzini <pbonzini@redhat.com> 2024-07-16 09:54:57 -0400
committer Paolo Bonzini <pbonzini@redhat.com> 2024-07-16 09:54:57 -0400
commit 5c5ddf71071f01fabe9380e16c36c9263d40d528 (patch)
tree b7693fc445cc7df841414f6ed6387f651d0b66f5
parent 34b69edecb47284e81f0204a50db5e55fe93cb52 (diff)
parent 377b2f359d1f71c75f8cc352b5c81f2210312d83 (diff)
Merge tag 'kvm-x86-mtrrs-6.11' of https://github.com/kvm-x86/linux into HEAD
KVM x86 MTRR virtualization removal Remove support for virtualizing MTRRs on Intel CPUs, along with a nasty CR0.CD hack, and instead always honor guest PAT on CPUs that support self-snoop.
-rw-r--r-- Documentation/virt/kvm/api.rst 6
-rw-r--r-- Documentation/virt/kvm/x86/errata.rst 18
-rw-r--r-- arch/x86/include/asm/kvm_host.h 15
-rw-r--r-- arch/x86/kvm/mmu.h 7
-rw-r--r-- arch/x86/kvm/mmu/mmu.c 35
-rw-r--r-- arch/x86/kvm/mtrr.c 644
-rw-r--r-- arch/x86/kvm/vmx/vmx.c 40
-rw-r--r-- arch/x86/kvm/x86.c 24
-rw-r--r-- arch/x86/kvm/x86.h 4
-rw-r--r-- include/linux/srcu.h 14
10 files changed, 105 insertions, 702 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 798ad65f4fee..8e5dad80b337 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -8025,7 +8025,11 @@ The valid bits in cap.args[0] are:
When this quirk is disabled, the reset value
is 0x10000 (APIC_LVT_MASKED).
- KVM_X86_QUIRK_CD_NW_CLEARED By default, KVM clears CR0.CD and CR0.NW.
+ KVM_X86_QUIRK_CD_NW_CLEARED By default, KVM clears CR0.CD and CR0.NW on
+ AMD CPUs to work around buggy guest firmware
+ that runs in perpetuity with CR0.CD, i.e.
+ with caches in "no fill" mode.
+
When this quirk is disabled, KVM does not
change the value of CR0.CD and CR0.NW.
diff --git a/Documentation/virt/kvm/x86/errata.rst b/Documentation/virt/kvm/x86/errata.rst
index 49a05f24747b..4116045a8744 100644
--- a/Documentation/virt/kvm/x86/errata.rst
+++ b/Documentation/virt/kvm/x86/errata.rst
@@ -48,3 +48,21 @@ have the same physical APIC ID, KVM will deliver events targeting that APIC ID
only to the vCPU with the lowest vCPU ID. If KVM_X2APIC_API_USE_32BIT_IDS is
not enabled, KVM follows x86 architecture when processing interrupts (all vCPUs
matching the target APIC ID receive the interrupt).
+
+MTRRs
+-----
+KVM does not virtualize guest MTRR memory types. KVM emulates accesses to MTRR
+MSRs, i.e. {RD,WR}MSR in the guest will behave as expected, but KVM does not
+honor guest MTRRs when determining the effective memory type, and instead
+treats all of guest memory as having Writeback (WB) MTRRs.
+
+CR0.CD
+------
+KVM does not virtualize CR0.CD on Intel CPUs. Similar to MTRR MSRs, KVM
+emulates CR0.CD accesses so that loads and stores from/to CR0 behave as
+expected, but setting CR0.CD=1 has no impact on the cacheability of guest
+memory.
+
+Note, this erratum does not affect AMD CPUs, which fully virtualize CR0.CD in
+hardware, i.e. put the CPU caches into "no fill" mode when CR0.CD=1, even when
+running in the guest. \ No newline at end of file
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d5101f52e76c..210408361e9a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -160,7 +160,6 @@
#define KVM_MIN_FREE_MMU_PAGES 5
#define KVM_REFILL_PAGES 25
#define KVM_MAX_CPUID_ENTRIES 256
-#define KVM_NR_FIXED_MTRR_REGION 88
#define KVM_NR_VAR_MTRR 8
#define ASYNC_PF_PER_VCPU 64
@@ -605,18 +604,12 @@ enum {
KVM_DEBUGREG_WONT_EXIT = 2,
};
-struct kvm_mtrr_range {
- u64 base;
- u64 mask;
- struct list_head node;
-};
-
struct kvm_mtrr {
- struct kvm_mtrr_range var_ranges[KVM_NR_VAR_MTRR];
- mtrr_type fixed_ranges[KVM_NR_FIXED_MTRR_REGION];
+ u64 var[KVM_NR_VAR_MTRR * 2];
+ u64 fixed_64k;
+ u64 fixed_16k[2];
+ u64 fixed_4k[8];
u64 deftype;
-
- struct list_head head;
};
/* Hyper-V SynIC timer */
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index f2e7e5c9b9ef..24ea7183d7b4 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -221,12 +221,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
return -(u32)fault & errcode;
}
-bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma);
-
-static inline bool kvm_mmu_honors_guest_mtrrs(struct kvm *kvm)
-{
- return __kvm_mmu_honors_guest_mtrrs(kvm_arch_has_noncoherent_dma(kvm));
-}
+bool kvm_mmu_may_ignore_guest_pat(void);
int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index d3b8e4fad924..ee13fc7b0e27 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4671,38 +4671,23 @@ out_unlock:
}
#endif
-bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma)
+bool kvm_mmu_may_ignore_guest_pat(void)
{
/*
- * If host MTRRs are ignored (shadow_memtype_mask is non-zero), and the
- * VM has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is
- * to honor the memtype from the guest's MTRRs so that guest accesses
- * to memory that is DMA'd aren't cached against the guest's wishes.
- *
- * Note, KVM may still ultimately ignore guest MTRRs for certain PFNs,
- * e.g. KVM will force UC memtype for host MMIO.
+ * When EPT is enabled (shadow_memtype_mask is non-zero), the CPU does
+ * not support self-snoop (or is affected by an erratum), and the VM
+ * has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to
+ * honor the memtype from the guest's PAT so that guest accesses to
+ * memory that is DMA'd aren't cached against the guest's wishes. As a
+ * result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA,
+ * KVM _always_ ignores or honors guest PAT, i.e. doesn't toggle SPTE
+ * bits in response to non-coherent device (un)registration.
*/
- return vm_has_noncoherent_dma && shadow_memtype_mask;
+ return !static_cpu_has(X86_FEATURE_SELFSNOOP) && shadow_memtype_mask;
}
int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- /*
- * If the guest's MTRRs may be used to compute the "real" memtype,
- * restrict the mapping level to ensure KVM uses a consistent memtype
- * across the entire mapping.
- */
- if (kvm_mmu_honors_guest_mtrrs(vcpu->kvm)) {
- for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
- int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
- gfn_t base = gfn_round_for_level(fault->gfn,
- fault->max_level);
-
- if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
- break;
- }
- }
-
#ifdef CONFIG_X86_64
if (tdp_mmu_enabled)
return kvm_tdp_mmu_page_fault(vcpu, fault);
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index a67c28a56417..05490b9d8a43 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -19,33 +19,21 @@
#include <asm/mtrr.h>
#include "cpuid.h"
-#include "mmu.h"
-#define IA32_MTRR_DEF_TYPE_E (1ULL << 11)
-#define IA32_MTRR_DEF_TYPE_FE (1ULL << 10)
-#define IA32_MTRR_DEF_TYPE_TYPE_MASK (0xff)
-
-static bool is_mtrr_base_msr(unsigned int msr)
-{
- /* MTRR base MSRs use even numbers, masks use odd numbers. */
- return !(msr & 0x1);
-}
-
-static struct kvm_mtrr_range *var_mtrr_msr_to_range(struct kvm_vcpu *vcpu,
- unsigned int msr)
+static u64 *find_mtrr(struct kvm_vcpu *vcpu, unsigned int msr)
{
- int index = (msr - MTRRphysBase_MSR(0)) / 2;
-
- return &vcpu->arch.mtrr_state.var_ranges[index];
-}
+ int index;
-static bool msr_mtrr_valid(unsigned msr)
-{
switch (msr) {
case MTRRphysBase_MSR(0) ... MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1):
+ index = msr - MTRRphysBase_MSR(0);
+ return &vcpu->arch.mtrr_state.var[index];
case MSR_MTRRfix64K_00000:
+ return &vcpu->arch.mtrr_state.fixed_64k;
case MSR_MTRRfix16K_80000:
case MSR_MTRRfix16K_A0000:
+ index = msr - MSR_MTRRfix16K_80000;
+ return &vcpu->arch.mtrr_state.fixed_16k[index];
case MSR_MTRRfix4K_C0000:
case MSR_MTRRfix4K_C8000:
case MSR_MTRRfix4K_D0000:
@@ -54,10 +42,14 @@ static bool msr_mtrr_valid(unsigned msr)
case MSR_MTRRfix4K_E8000:
case MSR_MTRRfix4K_F0000:
case MSR_MTRRfix4K_F8000:
+ index = msr - MSR_MTRRfix4K_C0000;
+ return &vcpu->arch.mtrr_state.fixed_4k[index];
case MSR_MTRRdefType:
- return true;
+ return &vcpu->arch.mtrr_state.deftype;
+ default:
+ break;
}
- return false;
+ return NULL;
}
static bool valid_mtrr_type(unsigned t)
@@ -70,9 +62,6 @@ static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
int i;
u64 mask;
- if (!msr_mtrr_valid(msr))
- return false;
-
if (msr == MSR_MTRRdefType) {
if (data & ~0xcff)
return false;
@@ -85,8 +74,9 @@ static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
}
/* variable MTRRs */
- WARN_ON(!(msr >= MTRRphysBase_MSR(0) &&
- msr <= MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1)));
+ if (WARN_ON_ONCE(!(msr >= MTRRphysBase_MSR(0) &&
+ msr <= MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1))))
+ return false;
mask = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
if ((msr & 1) == 0) {
@@ -94,309 +84,32 @@ static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if (!valid_mtrr_type(data & 0xff))
return false;
mask |= 0xf00;
- } else
+ } else {
/* MTRR mask */
mask |= 0x7ff;
-
- return (data & mask) == 0;
-}
-
-static bool mtrr_is_enabled(struct kvm_mtrr *mtrr_state)
-{
- return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_E);
-}
-
-static bool fixed_mtrr_is_enabled(struct kvm_mtrr *mtrr_state)
-{
- return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_FE);
-}
-
-static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state)
-{
- return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK;
-}
-
-static u8 mtrr_disabled_type(struct kvm_vcpu *vcpu)
-{
- /*
- * Intel SDM 11.11.2.2: all MTRRs are disabled when
- * IA32_MTRR_DEF_TYPE.E bit is cleared, and the UC
- * memory type is applied to all of physical memory.
- *
- * However, virtual machines can be run with CPUID such that
- * there are no MTRRs. In that case, the firmware will never
- * enable MTRRs and it is obviously undesirable to run the
- * guest entirely with UC memory and we use WB.
- */
- if (guest_cpuid_has(vcpu, X86_FEATURE_MTRR))
- return MTRR_TYPE_UNCACHABLE;
- else
- return MTRR_TYPE_WRBACK;
-}
-
-/*
-* Three terms are used in the following code:
-* - segment, it indicates the address segments covered by fixed MTRRs.
-* - unit, it corresponds to the MSR entry in the segment.
-* - range, a range is covered in one memory cache type.
-*/
-struct fixed_mtrr_segment {
- u64 start;
- u64 end;
-
- int range_shift;
-
- /* the start position in kvm_mtrr.fixed_ranges[]. */
- int range_start;
-};
-
-static struct fixed_mtrr_segment fixed_seg_table[] = {
- /* MSR_MTRRfix64K_00000, 1 unit. 64K fixed mtrr. */
- {
- .start = 0x0,
- .end = 0x80000,
- .range_shift = 16, /* 64K */
- .range_start = 0,
- },
-
- /*
- * MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000, 2 units,
- * 16K fixed mtrr.
- */
- {
- .start = 0x80000,
- .end = 0xc0000,
- .range_shift = 14, /* 16K */
- .range_start = 8,
- },
-
- /*
- * MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000, 8 units,
- * 4K fixed mtrr.
- */
- {
- .start = 0xc0000,
- .end = 0x100000,
- .range_shift = 12, /* 12K */
- .range_start = 24,
- }
-};
-
-/*
- * The size of unit is covered in one MSR, one MSR entry contains
- * 8 ranges so that unit size is always 8 * 2^range_shift.
- */
-static u64 fixed_mtrr_seg_unit_size(int seg)
-{
- return 8 << fixed_seg_table[seg].range_shift;
-}
-
-static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit)
-{
- switch (msr) {
- case MSR_MTRRfix64K_00000:
- *seg = 0;
- *unit = 0;
- break;
- case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000:
- *seg = 1;
- *unit = array_index_nospec(
- msr - MSR_MTRRfix16K_80000,
- MSR_MTRRfix16K_A0000 - MSR_MTRRfix16K_80000 + 1);
- break;
- case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
- *seg = 2;
- *unit = array_index_nospec(
- msr - MSR_MTRRfix4K_C0000,
- MSR_MTRRfix4K_F8000 - MSR_MTRRfix4K_C0000 + 1);
- break;
- default:
- return false;
}
- return true;
-}
-
-static void fixed_mtrr_seg_unit_range(int seg, int unit, u64 *start, u64 *end)
-{
- struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
- u64 unit_size = fixed_mtrr_seg_unit_size(seg);
-
- *start = mtrr_seg->start + unit * unit_size;
- *end = *start + unit_size;
- WARN_ON(*end > mtrr_seg->end);
-}
-
-static int fixed_mtrr_seg_unit_range_index(int seg, int unit)
-{
- struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
-
- WARN_ON(mtrr_seg->start + unit * fixed_mtrr_seg_unit_size(seg)
- > mtrr_seg->end);
-
- /* each unit has 8 ranges. */
- return mtrr_seg->range_start + 8 * unit;
-}
-
-static int fixed_mtrr_seg_end_range_index(int seg)
-{
- struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
- int n;
-
- n = (mtrr_seg->end - mtrr_seg->start) >> mtrr_seg->range_shift;
- return mtrr_seg->range_start + n - 1;
-}
-
-static bool fixed_msr_to_range(u32 msr, u64 *start, u64 *end)
-{
- int seg, unit;
-
- if (!fixed_msr_to_seg_unit(msr, &seg, &unit))
- return false;
-
- fixed_mtrr_seg_unit_range(seg, unit, start, end);
- return true;
-}
-
-static int fixed_msr_to_range_index(u32 msr)
-{
- int seg, unit;
-
- if (!fixed_msr_to_seg_unit(msr, &seg, &unit))
- return -1;
-
- return fixed_mtrr_seg_unit_range_index(seg, unit);
-}
-
-static int fixed_mtrr_addr_to_seg(u64 addr)
-{
- struct fixed_mtrr_segment *mtrr_seg;
- int seg, seg_num = ARRAY_SIZE(fixed_seg_table);
-
- for (seg = 0; seg < seg_num; seg++) {
- mtrr_seg = &fixed_seg_table[seg];
- if (mtrr_seg->start <= addr && addr < mtrr_seg->end)
- return seg;
- }
-
- return -1;
-}
-
-static int fixed_mtrr_addr_seg_to_range_index(u64 addr, int seg)
-{
- struct fixed_mtrr_segment *mtrr_seg;
- int index;
-
- mtrr_seg = &fixed_seg_table[seg];
- index = mtrr_seg->range_start;
- index += (addr - mtrr_seg->start) >> mtrr_seg->range_shift;
- return index;
-}
-
-static u64 fixed_mtrr_range_end_addr(int seg, int index)
-{
- struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
- int pos = index - mtrr_seg->range_start;
-
- return mtrr_seg->start + ((pos + 1) << mtrr_seg->range_shift);
-}
-
-static void var_mtrr_range(struct kvm_mtrr_range *range, u64 *start, u64 *end)
-{
- u64 mask;
-
- *start = range->base & PAGE_MASK;
-
- mask = range->mask & PAGE_MASK;
-
- /* This cannot overflow because writing to the reserved bits of
- * variable MTRRs causes a #GP.
- */
- *end = (*start | ~mask) + 1;
-}
-
-static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
-{
- struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
- gfn_t start, end;
-
- if (!kvm_mmu_honors_guest_mtrrs(vcpu->kvm))
- return;
-
- if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType)
- return;
-
- /* fixed MTRRs. */
- if (fixed_msr_to_range(msr, &start, &end)) {
- if (!fixed_mtrr_is_enabled(mtrr_state))
- return;
- } else if (msr == MSR_MTRRdefType) {
- start = 0x0;
- end = ~0ULL;
- } else {
- /* variable range MTRRs. */
- var_mtrr_range(var_mtrr_msr_to_range(vcpu, msr), &start, &end);
- }
-
- kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end));
-}
-
-static bool var_mtrr_range_is_valid(struct kvm_mtrr_range *range)
-{
- return (range->mask & (1 << 11)) != 0;
-}
-
-static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-{
- struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
- struct kvm_mtrr_range *tmp, *cur;
-
- cur = var_mtrr_msr_to_range(vcpu, msr);
-
- /* remove the entry if it's in the list. */
- if (var_mtrr_range_is_valid(cur))
- list_del(&cur->node);
-
- /*
- * Set all illegal GPA bits in the mask, since those bits must
- * implicitly be 0. The bits are then cleared when reading them.
- */
- if (is_mtrr_base_msr(msr))
- cur->base = data;
- else
- cur->mask = data | kvm_vcpu_reserved_gpa_bits_raw(vcpu);
-
- /* add it to the list if it's enabled. */
- if (var_mtrr_range_is_valid(cur)) {
- list_for_each_entry(tmp, &mtrr_state->head, node)
- if (cur->base >= tmp->base)
- break;
- list_add_tail(&cur->node, &tmp->node);
- }
+ return (data & mask) == 0;
}
int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
- int index;
+ u64 *mtrr;
- if (!kvm_mtrr_valid(vcpu, msr, data))
+ mtrr = find_mtrr(vcpu, msr);
+ if (!mtrr)
return 1;
- index = fixed_msr_to_range_index(msr);
- if (index >= 0)
- *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index] = data;
- else if (msr == MSR_MTRRdefType)
- vcpu->arch.mtrr_state.deftype = data;
- else
- set_var_mtrr_msr(vcpu, msr, data);
+ if (!kvm_mtrr_valid(vcpu, msr, data))
+ return 1;
- update_mtrr(vcpu, msr);
+ *mtrr = data;
return 0;
}
int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
- int index;
+ u64 *mtrr;
/* MSR_MTRRcap is a readonly MSR. */
if (msr == MSR_MTRRcap) {
@@ -410,311 +123,10 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
return 0;
}
- if (!msr_mtrr_valid(msr))
+ mtrr = find_mtrr(vcpu, msr);
+ if (!mtrr)
return 1;
- index = fixed_msr_to_range_index(msr);
- if (index >= 0) {
- *pdata = *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index];
- } else if (msr == MSR_MTRRdefType) {
- *pdata = vcpu->arch.mtrr_state.deftype;
- } else {
- /* Variable MTRRs */
- if (is_mtrr_base_msr(msr))
- *pdata = var_mtrr_msr_to_range(vcpu, msr)->base;
- else
- *pdata = var_mtrr_msr_to_range(vcpu, msr)->mask;
-
- *pdata &= ~kvm_vcpu_reserved_gpa_bits_raw(vcpu);
- }
-
+ *pdata = *mtrr;
return 0;
}
-
-void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu)
-{
- INIT_LIST_HEAD(&vcpu->arch.mtrr_state.head);
-}
-
-struct mtrr_iter {
- /* input fields. */
- struct kvm_mtrr *mtrr_state;
- u64 start;
- u64 end;
-
- /* output fields. */
- int mem_type;
- /* mtrr is completely disabled? */
- bool mtrr_disabled;
- /* [start, end) is not fully covered in MTRRs? */
- bool partial_map;
-
- /* private fields. */
- union {
- /* used for fixed MTRRs. */
- struct {
- int index;
- int seg;
- };
-
- /* used for var MTRRs. */
- struct {
- struct kvm_mtrr_range *range;
- /* max address has been covered in var MTRRs. */
- u64 start_max;
- };
- };
-
- bool fixed;
-};
-
-static bool mtrr_lookup_fixed_start(struct mtrr_iter *iter)
-{
- int seg, index;
-
- if (!fixed_mtrr_is_enabled(iter->mtrr_state))
- return false;
-
- seg = fixed_mtrr_addr_to_seg(iter->start);
- if (seg < 0)
- return false;
-
- iter->fixed = true;
- index = fixed_mtrr_addr_seg_to_range_index(iter->start, seg);
- iter->index = index;
- iter->seg = seg;
- return true;
-}
-
-static bool match_var_range(struct mtrr_iter *iter,
- struct kvm_mtrr_range *range)
-{
- u64 start, end;
-
- var_mtrr_range(range, &start, &end);
- if (!(start >= iter->end || end <= iter->start)) {
- iter->range = range;
-
- /*
- * the function is called when we do kvm_mtrr.head walking.
- * Range has the minimum base address which interleaves
- * [looker->start_max, looker->end).
- */
- iter->partial_map |= iter->start_max < start;
-
- /* update the max address has been covered. */
- iter->start_max = max(iter->start_max, end);
- return true;
- }
-
- return false;
-}
-
-static void __mtrr_lookup_var_next(struct mtrr_iter *iter)
-{
- struct kvm_mtrr *mtrr_state = iter->mtrr_state;
-
- list_for_each_entry_continue(iter->range, &mtrr_state->head, node)
- if (match_var_range(iter, iter->range))
- return;
-
- iter->range = NULL;
- iter->partial_map |= iter->start_max < iter->end;
-}
-
-static void mtrr_lookup_var_start(struct mtrr_iter *iter)
-{
- struct kvm_mtrr *mtrr_state = iter->mtrr_state;
-
- iter->fixed = false;
- iter->start_max = iter->start;
- iter->range = NULL;
- iter->range = list_prepare_entry(iter->range, &mtrr_state->head, node);
-
- __mtrr_lookup_var_next(iter);
-}
-
-static void mtrr_lookup_fixed_next(struct mtrr_iter *iter)
-{
- /* terminate the lookup. */
- if (fixed_mtrr_range_end_addr(iter->seg, iter->index) >= iter->end) {
- iter->fixed = false;
- iter->range = NULL;
- return;
- }
-
- iter->index++;
-
- /* have looked up for all fixed MTRRs. */
- if (iter->index >= ARRAY_SIZE(iter->mtrr_state->fixed_ranges))
- return mtrr_lookup_var_start(iter);
-
- /* switch to next segment. */
- if (iter->index > fixed_mtrr_seg_end_range_index(iter->seg))
- iter->seg++;
-}
-
-static void mtrr_lookup_var_next(struct mtrr_iter *iter)
-{
- __mtrr_lookup_var_next(iter);
-}
-
-static void mtrr_lookup_start(struct mtrr_iter *iter)
-{
- if (!mtrr_is_enabled(iter->mtrr_state)) {
- iter->mtrr_disabled = true;
- return;
- }
-
- if (!mtrr_lookup_fixed_start(iter))
- mtrr_lookup_var_start(iter);
-}
-
-static void mtrr_lookup_init(struct mtrr_iter *iter,
- struct kvm_mtrr *mtrr_state, u64 start, u64 end)
-{
- iter->mtrr_state = mtrr_state;
- iter->start = start;
- iter->end = end;
- iter->mtrr_disabled = false;
- iter->partial_map = false;
- iter->fixed = false;
- iter->range = NULL;
-
- mtrr_lookup_start(iter);
-}
-
-static bool mtrr_lookup_okay(struct mtrr_iter *iter)
-{
- if (iter->fixed) {
- iter->mem_type = iter->mtrr_state->fixed_ranges[iter->index];
- return true;
- }
-
- if (iter->range) {
- iter->mem_type = iter->range->base & 0xff;
- return true;
- }
-
- return false;
-}
-
-static void mtrr_lookup_next(struct mtrr_iter *iter)
-{
- if (iter->fixed)
- mtrr_lookup_fixed_next(iter);
- else
- mtrr_lookup_var_next(iter);
-}
-
-#define mtrr_for_each_mem_type(_iter_, _mtrr_, _gpa_start_, _gpa_end_) \
- for (mtrr_lookup_init(_iter_, _mtrr_, _gpa_start_, _gpa_end_); \
- mtrr_lookup_okay(_iter_); mtrr_lookup_next(_iter_))
-
-u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
- struct mtrr_iter iter;
- u64 start, end;
- int type = -1;
- const int wt_wb_mask = (1 << MTRR_TYPE_WRBACK)
- | (1 << MTRR_TYPE_WRTHROUGH);
-
- start = gfn_to_gpa(gfn);
- end = start + PAGE_SIZE;
-
- mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
- int curr_type = iter.mem_type;
-
- /*
- * Please refer to Intel SDM Volume 3: 11.11.4.1 MTRR
- * Precedences.
- */
-
- if (type == -1) {
- type = curr_type;
- continue;
- }
-
- /*
- * If two or more variable memory ranges match and the
- * memory types are identical, then that memory type is
- * used.
- */
- if (type == curr_type)
- continue;
-
- /*
- * If two or more variable memory ranges match and one of
- * the memory types is UC, the UC memory type used.
- */
- if (curr_type == MTRR_TYPE_UNCACHABLE)
- return MTRR_TYPE_UNCACHABLE;
-
- /*
- * If two or more variable memory ranges match and the
- * memory types are WT and WB, the WT memory type is used.
- */
- if (((1 << type) & wt_wb_mask) &&
- ((1 << curr_type) & wt_wb_mask)) {
- type = MTRR_TYPE_WRTHROUGH;
- continue;
- }
-
- /*
- * For overlaps not defined by the above rules, processor
- * behavior is undefined.
- */
-
- /* We use WB for this undefined behavior. :( */
- return MTRR_TYPE_WRBACK;
- }
-
- if (iter.mtrr_disabled)
- return mtrr_disabled_type(vcpu);
-
- /* not contained in any MTRRs. */
- if (type == -1)
- return mtrr_default_type(mtrr_state);
-
- /*
- * We just check one page, partially covered by MTRRs is
- * impossible.
- */
- WARN_ON(iter.partial_map);
-
- return type;
-}
-EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type);
-
-bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
- int page_num)
-{
- struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
- struct mtrr_iter iter;
- u64 start, end;
- int type = -1;
-
- start = gfn_to_gpa(gfn);
- end = gfn_to_gpa(gfn + page_num);
- mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
- if (type == -1) {
- type = iter.mem_type;
- continue;
- }
-
- if (type != iter.mem_type)
- return false;
- }
-
- if (iter.mtrr_disabled)
- return true;
-
- if (!iter.partial_map)
- return true;
-
- if (type == -1)
- return true;
-
- return type == mtrr_default_type(mtrr_state);
-}
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index bedb9ba96918..13a6b0281e37 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7670,39 +7670,25 @@ int vmx_vm_init(struct kvm *kvm)
u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
- /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
- * memory aliases with conflicting memory types and sometimes MCEs.
- * We have to be careful as to what are honored and when.
- *
- * For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to
- * UC. The effective memory type is UC or WC depending on guest PAT.
- * This was historically the source of MCEs and we want to be
- * conservative.
- *
- * When there is no need to deal with noncoherent DMA (e.g., no VT-d
- * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The
- * EPT memory type is set to WB. The effective memory type is forced
- * WB.
- *
- * Otherwise, we trust guest. Guest CD/MTRR/PAT are all honored. The
- * EPT memory type is used to emulate guest CD/MTRR.
+ /*
+ * Force UC for host MMIO regions, as allowing the guest to access MMIO
+ * with cacheable accesses will result in Machine Checks.
*/
-
if (is_mmio)
return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
- if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
+ /*
+ * Force WB and ignore guest PAT if the VM does NOT have a non-coherent
+ * device attached and the CPU doesn't support self-snoop. Letting the
+ * guest control memory types on Intel CPUs without self-snoop may
+ * result in unexpected behavior, and so KVM's (historical) ABI is to
+ * trust the guest to behave only as a last resort.
+ */
+ if (!static_cpu_has(X86_FEATURE_SELFSNOOP) &&
+ !kvm_arch_has_noncoherent_dma(vcpu->kvm))
return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
- if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) {
- if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
- return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT;
- else
- return (MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT) |
- VMX_EPT_IPAT_BIT;
- }
-
- return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
+ return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT);
}
static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e90e1a74564e..281edbbfc83d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -946,11 +946,6 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon
if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
kvm_mmu_reset_context(vcpu);
-
- if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
- kvm_mmu_honors_guest_mtrrs(vcpu->kvm) &&
- !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
- kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
}
EXPORT_SYMBOL_GPL(kvm_post_set_cr0);
@@ -11182,6 +11177,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_vcpu_srcu_read_lock(vcpu);
/*
+ * Call this to ensure WC buffers in guest are evicted after each VM
+ * Exit, so that the evicted WC writes can be snooped across all CPUs.
+ */
+ smp_mb__after_srcu_read_lock();
+
+ /*
* Profile KVM exit RIPs:
*/
if (unlikely(prof_on == KVM_PROFILING)) {
@@ -12264,7 +12265,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
kvm_xen_init_vcpu(vcpu);
- kvm_vcpu_mtrr_init(vcpu);
vcpu_load(vcpu);
kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
kvm_vcpu_reset(vcpu, false);
@@ -13528,13 +13528,13 @@ EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm)
{
/*
- * Non-coherent DMA assignment and de-assignment will affect
- * whether KVM honors guest MTRRs and cause changes in memtypes
- * in TDP.
- * So, pass %true unconditionally to indicate non-coherent DMA was,
- * or will be involved, and that zapping SPTEs might be necessary.
+ * Non-coherent DMA assignment and de-assignment may affect whether or
+ * not KVM honors guest PAT, and thus may cause changes in EPT SPTEs
+ * due to toggling the "ignore PAT" bit. Zap all SPTEs when the first
+ * (or last) non-coherent device is (un)registered so that new SPTEs
+ * with the correct "ignore guest PAT" setting are created.
*/
- if (__kvm_mmu_honors_guest_mtrrs(true))
+ if (kvm_mmu_may_ignore_guest_pat())
kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL));
}
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index a88c65d3ea26..5da5b869a991 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -325,12 +325,8 @@ int handle_ud(struct kvm_vcpu *vcpu);
void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
struct kvm_queued_exception *ex);
-void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu);
-u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
-bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
- int page_num);
bool kvm_vector_hashing_enabled(void);
void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code);
int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 236610e4a8fa..1cb4527076de 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -343,6 +343,20 @@ static inline void smp_mb__after_srcu_read_unlock(void)
/* __srcu_read_unlock has smp_mb() internally so nothing to do here. */
}
+/**
+ * smp_mb__after_srcu_read_lock - ensure full ordering after srcu_read_lock
+ *
+ * Converts the preceding srcu_read_lock into a two-way memory barrier.
+ *
+ * Call this after srcu_read_lock, to guarantee that all memory operations
+ * that occur after smp_mb__after_srcu_read_lock will appear to happen after
+ * the preceding srcu_read_lock.
+ */
+static inline void smp_mb__after_srcu_read_lock(void)
+{
+ /* __srcu_read_lock has smp_mb() internally so nothing to do here. */
+}
+
DEFINE_LOCK_GUARD_1(srcu, struct srcu_struct,
_T->idx = srcu_read_lock(_T->lock),
srcu_read_unlock(_T->lock, _T->idx),