summaryrefslogtreecommitdiff
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/hyperv.c9
-rw-r--r--arch/x86/kvm/mmu.c54
-rw-r--r--arch/x86/kvm/mmutrace.h4
-rw-r--r--arch/x86/kvm/svm.c32
-rw-r--r--arch/x86/kvm/vmx/nested.c5
-rw-r--r--arch/x86/kvm/vmx/vmx.c19
-rw-r--r--arch/x86/kvm/vmx/vmx.h1
-rw-r--r--arch/x86/kvm/x86.c59
8 files changed, 131 insertions, 52 deletions
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 27c43525a05f..421899f6ad7b 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -526,7 +526,9 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
new_config.enable = 0;
stimer->config.as_uint64 = new_config.as_uint64;
- stimer_mark_pending(stimer, false);
+ if (stimer->config.enable)
+ stimer_mark_pending(stimer, false);
+
return 0;
}
@@ -542,7 +544,10 @@ static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
stimer->config.enable = 0;
else if (stimer->config.auto_enable)
stimer->config.enable = 1;
- stimer_mark_pending(stimer, false);
+
+ if (stimer->config.enable)
+ stimer_mark_pending(stimer, false);
+
return 0;
}
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7837ab001d80..eee455a8a612 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -182,7 +182,7 @@ struct kvm_shadow_walk_iterator {
static const union kvm_mmu_page_role mmu_base_role_mask = {
.cr0_wp = 1,
- .cr4_pae = 1,
+ .gpte_is_8_bytes = 1,
.nxe = 1,
.smep_andnot_wp = 1,
.smap_andnot_wp = 1,
@@ -2205,6 +2205,7 @@ static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list);
+
#define for_each_valid_sp(_kvm, _sp, _gfn) \
hlist_for_each_entry(_sp, \
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
@@ -2215,12 +2216,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
for_each_valid_sp(_kvm, _sp, _gfn) \
if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
+static inline bool is_ept_sp(struct kvm_mmu_page *sp)
+{
+ return sp->role.cr0_wp && sp->role.smap_andnot_wp;
+}
+
/* @sp->gfn should be write-protected at the call site */
static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
- if (sp->role.cr4_pae != !!is_pae(vcpu)
- || vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
+ if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) ||
+ vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
return false;
}
@@ -2423,7 +2429,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
role.level = level;
role.direct = direct;
if (role.direct)
- role.cr4_pae = 0;
+ role.gpte_is_8_bytes = true;
role.access = access;
if (!vcpu->arch.mmu->direct_map
&& vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
@@ -4794,7 +4800,6 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
role.base.access = ACC_ALL;
role.base.nxe = !!is_nx(vcpu);
- role.base.cr4_pae = !!is_pae(vcpu);
role.base.cr0_wp = is_write_protection(vcpu);
role.base.smm = is_smm(vcpu);
role.base.guest_mode = is_guest_mode(vcpu);
@@ -4815,6 +4820,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
role.base.ad_disabled = (shadow_accessed_mask == 0);
role.base.level = kvm_x86_ops->get_tdp_level(vcpu);
role.base.direct = true;
+ role.base.gpte_is_8_bytes = true;
return role;
}
@@ -4879,6 +4885,7 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
role.base.smap_andnot_wp = role.ext.cr4_smap &&
!is_write_protection(vcpu);
role.base.direct = !is_paging(vcpu);
+ role.base.gpte_is_8_bytes = !!is_pae(vcpu);
if (!is_long_mode(vcpu))
role.base.level = PT32E_ROOT_LEVEL;
@@ -4918,18 +4925,26 @@ static union kvm_mmu_role
kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
bool execonly)
{
- union kvm_mmu_role role;
+ union kvm_mmu_role role = {0};
- /* Base role is inherited from root_mmu */
- role.base.word = vcpu->arch.root_mmu.mmu_role.base.word;
- role.ext = kvm_calc_mmu_role_ext(vcpu);
+ /* SMM flag is inherited from root_mmu */
+ role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;
role.base.level = PT64_ROOT_4LEVEL;
+ role.base.gpte_is_8_bytes = true;
role.base.direct = false;
role.base.ad_disabled = !accessed_dirty;
role.base.guest_mode = true;
role.base.access = ACC_ALL;
+ /*
+ * WP=1 and NOT_WP=1 is an impossible combination, use WP and the
+ * SMAP variation to denote shadow EPT entries.
+ */
+ role.base.cr0_wp = true;
+ role.base.smap_andnot_wp = true;
+
+ role.ext = kvm_calc_mmu_role_ext(vcpu);
role.ext.execonly = execonly;
return role;
@@ -5179,7 +5194,7 @@ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
gpa, bytes, sp->role.word);
offset = offset_in_page(gpa);
- pte_size = sp->role.cr4_pae ? 8 : 4;
+ pte_size = sp->role.gpte_is_8_bytes ? 8 : 4;
/*
* Sometimes, the OS only writes the last one bytes to update status
@@ -5203,7 +5218,7 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
page_offset = offset_in_page(gpa);
level = sp->role.level;
*nspte = 1;
- if (!sp->role.cr4_pae) {
+ if (!sp->role.gpte_is_8_bytes) {
page_offset <<= 1; /* 32->64 */
/*
* A 32-bit pde maps 4MB while the shadow pdes map
@@ -5393,10 +5408,12 @@ emulate:
* This can happen if a guest gets a page-fault on data access but the HW
* table walker is not able to read the instruction page (e.g instruction
* page is not present in memory). In those cases we simply restart the
- * guest.
+ * guest, with the exception of AMD Erratum 1096 which is unrecoverable.
*/
- if (unlikely(insn && !insn_len))
- return 1;
+ if (unlikely(insn && !insn_len)) {
+ if (!kvm_x86_ops->need_emulation_on_page_fault(vcpu))
+ return 1;
+ }
er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
@@ -5509,7 +5526,9 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
if (flush && lock_flush_tlb) {
- kvm_flush_remote_tlbs(kvm);
+ kvm_flush_remote_tlbs_with_address(kvm,
+ start_gfn,
+ iterator.gfn - start_gfn + 1);
flush = false;
}
cond_resched_lock(&kvm->mmu_lock);
@@ -5517,7 +5536,8 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
}
if (flush && lock_flush_tlb) {
- kvm_flush_remote_tlbs(kvm);
+ kvm_flush_remote_tlbs_with_address(kvm, start_gfn,
+ end_gfn - start_gfn + 1);
flush = false;
}
@@ -6011,7 +6031,7 @@ out:
/*
* Calculate mmu pages needed for kvm.
*/
-unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
+unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
{
unsigned int nr_mmu_pages;
unsigned int nr_pages = 0;
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 9f6c855a0043..dd30dccd2ad5 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -29,10 +29,10 @@
\
role.word = __entry->role; \
\
- trace_seq_printf(p, "sp gfn %llx l%u%s q%u%s %s%s" \
+ trace_seq_printf(p, "sp gfn %llx l%u %u-byte q%u%s %s%s" \
" %snxe %sad root %u %s%c", \
__entry->gfn, role.level, \
- role.cr4_pae ? " pae" : "", \
+ role.gpte_is_8_bytes ? 8 : 4, \
role.quadrant, \
role.direct ? " direct" : "", \
access_str[role.access], \
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b5b128a0a051..426039285fd1 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -7098,6 +7098,36 @@ static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
return -ENODEV;
}
+static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
+{
+ bool is_user, smap;
+
+ is_user = svm_get_cpl(vcpu) == 3;
+ smap = !kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
+
+ /*
+ * Detect and workaround Errata 1096 Fam_17h_00_0Fh
+ *
+ * In non SEV guest, hypervisor will be able to read the guest
+ * memory to decode the instruction pointer when insn_len is zero
+ * so we return true to indicate that decoding is possible.
+ *
+ * But in the SEV guest, the guest memory is encrypted with the
+ * guest specific key and hypervisor will not be able to decode the
+ * instruction pointer so we will not able to workaround it. Lets
+ * print the error and request to kill the guest.
+ */
+ if (is_user && smap) {
+ if (!sev_guest(vcpu->kvm))
+ return true;
+
+ pr_err_ratelimited("KVM: Guest triggered AMD Erratum 1096\n");
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ }
+
+ return false;
+}
+
static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -7231,6 +7261,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.nested_enable_evmcs = nested_enable_evmcs,
.nested_get_evmcs_version = nested_get_evmcs_version,
+
+ .need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index f24a2c225070..153e539c29c9 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2585,6 +2585,11 @@ static int nested_check_host_control_regs(struct kvm_vcpu *vcpu,
!nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
!nested_cr3_valid(vcpu, vmcs12->host_cr3))
return -EINVAL;
+
+ if (is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu) ||
+ is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))
+ return -EINVAL;
+
/*
* If the load IA32_EFER VM-exit control is 1, bits reserved in the
* IA32_EFER MSR must be 0 in the field for that register. In addition,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index c73375e01ab8..ab432a930ae8 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1683,12 +1683,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = to_vmx(vcpu)->spec_ctrl;
break;
- case MSR_IA32_ARCH_CAPABILITIES:
- if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
- return 1;
- msr_info->data = to_vmx(vcpu)->arch_capabilities;
- break;
case MSR_IA32_SYSENTER_CS:
msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
break;
@@ -1895,11 +1889,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
MSR_TYPE_W);
break;
- case MSR_IA32_ARCH_CAPABILITIES:
- if (!msr_info->host_initiated)
- return 1;
- vmx->arch_capabilities = data;
- break;
case MSR_IA32_CR_PAT:
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
@@ -4088,8 +4077,6 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
++vmx->nmsrs;
}
- vmx->arch_capabilities = kvm_get_arch_capabilities();
-
vm_exit_controls_init(vmx, vmx_vmexit_ctrl());
/* 22.2.1, 20.8.1 */
@@ -7409,6 +7396,11 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
return 0;
}
+static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
static __init int hardware_setup(void)
{
unsigned long host_bndcfgs;
@@ -7711,6 +7703,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.set_nested_state = NULL,
.get_vmcs12_pages = NULL,
.nested_enable_evmcs = NULL,
+ .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
};
static void vmx_cleanup_l1d_flush(void)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 1554cb45b393..a1e00d0a2482 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -190,7 +190,6 @@ struct vcpu_vmx {
u64 msr_guest_kernel_gs_base;
#endif
- u64 arch_capabilities;
u64 spec_ctrl;
u32 vm_entry_controls_shadow;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 65e4559eef2f..099b851dabaf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1125,7 +1125,7 @@ static u32 msrs_to_save[] = {
#endif
MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
- MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES,
+ MSR_IA32_SPEC_CTRL,
MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
@@ -1158,6 +1158,7 @@ static u32 emulated_msrs[] = {
MSR_IA32_TSC_ADJUST,
MSR_IA32_TSCDEADLINE,
+ MSR_IA32_ARCH_CAPABILITIES,
MSR_IA32_MISC_ENABLE,
MSR_IA32_MCG_STATUS,
MSR_IA32_MCG_CTL,
@@ -2443,6 +2444,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (msr_info->host_initiated)
vcpu->arch.microcode_version = data;
break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!msr_info->host_initiated)
+ return 1;
+ vcpu->arch.arch_capabilities = data;
+ break;
case MSR_EFER:
return set_efer(vcpu, data);
case MSR_K7_HWCR:
@@ -2747,6 +2753,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_UCODE_REV:
msr_info->data = vcpu->arch.microcode_version;
break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
+ return 1;
+ msr_info->data = vcpu->arch.arch_capabilities;
+ break;
case MSR_IA32_TSC:
msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
break;
@@ -6523,14 +6535,27 @@ int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
}
EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer);
+static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.pio.count = 0;
+
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip)))
+ return 1;
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
unsigned short port)
{
unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
size, port, &val, 1);
- /* do not return to emulator after return from userspace */
- vcpu->arch.pio.count = 0;
+
+ if (!ret) {
+ vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.complete_userspace_io = complete_fast_pio_out;
+ }
return ret;
}
@@ -6541,6 +6566,11 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
/* We should only ever be called with arch.pio.count equal to 1 */
BUG_ON(vcpu->arch.pio.count != 1);
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) {
+ vcpu->arch.pio.count = 0;
+ return 1;
+ }
+
/* For size less than 4 we merge, else we zero extend */
val = (vcpu->arch.pio.size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX)
: 0;
@@ -6553,7 +6583,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
vcpu->arch.pio.port, &val, 1);
kvm_register_write(vcpu, VCPU_REGS_RAX, val);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
@@ -6572,6 +6602,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
return ret;
}
+ vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
vcpu->arch.complete_userspace_io = complete_fast_pio_in;
return 0;
@@ -6579,16 +6610,13 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in)
{
- int ret = kvm_skip_emulated_instruction(vcpu);
+ int ret;
- /*
- * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered
- * KVM_EXIT_DEBUG here.
- */
if (in)
- return kvm_fast_pio_in(vcpu, size, port) && ret;
+ ret = kvm_fast_pio_in(vcpu, size, port);
else
- return kvm_fast_pio_out(vcpu, size, port) && ret;
+ ret = kvm_fast_pio_out(vcpu, size, port);
+ return ret && kvm_skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_fast_pio);
@@ -8733,6 +8761,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
{
+ vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
kvm_vcpu_mtrr_init(vcpu);
vcpu_load(vcpu);
@@ -9429,13 +9458,9 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
- int nr_mmu_pages = 0;
-
if (!kvm->arch.n_requested_mmu_pages)
- nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
-
- if (nr_mmu_pages)
- kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
+ kvm_mmu_change_mmu_pages(kvm,
+ kvm_mmu_calculate_default_mmu_pages(kvm));
/*
* Dirty logging tracks sptes in 4k granularity, meaning that large