author     Linus Torvalds <torvalds@linux-foundation.org>  2019-05-03 16:49:46 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2019-05-03 16:49:46 -0700
commit     aa1be08f52585fe36ecfaf5bddfdc784eb4c94cf (patch)
tree       bb8a647ba51f6990c880234c32c7ffe0cc8ec826
parent     82463436a7fa40345c6febf0baa4c954af506ca6 (diff)
parent     e8ab8d24b488632d07ce5ddb261f1d454114415b (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM fixes from Paolo Bonzini:

 - PPC and ARM bugfixes from submaintainers

 - Fix old Windows versions on AMD (recent regression)

 - Fix old Linux versions on processors without EPT

 - Fixes for LAPIC timer optimizations

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (21 commits)
  KVM: nVMX: Fix size checks in vmx_set_nested_state
  KVM: selftests: make hyperv_cpuid test pass on AMD
  KVM: lapic: Check for in-kernel LAPIC before deferencing apic pointer
  KVM: fix KVM_CLEAR_DIRTY_LOG for memory slots of unaligned size
  x86/kvm/mmu: reset MMU context when 32-bit guest switches PAE
  KVM: x86: Whitelist port 0x7e for pre-incrementing %rip
  Documentation: kvm: fix dirty log ioctl arch lists
  KVM: VMX: Move RSB stuffing to before the first RET after VM-Exit
  KVM: arm/arm64: Don't emulate virtual timers on userspace ioctls
  kvm: arm: Skip stage2 huge mappings for unaligned ipa backed by THP
  KVM: arm/arm64: Ensure vcpu target is unset on reset failure
  KVM: lapic: Convert guest TSC to host time domain if necessary
  KVM: lapic: Allow user to disable adaptive tuning of timer advancement
  KVM: lapic: Track lapic timer advance per vCPU
  KVM: lapic: Disable timer advancement if adaptive tuning goes haywire
  x86: kvm: hyper-v: deal with buggy TLB flush requests from WS2012
  KVM: x86: Consider LAPIC TSC-Deadline timer expired if deadline too short
  KVM: PPC: Book3S: Protect memslots while validating user address
  KVM: PPC: Book3S HV: Perserve PSSCR FAKE_SUSPEND bit on guest exit
  KVM: arm/arm64: vgic-v3: Retire pending interrupts on disabling LPIs
  ...
-rw-r--r--  Documentation/virtual/kvm/api.txt                  | 11
-rw-r--r--  arch/powerpc/kvm/book3s_64_vio.c                   |  6
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c                       |  4
-rw-r--r--  arch/x86/include/asm/kvm_host.h                    |  1
-rw-r--r--  arch/x86/include/uapi/asm/kvm.h                    |  1
-rw-r--r--  arch/x86/kvm/hyperv.c                              | 11
-rw-r--r--  arch/x86/kvm/lapic.c                               | 73
-rw-r--r--  arch/x86/kvm/lapic.h                               |  4
-rw-r--r--  arch/x86/kvm/mmu.c                                 |  1
-rw-r--r--  arch/x86/kvm/vmx/nested.c                          |  4
-rw-r--r--  arch/x86/kvm/vmx/vmenter.S                         | 12
-rw-r--r--  arch/x86/kvm/vmx/vmx.c                             |  7
-rw-r--r--  arch/x86/kvm/x86.c                                 | 36
-rw-r--r--  arch/x86/kvm/x86.h                                 |  2
-rw-r--r--  tools/testing/selftests/kvm/dirty_log_test.c       |  9
-rw-r--r--  tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c  |  9
-rw-r--r--  virt/kvm/arm/arch_timer.c                          | 17
-rw-r--r--  virt/kvm/arm/arm.c                                 | 11
-rw-r--r--  virt/kvm/arm/mmu.c                                 |  6
-rw-r--r--  virt/kvm/arm/vgic/vgic-mmio-v3.c                   |  3
-rw-r--r--  virt/kvm/arm/vgic/vgic.c                           | 21
-rw-r--r--  virt/kvm/arm/vgic/vgic.h                           |  1
-rw-r--r--  virt/kvm/kvm_main.c                                |  7
23 files changed, 192 insertions(+), 65 deletions(-)
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 67068c47c591..64b38dfcc243 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -321,7 +321,7 @@ cpu's hardware control block.
4.8 KVM_GET_DIRTY_LOG (vm ioctl)
Capability: basic
-Architectures: x86
+Architectures: all
Type: vm ioctl
Parameters: struct kvm_dirty_log (in/out)
Returns: 0 on success, -1 on error
@@ -3810,7 +3810,7 @@ to I/O ports.
4.117 KVM_CLEAR_DIRTY_LOG (vm ioctl)
Capability: KVM_CAP_MANUAL_DIRTY_LOG_PROTECT
-Architectures: x86
+Architectures: x86, arm, arm64, mips
Type: vm ioctl
Parameters: struct kvm_dirty_log (in)
Returns: 0 on success, -1 on error
@@ -3830,8 +3830,9 @@ The ioctl clears the dirty status of pages in a memory slot, according to
the bitmap that is passed in struct kvm_clear_dirty_log's dirty_bitmap
field. Bit 0 of the bitmap corresponds to page "first_page" in the
memory slot, and num_pages is the size in bits of the input bitmap.
-Both first_page and num_pages must be a multiple of 64. For each bit
-that is set in the input bitmap, the corresponding page is marked "clean"
+first_page must be a multiple of 64; num_pages must also be a multiple of
+64 unless first_page + num_pages is the size of the memory slot. For each
+bit that is set in the input bitmap, the corresponding page is marked "clean"
in KVM's dirty bitmap, and dirty tracking is re-enabled for that page
(for example via write-protection, or by clearing the dirty bit in
a page table entry).
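
To illustrate the relaxed alignment rule, here is a hedged userspace sketch that clears the dirty status of the unaligned tail of a memory slot. The slot number, page counts, and vm_fd are illustrative; struct kvm_clear_dirty_log and KVM_CLEAR_DIRTY_LOG come from <linux/kvm.h>.

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Sketch: a slot of 131,075 pages; its last 67 pages start at page 131,008. */
static int clear_slot_tail(int vm_fd, void *dirty_bitmap)
{
	struct kvm_clear_dirty_log log = {
		.slot         = 0,
		.first_page   = 131008,	/* multiple of 64 */
		.num_pages    = 67,	/* not a multiple of 64, but reaches the slot end */
		.dirty_bitmap = dirty_bitmap,	/* at least 67 bits, i.e. two __u64 words */
	};

	return ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &log);
}

This assumes KVM_CAP_MANUAL_DIRTY_LOG_PROTECT has already been enabled on the VM, as described in section 7.18 below.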
@@ -4799,7 +4800,7 @@ and injected exceptions.
7.18 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT
-Architectures: all
+Architectures: x86, arm, arm64, mips
Parameters: args[0] whether feature should be enabled or not
With this capability enabled, KVM_GET_DIRTY_LOG will not automatically
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index f02b04973710..f100e331e69b 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -543,14 +543,14 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
if (ret != H_SUCCESS)
return ret;
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+
ret = kvmppc_tce_validate(stt, tce);
if (ret != H_SUCCESS)
- return ret;
+ goto unlock_exit;
dir = iommu_tce_direction(tce);
- idx = srcu_read_lock(&vcpu->kvm->srcu);
-
if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) {
ret = H_PARAMETER;
goto unlock_exit;
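
The reordering above takes the SRCU read lock before kvmppc_tce_validate(), so that any memslot access during validation is covered. A minimal kernel-style sketch of the pattern being enforced; h_put_tce_pattern() and validate_and_translate() are purely illustrative names, not functions in this file.

long h_put_tce_pattern(struct kvm_vcpu *vcpu, unsigned long tce)
{
	long ret;
	int idx;

	idx = srcu_read_lock(&vcpu->kvm->srcu);	/* memslots may be read from here on */

	ret = validate_and_translate(vcpu, tce);	/* illustrative helper */
	if (ret != H_SUCCESS)
		goto unlock_exit;

	/* ... perform the mapping ... */

unlock_exit:
	srcu_read_unlock(&vcpu->kvm->srcu, idx);
	return ret;
}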
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 06964350b97a..b2b29d4f9842 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3423,7 +3423,9 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2);
vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3);
- mtspr(SPRN_PSSCR, host_psscr);
+ /* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */
+ mtspr(SPRN_PSSCR, host_psscr |
+ (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
mtspr(SPRN_HFSCR, host_hfscr);
mtspr(SPRN_CIABR, host_ciabr);
mtspr(SPRN_DAWR, host_dawr);
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a9d03af34030..c79abe7ca093 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -295,6 +295,7 @@ union kvm_mmu_extended_role {
unsigned int valid:1;
unsigned int execonly:1;
unsigned int cr0_pg:1;
+ unsigned int cr4_pae:1;
unsigned int cr4_pse:1;
unsigned int cr4_pke:1;
unsigned int cr4_smap:1;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index dabfcf7c3941..7a0e64ccd6ff 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -381,6 +381,7 @@ struct kvm_sync_regs {
#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0)
#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1)
#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2)
+#define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3)
#define KVM_STATE_NESTED_GUEST_MODE 0x00000001
#define KVM_STATE_NESTED_RUN_PENDING 0x00000002
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 421899f6ad7b..cc24b3a32c44 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1371,7 +1371,16 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
valid_bank_mask = BIT_ULL(0);
sparse_banks[0] = flush.processor_mask;
- all_cpus = flush.flags & HV_FLUSH_ALL_PROCESSORS;
+
+ /*
+ * Work around possible WS2012 bug: it sends hypercalls
+ * with processor_mask = 0x0 and HV_FLUSH_ALL_PROCESSORS clear,
+ * while also expecting us to flush something and crashing if
+ * we don't. Let's treat processor_mask == 0 same as
+ * HV_FLUSH_ALL_PROCESSORS.
+ */
+ all_cpus = (flush.flags & HV_FLUSH_ALL_PROCESSORS) ||
+ flush.processor_mask == 0;
} else {
if (unlikely(kvm_read_guest(kvm, ingpa, &flush_ex,
sizeof(flush_ex))))
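
Restating the workaround's decision as a standalone predicate with two worked inputs; the helper name is illustrative and the flag value is as defined in the Hyper-V TLFS headers.

#include <stdbool.h>
#include <stdint.h>

#define HV_FLUSH_ALL_PROCESSORS	0x1	/* bit 0, per the Hyper-V TLFS headers */

static bool flush_all_cpus(uint64_t flags, uint64_t processor_mask)
{
	/*
	 * WS2012 case:  flags = 0, processor_mask = 0   -> true  (flush everything)
	 * Normal case:  flags = 0, processor_mask = 0x3 -> false (flush listed banks)
	 */
	return (flags & HV_FLUSH_ALL_PROCESSORS) || processor_mask == 0;
}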
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9bf70cf84564..bd13fdddbdc4 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -70,7 +70,6 @@
#define APIC_BROADCAST 0xFF
#define X2APIC_BROADCAST 0xFFFFFFFFul
-static bool lapic_timer_advance_adjust_done = false;
#define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100
/* step-by-step approximation to mitigate fluctuation */
#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
@@ -1482,14 +1481,32 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
return false;
}
+static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles)
+{
+ u64 timer_advance_ns = vcpu->arch.apic->lapic_timer.timer_advance_ns;
+
+ /*
+ * If the guest TSC is running at a different ratio than the host, then
+ * convert the delay to nanoseconds to achieve an accurate delay. Note
+ * that __delay() uses delay_tsc whenever the hardware has TSC, thus
+ * always for VMX enabled hardware.
+ */
+ if (vcpu->arch.tsc_scaling_ratio == kvm_default_tsc_scaling_ratio) {
+ __delay(min(guest_cycles,
+ nsec_to_cycles(vcpu, timer_advance_ns)));
+ } else {
+ u64 delay_ns = guest_cycles * 1000000ULL;
+ do_div(delay_ns, vcpu->arch.virtual_tsc_khz);
+ ndelay(min_t(u32, delay_ns, timer_advance_ns));
+ }
+}
+
void wait_lapic_expire(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
u64 guest_tsc, tsc_deadline, ns;
- if (!lapic_in_kernel(vcpu))
- return;
-
if (apic->lapic_timer.expired_tscdeadline == 0)
return;
@@ -1501,33 +1518,37 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
- /* __delay is delay_tsc whenever the hardware has TSC, thus always. */
if (guest_tsc < tsc_deadline)
- __delay(min(tsc_deadline - guest_tsc,
- nsec_to_cycles(vcpu, lapic_timer_advance_ns)));
+ __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
- if (!lapic_timer_advance_adjust_done) {
+ if (!apic->lapic_timer.timer_advance_adjust_done) {
/* too early */
if (guest_tsc < tsc_deadline) {
ns = (tsc_deadline - guest_tsc) * 1000000ULL;
do_div(ns, vcpu->arch.virtual_tsc_khz);
- lapic_timer_advance_ns -= min((unsigned int)ns,
- lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+ timer_advance_ns -= min((u32)ns,
+ timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
} else {
/* too late */
ns = (guest_tsc - tsc_deadline) * 1000000ULL;
do_div(ns, vcpu->arch.virtual_tsc_khz);
- lapic_timer_advance_ns += min((unsigned int)ns,
- lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+ timer_advance_ns += min((u32)ns,
+ timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
}
if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
- lapic_timer_advance_adjust_done = true;
+ apic->lapic_timer.timer_advance_adjust_done = true;
+ if (unlikely(timer_advance_ns > 5000)) {
+ timer_advance_ns = 0;
+ apic->lapic_timer.timer_advance_adjust_done = true;
+ }
+ apic->lapic_timer.timer_advance_ns = timer_advance_ns;
}
}
static void start_sw_tscdeadline(struct kvm_lapic *apic)
{
- u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+ u64 guest_tsc, tscdeadline = ktimer->tscdeadline;
u64 ns = 0;
ktime_t expire;
struct kvm_vcpu *vcpu = apic->vcpu;
@@ -1542,13 +1563,15 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
now = ktime_get();
guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
- if (likely(tscdeadline > guest_tsc)) {
- ns = (tscdeadline - guest_tsc) * 1000000ULL;
- do_div(ns, this_tsc_khz);
+
+ ns = (tscdeadline - guest_tsc) * 1000000ULL;
+ do_div(ns, this_tsc_khz);
+
+ if (likely(tscdeadline > guest_tsc) &&
+ likely(ns > apic->lapic_timer.timer_advance_ns)) {
expire = ktime_add_ns(now, ns);
- expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
- hrtimer_start(&apic->lapic_timer.timer,
- expire, HRTIMER_MODE_ABS_PINNED);
+ expire = ktime_sub_ns(expire, ktimer->timer_advance_ns);
+ hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_PINNED);
} else
apic_timer_expired(apic);
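
To make the arithmetic in start_sw_tscdeadline() concrete, a hedged, self-contained sketch of converting a guest TSC delta into an hrtimer expiry with the per-vCPU advance applied; the function name and the plain division are illustrative (the kernel code uses do_div()).

static u64 sw_deadline_ns(u64 tscdeadline, u64 guest_tsc,
			  u32 virtual_tsc_khz, u32 timer_advance_ns)
{
	u64 ns;

	if (tscdeadline <= guest_tsc)
		return 0;			/* deadline already passed */

	/* guest TSC cycles -> nanoseconds at the guest's TSC frequency */
	ns = (tscdeadline - guest_tsc) * 1000000ULL / virtual_tsc_khz;

	if (ns <= timer_advance_ns)
		return 0;			/* would fire inside the advance window */

	return ns - timer_advance_ns;		/* arm the hrtimer this far ahead */
}

For example, with virtual_tsc_khz = 2000000 (a 2 GHz guest TSC), a delta of 4,000,000 cycles is 2,000,000 ns; with a 1000 ns advance the hrtimer is armed 1,999,000 ns out, and anything at or below the advance expires immediately.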
@@ -2255,7 +2278,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
return HRTIMER_NORESTART;
}
-int kvm_create_lapic(struct kvm_vcpu *vcpu)
+int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
{
struct kvm_lapic *apic;
@@ -2279,6 +2302,14 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
HRTIMER_MODE_ABS_PINNED);
apic->lapic_timer.timer.function = apic_timer_fn;
+ if (timer_advance_ns == -1) {
+ apic->lapic_timer.timer_advance_ns = 1000;
+ apic->lapic_timer.timer_advance_adjust_done = false;
+ } else {
+ apic->lapic_timer.timer_advance_ns = timer_advance_ns;
+ apic->lapic_timer.timer_advance_adjust_done = true;
+ }
+
/*
* APIC is created enabled. This will prevent kvm_lapic_set_base from
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index ff6ef9c3d760..d6d049ba3045 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -31,8 +31,10 @@ struct kvm_timer {
u32 timer_mode_mask;
u64 tscdeadline;
u64 expired_tscdeadline;
+ u32 timer_advance_ns;
atomic_t pending; /* accumulated triggered timers */
bool hv_timer_in_use;
+ bool timer_advance_adjust_done;
};
struct kvm_lapic {
@@ -62,7 +64,7 @@ struct kvm_lapic {
struct dest_map;
-int kvm_create_lapic(struct kvm_vcpu *vcpu);
+int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns);
void kvm_free_lapic(struct kvm_vcpu *vcpu);
int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e10962dfc203..d9c7b45d231f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4781,6 +4781,7 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
union kvm_mmu_extended_role ext = {0};
ext.cr0_pg = !!is_paging(vcpu);
+ ext.cr4_pae = !!is_pae(vcpu);
ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
ext.cr4_pse = !!is_pse(vcpu);
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 6401eb7ef19c..0c601d079cd2 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -5423,7 +5423,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
return ret;
/* Empty 'VMXON' state is permitted */
- if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12))
+ if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12))
return 0;
if (kvm_state->vmx.vmcs_pa != -1ull) {
@@ -5467,7 +5467,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
vmcs12->vmcs_link_pointer != -1ull) {
struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
- if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12))
+ if (kvm_state->size < sizeof(*kvm_state) + 2 * sizeof(*vmcs12))
return -EINVAL;
if (copy_from_user(shadow_vmcs12,
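
The fix above replaces sizeof(kvm_state), the width of the pointer, with sizeof(*kvm_state), the size of the pointed-to structure, so the old check accepted undersized buffers. A self-contained illustration of the difference; the structure name is a stand-in, not the real uapi type.

#include <stdio.h>

struct nested_state_like { char data[128]; };	/* illustrative stand-in */

int main(void)
{
	struct nested_state_like *state = NULL;

	printf("%zu\n", sizeof(state));		/* 8 on x86-64: size of the pointer */
	printf("%zu\n", sizeof(*state));	/* 128: size of the pointed-to struct */
	return 0;
}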
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 7b272738c576..d4cb1945b2e3 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -3,6 +3,7 @@
#include <asm/asm.h>
#include <asm/bitsperlong.h>
#include <asm/kvm_vcpu_regs.h>
+#include <asm/nospec-branch.h>
#define WORD_SIZE (BITS_PER_LONG / 8)
@@ -77,6 +78,17 @@ ENDPROC(vmx_vmenter)
* referred to by VMCS.HOST_RIP.
*/
ENTRY(vmx_vmexit)
+#ifdef CONFIG_RETPOLINE
+ ALTERNATIVE "jmp .Lvmexit_skip_rsb", "", X86_FEATURE_RETPOLINE
+ /* Preserve guest's RAX, it's used to stuff the RSB. */
+ push %_ASM_AX
+
+ /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
+ FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
+
+ pop %_ASM_AX
+.Lvmexit_skip_rsb:
+#endif
ret
ENDPROC(vmx_vmexit)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b4e7d645275a..0c955bb286ff 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6462,9 +6462,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
- /* Eliminate branch target predictions from guest mode */
- vmexit_fill_RSB();
-
/* All fields are clean at this point */
if (static_branch_unlikely(&enable_evmcs))
current_evmcs->hv_clean_fields |=
@@ -7032,6 +7029,7 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
{
struct vcpu_vmx *vmx;
u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
+ struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
if (kvm_mwait_in_guest(vcpu->kvm))
return -EOPNOTSUPP;
@@ -7040,7 +7038,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
tscl = rdtsc();
guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
- lapic_timer_advance_cycles = nsec_to_cycles(vcpu, lapic_timer_advance_ns);
+ lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
+ ktimer->timer_advance_ns);
if (delta_tsc > lapic_timer_advance_cycles)
delta_tsc -= lapic_timer_advance_cycles;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a0d1fc80ac5a..b5edc8e3ce1d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -136,10 +136,14 @@ EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
static u32 __read_mostly tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
-/* lapic timer advance (tscdeadline mode only) in nanoseconds */
-unsigned int __read_mostly lapic_timer_advance_ns = 1000;
+/*
+ * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables
+ * adaptive tuning starting from default advancement of 1000ns. '0' disables
+ * advancement entirely. Any other value is used as-is and disables adaptive
+ * tuning, i.e. allows privileged userspace to set an exact advancement time.
+ */
+static int __read_mostly lapic_timer_advance_ns = -1;
module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
-EXPORT_SYMBOL_GPL(lapic_timer_advance_ns);
static bool __read_mostly vector_hashing = true;
module_param(vector_hashing, bool, S_IRUGO);
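
A hedged userspace illustration of the new parameter semantics: writable kvm module parameters appear under /sys/module/kvm/parameters/, and after this series the value is consumed when a vCPU's LAPIC is created (see kvm_create_lapic() above), so runtime writes only affect vCPUs created afterwards. The helper name is illustrative.

#include <stdio.h>

static int set_lapic_timer_advance_ns(long ns)
{
	FILE *f = fopen("/sys/module/kvm/parameters/lapic_timer_advance_ns", "w");

	if (!f)
		return -1;
	fprintf(f, "%ld\n", ns);	/* -1: adaptive, 0: disabled, >0: fixed advance */
	return fclose(f);
}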
@@ -6535,6 +6539,12 @@ int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
}
EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer);
+static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.pio.count = 0;
+ return 1;
+}
+
static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
{
vcpu->arch.pio.count = 0;
@@ -6551,12 +6561,23 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
size, port, &val, 1);
+ if (ret)
+ return ret;
- if (!ret) {
+ /*
+ * Workaround userspace that relies on old KVM behavior of %rip being
+ * incremented prior to exiting to userspace to handle "OUT 0x7e".
+ */
+ if (port == 0x7e &&
+ kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) {
+ vcpu->arch.complete_userspace_io =
+ complete_fast_pio_out_port_0x7e;
+ kvm_skip_emulated_instruction(vcpu);
+ } else {
vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
vcpu->arch.complete_userspace_io = complete_fast_pio_out;
}
- return ret;
+ return 0;
}
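
Userspace that does not rely on the pre-incremented %rip can opt out of the quirk. A hedged sketch using KVM_CAP_DISABLE_QUIRKS; vm_fd is an already-created VM descriptor, and the quirk constant is only available from headers that include this series.

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int disable_out_7e_quirk(int vm_fd)
{
	struct kvm_enable_cap cap = {
		.cap  = KVM_CAP_DISABLE_QUIRKS,
		.args = { KVM_X86_QUIRK_OUT_7E_INC_RIP },
	};

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);	/* vm-level ENABLE_CAP */
}

With the quirk disabled, "OUT 0x7e" exits to userspace with %rip still pointing at the OUT instruction and KVM completes the skip when the I/O is resolved.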
static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
@@ -7873,7 +7894,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
trace_kvm_entry(vcpu->vcpu_id);
- if (lapic_timer_advance_ns)
+ if (lapic_in_kernel(vcpu) &&
+ vcpu->arch.apic->lapic_timer.timer_advance_ns)
wait_lapic_expire(vcpu);
guest_enter_irqoff();
@@ -9061,7 +9083,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
if (irqchip_in_kernel(vcpu->kvm)) {
vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
- r = kvm_create_lapic(vcpu);
+ r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
if (r < 0)
goto fail_mmu_destroy;
} else
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index aedc5d0d4989..534d3f28bb01 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -294,8 +294,6 @@ extern u64 kvm_supported_xcr0(void);
extern unsigned int min_timer_period_us;
-extern unsigned int lapic_timer_advance_ns;
-
extern bool enable_vmware_backdoor;
extern struct static_key kvm_no_apic_vcpu;
diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c
index 4715cfba20dc..93f99c6b7d79 100644
--- a/tools/testing/selftests/kvm/dirty_log_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_test.c
@@ -288,8 +288,11 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
#endif
max_gfn = (1ul << (guest_pa_bits - guest_page_shift)) - 1;
guest_page_size = (1ul << guest_page_shift);
- /* 1G of guest page sized pages */
- guest_num_pages = (1ul << (30 - guest_page_shift));
+ /*
+ * A little more than 1G of guest page sized pages. Cover the
+ * case where the size is not aligned to 64 pages.
+ */
+ guest_num_pages = (1ul << (30 - guest_page_shift)) + 3;
host_page_size = getpagesize();
host_num_pages = (guest_num_pages * guest_page_size) / host_page_size +
!!((guest_num_pages * guest_page_size) % host_page_size);
@@ -359,7 +362,7 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap);
#ifdef USE_CLEAR_DIRTY_LOG
kvm_vm_clear_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap, 0,
- DIV_ROUND_UP(host_num_pages, 64) * 64);
+ host_num_pages);
#endif
vm_dirty_log_verify(bmap);
iteration++;
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
index 264425f75806..9a21e912097c 100644
--- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
@@ -141,7 +141,13 @@ int main(int argc, char *argv[])
free(hv_cpuid_entries);
- vcpu_ioctl(vm, VCPU_ID, KVM_ENABLE_CAP, &enable_evmcs_cap);
+ rv = _vcpu_ioctl(vm, VCPU_ID, KVM_ENABLE_CAP, &enable_evmcs_cap);
+
+ if (rv) {
+ fprintf(stderr,
+ "Enlightened VMCS is unsupported, skip related test\n");
+ goto vm_free;
+ }
hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm);
if (!hv_cpuid_entries)
@@ -151,6 +157,7 @@ int main(int argc, char *argv[])
free(hv_cpuid_entries);
+vm_free:
kvm_vm_free(vm);
return 0;
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 3417f2dbc366..7fc272ecae16 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -508,6 +508,14 @@ static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu)
struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
/*
+ * Update the timer output so that it is likely to match the
+ * state we're about to restore. If the timer expires between
+ * this point and the register restoration, we'll take the
+ * interrupt anyway.
+ */
+ kvm_timer_update_irq(vcpu, kvm_timer_should_fire(vtimer), vtimer);
+
+ /*
* When using a userspace irqchip with the architected timers and a
* host interrupt controller that doesn't support an active state, we
* must still prevent continuously exiting from the guest, and
@@ -730,7 +738,6 @@ static void kvm_timer_init_interrupt(void *info)
int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
{
struct arch_timer_context *timer;
- bool level;
switch (regid) {
case KVM_REG_ARM_TIMER_CTL:
@@ -758,10 +765,6 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
return -1;
}
- level = kvm_timer_should_fire(timer);
- kvm_timer_update_irq(vcpu, level, timer);
- timer_emulate(timer);
-
return 0;
}
@@ -812,7 +815,7 @@ static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
switch (treg) {
case TIMER_REG_TVAL:
- val = kvm_phys_timer_read() - timer->cntvoff - timer->cnt_cval;
+ val = timer->cnt_cval - kvm_phys_timer_read() + timer->cntvoff;
break;
case TIMER_REG_CTL:
@@ -858,7 +861,7 @@ static void kvm_arm_timer_write(struct kvm_vcpu *vcpu,
{
switch (treg) {
case TIMER_REG_TVAL:
- timer->cnt_cval = val - kvm_phys_timer_read() - timer->cntvoff;
+ timer->cnt_cval = kvm_phys_timer_read() - timer->cntvoff + val;
break;
case TIMER_REG_CTL:
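
The two TVAL fixes above restore the architectural relation TVAL = CVAL - CNT, where CNT is the guest-visible counter (physical count minus CNTVOFF). A minimal sketch of the corrected round trip; the helper names are illustrative.

#include <stdint.h>

/* cnt is the guest-visible counter: physical count minus CNTVOFF. */
static uint64_t tval_read(uint64_t cnt_cval, uint64_t cnt)
{
	return cnt_cval - cnt;		/* TVAL = CVAL - CNT */
}

static uint64_t cval_write(uint64_t tval, uint64_t cnt)
{
	return cnt + tval;		/* CVAL = CNT + TVAL */
}

Writing a TVAL and reading it back at the same counter value is the identity, tval_read(cval_write(t, cnt), cnt) == t, which the previous, inverted arithmetic did not satisfy.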
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 99c37384ba7b..f412ebc90610 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -934,7 +934,7 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
const struct kvm_vcpu_init *init)
{
- unsigned int i;
+ unsigned int i, ret;
int phys_target = kvm_target_cpu();
if (init->target != phys_target)
@@ -969,9 +969,14 @@ static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
vcpu->arch.target = phys_target;
/* Now we know what it is, we can reset it. */
- return kvm_reset_vcpu(vcpu);
-}
+ ret = kvm_reset_vcpu(vcpu);
+ if (ret) {
+ vcpu->arch.target = -1;
+ bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
+ }
+ return ret;
+}
static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
struct kvm_vcpu_init *init)
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 27c958306449..a39dcfdbcc65 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1781,8 +1781,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* Only PMD_SIZE transparent hugepages(THP) are
* currently supported. This code will need to be
* updated to support other THP sizes.
+ *
+ * Make sure the host VA and the guest IPA are sufficiently
+ * aligned and that the block is contained within the memslot.
*/
- if (transparent_hugepage_adjust(&pfn, &fault_ipa))
+ if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
+ transparent_hugepage_adjust(&pfn, &fault_ipa))
vma_pagesize = PMD_SIZE;
}
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c
index 4a12322bf7df..9f4843fe9cda 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v3.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c
@@ -200,6 +200,9 @@ static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu,
vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS;
+ if (was_enabled && !vgic_cpu->lpis_enabled)
+ vgic_flush_pending_lpis(vcpu);
+
if (!was_enabled && vgic_cpu->lpis_enabled)
vgic_enable_lpis(vcpu);
}
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index 3af69f2a3866..191deccf60bf 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -151,6 +151,27 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
kfree(irq);
}
+void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_irq *irq, *tmp;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
+
+ list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) {
+ if (irq->intid >= VGIC_MIN_LPI) {
+ raw_spin_lock(&irq->irq_lock);
+ list_del(&irq->ap_list);
+ irq->vcpu = NULL;
+ raw_spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+ }
+
+ raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
+}
+
void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending)
{
WARN_ON(irq_set_irqchip_state(irq->host_irq,
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index a90024718ca4..abeeffabc456 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -238,6 +238,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu);
bool vgic_has_its(struct kvm *kvm);
int kvm_vgic_register_its_device(void);
void vgic_enable_lpis(struct kvm_vcpu *vcpu);
+void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu);
int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi);
int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index dc8edc97ba85..a704d1f9bd96 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1240,7 +1240,7 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm,
if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
return -EINVAL;
- if ((log->first_page & 63) || (log->num_pages & 63))
+ if (log->first_page & 63)
return -EINVAL;
slots = __kvm_memslots(kvm, as_id);
@@ -1253,8 +1253,9 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm,
n = kvm_dirty_bitmap_bytes(memslot);
if (log->first_page > memslot->npages ||
- log->num_pages > memslot->npages - log->first_page)
- return -EINVAL;
+ log->num_pages > memslot->npages - log->first_page ||
+ (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
+ return -EINVAL;
*flush = false;
dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);