summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt24
-rw-r--r--Documentation/virt/coco/sev-guest.rst19
-rw-r--r--Documentation/virt/kvm/api.rst169
-rw-r--r--Documentation/virt/kvm/devices/arm-vgic.rst2
-rw-r--r--Documentation/virt/kvm/halt-polling.rst12
-rw-r--r--Documentation/virt/kvm/x86/amd-memory-encryption.rst110
-rw-r--r--Documentation/virt/kvm/x86/errata.rst18
-rw-r--r--MAINTAINERS2
-rw-r--r--arch/arm64/include/asm/esr.h12
-rw-r--r--arch/arm64/include/asm/kvm_arm.h1
-rw-r--r--arch/arm64/include/asm/kvm_asm.h2
-rw-r--r--arch/arm64/include/asm/kvm_emulate.h95
-rw-r--r--arch/arm64/include/asm/kvm_host.h69
-rw-r--r--arch/arm64/include/asm/kvm_hyp.h4
-rw-r--r--arch/arm64/include/asm/kvm_mmu.h26
-rw-r--r--arch/arm64/include/asm/kvm_nested.h131
-rw-r--r--arch/arm64/include/asm/sysreg.h17
-rw-r--r--arch/arm64/kernel/asm-offsets.c1
-rw-r--r--arch/arm64/kernel/debug-monitors.c4
-rw-r--r--arch/arm64/kernel/traps.c8
-rw-r--r--arch/arm64/kvm/arm.c88
-rw-r--r--arch/arm64/kvm/emulate-nested.c104
-rw-r--r--arch/arm64/kvm/fpsimd.c19
-rw-r--r--arch/arm64/kvm/handle_exit.c43
-rw-r--r--arch/arm64/kvm/hyp/entry.S8
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/switch.h29
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h35
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/ffa.h2
-rw-r--r--arch/arm64/kvm/hyp/nvhe/Makefile6
-rw-r--r--arch/arm64/kvm/hyp/nvhe/ffa.c180
-rw-r--r--arch/arm64/kvm/hyp/nvhe/gen-hyprel.c6
-rw-r--r--arch/arm64/kvm/hyp/nvhe/host.S6
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-init.S30
-rw-r--r--arch/arm64/kvm/hyp/nvhe/setup.c4
-rw-r--r--arch/arm64/kvm/hyp/vhe/switch.c202
-rw-r--r--arch/arm64/kvm/hyp/vhe/tlb.c147
-rw-r--r--arch/arm64/kvm/mmu.c213
-rw-r--r--arch/arm64/kvm/nested.c1002
-rw-r--r--arch/arm64/kvm/pmu-emul.c2
-rw-r--r--arch/arm64/kvm/reset.c6
-rw-r--r--arch/arm64/kvm/sys_regs.c593
-rw-r--r--arch/loongarch/Kconfig11
-rw-r--r--arch/loongarch/include/asm/kvm_host.h14
-rw-r--r--arch/loongarch/include/asm/kvm_para.h11
-rw-r--r--arch/loongarch/include/asm/kvm_vcpu.h5
-rw-r--r--arch/loongarch/include/asm/loongarch.h1
-rw-r--r--arch/loongarch/include/asm/paravirt.h5
-rw-r--r--arch/loongarch/include/uapi/asm/kvm.h4
-rw-r--r--arch/loongarch/kernel/paravirt.c145
-rw-r--r--arch/loongarch/kernel/time.c2
-rw-r--r--arch/loongarch/kvm/Kconfig1
-rw-r--r--arch/loongarch/kvm/exit.c38
-rw-r--r--arch/loongarch/kvm/main.c1
-rw-r--r--arch/loongarch/kvm/mmu.c72
-rw-r--r--arch/loongarch/kvm/tlb.c5
-rw-r--r--arch/loongarch/kvm/vcpu.c156
-rw-r--r--arch/mips/include/asm/kvm_host.h1
-rw-r--r--arch/mips/kvm/mips.c2
-rw-r--r--arch/powerpc/include/asm/kvm_host.h1
-rw-r--r--arch/powerpc/kvm/powerpc.c2
-rw-r--r--arch/riscv/include/asm/kvm_aia_aplic.h58
-rw-r--r--arch/riscv/include/asm/kvm_aia_imsic.h38
-rw-r--r--arch/riscv/include/asm/kvm_host.h1
-rw-r--r--arch/riscv/kvm/aia.c35
-rw-r--r--arch/riscv/kvm/aia_aplic.c2
-rw-r--r--arch/riscv/kvm/aia_device.c2
-rw-r--r--arch/riscv/kvm/aia_imsic.c2
-rw-r--r--arch/riscv/kvm/trace.h67
-rw-r--r--arch/riscv/kvm/vcpu.c9
-rw-r--r--arch/riscv/kvm/vcpu_exit.c2
-rw-r--r--arch/s390/include/asm/kvm_host.h2
-rw-r--r--arch/s390/kvm/kvm-s390.c14
-rw-r--r--arch/s390/kvm/vsie.c22
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h8
-rw-r--r--arch/x86/include/asm/kvm-x86-pmu-ops.h3
-rw-r--r--arch/x86/include/asm/kvm_host.h90
-rw-r--r--arch/x86/include/asm/sev-common.h25
-rw-r--r--arch/x86/include/asm/sev.h51
-rw-r--r--arch/x86/include/asm/svm.h9
-rw-r--r--arch/x86/include/uapi/asm/kvm.h49
-rw-r--r--arch/x86/kvm/Kconfig4
-rw-r--r--arch/x86/kvm/cpuid.c14
-rw-r--r--arch/x86/kvm/cpuid.h18
-rw-r--r--arch/x86/kvm/emulate.c71
-rw-r--r--arch/x86/kvm/hyperv.c9
-rw-r--r--arch/x86/kvm/irq.c2
-rw-r--r--arch/x86/kvm/irq.h1
-rw-r--r--arch/x86/kvm/irq_comm.c7
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h10
-rw-r--r--arch/x86/kvm/kvm_emulate.h1
-rw-r--r--arch/x86/kvm/lapic.c48
-rw-r--r--arch/x86/kvm/lapic.h5
-rw-r--r--arch/x86/kvm/mmu.h42
-rw-r--r--arch/x86/kvm/mmu/mmu.c206
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h26
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h3
-rw-r--r--arch/x86/kvm/mmu/spte.c46
-rw-r--r--arch/x86/kvm/mmu/spte.h10
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c136
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h2
-rw-r--r--arch/x86/kvm/mtrr.c644
-rw-r--r--arch/x86/kvm/pmu.c73
-rw-r--r--arch/x86/kvm/pmu.h10
-rw-r--r--arch/x86/kvm/smm.c44
-rw-r--r--arch/x86/kvm/svm/nested.c2
-rw-r--r--arch/x86/kvm/svm/pmu.c11
-rw-r--r--arch/x86/kvm/svm/sev.c1564
-rw-r--r--arch/x86/kvm/svm/svm.c78
-rw-r--r--arch/x86/kvm/svm/svm.h70
-rw-r--r--arch/x86/kvm/trace.h55
-rw-r--r--arch/x86/kvm/vmx/main.c5
-rw-r--r--arch/x86/kvm/vmx/nested.c55
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c52
-rw-r--r--arch/x86/kvm/vmx/posted_intr.h10
-rw-r--r--arch/x86/kvm/vmx/vmcs12.h14
-rw-r--r--arch/x86/kvm/vmx/vmx.c205
-rw-r--r--arch/x86/kvm/vmx/vmx.h3
-rw-r--r--arch/x86/kvm/vmx/x86_ops.h4
-rw-r--r--arch/x86/kvm/x86.c567
-rw-r--r--arch/x86/kvm/x86.h25
-rw-r--r--arch/x86/kvm/xen.c6
-rw-r--r--drivers/crypto/ccp/sev-dev.c36
-rw-r--r--drivers/virt/coco/sev-guest/sev-guest.c2
-rw-r--r--drivers/virt/coco/sev-guest/sev-guest.h63
-rw-r--r--include/linux/arm_ffa.h3
-rw-r--r--include/linux/kvm_host.h53
-rw-r--r--include/linux/pagemap.h13
-rw-r--r--include/linux/psp-sev.h4
-rw-r--r--include/linux/srcu.h14
-rw-r--r--include/uapi/linux/kvm.h27
-rw-r--r--include/uapi/linux/psp-sev.h27
-rw-r--r--include/uapi/linux/sev-guest.h3
-rw-r--r--mm/compaction.c12
-rw-r--r--mm/migrate.c2
-rw-r--r--mm/truncate.c3
-rw-r--r--tools/include/uapi/linux/kvm.h10
-rw-r--r--tools/perf/arch/loongarch/Makefile1
-rw-r--r--tools/perf/arch/loongarch/util/Build2
-rw-r--r--tools/perf/arch/loongarch/util/header.c96
-rw-r--r--tools/perf/arch/loongarch/util/kvm-stat.c139
-rw-r--r--tools/perf/arch/riscv/Makefile1
-rw-r--r--tools/perf/arch/riscv/util/Build1
-rw-r--r--tools/perf/arch/riscv/util/kvm-stat.c78
-rw-r--r--tools/perf/arch/riscv/util/riscv_exception_types.h35
-rw-r--r--tools/testing/selftests/kvm/Makefile2
-rw-r--r--tools/testing/selftests/kvm/aarch64/set_id_regs.c17
-rw-r--r--tools/testing/selftests/kvm/include/x86_64/apic.h8
-rw-r--r--tools/testing/selftests/kvm/include/x86_64/processor.h18
-rw-r--r--tools/testing/selftests/kvm/lib/kvm_util.c9
-rw-r--r--tools/testing/selftests/kvm/lib/x86_64/processor.c11
-rw-r--r--tools/testing/selftests/kvm/memslot_modification_stress_test.c6
-rw-r--r--tools/testing/selftests/kvm/pre_fault_memory_test.c146
-rw-r--r--tools/testing/selftests/kvm/x86_64/apic_bus_clock_test.c194
-rw-r--r--tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c22
-rw-r--r--tools/testing/selftests/kvm/x86_64/pmu_counters_test.c44
-rw-r--r--tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c35
-rw-r--r--tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c16
-rw-r--r--virt/kvm/Kconfig11
-rw-r--r--virt/kvm/async_pf.c13
-rw-r--r--virt/kvm/guest_memfd.c176
-rw-r--r--virt/kvm/irqchip.c24
-rw-r--r--virt/kvm/kvm_main.c106
-rw-r--r--virt/kvm/pfncache.c3
163 files changed, 7748 insertions, 2313 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 7c19a5fd75e4..530ebe9b11cd 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2722,6 +2722,24 @@
[KVM,ARM,EARLY] Allow use of GICv4 for direct
injection of LPIs.
+ kvm-arm.wfe_trap_policy=
+ [KVM,ARM] Control when to set WFE instruction trap for
+ KVM VMs. Traps are allowed but not guaranteed by the
+ CPU architecture.
+
+ trap: set WFE instruction trap
+
+ notrap: clear WFE instruction trap
+
+ kvm-arm.wfi_trap_policy=
+ [KVM,ARM] Control when to set WFI instruction trap for
+ KVM VMs. Traps are allowed but not guaranteed by the
+ CPU architecture.
+
+ trap: set WFI instruction trap
+
+ notrap: clear WFI instruction trap
+
kvm_cma_resv_ratio=n [PPC,EARLY]
Reserves given percentage from system memory area for
contiguous memory allocation for KVM hash pagetable
@@ -4036,9 +4054,9 @@
prediction) vulnerability. System may allow data
leaks with this option.
- no-steal-acc [X86,PV_OPS,ARM64,PPC/PSERIES,RISCV,EARLY] Disable
- paravirtualized steal time accounting. steal time is
- computed, but won't influence scheduler behaviour
+ no-steal-acc [X86,PV_OPS,ARM64,PPC/PSERIES,RISCV,LOONGARCH,EARLY]
+ Disable paravirtualized steal time accounting. steal time
+ is computed, but won't influence scheduler behaviour
nosync [HW,M68K] Disables sync negotiation for all devices.
diff --git a/Documentation/virt/coco/sev-guest.rst b/Documentation/virt/coco/sev-guest.rst
index 9d00967a5b2b..93debceb6eb0 100644
--- a/Documentation/virt/coco/sev-guest.rst
+++ b/Documentation/virt/coco/sev-guest.rst
@@ -176,6 +176,25 @@ to SNP_CONFIG command defined in the SEV-SNP spec. The current values of
the firmware parameters affected by this command can be queried via
SNP_PLATFORM_STATUS.
+2.7 SNP_VLEK_LOAD
+-----------------
+:Technology: sev-snp
+:Type: hypervisor ioctl cmd
+:Parameters (in): struct sev_user_data_snp_vlek_load
+:Returns (out): 0 on success, -negative on error
+
+When requesting an attestation report a guest is able to specify whether
+it wants SNP firmware to sign the report using either a Versioned Chip
+Endorsement Key (VCEK), which is derived from chip-unique secrets, or a
+Versioned Loaded Endorsement Key (VLEK) which is obtained from an AMD
+Key Derivation Service (KDS) and derived from seeds allocated to
+enrolled cloud service providers.
+
+In the case of VLEK keys, the SNP_VLEK_LOAD SNP command is used to load
+them into the system after obtaining them from the KDS, and corresponds
+closely to the SNP_VLEK_LOAD firmware command specified in the SEV-SNP
+spec.
+
3. SEV-SNP CPUID Enforcement
============================
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 899480d4acaf..fe722c5dada9 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -891,12 +891,12 @@ like this::
The irq_type field has the following values:
-- irq_type[0]:
+- KVM_ARM_IRQ_TYPE_CPU:
out-of-kernel GIC: irq_id 0 is IRQ, irq_id 1 is FIQ
-- irq_type[1]:
+- KVM_ARM_IRQ_TYPE_SPI:
in-kernel GIC: SPI, irq_id between 32 and 1019 (incl.)
(the vcpu_index field is ignored)
-- irq_type[2]:
+- KVM_ARM_IRQ_TYPE_PPI:
in-kernel GIC: PPI, irq_id between 16 and 31 (incl.)
(The irq_id field thus corresponds nicely to the IRQ ID in the ARM GIC specs)
@@ -1403,6 +1403,12 @@ Instead, an abort (data abort if the cause of the page-table update
was a load or a store, instruction abort if it was an instruction
fetch) is injected in the guest.
+S390:
+^^^^^
+
+Returns -EINVAL if the VM has the KVM_VM_S390_UCONTROL flag set.
+Returns -EINVAL if called on a protected VM.
+
4.36 KVM_SET_TSS_ADDR
---------------------
@@ -1921,7 +1927,7 @@ flags:
If KVM_MSI_VALID_DEVID is set, devid contains a unique device identifier
for the device that wrote the MSI message. For PCI, this is usually a
-BFD identifier in the lower 16 bits.
+BDF identifier in the lower 16 bits.
On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS
feature of KVM_CAP_X2APIC_API capability is enabled. If it is enabled,
@@ -2989,7 +2995,7 @@ flags:
If KVM_MSI_VALID_DEVID is set, devid contains a unique device identifier
for the device that wrote the MSI message. For PCI, this is usually a
-BFD identifier in the lower 16 bits.
+BDF identifier in the lower 16 bits.
On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS
feature of KVM_CAP_X2APIC_API capability is enabled. If it is enabled,
@@ -6276,6 +6282,12 @@ state. At VM creation time, all memory is shared, i.e. the PRIVATE attribute
is '0' for all gfns. Userspace can control whether memory is shared/private by
toggling KVM_MEMORY_ATTRIBUTE_PRIVATE via KVM_SET_MEMORY_ATTRIBUTES as needed.
+S390:
+^^^^^
+
+Returns -EINVAL if the VM has the KVM_VM_S390_UCONTROL flag set.
+Returns -EINVAL if called on a protected VM.
+
4.141 KVM_SET_MEMORY_ATTRIBUTES
-------------------------------
@@ -6355,6 +6367,61 @@ a single guest_memfd file, but the bound ranges must not overlap).
See KVM_SET_USER_MEMORY_REGION2 for additional details.
+4.143 KVM_PRE_FAULT_MEMORY
+------------------------
+
+:Capability: KVM_CAP_PRE_FAULT_MEMORY
+:Architectures: none
+:Type: vcpu ioctl
+:Parameters: struct kvm_pre_fault_memory (in/out)
+:Returns: 0 if at least one page is processed, < 0 on error
+
+Errors:
+
+ ========== ===============================================================
+ EINVAL The specified `gpa` and `size` were invalid (e.g. not
+ page aligned, causes an overflow, or size is zero).
+ ENOENT The specified `gpa` is outside defined memslots.
+ EINTR An unmasked signal is pending and no page was processed.
+ EFAULT The parameter address was invalid.
+ EOPNOTSUPP Mapping memory for a GPA is unsupported by the
+ hypervisor, and/or for the current vCPU state/mode.
+ EIO unexpected error conditions (also causes a WARN)
+ ========== ===============================================================
+
+::
+
+ struct kvm_pre_fault_memory {
+ /* in/out */
+ __u64 gpa;
+ __u64 size;
+ /* in */
+ __u64 flags;
+ __u64 padding[5];
+ };
+
+KVM_PRE_FAULT_MEMORY populates KVM's stage-2 page tables used to map memory
+for the current vCPU state. KVM maps memory as if the vCPU generated a
+stage-2 read page fault, e.g. faults in memory as needed, but doesn't break
+CoW. However, KVM does not mark any newly created stage-2 PTE as Accessed.
+
+In some cases, multiple vCPUs might share the page tables. In this
+case, the ioctl can be called in parallel.
+
+When the ioctl returns, the input values are updated to point to the
+remaining range. If `size` > 0 on return, the caller can just issue
+the ioctl again with the same `struct kvm_map_memory` argument.
+
+Shadow page tables cannot support this ioctl because they
+are indexed by virtual address or nested guest physical address.
+Calling this ioctl when the guest is using shadow page tables (for
+example because it is running a nested guest with nested page tables)
+will fail with `EOPNOTSUPP` even if `KVM_CHECK_EXTENSION` reports
+the capability to be present.
+
+`flags` must currently be zero.
+
+
5. The kvm_run structure
========================
@@ -6419,9 +6486,12 @@ More architecture-specific flags detailing state of the VCPU that may
affect the device's behavior. Current defined flags::
/* x86, set if the VCPU is in system management mode */
- #define KVM_RUN_X86_SMM (1 << 0)
+ #define KVM_RUN_X86_SMM (1 << 0)
/* x86, set if bus lock detected in VM */
- #define KVM_RUN_BUS_LOCK (1 << 1)
+ #define KVM_RUN_X86_BUS_LOCK (1 << 1)
+ /* x86, set if the VCPU is executing a nested (L2) guest */
+ #define KVM_RUN_X86_GUEST_MODE (1 << 2)
+
/* arm64, set for KVM_EXIT_DEBUG */
#define KVM_DEBUG_ARCH_HSR_HIGH_VALID (1 << 0)
@@ -7767,29 +7837,31 @@ Valid bits in args[0] are::
#define KVM_BUS_LOCK_DETECTION_OFF (1 << 0)
#define KVM_BUS_LOCK_DETECTION_EXIT (1 << 1)
-Enabling this capability on a VM provides userspace with a way to select
-a policy to handle the bus locks detected in guest. Userspace can obtain
-the supported modes from the result of KVM_CHECK_EXTENSION and define it
-through the KVM_ENABLE_CAP.
+Enabling this capability on a VM provides userspace with a way to select a
+policy to handle the bus locks detected in guest. Userspace can obtain the
+supported modes from the result of KVM_CHECK_EXTENSION and define it through
+the KVM_ENABLE_CAP. The supported modes are mutually-exclusive.
-KVM_BUS_LOCK_DETECTION_OFF and KVM_BUS_LOCK_DETECTION_EXIT are supported
-currently and mutually exclusive with each other. More bits can be added in
-the future.
+This capability allows userspace to force VM exits on bus locks detected in the
+guest, irrespective whether or not the host has enabled split-lock detection
+(which triggers an #AC exception that KVM intercepts). This capability is
+intended to mitigate attacks where a malicious/buggy guest can exploit bus
+locks to degrade the performance of the whole system.
-With KVM_BUS_LOCK_DETECTION_OFF set, bus locks in guest will not cause vm exits
-so that no additional actions are needed. This is the default mode.
+If KVM_BUS_LOCK_DETECTION_OFF is set, KVM doesn't force guest bus locks to VM
+exit, although the host kernel's split-lock #AC detection still applies, if
+enabled.
-With KVM_BUS_LOCK_DETECTION_EXIT set, vm exits happen when bus lock detected
-in VM. KVM just exits to userspace when handling them. Userspace can enforce
-its own throttling or other policy based mitigations.
+If KVM_BUS_LOCK_DETECTION_EXIT is set, KVM enables a CPU feature that ensures
+bus locks in the guest trigger a VM exit, and KVM exits to userspace for all
+such VM exits, e.g. to allow userspace to throttle the offending guest and/or
+apply some other policy-based mitigation. When exiting to userspace, KVM sets
+KVM_RUN_X86_BUS_LOCK in vcpu-run->flags, and conditionally sets the exit_reason
+to KVM_EXIT_X86_BUS_LOCK.
-This capability is aimed to address the thread that VM can exploit bus locks to
-degree the performance of the whole system. Once the userspace enable this
-capability and select the KVM_BUS_LOCK_DETECTION_EXIT mode, KVM will set the
-KVM_RUN_BUS_LOCK flag in vcpu-run->flags field and exit to userspace. Concerning
-the bus lock vm exit can be preempted by a higher priority VM exit, the exit
-notifications to userspace can be KVM_EXIT_BUS_LOCK or other reasons.
-KVM_RUN_BUS_LOCK flag is used to distinguish between them.
+Note! Detected bus locks may be coincident with other exits to userspace, i.e.
+KVM_RUN_X86_BUS_LOCK should be checked regardless of the primary exit reason if
+userspace wants to take action on all detected bus locks.
7.23 KVM_CAP_PPC_DAWR1
----------------------
@@ -7905,10 +7977,10 @@ perform a bulk copy of tags to/from the guest.
7.29 KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM
-------------------------------------
-Architectures: x86 SEV enabled
-Type: vm
-Parameters: args[0] is the fd of the source vm
-Returns: 0 on success
+:Architectures: x86 SEV enabled
+:Type: vm
+:Parameters: args[0] is the fd of the source vm
+:Returns: 0 on success
This capability enables userspace to migrate the encryption context from the VM
indicated by the fd to the VM this is called on.
@@ -7956,7 +8028,11 @@ The valid bits in cap.args[0] are:
When this quirk is disabled, the reset value
is 0x10000 (APIC_LVT_MASKED).
- KVM_X86_QUIRK_CD_NW_CLEARED By default, KVM clears CR0.CD and CR0.NW.
+ KVM_X86_QUIRK_CD_NW_CLEARED By default, KVM clears CR0.CD and CR0.NW on
+ AMD CPUs to workaround buggy guest firmware
+ that runs in perpetuity with CR0.CD, i.e.
+ with caches in "no fill" mode.
+
When this quirk is disabled, KVM does not
change the value of CR0.CD and CR0.NW.
@@ -8073,6 +8149,37 @@ error/annotated fault.
See KVM_EXIT_MEMORY_FAULT for more information.
+7.35 KVM_CAP_X86_APIC_BUS_CYCLES_NS
+-----------------------------------
+
+:Architectures: x86
+:Target: VM
+:Parameters: args[0] is the desired APIC bus clock rate, in nanoseconds
+:Returns: 0 on success, -EINVAL if args[0] contains an invalid value for the
+ frequency or if any vCPUs have been created, -ENXIO if a virtual
+ local APIC has not been created using KVM_CREATE_IRQCHIP.
+
+This capability sets the VM's APIC bus clock frequency, used by KVM's in-kernel
+virtual APIC when emulating APIC timers. KVM's default value can be retrieved
+by KVM_CHECK_EXTENSION.
+
+Note: Userspace is responsible for correctly configuring CPUID 0x15, a.k.a. the
+core crystal clock frequency, if a non-zero CPUID 0x15 is exposed to the guest.
+
+7.36 KVM_CAP_X86_GUEST_MODE
+------------------------------
+
+:Architectures: x86
+:Returns: Informational only, -EINVAL on direct KVM_ENABLE_CAP.
+
+The presence of this capability indicates that KVM_RUN will update the
+KVM_RUN_X86_GUEST_MODE bit in kvm_run.flags to indicate whether the
+vCPU was executing nested guest code when it exited.
+
+KVM exits with the register state of either the L1 or L2 guest
+depending on which executed at the time of an exit. Userspace must
+take care to differentiate between these cases.
+
8. Other capabilities.
======================
diff --git a/Documentation/virt/kvm/devices/arm-vgic.rst b/Documentation/virt/kvm/devices/arm-vgic.rst
index 40bdeea1d86e..19f0c6756891 100644
--- a/Documentation/virt/kvm/devices/arm-vgic.rst
+++ b/Documentation/virt/kvm/devices/arm-vgic.rst
@@ -31,7 +31,7 @@ Groups:
KVM_VGIC_V2_ADDR_TYPE_CPU (rw, 64-bit)
Base address in the guest physical address space of the GIC virtual cpu
interface register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V2.
- This address needs to be 4K aligned and the region covers 4 KByte.
+ This address needs to be 4K aligned and the region covers 8 KByte.
Errors:
diff --git a/Documentation/virt/kvm/halt-polling.rst b/Documentation/virt/kvm/halt-polling.rst
index c82a04b709b4..a6790a67e205 100644
--- a/Documentation/virt/kvm/halt-polling.rst
+++ b/Documentation/virt/kvm/halt-polling.rst
@@ -79,11 +79,11 @@ adjustment of the polling interval.
Module Parameters
=================
-The kvm module has 3 tuneable module parameters to adjust the global max
-polling interval as well as the rate at which the polling interval is grown and
-shrunk. These variables are defined in include/linux/kvm_host.h and as module
-parameters in virt/kvm/kvm_main.c, or arch/powerpc/kvm/book3s_hv.c in the
-powerpc kvm-hv case.
+The kvm module has 4 tunable module parameters to adjust the global max polling
+interval, the initial value (to grow from 0), and the rate at which the polling
+interval is grown and shrunk. These variables are defined in
+include/linux/kvm_host.h and as module parameters in virt/kvm/kvm_main.c, or
+arch/powerpc/kvm/book3s_hv.c in the powerpc kvm-hv case.
+-----------------------+---------------------------+-------------------------+
|Module Parameter | Description | Default Value |
@@ -105,7 +105,7 @@ powerpc kvm-hv case.
| | grow_halt_poll_ns() | |
| | function. | |
+-----------------------+---------------------------+-------------------------+
-|halt_poll_ns_shrink | The value by which the | 0 |
+|halt_poll_ns_shrink | The value by which the | 2 |
| | halt polling interval is | |
| | divided in the | |
| | shrink_halt_poll_ns() | |
diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst
index 9677a0714a39..1ddb6a86ce7f 100644
--- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst
+++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst
@@ -466,6 +466,112 @@ issued by the hypervisor to make the guest ready for execution.
Returns: 0 on success, -negative on error
+18. KVM_SEV_SNP_LAUNCH_START
+----------------------------
+
+The KVM_SNP_LAUNCH_START command is used for creating the memory encryption
+context for the SEV-SNP guest. It must be called prior to issuing
+KVM_SEV_SNP_LAUNCH_UPDATE or KVM_SEV_SNP_LAUNCH_FINISH;
+
+Parameters (in): struct kvm_sev_snp_launch_start
+
+Returns: 0 on success, -negative on error
+
+::
+
+ struct kvm_sev_snp_launch_start {
+ __u64 policy; /* Guest policy to use. */
+ __u8 gosvw[16]; /* Guest OS visible workarounds. */
+ __u16 flags; /* Must be zero. */
+ __u8 pad0[6];
+ __u64 pad1[4];
+ };
+
+See SNP_LAUNCH_START in the SEV-SNP specification [snp-fw-abi]_ for further
+details on the input parameters in ``struct kvm_sev_snp_launch_start``.
+
+19. KVM_SEV_SNP_LAUNCH_UPDATE
+-----------------------------
+
+The KVM_SEV_SNP_LAUNCH_UPDATE command is used for loading userspace-provided
+data into a guest GPA range, measuring the contents into the SNP guest context
+created by KVM_SEV_SNP_LAUNCH_START, and then encrypting/validating that GPA
+range so that it will be immediately readable using the encryption key
+associated with the guest context once it is booted, after which point it can
+attest the measurement associated with its context before unlocking any
+secrets.
+
+It is required that the GPA ranges initialized by this command have had the
+KVM_MEMORY_ATTRIBUTE_PRIVATE attribute set in advance. See the documentation
+for KVM_SET_MEMORY_ATTRIBUTES for more details on this aspect.
+
+Upon success, this command is not guaranteed to have processed the entire
+range requested. Instead, the ``gfn_start``, ``uaddr``, and ``len`` fields of
+``struct kvm_sev_snp_launch_update`` will be updated to correspond to the
+remaining range that has yet to be processed. The caller should continue
+calling this command until those fields indicate the entire range has been
+processed, e.g. ``len`` is 0, ``gfn_start`` is equal to the last GFN in the
+range plus 1, and ``uaddr`` is the last byte of the userspace-provided source
+buffer address plus 1. In the case where ``type`` is KVM_SEV_SNP_PAGE_TYPE_ZERO,
+``uaddr`` will be ignored completely.
+
+Parameters (in): struct kvm_sev_snp_launch_update
+
+Returns: 0 on success, < 0 on error, -EAGAIN if caller should retry
+
+::
+
+ struct kvm_sev_snp_launch_update {
+ __u64 gfn_start; /* Guest page number to load/encrypt data into. */
+ __u64 uaddr; /* Userspace address of data to be loaded/encrypted. */
+ __u64 len; /* 4k-aligned length in bytes to copy into guest memory.*/
+ __u8 type; /* The type of the guest pages being initialized. */
+ __u8 pad0;
+ __u16 flags; /* Must be zero. */
+ __u32 pad1;
+ __u64 pad2[4];
+
+ };
+
+where the allowed values for page_type are #define'd as::
+
+ KVM_SEV_SNP_PAGE_TYPE_NORMAL
+ KVM_SEV_SNP_PAGE_TYPE_ZERO
+ KVM_SEV_SNP_PAGE_TYPE_UNMEASURED
+ KVM_SEV_SNP_PAGE_TYPE_SECRETS
+ KVM_SEV_SNP_PAGE_TYPE_CPUID
+
+See the SEV-SNP spec [snp-fw-abi]_ for further details on how each page type is
+used/measured.
+
+20. KVM_SEV_SNP_LAUNCH_FINISH
+-----------------------------
+
+After completion of the SNP guest launch flow, the KVM_SEV_SNP_LAUNCH_FINISH
+command can be issued to make the guest ready for execution.
+
+Parameters (in): struct kvm_sev_snp_launch_finish
+
+Returns: 0 on success, -negative on error
+
+::
+
+ struct kvm_sev_snp_launch_finish {
+ __u64 id_block_uaddr;
+ __u64 id_auth_uaddr;
+ __u8 id_block_en;
+ __u8 auth_key_en;
+ __u8 vcek_disabled;
+ __u8 host_data[32];
+ __u8 pad0[3];
+ __u16 flags; /* Must be zero */
+ __u64 pad1[4];
+ };
+
+
+See SNP_LAUNCH_FINISH in the SEV-SNP specification [snp-fw-abi]_ for further
+details on the input parameters in ``struct kvm_sev_snp_launch_finish``.
+
Device attribute API
====================
@@ -497,9 +603,11 @@ References
==========
-See [white-paper]_, [api-spec]_, [amd-apm]_ and [kvm-forum]_ for more info.
+See [white-paper]_, [api-spec]_, [amd-apm]_, [kvm-forum]_, and [snp-fw-abi]_
+for more info.
.. [white-paper] https://developer.amd.com/wordpress/media/2013/12/AMD_Memory_Encryption_Whitepaper_v7-Public.pdf
.. [api-spec] https://support.amd.com/TechDocs/55766_SEV-KM_API_Specification.pdf
.. [amd-apm] https://support.amd.com/TechDocs/24593.pdf (section 15.34)
.. [kvm-forum] https://www.linux-kvm.org/images/7/74/02x08A-Thomas_Lendacky-AMDs_Virtualizatoin_Memory_Encryption_Technology.pdf
+.. [snp-fw-abi] https://www.amd.com/system/files/TechDocs/56860.pdf
diff --git a/Documentation/virt/kvm/x86/errata.rst b/Documentation/virt/kvm/x86/errata.rst
index 49a05f24747b..4116045a8744 100644
--- a/Documentation/virt/kvm/x86/errata.rst
+++ b/Documentation/virt/kvm/x86/errata.rst
@@ -48,3 +48,21 @@ have the same physical APIC ID, KVM will deliver events targeting that APIC ID
only to the vCPU with the lowest vCPU ID. If KVM_X2APIC_API_USE_32BIT_IDS is
not enabled, KVM follows x86 architecture when processing interrupts (all vCPUs
matching the target APIC ID receive the interrupt).
+
+MTRRs
+-----
+KVM does not virtualize guest MTRR memory types. KVM emulates accesses to MTRR
+MSRs, i.e. {RD,WR}MSR in the guest will behave as expected, but KVM does not
+honor guest MTRRs when determining the effective memory type, and instead
+treats all of guest memory as having Writeback (WB) MTRRs.
+
+CR0.CD
+------
+KVM does not virtualize CR0.CD on Intel CPUs. Similar to MTRR MSRs, KVM
+emulates CR0.CD accesses so that loads and stores from/to CR0 behave as
+expected, but setting CR0.CD=1 has no impact on the cachaeability of guest
+memory.
+
+Note, this erratum does not affect AMD CPUs, which fully virtualize CR0.CD in
+hardware, i.e. put the CPU caches into "no fill" mode when CR0.CD=1, even when
+running in the guest. \ No newline at end of file
diff --git a/MAINTAINERS b/MAINTAINERS
index 880fa3d86f3e..2ce06ad40768 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12248,6 +12248,8 @@ L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
L: kvmarm@lists.linux.dev
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git
+F: Documentation/virt/kvm/arm/
+F: Documentation/virt/kvm/devices/arm*
F: arch/arm64/include/asm/kvm*
F: arch/arm64/include/uapi/asm/kvm*
F: arch/arm64/kvm/
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index 3f482500f71f..56c148890daf 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -160,6 +160,7 @@
#define ESR_ELx_Xs_MASK (GENMASK_ULL(4, 0))
/* ISS field definitions for exceptions taken in to Hyp */
+#define ESR_ELx_FSC_ADDRSZ (0x00)
#define ESR_ELx_CV (UL(1) << 24)
#define ESR_ELx_COND_SHIFT (20)
#define ESR_ELx_COND_MASK (UL(0xF) << ESR_ELx_COND_SHIFT)
@@ -387,6 +388,11 @@
#ifndef __ASSEMBLY__
#include <asm/types.h>
+static inline unsigned long esr_brk_comment(unsigned long esr)
+{
+ return esr & ESR_ELx_BRK64_ISS_COMMENT_MASK;
+}
+
static inline bool esr_is_data_abort(unsigned long esr)
{
const unsigned long ec = ESR_ELx_EC(esr);
@@ -394,6 +400,12 @@ static inline bool esr_is_data_abort(unsigned long esr)
return ec == ESR_ELx_EC_DABT_LOW || ec == ESR_ELx_EC_DABT_CUR;
}
+static inline bool esr_is_cfi_brk(unsigned long esr)
+{
+ return ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 &&
+ (esr_brk_comment(esr) & ~CFI_BRK_IMM_MASK) == CFI_BRK_IMM_BASE;
+}
+
static inline bool esr_fsc_is_translation_fault(unsigned long esr)
{
esr = esr & ESR_ELx_FSC;
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index b2adc2c6c82a..d81cc746e0eb 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -102,7 +102,6 @@
#define HCR_HOST_NVHE_PROTECTED_FLAGS (HCR_HOST_NVHE_FLAGS | HCR_TSC)
#define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H)
-#define HCRX_GUEST_FLAGS (HCRX_EL2_SMPME | HCRX_EL2_TCR2En)
#define HCRX_HOST_FLAGS (HCRX_EL2_MSCEn | HCRX_EL2_TCR2En | HCRX_EL2_EnFPM)
/* TCR_EL2 Registers bits */
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index a6330460d9e5..2181a11b9d92 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -232,6 +232,8 @@ extern void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
phys_addr_t start, unsigned long pages);
extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu);
+extern int __kvm_tlbi_s1e2(struct kvm_s2_mmu *mmu, u64 va, u64 sys_encoding);
+
extern void __kvm_timer_set_cntvoff(u64 cntvoff);
extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 21650e7924d4..a601a9305b10 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -11,6 +11,7 @@
#ifndef __ARM64_KVM_EMULATE_H__
#define __ARM64_KVM_EMULATE_H__
+#include <linux/bitfield.h>
#include <linux/kvm_host.h>
#include <asm/debug-monitors.h>
@@ -55,6 +56,14 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu);
int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2);
int kvm_inject_nested_irq(struct kvm_vcpu *vcpu);
+static inline void kvm_inject_nested_sve_trap(struct kvm_vcpu *vcpu)
+{
+ u64 esr = FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SVE) |
+ ESR_ELx_IL;
+
+ kvm_inject_nested_sync(vcpu, esr);
+}
+
#if defined(__KVM_VHE_HYPERVISOR__) || defined(__KVM_NVHE_HYPERVISOR__)
static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
{
@@ -69,39 +78,17 @@ static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
{
- vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS;
- if (has_vhe() || has_hvhe())
- vcpu->arch.hcr_el2 |= HCR_E2H;
- if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) {
- /* route synchronous external abort exceptions to EL2 */
- vcpu->arch.hcr_el2 |= HCR_TEA;
- /* trap error record accesses */
- vcpu->arch.hcr_el2 |= HCR_TERR;
- }
+ if (!vcpu_has_run_once(vcpu))
+ vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS;
- if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) {
- vcpu->arch.hcr_el2 |= HCR_FWB;
- } else {
- /*
- * For non-FWB CPUs, we trap VM ops (HCR_EL2.TVM) until M+C
- * get set in SCTLR_EL1 such that we can detect when the guest
- * MMU gets turned on and do the necessary cache maintenance
- * then.
- */
+ /*
+ * For non-FWB CPUs, we trap VM ops (HCR_EL2.TVM) until M+C
+ * get set in SCTLR_EL1 such that we can detect when the guest
+ * MMU gets turned on and do the necessary cache maintenance
+ * then.
+ */
+ if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
vcpu->arch.hcr_el2 |= HCR_TVM;
- }
-
- if (cpus_have_final_cap(ARM64_HAS_EVT) &&
- !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE))
- vcpu->arch.hcr_el2 |= HCR_TID4;
- else
- vcpu->arch.hcr_el2 |= HCR_TID2;
-
- if (vcpu_el1_is_32bit(vcpu))
- vcpu->arch.hcr_el2 &= ~HCR_RW;
-
- if (kvm_has_mte(vcpu->kvm))
- vcpu->arch.hcr_el2 |= HCR_ATA;
}
static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
@@ -660,4 +647,50 @@ static __always_inline void kvm_reset_cptr_el2(struct kvm_vcpu *vcpu)
kvm_write_cptr_el2(val);
}
+
+/*
+ * Returns a 'sanitised' view of CPTR_EL2, translating from nVHE to the VHE
+ * format if E2H isn't set.
+ */
+static inline u64 vcpu_sanitised_cptr_el2(const struct kvm_vcpu *vcpu)
+{
+ u64 cptr = __vcpu_sys_reg(vcpu, CPTR_EL2);
+
+ if (!vcpu_el2_e2h_is_set(vcpu))
+ cptr = translate_cptr_el2_to_cpacr_el1(cptr);
+
+ return cptr;
+}
+
+static inline bool ____cptr_xen_trap_enabled(const struct kvm_vcpu *vcpu,
+ unsigned int xen)
+{
+ switch (xen) {
+ case 0b00:
+ case 0b10:
+ return true;
+ case 0b01:
+ return vcpu_el2_tge_is_set(vcpu) && !vcpu_is_el2(vcpu);
+ case 0b11:
+ default:
+ return false;
+ }
+}
+
+#define __guest_hyp_cptr_xen_trap_enabled(vcpu, xen) \
+ (!vcpu_has_nv(vcpu) ? false : \
+ ____cptr_xen_trap_enabled(vcpu, \
+ SYS_FIELD_GET(CPACR_ELx, xen, \
+ vcpu_sanitised_cptr_el2(vcpu))))
+
+static inline bool guest_hyp_fpsimd_traps_enabled(const struct kvm_vcpu *vcpu)
+{
+ return __guest_hyp_cptr_xen_trap_enabled(vcpu, FPEN);
+}
+
+static inline bool guest_hyp_sve_traps_enabled(const struct kvm_vcpu *vcpu)
+{
+ return __guest_hyp_cptr_xen_trap_enabled(vcpu, ZEN);
+}
+
#endif /* __ARM64_KVM_EMULATE_H__ */
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 36b8e97bf49e..a33f5996ca9f 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -189,6 +189,33 @@ struct kvm_s2_mmu {
uint64_t split_page_chunk_size;
struct kvm_arch *arch;
+
+ /*
+ * For a shadow stage-2 MMU, the virtual vttbr used by the
+ * host to parse the guest S2.
+ * This either contains:
+ * - the virtual VTTBR programmed by the guest hypervisor with
+ * CnP cleared
+ * - The value 1 (VMID=0, BADDR=0, CnP=1) if invalid
+ *
+ * We also cache the full VTCR which gets used for TLB invalidation,
+ * taking the ARM ARM's "Any of the bits in VTCR_EL2 are permitted
+ * to be cached in a TLB" to the letter.
+ */
+ u64 tlb_vttbr;
+ u64 tlb_vtcr;
+
+ /*
+ * true when this represents a nested context where virtual
+ * HCR_EL2.VM == 1
+ */
+ bool nested_stage2_enabled;
+
+ /*
+ * 0: Nobody is currently using this, check vttbr for validity
+ * >0: Somebody is actively using this.
+ */
+ atomic_t refcnt;
};
struct kvm_arch_memory_slot {
@@ -256,6 +283,14 @@ struct kvm_arch {
*/
u64 fgu[__NR_FGT_GROUP_IDS__];
+ /*
+ * Stage 2 paging state for VMs with nested S2 using a virtual
+ * VMID.
+ */
+ struct kvm_s2_mmu *nested_mmus;
+ size_t nested_mmus_size;
+ int nested_mmus_next;
+
/* Interrupt controller */
struct vgic_dist vgic;
@@ -327,11 +362,11 @@ struct kvm_arch {
* Atomic access to multiple idregs are guarded by kvm_arch.config_lock.
*/
#define IDREG_IDX(id) (((sys_reg_CRm(id) - 1) << 3) | sys_reg_Op2(id))
-#define IDX_IDREG(idx) sys_reg(3, 0, 0, ((idx) >> 3) + 1, (idx) & Op2_mask)
-#define IDREG(kvm, id) ((kvm)->arch.id_regs[IDREG_IDX(id)])
#define KVM_ARM_ID_REG_NUM (IDREG_IDX(sys_reg(3, 0, 0, 7, 7)) + 1)
u64 id_regs[KVM_ARM_ID_REG_NUM];
+ u64 ctr_el0;
+
/* Masks for VNCR-baked sysregs */
struct kvm_sysreg_masks *sysreg_masks;
@@ -423,6 +458,7 @@ enum vcpu_sysreg {
MDCR_EL2, /* Monitor Debug Configuration Register (EL2) */
CPTR_EL2, /* Architectural Feature Trap Register (EL2) */
HACR_EL2, /* Hypervisor Auxiliary Control Register */
+ ZCR_EL2, /* SVE Control Register (EL2) */
TTBR0_EL2, /* Translation Table Base Register 0 (EL2) */
TTBR1_EL2, /* Translation Table Base Register 1 (EL2) */
TCR_EL2, /* Translation Control Register (EL2) */
@@ -867,6 +903,9 @@ struct kvm_vcpu_arch {
#define vcpu_sve_max_vq(vcpu) sve_vq_from_vl((vcpu)->arch.sve_max_vl)
+#define vcpu_sve_zcr_elx(vcpu) \
+ (unlikely(is_hyp_ctxt(vcpu)) ? ZCR_EL2 : ZCR_EL1)
+
#define vcpu_sve_state_size(vcpu) ({ \
size_t __size_ret; \
unsigned int __vcpu_vq; \
@@ -991,6 +1030,7 @@ static inline bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val)
case DACR32_EL2: *val = read_sysreg_s(SYS_DACR32_EL2); break;
case IFSR32_EL2: *val = read_sysreg_s(SYS_IFSR32_EL2); break;
case DBGVCR32_EL2: *val = read_sysreg_s(SYS_DBGVCR32_EL2); break;
+ case ZCR_EL1: *val = read_sysreg_s(SYS_ZCR_EL12); break;
default: return false;
}
@@ -1036,6 +1076,7 @@ static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg)
case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); break;
case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); break;
case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); break;
+ case ZCR_EL1: write_sysreg_s(val, SYS_ZCR_EL12); break;
default: return false;
}
@@ -1145,7 +1186,7 @@ int __init populate_nv_trap_config(void);
bool lock_all_vcpus(struct kvm *kvm);
void unlock_all_vcpus(struct kvm *kvm);
-void kvm_init_sysreg(struct kvm_vcpu *);
+void kvm_calculate_traps(struct kvm_vcpu *vcpu);
/* MMIO helpers */
void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data);
@@ -1248,7 +1289,6 @@ static inline bool kvm_system_needs_idmapped_vectors(void)
}
static inline void kvm_arch_sync_events(struct kvm *kvm) {}
-static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
void kvm_arm_init_debug(void);
void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu);
@@ -1306,6 +1346,7 @@ void kvm_vcpu_load_vhe(struct kvm_vcpu *vcpu);
void kvm_vcpu_put_vhe(struct kvm_vcpu *vcpu);
int __init kvm_set_ipa_limit(void);
+u32 kvm_get_pa_bits(struct kvm *kvm);
#define __KVM_HAVE_ARCH_VM_ALLOC
struct kvm *kvm_arch_alloc_vm(void);
@@ -1355,6 +1396,24 @@ static inline void kvm_hyp_reserve(void) { }
void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu);
bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu);
+static inline u64 *__vm_id_reg(struct kvm_arch *ka, u32 reg)
+{
+ switch (reg) {
+ case sys_reg(3, 0, 0, 1, 0) ... sys_reg(3, 0, 0, 7, 7):
+ return &ka->id_regs[IDREG_IDX(reg)];
+ case SYS_CTR_EL0:
+ return &ka->ctr_el0;
+ default:
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+}
+
+#define kvm_read_vm_id_reg(kvm, reg) \
+ ({ u64 __val = *__vm_id_reg(&(kvm)->arch, reg); __val; })
+
+void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val);
+
#define __expand_field_sign_unsigned(id, fld, val) \
((u64)SYS_FIELD_VALUE(id, fld, val))
@@ -1371,7 +1430,7 @@ bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu);
#define get_idreg_field_unsigned(kvm, id, fld) \
({ \
- u64 __val = IDREG((kvm), SYS_##id); \
+ u64 __val = kvm_read_vm_id_reg((kvm), SYS_##id); \
FIELD_GET(id##_##fld##_MASK, __val); \
})
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index b05bceca3385..c838309e4ec4 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -124,8 +124,8 @@ void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr,
#endif
#ifdef __KVM_NVHE_HYPERVISOR__
-void __pkvm_init_switch_pgd(phys_addr_t phys, unsigned long size,
- phys_addr_t pgd, void *sp, void *cont_fn);
+void __pkvm_init_switch_pgd(phys_addr_t pgd, unsigned long sp,
+ void (*fn)(void));
int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus,
unsigned long *per_cpu_base, u32 hyp_va_bits);
void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt);
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index d5e48d870461..216ca424bb16 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -98,6 +98,7 @@ alternative_cb_end
#include <asm/mmu_context.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_host.h>
+#include <asm/kvm_nested.h>
void kvm_update_va_mask(struct alt_instr *alt,
__le32 *origptr, __le32 *updptr, int nr_inst);
@@ -165,6 +166,10 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr);
void __init free_hyp_pgds(void);
+void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size);
+void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end);
+void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end);
+
void stage2_unmap_vm(struct kvm *kvm);
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type);
void kvm_uninit_stage2_mmu(struct kvm *kvm);
@@ -326,5 +331,26 @@ static inline struct kvm *kvm_s2_mmu_to_kvm(struct kvm_s2_mmu *mmu)
{
return container_of(mmu->arch, struct kvm, arch);
}
+
+static inline u64 get_vmid(u64 vttbr)
+{
+ return (vttbr & VTTBR_VMID_MASK(kvm_get_vmid_bits())) >>
+ VTTBR_VMID_SHIFT;
+}
+
+static inline bool kvm_s2_mmu_valid(struct kvm_s2_mmu *mmu)
+{
+ return !(mmu->tlb_vttbr & VTTBR_CNP_BIT);
+}
+
+static inline bool kvm_is_nested_s2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
+{
+ /*
+ * Be careful, mmu may not be fully initialised so do look at
+ * *any* of its fields.
+ */
+ return &kvm->arch.mmu != mmu;
+}
+
#endif /* __ASSEMBLY__ */
#endif /* __ARM64_KVM_MMU_H__ */
diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
index 5e0ab0596246..5b06c31035a2 100644
--- a/arch/arm64/include/asm/kvm_nested.h
+++ b/arch/arm64/include/asm/kvm_nested.h
@@ -5,6 +5,7 @@
#include <linux/bitfield.h>
#include <linux/kvm_host.h>
#include <asm/kvm_emulate.h>
+#include <asm/kvm_pgtable.h>
static inline bool vcpu_has_nv(const struct kvm_vcpu *vcpu)
{
@@ -32,7 +33,7 @@ static inline u64 translate_tcr_el2_to_tcr_el1(u64 tcr)
static inline u64 translate_cptr_el2_to_cpacr_el1(u64 cptr_el2)
{
- u64 cpacr_el1 = 0;
+ u64 cpacr_el1 = CPACR_ELx_RES1;
if (cptr_el2 & CPTR_EL2_TTA)
cpacr_el1 |= CPACR_ELx_TTA;
@@ -41,6 +42,8 @@ static inline u64 translate_cptr_el2_to_cpacr_el1(u64 cptr_el2)
if (!(cptr_el2 & CPTR_EL2_TZ))
cpacr_el1 |= CPACR_ELx_ZEN;
+ cpacr_el1 |= cptr_el2 & (CPTR_EL2_TCPAC | CPTR_EL2_TAM);
+
return cpacr_el1;
}
@@ -61,6 +64,125 @@ static inline u64 translate_ttbr0_el2_to_ttbr0_el1(u64 ttbr0)
}
extern bool forward_smc_trap(struct kvm_vcpu *vcpu);
+extern void kvm_init_nested(struct kvm *kvm);
+extern int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu);
+extern void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu);
+extern struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu);
+
+union tlbi_info;
+
+extern void kvm_s2_mmu_iterate_by_vmid(struct kvm *kvm, u16 vmid,
+ const union tlbi_info *info,
+ void (*)(struct kvm_s2_mmu *,
+ const union tlbi_info *));
+extern void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu);
+extern void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu);
+
+struct kvm_s2_trans {
+ phys_addr_t output;
+ unsigned long block_size;
+ bool writable;
+ bool readable;
+ int level;
+ u32 esr;
+ u64 upper_attr;
+};
+
+static inline phys_addr_t kvm_s2_trans_output(struct kvm_s2_trans *trans)
+{
+ return trans->output;
+}
+
+static inline unsigned long kvm_s2_trans_size(struct kvm_s2_trans *trans)
+{
+ return trans->block_size;
+}
+
+static inline u32 kvm_s2_trans_esr(struct kvm_s2_trans *trans)
+{
+ return trans->esr;
+}
+
+static inline bool kvm_s2_trans_readable(struct kvm_s2_trans *trans)
+{
+ return trans->readable;
+}
+
+static inline bool kvm_s2_trans_writable(struct kvm_s2_trans *trans)
+{
+ return trans->writable;
+}
+
+static inline bool kvm_s2_trans_executable(struct kvm_s2_trans *trans)
+{
+ return !(trans->upper_attr & BIT(54));
+}
+
+extern int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
+ struct kvm_s2_trans *result);
+extern int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu,
+ struct kvm_s2_trans *trans);
+extern int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2);
+extern void kvm_nested_s2_wp(struct kvm *kvm);
+extern void kvm_nested_s2_unmap(struct kvm *kvm);
+extern void kvm_nested_s2_flush(struct kvm *kvm);
+
+unsigned long compute_tlb_inval_range(struct kvm_s2_mmu *mmu, u64 val);
+
+static inline bool kvm_supported_tlbi_s1e1_op(struct kvm_vcpu *vpcu, u32 instr)
+{
+ struct kvm *kvm = vpcu->kvm;
+ u8 CRm = sys_reg_CRm(instr);
+
+ if (!(sys_reg_Op0(instr) == TLBI_Op0 &&
+ sys_reg_Op1(instr) == TLBI_Op1_EL1))
+ return false;
+
+ if (!(sys_reg_CRn(instr) == TLBI_CRn_XS ||
+ (sys_reg_CRn(instr) == TLBI_CRn_nXS &&
+ kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP))))
+ return false;
+
+ if (CRm == TLBI_CRm_nROS &&
+ !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS))
+ return false;
+
+ if ((CRm == TLBI_CRm_RIS || CRm == TLBI_CRm_ROS ||
+ CRm == TLBI_CRm_RNS) &&
+ !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE))
+ return false;
+
+ return true;
+}
+
+static inline bool kvm_supported_tlbi_s1e2_op(struct kvm_vcpu *vpcu, u32 instr)
+{
+ struct kvm *kvm = vpcu->kvm;
+ u8 CRm = sys_reg_CRm(instr);
+
+ if (!(sys_reg_Op0(instr) == TLBI_Op0 &&
+ sys_reg_Op1(instr) == TLBI_Op1_EL2))
+ return false;
+
+ if (!(sys_reg_CRn(instr) == TLBI_CRn_XS ||
+ (sys_reg_CRn(instr) == TLBI_CRn_nXS &&
+ kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP))))
+ return false;
+
+ if (CRm == TLBI_CRm_IPAIS || CRm == TLBI_CRm_IPAONS)
+ return false;
+
+ if (CRm == TLBI_CRm_nROS &&
+ !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS))
+ return false;
+
+ if ((CRm == TLBI_CRm_RIS || CRm == TLBI_CRm_ROS ||
+ CRm == TLBI_CRm_RNS) &&
+ !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE))
+ return false;
+
+ return true;
+}
int kvm_init_nv_sysregs(struct kvm *kvm);
@@ -76,4 +198,11 @@ static inline bool kvm_auth_eretax(struct kvm_vcpu *vcpu, u64 *elr)
}
#endif
+#define KVM_NV_GUEST_MAP_SZ (KVM_PGTABLE_PROT_SW1 | KVM_PGTABLE_PROT_SW0)
+
+static inline u64 kvm_encode_nested_level(struct kvm_s2_trans *trans)
+{
+ return FIELD_PREP(KVM_NV_GUEST_MAP_SZ, trans->level);
+}
+
#endif /* __ARM64_KVM_NESTED_H */
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 1b6e436dbb55..4a9ea103817e 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -654,6 +654,23 @@
#define OP_AT_S12E0W sys_insn(AT_Op0, 4, AT_CRn, 8, 7)
/* TLBI instructions */
+#define TLBI_Op0 1
+
+#define TLBI_Op1_EL1 0 /* Accessible from EL1 or higher */
+#define TLBI_Op1_EL2 4 /* Accessible from EL2 or higher */
+
+#define TLBI_CRn_XS 8 /* Extra Slow (the common one) */
+#define TLBI_CRn_nXS 9 /* not Extra Slow (which nobody uses)*/
+
+#define TLBI_CRm_IPAIS 0 /* S2 Inner-Shareable */
+#define TLBI_CRm_nROS 1 /* non-Range, Outer-Sharable */
+#define TLBI_CRm_RIS 2 /* Range, Inner-Sharable */
+#define TLBI_CRm_nRIS 3 /* non-Range, Inner-Sharable */
+#define TLBI_CRm_IPAONS 4 /* S2 Outer and Non-Shareable */
+#define TLBI_CRm_ROS 5 /* Range, Outer-Sharable */
+#define TLBI_CRm_RNS 6 /* Range, Non-Sharable */
+#define TLBI_CRm_nRNS 7 /* non-Range, Non-Sharable */
+
#define OP_TLBI_VMALLE1OS sys_insn(1, 0, 8, 1, 0)
#define OP_TLBI_VAE1OS sys_insn(1, 0, 8, 1, 1)
#define OP_TLBI_ASIDE1OS sys_insn(1, 0, 8, 1, 2)
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 81496083c041..27de1dddb0ab 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -128,6 +128,7 @@ int main(void)
DEFINE(VCPU_FAULT_DISR, offsetof(struct kvm_vcpu, arch.fault.disr_el1));
DEFINE(VCPU_HCR_EL2, offsetof(struct kvm_vcpu, arch.hcr_el2));
DEFINE(CPU_USER_PT_REGS, offsetof(struct kvm_cpu_context, regs));
+ DEFINE(CPU_ELR_EL2, offsetof(struct kvm_cpu_context, sys_regs[ELR_EL2]));
DEFINE(CPU_RGSR_EL1, offsetof(struct kvm_cpu_context, sys_regs[RGSR_EL1]));
DEFINE(CPU_GCR_EL1, offsetof(struct kvm_cpu_context, sys_regs[GCR_EL1]));
DEFINE(CPU_APIAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APIAKEYLO_EL1]));
diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c
index 64f2ecbdfe5c..024a7b245056 100644
--- a/arch/arm64/kernel/debug-monitors.c
+++ b/arch/arm64/kernel/debug-monitors.c
@@ -312,9 +312,7 @@ static int call_break_hook(struct pt_regs *regs, unsigned long esr)
* entirely not preemptible, and we can use rcu list safely here.
*/
list_for_each_entry_rcu(hook, list, node) {
- unsigned long comment = esr & ESR_ELx_BRK64_ISS_COMMENT_MASK;
-
- if ((comment & ~hook->mask) == hook->imm)
+ if ((esr_brk_comment(esr) & ~hook->mask) == hook->imm)
fn = hook->fn;
}
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 215e6d7f2df8..9e22683aa921 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -1105,8 +1105,6 @@ static struct break_hook ubsan_break_hook = {
};
#endif
-#define esr_comment(esr) ((esr) & ESR_ELx_BRK64_ISS_COMMENT_MASK)
-
/*
* Initial handler for AArch64 BRK exceptions
* This handler only used until debug_traps_init().
@@ -1115,15 +1113,15 @@ int __init early_brk64(unsigned long addr, unsigned long esr,
struct pt_regs *regs)
{
#ifdef CONFIG_CFI_CLANG
- if ((esr_comment(esr) & ~CFI_BRK_IMM_MASK) == CFI_BRK_IMM_BASE)
+ if (esr_is_cfi_brk(esr))
return cfi_handler(regs, esr) != DBG_HOOK_HANDLED;
#endif
#ifdef CONFIG_KASAN_SW_TAGS
- if ((esr_comment(esr) & ~KASAN_BRK_MASK) == KASAN_BRK_IMM)
+ if ((esr_brk_comment(esr) & ~KASAN_BRK_MASK) == KASAN_BRK_IMM)
return kasan_handler(regs, esr) != DBG_HOOK_HANDLED;
#endif
#ifdef CONFIG_UBSAN_TRAP
- if ((esr_comment(esr) & ~UBSAN_BRK_MASK) == UBSAN_BRK_IMM)
+ if ((esr_brk_comment(esr) & ~UBSAN_BRK_MASK) == UBSAN_BRK_IMM)
return ubsan_handler(regs, esr) != DBG_HOOK_HANDLED;
#endif
return bug_handler(regs, esr) != DBG_HOOK_HANDLED;
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 59716789fe0f..a7ca776b51ec 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -48,6 +48,15 @@
static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT;
+enum kvm_wfx_trap_policy {
+ KVM_WFX_NOTRAP_SINGLE_TASK, /* Default option */
+ KVM_WFX_NOTRAP,
+ KVM_WFX_TRAP,
+};
+
+static enum kvm_wfx_trap_policy kvm_wfi_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK;
+static enum kvm_wfx_trap_policy kvm_wfe_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK;
+
DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
@@ -170,6 +179,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
mutex_unlock(&kvm->lock);
#endif
+ kvm_init_nested(kvm);
+
ret = kvm_share_hyp(kvm, kvm + 1);
if (ret)
return ret;
@@ -546,11 +557,32 @@ static void vcpu_set_pauth_traps(struct kvm_vcpu *vcpu)
}
}
+static bool kvm_vcpu_should_clear_twi(struct kvm_vcpu *vcpu)
+{
+ if (unlikely(kvm_wfi_trap_policy != KVM_WFX_NOTRAP_SINGLE_TASK))
+ return kvm_wfi_trap_policy == KVM_WFX_NOTRAP;
+
+ return single_task_running() &&
+ (atomic_read(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count) ||
+ vcpu->kvm->arch.vgic.nassgireq);
+}
+
+static bool kvm_vcpu_should_clear_twe(struct kvm_vcpu *vcpu)
+{
+ if (unlikely(kvm_wfe_trap_policy != KVM_WFX_NOTRAP_SINGLE_TASK))
+ return kvm_wfe_trap_policy == KVM_WFX_NOTRAP;
+
+ return single_task_running();
+}
+
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct kvm_s2_mmu *mmu;
int *last_ran;
+ if (vcpu_has_nv(vcpu))
+ kvm_vcpu_load_hw_mmu(vcpu);
+
mmu = vcpu->arch.hw_mmu;
last_ran = this_cpu_ptr(mmu->last_vcpu_ran);
@@ -579,10 +611,15 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (kvm_arm_is_pvtime_enabled(&vcpu->arch))
kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu);
- if (single_task_running())
- vcpu_clear_wfx_traps(vcpu);
+ if (kvm_vcpu_should_clear_twe(vcpu))
+ vcpu->arch.hcr_el2 &= ~HCR_TWE;
+ else
+ vcpu->arch.hcr_el2 |= HCR_TWE;
+
+ if (kvm_vcpu_should_clear_twi(vcpu))
+ vcpu->arch.hcr_el2 &= ~HCR_TWI;
else
- vcpu_set_wfx_traps(vcpu);
+ vcpu->arch.hcr_el2 |= HCR_TWI;
vcpu_set_pauth_traps(vcpu);
@@ -601,6 +638,8 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
kvm_timer_vcpu_put(vcpu);
kvm_vgic_put(vcpu);
kvm_vcpu_pmu_restore_host(vcpu);
+ if (vcpu_has_nv(vcpu))
+ kvm_vcpu_put_hw_mmu(vcpu);
kvm_arm_vmid_clear_active();
vcpu_clear_on_unsupported_cpu(vcpu);
@@ -797,7 +836,7 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
* This needs to happen after NV has imposed its own restrictions on
* the feature set
*/
- kvm_init_sysreg(vcpu);
+ kvm_calculate_traps(vcpu);
ret = kvm_timer_enable(vcpu);
if (ret)
@@ -1099,7 +1138,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
vcpu_load(vcpu);
- if (run->immediate_exit) {
+ if (!vcpu->wants_to_run) {
ret = -EINTR;
goto out;
}
@@ -1419,11 +1458,6 @@ static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu,
test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, &features))
return -EINVAL;
- /* Disallow NV+SVE for the time being */
- if (test_bit(KVM_ARM_VCPU_HAS_EL2, &features) &&
- test_bit(KVM_ARM_VCPU_SVE, &features))
- return -EINVAL;
-
if (!test_bit(KVM_ARM_VCPU_EL1_32BIT, &features))
return 0;
@@ -1459,6 +1493,10 @@ static int kvm_setup_vcpu(struct kvm_vcpu *vcpu)
if (kvm_vcpu_has_pmu(vcpu) && !kvm->arch.arm_pmu)
ret = kvm_arm_set_default_pmu(kvm);
+ /* Prepare for nested if required */
+ if (!ret && vcpu_has_nv(vcpu))
+ ret = kvm_vcpu_init_nested(vcpu);
+
return ret;
}
@@ -2858,6 +2896,36 @@ static int __init early_kvm_mode_cfg(char *arg)
}
early_param("kvm-arm.mode", early_kvm_mode_cfg);
+static int __init early_kvm_wfx_trap_policy_cfg(char *arg, enum kvm_wfx_trap_policy *p)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (strcmp(arg, "trap") == 0) {
+ *p = KVM_WFX_TRAP;
+ return 0;
+ }
+
+ if (strcmp(arg, "notrap") == 0) {
+ *p = KVM_WFX_NOTRAP;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static int __init early_kvm_wfi_trap_policy_cfg(char *arg)
+{
+ return early_kvm_wfx_trap_policy_cfg(arg, &kvm_wfi_trap_policy);
+}
+early_param("kvm-arm.wfi_trap_policy", early_kvm_wfi_trap_policy_cfg);
+
+static int __init early_kvm_wfe_trap_policy_cfg(char *arg)
+{
+ return early_kvm_wfx_trap_policy_cfg(arg, &kvm_wfe_trap_policy);
+}
+early_param("kvm-arm.wfe_trap_policy", early_kvm_wfe_trap_policy_cfg);
+
enum kvm_mode kvm_get_mode(void)
{
return kvm_mode;
diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c
index 54090967a335..05166eccea0a 100644
--- a/arch/arm64/kvm/emulate-nested.c
+++ b/arch/arm64/kvm/emulate-nested.c
@@ -79,6 +79,12 @@ enum cgt_group_id {
CGT_MDCR_E2TB,
CGT_MDCR_TDCC,
+ CGT_CPACR_E0POE,
+ CGT_CPTR_TAM,
+ CGT_CPTR_TCPAC,
+
+ CGT_HCRX_TCR2En,
+
/*
* Anything after this point is a combination of coarse trap
* controls, which must all be evaluated to decide what to do.
@@ -89,6 +95,7 @@ enum cgt_group_id {
CGT_HCR_TTLB_TTLBIS,
CGT_HCR_TTLB_TTLBOS,
CGT_HCR_TVM_TRVM,
+ CGT_HCR_TVM_TRVM_HCRX_TCR2En,
CGT_HCR_TPU_TICAB,
CGT_HCR_TPU_TOCU,
CGT_HCR_NV1_nNV2_ENSCXT,
@@ -106,6 +113,8 @@ enum cgt_group_id {
CGT_CNTHCTL_EL1PCTEN = __COMPLEX_CONDITIONS__,
CGT_CNTHCTL_EL1PTEN,
+ CGT_CPTR_TTA,
+
/* Must be last */
__NR_CGT_GROUP_IDS__
};
@@ -345,6 +354,30 @@ static const struct trap_bits coarse_trap_bits[] = {
.mask = MDCR_EL2_TDCC,
.behaviour = BEHAVE_FORWARD_ANY,
},
+ [CGT_CPACR_E0POE] = {
+ .index = CPTR_EL2,
+ .value = CPACR_ELx_E0POE,
+ .mask = CPACR_ELx_E0POE,
+ .behaviour = BEHAVE_FORWARD_ANY,
+ },
+ [CGT_CPTR_TAM] = {
+ .index = CPTR_EL2,
+ .value = CPTR_EL2_TAM,
+ .mask = CPTR_EL2_TAM,
+ .behaviour = BEHAVE_FORWARD_ANY,
+ },
+ [CGT_CPTR_TCPAC] = {
+ .index = CPTR_EL2,
+ .value = CPTR_EL2_TCPAC,
+ .mask = CPTR_EL2_TCPAC,
+ .behaviour = BEHAVE_FORWARD_ANY,
+ },
+ [CGT_HCRX_TCR2En] = {
+ .index = HCRX_EL2,
+ .value = 0,
+ .mask = HCRX_EL2_TCR2En,
+ .behaviour = BEHAVE_FORWARD_ANY,
+ },
};
#define MCB(id, ...) \
@@ -359,6 +392,8 @@ static const enum cgt_group_id *coarse_control_combo[] = {
MCB(CGT_HCR_TTLB_TTLBIS, CGT_HCR_TTLB, CGT_HCR_TTLBIS),
MCB(CGT_HCR_TTLB_TTLBOS, CGT_HCR_TTLB, CGT_HCR_TTLBOS),
MCB(CGT_HCR_TVM_TRVM, CGT_HCR_TVM, CGT_HCR_TRVM),
+ MCB(CGT_HCR_TVM_TRVM_HCRX_TCR2En,
+ CGT_HCR_TVM, CGT_HCR_TRVM, CGT_HCRX_TCR2En),
MCB(CGT_HCR_TPU_TICAB, CGT_HCR_TPU, CGT_HCR_TICAB),
MCB(CGT_HCR_TPU_TOCU, CGT_HCR_TPU, CGT_HCR_TOCU),
MCB(CGT_HCR_NV1_nNV2_ENSCXT, CGT_HCR_NV1_nNV2, CGT_HCR_ENSCXT),
@@ -410,12 +445,26 @@ static enum trap_behaviour check_cnthctl_el1pten(struct kvm_vcpu *vcpu)
return BEHAVE_FORWARD_ANY;
}
+static enum trap_behaviour check_cptr_tta(struct kvm_vcpu *vcpu)
+{
+ u64 val = __vcpu_sys_reg(vcpu, CPTR_EL2);
+
+ if (!vcpu_el2_e2h_is_set(vcpu))
+ val = translate_cptr_el2_to_cpacr_el1(val);
+
+ if (val & CPACR_ELx_TTA)
+ return BEHAVE_FORWARD_ANY;
+
+ return BEHAVE_HANDLE_LOCALLY;
+}
+
#define CCC(id, fn) \
[id - __COMPLEX_CONDITIONS__] = fn
static const complex_condition_check ccc[] = {
CCC(CGT_CNTHCTL_EL1PCTEN, check_cnthctl_el1pcten),
CCC(CGT_CNTHCTL_EL1PTEN, check_cnthctl_el1pten),
+ CCC(CGT_CPTR_TTA, check_cptr_tta),
};
/*
@@ -622,6 +671,7 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = {
SR_TRAP(SYS_MAIR_EL1, CGT_HCR_TVM_TRVM),
SR_TRAP(SYS_AMAIR_EL1, CGT_HCR_TVM_TRVM),
SR_TRAP(SYS_CONTEXTIDR_EL1, CGT_HCR_TVM_TRVM),
+ SR_TRAP(SYS_TCR2_EL1, CGT_HCR_TVM_TRVM_HCRX_TCR2En),
SR_TRAP(SYS_DC_ZVA, CGT_HCR_TDZ),
SR_TRAP(SYS_DC_GVA, CGT_HCR_TDZ),
SR_TRAP(SYS_DC_GZVA, CGT_HCR_TDZ),
@@ -1000,6 +1050,59 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = {
SR_TRAP(SYS_TRBPTR_EL1, CGT_MDCR_E2TB),
SR_TRAP(SYS_TRBSR_EL1, CGT_MDCR_E2TB),
SR_TRAP(SYS_TRBTRG_EL1, CGT_MDCR_E2TB),
+ SR_TRAP(SYS_CPACR_EL1, CGT_CPTR_TCPAC),
+ SR_TRAP(SYS_AMUSERENR_EL0, CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMCFGR_EL0, CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMCGCR_EL0, CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMCNTENCLR0_EL0, CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMCNTENCLR1_EL0, CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMCNTENSET0_EL0, CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMCNTENSET1_EL0, CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMCR_EL0, CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR0_EL0(0), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR0_EL0(1), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR0_EL0(2), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR0_EL0(3), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(0), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(1), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(2), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(3), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(4), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(5), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(6), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(7), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(8), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(9), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(10), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(11), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(12), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(13), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(14), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVCNTR1_EL0(15), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER0_EL0(0), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER0_EL0(1), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER0_EL0(2), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER0_EL0(3), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(0), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(1), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(2), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(3), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(4), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(5), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(6), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(7), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(8), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(9), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(10), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(11), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(12), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(13), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(14), CGT_CPTR_TAM),
+ SR_TRAP(SYS_AMEVTYPER1_EL0(15), CGT_CPTR_TAM),
+ SR_TRAP(SYS_POR_EL0, CGT_CPACR_E0POE),
+ /* op0=2, op1=1, and CRn<0b1000 */
+ SR_RANGE_TRAP(sys_reg(2, 1, 0, 0, 0),
+ sys_reg(2, 1, 7, 15, 7), CGT_CPTR_TTA),
SR_TRAP(SYS_CNTP_TVAL_EL0, CGT_CNTHCTL_EL1PTEN),
SR_TRAP(SYS_CNTP_CVAL_EL0, CGT_CNTHCTL_EL1PTEN),
SR_TRAP(SYS_CNTP_CTL_EL0, CGT_CNTHCTL_EL1PTEN),
@@ -1071,6 +1174,7 @@ static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = {
SR_FGT(SYS_TPIDRRO_EL0, HFGxTR, TPIDRRO_EL0, 1),
SR_FGT(SYS_TPIDR_EL1, HFGxTR, TPIDR_EL1, 1),
SR_FGT(SYS_TCR_EL1, HFGxTR, TCR_EL1, 1),
+ SR_FGT(SYS_TCR2_EL1, HFGxTR, TCR_EL1, 1),
SR_FGT(SYS_SCXTNUM_EL0, HFGxTR, SCXTNUM_EL0, 1),
SR_FGT(SYS_SCXTNUM_EL1, HFGxTR, SCXTNUM_EL1, 1),
SR_FGT(SYS_SCTLR_EL1, HFGxTR, SCTLR_EL1, 1),
diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c
index 521b32868d0d..c53e5b14038d 100644
--- a/arch/arm64/kvm/fpsimd.c
+++ b/arch/arm64/kvm/fpsimd.c
@@ -178,7 +178,13 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu)
if (guest_owns_fp_regs()) {
if (vcpu_has_sve(vcpu)) {
- __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR);
+ u64 zcr = read_sysreg_el1(SYS_ZCR);
+
+ /*
+ * If the vCPU is in the hyp context then ZCR_EL1 is
+ * loaded with its vEL2 counterpart.
+ */
+ __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)) = zcr;
/*
* Restore the VL that was saved when bound to the CPU,
@@ -189,11 +195,14 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu)
* Note that this means that at guest exit ZCR_EL1 is
* not necessarily the same as on guest entry.
*
- * Restoring the VL isn't needed in VHE mode since
- * ZCR_EL2 (accessed via ZCR_EL1) would fulfill the same
- * role when doing the save from EL2.
+ * ZCR_EL2 holds the guest hypervisor's VL when running
+ * a nested guest, which could be smaller than the
+ * max for the vCPU. Similar to above, we first need to
+ * switch to a VL consistent with the layout of the
+ * vCPU's SVE state. KVM support for NV implies VHE, so
+ * using the ZCR_EL1 alias is safe.
*/
- if (!has_vhe())
+ if (!has_vhe() || (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)))
sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1,
SYS_ZCR_EL1);
}
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index b037f0a0e27e..d7c2990e7c9e 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -94,11 +94,19 @@ static int handle_smc(struct kvm_vcpu *vcpu)
}
/*
- * Guest access to FP/ASIMD registers are routed to this handler only
- * when the system doesn't support FP/ASIMD.
+ * This handles the cases where the system does not support FP/ASIMD or when
+ * we are running nested virtualization and the guest hypervisor is trapping
+ * FP/ASIMD accesses by its guest guest.
+ *
+ * All other handling of guest vs. host FP/ASIMD register state is handled in
+ * fixup_guest_exit().
*/
-static int handle_no_fpsimd(struct kvm_vcpu *vcpu)
+static int kvm_handle_fpasimd(struct kvm_vcpu *vcpu)
{
+ if (guest_hyp_fpsimd_traps_enabled(vcpu))
+ return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
+
+ /* This is the case when the system doesn't support FP/ASIMD. */
kvm_inject_undefined(vcpu);
return 1;
}
@@ -209,6 +217,9 @@ static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu)
*/
static int handle_sve(struct kvm_vcpu *vcpu)
{
+ if (guest_hyp_sve_traps_enabled(vcpu))
+ return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
+
kvm_inject_undefined(vcpu);
return 1;
}
@@ -304,7 +315,7 @@ static exit_handle_fn arm_exit_handlers[] = {
[ESR_ELx_EC_BREAKPT_LOW]= kvm_handle_guest_debug,
[ESR_ELx_EC_BKPT32] = kvm_handle_guest_debug,
[ESR_ELx_EC_BRK64] = kvm_handle_guest_debug,
- [ESR_ELx_EC_FP_ASIMD] = handle_no_fpsimd,
+ [ESR_ELx_EC_FP_ASIMD] = kvm_handle_fpasimd,
[ESR_ELx_EC_PAC] = kvm_handle_ptrauth,
};
@@ -411,6 +422,20 @@ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index)
kvm_handle_guest_serror(vcpu, kvm_vcpu_get_esr(vcpu));
}
+static void print_nvhe_hyp_panic(const char *name, u64 panic_addr)
+{
+ kvm_err("nVHE hyp %s at: [<%016llx>] %pB!\n", name, panic_addr,
+ (void *)(panic_addr + kaslr_offset()));
+}
+
+static void kvm_nvhe_report_cfi_failure(u64 panic_addr)
+{
+ print_nvhe_hyp_panic("CFI failure", panic_addr);
+
+ if (IS_ENABLED(CONFIG_CFI_PERMISSIVE))
+ kvm_err(" (CONFIG_CFI_PERMISSIVE ignored for hyp failures)\n");
+}
+
void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
u64 elr_virt, u64 elr_phys,
u64 par, uintptr_t vcpu,
@@ -423,7 +448,7 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
if (mode != PSR_MODE_EL2t && mode != PSR_MODE_EL2h) {
kvm_err("Invalid host exception to nVHE hyp!\n");
} else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 &&
- (esr & ESR_ELx_BRK64_ISS_COMMENT_MASK) == BUG_BRK_IMM) {
+ esr_brk_comment(esr) == BUG_BRK_IMM) {
const char *file = NULL;
unsigned int line = 0;
@@ -439,11 +464,11 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
if (file)
kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line);
else
- kvm_err("nVHE hyp BUG at: [<%016llx>] %pB!\n", panic_addr,
- (void *)(panic_addr + kaslr_offset()));
+ print_nvhe_hyp_panic("BUG", panic_addr);
+ } else if (IS_ENABLED(CONFIG_CFI_CLANG) && esr_is_cfi_brk(esr)) {
+ kvm_nvhe_report_cfi_failure(panic_addr);
} else {
- kvm_err("nVHE hyp panic at: [<%016llx>] %pB!\n", panic_addr,
- (void *)(panic_addr + kaslr_offset()));
+ print_nvhe_hyp_panic("panic", panic_addr);
}
/* Dump the nVHE hypervisor backtrace */
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index f3aa7738b477..4433a234aa9b 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -83,6 +83,14 @@ alternative_else_nop_endif
eret
sb
+SYM_INNER_LABEL(__guest_exit_restore_elr_and_panic, SYM_L_GLOBAL)
+ // x2-x29,lr: vcpu regs
+ // vcpu x0-x1 on the stack
+
+ adr_this_cpu x0, kvm_hyp_ctxt, x1
+ ldr x0, [x0, #CPU_ELR_EL2]
+ msr elr_el2, x0
+
SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL)
// x2-x29,lr: vcpu regs
// vcpu x0-x1 on the stack
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 0c4de44534b7..f59ccfe11ab9 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -314,11 +314,24 @@ static bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code)
static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu)
{
+ /*
+ * The vCPU's saved SVE state layout always matches the max VL of the
+ * vCPU. Start off with the max VL so we can load the SVE state.
+ */
sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2);
__sve_restore_state(vcpu_sve_pffr(vcpu),
&vcpu->arch.ctxt.fp_regs.fpsr,
true);
- write_sysreg_el1(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR);
+
+ /*
+ * The effective VL for a VM could differ from the max VL when running a
+ * nested guest, as the guest hypervisor could select a smaller VL. Slap
+ * that into hardware before wrapping up.
+ */
+ if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))
+ sve_cond_update_zcr_vq(__vcpu_sys_reg(vcpu, ZCR_EL2), SYS_ZCR_EL2);
+
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)), SYS_ZCR);
}
static inline void __hyp_sve_save_host(void)
@@ -354,10 +367,19 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
/* Only handle traps the vCPU can support here: */
switch (esr_ec) {
case ESR_ELx_EC_FP_ASIMD:
+ /* Forward traps to the guest hypervisor as required */
+ if (guest_hyp_fpsimd_traps_enabled(vcpu))
+ return false;
break;
+ case ESR_ELx_EC_SYS64:
+ if (WARN_ON_ONCE(!is_hyp_ctxt(vcpu)))
+ return false;
+ fallthrough;
case ESR_ELx_EC_SVE:
if (!sve_guest)
return false;
+ if (guest_hyp_sve_traps_enabled(vcpu))
+ return false;
break;
default:
return false;
@@ -693,7 +715,7 @@ guest:
static inline void __kvm_unexpected_el2_exception(void)
{
- extern char __guest_exit_panic[];
+ extern char __guest_exit_restore_elr_and_panic[];
unsigned long addr, fixup;
struct kvm_exception_table_entry *entry, *end;
unsigned long elr_el2 = read_sysreg(elr_el2);
@@ -715,7 +737,8 @@ static inline void __kvm_unexpected_el2_exception(void)
}
/* Trigger a panic after restoring the hyp context. */
- write_sysreg(__guest_exit_panic, elr_el2);
+ this_cpu_ptr(&kvm_hyp_ctxt)->sys_regs[ELR_EL2] = elr_el2;
+ write_sysreg(__guest_exit_restore_elr_and_panic, elr_el2);
}
#endif /* __ARM64_KVM_HYP_SWITCH_H__ */
diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
index 4be6a7fa0070..4c0fdabaf8ae 100644
--- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
+++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
@@ -55,6 +55,17 @@ static inline bool ctxt_has_s1pie(struct kvm_cpu_context *ctxt)
return kvm_has_feat(kern_hyp_va(vcpu->kvm), ID_AA64MMFR3_EL1, S1PIE, IMP);
}
+static inline bool ctxt_has_tcrx(struct kvm_cpu_context *ctxt)
+{
+ struct kvm_vcpu *vcpu;
+
+ if (!cpus_have_final_cap(ARM64_HAS_TCR2))
+ return false;
+
+ vcpu = ctxt_to_vcpu(ctxt);
+ return kvm_has_feat(kern_hyp_va(vcpu->kvm), ID_AA64MMFR3_EL1, TCRX, IMP);
+}
+
static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
{
ctxt_sys_reg(ctxt, SCTLR_EL1) = read_sysreg_el1(SYS_SCTLR);
@@ -62,8 +73,14 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
ctxt_sys_reg(ctxt, TTBR0_EL1) = read_sysreg_el1(SYS_TTBR0);
ctxt_sys_reg(ctxt, TTBR1_EL1) = read_sysreg_el1(SYS_TTBR1);
ctxt_sys_reg(ctxt, TCR_EL1) = read_sysreg_el1(SYS_TCR);
- if (cpus_have_final_cap(ARM64_HAS_TCR2))
+ if (ctxt_has_tcrx(ctxt)) {
ctxt_sys_reg(ctxt, TCR2_EL1) = read_sysreg_el1(SYS_TCR2);
+
+ if (ctxt_has_s1pie(ctxt)) {
+ ctxt_sys_reg(ctxt, PIR_EL1) = read_sysreg_el1(SYS_PIR);
+ ctxt_sys_reg(ctxt, PIRE0_EL1) = read_sysreg_el1(SYS_PIRE0);
+ }
+ }
ctxt_sys_reg(ctxt, ESR_EL1) = read_sysreg_el1(SYS_ESR);
ctxt_sys_reg(ctxt, AFSR0_EL1) = read_sysreg_el1(SYS_AFSR0);
ctxt_sys_reg(ctxt, AFSR1_EL1) = read_sysreg_el1(SYS_AFSR1);
@@ -73,10 +90,6 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
ctxt_sys_reg(ctxt, CONTEXTIDR_EL1) = read_sysreg_el1(SYS_CONTEXTIDR);
ctxt_sys_reg(ctxt, AMAIR_EL1) = read_sysreg_el1(SYS_AMAIR);
ctxt_sys_reg(ctxt, CNTKCTL_EL1) = read_sysreg_el1(SYS_CNTKCTL);
- if (ctxt_has_s1pie(ctxt)) {
- ctxt_sys_reg(ctxt, PIR_EL1) = read_sysreg_el1(SYS_PIR);
- ctxt_sys_reg(ctxt, PIRE0_EL1) = read_sysreg_el1(SYS_PIRE0);
- }
ctxt_sys_reg(ctxt, PAR_EL1) = read_sysreg_par();
ctxt_sys_reg(ctxt, TPIDR_EL1) = read_sysreg(tpidr_el1);
@@ -138,8 +151,14 @@ static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
write_sysreg_el1(ctxt_sys_reg(ctxt, CPACR_EL1), SYS_CPACR);
write_sysreg_el1(ctxt_sys_reg(ctxt, TTBR0_EL1), SYS_TTBR0);
write_sysreg_el1(ctxt_sys_reg(ctxt, TTBR1_EL1), SYS_TTBR1);
- if (cpus_have_final_cap(ARM64_HAS_TCR2))
+ if (ctxt_has_tcrx(ctxt)) {
write_sysreg_el1(ctxt_sys_reg(ctxt, TCR2_EL1), SYS_TCR2);
+
+ if (ctxt_has_s1pie(ctxt)) {
+ write_sysreg_el1(ctxt_sys_reg(ctxt, PIR_EL1), SYS_PIR);
+ write_sysreg_el1(ctxt_sys_reg(ctxt, PIRE0_EL1), SYS_PIRE0);
+ }
+ }
write_sysreg_el1(ctxt_sys_reg(ctxt, ESR_EL1), SYS_ESR);
write_sysreg_el1(ctxt_sys_reg(ctxt, AFSR0_EL1), SYS_AFSR0);
write_sysreg_el1(ctxt_sys_reg(ctxt, AFSR1_EL1), SYS_AFSR1);
@@ -149,10 +168,6 @@ static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
write_sysreg_el1(ctxt_sys_reg(ctxt, CONTEXTIDR_EL1), SYS_CONTEXTIDR);
write_sysreg_el1(ctxt_sys_reg(ctxt, AMAIR_EL1), SYS_AMAIR);
write_sysreg_el1(ctxt_sys_reg(ctxt, CNTKCTL_EL1), SYS_CNTKCTL);
- if (ctxt_has_s1pie(ctxt)) {
- write_sysreg_el1(ctxt_sys_reg(ctxt, PIR_EL1), SYS_PIR);
- write_sysreg_el1(ctxt_sys_reg(ctxt, PIRE0_EL1), SYS_PIRE0);
- }
write_sysreg(ctxt_sys_reg(ctxt, PAR_EL1), par_el1);
write_sysreg(ctxt_sys_reg(ctxt, TPIDR_EL1), tpidr_el1);
diff --git a/arch/arm64/kvm/hyp/include/nvhe/ffa.h b/arch/arm64/kvm/hyp/include/nvhe/ffa.h
index d9fd5e6c7d3c..146e0aebfa1c 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/ffa.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/ffa.h
@@ -9,7 +9,7 @@
#include <asm/kvm_host.h>
#define FFA_MIN_FUNC_NUM 0x60
-#define FFA_MAX_FUNC_NUM 0x7F
+#define FFA_MAX_FUNC_NUM 0xFF
int hyp_ffa_init(void *pages);
bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id);
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index 50fa0ffb6b7e..782b34b004be 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -89,9 +89,9 @@ quiet_cmd_hyprel = HYPREL $@
quiet_cmd_hypcopy = HYPCOPY $@
cmd_hypcopy = $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ $< $@
-# Remove ftrace, Shadow Call Stack, and CFI CFLAGS.
-# This is equivalent to the 'notrace', '__noscs', and '__nocfi' annotations.
-KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) $(CC_FLAGS_CFI), $(KBUILD_CFLAGS))
+# Remove ftrace and Shadow Call Stack CFLAGS.
+# This is equivalent to the 'notrace' and '__noscs' annotations.
+KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS), $(KBUILD_CFLAGS))
# Starting from 13.0.0 llvm emits SHT_REL section '.llvm.call-graph-profile'
# when profile optimization is applied. gen-hyprel does not support SHT_REL and
# causes a build failure. Remove profile optimization flags.
diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c
index efb053af331c..e715c157c2c4 100644
--- a/arch/arm64/kvm/hyp/nvhe/ffa.c
+++ b/arch/arm64/kvm/hyp/nvhe/ffa.c
@@ -67,6 +67,9 @@ struct kvm_ffa_buffers {
*/
static struct kvm_ffa_buffers hyp_buffers;
static struct kvm_ffa_buffers host_buffers;
+static u32 hyp_ffa_version;
+static bool has_version_negotiated;
+static hyp_spinlock_t version_lock;
static void ffa_to_smccc_error(struct arm_smccc_res *res, u64 ffa_errno)
{
@@ -462,7 +465,7 @@ static __always_inline void do_ffa_mem_xfer(const u64 func_id,
memcpy(buf, host_buffers.tx, fraglen);
ep_mem_access = (void *)buf +
- ffa_mem_desc_offset(buf, 0, FFA_VERSION_1_0);
+ ffa_mem_desc_offset(buf, 0, hyp_ffa_version);
offset = ep_mem_access->composite_off;
if (!offset || buf->ep_count != 1 || buf->sender_id != HOST_FFA_ID) {
ret = FFA_RET_INVALID_PARAMETERS;
@@ -541,7 +544,7 @@ static void do_ffa_mem_reclaim(struct arm_smccc_res *res,
fraglen = res->a2;
ep_mem_access = (void *)buf +
- ffa_mem_desc_offset(buf, 0, FFA_VERSION_1_0);
+ ffa_mem_desc_offset(buf, 0, hyp_ffa_version);
offset = ep_mem_access->composite_off;
/*
* We can trust the SPMD to get this right, but let's at least
@@ -651,6 +654,132 @@ out_handled:
return true;
}
+static int hyp_ffa_post_init(void)
+{
+ size_t min_rxtx_sz;
+ struct arm_smccc_res res;
+
+ arm_smccc_1_1_smc(FFA_ID_GET, 0, 0, 0, 0, 0, 0, 0, &res);
+ if (res.a0 != FFA_SUCCESS)
+ return -EOPNOTSUPP;
+
+ if (res.a2 != HOST_FFA_ID)
+ return -EINVAL;
+
+ arm_smccc_1_1_smc(FFA_FEATURES, FFA_FN64_RXTX_MAP,
+ 0, 0, 0, 0, 0, 0, &res);
+ if (res.a0 != FFA_SUCCESS)
+ return -EOPNOTSUPP;
+
+ switch (res.a2) {
+ case FFA_FEAT_RXTX_MIN_SZ_4K:
+ min_rxtx_sz = SZ_4K;
+ break;
+ case FFA_FEAT_RXTX_MIN_SZ_16K:
+ min_rxtx_sz = SZ_16K;
+ break;
+ case FFA_FEAT_RXTX_MIN_SZ_64K:
+ min_rxtx_sz = SZ_64K;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (min_rxtx_sz > PAGE_SIZE)
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+
+static void do_ffa_version(struct arm_smccc_res *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(u32, ffa_req_version, ctxt, 1);
+
+ if (FFA_MAJOR_VERSION(ffa_req_version) != 1) {
+ res->a0 = FFA_RET_NOT_SUPPORTED;
+ return;
+ }
+
+ hyp_spin_lock(&version_lock);
+ if (has_version_negotiated) {
+ res->a0 = hyp_ffa_version;
+ goto unlock;
+ }
+
+ /*
+ * If the client driver tries to downgrade the version, we need to ask
+ * first if TEE supports it.
+ */
+ if (FFA_MINOR_VERSION(ffa_req_version) < FFA_MINOR_VERSION(hyp_ffa_version)) {
+ arm_smccc_1_1_smc(FFA_VERSION, ffa_req_version, 0,
+ 0, 0, 0, 0, 0,
+ res);
+ if (res->a0 == FFA_RET_NOT_SUPPORTED)
+ goto unlock;
+
+ hyp_ffa_version = ffa_req_version;
+ }
+
+ if (hyp_ffa_post_init())
+ res->a0 = FFA_RET_NOT_SUPPORTED;
+ else {
+ has_version_negotiated = true;
+ res->a0 = hyp_ffa_version;
+ }
+unlock:
+ hyp_spin_unlock(&version_lock);
+}
+
+static void do_ffa_part_get(struct arm_smccc_res *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(u32, uuid0, ctxt, 1);
+ DECLARE_REG(u32, uuid1, ctxt, 2);
+ DECLARE_REG(u32, uuid2, ctxt, 3);
+ DECLARE_REG(u32, uuid3, ctxt, 4);
+ DECLARE_REG(u32, flags, ctxt, 5);
+ u32 count, partition_sz, copy_sz;
+
+ hyp_spin_lock(&host_buffers.lock);
+ if (!host_buffers.rx) {
+ ffa_to_smccc_res(res, FFA_RET_BUSY);
+ goto out_unlock;
+ }
+
+ arm_smccc_1_1_smc(FFA_PARTITION_INFO_GET, uuid0, uuid1,
+ uuid2, uuid3, flags, 0, 0,
+ res);
+
+ if (res->a0 != FFA_SUCCESS)
+ goto out_unlock;
+
+ count = res->a2;
+ if (!count)
+ goto out_unlock;
+
+ if (hyp_ffa_version > FFA_VERSION_1_0) {
+ /* Get the number of partitions deployed in the system */
+ if (flags & 0x1)
+ goto out_unlock;
+
+ partition_sz = res->a3;
+ } else {
+ /* FFA_VERSION_1_0 lacks the size in the response */
+ partition_sz = FFA_1_0_PARTITON_INFO_SZ;
+ }
+
+ copy_sz = partition_sz * count;
+ if (copy_sz > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) {
+ ffa_to_smccc_res(res, FFA_RET_ABORTED);
+ goto out_unlock;
+ }
+
+ memcpy(host_buffers.rx, hyp_buffers.rx, copy_sz);
+out_unlock:
+ hyp_spin_unlock(&host_buffers.lock);
+}
+
bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id)
{
struct arm_smccc_res res;
@@ -671,6 +800,11 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id)
if (!is_ffa_call(func_id))
return false;
+ if (!has_version_negotiated && func_id != FFA_VERSION) {
+ ffa_to_smccc_error(&res, FFA_RET_INVALID_PARAMETERS);
+ goto out_handled;
+ }
+
switch (func_id) {
case FFA_FEATURES:
if (!do_ffa_features(&res, host_ctxt))
@@ -697,6 +831,12 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id)
case FFA_MEM_FRAG_TX:
do_ffa_mem_frag_tx(&res, host_ctxt);
goto out_handled;
+ case FFA_VERSION:
+ do_ffa_version(&res, host_ctxt);
+ goto out_handled;
+ case FFA_PARTITION_INFO_GET:
+ do_ffa_part_get(&res, host_ctxt);
+ goto out_handled;
}
if (ffa_call_supported(func_id))
@@ -711,13 +851,12 @@ out_handled:
int hyp_ffa_init(void *pages)
{
struct arm_smccc_res res;
- size_t min_rxtx_sz;
void *tx, *rx;
if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_2)
return 0;
- arm_smccc_1_1_smc(FFA_VERSION, FFA_VERSION_1_0, 0, 0, 0, 0, 0, 0, &res);
+ arm_smccc_1_1_smc(FFA_VERSION, FFA_VERSION_1_1, 0, 0, 0, 0, 0, 0, &res);
if (res.a0 == FFA_RET_NOT_SUPPORTED)
return 0;
@@ -737,34 +876,10 @@ int hyp_ffa_init(void *pages)
if (FFA_MAJOR_VERSION(res.a0) != 1)
return -EOPNOTSUPP;
- arm_smccc_1_1_smc(FFA_ID_GET, 0, 0, 0, 0, 0, 0, 0, &res);
- if (res.a0 != FFA_SUCCESS)
- return -EOPNOTSUPP;
-
- if (res.a2 != HOST_FFA_ID)
- return -EINVAL;
-
- arm_smccc_1_1_smc(FFA_FEATURES, FFA_FN64_RXTX_MAP,
- 0, 0, 0, 0, 0, 0, &res);
- if (res.a0 != FFA_SUCCESS)
- return -EOPNOTSUPP;
-
- switch (res.a2) {
- case FFA_FEAT_RXTX_MIN_SZ_4K:
- min_rxtx_sz = SZ_4K;
- break;
- case FFA_FEAT_RXTX_MIN_SZ_16K:
- min_rxtx_sz = SZ_16K;
- break;
- case FFA_FEAT_RXTX_MIN_SZ_64K:
- min_rxtx_sz = SZ_64K;
- break;
- default:
- return -EINVAL;
- }
-
- if (min_rxtx_sz > PAGE_SIZE)
- return -EOPNOTSUPP;
+ if (FFA_MINOR_VERSION(res.a0) < FFA_MINOR_VERSION(FFA_VERSION_1_1))
+ hyp_ffa_version = res.a0;
+ else
+ hyp_ffa_version = FFA_VERSION_1_1;
tx = pages;
pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE;
@@ -787,5 +902,6 @@ int hyp_ffa_init(void *pages)
.lock = __HYP_SPIN_LOCK_UNLOCKED,
};
+ version_lock = __HYP_SPIN_LOCK_UNLOCKED;
return 0;
}
diff --git a/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c b/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c
index 6bc88a756cb7..b63f4e1c1033 100644
--- a/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c
+++ b/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c
@@ -50,6 +50,9 @@
#ifndef R_AARCH64_ABS64
#define R_AARCH64_ABS64 257
#endif
+#ifndef R_AARCH64_ABS32
+#define R_AARCH64_ABS32 258
+#endif
#ifndef R_AARCH64_PREL64
#define R_AARCH64_PREL64 260
#endif
@@ -383,6 +386,9 @@ static void emit_rela_section(Elf64_Shdr *sh_rela)
case R_AARCH64_ABS64:
emit_rela_abs64(rela, sh_orig_name);
break;
+ /* Allow 32-bit absolute relocation, for kCFI type hashes. */
+ case R_AARCH64_ABS32:
+ break;
/* Allow position-relative data relocations. */
case R_AARCH64_PREL64:
case R_AARCH64_PREL32:
diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
index 135cfb294ee5..3d610fc51f4d 100644
--- a/arch/arm64/kvm/hyp/nvhe/host.S
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -197,12 +197,6 @@ SYM_FUNC_END(__host_hvc)
sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0
sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp
- /* If a guest is loaded, panic out of it. */
- stp x0, x1, [sp, #-16]!
- get_loaded_vcpu x0, x1
- cbnz x0, __guest_exit_panic
- add sp, sp, #16
-
/*
* The panic may not be clean if the exception is taken before the host
* context has been saved by __host_exit or after the hyp context has
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index 2994878d68ea..07120b37da35 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -5,6 +5,7 @@
*/
#include <linux/arm-smccc.h>
+#include <linux/cfi_types.h>
#include <linux/linkage.h>
#include <asm/alternative.h>
@@ -265,33 +266,38 @@ alternative_else_nop_endif
SYM_CODE_END(__kvm_handle_stub_hvc)
-SYM_FUNC_START(__pkvm_init_switch_pgd)
+/*
+ * void __pkvm_init_switch_pgd(phys_addr_t pgd, unsigned long sp,
+ * void (*fn)(void));
+ *
+ * SYM_TYPED_FUNC_START() allows C to call this ID-mapped function indirectly
+ * using a physical pointer without triggering a kCFI failure.
+ */
+SYM_TYPED_FUNC_START(__pkvm_init_switch_pgd)
/* Turn the MMU off */
pre_disable_mmu_workaround
- mrs x2, sctlr_el2
- bic x3, x2, #SCTLR_ELx_M
- msr sctlr_el2, x3
+ mrs x3, sctlr_el2
+ bic x4, x3, #SCTLR_ELx_M
+ msr sctlr_el2, x4
isb
tlbi alle2
/* Install the new pgtables */
- ldr x3, [x0, #NVHE_INIT_PGD_PA]
- phys_to_ttbr x4, x3
+ phys_to_ttbr x5, x0
alternative_if ARM64_HAS_CNP
- orr x4, x4, #TTBR_CNP_BIT
+ orr x5, x5, #TTBR_CNP_BIT
alternative_else_nop_endif
- msr ttbr0_el2, x4
+ msr ttbr0_el2, x5
/* Set the new stack pointer */
- ldr x0, [x0, #NVHE_INIT_STACK_HYP_VA]
- mov sp, x0
+ mov sp, x1
/* And turn the MMU back on! */
dsb nsh
isb
- set_sctlr_el2 x2
- ret x1
+ set_sctlr_el2 x3
+ ret x2
SYM_FUNC_END(__pkvm_init_switch_pgd)
.popsection
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
index f4350ba07b0b..174007f3fadd 100644
--- a/arch/arm64/kvm/hyp/nvhe/setup.c
+++ b/arch/arm64/kvm/hyp/nvhe/setup.c
@@ -339,7 +339,7 @@ int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus,
{
struct kvm_nvhe_init_params *params;
void *virt = hyp_phys_to_virt(phys);
- void (*fn)(phys_addr_t params_pa, void *finalize_fn_va);
+ typeof(__pkvm_init_switch_pgd) *fn;
int ret;
BUG_ON(kvm_check_pvm_sysreg_table());
@@ -363,7 +363,7 @@ int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus,
/* Jump in the idmap page to switch to the new page-tables */
params = this_cpu_ptr(&kvm_init_params);
fn = (typeof(fn))__hyp_pa(__pkvm_init_switch_pgd);
- fn(__hyp_pa(params), __pkvm_init_finalise);
+ fn(params->pgd_pa, params->stack_hyp_va, __pkvm_init_finalise);
unreachable();
}
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index 8fbb6a2e0559..77010b76c150 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -65,6 +65,77 @@ static u64 __compute_hcr(struct kvm_vcpu *vcpu)
return hcr | (__vcpu_sys_reg(vcpu, HCR_EL2) & ~NV_HCR_GUEST_EXCLUDE);
}
+static void __activate_cptr_traps(struct kvm_vcpu *vcpu)
+{
+ u64 cptr;
+
+ /*
+ * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to
+ * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2,
+ * except for some missing controls, such as TAM.
+ * In this case, CPTR_EL2.TAM has the same position with or without
+ * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM
+ * shift value for trapping the AMU accesses.
+ */
+ u64 val = CPACR_ELx_TTA | CPTR_EL2_TAM;
+
+ if (guest_owns_fp_regs()) {
+ val |= CPACR_ELx_FPEN;
+ if (vcpu_has_sve(vcpu))
+ val |= CPACR_ELx_ZEN;
+ } else {
+ __activate_traps_fpsimd32(vcpu);
+ }
+
+ if (!vcpu_has_nv(vcpu))
+ goto write;
+
+ /*
+ * The architecture is a bit crap (what a surprise): an EL2 guest
+ * writing to CPTR_EL2 via CPACR_EL1 can't set any of TCPAC or TTA,
+ * as they are RES0 in the guest's view. To work around it, trap the
+ * sucker using the very same bit it can't set...
+ */
+ if (vcpu_el2_e2h_is_set(vcpu) && is_hyp_ctxt(vcpu))
+ val |= CPTR_EL2_TCPAC;
+
+ /*
+ * Layer the guest hypervisor's trap configuration on top of our own if
+ * we're in a nested context.
+ */
+ if (is_hyp_ctxt(vcpu))
+ goto write;
+
+ cptr = vcpu_sanitised_cptr_el2(vcpu);
+
+ /*
+ * Pay attention, there's some interesting detail here.
+ *
+ * The CPTR_EL2.xEN fields are 2 bits wide, although there are only two
+ * meaningful trap states when HCR_EL2.TGE = 0 (running a nested guest):
+ *
+ * - CPTR_EL2.xEN = x0, traps are enabled
+ * - CPTR_EL2.xEN = x1, traps are disabled
+ *
+ * In other words, bit[0] determines if guest accesses trap or not. In
+ * the interest of simplicity, clear the entire field if the guest
+ * hypervisor has traps enabled to dispel any illusion of something more
+ * complicated taking place.
+ */
+ if (!(SYS_FIELD_GET(CPACR_ELx, FPEN, cptr) & BIT(0)))
+ val &= ~CPACR_ELx_FPEN;
+ if (!(SYS_FIELD_GET(CPACR_ELx, ZEN, cptr) & BIT(0)))
+ val &= ~CPACR_ELx_ZEN;
+
+ if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP))
+ val |= cptr & CPACR_ELx_E0POE;
+
+ val |= cptr & CPTR_EL2_TCPAC;
+
+write:
+ write_sysreg(val, cpacr_el1);
+}
+
static void __activate_traps(struct kvm_vcpu *vcpu)
{
u64 val;
@@ -91,30 +162,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
}
}
- val = read_sysreg(cpacr_el1);
- val |= CPACR_ELx_TTA;
- val &= ~(CPACR_ELx_ZEN | CPACR_ELx_SMEN);
-
- /*
- * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to
- * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2,
- * except for some missing controls, such as TAM.
- * In this case, CPTR_EL2.TAM has the same position with or without
- * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM
- * shift value for trapping the AMU accesses.
- */
-
- val |= CPTR_EL2_TAM;
-
- if (guest_owns_fp_regs()) {
- if (vcpu_has_sve(vcpu))
- val |= CPACR_ELx_ZEN;
- } else {
- val &= ~CPACR_ELx_FPEN;
- __activate_traps_fpsimd32(vcpu);
- }
-
- write_sysreg(val, cpacr_el1);
+ __activate_cptr_traps(vcpu);
write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el1);
}
@@ -266,10 +314,111 @@ static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu)
__fpsimd_save_state(*host_data_ptr(fpsimd_state));
}
+static bool kvm_hyp_handle_tlbi_el2(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ int ret = -EINVAL;
+ u32 instr;
+ u64 val;
+
+ /*
+ * Ideally, we would never trap on EL2 S1 TLB invalidations using
+ * the EL1 instructions when the guest's HCR_EL2.{E2H,TGE}=={1,1}.
+ * But "thanks" to FEAT_NV2, we don't trap writes to HCR_EL2,
+ * meaning that we can't track changes to the virtual TGE bit. So we
+ * have to leave HCR_EL2.TTLB set on the host. Oopsie...
+ *
+ * Try and handle these invalidation as quickly as possible, without
+ * fully exiting. Note that we don't need to consider any forwarding
+ * here, as having E2H+TGE set is the very definition of being
+ * InHost.
+ *
+ * For the lesser hypervisors out there that have failed to get on
+ * with the VHE program, we can also handle the nVHE style of EL2
+ * invalidation.
+ */
+ if (!(is_hyp_ctxt(vcpu)))
+ return false;
+
+ instr = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu));
+ val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu));
+
+ if ((kvm_supported_tlbi_s1e1_op(vcpu, instr) &&
+ vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) ||
+ kvm_supported_tlbi_s1e2_op (vcpu, instr))
+ ret = __kvm_tlbi_s1e2(NULL, val, instr);
+
+ if (ret)
+ return false;
+
+ __kvm_skip_instr(vcpu);
+
+ return true;
+}
+
+static bool kvm_hyp_handle_cpacr_el1(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ u64 esr = kvm_vcpu_get_esr(vcpu);
+ int rt;
+
+ if (!is_hyp_ctxt(vcpu) || esr_sys64_to_sysreg(esr) != SYS_CPACR_EL1)
+ return false;
+
+ rt = kvm_vcpu_sys_get_rt(vcpu);
+
+ if ((esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ) {
+ vcpu_set_reg(vcpu, rt, __vcpu_sys_reg(vcpu, CPTR_EL2));
+ } else {
+ vcpu_write_sys_reg(vcpu, vcpu_get_reg(vcpu, rt), CPTR_EL2);
+ __activate_cptr_traps(vcpu);
+ }
+
+ __kvm_skip_instr(vcpu);
+
+ return true;
+}
+
+static bool kvm_hyp_handle_zcr_el2(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu));
+
+ if (!vcpu_has_nv(vcpu))
+ return false;
+
+ if (sysreg != SYS_ZCR_EL2)
+ return false;
+
+ if (guest_owns_fp_regs())
+ return false;
+
+ /*
+ * ZCR_EL2 traps are handled in the slow path, with the expectation
+ * that the guest's FP context has already been loaded onto the CPU.
+ *
+ * Load the guest's FP context and unconditionally forward to the
+ * slow path for handling (i.e. return false).
+ */
+ kvm_hyp_handle_fpsimd(vcpu, exit_code);
+ return false;
+}
+
+static bool kvm_hyp_handle_sysreg_vhe(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ if (kvm_hyp_handle_tlbi_el2(vcpu, exit_code))
+ return true;
+
+ if (kvm_hyp_handle_cpacr_el1(vcpu, exit_code))
+ return true;
+
+ if (kvm_hyp_handle_zcr_el2(vcpu, exit_code))
+ return true;
+
+ return kvm_hyp_handle_sysreg(vcpu, exit_code);
+}
+
static const exit_handler_fn hyp_exit_handlers[] = {
[0 ... ESR_ELx_EC_MAX] = NULL,
[ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32,
- [ESR_ELx_EC_SYS64] = kvm_hyp_handle_sysreg,
+ [ESR_ELx_EC_SYS64] = kvm_hyp_handle_sysreg_vhe,
[ESR_ELx_EC_SVE] = kvm_hyp_handle_fpsimd,
[ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd,
[ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low,
@@ -388,7 +537,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
return ret;
}
-static void __hyp_call_panic(u64 spsr, u64 elr, u64 par)
+static void __noreturn __hyp_call_panic(u64 spsr, u64 elr, u64 par)
{
struct kvm_cpu_context *host_ctxt;
struct kvm_vcpu *vcpu;
@@ -413,7 +562,6 @@ void __noreturn hyp_panic(void)
u64 par = read_sysreg_par();
__hyp_call_panic(spsr, elr, par);
- unreachable();
}
asmlinkage void kvm_unexpected_el2_exception(void)
diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c
index 5fa0359f3a87..3d50a1bd2bdb 100644
--- a/arch/arm64/kvm/hyp/vhe/tlb.c
+++ b/arch/arm64/kvm/hyp/vhe/tlb.c
@@ -219,3 +219,150 @@ void __kvm_flush_vm_context(void)
__tlbi(alle1is);
dsb(ish);
}
+
+/*
+ * TLB invalidation emulation for NV. For any given instruction, we
+ * perform the following transformtions:
+ *
+ * - a TLBI targeting EL2 S1 is remapped to EL1 S1
+ * - a non-shareable TLBI is upgraded to being inner-shareable
+ * - an outer-shareable TLBI is also mapped to inner-shareable
+ * - an nXS TLBI is upgraded to XS
+ */
+int __kvm_tlbi_s1e2(struct kvm_s2_mmu *mmu, u64 va, u64 sys_encoding)
+{
+ struct tlb_inv_context cxt;
+ int ret = 0;
+
+ /*
+ * The guest will have provided its own DSB ISHST before trapping.
+ * If it hasn't, that's its own problem, and we won't paper over it
+ * (plus, there is plenty of extra synchronisation before we even
+ * get here...).
+ */
+
+ if (mmu)
+ enter_vmid_context(mmu, &cxt);
+
+ switch (sys_encoding) {
+ case OP_TLBI_ALLE2:
+ case OP_TLBI_ALLE2IS:
+ case OP_TLBI_ALLE2OS:
+ case OP_TLBI_VMALLE1:
+ case OP_TLBI_VMALLE1IS:
+ case OP_TLBI_VMALLE1OS:
+ case OP_TLBI_ALLE2NXS:
+ case OP_TLBI_ALLE2ISNXS:
+ case OP_TLBI_ALLE2OSNXS:
+ case OP_TLBI_VMALLE1NXS:
+ case OP_TLBI_VMALLE1ISNXS:
+ case OP_TLBI_VMALLE1OSNXS:
+ __tlbi(vmalle1is);
+ break;
+ case OP_TLBI_VAE2:
+ case OP_TLBI_VAE2IS:
+ case OP_TLBI_VAE2OS:
+ case OP_TLBI_VAE1:
+ case OP_TLBI_VAE1IS:
+ case OP_TLBI_VAE1OS:
+ case OP_TLBI_VAE2NXS:
+ case OP_TLBI_VAE2ISNXS:
+ case OP_TLBI_VAE2OSNXS:
+ case OP_TLBI_VAE1NXS:
+ case OP_TLBI_VAE1ISNXS:
+ case OP_TLBI_VAE1OSNXS:
+ __tlbi(vae1is, va);
+ break;
+ case OP_TLBI_VALE2:
+ case OP_TLBI_VALE2IS:
+ case OP_TLBI_VALE2OS:
+ case OP_TLBI_VALE1:
+ case OP_TLBI_VALE1IS:
+ case OP_TLBI_VALE1OS:
+ case OP_TLBI_VALE2NXS:
+ case OP_TLBI_VALE2ISNXS:
+ case OP_TLBI_VALE2OSNXS:
+ case OP_TLBI_VALE1NXS:
+ case OP_TLBI_VALE1ISNXS:
+ case OP_TLBI_VALE1OSNXS:
+ __tlbi(vale1is, va);
+ break;
+ case OP_TLBI_ASIDE1:
+ case OP_TLBI_ASIDE1IS:
+ case OP_TLBI_ASIDE1OS:
+ case OP_TLBI_ASIDE1NXS:
+ case OP_TLBI_ASIDE1ISNXS:
+ case OP_TLBI_ASIDE1OSNXS:
+ __tlbi(aside1is, va);
+ break;
+ case OP_TLBI_VAAE1:
+ case OP_TLBI_VAAE1IS:
+ case OP_TLBI_VAAE1OS:
+ case OP_TLBI_VAAE1NXS:
+ case OP_TLBI_VAAE1ISNXS:
+ case OP_TLBI_VAAE1OSNXS:
+ __tlbi(vaae1is, va);
+ break;
+ case OP_TLBI_VAALE1:
+ case OP_TLBI_VAALE1IS:
+ case OP_TLBI_VAALE1OS:
+ case OP_TLBI_VAALE1NXS:
+ case OP_TLBI_VAALE1ISNXS:
+ case OP_TLBI_VAALE1OSNXS:
+ __tlbi(vaale1is, va);
+ break;
+ case OP_TLBI_RVAE2:
+ case OP_TLBI_RVAE2IS:
+ case OP_TLBI_RVAE2OS:
+ case OP_TLBI_RVAE1:
+ case OP_TLBI_RVAE1IS:
+ case OP_TLBI_RVAE1OS:
+ case OP_TLBI_RVAE2NXS:
+ case OP_TLBI_RVAE2ISNXS:
+ case OP_TLBI_RVAE2OSNXS:
+ case OP_TLBI_RVAE1NXS:
+ case OP_TLBI_RVAE1ISNXS:
+ case OP_TLBI_RVAE1OSNXS:
+ __tlbi(rvae1is, va);
+ break;
+ case OP_TLBI_RVALE2:
+ case OP_TLBI_RVALE2IS:
+ case OP_TLBI_RVALE2OS:
+ case OP_TLBI_RVALE1:
+ case OP_TLBI_RVALE1IS:
+ case OP_TLBI_RVALE1OS:
+ case OP_TLBI_RVALE2NXS:
+ case OP_TLBI_RVALE2ISNXS:
+ case OP_TLBI_RVALE2OSNXS:
+ case OP_TLBI_RVALE1NXS:
+ case OP_TLBI_RVALE1ISNXS:
+ case OP_TLBI_RVALE1OSNXS:
+ __tlbi(rvale1is, va);
+ break;
+ case OP_TLBI_RVAAE1:
+ case OP_TLBI_RVAAE1IS:
+ case OP_TLBI_RVAAE1OS:
+ case OP_TLBI_RVAAE1NXS:
+ case OP_TLBI_RVAAE1ISNXS:
+ case OP_TLBI_RVAAE1OSNXS:
+ __tlbi(rvaae1is, va);
+ break;
+ case OP_TLBI_RVAALE1:
+ case OP_TLBI_RVAALE1IS:
+ case OP_TLBI_RVAALE1OS:
+ case OP_TLBI_RVAALE1NXS:
+ case OP_TLBI_RVAALE1ISNXS:
+ case OP_TLBI_RVAALE1OSNXS:
+ __tlbi(rvaale1is, va);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ dsb(ish);
+ isb();
+
+ if (mmu)
+ exit_vmid_context(&cxt);
+
+ return ret;
+}
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 8bcab0cc3fe9..6981b1bc0946 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -328,18 +328,23 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64
may_block));
}
-static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
+void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
{
__unmap_stage2_range(mmu, start, size, true);
}
+void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
+{
+ stage2_apply_range_resched(mmu, addr, end, kvm_pgtable_stage2_flush);
+}
+
static void stage2_flush_memslot(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
- stage2_apply_range_resched(&kvm->arch.mmu, addr, end, kvm_pgtable_stage2_flush);
+ kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
}
/**
@@ -362,6 +367,8 @@ static void stage2_flush_vm(struct kvm *kvm)
kvm_for_each_memslot(memslot, bkt, slots)
stage2_flush_memslot(kvm, memslot);
+ kvm_nested_s2_flush(kvm);
+
write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
}
@@ -855,21 +862,9 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
.icache_inval_pou = invalidate_icache_guest_page,
};
-/**
- * kvm_init_stage2_mmu - Initialise a S2 MMU structure
- * @kvm: The pointer to the KVM structure
- * @mmu: The pointer to the s2 MMU structure
- * @type: The machine type of the virtual machine
- *
- * Allocates only the stage-2 HW PGD level table(s).
- * Note we don't need locking here as this is only called when the VM is
- * created, which can only be done once.
- */
-int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
+static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
{
u32 kvm_ipa_limit = get_kvm_ipa_limit();
- int cpu, err;
- struct kvm_pgtable *pgt;
u64 mmfr0, mmfr1;
u32 phys_shift;
@@ -896,11 +891,51 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
+ return 0;
+}
+
+/**
+ * kvm_init_stage2_mmu - Initialise a S2 MMU structure
+ * @kvm: The pointer to the KVM structure
+ * @mmu: The pointer to the s2 MMU structure
+ * @type: The machine type of the virtual machine
+ *
+ * Allocates only the stage-2 HW PGD level table(s).
+ * Note we don't need locking here as this is only called in two cases:
+ *
+ * - when the VM is created, which can't race against anything
+ *
+ * - when secondary kvm_s2_mmu structures are initialised for NV
+ * guests, and the caller must hold kvm->lock as this is called on a
+ * per-vcpu basis.
+ */
+int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
+{
+ int cpu, err;
+ struct kvm_pgtable *pgt;
+
+ /*
+ * If we already have our page tables in place, and that the
+ * MMU context is the canonical one, we have a bug somewhere,
+ * as this is only supposed to ever happen once per VM.
+ *
+ * Otherwise, we're building nested page tables, and that's
+ * probably because userspace called KVM_ARM_VCPU_INIT more
+ * than once on the same vcpu. Since that's actually legal,
+ * don't kick a fuss and leave gracefully.
+ */
if (mmu->pgt != NULL) {
+ if (kvm_is_nested_s2_mmu(kvm, mmu))
+ return 0;
+
kvm_err("kvm_arch already initialized?\n");
return -EINVAL;
}
+ err = kvm_init_ipa_range(mmu, type);
+ if (err)
+ return err;
+
pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT);
if (!pgt)
return -ENOMEM;
@@ -925,6 +960,10 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
mmu->pgt = pgt;
mmu->pgd_phys = __pa(pgt->pgd);
+
+ if (kvm_is_nested_s2_mmu(kvm, mmu))
+ kvm_init_nested_s2_mmu(mmu);
+
return 0;
out_destroy_pgtable:
@@ -976,7 +1015,7 @@ static void stage2_unmap_memslot(struct kvm *kvm,
if (!(vma->vm_flags & VM_PFNMAP)) {
gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
- unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
+ kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
}
hva = vm_end;
} while (hva < reg_end);
@@ -1003,6 +1042,8 @@ void stage2_unmap_vm(struct kvm *kvm)
kvm_for_each_memslot(memslot, bkt, slots)
stage2_unmap_memslot(kvm, memslot);
+ kvm_nested_s2_unmap(kvm);
+
write_unlock(&kvm->mmu_lock);
mmap_read_unlock(current->mm);
srcu_read_unlock(&kvm->srcu, idx);
@@ -1102,12 +1143,12 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
}
/**
- * stage2_wp_range() - write protect stage2 memory region range
+ * kvm_stage2_wp_range() - write protect stage2 memory region range
* @mmu: The KVM stage-2 MMU pointer
* @addr: Start address of range
* @end: End address of range
*/
-static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
+void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
stage2_apply_range_resched(mmu, addr, end, kvm_pgtable_stage2_wrprotect);
}
@@ -1138,7 +1179,8 @@ static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
write_lock(&kvm->mmu_lock);
- stage2_wp_range(&kvm->arch.mmu, start, end);
+ kvm_stage2_wp_range(&kvm->arch.mmu, start, end);
+ kvm_nested_s2_wp(kvm);
write_unlock(&kvm->mmu_lock);
kvm_flush_remote_tlbs_memslot(kvm, memslot);
}
@@ -1192,7 +1234,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
lockdep_assert_held_write(&kvm->mmu_lock);
- stage2_wp_range(&kvm->arch.mmu, start, end);
+ kvm_stage2_wp_range(&kvm->arch.mmu, start, end);
/*
* Eager-splitting is done when manual-protect is set. We
@@ -1204,6 +1246,8 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
*/
if (kvm_dirty_log_manual_protect_and_init_set(kvm))
kvm_mmu_split_huge_pages(kvm, start, end);
+
+ kvm_nested_s2_wp(kvm);
}
static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
@@ -1375,6 +1419,7 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
}
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
+ struct kvm_s2_trans *nested,
struct kvm_memory_slot *memslot, unsigned long hva,
bool fault_is_perm)
{
@@ -1383,6 +1428,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
bool exec_fault, mte_allowed;
bool device = false, vfio_allow_any_uc = false;
unsigned long mmu_seq;
+ phys_addr_t ipa = fault_ipa;
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
struct vm_area_struct *vma;
@@ -1466,10 +1512,38 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
}
vma_pagesize = 1UL << vma_shift;
+
+ if (nested) {
+ unsigned long max_map_size;
+
+ max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE;
+
+ ipa = kvm_s2_trans_output(nested);
+
+ /*
+ * If we're about to create a shadow stage 2 entry, then we
+ * can only create a block mapping if the guest stage 2 page
+ * table uses at least as big a mapping.
+ */
+ max_map_size = min(kvm_s2_trans_size(nested), max_map_size);
+
+ /*
+ * Be careful that if the mapping size falls between
+ * two host sizes, take the smallest of the two.
+ */
+ if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE)
+ max_map_size = PMD_SIZE;
+ else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE)
+ max_map_size = PAGE_SIZE;
+
+ force_pte = (max_map_size == PAGE_SIZE);
+ vma_pagesize = min(vma_pagesize, (long)max_map_size);
+ }
+
if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
fault_ipa &= ~(vma_pagesize - 1);
- gfn = fault_ipa >> PAGE_SHIFT;
+ gfn = ipa >> PAGE_SHIFT;
mte_allowed = kvm_vma_mte_allowed(vma);
vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
@@ -1520,6 +1594,25 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (exec_fault && device)
return -ENOEXEC;
+ /*
+ * Potentially reduce shadow S2 permissions to match the guest's own
+ * S2. For exec faults, we'd only reach this point if the guest
+ * actually allowed it (see kvm_s2_handle_perm_fault).
+ *
+ * Also encode the level of the original translation in the SW bits
+ * of the leaf entry as a proxy for the span of that translation.
+ * This will be retrieved on TLB invalidation from the guest and
+ * used to limit the invalidation scope if a TTL hint or a range
+ * isn't provided.
+ */
+ if (nested) {
+ writable &= kvm_s2_trans_writable(nested);
+ if (!kvm_s2_trans_readable(nested))
+ prot &= ~KVM_PGTABLE_PROT_R;
+
+ prot |= kvm_encode_nested_level(nested);
+ }
+
read_lock(&kvm->mmu_lock);
pgt = vcpu->arch.hw_mmu->pgt;
if (mmu_invalidate_retry(kvm, mmu_seq)) {
@@ -1566,7 +1659,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
prot |= KVM_PGTABLE_PROT_NORMAL_NC;
else
prot |= KVM_PGTABLE_PROT_DEVICE;
- } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) {
+ } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) &&
+ (!nested || kvm_s2_trans_executable(nested))) {
prot |= KVM_PGTABLE_PROT_X;
}
@@ -1575,14 +1669,21 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* permissions only if vma_pagesize equals fault_granule. Otherwise,
* kvm_pgtable_stage2_map() should be called to change block size.
*/
- if (fault_is_perm && vma_pagesize == fault_granule)
+ if (fault_is_perm && vma_pagesize == fault_granule) {
+ /*
+ * Drop the SW bits in favour of those stored in the
+ * PTE, which will be preserved.
+ */
+ prot &= ~KVM_NV_GUEST_MAP_SZ;
ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
- else
+ } else {
ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
__pfn_to_phys(pfn), prot,
memcache,
KVM_PGTABLE_WALK_HANDLE_FAULT |
KVM_PGTABLE_WALK_SHARED);
+ }
+
out_unlock:
read_unlock(&kvm->mmu_lock);
@@ -1626,8 +1727,10 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
*/
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
{
+ struct kvm_s2_trans nested_trans, *nested = NULL;
unsigned long esr;
- phys_addr_t fault_ipa;
+ phys_addr_t fault_ipa; /* The address we faulted on */
+ phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */
struct kvm_memory_slot *memslot;
unsigned long hva;
bool is_iabt, write_fault, writable;
@@ -1636,7 +1739,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
esr = kvm_vcpu_get_esr(vcpu);
- fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
+ ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
if (esr_fsc_is_translation_fault(esr)) {
@@ -1686,7 +1789,42 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
idx = srcu_read_lock(&vcpu->kvm->srcu);
- gfn = fault_ipa >> PAGE_SHIFT;
+ /*
+ * We may have faulted on a shadow stage 2 page table if we are
+ * running a nested guest. In this case, we have to resolve the L2
+ * IPA to the L1 IPA first, before knowing what kind of memory should
+ * back the L1 IPA.
+ *
+ * If the shadow stage 2 page table walk faults, then we simply inject
+ * this to the guest and carry on.
+ *
+ * If there are no shadow S2 PTs because S2 is disabled, there is
+ * nothing to walk and we treat it as a 1:1 before going through the
+ * canonical translation.
+ */
+ if (kvm_is_nested_s2_mmu(vcpu->kvm,vcpu->arch.hw_mmu) &&
+ vcpu->arch.hw_mmu->nested_stage2_enabled) {
+ u32 esr;
+
+ ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
+ if (ret) {
+ esr = kvm_s2_trans_esr(&nested_trans);
+ kvm_inject_s2_fault(vcpu, esr);
+ goto out_unlock;
+ }
+
+ ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans);
+ if (ret) {
+ esr = kvm_s2_trans_esr(&nested_trans);
+ kvm_inject_s2_fault(vcpu, esr);
+ goto out_unlock;
+ }
+
+ ipa = kvm_s2_trans_output(&nested_trans);
+ nested = &nested_trans;
+ }
+
+ gfn = ipa >> PAGE_SHIFT;
memslot = gfn_to_memslot(vcpu->kvm, gfn);
hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
write_fault = kvm_is_write_fault(vcpu);
@@ -1730,13 +1868,13 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
* faulting VA. This is always 12 bits, irrespective
* of the page size.
*/
- fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
- ret = io_mem_abort(vcpu, fault_ipa);
+ ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
+ ret = io_mem_abort(vcpu, ipa);
goto out_unlock;
}
/* Userspace should not be able to register out-of-bounds IPAs */
- VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->arch.hw_mmu));
+ VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu));
if (esr_fsc_is_access_flag_fault(esr)) {
handle_access_fault(vcpu, fault_ipa);
@@ -1744,7 +1882,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
goto out_unlock;
}
- ret = user_mem_abort(vcpu, fault_ipa, memslot, hva,
+ ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
esr_fsc_is_permission_fault(esr));
if (ret == 0)
ret = 1;
@@ -1767,6 +1905,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
(range->end - range->start) << PAGE_SHIFT,
range->may_block);
+ kvm_nested_s2_unmap(kvm);
return false;
}
@@ -1780,6 +1919,10 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
range->start << PAGE_SHIFT,
size, true);
+ /*
+ * TODO: Handle nested_mmu structures here using the reverse mapping in
+ * a later version of patch series.
+ */
}
bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
@@ -2022,11 +2165,6 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
}
-void kvm_arch_flush_shadow_all(struct kvm *kvm)
-{
- kvm_uninit_stage2_mmu(kvm);
-}
-
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot)
{
@@ -2034,7 +2172,8 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
phys_addr_t size = slot->npages << PAGE_SHIFT;
write_lock(&kvm->mmu_lock);
- unmap_stage2_range(&kvm->arch.mmu, gpa, size);
+ kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size);
+ kvm_nested_s2_unmap(kvm);
write_unlock(&kvm->mmu_lock);
}
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index bae8536cbf00..de789e0f1ae9 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -4,10 +4,13 @@
* Author: Jintack Lim <jintack.lim@linaro.org>
*/
+#include <linux/bitfield.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
+#include <asm/kvm_arm.h>
#include <asm/kvm_emulate.h>
+#include <asm/kvm_mmu.h>
#include <asm/kvm_nested.h>
#include <asm/sysreg.h>
@@ -17,149 +20,910 @@
#define NV_FTR(r, f) ID_AA64##r##_EL1_##f
/*
- * Our emulated CPU doesn't support all the possible features. For the
- * sake of simplicity (and probably mental sanity), wipe out a number
- * of feature bits we don't intend to support for the time being.
- * This list should get updated as new features get added to the NV
- * support, and new extension to the architecture.
+ * Ratio of live shadow S2 MMU per vcpu. This is a trade-off between
+ * memory usage and potential number of different sets of S2 PTs in
+ * the guests. Running out of S2 MMUs only affects performance (we
+ * will invalidate them more often).
*/
-static u64 limit_nv_id_reg(u32 id, u64 val)
+#define S2_MMU_PER_VCPU 2
+
+void kvm_init_nested(struct kvm *kvm)
{
- u64 tmp;
+ kvm->arch.nested_mmus = NULL;
+ kvm->arch.nested_mmus_size = 0;
+}
- switch (id) {
- case SYS_ID_AA64ISAR0_EL1:
- /* Support everything but TME, O.S. and Range TLBIs */
- val &= ~(NV_FTR(ISAR0, TLB) |
- NV_FTR(ISAR0, TME));
- break;
+static int init_nested_s2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
+{
+ /*
+ * We only initialise the IPA range on the canonical MMU, which
+ * defines the contract between KVM and userspace on where the
+ * "hardware" is in the IPA space. This affects the validity of MMIO
+ * exits forwarded to userspace, for example.
+ *
+ * For nested S2s, we use the PARange as exposed to the guest, as it
+ * is allowed to use it at will to expose whatever memory map it
+ * wants to its own guests as it would be on real HW.
+ */
+ return kvm_init_stage2_mmu(kvm, mmu, kvm_get_pa_bits(kvm));
+}
- case SYS_ID_AA64ISAR1_EL1:
- /* Support everything but Spec Invalidation */
- val &= ~(GENMASK_ULL(63, 56) |
- NV_FTR(ISAR1, SPECRES));
- break;
+int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_s2_mmu *tmp;
+ int num_mmus, ret = 0;
+
+ /*
+ * Let's treat memory allocation failures as benign: If we fail to
+ * allocate anything, return an error and keep the allocated array
+ * alive. Userspace may try to recover by intializing the vcpu
+ * again, and there is no reason to affect the whole VM for this.
+ */
+ num_mmus = atomic_read(&kvm->online_vcpus) * S2_MMU_PER_VCPU;
+ tmp = kvrealloc(kvm->arch.nested_mmus,
+ size_mul(sizeof(*kvm->arch.nested_mmus), kvm->arch.nested_mmus_size),
+ size_mul(sizeof(*kvm->arch.nested_mmus), num_mmus),
+ GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!tmp)
+ return -ENOMEM;
+
+ /*
+ * If we went through a realocation, adjust the MMU back-pointers in
+ * the previously initialised kvm_pgtable structures.
+ */
+ if (kvm->arch.nested_mmus != tmp)
+ for (int i = 0; i < kvm->arch.nested_mmus_size; i++)
+ tmp[i].pgt->mmu = &tmp[i];
+
+ for (int i = kvm->arch.nested_mmus_size; !ret && i < num_mmus; i++)
+ ret = init_nested_s2_mmu(kvm, &tmp[i]);
+
+ if (ret) {
+ for (int i = kvm->arch.nested_mmus_size; i < num_mmus; i++)
+ kvm_free_stage2_pgd(&tmp[i]);
+
+ return ret;
+ }
- case SYS_ID_AA64PFR0_EL1:
- /* No AMU, MPAM, S-EL2, RAS or SVE */
- val &= ~(GENMASK_ULL(55, 52) |
- NV_FTR(PFR0, AMU) |
- NV_FTR(PFR0, MPAM) |
- NV_FTR(PFR0, SEL2) |
- NV_FTR(PFR0, RAS) |
- NV_FTR(PFR0, SVE) |
- NV_FTR(PFR0, EL3) |
- NV_FTR(PFR0, EL2) |
- NV_FTR(PFR0, EL1));
- /* 64bit EL1/EL2/EL3 only */
- val |= FIELD_PREP(NV_FTR(PFR0, EL1), 0b0001);
- val |= FIELD_PREP(NV_FTR(PFR0, EL2), 0b0001);
- val |= FIELD_PREP(NV_FTR(PFR0, EL3), 0b0001);
+ kvm->arch.nested_mmus_size = num_mmus;
+ kvm->arch.nested_mmus = tmp;
+
+ return 0;
+}
+
+struct s2_walk_info {
+ int (*read_desc)(phys_addr_t pa, u64 *desc, void *data);
+ void *data;
+ u64 baddr;
+ unsigned int max_oa_bits;
+ unsigned int pgshift;
+ unsigned int sl;
+ unsigned int t0sz;
+ bool be;
+};
+
+static unsigned int ps_to_output_size(unsigned int ps)
+{
+ switch (ps) {
+ case 0: return 32;
+ case 1: return 36;
+ case 2: return 40;
+ case 3: return 42;
+ case 4: return 44;
+ case 5:
+ default:
+ return 48;
+ }
+}
+
+static u32 compute_fsc(int level, u32 fsc)
+{
+ return fsc | (level & 0x3);
+}
+
+static int esr_s2_fault(struct kvm_vcpu *vcpu, int level, u32 fsc)
+{
+ u32 esr;
+
+ esr = kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC;
+ esr |= compute_fsc(level, fsc);
+ return esr;
+}
+
+static int get_ia_size(struct s2_walk_info *wi)
+{
+ return 64 - wi->t0sz;
+}
+
+static int check_base_s2_limits(struct s2_walk_info *wi,
+ int level, int input_size, int stride)
+{
+ int start_size, ia_size;
+
+ ia_size = get_ia_size(wi);
+
+ /* Check translation limits */
+ switch (BIT(wi->pgshift)) {
+ case SZ_64K:
+ if (level == 0 || (level == 1 && ia_size <= 42))
+ return -EFAULT;
break;
+ case SZ_16K:
+ if (level == 0 || (level == 1 && ia_size <= 40))
+ return -EFAULT;
+ break;
+ case SZ_4K:
+ if (level < 0 || (level == 0 && ia_size <= 42))
+ return -EFAULT;
+ break;
+ }
+
+ /* Check input size limits */
+ if (input_size > ia_size)
+ return -EFAULT;
+
+ /* Check number of entries in starting level table */
+ start_size = input_size - ((3 - level) * stride + wi->pgshift);
+ if (start_size < 1 || start_size > stride + 4)
+ return -EFAULT;
+
+ return 0;
+}
+
+/* Check if output is within boundaries */
+static int check_output_size(struct s2_walk_info *wi, phys_addr_t output)
+{
+ unsigned int output_size = wi->max_oa_bits;
+
+ if (output_size != 48 && (output & GENMASK_ULL(47, output_size)))
+ return -1;
+
+ return 0;
+}
- case SYS_ID_AA64PFR1_EL1:
- /* Only support BTI, SSBS, CSV2_frac */
- val &= (NV_FTR(PFR1, BT) |
- NV_FTR(PFR1, SSBS) |
- NV_FTR(PFR1, CSV2_frac));
+/*
+ * This is essentially a C-version of the pseudo code from the ARM ARM
+ * AArch64.TranslationTableWalk function. I strongly recommend looking at
+ * that pseudocode in trying to understand this.
+ *
+ * Must be called with the kvm->srcu read lock held
+ */
+static int walk_nested_s2_pgd(phys_addr_t ipa,
+ struct s2_walk_info *wi, struct kvm_s2_trans *out)
+{
+ int first_block_level, level, stride, input_size, base_lower_bound;
+ phys_addr_t base_addr;
+ unsigned int addr_top, addr_bottom;
+ u64 desc; /* page table entry */
+ int ret;
+ phys_addr_t paddr;
+
+ switch (BIT(wi->pgshift)) {
+ default:
+ case SZ_64K:
+ case SZ_16K:
+ level = 3 - wi->sl;
+ first_block_level = 2;
break;
+ case SZ_4K:
+ level = 2 - wi->sl;
+ first_block_level = 1;
+ break;
+ }
+
+ stride = wi->pgshift - 3;
+ input_size = get_ia_size(wi);
+ if (input_size > 48 || input_size < 25)
+ return -EFAULT;
+
+ ret = check_base_s2_limits(wi, level, input_size, stride);
+ if (WARN_ON(ret))
+ return ret;
+
+ base_lower_bound = 3 + input_size - ((3 - level) * stride +
+ wi->pgshift);
+ base_addr = wi->baddr & GENMASK_ULL(47, base_lower_bound);
+
+ if (check_output_size(wi, base_addr)) {
+ out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ);
+ return 1;
+ }
+
+ addr_top = input_size - 1;
+
+ while (1) {
+ phys_addr_t index;
+
+ addr_bottom = (3 - level) * stride + wi->pgshift;
+ index = (ipa & GENMASK_ULL(addr_top, addr_bottom))
+ >> (addr_bottom - 3);
+
+ paddr = base_addr | index;
+ ret = wi->read_desc(paddr, &desc, wi->data);
+ if (ret < 0)
+ return ret;
- case SYS_ID_AA64MMFR0_EL1:
- /* Hide ECV, ExS, Secure Memory */
- val &= ~(NV_FTR(MMFR0, ECV) |
- NV_FTR(MMFR0, EXS) |
- NV_FTR(MMFR0, TGRAN4_2) |
- NV_FTR(MMFR0, TGRAN16_2) |
- NV_FTR(MMFR0, TGRAN64_2) |
- NV_FTR(MMFR0, SNSMEM));
-
- /* Disallow unsupported S2 page sizes */
- switch (PAGE_SIZE) {
- case SZ_64K:
- val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN16_2), 0b0001);
- fallthrough;
- case SZ_16K:
- val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN4_2), 0b0001);
- fallthrough;
- case SZ_4K:
- /* Support everything */
- break;
- }
/*
- * Since we can't support a guest S2 page size smaller than
- * the host's own page size (due to KVM only populating its
- * own S2 using the kernel's page size), advertise the
- * limitation using FEAT_GTG.
+ * Handle reversedescriptors if endianness differs between the
+ * host and the guest hypervisor.
*/
- switch (PAGE_SIZE) {
- case SZ_4K:
- val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN4_2), 0b0010);
- fallthrough;
- case SZ_16K:
- val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN16_2), 0b0010);
- fallthrough;
- case SZ_64K:
- val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN64_2), 0b0010);
+ if (wi->be)
+ desc = be64_to_cpu((__force __be64)desc);
+ else
+ desc = le64_to_cpu((__force __le64)desc);
+
+ /* Check for valid descriptor at this point */
+ if (!(desc & 1) || ((desc & 3) == 1 && level == 3)) {
+ out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
+ out->upper_attr = desc;
+ return 1;
+ }
+
+ /* We're at the final level or block translation level */
+ if ((desc & 3) == 1 || level == 3)
+ break;
+
+ if (check_output_size(wi, desc)) {
+ out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ);
+ out->upper_attr = desc;
+ return 1;
+ }
+
+ base_addr = desc & GENMASK_ULL(47, wi->pgshift);
+
+ level += 1;
+ addr_top = addr_bottom - 1;
+ }
+
+ if (level < first_block_level) {
+ out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
+ out->upper_attr = desc;
+ return 1;
+ }
+
+ /*
+ * We don't use the contiguous bit in the stage-2 ptes, so skip check
+ * for misprogramming of the contiguous bit.
+ */
+
+ if (check_output_size(wi, desc)) {
+ out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ);
+ out->upper_attr = desc;
+ return 1;
+ }
+
+ if (!(desc & BIT(10))) {
+ out->esr = compute_fsc(level, ESR_ELx_FSC_ACCESS);
+ out->upper_attr = desc;
+ return 1;
+ }
+
+ /* Calculate and return the result */
+ paddr = (desc & GENMASK_ULL(47, addr_bottom)) |
+ (ipa & GENMASK_ULL(addr_bottom - 1, 0));
+ out->output = paddr;
+ out->block_size = 1UL << ((3 - level) * stride + wi->pgshift);
+ out->readable = desc & (0b01 << 6);
+ out->writable = desc & (0b10 << 6);
+ out->level = level;
+ out->upper_attr = desc & GENMASK_ULL(63, 52);
+ return 0;
+}
+
+static int read_guest_s2_desc(phys_addr_t pa, u64 *desc, void *data)
+{
+ struct kvm_vcpu *vcpu = data;
+
+ return kvm_read_guest(vcpu->kvm, pa, desc, sizeof(*desc));
+}
+
+static void vtcr_to_walk_info(u64 vtcr, struct s2_walk_info *wi)
+{
+ wi->t0sz = vtcr & TCR_EL2_T0SZ_MASK;
+
+ switch (vtcr & VTCR_EL2_TG0_MASK) {
+ case VTCR_EL2_TG0_4K:
+ wi->pgshift = 12; break;
+ case VTCR_EL2_TG0_16K:
+ wi->pgshift = 14; break;
+ case VTCR_EL2_TG0_64K:
+ default: /* IMPDEF: treat any other value as 64k */
+ wi->pgshift = 16; break;
+ }
+
+ wi->sl = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
+ /* Global limit for now, should eventually be per-VM */
+ wi->max_oa_bits = min(get_kvm_ipa_limit(),
+ ps_to_output_size(FIELD_GET(VTCR_EL2_PS_MASK, vtcr)));
+}
+
+int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
+ struct kvm_s2_trans *result)
+{
+ u64 vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2);
+ struct s2_walk_info wi;
+ int ret;
+
+ result->esr = 0;
+
+ if (!vcpu_has_nv(vcpu))
+ return 0;
+
+ wi.read_desc = read_guest_s2_desc;
+ wi.data = vcpu;
+ wi.baddr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
+
+ vtcr_to_walk_info(vtcr, &wi);
+
+ wi.be = vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_EE;
+
+ ret = walk_nested_s2_pgd(gipa, &wi, result);
+ if (ret)
+ result->esr |= (kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC);
+
+ return ret;
+}
+
+static unsigned int ttl_to_size(u8 ttl)
+{
+ int level = ttl & 3;
+ int gran = (ttl >> 2) & 3;
+ unsigned int max_size = 0;
+
+ switch (gran) {
+ case TLBI_TTL_TG_4K:
+ switch (level) {
+ case 0:
+ break;
+ case 1:
+ max_size = SZ_1G;
+ break;
+ case 2:
+ max_size = SZ_2M;
+ break;
+ case 3:
+ max_size = SZ_4K;
break;
}
- /* Cap PARange to 48bits */
- tmp = FIELD_GET(NV_FTR(MMFR0, PARANGE), val);
- if (tmp > 0b0101) {
- val &= ~NV_FTR(MMFR0, PARANGE);
- val |= FIELD_PREP(NV_FTR(MMFR0, PARANGE), 0b0101);
+ break;
+ case TLBI_TTL_TG_16K:
+ switch (level) {
+ case 0:
+ case 1:
+ break;
+ case 2:
+ max_size = SZ_32M;
+ break;
+ case 3:
+ max_size = SZ_16K;
+ break;
}
break;
-
- case SYS_ID_AA64MMFR1_EL1:
- val &= (NV_FTR(MMFR1, HCX) |
- NV_FTR(MMFR1, PAN) |
- NV_FTR(MMFR1, LO) |
- NV_FTR(MMFR1, HPDS) |
- NV_FTR(MMFR1, VH) |
- NV_FTR(MMFR1, VMIDBits));
+ case TLBI_TTL_TG_64K:
+ switch (level) {
+ case 0:
+ case 1:
+ /* No 52bit IPA support */
+ break;
+ case 2:
+ max_size = SZ_512M;
+ break;
+ case 3:
+ max_size = SZ_64K;
+ break;
+ }
+ break;
+ default: /* No size information */
break;
+ }
- case SYS_ID_AA64MMFR2_EL1:
- val &= ~(NV_FTR(MMFR2, BBM) |
- NV_FTR(MMFR2, TTL) |
- GENMASK_ULL(47, 44) |
- NV_FTR(MMFR2, ST) |
- NV_FTR(MMFR2, CCIDX) |
- NV_FTR(MMFR2, VARange));
+ return max_size;
+}
- /* Force TTL support */
- val |= FIELD_PREP(NV_FTR(MMFR2, TTL), 0b0001);
+/*
+ * Compute the equivalent of the TTL field by parsing the shadow PT. The
+ * granule size is extracted from the cached VTCR_EL2.TG0 while the level is
+ * retrieved from first entry carrying the level as a tag.
+ */
+static u8 get_guest_mapping_ttl(struct kvm_s2_mmu *mmu, u64 addr)
+{
+ u64 tmp, sz = 0, vtcr = mmu->tlb_vtcr;
+ kvm_pte_t pte;
+ u8 ttl, level;
+
+ lockdep_assert_held_write(&kvm_s2_mmu_to_kvm(mmu)->mmu_lock);
+
+ switch (vtcr & VTCR_EL2_TG0_MASK) {
+ case VTCR_EL2_TG0_4K:
+ ttl = (TLBI_TTL_TG_4K << 2);
+ break;
+ case VTCR_EL2_TG0_16K:
+ ttl = (TLBI_TTL_TG_16K << 2);
break;
+ case VTCR_EL2_TG0_64K:
+ default: /* IMPDEF: treat any other value as 64k */
+ ttl = (TLBI_TTL_TG_64K << 2);
+ break;
+ }
- case SYS_ID_AA64MMFR4_EL1:
- val = 0;
- if (!cpus_have_final_cap(ARM64_HAS_HCR_NV1))
- val |= FIELD_PREP(NV_FTR(MMFR4, E2H0),
- ID_AA64MMFR4_EL1_E2H0_NI_NV1);
+ tmp = addr;
+
+again:
+ /* Iteratively compute the block sizes for a particular granule size */
+ switch (vtcr & VTCR_EL2_TG0_MASK) {
+ case VTCR_EL2_TG0_4K:
+ if (sz < SZ_4K) sz = SZ_4K;
+ else if (sz < SZ_2M) sz = SZ_2M;
+ else if (sz < SZ_1G) sz = SZ_1G;
+ else sz = 0;
+ break;
+ case VTCR_EL2_TG0_16K:
+ if (sz < SZ_16K) sz = SZ_16K;
+ else if (sz < SZ_32M) sz = SZ_32M;
+ else sz = 0;
break;
+ case VTCR_EL2_TG0_64K:
+ default: /* IMPDEF: treat any other value as 64k */
+ if (sz < SZ_64K) sz = SZ_64K;
+ else if (sz < SZ_512M) sz = SZ_512M;
+ else sz = 0;
+ break;
+ }
+
+ if (sz == 0)
+ return 0;
+
+ tmp &= ~(sz - 1);
+ if (kvm_pgtable_get_leaf(mmu->pgt, tmp, &pte, NULL))
+ goto again;
+ if (!(pte & PTE_VALID))
+ goto again;
+ level = FIELD_GET(KVM_NV_GUEST_MAP_SZ, pte);
+ if (!level)
+ goto again;
+
+ ttl |= level;
- case SYS_ID_AA64DFR0_EL1:
- /* Only limited support for PMU, Debug, BPs and WPs */
- val &= (NV_FTR(DFR0, PMUVer) |
- NV_FTR(DFR0, WRPs) |
- NV_FTR(DFR0, BRPs) |
- NV_FTR(DFR0, DebugVer));
-
- /* Cap Debug to ARMv8.1 */
- tmp = FIELD_GET(NV_FTR(DFR0, DebugVer), val);
- if (tmp > 0b0111) {
- val &= ~NV_FTR(DFR0, DebugVer);
- val |= FIELD_PREP(NV_FTR(DFR0, DebugVer), 0b0111);
+ /*
+ * We now have found some level information in the shadow S2. Check
+ * that the resulting range is actually including the original IPA.
+ */
+ sz = ttl_to_size(ttl);
+ if (addr < (tmp + sz))
+ return ttl;
+
+ return 0;
+}
+
+unsigned long compute_tlb_inval_range(struct kvm_s2_mmu *mmu, u64 val)
+{
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
+ unsigned long max_size;
+ u8 ttl;
+
+ ttl = FIELD_GET(TLBI_TTL_MASK, val);
+
+ if (!ttl || !kvm_has_feat(kvm, ID_AA64MMFR2_EL1, TTL, IMP)) {
+ /* No TTL, check the shadow S2 for a hint */
+ u64 addr = (val & GENMASK_ULL(35, 0)) << 12;
+ ttl = get_guest_mapping_ttl(mmu, addr);
+ }
+
+ max_size = ttl_to_size(ttl);
+
+ if (!max_size) {
+ /* Compute the maximum extent of the invalidation */
+ switch (mmu->tlb_vtcr & VTCR_EL2_TG0_MASK) {
+ case VTCR_EL2_TG0_4K:
+ max_size = SZ_1G;
+ break;
+ case VTCR_EL2_TG0_16K:
+ max_size = SZ_32M;
+ break;
+ case VTCR_EL2_TG0_64K:
+ default: /* IMPDEF: treat any other value as 64k */
+ /*
+ * No, we do not support 52bit IPA in nested yet. Once
+ * we do, this should be 4TB.
+ */
+ max_size = SZ_512M;
+ break;
}
- break;
+ }
- default:
- /* Unknown register, just wipe it clean */
- val = 0;
+ WARN_ON(!max_size);
+ return max_size;
+}
+
+/*
+ * We can have multiple *different* MMU contexts with the same VMID:
+ *
+ * - S2 being enabled or not, hence differing by the HCR_EL2.VM bit
+ *
+ * - Multiple vcpus using private S2s (huh huh...), hence differing by the
+ * VBBTR_EL2.BADDR address
+ *
+ * - A combination of the above...
+ *
+ * We can always identify which MMU context to pick at run-time. However,
+ * TLB invalidation involving a VMID must take action on all the TLBs using
+ * this particular VMID. This translates into applying the same invalidation
+ * operation to all the contexts that are using this VMID. Moar phun!
+ */
+void kvm_s2_mmu_iterate_by_vmid(struct kvm *kvm, u16 vmid,
+ const union tlbi_info *info,
+ void (*tlbi_callback)(struct kvm_s2_mmu *,
+ const union tlbi_info *))
+{
+ write_lock(&kvm->mmu_lock);
+
+ for (int i = 0; i < kvm->arch.nested_mmus_size; i++) {
+ struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
+
+ if (!kvm_s2_mmu_valid(mmu))
+ continue;
+
+ if (vmid == get_vmid(mmu->tlb_vttbr))
+ tlbi_callback(mmu, info);
+ }
+
+ write_unlock(&kvm->mmu_lock);
+}
+
+struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ bool nested_stage2_enabled;
+ u64 vttbr, vtcr, hcr;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
+ vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2);
+ hcr = vcpu_read_sys_reg(vcpu, HCR_EL2);
+
+ nested_stage2_enabled = hcr & HCR_VM;
+
+ /* Don't consider the CnP bit for the vttbr match */
+ vttbr &= ~VTTBR_CNP_BIT;
+
+ /*
+ * Two possibilities when looking up a S2 MMU context:
+ *
+ * - either S2 is enabled in the guest, and we need a context that is
+ * S2-enabled and matches the full VTTBR (VMID+BADDR) and VTCR,
+ * which makes it safe from a TLB conflict perspective (a broken
+ * guest won't be able to generate them),
+ *
+ * - or S2 is disabled, and we need a context that is S2-disabled
+ * and matches the VMID only, as all TLBs are tagged by VMID even
+ * if S2 translation is disabled.
+ */
+ for (int i = 0; i < kvm->arch.nested_mmus_size; i++) {
+ struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
+
+ if (!kvm_s2_mmu_valid(mmu))
+ continue;
+
+ if (nested_stage2_enabled &&
+ mmu->nested_stage2_enabled &&
+ vttbr == mmu->tlb_vttbr &&
+ vtcr == mmu->tlb_vtcr)
+ return mmu;
+
+ if (!nested_stage2_enabled &&
+ !mmu->nested_stage2_enabled &&
+ get_vmid(vttbr) == get_vmid(mmu->tlb_vttbr))
+ return mmu;
+ }
+ return NULL;
+}
+
+static struct kvm_s2_mmu *get_s2_mmu_nested(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_s2_mmu *s2_mmu;
+ int i;
+
+ lockdep_assert_held_write(&vcpu->kvm->mmu_lock);
+
+ s2_mmu = lookup_s2_mmu(vcpu);
+ if (s2_mmu)
+ goto out;
+
+ /*
+ * Make sure we don't always search from the same point, or we
+ * will always reuse a potentially active context, leaving
+ * free contexts unused.
+ */
+ for (i = kvm->arch.nested_mmus_next;
+ i < (kvm->arch.nested_mmus_size + kvm->arch.nested_mmus_next);
+ i++) {
+ s2_mmu = &kvm->arch.nested_mmus[i % kvm->arch.nested_mmus_size];
+
+ if (atomic_read(&s2_mmu->refcnt) == 0)
+ break;
+ }
+ BUG_ON(atomic_read(&s2_mmu->refcnt)); /* We have struct MMUs to spare */
+
+ /* Set the scene for the next search */
+ kvm->arch.nested_mmus_next = (i + 1) % kvm->arch.nested_mmus_size;
+
+ /* Clear the old state */
+ if (kvm_s2_mmu_valid(s2_mmu))
+ kvm_stage2_unmap_range(s2_mmu, 0, kvm_phys_size(s2_mmu));
+
+ /*
+ * The virtual VMID (modulo CnP) will be used as a key when matching
+ * an existing kvm_s2_mmu.
+ *
+ * We cache VTCR at allocation time, once and for all. It'd be great
+ * if the guest didn't screw that one up, as this is not very
+ * forgiving...
+ */
+ s2_mmu->tlb_vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2) & ~VTTBR_CNP_BIT;
+ s2_mmu->tlb_vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2);
+ s2_mmu->nested_stage2_enabled = vcpu_read_sys_reg(vcpu, HCR_EL2) & HCR_VM;
+
+out:
+ atomic_inc(&s2_mmu->refcnt);
+ return s2_mmu;
+}
+
+void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu)
+{
+ /* CnP being set denotes an invalid entry */
+ mmu->tlb_vttbr = VTTBR_CNP_BIT;
+ mmu->nested_stage2_enabled = false;
+ atomic_set(&mmu->refcnt, 0);
+}
+
+void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu)
+{
+ if (is_hyp_ctxt(vcpu)) {
+ vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
+ } else {
+ write_lock(&vcpu->kvm->mmu_lock);
+ vcpu->arch.hw_mmu = get_s2_mmu_nested(vcpu);
+ write_unlock(&vcpu->kvm->mmu_lock);
+ }
+}
+
+void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu)
+{
+ if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu)) {
+ atomic_dec(&vcpu->arch.hw_mmu->refcnt);
+ vcpu->arch.hw_mmu = NULL;
+ }
+}
+
+/*
+ * Returns non-zero if permission fault is handled by injecting it to the next
+ * level hypervisor.
+ */
+int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, struct kvm_s2_trans *trans)
+{
+ bool forward_fault = false;
+
+ trans->esr = 0;
+
+ if (!kvm_vcpu_trap_is_permission_fault(vcpu))
+ return 0;
+
+ if (kvm_vcpu_trap_is_iabt(vcpu)) {
+ forward_fault = !kvm_s2_trans_executable(trans);
+ } else {
+ bool write_fault = kvm_is_write_fault(vcpu);
+
+ forward_fault = ((write_fault && !trans->writable) ||
+ (!write_fault && !trans->readable));
+ }
+
+ if (forward_fault)
+ trans->esr = esr_s2_fault(vcpu, trans->level, ESR_ELx_FSC_PERM);
+
+ return forward_fault;
+}
+
+int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2)
+{
+ vcpu_write_sys_reg(vcpu, vcpu->arch.fault.far_el2, FAR_EL2);
+ vcpu_write_sys_reg(vcpu, vcpu->arch.fault.hpfar_el2, HPFAR_EL2);
+
+ return kvm_inject_nested_sync(vcpu, esr_el2);
+}
+
+void kvm_nested_s2_wp(struct kvm *kvm)
+{
+ int i;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
+ struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
+
+ if (kvm_s2_mmu_valid(mmu))
+ kvm_stage2_wp_range(mmu, 0, kvm_phys_size(mmu));
+ }
+}
+
+void kvm_nested_s2_unmap(struct kvm *kvm)
+{
+ int i;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
+ struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
+
+ if (kvm_s2_mmu_valid(mmu))
+ kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu));
+ }
+}
+
+void kvm_nested_s2_flush(struct kvm *kvm)
+{
+ int i;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
+ struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
+
+ if (kvm_s2_mmu_valid(mmu))
+ kvm_stage2_flush_range(mmu, 0, kvm_phys_size(mmu));
+ }
+}
+
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
+{
+ int i;
+
+ for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
+ struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
+
+ if (!WARN_ON(atomic_read(&mmu->refcnt)))
+ kvm_free_stage2_pgd(mmu);
+ }
+ kfree(kvm->arch.nested_mmus);
+ kvm->arch.nested_mmus = NULL;
+ kvm->arch.nested_mmus_size = 0;
+ kvm_uninit_stage2_mmu(kvm);
+}
+
+/*
+ * Our emulated CPU doesn't support all the possible features. For the
+ * sake of simplicity (and probably mental sanity), wipe out a number
+ * of feature bits we don't intend to support for the time being.
+ * This list should get updated as new features get added to the NV
+ * support, and new extension to the architecture.
+ */
+static void limit_nv_id_regs(struct kvm *kvm)
+{
+ u64 val, tmp;
+
+ /* Support everything but TME */
+ val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64ISAR0_EL1);
+ val &= ~NV_FTR(ISAR0, TME);
+ kvm_set_vm_id_reg(kvm, SYS_ID_AA64ISAR0_EL1, val);
+
+ /* Support everything but Spec Invalidation and LS64 */
+ val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64ISAR1_EL1);
+ val &= ~(NV_FTR(ISAR1, LS64) |
+ NV_FTR(ISAR1, SPECRES));
+ kvm_set_vm_id_reg(kvm, SYS_ID_AA64ISAR1_EL1, val);
+
+ /* No AMU, MPAM, S-EL2, or RAS */
+ val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1);
+ val &= ~(GENMASK_ULL(55, 52) |
+ NV_FTR(PFR0, AMU) |
+ NV_FTR(PFR0, MPAM) |
+ NV_FTR(PFR0, SEL2) |
+ NV_FTR(PFR0, RAS) |
+ NV_FTR(PFR0, EL3) |
+ NV_FTR(PFR0, EL2) |
+ NV_FTR(PFR0, EL1));
+ /* 64bit EL1/EL2/EL3 only */
+ val |= FIELD_PREP(NV_FTR(PFR0, EL1), 0b0001);
+ val |= FIELD_PREP(NV_FTR(PFR0, EL2), 0b0001);
+ val |= FIELD_PREP(NV_FTR(PFR0, EL3), 0b0001);
+ kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, val);
+
+ /* Only support BTI, SSBS, CSV2_frac */
+ val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR1_EL1);
+ val &= (NV_FTR(PFR1, BT) |
+ NV_FTR(PFR1, SSBS) |
+ NV_FTR(PFR1, CSV2_frac));
+ kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR1_EL1, val);
+
+ /* Hide ECV, ExS, Secure Memory */
+ val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64MMFR0_EL1);
+ val &= ~(NV_FTR(MMFR0, ECV) |
+ NV_FTR(MMFR0, EXS) |
+ NV_FTR(MMFR0, TGRAN4_2) |
+ NV_FTR(MMFR0, TGRAN16_2) |
+ NV_FTR(MMFR0, TGRAN64_2) |
+ NV_FTR(MMFR0, SNSMEM));
+
+ /* Disallow unsupported S2 page sizes */
+ switch (PAGE_SIZE) {
+ case SZ_64K:
+ val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN16_2), 0b0001);
+ fallthrough;
+ case SZ_16K:
+ val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN4_2), 0b0001);
+ fallthrough;
+ case SZ_4K:
+ /* Support everything */
break;
}
+ /*
+ * Since we can't support a guest S2 page size smaller than
+ * the host's own page size (due to KVM only populating its
+ * own S2 using the kernel's page size), advertise the
+ * limitation using FEAT_GTG.
+ */
+ switch (PAGE_SIZE) {
+ case SZ_4K:
+ val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN4_2), 0b0010);
+ fallthrough;
+ case SZ_16K:
+ val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN16_2), 0b0010);
+ fallthrough;
+ case SZ_64K:
+ val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN64_2), 0b0010);
+ break;
+ }
+ /* Cap PARange to 48bits */
+ tmp = FIELD_GET(NV_FTR(MMFR0, PARANGE), val);
+ if (tmp > 0b0101) {
+ val &= ~NV_FTR(MMFR0, PARANGE);
+ val |= FIELD_PREP(NV_FTR(MMFR0, PARANGE), 0b0101);
+ }
+ kvm_set_vm_id_reg(kvm, SYS_ID_AA64MMFR0_EL1, val);
+
+ val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64MMFR1_EL1);
+ val &= (NV_FTR(MMFR1, HCX) |
+ NV_FTR(MMFR1, PAN) |
+ NV_FTR(MMFR1, LO) |
+ NV_FTR(MMFR1, HPDS) |
+ NV_FTR(MMFR1, VH) |
+ NV_FTR(MMFR1, VMIDBits));
+ kvm_set_vm_id_reg(kvm, SYS_ID_AA64MMFR1_EL1, val);
+
+ val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64MMFR2_EL1);
+ val &= ~(NV_FTR(MMFR2, BBM) |
+ NV_FTR(MMFR2, TTL) |
+ GENMASK_ULL(47, 44) |
+ NV_FTR(MMFR2, ST) |
+ NV_FTR(MMFR2, CCIDX) |
+ NV_FTR(MMFR2, VARange));
- return val;
+ /* Force TTL support */
+ val |= FIELD_PREP(NV_FTR(MMFR2, TTL), 0b0001);
+ kvm_set_vm_id_reg(kvm, SYS_ID_AA64MMFR2_EL1, val);
+
+ val = 0;
+ if (!cpus_have_final_cap(ARM64_HAS_HCR_NV1))
+ val |= FIELD_PREP(NV_FTR(MMFR4, E2H0),
+ ID_AA64MMFR4_EL1_E2H0_NI_NV1);
+ kvm_set_vm_id_reg(kvm, SYS_ID_AA64MMFR4_EL1, val);
+
+ /* Only limited support for PMU, Debug, BPs and WPs */
+ val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64DFR0_EL1);
+ val &= (NV_FTR(DFR0, PMUVer) |
+ NV_FTR(DFR0, WRPs) |
+ NV_FTR(DFR0, BRPs) |
+ NV_FTR(DFR0, DebugVer));
+
+ /* Cap Debug to ARMv8.1 */
+ tmp = FIELD_GET(NV_FTR(DFR0, DebugVer), val);
+ if (tmp > 0b0111) {
+ val &= ~NV_FTR(DFR0, DebugVer);
+ val |= FIELD_PREP(NV_FTR(DFR0, DebugVer), 0b0111);
+ }
+ kvm_set_vm_id_reg(kvm, SYS_ID_AA64DFR0_EL1, val);
}
u64 kvm_vcpu_sanitise_vncr_reg(const struct kvm_vcpu *vcpu, enum vcpu_sysreg sr)
@@ -198,15 +962,13 @@ int kvm_init_nv_sysregs(struct kvm *kvm)
goto out;
kvm->arch.sysreg_masks = kzalloc(sizeof(*(kvm->arch.sysreg_masks)),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!kvm->arch.sysreg_masks) {
ret = -ENOMEM;
goto out;
}
- for (int i = 0; i < KVM_ARM_ID_REG_NUM; i++)
- kvm->arch.id_regs[i] = limit_nv_id_reg(IDX_IDREG(i),
- kvm->arch.id_regs[i]);
+ limit_nv_id_regs(kvm);
/* VTTBR_EL2 */
res0 = res1 = 0;
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index d1a476b08f54..82a2a003259c 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -53,7 +53,7 @@ static u32 __kvm_pmu_event_mask(unsigned int pmuver)
static u32 kvm_pmu_event_mask(struct kvm *kvm)
{
- u64 dfr0 = IDREG(kvm, SYS_ID_AA64DFR0_EL1);
+ u64 dfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64DFR0_EL1);
u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, dfr0);
return __kvm_pmu_event_mask(pmuver);
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index 3fc8ca164dbe..0b0ae5ae7bc2 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -268,6 +268,12 @@ void kvm_reset_vcpu(struct kvm_vcpu *vcpu)
preempt_enable();
}
+u32 kvm_get_pa_bits(struct kvm *kvm)
+{
+ /* Fixed limit until we can configure ID_AA64MMFR0.PARange */
+ return kvm_ipa_limit;
+}
+
u32 get_kvm_ipa_limit(void)
{
return kvm_ipa_limit;
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 22b45a15d068..c90324060436 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -121,6 +121,7 @@ static bool get_el2_to_el1_mapping(unsigned int reg,
MAPPED_EL2_SYSREG(AMAIR_EL2, AMAIR_EL1, NULL );
MAPPED_EL2_SYSREG(ELR_EL2, ELR_EL1, NULL );
MAPPED_EL2_SYSREG(SPSR_EL2, SPSR_EL1, NULL );
+ MAPPED_EL2_SYSREG(ZCR_EL2, ZCR_EL1, NULL );
default:
return false;
}
@@ -383,6 +384,12 @@ static bool access_vm_reg(struct kvm_vcpu *vcpu,
bool was_enabled = vcpu_has_cache_enabled(vcpu);
u64 val, mask, shift;
+ if (reg_to_encoding(r) == SYS_TCR2_EL1 &&
+ !kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, TCRX, IMP)) {
+ kvm_inject_undefined(vcpu);
+ return false;
+ }
+
BUG_ON(!p->is_write);
get_access_mask(r, &mask, &shift);
@@ -1565,7 +1572,7 @@ static u64 kvm_read_sanitised_id_reg(struct kvm_vcpu *vcpu,
static u64 read_id_reg(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
- return IDREG(vcpu->kvm, reg_to_encoding(r));
+ return kvm_read_vm_id_reg(vcpu->kvm, reg_to_encoding(r));
}
static bool is_feature_id_reg(u32 encoding)
@@ -1583,6 +1590,9 @@ static bool is_feature_id_reg(u32 encoding)
*/
static inline bool is_vm_ftr_id_reg(u32 id)
{
+ if (id == SYS_CTR_EL0)
+ return true;
+
return (sys_reg_Op0(id) == 3 && sys_reg_Op1(id) == 0 &&
sys_reg_CRn(id) == 0 && sys_reg_CRm(id) >= 1 &&
sys_reg_CRm(id) < 8);
@@ -1851,7 +1861,7 @@ static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
ret = arm64_check_features(vcpu, rd, val);
if (!ret)
- IDREG(vcpu->kvm, id) = val;
+ kvm_set_vm_id_reg(vcpu->kvm, id, val);
mutex_unlock(&vcpu->kvm->arch.config_lock);
@@ -1867,6 +1877,18 @@ static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
return ret;
}
+void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val)
+{
+ u64 *p = __vm_id_reg(&kvm->arch, reg);
+
+ lockdep_assert_held(&kvm->arch.config_lock);
+
+ if (KVM_BUG_ON(kvm_vm_has_ran_once(kvm) || !p, kvm))
+ return;
+
+ *p = val;
+}
+
static int get_raz_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
u64 *val)
{
@@ -1886,7 +1908,7 @@ static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
if (p->is_write)
return write_to_read_only(vcpu, p, r);
- p->regval = read_sanitised_ftr_reg(SYS_CTR_EL0);
+ p->regval = kvm_read_vm_id_reg(vcpu->kvm, SYS_CTR_EL0);
return true;
}
@@ -2199,6 +2221,40 @@ static u64 reset_hcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
return __vcpu_sys_reg(vcpu, r->reg) = val;
}
+static unsigned int sve_el2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ unsigned int r;
+
+ r = el2_visibility(vcpu, rd);
+ if (r)
+ return r;
+
+ return sve_visibility(vcpu, rd);
+}
+
+static bool access_zcr_el2(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ unsigned int vq;
+
+ if (guest_hyp_sve_traps_enabled(vcpu)) {
+ kvm_inject_nested_sve_trap(vcpu);
+ return true;
+ }
+
+ if (!p->is_write) {
+ p->regval = vcpu_read_sys_reg(vcpu, ZCR_EL2);
+ return true;
+ }
+
+ vq = SYS_FIELD_GET(ZCR_ELx, LEN, p->regval) + 1;
+ vq = min(vq, vcpu_sve_max_vq(vcpu));
+ vcpu_write_sys_reg(vcpu, vq - 1, ZCR_EL2);
+ return true;
+}
+
/*
* Architected system registers.
* Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2
@@ -2471,11 +2527,14 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_CCSIDR_EL1), access_ccsidr },
{ SYS_DESC(SYS_CLIDR_EL1), access_clidr, reset_clidr, CLIDR_EL1,
- .set_user = set_clidr },
+ .set_user = set_clidr, .val = ~CLIDR_EL1_RES0 },
{ SYS_DESC(SYS_CCSIDR2_EL1), undef_access },
{ SYS_DESC(SYS_SMIDR_EL1), undef_access },
{ SYS_DESC(SYS_CSSELR_EL1), access_csselr, reset_unknown, CSSELR_EL1 },
- { SYS_DESC(SYS_CTR_EL0), access_ctr },
+ ID_WRITABLE(CTR_EL0, CTR_EL0_DIC_MASK |
+ CTR_EL0_IDC_MASK |
+ CTR_EL0_DminLine_MASK |
+ CTR_EL0_IminLine_MASK),
{ SYS_DESC(SYS_SVCR), undef_access },
{ PMU_SYS_REG(PMCR_EL0), .access = access_pmcr, .reset = reset_pmcr,
@@ -2688,6 +2747,9 @@ static const struct sys_reg_desc sys_reg_descs[] = {
EL2_REG_VNCR(HFGITR_EL2, reset_val, 0),
EL2_REG_VNCR(HACR_EL2, reset_val, 0),
+ { SYS_DESC(SYS_ZCR_EL2), .access = access_zcr_el2, .reset = reset_val,
+ .visibility = sve_el2_visibility, .reg = ZCR_EL2 },
+
EL2_REG_VNCR(HCRX_EL2, reset_val, 0),
EL2_REG(TTBR0_EL2, access_rw, reset_val, 0),
@@ -2741,6 +2803,264 @@ static const struct sys_reg_desc sys_reg_descs[] = {
EL2_REG(SP_EL2, NULL, reset_unknown, 0),
};
+static bool kvm_supported_tlbi_s12_op(struct kvm_vcpu *vpcu, u32 instr)
+{
+ struct kvm *kvm = vpcu->kvm;
+ u8 CRm = sys_reg_CRm(instr);
+
+ if (sys_reg_CRn(instr) == TLBI_CRn_nXS &&
+ !kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP))
+ return false;
+
+ if (CRm == TLBI_CRm_nROS &&
+ !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS))
+ return false;
+
+ return true;
+}
+
+static bool handle_alle1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+
+ if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) {
+ kvm_inject_undefined(vcpu);
+ return false;
+ }
+
+ write_lock(&vcpu->kvm->mmu_lock);
+
+ /*
+ * Drop all shadow S2s, resulting in S1/S2 TLBIs for each of the
+ * corresponding VMIDs.
+ */
+ kvm_nested_s2_unmap(vcpu->kvm);
+
+ write_unlock(&vcpu->kvm->mmu_lock);
+
+ return true;
+}
+
+static bool kvm_supported_tlbi_ipas2_op(struct kvm_vcpu *vpcu, u32 instr)
+{
+ struct kvm *kvm = vpcu->kvm;
+ u8 CRm = sys_reg_CRm(instr);
+ u8 Op2 = sys_reg_Op2(instr);
+
+ if (sys_reg_CRn(instr) == TLBI_CRn_nXS &&
+ !kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP))
+ return false;
+
+ if (CRm == TLBI_CRm_IPAIS && (Op2 == 2 || Op2 == 6) &&
+ !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE))
+ return false;
+
+ if (CRm == TLBI_CRm_IPAONS && (Op2 == 0 || Op2 == 4) &&
+ !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS))
+ return false;
+
+ if (CRm == TLBI_CRm_IPAONS && (Op2 == 3 || Op2 == 7) &&
+ !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE))
+ return false;
+
+ return true;
+}
+
+/* Only defined here as this is an internal "abstraction" */
+union tlbi_info {
+ struct {
+ u64 start;
+ u64 size;
+ } range;
+
+ struct {
+ u64 addr;
+ } ipa;
+
+ struct {
+ u64 addr;
+ u32 encoding;
+ } va;
+};
+
+static void s2_mmu_unmap_range(struct kvm_s2_mmu *mmu,
+ const union tlbi_info *info)
+{
+ kvm_stage2_unmap_range(mmu, info->range.start, info->range.size);
+}
+
+static bool handle_vmalls12e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+ u64 limit, vttbr;
+
+ if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) {
+ kvm_inject_undefined(vcpu);
+ return false;
+ }
+
+ vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
+ limit = BIT_ULL(kvm_get_pa_bits(vcpu->kvm));
+
+ kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr),
+ &(union tlbi_info) {
+ .range = {
+ .start = 0,
+ .size = limit,
+ },
+ },
+ s2_mmu_unmap_range);
+
+ return true;
+}
+
+static bool handle_ripas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+ u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
+ u64 base, range, tg, num, scale;
+ int shift;
+
+ if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) {
+ kvm_inject_undefined(vcpu);
+ return false;
+ }
+
+ /*
+ * Because the shadow S2 structure doesn't necessarily reflect that
+ * of the guest's S2 (different base granule size, for example), we
+ * decide to ignore TTL and only use the described range.
+ */
+ tg = FIELD_GET(GENMASK(47, 46), p->regval);
+ scale = FIELD_GET(GENMASK(45, 44), p->regval);
+ num = FIELD_GET(GENMASK(43, 39), p->regval);
+ base = p->regval & GENMASK(36, 0);
+
+ switch(tg) {
+ case 1:
+ shift = 12;
+ break;
+ case 2:
+ shift = 14;
+ break;
+ case 3:
+ default: /* IMPDEF: handle tg==0 as 64k */
+ shift = 16;
+ break;
+ }
+
+ base <<= shift;
+ range = __TLBI_RANGE_PAGES(num, scale) << shift;
+
+ kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr),
+ &(union tlbi_info) {
+ .range = {
+ .start = base,
+ .size = range,
+ },
+ },
+ s2_mmu_unmap_range);
+
+ return true;
+}
+
+static void s2_mmu_unmap_ipa(struct kvm_s2_mmu *mmu,
+ const union tlbi_info *info)
+{
+ unsigned long max_size;
+ u64 base_addr;
+
+ /*
+ * We drop a number of things from the supplied value:
+ *
+ * - NS bit: we're non-secure only.
+ *
+ * - IPA[51:48]: We don't support 52bit IPA just yet...
+ *
+ * And of course, adjust the IPA to be on an actual address.
+ */
+ base_addr = (info->ipa.addr & GENMASK_ULL(35, 0)) << 12;
+ max_size = compute_tlb_inval_range(mmu, info->ipa.addr);
+ base_addr &= ~(max_size - 1);
+
+ kvm_stage2_unmap_range(mmu, base_addr, max_size);
+}
+
+static bool handle_ipas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+ u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
+
+ if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) {
+ kvm_inject_undefined(vcpu);
+ return false;
+ }
+
+ kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr),
+ &(union tlbi_info) {
+ .ipa = {
+ .addr = p->regval,
+ },
+ },
+ s2_mmu_unmap_ipa);
+
+ return true;
+}
+
+static void s2_mmu_tlbi_s1e1(struct kvm_s2_mmu *mmu,
+ const union tlbi_info *info)
+{
+ WARN_ON(__kvm_tlbi_s1e2(mmu, info->va.addr, info->va.encoding));
+}
+
+static bool handle_tlbi_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+ u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
+
+ /*
+ * If we're here, this is because we've trapped on a EL1 TLBI
+ * instruction that affects the EL1 translation regime while
+ * we're running in a context that doesn't allow us to let the
+ * HW do its thing (aka vEL2):
+ *
+ * - HCR_EL2.E2H == 0 : a non-VHE guest
+ * - HCR_EL2.{E2H,TGE} == { 1, 0 } : a VHE guest in guest mode
+ *
+ * We don't expect these helpers to ever be called when running
+ * in a vEL1 context.
+ */
+
+ WARN_ON(!vcpu_is_el2(vcpu));
+
+ if (!kvm_supported_tlbi_s1e1_op(vcpu, sys_encoding)) {
+ kvm_inject_undefined(vcpu);
+ return false;
+ }
+
+ kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr),
+ &(union tlbi_info) {
+ .va = {
+ .addr = p->regval,
+ .encoding = sys_encoding,
+ },
+ },
+ s2_mmu_tlbi_s1e1);
+
+ return true;
+}
+
+#define SYS_INSN(insn, access_fn) \
+ { \
+ SYS_DESC(OP_##insn), \
+ .access = (access_fn), \
+ }
+
static struct sys_reg_desc sys_insn_descs[] = {
{ SYS_DESC(SYS_DC_ISW), access_dcsw },
{ SYS_DESC(SYS_DC_IGSW), access_dcgsw },
@@ -2751,9 +3071,147 @@ static struct sys_reg_desc sys_insn_descs[] = {
{ SYS_DESC(SYS_DC_CISW), access_dcsw },
{ SYS_DESC(SYS_DC_CIGSW), access_dcgsw },
{ SYS_DESC(SYS_DC_CIGDSW), access_dcgsw },
-};
-static const struct sys_reg_desc *first_idreg;
+ SYS_INSN(TLBI_VMALLE1OS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAE1OS, handle_tlbi_el1),
+ SYS_INSN(TLBI_ASIDE1OS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAAE1OS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VALE1OS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAALE1OS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_RVAE1IS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAAE1IS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVALE1IS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAALE1IS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_VMALLE1IS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAE1IS, handle_tlbi_el1),
+ SYS_INSN(TLBI_ASIDE1IS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAAE1IS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VALE1IS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAALE1IS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_RVAE1OS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAAE1OS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVALE1OS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAALE1OS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_RVAE1, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAAE1, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVALE1, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAALE1, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_VMALLE1, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAE1, handle_tlbi_el1),
+ SYS_INSN(TLBI_ASIDE1, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAAE1, handle_tlbi_el1),
+ SYS_INSN(TLBI_VALE1, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAALE1, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_VMALLE1OSNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAE1OSNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_ASIDE1OSNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAAE1OSNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VALE1OSNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAALE1OSNXS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_RVAE1ISNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAAE1ISNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVALE1ISNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAALE1ISNXS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_VMALLE1ISNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAE1ISNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_ASIDE1ISNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAAE1ISNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VALE1ISNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAALE1ISNXS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_RVAE1OSNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAAE1OSNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVALE1OSNXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAALE1OSNXS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_RVAE1NXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAAE1NXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVALE1NXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_RVAALE1NXS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_VMALLE1NXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAE1NXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_ASIDE1NXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAAE1NXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VALE1NXS, handle_tlbi_el1),
+ SYS_INSN(TLBI_VAALE1NXS, handle_tlbi_el1),
+
+ SYS_INSN(TLBI_IPAS2E1IS, handle_ipas2e1is),
+ SYS_INSN(TLBI_RIPAS2E1IS, handle_ripas2e1is),
+ SYS_INSN(TLBI_IPAS2LE1IS, handle_ipas2e1is),
+ SYS_INSN(TLBI_RIPAS2LE1IS, handle_ripas2e1is),
+
+ SYS_INSN(TLBI_ALLE2OS, trap_undef),
+ SYS_INSN(TLBI_VAE2OS, trap_undef),
+ SYS_INSN(TLBI_ALLE1OS, handle_alle1is),
+ SYS_INSN(TLBI_VALE2OS, trap_undef),
+ SYS_INSN(TLBI_VMALLS12E1OS, handle_vmalls12e1is),
+
+ SYS_INSN(TLBI_RVAE2IS, trap_undef),
+ SYS_INSN(TLBI_RVALE2IS, trap_undef),
+
+ SYS_INSN(TLBI_ALLE1IS, handle_alle1is),
+ SYS_INSN(TLBI_VMALLS12E1IS, handle_vmalls12e1is),
+ SYS_INSN(TLBI_IPAS2E1OS, handle_ipas2e1is),
+ SYS_INSN(TLBI_IPAS2E1, handle_ipas2e1is),
+ SYS_INSN(TLBI_RIPAS2E1, handle_ripas2e1is),
+ SYS_INSN(TLBI_RIPAS2E1OS, handle_ripas2e1is),
+ SYS_INSN(TLBI_IPAS2LE1OS, handle_ipas2e1is),
+ SYS_INSN(TLBI_IPAS2LE1, handle_ipas2e1is),
+ SYS_INSN(TLBI_RIPAS2LE1, handle_ripas2e1is),
+ SYS_INSN(TLBI_RIPAS2LE1OS, handle_ripas2e1is),
+ SYS_INSN(TLBI_RVAE2OS, trap_undef),
+ SYS_INSN(TLBI_RVALE2OS, trap_undef),
+ SYS_INSN(TLBI_RVAE2, trap_undef),
+ SYS_INSN(TLBI_RVALE2, trap_undef),
+ SYS_INSN(TLBI_ALLE1, handle_alle1is),
+ SYS_INSN(TLBI_VMALLS12E1, handle_vmalls12e1is),
+
+ SYS_INSN(TLBI_IPAS2E1ISNXS, handle_ipas2e1is),
+ SYS_INSN(TLBI_RIPAS2E1ISNXS, handle_ripas2e1is),
+ SYS_INSN(TLBI_IPAS2LE1ISNXS, handle_ipas2e1is),
+ SYS_INSN(TLBI_RIPAS2LE1ISNXS, handle_ripas2e1is),
+
+ SYS_INSN(TLBI_ALLE2OSNXS, trap_undef),
+ SYS_INSN(TLBI_VAE2OSNXS, trap_undef),
+ SYS_INSN(TLBI_ALLE1OSNXS, handle_alle1is),
+ SYS_INSN(TLBI_VALE2OSNXS, trap_undef),
+ SYS_INSN(TLBI_VMALLS12E1OSNXS, handle_vmalls12e1is),
+
+ SYS_INSN(TLBI_RVAE2ISNXS, trap_undef),
+ SYS_INSN(TLBI_RVALE2ISNXS, trap_undef),
+ SYS_INSN(TLBI_ALLE2ISNXS, trap_undef),
+ SYS_INSN(TLBI_VAE2ISNXS, trap_undef),
+
+ SYS_INSN(TLBI_ALLE1ISNXS, handle_alle1is),
+ SYS_INSN(TLBI_VALE2ISNXS, trap_undef),
+ SYS_INSN(TLBI_VMALLS12E1ISNXS, handle_vmalls12e1is),
+ SYS_INSN(TLBI_IPAS2E1OSNXS, handle_ipas2e1is),
+ SYS_INSN(TLBI_IPAS2E1NXS, handle_ipas2e1is),
+ SYS_INSN(TLBI_RIPAS2E1NXS, handle_ripas2e1is),
+ SYS_INSN(TLBI_RIPAS2E1OSNXS, handle_ripas2e1is),
+ SYS_INSN(TLBI_IPAS2LE1OSNXS, handle_ipas2e1is),
+ SYS_INSN(TLBI_IPAS2LE1NXS, handle_ipas2e1is),
+ SYS_INSN(TLBI_RIPAS2LE1NXS, handle_ripas2e1is),
+ SYS_INSN(TLBI_RIPAS2LE1OSNXS, handle_ripas2e1is),
+ SYS_INSN(TLBI_RVAE2OSNXS, trap_undef),
+ SYS_INSN(TLBI_RVALE2OSNXS, trap_undef),
+ SYS_INSN(TLBI_RVAE2NXS, trap_undef),
+ SYS_INSN(TLBI_RVALE2NXS, trap_undef),
+ SYS_INSN(TLBI_ALLE2NXS, trap_undef),
+ SYS_INSN(TLBI_VAE2NXS, trap_undef),
+ SYS_INSN(TLBI_ALLE1NXS, handle_alle1is),
+ SYS_INSN(TLBI_VALE2NXS, trap_undef),
+ SYS_INSN(TLBI_VMALLS12E1NXS, handle_vmalls12e1is),
+};
static bool trap_dbgdidr(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
@@ -2762,7 +3220,7 @@ static bool trap_dbgdidr(struct kvm_vcpu *vcpu,
if (p->is_write) {
return ignore_write(vcpu, p);
} else {
- u64 dfr = IDREG(vcpu->kvm, SYS_ID_AA64DFR0_EL1);
+ u64 dfr = kvm_read_vm_id_reg(vcpu->kvm, SYS_ID_AA64DFR0_EL1);
u32 el3 = kvm_has_feat(vcpu->kvm, ID_AA64PFR0_EL1, EL3, IMP);
p->regval = ((SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, dfr) << 28) |
@@ -3440,6 +3898,25 @@ static bool emulate_sys_reg(struct kvm_vcpu *vcpu,
return false;
}
+static const struct sys_reg_desc *idregs_debug_find(struct kvm *kvm, u8 pos)
+{
+ unsigned long i, idreg_idx = 0;
+
+ for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) {
+ const struct sys_reg_desc *r = &sys_reg_descs[i];
+
+ if (!is_vm_ftr_id_reg(reg_to_encoding(r)))
+ continue;
+
+ if (idreg_idx == pos)
+ return r;
+
+ idreg_idx++;
+ }
+
+ return NULL;
+}
+
static void *idregs_debug_start(struct seq_file *s, loff_t *pos)
{
struct kvm *kvm = s->private;
@@ -3451,7 +3928,7 @@ static void *idregs_debug_start(struct seq_file *s, loff_t *pos)
if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags) &&
*iter == (u8)~0) {
*iter = *pos;
- if (*iter >= KVM_ARM_ID_REG_NUM)
+ if (!idregs_debug_find(kvm, *iter))
iter = NULL;
} else {
iter = ERR_PTR(-EBUSY);
@@ -3468,7 +3945,7 @@ static void *idregs_debug_next(struct seq_file *s, void *v, loff_t *pos)
(*pos)++;
- if ((kvm->arch.idreg_debugfs_iter + 1) < KVM_ARM_ID_REG_NUM) {
+ if (idregs_debug_find(kvm, kvm->arch.idreg_debugfs_iter + 1)) {
kvm->arch.idreg_debugfs_iter++;
return &kvm->arch.idreg_debugfs_iter;
@@ -3493,16 +3970,16 @@ static void idregs_debug_stop(struct seq_file *s, void *v)
static int idregs_debug_show(struct seq_file *s, void *v)
{
- struct kvm *kvm = s->private;
const struct sys_reg_desc *desc;
+ struct kvm *kvm = s->private;
- desc = first_idreg + kvm->arch.idreg_debugfs_iter;
+ desc = idregs_debug_find(kvm, kvm->arch.idreg_debugfs_iter);
if (!desc->name)
return 0;
seq_printf(s, "%20s:\t%016llx\n",
- desc->name, IDREG(kvm, IDX_IDREG(kvm->arch.idreg_debugfs_iter)));
+ desc->name, kvm_read_vm_id_reg(kvm, reg_to_encoding(desc)));
return 0;
}
@@ -3532,8 +4009,7 @@ static void reset_vm_ftr_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc
if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags))
return;
- lockdep_assert_held(&kvm->arch.config_lock);
- IDREG(kvm, id) = reg->reset(vcpu, reg);
+ kvm_set_vm_id_reg(kvm, id, reg->reset(vcpu, reg));
}
static void reset_vcpu_ftr_id_reg(struct kvm_vcpu *vcpu,
@@ -3686,8 +4162,8 @@ id_to_sys_reg_desc(struct kvm_vcpu *vcpu, u64 id,
*/
#define FUNCTION_INVARIANT(reg) \
- static u64 get_##reg(struct kvm_vcpu *v, \
- const struct sys_reg_desc *r) \
+ static u64 reset_##reg(struct kvm_vcpu *v, \
+ const struct sys_reg_desc *r) \
{ \
((struct sys_reg_desc *)r)->val = read_sysreg(reg); \
return ((struct sys_reg_desc *)r)->val; \
@@ -3697,18 +4173,11 @@ FUNCTION_INVARIANT(midr_el1)
FUNCTION_INVARIANT(revidr_el1)
FUNCTION_INVARIANT(aidr_el1)
-static u64 get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r)
-{
- ((struct sys_reg_desc *)r)->val = read_sanitised_ftr_reg(SYS_CTR_EL0);
- return ((struct sys_reg_desc *)r)->val;
-}
-
/* ->val is filled in by kvm_sys_reg_table_init() */
static struct sys_reg_desc invariant_sys_regs[] __ro_after_init = {
- { SYS_DESC(SYS_MIDR_EL1), NULL, get_midr_el1 },
- { SYS_DESC(SYS_REVIDR_EL1), NULL, get_revidr_el1 },
- { SYS_DESC(SYS_AIDR_EL1), NULL, get_aidr_el1 },
- { SYS_DESC(SYS_CTR_EL0), NULL, get_ctr_el0 },
+ { SYS_DESC(SYS_MIDR_EL1), NULL, reset_midr_el1 },
+ { SYS_DESC(SYS_REVIDR_EL1), NULL, reset_revidr_el1 },
+ { SYS_DESC(SYS_AIDR_EL1), NULL, reset_aidr_el1 },
};
static int get_invariant_sys_reg(u64 id, u64 __user *uaddr)
@@ -4019,20 +4488,11 @@ int kvm_vm_ioctl_get_reg_writable_masks(struct kvm *kvm, struct reg_mask_range *
if (!is_feature_id_reg(encoding) || !reg->set_user)
continue;
- /*
- * For ID registers, we return the writable mask. Other feature
- * registers return a full 64bit mask. That's not necessary
- * compliant with a given revision of the architecture, but the
- * RES0/RES1 definitions allow us to do that.
- */
- if (is_vm_ftr_id_reg(encoding)) {
- if (!reg->val ||
- (is_aa32_id_reg(encoding) && !kvm_supports_32bit_el0()))
- continue;
- val = reg->val;
- } else {
- val = ~0UL;
+ if (!reg->val ||
+ (is_aa32_id_reg(encoding) && !kvm_supports_32bit_el0())) {
+ continue;
}
+ val = reg->val;
if (put_user(val, (masks + KVM_ARM_FEATURE_ID_RANGE_INDEX(encoding))))
return -EFAULT;
@@ -4041,11 +4501,34 @@ int kvm_vm_ioctl_get_reg_writable_masks(struct kvm *kvm, struct reg_mask_range *
return 0;
}
-void kvm_init_sysreg(struct kvm_vcpu *vcpu)
+static void vcpu_set_hcr(struct kvm_vcpu *vcpu)
{
struct kvm *kvm = vcpu->kvm;
- mutex_lock(&kvm->arch.config_lock);
+ if (has_vhe() || has_hvhe())
+ vcpu->arch.hcr_el2 |= HCR_E2H;
+ if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) {
+ /* route synchronous external abort exceptions to EL2 */
+ vcpu->arch.hcr_el2 |= HCR_TEA;
+ /* trap error record accesses */
+ vcpu->arch.hcr_el2 |= HCR_TERR;
+ }
+
+ if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
+ vcpu->arch.hcr_el2 |= HCR_FWB;
+
+ if (cpus_have_final_cap(ARM64_HAS_EVT) &&
+ !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE) &&
+ kvm_read_vm_id_reg(kvm, SYS_CTR_EL0) == read_sanitised_ftr_reg(SYS_CTR_EL0))
+ vcpu->arch.hcr_el2 |= HCR_TID4;
+ else
+ vcpu->arch.hcr_el2 |= HCR_TID2;
+
+ if (vcpu_el1_is_32bit(vcpu))
+ vcpu->arch.hcr_el2 &= ~HCR_RW;
+
+ if (kvm_has_mte(vcpu->kvm))
+ vcpu->arch.hcr_el2 |= HCR_ATA;
/*
* In the absence of FGT, we cannot independently trap TLBI
@@ -4054,12 +4537,29 @@ void kvm_init_sysreg(struct kvm_vcpu *vcpu)
*/
if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS))
vcpu->arch.hcr_el2 |= HCR_TTLBOS;
+}
+
+void kvm_calculate_traps(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ mutex_lock(&kvm->arch.config_lock);
+ vcpu_set_hcr(vcpu);
if (cpus_have_final_cap(ARM64_HAS_HCX)) {
- vcpu->arch.hcrx_el2 = HCRX_GUEST_FLAGS;
+ /*
+ * In general, all HCRX_EL2 bits are gated by a feature.
+ * The only reason we can set SMPME without checking any
+ * feature is that its effects are not directly observable
+ * from the guest.
+ */
+ vcpu->arch.hcrx_el2 = HCRX_EL2_SMPME;
if (kvm_has_feat(kvm, ID_AA64ISAR2_EL1, MOPS, IMP))
vcpu->arch.hcrx_el2 |= (HCRX_EL2_MSCEn | HCRX_EL2_MCE2);
+
+ if (kvm_has_feat(kvm, ID_AA64MMFR3_EL1, TCRX, IMP))
+ vcpu->arch.hcrx_el2 |= HCRX_EL2_TCR2En;
}
if (test_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags))
@@ -4115,7 +4615,6 @@ out:
int __init kvm_sys_reg_table_init(void)
{
- struct sys_reg_params params;
bool valid = true;
unsigned int i;
int ret = 0;
@@ -4136,12 +4635,6 @@ int __init kvm_sys_reg_table_init(void)
for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++)
invariant_sys_regs[i].reset(NULL, &invariant_sys_regs[i]);
- /* Find the first idreg (SYS_ID_PFR0_EL1) in sys_reg_descs. */
- params = encoding_to_params(SYS_ID_PFR0_EL1);
- first_idreg = find_reg(&params, sys_reg_descs, ARRAY_SIZE(sys_reg_descs));
- if (!first_idreg)
- return -EINVAL;
-
ret = populate_nv_trap_config();
for (i = 0; !ret && i < ARRAY_SIZE(sys_reg_descs); i++)
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index ddc042895d01..b81d0eba5c7e 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -649,6 +649,17 @@ config PARAVIRT
over full virtualization. However, when run without a hypervisor
the kernel is theoretically slower and slightly larger.
+config PARAVIRT_TIME_ACCOUNTING
+ bool "Paravirtual steal time accounting"
+ depends on PARAVIRT
+ help
+ Select this option to enable fine granularity task steal time
+ accounting. Time spent executing other tasks in parallel with
+ the current vCPU is discounted from the vCPU power. To account for
+ that, there can be a small performance impact.
+
+ If in doubt, say N here.
+
endmenu
config ARCH_SELECT_MEMORY_MODEL
diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h
index c87b6ea0ec47..44b54965f5b4 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -30,12 +30,17 @@
#define KVM_PRIVATE_MEM_SLOTS 0
#define KVM_HALT_POLL_NS_DEFAULT 500000
+#define KVM_REQ_TLB_FLUSH_GPA KVM_ARCH_REQ(0)
+#define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(1)
#define KVM_GUESTDBG_SW_BP_MASK \
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)
#define KVM_GUESTDBG_VALID_MASK \
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP | KVM_GUESTDBG_SINGLESTEP)
+#define KVM_DIRTY_LOG_MANUAL_CAPS \
+ (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | KVM_DIRTY_LOG_INITIALLY_SET)
+
struct kvm_vm_stat {
struct kvm_vm_stat_generic generic;
u64 pages;
@@ -190,6 +195,7 @@ struct kvm_vcpu_arch {
/* vcpu's vpid */
u64 vpid;
+ gpa_t flush_gpa;
/* Frequency of stable timer in Hz */
u64 timer_mhz;
@@ -201,6 +207,13 @@ struct kvm_vcpu_arch {
struct kvm_mp_state mp_state;
/* cpucfg */
u32 cpucfg[KVM_MAX_CPUCFG_REGS];
+
+ /* paravirt steal time */
+ struct {
+ u64 guest_addr;
+ u64 last_steal;
+ struct gfn_to_hva_cache cache;
+ } st;
};
static inline unsigned long readl_sw_gcsr(struct loongarch_csrs *csr, int reg)
@@ -261,7 +274,6 @@ static inline bool kvm_is_ifetch_fault(struct kvm_vcpu_arch *arch)
static inline void kvm_arch_hardware_unsetup(void) {}
static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}
-static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
diff --git a/arch/loongarch/include/asm/kvm_para.h b/arch/loongarch/include/asm/kvm_para.h
index 4ba2312e5f8c..335fb86778e2 100644
--- a/arch/loongarch/include/asm/kvm_para.h
+++ b/arch/loongarch/include/asm/kvm_para.h
@@ -14,6 +14,7 @@
#define KVM_HCALL_SERVICE HYPERCALL_ENCODE(HYPERVISOR_KVM, KVM_HCALL_CODE_SERVICE)
#define KVM_HCALL_FUNC_IPI 1
+#define KVM_HCALL_FUNC_NOTIFY 2
#define KVM_HCALL_SWDBG HYPERCALL_ENCODE(HYPERVISOR_KVM, KVM_HCALL_CODE_SWDBG)
@@ -24,6 +25,16 @@
#define KVM_HCALL_INVALID_CODE -1UL
#define KVM_HCALL_INVALID_PARAMETER -2UL
+#define KVM_STEAL_PHYS_VALID BIT_ULL(0)
+#define KVM_STEAL_PHYS_MASK GENMASK_ULL(63, 6)
+
+struct kvm_steal_time {
+ __u64 steal;
+ __u32 version;
+ __u32 flags;
+ __u32 pad[12];
+};
+
/*
* Hypercall interface for KVM hypervisor
*
diff --git a/arch/loongarch/include/asm/kvm_vcpu.h b/arch/loongarch/include/asm/kvm_vcpu.h
index 590a92cb5416..c416cb7125c0 100644
--- a/arch/loongarch/include/asm/kvm_vcpu.h
+++ b/arch/loongarch/include/asm/kvm_vcpu.h
@@ -120,4 +120,9 @@ static inline void kvm_write_reg(struct kvm_vcpu *vcpu, int num, unsigned long v
vcpu->arch.gprs[num] = val;
}
+static inline bool kvm_pvtime_supported(void)
+{
+ return !!sched_info_on();
+}
+
#endif /* __ASM_LOONGARCH_KVM_VCPU_H__ */
diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h
index eb09adda54b7..7a4633ef284b 100644
--- a/arch/loongarch/include/asm/loongarch.h
+++ b/arch/loongarch/include/asm/loongarch.h
@@ -169,6 +169,7 @@
#define KVM_SIGNATURE "KVM\0"
#define CPUCFG_KVM_FEATURE (CPUCFG_KVM_BASE + 4)
#define KVM_FEATURE_IPI BIT(1)
+#define KVM_FEATURE_STEAL_TIME BIT(2)
#ifndef __ASSEMBLY__
diff --git a/arch/loongarch/include/asm/paravirt.h b/arch/loongarch/include/asm/paravirt.h
index 0965710f47f2..dddec49671ae 100644
--- a/arch/loongarch/include/asm/paravirt.h
+++ b/arch/loongarch/include/asm/paravirt.h
@@ -18,6 +18,7 @@ static inline u64 paravirt_steal_clock(int cpu)
}
int __init pv_ipi_init(void);
+int __init pv_time_init(void);
#else
@@ -26,5 +27,9 @@ static inline int pv_ipi_init(void)
return 0;
}
+static inline int pv_time_init(void)
+{
+ return 0;
+}
#endif // CONFIG_PARAVIRT
#endif
diff --git a/arch/loongarch/include/uapi/asm/kvm.h b/arch/loongarch/include/uapi/asm/kvm.h
index f9abef382317..ddc5cab0ffd0 100644
--- a/arch/loongarch/include/uapi/asm/kvm.h
+++ b/arch/loongarch/include/uapi/asm/kvm.h
@@ -81,7 +81,11 @@ struct kvm_fpu {
#define LOONGARCH_REG_64(TYPE, REG) (TYPE | KVM_REG_SIZE_U64 | (REG << LOONGARCH_REG_SHIFT))
#define KVM_IOC_CSRID(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CSR, REG)
#define KVM_IOC_CPUCFG(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CPUCFG, REG)
+
+/* Device Control API on vcpu fd */
#define KVM_LOONGARCH_VCPU_CPUCFG 0
+#define KVM_LOONGARCH_VCPU_PVTIME_CTRL 1
+#define KVM_LOONGARCH_VCPU_PVTIME_GPA 0
struct kvm_debug_exit_arch {
};
diff --git a/arch/loongarch/kernel/paravirt.c b/arch/loongarch/kernel/paravirt.c
index 1633ed4f692f..9abe8b71aa48 100644
--- a/arch/loongarch/kernel/paravirt.c
+++ b/arch/loongarch/kernel/paravirt.c
@@ -4,11 +4,14 @@
#include <linux/interrupt.h>
#include <linux/jump_label.h>
#include <linux/kvm_para.h>
+#include <linux/reboot.h>
#include <linux/static_call.h>
#include <asm/paravirt.h>
+static int has_steal_clock;
struct static_key paravirt_steal_enabled;
struct static_key paravirt_steal_rq_enabled;
+static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
static u64 native_steal_clock(int cpu)
{
@@ -17,6 +20,34 @@ static u64 native_steal_clock(int cpu)
DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock);
+static bool steal_acc = true;
+
+static int __init parse_no_stealacc(char *arg)
+{
+ steal_acc = false;
+ return 0;
+}
+early_param("no-steal-acc", parse_no_stealacc);
+
+static u64 paravt_steal_clock(int cpu)
+{
+ int version;
+ u64 steal;
+ struct kvm_steal_time *src;
+
+ src = &per_cpu(steal_time, cpu);
+ do {
+
+ version = src->version;
+ virt_rmb(); /* Make sure that the version is read before the steal */
+ steal = src->steal;
+ virt_rmb(); /* Make sure that the steal is read before the next version */
+
+ } while ((version & 1) || (version != src->version));
+
+ return steal;
+}
+
#ifdef CONFIG_SMP
static void pv_send_ipi_single(int cpu, unsigned int action)
{
@@ -149,3 +180,117 @@ int __init pv_ipi_init(void)
return 0;
}
+
+static int pv_enable_steal_time(void)
+{
+ int cpu = smp_processor_id();
+ unsigned long addr;
+ struct kvm_steal_time *st;
+
+ if (!has_steal_clock)
+ return -EPERM;
+
+ st = &per_cpu(steal_time, cpu);
+ addr = per_cpu_ptr_to_phys(st);
+
+ /* The whole structure kvm_steal_time should be in one page */
+ if (PFN_DOWN(addr) != PFN_DOWN(addr + sizeof(*st))) {
+ pr_warn("Illegal PV steal time addr %lx\n", addr);
+ return -EFAULT;
+ }
+
+ addr |= KVM_STEAL_PHYS_VALID;
+ kvm_hypercall2(KVM_HCALL_FUNC_NOTIFY, KVM_FEATURE_STEAL_TIME, addr);
+
+ return 0;
+}
+
+static void pv_disable_steal_time(void)
+{
+ if (has_steal_clock)
+ kvm_hypercall2(KVM_HCALL_FUNC_NOTIFY, KVM_FEATURE_STEAL_TIME, 0);
+}
+
+#ifdef CONFIG_SMP
+static int pv_time_cpu_online(unsigned int cpu)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ pv_enable_steal_time();
+ local_irq_restore(flags);
+
+ return 0;
+}
+
+static int pv_time_cpu_down_prepare(unsigned int cpu)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ pv_disable_steal_time();
+ local_irq_restore(flags);
+
+ return 0;
+}
+#endif
+
+static void pv_cpu_reboot(void *unused)
+{
+ pv_disable_steal_time();
+}
+
+static int pv_reboot_notify(struct notifier_block *nb, unsigned long code, void *unused)
+{
+ on_each_cpu(pv_cpu_reboot, NULL, 1);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block pv_reboot_nb = {
+ .notifier_call = pv_reboot_notify,
+};
+
+int __init pv_time_init(void)
+{
+ int r, feature;
+
+ if (!cpu_has_hypervisor)
+ return 0;
+ if (!kvm_para_available())
+ return 0;
+
+ feature = read_cpucfg(CPUCFG_KVM_FEATURE);
+ if (!(feature & KVM_FEATURE_STEAL_TIME))
+ return 0;
+
+ has_steal_clock = 1;
+ r = pv_enable_steal_time();
+ if (r < 0) {
+ has_steal_clock = 0;
+ return 0;
+ }
+ register_reboot_notifier(&pv_reboot_nb);
+
+#ifdef CONFIG_SMP
+ r = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+ "loongarch/pv_time:online",
+ pv_time_cpu_online, pv_time_cpu_down_prepare);
+ if (r < 0) {
+ has_steal_clock = 0;
+ pr_err("Failed to install cpu hotplug callbacks\n");
+ return r;
+ }
+#endif
+
+ static_call_update(pv_steal_clock, paravt_steal_clock);
+
+ static_key_slow_inc(&paravirt_steal_enabled);
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ if (steal_acc)
+ static_key_slow_inc(&paravirt_steal_rq_enabled);
+#endif
+
+ pr_info("Using paravirt steal-time\n");
+
+ return 0;
+}
diff --git a/arch/loongarch/kernel/time.c b/arch/loongarch/kernel/time.c
index fd5354f9be7c..46d7d40c87e3 100644
--- a/arch/loongarch/kernel/time.c
+++ b/arch/loongarch/kernel/time.c
@@ -15,6 +15,7 @@
#include <asm/cpu-features.h>
#include <asm/loongarch.h>
+#include <asm/paravirt.h>
#include <asm/time.h>
u64 cpu_clock_freq;
@@ -214,4 +215,5 @@ void __init time_init(void)
constant_clockevent_init();
constant_clocksource_init();
+ pv_time_init();
}
diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig
index c4ef2b4d9797..248744b4d086 100644
--- a/arch/loongarch/kvm/Kconfig
+++ b/arch/loongarch/kvm/Kconfig
@@ -29,6 +29,7 @@ config KVM
select KVM_MMIO
select HAVE_KVM_READONLY_MEM
select KVM_XFER_TO_GUEST_WORK
+ select SCHED_INFO
help
Support hosting virtualized guest machines using
hardware virtualization extensions. You will need
diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c
index a68573e091c0..ea73f9dc2cc6 100644
--- a/arch/loongarch/kvm/exit.c
+++ b/arch/loongarch/kvm/exit.c
@@ -24,7 +24,7 @@
static int kvm_emu_cpucfg(struct kvm_vcpu *vcpu, larch_inst inst)
{
int rd, rj;
- unsigned int index;
+ unsigned int index, ret;
if (inst.reg2_format.opcode != cpucfg_op)
return EMULATE_FAIL;
@@ -50,7 +50,10 @@ static int kvm_emu_cpucfg(struct kvm_vcpu *vcpu, larch_inst inst)
vcpu->arch.gprs[rd] = *(unsigned int *)KVM_SIGNATURE;
break;
case CPUCFG_KVM_FEATURE:
- vcpu->arch.gprs[rd] = KVM_FEATURE_IPI;
+ ret = KVM_FEATURE_IPI;
+ if (kvm_pvtime_supported())
+ ret |= KVM_FEATURE_STEAL_TIME;
+ vcpu->arch.gprs[rd] = ret;
break;
default:
vcpu->arch.gprs[rd] = 0;
@@ -687,6 +690,34 @@ static int kvm_handle_fpu_disabled(struct kvm_vcpu *vcpu)
return RESUME_GUEST;
}
+static long kvm_save_notify(struct kvm_vcpu *vcpu)
+{
+ unsigned long id, data;
+
+ id = kvm_read_reg(vcpu, LOONGARCH_GPR_A1);
+ data = kvm_read_reg(vcpu, LOONGARCH_GPR_A2);
+ switch (id) {
+ case KVM_FEATURE_STEAL_TIME:
+ if (!kvm_pvtime_supported())
+ return KVM_HCALL_INVALID_CODE;
+
+ if (data & ~(KVM_STEAL_PHYS_MASK | KVM_STEAL_PHYS_VALID))
+ return KVM_HCALL_INVALID_PARAMETER;
+
+ vcpu->arch.st.guest_addr = data;
+ if (!(data & KVM_STEAL_PHYS_VALID))
+ break;
+
+ vcpu->arch.st.last_steal = current->sched_info.run_delay;
+ kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
+ break;
+ default:
+ break;
+ };
+
+ return 0;
+};
+
/*
* kvm_handle_lsx_disabled() - Guest used LSX while disabled in root.
* @vcpu: Virtual CPU context.
@@ -758,6 +789,9 @@ static void kvm_handle_service(struct kvm_vcpu *vcpu)
kvm_send_pv_ipi(vcpu);
ret = KVM_HCALL_SUCCESS;
break;
+ case KVM_HCALL_FUNC_NOTIFY:
+ ret = kvm_save_notify(vcpu);
+ break;
default:
ret = KVM_HCALL_INVALID_CODE;
break;
diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c
index 86a2f2d0cb27..844736b99d38 100644
--- a/arch/loongarch/kvm/main.c
+++ b/arch/loongarch/kvm/main.c
@@ -242,6 +242,7 @@ void kvm_check_vpid(struct kvm_vcpu *vcpu)
kvm_update_vpid(vcpu, cpu);
trace_kvm_vpid_change(vcpu, vcpu->arch.vpid);
vcpu->cpu = cpu;
+ kvm_clear_request(KVM_REQ_TLB_FLUSH_GPA, vcpu);
}
/* Restore GSTAT(0x50).vpid */
diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c
index 98883aa23ab8..2634a9e8d82c 100644
--- a/arch/loongarch/kvm/mmu.c
+++ b/arch/loongarch/kvm/mmu.c
@@ -163,6 +163,7 @@ static kvm_pte_t *kvm_populate_gpa(struct kvm *kvm,
child = kvm_mmu_memory_cache_alloc(cache);
_kvm_pte_init(child, ctx.invalid_ptes[ctx.level - 1]);
+ smp_wmb(); /* Make pte visible before pmd */
kvm_set_pte(entry, __pa(child));
} else if (kvm_pte_huge(*entry)) {
return entry;
@@ -444,6 +445,17 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
enum kvm_mr_change change)
{
int needs_flush;
+ u32 old_flags = old ? old->flags : 0;
+ u32 new_flags = new ? new->flags : 0;
+ bool log_dirty_pages = new_flags & KVM_MEM_LOG_DIRTY_PAGES;
+
+ /* Only track memslot flags changed */
+ if (change != KVM_MR_FLAGS_ONLY)
+ return;
+
+ /* Discard dirty page tracking on readonly memslot */
+ if ((old_flags & new_flags) & KVM_MEM_READONLY)
+ return;
/*
* If dirty page logging is enabled, write protect all pages in the slot
@@ -454,9 +466,14 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
* MOVE/DELETE: The old mappings will already have been cleaned up by
* kvm_arch_flush_shadow_memslot()
*/
- if (change == KVM_MR_FLAGS_ONLY &&
- (!(old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
- new->flags & KVM_MEM_LOG_DIRTY_PAGES)) {
+ if (!(old_flags & KVM_MEM_LOG_DIRTY_PAGES) && log_dirty_pages) {
+ /*
+ * Initially-all-set does not require write protecting any page
+ * because they're all assumed to be dirty.
+ */
+ if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+ return;
+
spin_lock(&kvm->mmu_lock);
/* Write protect GPA page table entries */
needs_flush = kvm_mkclean_gpa_pt(kvm, new->base_gfn,
@@ -540,6 +557,7 @@ static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool writ
gfn_t gfn = gpa >> PAGE_SHIFT;
struct kvm *kvm = vcpu->kvm;
struct kvm_memory_slot *slot;
+ struct page *page;
spin_lock(&kvm->mmu_lock);
@@ -551,10 +569,8 @@ static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool writ
}
/* Track access to pages marked old */
- new = *ptep;
- if (!kvm_pte_young(new))
- new = kvm_pte_mkyoung(new);
- /* call kvm_set_pfn_accessed() after unlock */
+ new = kvm_pte_mkyoung(*ptep);
+ /* call kvm_set_pfn_accessed() after unlock */
if (write && !kvm_pte_dirty(new)) {
if (!kvm_pte_write(new)) {
@@ -582,19 +598,22 @@ static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool writ
if (changed) {
kvm_set_pte(ptep, new);
pfn = kvm_pte_pfn(new);
+ page = kvm_pfn_to_refcounted_page(pfn);
+ if (page)
+ get_page(page);
}
spin_unlock(&kvm->mmu_lock);
- /*
- * Fixme: pfn may be freed after mmu_lock
- * kvm_try_get_pfn(pfn)/kvm_release_pfn pair to prevent this?
- */
- if (kvm_pte_young(changed))
- kvm_set_pfn_accessed(pfn);
+ if (changed) {
+ if (kvm_pte_young(changed))
+ kvm_set_pfn_accessed(pfn);
- if (kvm_pte_dirty(changed)) {
- mark_page_dirty(kvm, gfn);
- kvm_set_pfn_dirty(pfn);
+ if (kvm_pte_dirty(changed)) {
+ mark_page_dirty(kvm, gfn);
+ kvm_set_pfn_dirty(pfn);
+ }
+ if (page)
+ put_page(page);
}
return ret;
out:
@@ -737,6 +756,7 @@ static kvm_pte_t *kvm_split_huge(struct kvm_vcpu *vcpu, kvm_pte_t *ptep, gfn_t g
val += PAGE_SIZE;
}
+ smp_wmb(); /* Make pte visible before pmd */
/* The later kvm_flush_tlb_gpa() will flush hugepage tlb */
kvm_set_pte(ptep, __pa(child));
@@ -858,11 +878,21 @@ retry:
/* Disable dirty logging on HugePages */
level = 0;
- if (!fault_supports_huge_mapping(memslot, hva, write)) {
- level = 0;
- } else {
+ if (fault_supports_huge_mapping(memslot, hva, write)) {
+ /* Check page level about host mmu*/
level = host_pfn_mapping_level(kvm, gfn, memslot);
if (level == 1) {
+ /*
+ * Check page level about secondary mmu
+ * Disable hugepage if it is normal page on
+ * secondary mmu already
+ */
+ ptep = kvm_populate_gpa(kvm, NULL, gpa, 0);
+ if (ptep && !kvm_pte_huge(*ptep))
+ level = 0;
+ }
+
+ if (level == 1) {
gfn = gfn & ~(PTRS_PER_PTE - 1);
pfn = pfn & ~(PTRS_PER_PTE - 1);
}
@@ -892,7 +922,6 @@ retry:
kvm_set_pfn_dirty(pfn);
}
- kvm_set_pfn_accessed(pfn);
kvm_release_pfn_clean(pfn);
out:
srcu_read_unlock(&kvm->srcu, srcu_idx);
@@ -908,7 +937,8 @@ int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
return ret;
/* Invalidate this entry in the TLB */
- kvm_flush_tlb_gpa(vcpu, gpa);
+ vcpu->arch.flush_gpa = gpa;
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GPA, vcpu);
return 0;
}
diff --git a/arch/loongarch/kvm/tlb.c b/arch/loongarch/kvm/tlb.c
index 02535df6b51f..ebdbe9264e9c 100644
--- a/arch/loongarch/kvm/tlb.c
+++ b/arch/loongarch/kvm/tlb.c
@@ -23,10 +23,7 @@ void kvm_flush_tlb_all(void)
void kvm_flush_tlb_gpa(struct kvm_vcpu *vcpu, unsigned long gpa)
{
- unsigned long flags;
-
- local_irq_save(flags);
+ lockdep_assert_irqs_disabled();
gpa &= (PAGE_MASK << 1);
invtlb(INVTLB_GID_ADDR, read_csr_gstat() & CSR_GSTAT_GID, gpa);
- local_irq_restore(flags);
}
diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c
index 9e8030d45129..16756ffb55e8 100644
--- a/arch/loongarch/kvm/vcpu.c
+++ b/arch/loongarch/kvm/vcpu.c
@@ -31,6 +31,50 @@ const struct kvm_stats_header kvm_vcpu_stats_header = {
sizeof(kvm_vcpu_stats_desc),
};
+static void kvm_update_stolen_time(struct kvm_vcpu *vcpu)
+{
+ u32 version;
+ u64 steal;
+ gpa_t gpa;
+ struct kvm_memslots *slots;
+ struct kvm_steal_time __user *st;
+ struct gfn_to_hva_cache *ghc;
+
+ ghc = &vcpu->arch.st.cache;
+ gpa = vcpu->arch.st.guest_addr;
+ if (!(gpa & KVM_STEAL_PHYS_VALID))
+ return;
+
+ gpa &= KVM_STEAL_PHYS_MASK;
+ slots = kvm_memslots(vcpu->kvm);
+ if (slots->generation != ghc->generation || gpa != ghc->gpa) {
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st))) {
+ ghc->gpa = INVALID_GPA;
+ return;
+ }
+ }
+
+ st = (struct kvm_steal_time __user *)ghc->hva;
+ unsafe_get_user(version, &st->version, out);
+ if (version & 1)
+ version += 1; /* first time write, random junk */
+
+ version += 1;
+ unsafe_put_user(version, &st->version, out);
+ smp_wmb();
+
+ unsafe_get_user(steal, &st->steal, out);
+ steal += current->sched_info.run_delay - vcpu->arch.st.last_steal;
+ vcpu->arch.st.last_steal = current->sched_info.run_delay;
+ unsafe_put_user(steal, &st->steal, out);
+
+ smp_wmb();
+ version += 1;
+ unsafe_put_user(version, &st->version, out);
+out:
+ mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
+}
+
/*
* kvm_check_requests - check and handle pending vCPU requests
*
@@ -48,9 +92,22 @@ static int kvm_check_requests(struct kvm_vcpu *vcpu)
if (kvm_dirty_ring_check_request(vcpu))
return RESUME_HOST;
+ if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
+ kvm_update_stolen_time(vcpu);
+
return RESUME_GUEST;
}
+static void kvm_late_check_requests(struct kvm_vcpu *vcpu)
+{
+ lockdep_assert_irqs_disabled();
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_GPA, vcpu))
+ if (vcpu->arch.flush_gpa != INVALID_GPA) {
+ kvm_flush_tlb_gpa(vcpu, vcpu->arch.flush_gpa);
+ vcpu->arch.flush_gpa = INVALID_GPA;
+ }
+}
+
/*
* Check and handle pending signal and vCPU requests etc
* Run with irq enabled and preempt enabled
@@ -101,6 +158,13 @@ static int kvm_pre_enter_guest(struct kvm_vcpu *vcpu)
/* Make sure the vcpu mode has been written */
smp_store_mb(vcpu->mode, IN_GUEST_MODE);
kvm_check_vpid(vcpu);
+
+ /*
+ * Called after function kvm_check_vpid()
+ * Since it updates CSR.GSTAT used by kvm_flush_tlb_gpa(),
+ * and it may also clear KVM_REQ_TLB_FLUSH_GPA pending bit
+ */
+ kvm_late_check_requests(vcpu);
vcpu->arch.host_eentry = csr_read64(LOONGARCH_CSR_EENTRY);
/* Clear KVM_LARCH_SWCSR_LATEST as CSR will change when enter guest */
vcpu->arch.aux_inuse &= ~KVM_LARCH_SWCSR_LATEST;
@@ -354,6 +418,17 @@ static int _kvm_getcsr(struct kvm_vcpu *vcpu, unsigned int id, u64 *val)
return -EINVAL;
if (id == LOONGARCH_CSR_ESTAT) {
+ preempt_disable();
+ vcpu_load(vcpu);
+ /*
+ * Sync pending interrupts into ESTAT so that interrupt
+ * remains during VM migration stage
+ */
+ kvm_deliver_intr(vcpu);
+ vcpu->arch.aux_inuse &= ~KVM_LARCH_SWCSR_LATEST;
+ vcpu_put(vcpu);
+ preempt_enable();
+
/* ESTAT IP0~IP7 get from GINTC */
gintc = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_GINTC) & 0xff;
*val = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_ESTAT) | (gintc << 2);
@@ -662,6 +737,16 @@ static int kvm_loongarch_cpucfg_has_attr(struct kvm_vcpu *vcpu,
return -ENXIO;
}
+static int kvm_loongarch_pvtime_has_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ if (!kvm_pvtime_supported() ||
+ attr->attr != KVM_LOONGARCH_VCPU_PVTIME_GPA)
+ return -ENXIO;
+
+ return 0;
+}
+
static int kvm_loongarch_vcpu_has_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr)
{
@@ -671,6 +756,9 @@ static int kvm_loongarch_vcpu_has_attr(struct kvm_vcpu *vcpu,
case KVM_LOONGARCH_VCPU_CPUCFG:
ret = kvm_loongarch_cpucfg_has_attr(vcpu, attr);
break;
+ case KVM_LOONGARCH_VCPU_PVTIME_CTRL:
+ ret = kvm_loongarch_pvtime_has_attr(vcpu, attr);
+ break;
default:
break;
}
@@ -678,7 +766,7 @@ static int kvm_loongarch_vcpu_has_attr(struct kvm_vcpu *vcpu,
return ret;
}
-static int kvm_loongarch_get_cpucfg_attr(struct kvm_vcpu *vcpu,
+static int kvm_loongarch_cpucfg_get_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr)
{
int ret = 0;
@@ -694,6 +782,23 @@ static int kvm_loongarch_get_cpucfg_attr(struct kvm_vcpu *vcpu,
return ret;
}
+static int kvm_loongarch_pvtime_get_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ u64 gpa;
+ u64 __user *user = (u64 __user *)attr->addr;
+
+ if (!kvm_pvtime_supported() ||
+ attr->attr != KVM_LOONGARCH_VCPU_PVTIME_GPA)
+ return -ENXIO;
+
+ gpa = vcpu->arch.st.guest_addr;
+ if (put_user(gpa, user))
+ return -EFAULT;
+
+ return 0;
+}
+
static int kvm_loongarch_vcpu_get_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr)
{
@@ -701,7 +806,10 @@ static int kvm_loongarch_vcpu_get_attr(struct kvm_vcpu *vcpu,
switch (attr->group) {
case KVM_LOONGARCH_VCPU_CPUCFG:
- ret = kvm_loongarch_get_cpucfg_attr(vcpu, attr);
+ ret = kvm_loongarch_cpucfg_get_attr(vcpu, attr);
+ break;
+ case KVM_LOONGARCH_VCPU_PVTIME_CTRL:
+ ret = kvm_loongarch_pvtime_get_attr(vcpu, attr);
break;
default:
break;
@@ -716,6 +824,43 @@ static int kvm_loongarch_cpucfg_set_attr(struct kvm_vcpu *vcpu,
return -ENXIO;
}
+static int kvm_loongarch_pvtime_set_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ int idx, ret = 0;
+ u64 gpa, __user *user = (u64 __user *)attr->addr;
+ struct kvm *kvm = vcpu->kvm;
+
+ if (!kvm_pvtime_supported() ||
+ attr->attr != KVM_LOONGARCH_VCPU_PVTIME_GPA)
+ return -ENXIO;
+
+ if (get_user(gpa, user))
+ return -EFAULT;
+
+ if (gpa & ~(KVM_STEAL_PHYS_MASK | KVM_STEAL_PHYS_VALID))
+ return -EINVAL;
+
+ if (!(gpa & KVM_STEAL_PHYS_VALID)) {
+ vcpu->arch.st.guest_addr = gpa;
+ return 0;
+ }
+
+ /* Check the address is in a valid memslot */
+ idx = srcu_read_lock(&kvm->srcu);
+ if (kvm_is_error_hva(gfn_to_hva(kvm, gpa >> PAGE_SHIFT)))
+ ret = -EINVAL;
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ if (!ret) {
+ vcpu->arch.st.guest_addr = gpa;
+ vcpu->arch.st.last_steal = current->sched_info.run_delay;
+ kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
+ }
+
+ return ret;
+}
+
static int kvm_loongarch_vcpu_set_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr)
{
@@ -725,6 +870,9 @@ static int kvm_loongarch_vcpu_set_attr(struct kvm_vcpu *vcpu,
case KVM_LOONGARCH_VCPU_CPUCFG:
ret = kvm_loongarch_cpucfg_set_attr(vcpu, attr);
break;
+ case KVM_LOONGARCH_VCPU_PVTIME_CTRL:
+ ret = kvm_loongarch_pvtime_set_attr(vcpu, attr);
+ break;
default:
break;
}
@@ -994,6 +1142,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
struct loongarch_csrs *csr;
vcpu->arch.vpid = 0;
+ vcpu->arch.flush_gpa = INVALID_GPA;
hrtimer_init(&vcpu->arch.swtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
vcpu->arch.swtimer.function = kvm_swtimer_wakeup;
@@ -1084,6 +1233,7 @@ static int _kvm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
/* Control guest page CCA attribute */
change_csr_gcfg(CSR_GCFG_MATC_MASK, CSR_GCFG_MATC_ROOT);
+ kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
/* Don't bother restoring registers multiple times unless necessary */
if (vcpu->arch.aux_inuse & KVM_LARCH_HWCSR_USABLE)
@@ -1266,7 +1416,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
kvm_complete_iocsr_read(vcpu, run);
}
- if (run->immediate_exit)
+ if (!vcpu->wants_to_run)
return r;
/* Clear exit_reason */
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 179f320cc231..6743a57c1ab4 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -890,7 +890,6 @@ static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_free_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot) {}
static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}
-static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 56fedfbe9455..b5de770b092e 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -434,7 +434,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
vcpu->mmio_needed = 0;
}
- if (vcpu->run->immediate_exit)
+ if (!vcpu->wants_to_run)
goto out;
lose_fpu(1);
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 6a0c771d3ce8..37e581c5b201 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -900,7 +900,6 @@ struct kvm_vcpu_arch {
static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}
static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
-static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index d32abe7fe6ab..961aadc71de2 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -1852,7 +1852,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
kvm_sigset_activate(vcpu);
- if (run->immediate_exit)
+ if (!vcpu->wants_to_run)
r = -EINTR;
else
r = kvmppc_vcpu_run(vcpu);
diff --git a/arch/riscv/include/asm/kvm_aia_aplic.h b/arch/riscv/include/asm/kvm_aia_aplic.h
deleted file mode 100644
index 6dd1a4809ec1..000000000000
--- a/arch/riscv/include/asm/kvm_aia_aplic.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2021 Western Digital Corporation or its affiliates.
- * Copyright (C) 2022 Ventana Micro Systems Inc.
- */
-#ifndef __KVM_RISCV_AIA_IMSIC_H
-#define __KVM_RISCV_AIA_IMSIC_H
-
-#include <linux/bitops.h>
-
-#define APLIC_MAX_IDC BIT(14)
-#define APLIC_MAX_SOURCE 1024
-
-#define APLIC_DOMAINCFG 0x0000
-#define APLIC_DOMAINCFG_RDONLY 0x80000000
-#define APLIC_DOMAINCFG_IE BIT(8)
-#define APLIC_DOMAINCFG_DM BIT(2)
-#define APLIC_DOMAINCFG_BE BIT(0)
-
-#define APLIC_SOURCECFG_BASE 0x0004
-#define APLIC_SOURCECFG_D BIT(10)
-#define APLIC_SOURCECFG_CHILDIDX_MASK 0x000003ff
-#define APLIC_SOURCECFG_SM_MASK 0x00000007
-#define APLIC_SOURCECFG_SM_INACTIVE 0x0
-#define APLIC_SOURCECFG_SM_DETACH 0x1
-#define APLIC_SOURCECFG_SM_EDGE_RISE 0x4
-#define APLIC_SOURCECFG_SM_EDGE_FALL 0x5
-#define APLIC_SOURCECFG_SM_LEVEL_HIGH 0x6
-#define APLIC_SOURCECFG_SM_LEVEL_LOW 0x7
-
-#define APLIC_IRQBITS_PER_REG 32
-
-#define APLIC_SETIP_BASE 0x1c00
-#define APLIC_SETIPNUM 0x1cdc
-
-#define APLIC_CLRIP_BASE 0x1d00
-#define APLIC_CLRIPNUM 0x1ddc
-
-#define APLIC_SETIE_BASE 0x1e00
-#define APLIC_SETIENUM 0x1edc
-
-#define APLIC_CLRIE_BASE 0x1f00
-#define APLIC_CLRIENUM 0x1fdc
-
-#define APLIC_SETIPNUM_LE 0x2000
-#define APLIC_SETIPNUM_BE 0x2004
-
-#define APLIC_GENMSI 0x3000
-
-#define APLIC_TARGET_BASE 0x3004
-#define APLIC_TARGET_HART_IDX_SHIFT 18
-#define APLIC_TARGET_HART_IDX_MASK 0x3fff
-#define APLIC_TARGET_GUEST_IDX_SHIFT 12
-#define APLIC_TARGET_GUEST_IDX_MASK 0x3f
-#define APLIC_TARGET_IPRIO_MASK 0xff
-#define APLIC_TARGET_EIID_MASK 0x7ff
-
-#endif
diff --git a/arch/riscv/include/asm/kvm_aia_imsic.h b/arch/riscv/include/asm/kvm_aia_imsic.h
deleted file mode 100644
index da5881d2bde0..000000000000
--- a/arch/riscv/include/asm/kvm_aia_imsic.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2021 Western Digital Corporation or its affiliates.
- * Copyright (C) 2022 Ventana Micro Systems Inc.
- */
-#ifndef __KVM_RISCV_AIA_IMSIC_H
-#define __KVM_RISCV_AIA_IMSIC_H
-
-#include <linux/types.h>
-#include <asm/csr.h>
-
-#define IMSIC_MMIO_PAGE_SHIFT 12
-#define IMSIC_MMIO_PAGE_SZ (1UL << IMSIC_MMIO_PAGE_SHIFT)
-#define IMSIC_MMIO_PAGE_LE 0x00
-#define IMSIC_MMIO_PAGE_BE 0x04
-
-#define IMSIC_MIN_ID 63
-#define IMSIC_MAX_ID 2048
-
-#define IMSIC_EIDELIVERY 0x70
-
-#define IMSIC_EITHRESHOLD 0x72
-
-#define IMSIC_EIP0 0x80
-#define IMSIC_EIP63 0xbf
-#define IMSIC_EIPx_BITS 32
-
-#define IMSIC_EIE0 0xc0
-#define IMSIC_EIE63 0xff
-#define IMSIC_EIEx_BITS 32
-
-#define IMSIC_FIRST IMSIC_EIDELIVERY
-#define IMSIC_LAST IMSIC_EIE63
-
-#define IMSIC_MMIO_SETIPNUM_LE 0x00
-#define IMSIC_MMIO_SETIPNUM_BE 0x04
-
-#endif
diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
index e65d1584d48e..2e2254fd2a2a 100644
--- a/arch/riscv/include/asm/kvm_host.h
+++ b/arch/riscv/include/asm/kvm_host.h
@@ -287,7 +287,6 @@ struct kvm_vcpu_arch {
};
static inline void kvm_arch_sync_events(struct kvm *kvm) {}
-static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
#define KVM_RISCV_GSTAGE_TLB_MIN_ORDER 12
diff --git a/arch/riscv/kvm/aia.c b/arch/riscv/kvm/aia.c
index 0f0a9d11bb5f..2967d305c442 100644
--- a/arch/riscv/kvm/aia.c
+++ b/arch/riscv/kvm/aia.c
@@ -10,12 +10,12 @@
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/irq.h>
+#include <linux/irqchip/riscv-imsic.h>
#include <linux/irqdomain.h>
#include <linux/kvm_host.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>
#include <asm/cpufeature.h>
-#include <asm/kvm_aia_imsic.h>
struct aia_hgei_control {
raw_spinlock_t lock;
@@ -394,6 +394,8 @@ int kvm_riscv_aia_alloc_hgei(int cpu, struct kvm_vcpu *owner,
{
int ret = -ENOENT;
unsigned long flags;
+ const struct imsic_global_config *gc;
+ const struct imsic_local_config *lc;
struct aia_hgei_control *hgctrl = per_cpu_ptr(&aia_hgei, cpu);
if (!kvm_riscv_aia_available() || !hgctrl)
@@ -409,11 +411,14 @@ int kvm_riscv_aia_alloc_hgei(int cpu, struct kvm_vcpu *owner,
raw_spin_unlock_irqrestore(&hgctrl->lock, flags);
- /* TODO: To be updated later by AIA IMSIC HW guest file support */
- if (hgei_va)
- *hgei_va = NULL;
- if (hgei_pa)
- *hgei_pa = 0;
+ gc = imsic_get_global_config();
+ lc = (gc) ? per_cpu_ptr(gc->local, cpu) : NULL;
+ if (lc && ret > 0) {
+ if (hgei_va)
+ *hgei_va = lc->msi_va + (ret * IMSIC_MMIO_PAGE_SZ);
+ if (hgei_pa)
+ *hgei_pa = lc->msi_pa + (ret * IMSIC_MMIO_PAGE_SZ);
+ }
return ret;
}
@@ -605,9 +610,11 @@ void kvm_riscv_aia_disable(void)
int kvm_riscv_aia_init(void)
{
int rc;
+ const struct imsic_global_config *gc;
if (!riscv_isa_extension_available(NULL, SxAIA))
return -ENODEV;
+ gc = imsic_get_global_config();
/* Figure-out number of bits in HGEIE */
csr_write(CSR_HGEIE, -1UL);
@@ -619,17 +626,17 @@ int kvm_riscv_aia_init(void)
/*
* Number of usable HGEI lines should be minimum of per-HART
* IMSIC guest files and number of bits in HGEIE
- *
- * TODO: To be updated later by AIA IMSIC HW guest file support
*/
- kvm_riscv_aia_nr_hgei = 0;
+ if (gc)
+ kvm_riscv_aia_nr_hgei = min((ulong)kvm_riscv_aia_nr_hgei,
+ BIT(gc->guest_index_bits) - 1);
+ else
+ kvm_riscv_aia_nr_hgei = 0;
- /*
- * Find number of guest MSI IDs
- *
- * TODO: To be updated later by AIA IMSIC HW guest file support
- */
+ /* Find number of guest MSI IDs */
kvm_riscv_aia_max_ids = IMSIC_MAX_ID;
+ if (gc && kvm_riscv_aia_nr_hgei)
+ kvm_riscv_aia_max_ids = gc->nr_guest_ids + 1;
/* Initialize guest external interrupt line management */
rc = aia_hgei_init();
diff --git a/arch/riscv/kvm/aia_aplic.c b/arch/riscv/kvm/aia_aplic.c
index b467ba5ed910..da6ff1bade0d 100644
--- a/arch/riscv/kvm/aia_aplic.c
+++ b/arch/riscv/kvm/aia_aplic.c
@@ -7,12 +7,12 @@
* Anup Patel <apatel@ventanamicro.com>
*/
+#include <linux/irqchip/riscv-aplic.h>
#include <linux/kvm_host.h>
#include <linux/math.h>
#include <linux/spinlock.h>
#include <linux/swab.h>
#include <kvm/iodev.h>
-#include <asm/kvm_aia_aplic.h>
struct aplic_irq {
raw_spinlock_t lock;
diff --git a/arch/riscv/kvm/aia_device.c b/arch/riscv/kvm/aia_device.c
index 5cd407c6a8e4..39cd26af5a69 100644
--- a/arch/riscv/kvm/aia_device.c
+++ b/arch/riscv/kvm/aia_device.c
@@ -8,9 +8,9 @@
*/
#include <linux/bits.h>
+#include <linux/irqchip/riscv-imsic.h>
#include <linux/kvm_host.h>
#include <linux/uaccess.h>
-#include <asm/kvm_aia_imsic.h>
static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx)
{
diff --git a/arch/riscv/kvm/aia_imsic.c b/arch/riscv/kvm/aia_imsic.c
index e808723a85f1..0a1e859323b4 100644
--- a/arch/riscv/kvm/aia_imsic.c
+++ b/arch/riscv/kvm/aia_imsic.c
@@ -9,13 +9,13 @@
#include <linux/atomic.h>
#include <linux/bitmap.h>
+#include <linux/irqchip/riscv-imsic.h>
#include <linux/kvm_host.h>
#include <linux/math.h>
#include <linux/spinlock.h>
#include <linux/swab.h>
#include <kvm/iodev.h>
#include <asm/csr.h>
-#include <asm/kvm_aia_imsic.h>
#define IMSIC_MAX_EIX (IMSIC_MAX_ID / BITS_PER_TYPE(u64))
diff --git a/arch/riscv/kvm/trace.h b/arch/riscv/kvm/trace.h
new file mode 100644
index 000000000000..3d54175d805c
--- /dev/null
+++ b/arch/riscv/kvm/trace.h
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Tracepoints for RISC-V KVM
+ *
+ * Copyright 2024 Beijing ESWIN Computing Technology Co., Ltd.
+ *
+ */
+#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVM_H
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+
+TRACE_EVENT(kvm_entry,
+ TP_PROTO(struct kvm_vcpu *vcpu),
+ TP_ARGS(vcpu),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, pc)
+ ),
+
+ TP_fast_assign(
+ __entry->pc = vcpu->arch.guest_context.sepc;
+ ),
+
+ TP_printk("PC: 0x016%lx", __entry->pc)
+);
+
+TRACE_EVENT(kvm_exit,
+ TP_PROTO(struct kvm_cpu_trap *trap),
+ TP_ARGS(trap),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, sepc)
+ __field(unsigned long, scause)
+ __field(unsigned long, stval)
+ __field(unsigned long, htval)
+ __field(unsigned long, htinst)
+ ),
+
+ TP_fast_assign(
+ __entry->sepc = trap->sepc;
+ __entry->scause = trap->scause;
+ __entry->stval = trap->stval;
+ __entry->htval = trap->htval;
+ __entry->htinst = trap->htinst;
+ ),
+
+ TP_printk("SEPC:0x%lx, SCAUSE:0x%lx, STVAL:0x%lx, HTVAL:0x%lx, HTINST:0x%lx",
+ __entry->sepc,
+ __entry->scause,
+ __entry->stval,
+ __entry->htval,
+ __entry->htinst)
+);
+
+#endif /* _TRACE_RSICV_KVM_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c
index c58a0a7f5e5f..8d7d381737ee 100644
--- a/arch/riscv/kvm/vcpu.c
+++ b/arch/riscv/kvm/vcpu.c
@@ -21,6 +21,9 @@
#include <asm/cacheflush.h>
#include <asm/kvm_vcpu_vector.h>
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
KVM_GENERIC_VCPU_STATS(),
STATS_DESC_COUNTER(VCPU, ecall_exit_stat),
@@ -761,7 +764,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
return ret;
}
- if (run->immediate_exit) {
+ if (!vcpu->wants_to_run) {
kvm_vcpu_srcu_read_unlock(vcpu);
return -EINTR;
}
@@ -832,6 +835,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
*/
kvm_riscv_local_tlb_sanitize(vcpu);
+ trace_kvm_entry(vcpu);
+
guest_timing_enter_irqoff();
kvm_riscv_vcpu_enter_exit(vcpu);
@@ -870,6 +875,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
local_irq_enable();
+ trace_kvm_exit(&trap);
+
preempt_enable();
kvm_vcpu_srcu_read_lock(vcpu);
diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c
index 5761f95abb60..fa98e5c024b2 100644
--- a/arch/riscv/kvm/vcpu_exit.c
+++ b/arch/riscv/kvm/vcpu_exit.c
@@ -185,6 +185,8 @@ int kvm_riscv_vcpu_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
case EXC_INST_ILLEGAL:
case EXC_LOAD_MISALIGNED:
case EXC_STORE_MISALIGNED:
+ case EXC_LOAD_ACCESS:
+ case EXC_STORE_ACCESS:
if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV) {
kvm_riscv_vcpu_trap_redirect(vcpu, trap);
ret = 1;
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index a0479c4892f8..8e77afbed58e 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -15,7 +15,6 @@
#include <linux/hrtimer.h>
#include <linux/interrupt.h>
#include <linux/kvm_types.h>
-#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/seqlock.h>
#include <linux/module.h>
@@ -1047,7 +1046,6 @@ extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc);
extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc);
static inline void kvm_arch_sync_events(struct kvm *kvm) {}
-static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
static inline void kvm_arch_free_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot) {}
static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 06a14e717178..0fd96860fc45 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2997,14 +2997,9 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
break;
}
case KVM_CREATE_IRQCHIP: {
- struct kvm_irq_routing_entry routing;
-
r = -EINVAL;
- if (kvm->arch.use_irqchip) {
- /* Set up dummy routing. */
- memset(&routing, 0, sizeof(routing));
- r = kvm_set_irq_routing(kvm, &routing, 0, 0);
- }
+ if (kvm->arch.use_irqchip)
+ r = 0;
break;
}
case KVM_SET_DEVICE_ATTR: {
@@ -5033,7 +5028,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
if (vcpu->kvm->arch.pv.dumping)
return -EINVAL;
- if (kvm_run->immediate_exit)
+ if (!vcpu->wants_to_run)
return -EINTR;
if (kvm_run->kvm_valid_regs & ~KVM_SYNC_S390_VALID_FIELDS ||
@@ -5750,6 +5745,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
{
gpa_t size;
+ if (kvm_is_ucontrol(kvm))
+ return -EINVAL;
+
/* When we are protected, we should not change the memory slots */
if (kvm_s390_pv_get_handle(kvm))
return -EINVAL;
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 97a70c2b83ee..89cafea4c41f 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -1304,10 +1304,24 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
if (rc == -EAGAIN)
rc = 0;
- if (rc || scb_s->icptcode || signal_pending(current) ||
+
+ /*
+ * Exit the loop if the guest needs to process the intercept
+ */
+ if (rc || scb_s->icptcode)
+ break;
+
+ /*
+ * Exit the loop if the host needs to process an intercept,
+ * but rewind the PSW to re-enter SIE once that's completed
+ * instead of passing a "no action" intercept to the guest.
+ */
+ if (signal_pending(current) ||
kvm_s390_vcpu_has_irq(vcpu, 0) ||
- kvm_s390_vcpu_sie_inhibited(vcpu))
+ kvm_s390_vcpu_sie_inhibited(vcpu)) {
+ kvm_s390_rewind_psw(vcpu, 4);
break;
+ }
cond_resched();
}
@@ -1426,8 +1440,10 @@ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu)
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0) ||
- kvm_s390_vcpu_sie_inhibited(vcpu))
+ kvm_s390_vcpu_sie_inhibited(vcpu)) {
+ kvm_s390_rewind_psw(vcpu, 4);
return 0;
+ }
vsie_page = get_vsie_page(vcpu->kvm, scb_addr);
if (IS_ERR(vsie_page))
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 5187fcf4b610..68ad4f923664 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -9,8 +9,7 @@ BUILD_BUG_ON(1)
* "static_call_update()" calls.
*
* KVM_X86_OP_OPTIONAL() can be used for those functions that can have
- * a NULL definition, for example if "static_call_cond()" will be used
- * at the call sites. KVM_X86_OP_OPTIONAL_RET0() can be used likewise
+ * a NULL definition. KVM_X86_OP_OPTIONAL_RET0() can be used likewise
* to make a definition optional, but in this case the default will
* be __static_call_return0.
*/
@@ -85,7 +84,6 @@ KVM_X86_OP_OPTIONAL(update_cr8_intercept)
KVM_X86_OP(refresh_apicv_exec_ctrl)
KVM_X86_OP_OPTIONAL(hwapic_irr_update)
KVM_X86_OP_OPTIONAL(hwapic_isr_update)
-KVM_X86_OP_OPTIONAL_RET0(guest_apic_has_interrupt)
KVM_X86_OP_OPTIONAL(load_eoi_exitmap)
KVM_X86_OP_OPTIONAL(set_virtual_apic_mode)
KVM_X86_OP_OPTIONAL(set_apic_access_page_addr)
@@ -103,7 +101,6 @@ KVM_X86_OP(write_tsc_multiplier)
KVM_X86_OP(get_exit_info)
KVM_X86_OP(check_intercept)
KVM_X86_OP(handle_exit_irqoff)
-KVM_X86_OP(sched_in)
KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging)
KVM_X86_OP_OPTIONAL(vcpu_blocking)
KVM_X86_OP_OPTIONAL(vcpu_unblocking)
@@ -139,6 +136,9 @@ KVM_X86_OP(vcpu_deliver_sipi_vector)
KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
KVM_X86_OP_OPTIONAL(get_untagged_addr)
KVM_X86_OP_OPTIONAL(alloc_apic_backing_page)
+KVM_X86_OP_OPTIONAL_RET0(gmem_prepare)
+KVM_X86_OP_OPTIONAL_RET0(private_max_mapping_level)
+KVM_X86_OP_OPTIONAL(gmem_invalidate)
#undef KVM_X86_OP
#undef KVM_X86_OP_OPTIONAL
diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h
index f852b13aeefe..9159bf1a4730 100644
--- a/arch/x86/include/asm/kvm-x86-pmu-ops.h
+++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h
@@ -9,8 +9,7 @@ BUILD_BUG_ON(1)
* "static_call_update()" calls.
*
* KVM_X86_PMU_OP_OPTIONAL() can be used for those functions that can have
- * a NULL definition, for example if "static_call_cond()" will be used
- * at the call sites.
+ * a NULL definition.
*/
KVM_X86_PMU_OP(rdpmc_ecx_to_pmc)
KVM_X86_PMU_OP(msr_idx_to_pmc)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f8ca74e7678f..950a03e0181e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -121,6 +121,7 @@
KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_HV_TLB_FLUSH \
KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE KVM_ARCH_REQ(34)
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -159,7 +160,6 @@
#define KVM_MIN_FREE_MMU_PAGES 5
#define KVM_REFILL_PAGES 25
#define KVM_MAX_CPUID_ENTRIES 256
-#define KVM_NR_FIXED_MTRR_REGION 88
#define KVM_NR_VAR_MTRR 8
#define ASYNC_PF_PER_VCPU 64
@@ -533,12 +533,16 @@ struct kvm_pmc {
};
/* More counters may conflict with other existing Architectural MSRs */
-#define KVM_INTEL_PMC_MAX_GENERIC 8
-#define MSR_ARCH_PERFMON_PERFCTR_MAX (MSR_ARCH_PERFMON_PERFCTR0 + KVM_INTEL_PMC_MAX_GENERIC - 1)
-#define MSR_ARCH_PERFMON_EVENTSEL_MAX (MSR_ARCH_PERFMON_EVENTSEL0 + KVM_INTEL_PMC_MAX_GENERIC - 1)
-#define KVM_PMC_MAX_FIXED 3
-#define MSR_ARCH_PERFMON_FIXED_CTR_MAX (MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_PMC_MAX_FIXED - 1)
-#define KVM_AMD_PMC_MAX_GENERIC 6
+#define KVM_MAX(a, b) ((a) >= (b) ? (a) : (b))
+#define KVM_MAX_NR_INTEL_GP_COUNTERS 8
+#define KVM_MAX_NR_AMD_GP_COUNTERS 6
+#define KVM_MAX_NR_GP_COUNTERS KVM_MAX(KVM_MAX_NR_INTEL_GP_COUNTERS, \
+ KVM_MAX_NR_AMD_GP_COUNTERS)
+
+#define KVM_MAX_NR_INTEL_FIXED_COUTNERS 3
+#define KVM_MAX_NR_AMD_FIXED_COUTNERS 0
+#define KVM_MAX_NR_FIXED_COUNTERS KVM_MAX(KVM_MAX_NR_INTEL_FIXED_COUTNERS, \
+ KVM_MAX_NR_AMD_FIXED_COUTNERS)
struct kvm_pmu {
u8 version;
@@ -546,16 +550,16 @@ struct kvm_pmu {
unsigned nr_arch_fixed_counters;
unsigned available_event_types;
u64 fixed_ctr_ctrl;
- u64 fixed_ctr_ctrl_mask;
+ u64 fixed_ctr_ctrl_rsvd;
u64 global_ctrl;
u64 global_status;
u64 counter_bitmask[2];
- u64 global_ctrl_mask;
- u64 global_status_mask;
+ u64 global_ctrl_rsvd;
+ u64 global_status_rsvd;
u64 reserved_bits;
u64 raw_event_mask;
- struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC];
- struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED];
+ struct kvm_pmc gp_counters[KVM_MAX_NR_GP_COUNTERS];
+ struct kvm_pmc fixed_counters[KVM_MAX_NR_FIXED_COUNTERS];
/*
* Overlay the bitmap with a 64-bit atomic so that all bits can be
@@ -571,9 +575,9 @@ struct kvm_pmu {
u64 ds_area;
u64 pebs_enable;
- u64 pebs_enable_mask;
+ u64 pebs_enable_rsvd;
u64 pebs_data_cfg;
- u64 pebs_data_cfg_mask;
+ u64 pebs_data_cfg_rsvd;
/*
* If a guest counter is cross-mapped to host counter with different
@@ -604,18 +608,12 @@ enum {
KVM_DEBUGREG_WONT_EXIT = 2,
};
-struct kvm_mtrr_range {
- u64 base;
- u64 mask;
- struct list_head node;
-};
-
struct kvm_mtrr {
- struct kvm_mtrr_range var_ranges[KVM_NR_VAR_MTRR];
- mtrr_type fixed_ranges[KVM_NR_FIXED_MTRR_REGION];
+ u64 var[KVM_NR_VAR_MTRR * 2];
+ u64 fixed_64k;
+ u64 fixed_16k[2];
+ u64 fixed_4k[8];
u64 deftype;
-
- struct list_head head;
};
/* Hyper-V SynIC timer */
@@ -1207,7 +1205,7 @@ enum kvm_apicv_inhibit {
* APIC acceleration is disabled by a module parameter
* and/or not supported in hardware.
*/
- APICV_INHIBIT_REASON_DISABLE,
+ APICV_INHIBIT_REASON_DISABLED,
/*
* APIC acceleration is inhibited because AutoEOI feature is
@@ -1277,8 +1275,27 @@ enum kvm_apicv_inhibit {
* mapping between logical ID and vCPU.
*/
APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED,
+
+ NR_APICV_INHIBIT_REASONS,
};
+#define __APICV_INHIBIT_REASON(reason) \
+ { BIT(APICV_INHIBIT_REASON_##reason), #reason }
+
+#define APICV_INHIBIT_REASONS \
+ __APICV_INHIBIT_REASON(DISABLED), \
+ __APICV_INHIBIT_REASON(HYPERV), \
+ __APICV_INHIBIT_REASON(ABSENT), \
+ __APICV_INHIBIT_REASON(BLOCKIRQ), \
+ __APICV_INHIBIT_REASON(PHYSICAL_ID_ALIASED), \
+ __APICV_INHIBIT_REASON(APIC_ID_MODIFIED), \
+ __APICV_INHIBIT_REASON(APIC_BASE_MODIFIED), \
+ __APICV_INHIBIT_REASON(NESTED), \
+ __APICV_INHIBIT_REASON(IRQWIN), \
+ __APICV_INHIBIT_REASON(PIT_REINJ), \
+ __APICV_INHIBIT_REASON(SEV), \
+ __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED)
+
struct kvm_arch {
unsigned long n_used_mmu_pages;
unsigned long n_requested_mmu_pages;
@@ -1364,6 +1381,7 @@ struct kvm_arch {
u32 default_tsc_khz;
bool user_set_tsc;
+ u64 apic_bus_cycle_ns;
seqcount_raw_spinlock_t pvclock_sc;
bool use_master_clock;
@@ -1708,13 +1726,11 @@ struct kvm_x86_ops {
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
- bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason);
const unsigned long required_apicv_inhibits;
bool allow_apicv_in_x2apic_without_x2apic_virtualization;
void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
void (*hwapic_isr_update)(int isr);
- bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu);
void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu);
@@ -1749,8 +1765,6 @@ struct kvm_x86_ops {
struct x86_exception *exception);
void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
- void (*sched_in)(struct kvm_vcpu *vcpu, int cpu);
-
/*
* Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero
* value indicates CPU dirty logging is unsupported or disabled.
@@ -1812,6 +1826,9 @@ struct kvm_x86_ops {
gva_t (*get_untagged_addr)(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags);
void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu);
+ int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
+ void (*gmem_invalidate)(kvm_pfn_t start, kvm_pfn_t end);
+ int (*private_max_mapping_level)(struct kvm *kvm, kvm_pfn_t pfn);
};
struct kvm_x86_nested_ops {
@@ -1819,7 +1836,7 @@ struct kvm_x86_nested_ops {
bool (*is_exception_vmexit)(struct kvm_vcpu *vcpu, u8 vector,
u32 error_code);
int (*check_events)(struct kvm_vcpu *vcpu);
- bool (*has_events)(struct kvm_vcpu *vcpu);
+ bool (*has_events)(struct kvm_vcpu *vcpu, bool for_injection);
void (*triple_fault)(struct kvm_vcpu *vcpu);
int (*get_state)(struct kvm_vcpu *vcpu,
struct kvm_nested_state __user *user_kvm_nested_state,
@@ -1853,11 +1870,13 @@ struct kvm_arch_async_pf {
};
extern u32 __read_mostly kvm_nr_uret_msrs;
-extern u64 __read_mostly host_efer;
extern bool __read_mostly allow_smaller_maxphyaddr;
extern bool __read_mostly enable_apicv;
extern struct kvm_x86_ops kvm_x86_ops;
+#define kvm_x86_call(func) static_call(kvm_x86_##func)
+#define kvm_pmu_call(func) static_call(kvm_x86_pmu_##func)
+
#define KVM_X86_OP(func) \
DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func));
#define KVM_X86_OP_OPTIONAL KVM_X86_OP
@@ -1881,7 +1900,7 @@ void kvm_arch_free_vm(struct kvm *kvm);
static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
{
if (kvm_x86_ops.flush_remote_tlbs &&
- !static_call(kvm_x86_flush_remote_tlbs)(kvm))
+ !kvm_x86_call(flush_remote_tlbs)(kvm))
return 0;
else
return -ENOTSUPP;
@@ -1894,7 +1913,7 @@ static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn,
if (!kvm_x86_ops.flush_remote_tlbs_range)
return -EOPNOTSUPP;
- return static_call(kvm_x86_flush_remote_tlbs_range)(kvm, gfn, nr_pages);
+ return kvm_x86_call(flush_remote_tlbs_range)(kvm, gfn, nr_pages);
}
#endif /* CONFIG_HYPERV */
@@ -1939,6 +1958,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages);
+void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -2292,12 +2312,12 @@ static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq)
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
{
- static_call_cond(kvm_x86_vcpu_blocking)(vcpu);
+ kvm_x86_call(vcpu_blocking)(vcpu);
}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
- static_call_cond(kvm_x86_vcpu_unblocking)(vcpu);
+ kvm_x86_call(vcpu_unblocking)(vcpu);
}
static inline int kvm_cpu_get_apicid(int mps_cpu)
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
index e90d403f2068..98726c2b04f8 100644
--- a/arch/x86/include/asm/sev-common.h
+++ b/arch/x86/include/asm/sev-common.h
@@ -59,6 +59,14 @@
#define GHCB_MSR_AP_RESET_HOLD_RESULT_POS 12
#define GHCB_MSR_AP_RESET_HOLD_RESULT_MASK GENMASK_ULL(51, 0)
+/* Preferred GHCB GPA Request */
+#define GHCB_MSR_PREF_GPA_REQ 0x010
+#define GHCB_MSR_GPA_VALUE_POS 12
+#define GHCB_MSR_GPA_VALUE_MASK GENMASK_ULL(51, 0)
+
+#define GHCB_MSR_PREF_GPA_RESP 0x011
+#define GHCB_MSR_PREF_GPA_NONE 0xfffffffffffff
+
/* GHCB GPA Register */
#define GHCB_MSR_REG_GPA_REQ 0x012
#define GHCB_MSR_REG_GPA_REQ_VAL(v) \
@@ -93,11 +101,17 @@ enum psc_op {
/* GHCBData[11:0] */ \
GHCB_MSR_PSC_REQ)
+#define GHCB_MSR_PSC_REQ_TO_GFN(msr) (((msr) & GENMASK_ULL(51, 12)) >> 12)
+#define GHCB_MSR_PSC_REQ_TO_OP(msr) (((msr) & GENMASK_ULL(55, 52)) >> 52)
+
#define GHCB_MSR_PSC_RESP 0x015
#define GHCB_MSR_PSC_RESP_VAL(val) \
/* GHCBData[63:32] */ \
(((u64)(val) & GENMASK_ULL(63, 32)) >> 32)
+/* Set highest bit as a generic error response */
+#define GHCB_MSR_PSC_RESP_ERROR (BIT_ULL(63) | GHCB_MSR_PSC_RESP)
+
/* GHCB Run at VMPL Request/Response */
#define GHCB_MSR_VMPL_REQ 0x016
#define GHCB_MSR_VMPL_REQ_LEVEL(v) \
@@ -129,8 +143,19 @@ enum psc_op {
* The VMGEXIT_PSC_MAX_ENTRY determines the size of the PSC structure, which
* is a local stack variable in set_pages_state(). Do not increase this value
* without evaluating the impact to stack usage.
+ *
+ * Use VMGEXIT_PSC_MAX_COUNT in cases where the actual GHCB-defined max value
+ * is needed, such as when processing GHCB requests on the hypervisor side.
*/
#define VMGEXIT_PSC_MAX_ENTRY 64
+#define VMGEXIT_PSC_MAX_COUNT 253
+
+#define VMGEXIT_PSC_ERROR_GENERIC (0x100UL << 32)
+#define VMGEXIT_PSC_ERROR_INVALID_HDR ((1UL << 32) | 1)
+#define VMGEXIT_PSC_ERROR_INVALID_ENTRY ((1UL << 32) | 2)
+
+#define VMGEXIT_PSC_OP_PRIVATE 1
+#define VMGEXIT_PSC_OP_SHARED 2
struct psc_hdr {
u16 cur_entry;
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index ac5886ce252e..79bbe2be900e 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -91,6 +91,9 @@ extern bool handle_vc_boot_ghcb(struct pt_regs *regs);
/* RMUPDATE detected 4K page and 2MB page overlap. */
#define RMPUPDATE_FAIL_OVERLAP 4
+/* PSMASH failed due to concurrent access by another CPU */
+#define PSMASH_FAIL_INUSE 3
+
/* RMP page size */
#define RMP_PG_SIZE_4K 0
#define RMP_PG_SIZE_2M 1
@@ -116,6 +119,54 @@ struct snp_req_data {
unsigned int data_npages;
};
+#define MAX_AUTHTAG_LEN 32
+
+/* See SNP spec SNP_GUEST_REQUEST section for the structure */
+enum msg_type {
+ SNP_MSG_TYPE_INVALID = 0,
+ SNP_MSG_CPUID_REQ,
+ SNP_MSG_CPUID_RSP,
+ SNP_MSG_KEY_REQ,
+ SNP_MSG_KEY_RSP,
+ SNP_MSG_REPORT_REQ,
+ SNP_MSG_REPORT_RSP,
+ SNP_MSG_EXPORT_REQ,
+ SNP_MSG_EXPORT_RSP,
+ SNP_MSG_IMPORT_REQ,
+ SNP_MSG_IMPORT_RSP,
+ SNP_MSG_ABSORB_REQ,
+ SNP_MSG_ABSORB_RSP,
+ SNP_MSG_VMRK_REQ,
+ SNP_MSG_VMRK_RSP,
+
+ SNP_MSG_TYPE_MAX
+};
+
+enum aead_algo {
+ SNP_AEAD_INVALID,
+ SNP_AEAD_AES_256_GCM,
+};
+
+struct snp_guest_msg_hdr {
+ u8 authtag[MAX_AUTHTAG_LEN];
+ u64 msg_seqno;
+ u8 rsvd1[8];
+ u8 algo;
+ u8 hdr_version;
+ u16 hdr_sz;
+ u8 msg_type;
+ u8 msg_version;
+ u16 msg_sz;
+ u32 rsvd2;
+ u8 msg_vmpck;
+ u8 rsvd3[35];
+} __packed;
+
+struct snp_guest_msg {
+ struct snp_guest_msg_hdr hdr;
+ u8 payload[4000];
+} __packed;
+
struct sev_guest_platform_data {
u64 secrets_gpa;
};
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 728c98175b9c..f0dea3750ca9 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -285,7 +285,14 @@ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_
#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
-#define SVM_SEV_FEAT_DEBUG_SWAP BIT(5)
+#define SVM_SEV_FEAT_SNP_ACTIVE BIT(0)
+#define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3)
+#define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4)
+#define SVM_SEV_FEAT_DEBUG_SWAP BIT(5)
+
+#define SVM_SEV_FEAT_INT_INJ_MODES \
+ (SVM_SEV_FEAT_RESTRICTED_INJECTION | \
+ SVM_SEV_FEAT_ALTERNATE_INJECTION)
struct vmcb_seg {
u16 selector;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 9fae1b73b529..bf57a824f722 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -106,6 +106,7 @@ struct kvm_ioapic_state {
#define KVM_RUN_X86_SMM (1 << 0)
#define KVM_RUN_X86_BUS_LOCK (1 << 1)
+#define KVM_RUN_X86_GUEST_MODE (1 << 2)
/* for KVM_GET_REGS and KVM_SET_REGS */
struct kvm_regs {
@@ -697,6 +698,11 @@ enum sev_cmd_id {
/* Second time is the charm; improved versions of the above ioctls. */
KVM_SEV_INIT2,
+ /* SNP-specific commands */
+ KVM_SEV_SNP_LAUNCH_START = 100,
+ KVM_SEV_SNP_LAUNCH_UPDATE,
+ KVM_SEV_SNP_LAUNCH_FINISH,
+
KVM_SEV_NR_MAX,
};
@@ -824,6 +830,48 @@ struct kvm_sev_receive_update_data {
__u32 pad2;
};
+struct kvm_sev_snp_launch_start {
+ __u64 policy;
+ __u8 gosvw[16];
+ __u16 flags;
+ __u8 pad0[6];
+ __u64 pad1[4];
+};
+
+/* Kept in sync with firmware values for simplicity. */
+#define KVM_SEV_SNP_PAGE_TYPE_NORMAL 0x1
+#define KVM_SEV_SNP_PAGE_TYPE_ZERO 0x3
+#define KVM_SEV_SNP_PAGE_TYPE_UNMEASURED 0x4
+#define KVM_SEV_SNP_PAGE_TYPE_SECRETS 0x5
+#define KVM_SEV_SNP_PAGE_TYPE_CPUID 0x6
+
+struct kvm_sev_snp_launch_update {
+ __u64 gfn_start;
+ __u64 uaddr;
+ __u64 len;
+ __u8 type;
+ __u8 pad0;
+ __u16 flags;
+ __u32 pad1;
+ __u64 pad2[4];
+};
+
+#define KVM_SEV_SNP_ID_BLOCK_SIZE 96
+#define KVM_SEV_SNP_ID_AUTH_SIZE 4096
+#define KVM_SEV_SNP_FINISH_DATA_SIZE 32
+
+struct kvm_sev_snp_launch_finish {
+ __u64 id_block_uaddr;
+ __u64 id_auth_uaddr;
+ __u8 id_block_en;
+ __u8 auth_key_en;
+ __u8 vcek_disabled;
+ __u8 host_data[KVM_SEV_SNP_FINISH_DATA_SIZE];
+ __u8 pad0[3];
+ __u16 flags;
+ __u64 pad1[4];
+};
+
#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0)
#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1)
@@ -874,5 +922,6 @@ struct kvm_hyperv_eventfd {
#define KVM_X86_SW_PROTECTED_VM 1
#define KVM_X86_SEV_VM 2
#define KVM_X86_SEV_ES_VM 3
+#define KVM_X86_SNP_VM 4
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index fec95a770270..4287a8071a3a 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -44,6 +44,7 @@ config KVM
select KVM_VFIO
select HAVE_KVM_PM_NOTIFIER if PM
select KVM_GENERIC_HARDWARE_ENABLING
+ select KVM_GENERIC_PRE_FAULT_MEMORY
select KVM_WERROR if WERROR
help
Support hosting fully virtualized guest machines using hardware
@@ -139,6 +140,9 @@ config KVM_AMD_SEV
depends on KVM_AMD && X86_64
depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
select ARCH_HAS_CC_PLATFORM
+ select KVM_GENERIC_PRIVATE_MEM
+ select HAVE_KVM_GMEM_PREPARE
+ select HAVE_KVM_GMEM_INVALIDATE
help
Provides support for launching Encrypted VMs (SEV) and Encrypted VMs
with Encrypted State (SEV-ES) on AMD processors.
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index f2f2be5d1141..2617be544480 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -335,6 +335,18 @@ static bool kvm_cpuid_has_hyperv(struct kvm_cpuid_entry2 *entries, int nent)
#endif
}
+static bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *entry;
+
+ entry = kvm_find_cpuid_entry(vcpu, 0);
+ if (!entry)
+ return false;
+
+ return is_guest_vendor_amd(entry->ebx, entry->ecx, entry->edx) ||
+ is_guest_vendor_hygon(entry->ebx, entry->ecx, entry->edx);
+}
+
static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
@@ -388,7 +400,7 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vcpu->arch.cpuid_nent));
/* Invoke the vendor callback only after the above state is updated. */
- static_call(kvm_x86_vcpu_after_set_cpuid)(vcpu);
+ kvm_x86_call(vcpu_after_set_cpuid)(vcpu);
/*
* Except for the MMU, which needs to do its thing any vendor specific
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 23dbb9eb277c..41697cca354e 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -102,24 +102,6 @@ static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu,
*reg &= ~__feature_bit(x86_feature);
}
-static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 0);
- return best &&
- (is_guest_vendor_amd(best->ebx, best->ecx, best->edx) ||
- is_guest_vendor_hygon(best->ebx, best->ecx, best->edx));
-}
-
-static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 0);
- return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx);
-}
-
static inline bool guest_cpuid_is_amd_compatible(struct kvm_vcpu *vcpu)
{
return vcpu->arch.is_amd_compatible;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index c8cc578646d0..e72aed25d721 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2354,50 +2354,6 @@ setup_syscalls_segments(struct desc_struct *cs, struct desc_struct *ss)
ss->avl = 0;
}
-static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
-{
- u32 eax, ebx, ecx, edx;
-
- eax = ecx = 0;
- ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true);
- return is_guest_vendor_intel(ebx, ecx, edx);
-}
-
-static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
-{
- const struct x86_emulate_ops *ops = ctxt->ops;
- u32 eax, ebx, ecx, edx;
-
- /*
- * syscall should always be enabled in longmode - so only become
- * vendor specific (cpuid) if other modes are active...
- */
- if (ctxt->mode == X86EMUL_MODE_PROT64)
- return true;
-
- eax = 0x00000000;
- ecx = 0x00000000;
- ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true);
- /*
- * remark: Intel CPUs only support "syscall" in 64bit longmode. Also a
- * 64bit guest with a 32bit compat-app running will #UD !! While this
- * behaviour can be fixed (by emulating) into AMD response - CPUs of
- * AMD can't behave like Intel.
- */
- if (is_guest_vendor_intel(ebx, ecx, edx))
- return false;
-
- if (is_guest_vendor_amd(ebx, ecx, edx) ||
- is_guest_vendor_hygon(ebx, ecx, edx))
- return true;
-
- /*
- * default: (not Intel, not AMD, not Hygon), apply Intel's
- * stricter rules...
- */
- return false;
-}
-
static int em_syscall(struct x86_emulate_ctxt *ctxt)
{
const struct x86_emulate_ops *ops = ctxt->ops;
@@ -2411,7 +2367,15 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
ctxt->mode == X86EMUL_MODE_VM86)
return emulate_ud(ctxt);
- if (!(em_syscall_is_enabled(ctxt)))
+ /*
+ * Intel compatible CPUs only support SYSCALL in 64-bit mode, whereas
+ * AMD allows SYSCALL in any flavor of protected mode. Note, it's
+ * infeasible to emulate Intel behavior when running on AMD hardware,
+ * as SYSCALL won't fault in the "wrong" mode, i.e. there is no #UD
+ * for KVM to trap-and-emulate, unlike emulating AMD on Intel.
+ */
+ if (ctxt->mode != X86EMUL_MODE_PROT64 &&
+ ctxt->ops->guest_cpuid_is_intel_compatible(ctxt))
return emulate_ud(ctxt);
ops->get_msr(ctxt, MSR_EFER, &efer);
@@ -2471,11 +2435,11 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
return emulate_gp(ctxt, 0);
/*
- * Not recognized on AMD in compat mode (but is recognized in legacy
- * mode).
+ * Intel's architecture allows SYSENTER in compatibility mode, but AMD
+ * does not. Note, AMD does allow SYSENTER in legacy protected mode.
*/
- if ((ctxt->mode != X86EMUL_MODE_PROT64) && (efer & EFER_LMA)
- && !vendor_intel(ctxt))
+ if ((ctxt->mode != X86EMUL_MODE_PROT64) && (efer & EFER_LMA) &&
+ !ctxt->ops->guest_cpuid_is_intel_compatible(ctxt))
return emulate_ud(ctxt);
/* sysenter/sysexit have not been tested in 64bit mode. */
@@ -2647,7 +2611,14 @@ static void string_registers_quirk(struct x86_emulate_ctxt *ctxt)
* manner when ECX is zero due to REP-string optimizations.
*/
#ifdef CONFIG_X86_64
- if (ctxt->ad_bytes != 4 || !vendor_intel(ctxt))
+ u32 eax, ebx, ecx, edx;
+
+ if (ctxt->ad_bytes != 4)
+ return;
+
+ eax = ecx = 0;
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true);
+ if (!is_guest_vendor_intel(ebx, ecx, edx))
return;
*reg_write(ctxt, VCPU_REGS_RCX) = 0;
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 8a47f8541eab..4f0a94346d00 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1417,7 +1417,7 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
}
/* vmcall/vmmcall */
- static_call(kvm_x86_patch_hypercall)(vcpu, instructions + i);
+ kvm_x86_call(patch_hypercall)(vcpu, instructions + i);
i += 3;
/* ret */
@@ -1737,7 +1737,8 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
data = (u64)vcpu->arch.virtual_tsc_khz * 1000;
break;
case HV_X64_MSR_APIC_FREQUENCY:
- data = APIC_BUS_FREQUENCY;
+ data = div64_u64(1000000000ULL,
+ vcpu->kvm->arch.apic_bus_cycle_ns);
break;
default:
kvm_pr_unimpl_rdmsr(vcpu, msr);
@@ -1985,7 +1986,7 @@ int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
*/
gva = entries[i] & PAGE_MASK;
for (j = 0; j < (entries[i] & ~PAGE_MASK) + 1; j++)
- static_call(kvm_x86_flush_tlb_gva)(vcpu, gva + j * PAGE_SIZE);
+ kvm_x86_call(flush_tlb_gva)(vcpu, gva + j * PAGE_SIZE);
++vcpu->stat.tlb_flush;
}
@@ -2526,7 +2527,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
* hypercall generates UD from non zero cpl and real mode
* per HYPER-V spec
*/
- if (static_call(kvm_x86_get_cpl)(vcpu) != 0 || !is_protmode(vcpu)) {
+ if (kvm_x86_call(get_cpl)(vcpu) != 0 || !is_protmode(vcpu)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index ad9ca8a60144..3d7eb11d0e45 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -157,7 +157,7 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
{
__kvm_migrate_apic_timer(vcpu);
__kvm_migrate_pit_timer(vcpu);
- static_call_cond(kvm_x86_migrate_timers)(vcpu);
+ kvm_x86_call(migrate_timers)(vcpu);
}
bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index c2d7cfe82d00..76d46b2f41dd 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -106,7 +106,6 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
int apic_has_pending_timer(struct kvm_vcpu *vcpu);
int kvm_setup_default_irq_routing(struct kvm *kvm);
-int kvm_setup_empty_irq_routing(struct kvm *kvm);
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq,
struct dest_map *dest_map);
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 68f3f6c26046..8136695f7b96 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -395,13 +395,6 @@ int kvm_setup_default_irq_routing(struct kvm *kvm)
ARRAY_SIZE(default_routing), 0);
}
-static const struct kvm_irq_routing_entry empty_routing[] = {};
-
-int kvm_setup_empty_irq_routing(struct kvm *kvm)
-{
- return kvm_set_irq_routing(kvm, empty_routing, 0, 0);
-}
-
void kvm_arch_post_irq_routing_update(struct kvm *kvm)
{
if (!irqchip_split(kvm))
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 75eae9c4998a..b1eb46e26b2e 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -98,7 +98,7 @@ static inline unsigned long kvm_register_read_raw(struct kvm_vcpu *vcpu, int reg
return 0;
if (!kvm_register_is_available(vcpu, reg))
- static_call(kvm_x86_cache_reg)(vcpu, reg);
+ kvm_x86_call(cache_reg)(vcpu, reg);
return vcpu->arch.regs[reg];
}
@@ -138,7 +138,7 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
might_sleep(); /* on svm */
if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
- static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_PDPTR);
+ kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_PDPTR);
return vcpu->arch.walk_mmu->pdptrs[index];
}
@@ -153,7 +153,7 @@ static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
if ((tmask & vcpu->arch.cr0_guest_owned_bits) &&
!kvm_register_is_available(vcpu, VCPU_EXREG_CR0))
- static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR0);
+ kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_CR0);
return vcpu->arch.cr0 & mask;
}
@@ -175,7 +175,7 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS;
if ((tmask & vcpu->arch.cr4_guest_owned_bits) &&
!kvm_register_is_available(vcpu, VCPU_EXREG_CR4))
- static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR4);
+ kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_CR4);
return vcpu->arch.cr4 & mask;
}
@@ -190,7 +190,7 @@ static __always_inline bool kvm_is_cr4_bit_set(struct kvm_vcpu *vcpu,
static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
{
if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
- static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR3);
+ kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_CR3);
return vcpu->arch.cr3;
}
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 29ea4313e1bb..55a18e2f2dcd 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -223,6 +223,7 @@ struct x86_emulate_ops {
bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt);
bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt);
bool (*guest_has_rdpid)(struct x86_emulate_ctxt *ctxt);
+ bool (*guest_cpuid_is_intel_compatible)(struct x86_emulate_ctxt *ctxt);
void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index acd7d48100a1..a7172ba59ad2 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -738,8 +738,8 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
if (unlikely(apic->apicv_active)) {
/* need to update RVI */
kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
- static_call_cond(kvm_x86_hwapic_irr_update)(apic->vcpu,
- apic_find_highest_irr(apic));
+ kvm_x86_call(hwapic_irr_update)(apic->vcpu,
+ apic_find_highest_irr(apic));
} else {
apic->irr_pending = false;
kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
@@ -765,7 +765,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
* just set SVI.
*/
if (unlikely(apic->apicv_active))
- static_call_cond(kvm_x86_hwapic_isr_update)(vec);
+ kvm_x86_call(hwapic_isr_update)(vec);
else {
++apic->isr_count;
BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
@@ -810,7 +810,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
* and must be left alone.
*/
if (unlikely(apic->apicv_active))
- static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic));
+ kvm_x86_call(hwapic_isr_update)(apic_find_highest_isr(apic));
else {
--apic->isr_count;
BUG_ON(apic->isr_count < 0);
@@ -946,7 +946,7 @@ static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
{
int highest_irr;
if (kvm_x86_ops.sync_pir_to_irr)
- highest_irr = static_call(kvm_x86_sync_pir_to_irr)(apic->vcpu);
+ highest_irr = kvm_x86_call(sync_pir_to_irr)(apic->vcpu);
else
highest_irr = apic_find_highest_irr(apic);
if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr)
@@ -1338,8 +1338,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
apic->regs + APIC_TMR);
}
- static_call(kvm_x86_deliver_interrupt)(apic, delivery_mode,
- trig_mode, vector);
+ kvm_x86_call(deliver_interrupt)(apic, delivery_mode,
+ trig_mode, vector);
break;
case APIC_DM_REMRD:
@@ -1557,7 +1557,8 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
remaining = 0;
ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period);
- return div64_u64(ns, (APIC_BUS_CYCLE_NS * apic->divide_count));
+ return div64_u64(ns, (apic->vcpu->kvm->arch.apic_bus_cycle_ns *
+ apic->divide_count));
}
static void __report_tpr_access(struct kvm_lapic *apic, bool write)
@@ -1973,7 +1974,8 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
static inline u64 tmict_to_ns(struct kvm_lapic *apic, u32 tmict)
{
- return (u64)tmict * APIC_BUS_CYCLE_NS * (u64)apic->divide_count;
+ return (u64)tmict * apic->vcpu->kvm->arch.apic_bus_cycle_ns *
+ (u64)apic->divide_count;
}
static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor)
@@ -2103,7 +2105,7 @@ static void cancel_hv_timer(struct kvm_lapic *apic)
{
WARN_ON(preemptible());
WARN_ON(!apic->lapic_timer.hv_timer_in_use);
- static_call(kvm_x86_cancel_hv_timer)(apic->vcpu);
+ kvm_x86_call(cancel_hv_timer)(apic->vcpu);
apic->lapic_timer.hv_timer_in_use = false;
}
@@ -2120,7 +2122,7 @@ static bool start_hv_timer(struct kvm_lapic *apic)
if (!ktimer->tscdeadline)
return false;
- if (static_call(kvm_x86_set_hv_timer)(vcpu, ktimer->tscdeadline, &expired))
+ if (kvm_x86_call(set_hv_timer)(vcpu, ktimer->tscdeadline, &expired))
return false;
ktimer->hv_timer_in_use = true;
@@ -2575,7 +2577,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) {
kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
- static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu);
+ kvm_x86_call(set_virtual_apic_mode)(vcpu);
}
apic->base_address = apic->vcpu->arch.apic_base &
@@ -2685,7 +2687,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
u64 msr_val;
int i;
- static_call_cond(kvm_x86_apicv_pre_state_restore)(vcpu);
+ kvm_x86_call(apicv_pre_state_restore)(vcpu);
if (!init_event) {
msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
@@ -2740,9 +2742,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.pv_eoi.msr_val = 0;
apic_update_ppr(apic);
if (apic->apicv_active) {
- static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
- static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1);
- static_call_cond(kvm_x86_hwapic_isr_update)(-1);
+ kvm_x86_call(apicv_post_state_restore)(vcpu);
+ kvm_x86_call(hwapic_irr_update)(vcpu, -1);
+ kvm_x86_call(hwapic_isr_update)(-1);
}
vcpu->arch.apic_arb_prio = 0;
@@ -2838,7 +2840,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
vcpu->arch.apic = apic;
if (kvm_x86_ops.alloc_apic_backing_page)
- apic->regs = static_call(kvm_x86_alloc_apic_backing_page)(vcpu);
+ apic->regs = kvm_x86_call(alloc_apic_backing_page)(vcpu);
else
apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!apic->regs) {
@@ -3017,7 +3019,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
struct kvm_lapic *apic = vcpu->arch.apic;
int r;
- static_call_cond(kvm_x86_apicv_pre_state_restore)(vcpu);
+ kvm_x86_call(apicv_pre_state_restore)(vcpu);
kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
/* set SPIV separately to get count of SW disabled APICs right */
@@ -3044,9 +3046,10 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
kvm_lapic_set_reg(apic, APIC_TMCCT, 0);
kvm_apic_update_apicv(vcpu);
if (apic->apicv_active) {
- static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
- static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic));
- static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic));
+ kvm_x86_call(apicv_post_state_restore)(vcpu);
+ kvm_x86_call(hwapic_irr_update)(vcpu,
+ apic_find_highest_irr(apic));
+ kvm_x86_call(hwapic_isr_update)(apic_find_highest_isr(apic));
}
kvm_make_request(KVM_REQ_EVENT, vcpu);
if (ioapic_in_kernel(vcpu->kvm))
@@ -3334,7 +3337,8 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu)
/* evaluate pending_events before reading the vector */
smp_rmb();
sipi_vector = apic->sipi_vector;
- static_call(kvm_x86_vcpu_deliver_sipi_vector)(vcpu, sipi_vector);
+ kvm_x86_call(vcpu_deliver_sipi_vector)(vcpu,
+ sipi_vector);
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
}
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index a69e706b9080..7ef8ae73e82d 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -16,8 +16,7 @@
#define APIC_DEST_NOSHORT 0x0
#define APIC_DEST_MASK 0x800
-#define APIC_BUS_CYCLE_NS 1
-#define APIC_BUS_FREQUENCY (1000000000ULL / APIC_BUS_CYCLE_NS)
+#define APIC_BUS_CYCLE_NS_DEFAULT 1
#define APIC_BROADCAST 0xFF
#define X2APIC_BROADCAST 0xFFFFFFFFul
@@ -236,7 +235,7 @@ static inline bool kvm_apic_has_pending_init_or_sipi(struct kvm_vcpu *vcpu)
static inline bool kvm_apic_init_sipi_allowed(struct kvm_vcpu *vcpu)
{
return !is_smm(vcpu) &&
- !static_call(kvm_x86_apic_init_signal_blocked)(vcpu);
+ !kvm_x86_call(apic_init_signal_blocked)(vcpu);
}
static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq)
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 2e454316f2a2..4341e0e28571 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -57,12 +57,6 @@ static __always_inline u64 rsvd_bits(int s, int e)
return ((2ULL << (e - s)) - 1) << s;
}
-/*
- * The number of non-reserved physical address bits irrespective of features
- * that repurpose legal bits, e.g. MKTME.
- */
-extern u8 __read_mostly shadow_phys_bits;
-
static inline gfn_t kvm_mmu_max_gfn(void)
{
/*
@@ -76,30 +70,11 @@ static inline gfn_t kvm_mmu_max_gfn(void)
* than hardware's real MAXPHYADDR. Using the host MAXPHYADDR
* disallows such SPTEs entirely and simplifies the TDP MMU.
*/
- int max_gpa_bits = likely(tdp_enabled) ? shadow_phys_bits : 52;
+ int max_gpa_bits = likely(tdp_enabled) ? kvm_host.maxphyaddr : 52;
return (1ULL << (max_gpa_bits - PAGE_SHIFT)) - 1;
}
-static inline u8 kvm_get_shadow_phys_bits(void)
-{
- /*
- * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
- * in CPU detection code, but the processor treats those reduced bits as
- * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
- * the physical address bits reported by CPUID.
- */
- if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
- return cpuid_eax(0x80000008) & 0xff;
-
- /*
- * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
- * custom CPUID. Proceed with whatever the kernel found since these features
- * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
- */
- return boot_cpu_data.x86_phys_bits;
-}
-
u8 kvm_mmu_get_max_tdp_level(void);
void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
@@ -163,8 +138,8 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(root_hpa))
return;
- static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa,
- vcpu->arch.mmu->root_role.level);
+ kvm_x86_call(load_mmu_pgd)(vcpu, root_hpa,
+ vcpu->arch.mmu->root_role.level);
}
static inline void kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
@@ -199,7 +174,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
{
/* strip nested paging fault error codes */
unsigned int pfec = access;
- unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
+ unsigned long rflags = kvm_x86_call(get_rflags)(vcpu);
/*
* For explicit supervisor accesses, SMAP is disabled if EFLAGS.AC = 1.
@@ -246,14 +221,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
return -(u32)fault & errcode;
}
-bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma);
-
-static inline bool kvm_mmu_honors_guest_mtrrs(struct kvm *kvm)
-{
- return __kvm_mmu_honors_guest_mtrrs(kvm_arch_has_noncoherent_dma(kvm));
-}
-
-void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
+bool kvm_mmu_may_ignore_guest_pat(void);
int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 8d74bdef68c1..901be9e420a4 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -722,7 +722,7 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
if (sp->role.passthrough)
return sp->gfn;
- if (!sp->role.direct)
+ if (sp->shadowed_translation)
return sp->shadowed_translation[index] >> PAGE_SHIFT;
return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS));
@@ -736,7 +736,7 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
*/
static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
{
- if (sp_has_gptes(sp))
+ if (sp->shadowed_translation)
return sp->shadowed_translation[index] & ACC_ALL;
/*
@@ -757,7 +757,7 @@ static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index,
gfn_t gfn, unsigned int access)
{
- if (sp_has_gptes(sp)) {
+ if (sp->shadowed_translation) {
sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access;
return;
}
@@ -1700,8 +1700,7 @@ static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
hlist_del(&sp->hash_link);
list_del(&sp->link);
free_page((unsigned long)sp->spt);
- if (!sp->role.direct)
- free_page((unsigned long)sp->shadowed_translation);
+ free_page((unsigned long)sp->shadowed_translation);
kmem_cache_free(mmu_page_header_cache, sp);
}
@@ -2203,7 +2202,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
- if (!role.direct)
+ if (!role.direct && role.level <= KVM_MAX_HUGEPAGE_LEVEL)
sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
@@ -3308,7 +3307,7 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu,
return RET_PF_CONTINUE;
}
-static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
+static bool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault)
{
/*
* Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only
@@ -3320,6 +3319,26 @@ static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
return false;
/*
+ * For hardware-protected VMs, certain conditions like attempting to
+ * perform a write to a page which is not in the state that the guest
+ * expects it to be in can result in a nested/extended #PF. In this
+ * case, the below code might misconstrue this situation as being the
+ * result of a write-protected access, and treat it as a spurious case
+ * rather than taking any action to satisfy the real source of the #PF
+ * such as generating a KVM_EXIT_MEMORY_FAULT. This can lead to the
+ * guest spinning on a #PF indefinitely, so don't attempt the fast path
+ * in this case.
+ *
+ * Note that the kvm_mem_is_private() check might race with an
+ * attribute update, but this will either result in the guest spinning
+ * on RET_PF_SPURIOUS until the update completes, or an actual spurious
+ * case might go down the slow path. Either case will resolve itself.
+ */
+ if (kvm->arch.has_private_mem &&
+ fault->is_private != kvm_mem_is_private(kvm, fault->gfn))
+ return false;
+
+ /*
* #PF can be fast if:
*
* 1. The shadow page table entry is not present and A/D bits are
@@ -3419,7 +3438,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
u64 *sptep;
uint retry_count = 0;
- if (!page_fault_can_be_fast(fault))
+ if (!page_fault_can_be_fast(vcpu->kvm, fault))
return ret;
walk_shadow_page_lockless_begin(vcpu);
@@ -3428,7 +3447,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
u64 new_spte;
if (tdp_mmu_enabled)
- sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
+ sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->gfn, &spte);
else
sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
@@ -3438,7 +3457,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
* available as the vCPU holds a reference to its root(s).
*/
if (WARN_ON_ONCE(!sptep))
- spte = REMOVED_SPTE;
+ spte = FROZEN_SPTE;
if (!is_shadow_present_pte(spte))
break;
@@ -4271,7 +4290,16 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
return;
- kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, true, NULL);
+ r = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code,
+ true, NULL, NULL);
+
+ /*
+ * Account fixed page faults, otherwise they'll never be counted, but
+ * ignore stats for all other return times. Page-ready "faults" aren't
+ * truly spurious and never trigger emulation
+ */
+ if (r == RET_PF_FIXED)
+ vcpu->stat.pf_fixed++;
}
static inline u8 kvm_max_level_for_order(int order)
@@ -4291,6 +4319,25 @@ static inline u8 kvm_max_level_for_order(int order)
return PG_LEVEL_4K;
}
+static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
+ u8 max_level, int gmem_order)
+{
+ u8 req_max_level;
+
+ if (max_level == PG_LEVEL_4K)
+ return PG_LEVEL_4K;
+
+ max_level = min(kvm_max_level_for_order(gmem_order), max_level);
+ if (max_level == PG_LEVEL_4K)
+ return PG_LEVEL_4K;
+
+ req_max_level = kvm_x86_call(private_max_mapping_level)(kvm, pfn);
+ if (req_max_level)
+ max_level = min(max_level, req_max_level);
+
+ return req_max_level;
+}
+
static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault)
{
@@ -4308,9 +4355,9 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
return r;
}
- fault->max_level = min(kvm_max_level_for_order(max_order),
- fault->max_level);
fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
+ fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn,
+ fault->max_level, max_order);
return RET_PF_CONTINUE;
}
@@ -4561,7 +4608,10 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
if (WARN_ON_ONCE(error_code >> 32))
error_code = lower_32_bits(error_code);
- /* Ensure the above sanity check also covers KVM-defined flags. */
+ /*
+ * Restrict KVM-defined flags to bits 63:32 so that it's impossible for
+ * them to conflict with #PF error codes, which are limited to 32 bits.
+ */
BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK));
vcpu->arch.l1tf_flush_l1d = true;
@@ -4621,38 +4671,23 @@ out_unlock:
}
#endif
-bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma)
+bool kvm_mmu_may_ignore_guest_pat(void)
{
/*
- * If host MTRRs are ignored (shadow_memtype_mask is non-zero), and the
- * VM has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is
- * to honor the memtype from the guest's MTRRs so that guest accesses
- * to memory that is DMA'd aren't cached against the guest's wishes.
- *
- * Note, KVM may still ultimately ignore guest MTRRs for certain PFNs,
- * e.g. KVM will force UC memtype for host MMIO.
+ * When EPT is enabled (shadow_memtype_mask is non-zero), the CPU does
+ * not support self-snoop (or is affected by an erratum), and the VM
+ * has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to
+ * honor the memtype from the guest's PAT so that guest accesses to
+ * memory that is DMA'd aren't cached against the guest's wishes. As a
+ * result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA,
+ * KVM _always_ ignores or honors guest PAT, i.e. doesn't toggle SPTE
+ * bits in response to non-coherent device (un)registration.
*/
- return vm_has_noncoherent_dma && shadow_memtype_mask;
+ return !static_cpu_has(X86_FEATURE_SELFSNOOP) && shadow_memtype_mask;
}
int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- /*
- * If the guest's MTRRs may be used to compute the "real" memtype,
- * restrict the mapping level to ensure KVM uses a consistent memtype
- * across the entire mapping.
- */
- if (kvm_mmu_honors_guest_mtrrs(vcpu->kvm)) {
- for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
- int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
- gfn_t base = gfn_round_for_level(fault->gfn,
- fault->max_level);
-
- if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
- break;
- }
- }
-
#ifdef CONFIG_X86_64
if (tdp_mmu_enabled)
return kvm_tdp_mmu_page_fault(vcpu, fault);
@@ -4661,6 +4696,79 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
return direct_page_fault(vcpu, fault);
}
+static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
+ u8 *level)
+{
+ int r;
+
+ /*
+ * Restrict to TDP page fault, since that's the only case where the MMU
+ * is indexed by GPA.
+ */
+ if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault)
+ return -EOPNOTSUPP;
+
+ do {
+ if (signal_pending(current))
+ return -EINTR;
+ cond_resched();
+ r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level);
+ } while (r == RET_PF_RETRY);
+
+ if (r < 0)
+ return r;
+
+ switch (r) {
+ case RET_PF_FIXED:
+ case RET_PF_SPURIOUS:
+ return 0;
+
+ case RET_PF_EMULATE:
+ return -ENOENT;
+
+ case RET_PF_RETRY:
+ case RET_PF_CONTINUE:
+ case RET_PF_INVALID:
+ default:
+ WARN_ONCE(1, "could not fix page fault during prefault");
+ return -EIO;
+ }
+}
+
+long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
+ struct kvm_pre_fault_memory *range)
+{
+ u64 error_code = PFERR_GUEST_FINAL_MASK;
+ u8 level = PG_LEVEL_4K;
+ u64 end;
+ int r;
+
+ /*
+ * reload is efficient when called repeatedly, so we can do it on
+ * every iteration.
+ */
+ kvm_mmu_reload(vcpu);
+
+ if (kvm_arch_has_private_mem(vcpu->kvm) &&
+ kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa)))
+ error_code |= PFERR_PRIVATE_ACCESS;
+
+ /*
+ * Shadow paging uses GVA for kvm page fault, so restrict to
+ * two-dimensional paging.
+ */
+ r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level);
+ if (r < 0)
+ return r;
+
+ /*
+ * If the mapping that covers range->gpa can use a huge page, it
+ * may start below it or end after range->gpa + range->size.
+ */
+ end = (range->gpa & KVM_HPAGE_MASK(level)) + KVM_HPAGE_SIZE(level);
+ return min(range->size, end - range->gpa);
+}
+
static void nonpaging_init_context(struct kvm_mmu *context)
{
context->page_fault = nonpaging_page_fault;
@@ -4988,7 +5096,7 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
static inline u64 reserved_hpa_bits(void)
{
- return rsvd_bits(shadow_phys_bits, 63);
+ return rsvd_bits(kvm_host.maxphyaddr, 63);
}
/*
@@ -5633,7 +5741,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
* stale entries. Flushing on alloc also allows KVM to skip the TLB
* flush when freeing a root (see kvm_tdp_mmu_put_root()).
*/
- static_call(kvm_x86_flush_tlb_current)(vcpu);
+ kvm_x86_call(flush_tlb_current)(vcpu);
out:
return r;
}
@@ -5886,14 +5994,24 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
}
if (r == RET_PF_INVALID) {
+ vcpu->stat.pf_taken++;
+
r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false,
- &emulation_type);
+ &emulation_type, NULL);
if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
return -EIO;
}
if (r < 0)
return r;
+
+ if (r == RET_PF_FIXED)
+ vcpu->stat.pf_fixed++;
+ else if (r == RET_PF_EMULATE)
+ vcpu->stat.pf_emulate++;
+ else if (r == RET_PF_SPURIOUS)
+ vcpu->stat.pf_spurious++;
+
if (r != RET_PF_EMULATE)
return 1;
@@ -5995,7 +6113,7 @@ void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
if (is_noncanonical_address(addr, vcpu))
return;
- static_call(kvm_x86_flush_tlb_gva)(vcpu, addr);
+ kvm_x86_call(flush_tlb_gva)(vcpu, addr);
}
if (!mmu->sync_spte)
@@ -6787,6 +6905,7 @@ restart:
return need_tlb_flush;
}
+EXPORT_SYMBOL_GPL(kvm_zap_gfn_range);
static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *slot)
@@ -6917,7 +7036,6 @@ static unsigned long mmu_shrink_scan(struct shrinker *shrink,
list_for_each_entry(kvm, &vm_list, vm_list) {
int idx;
- LIST_HEAD(invalid_list);
/*
* Never scan more than sc->nr_to_scan VM instances.
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index ce2fcd19ba6b..1721d97743e9 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -288,7 +288,8 @@ static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
}
static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- u64 err, bool prefetch, int *emulation_type)
+ u64 err, bool prefetch,
+ int *emulation_type, u8 *level)
{
struct kvm_page_fault fault = {
.addr = cr2_or_gpa,
@@ -318,14 +319,6 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn);
}
- /*
- * Async #PF "faults", a.k.a. prefetch faults, are not faults from the
- * guest perspective and have already been counted at the time of the
- * original fault.
- */
- if (!prefetch)
- vcpu->stat.pf_taken++;
-
if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp)
r = kvm_tdp_page_fault(vcpu, &fault);
else
@@ -344,20 +337,9 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
if (fault.write_fault_to_shadow_pgtable && emulation_type)
*emulation_type |= EMULTYPE_WRITE_PF_TO_SP;
+ if (level)
+ *level = fault.goal_level;
- /*
- * Similar to above, prefetch faults aren't truly spurious, and the
- * async #PF path doesn't do emulation. Do count faults that are fixed
- * by the async #PF handler though, otherwise they'll never be counted.
- */
- if (r == RET_PF_FIXED)
- vcpu->stat.pf_fixed++;
- else if (prefetch)
- ;
- else if (r == RET_PF_EMULATE)
- vcpu->stat.pf_emulate++;
- else if (r == RET_PF_SPURIOUS)
- vcpu->stat.pf_spurious++;
return r;
}
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index d3dbcf382ed2..69941cebb3a8 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -911,7 +911,8 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int
gpa_t pte_gpa;
gfn_t gfn;
- if (WARN_ON_ONCE(sp->spt[i] == SHADOW_NONPRESENT_VALUE))
+ if (WARN_ON_ONCE(sp->spt[i] == SHADOW_NONPRESENT_VALUE ||
+ !sp->shadowed_translation))
return 0;
first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index a5e014d7bc62..d4527965e48c 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -43,7 +43,25 @@ u64 __read_mostly shadow_acc_track_mask;
u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
-u8 __read_mostly shadow_phys_bits;
+static u8 __init kvm_get_host_maxphyaddr(void)
+{
+ /*
+ * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
+ * in CPU detection code, but the processor treats those reduced bits as
+ * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
+ * the physical address bits reported by CPUID, i.e. the raw MAXPHYADDR,
+ * when reasoning about CPU behavior with respect to MAXPHYADDR.
+ */
+ if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
+ return cpuid_eax(0x80000008) & 0xff;
+
+ /*
+ * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
+ * custom CPUID. Proceed with whatever the kernel found since these features
+ * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
+ */
+ return boot_cpu_data.x86_phys_bits;
+}
void __init kvm_mmu_spte_module_init(void)
{
@@ -55,6 +73,8 @@ void __init kvm_mmu_spte_module_init(void)
* will change when the vendor module is (re)loaded.
*/
allow_mmio_caching = enable_mmio_caching;
+
+ kvm_host.maxphyaddr = kvm_get_host_maxphyaddr();
}
static u64 generation_mmio_spte_mask(u64 gen)
@@ -190,8 +210,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
spte |= PT_PAGE_SIZE_MASK;
if (shadow_memtype_mask)
- spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn,
- kvm_is_mmio_pfn(pfn));
+ spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn,
+ kvm_is_mmio_pfn(pfn));
if (host_writable)
spte |= shadow_host_writable_mask;
else
@@ -271,18 +291,12 @@ static u64 make_spte_executable(u64 spte)
* This is used during huge page splitting to build the SPTEs that make up the
* new page table.
*/
-u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, union kvm_mmu_page_role role,
- int index)
+u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte,
+ union kvm_mmu_page_role role, int index)
{
- u64 child_spte;
-
- if (WARN_ON_ONCE(!is_shadow_present_pte(huge_spte)))
- return 0;
+ u64 child_spte = huge_spte;
- if (WARN_ON_ONCE(!is_large_pte(huge_spte)))
- return 0;
-
- child_spte = huge_spte;
+ KVM_BUG_ON(!is_shadow_present_pte(huge_spte) || !is_large_pte(huge_spte), kvm);
/*
* The child_spte already has the base address of the huge page being
@@ -383,7 +397,7 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask)
* not set any RWX bits.
*/
if (WARN_ON((mmio_value & mmio_mask) != mmio_value) ||
- WARN_ON(mmio_value && (REMOVED_SPTE & mmio_mask) == mmio_value))
+ WARN_ON(mmio_value && (FROZEN_SPTE & mmio_mask) == mmio_value))
mmio_value = 0;
if (!mmio_value)
@@ -441,8 +455,6 @@ void kvm_mmu_reset_all_pte_masks(void)
u8 low_phys_bits;
u64 mask;
- shadow_phys_bits = kvm_get_shadow_phys_bits();
-
/*
* If the CPU has 46 or less physical address bits, then set an
* appropriate mask to guard against L1TF attacks. Otherwise, it is
@@ -494,7 +506,7 @@ void kvm_mmu_reset_all_pte_masks(void)
* 52-bit physical addresses then there are no reserved PA bits in the
* PTEs and so the reserved PA approach must be disabled.
*/
- if (shadow_phys_bits < 52)
+ if (kvm_host.maxphyaddr < 52)
mask = BIT_ULL(51) | PT_PRESENT_MASK;
else
mask = 0;
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 52fa004a1fbc..ef793c459b05 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -202,7 +202,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
/*
* If a thread running without exclusive control of the MMU lock must perform a
- * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a
+ * multi-part operation on an SPTE, it can set the SPTE to FROZEN_SPTE as a
* non-present intermediate value. Other threads which encounter this value
* should not modify the SPTE.
*
@@ -212,14 +212,14 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
*
* Only used by the TDP MMU.
*/
-#define REMOVED_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL)
+#define FROZEN_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL)
/* Removed SPTEs must not be misconstrued as shadow present PTEs. */
-static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));
+static_assert(!(FROZEN_SPTE & SPTE_MMU_PRESENT_MASK));
-static inline bool is_removed_spte(u64 spte)
+static inline bool is_frozen_spte(u64 spte)
{
- return spte == REMOVED_SPTE;
+ return spte == FROZEN_SPTE;
}
/* Get an SPTE's index into its parent's page table (and the spt array). */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 36539c1b36cd..c7dc49ee7388 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -365,8 +365,8 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
* value to the removed SPTE value.
*/
for (;;) {
- old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
- if (!is_removed_spte(old_spte))
+ old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE);
+ if (!is_frozen_spte(old_spte))
break;
cpu_relax();
}
@@ -397,11 +397,11 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
* No retry is needed in the atomic update path as the
* sole concern is dropping a Dirty bit, i.e. no other
* task can zap/remove the SPTE as mmu_lock is held for
- * write. Marking the SPTE as a removed SPTE is not
+ * write. Marking the SPTE as a frozen SPTE is not
* strictly necessary for the same reason, but using
- * the remove SPTE value keeps the shared/exclusive
+ * the frozen SPTE value keeps the shared/exclusive
* paths consistent and allows the handle_changed_spte()
- * call below to hardcode the new value to REMOVED_SPTE.
+ * call below to hardcode the new value to FROZEN_SPTE.
*
* Note, even though dropping a Dirty bit is the only
* scenario where a non-atomic update could result in a
@@ -413,10 +413,10 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
* it here.
*/
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
- REMOVED_SPTE, level);
+ FROZEN_SPTE, level);
}
handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
- old_spte, REMOVED_SPTE, level, shared);
+ old_spte, FROZEN_SPTE, level, shared);
}
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
@@ -490,19 +490,19 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
*/
if (!was_present && !is_present) {
/*
- * If this change does not involve a MMIO SPTE or removed SPTE,
+ * If this change does not involve a MMIO SPTE or frozen SPTE,
* it is unexpected. Log the change, though it should not
* impact the guest since both the former and current SPTEs
* are nonpresent.
*/
if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) &&
!is_mmio_spte(kvm, new_spte) &&
- !is_removed_spte(new_spte)))
+ !is_frozen_spte(new_spte)))
pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
"should not be replaced with another,\n"
"different nonpresent SPTE, unless one or both\n"
"are MMIO SPTEs, or the new SPTE is\n"
- "a temporary removed SPTE.\n"
+ "a temporary frozen SPTE.\n"
"as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
as_id, gfn, old_spte, new_spte, level);
return;
@@ -530,7 +530,8 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}
-static inline int __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, u64 new_spte)
+static inline int __must_check __tdp_mmu_set_spte_atomic(struct tdp_iter *iter,
+ u64 new_spte)
{
u64 *sptep = rcu_dereference(iter->sptep);
@@ -540,7 +541,7 @@ static inline int __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, u64 new_spte)
* and pre-checking before inserting a new SPTE is advantageous as it
* avoids unnecessary work.
*/
- WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
+ WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));
/*
* Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
@@ -572,9 +573,9 @@ static inline int __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, u64 new_spte)
* no side-effects other than setting iter->old_spte to the last
* known value of the spte.
*/
-static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
- struct tdp_iter *iter,
- u64 new_spte)
+static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter,
+ u64 new_spte)
{
int ret;
@@ -590,8 +591,8 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
return 0;
}
-static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
- struct tdp_iter *iter)
+static inline int __must_check tdp_mmu_zap_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter)
{
int ret;
@@ -603,26 +604,26 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
* in its place before the TLBs are flushed.
*
* Delay processing of the zapped SPTE until after TLBs are flushed and
- * the REMOVED_SPTE is replaced (see below).
+ * the FROZEN_SPTE is replaced (see below).
*/
- ret = __tdp_mmu_set_spte_atomic(iter, REMOVED_SPTE);
+ ret = __tdp_mmu_set_spte_atomic(iter, FROZEN_SPTE);
if (ret)
return ret;
kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);
/*
- * No other thread can overwrite the removed SPTE as they must either
+ * No other thread can overwrite the frozen SPTE as they must either
* wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
- * overwrite the special removed SPTE value. Use the raw write helper to
+ * overwrite the special frozen SPTE value. Use the raw write helper to
* avoid an unnecessary check on volatile bits.
*/
__kvm_tdp_mmu_write_spte(iter->sptep, SHADOW_NONPRESENT_VALUE);
/*
* Process the zapped SPTE after flushing TLBs, and after replacing
- * REMOVED_SPTE with 0. This minimizes the amount of time vCPUs are
- * blocked by the REMOVED_SPTE and reduces contention on the child
+ * FROZEN_SPTE with 0. This minimizes the amount of time vCPUs are
+ * blocked by the FROZEN_SPTE and reduces contention on the child
* SPTEs.
*/
handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
@@ -652,12 +653,12 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
/*
* No thread should be using this function to set SPTEs to or from the
- * temporary removed SPTE value.
+ * temporary frozen SPTE value.
* If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
* should be used. If operating under the MMU lock in write mode, the
- * use of the removed SPTE should not be necessary.
+ * use of the frozen SPTE should not be necessary.
*/
- WARN_ON_ONCE(is_removed_spte(old_spte) || is_removed_spte(new_spte));
+ WARN_ON_ONCE(is_frozen_spte(old_spte) || is_frozen_spte(new_spte));
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
@@ -1126,7 +1127,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
* If SPTE has been frozen by another thread, just give up and
* retry, avoiding unnecessary page table allocation and free.
*/
- if (is_removed_spte(iter.old_spte))
+ if (is_frozen_spte(iter.old_spte))
goto retry;
if (iter.level == fault->goal_level)
@@ -1339,17 +1340,15 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
return spte_set;
}
-static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
+static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(void)
{
struct kvm_mmu_page *sp;
- gfp |= __GFP_ZERO;
-
- sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
+ sp = kmem_cache_zalloc(mmu_page_header_cache, GFP_KERNEL_ACCOUNT);
if (!sp)
return NULL;
- sp->spt = (void *)__get_free_page(gfp);
+ sp->spt = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!sp->spt) {
kmem_cache_free(mmu_page_header_cache, sp);
return NULL;
@@ -1358,47 +1357,6 @@ static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
return sp;
}
-static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
- struct tdp_iter *iter,
- bool shared)
-{
- struct kvm_mmu_page *sp;
-
- kvm_lockdep_assert_mmu_lock_held(kvm, shared);
-
- /*
- * Since we are allocating while under the MMU lock we have to be
- * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
- * reclaim and to avoid making any filesystem callbacks (which can end
- * up invoking KVM MMU notifiers, resulting in a deadlock).
- *
- * If this allocation fails we drop the lock and retry with reclaim
- * allowed.
- */
- sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
- if (sp)
- return sp;
-
- rcu_read_unlock();
-
- if (shared)
- read_unlock(&kvm->mmu_lock);
- else
- write_unlock(&kvm->mmu_lock);
-
- iter->yielded = true;
- sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
-
- if (shared)
- read_lock(&kvm->mmu_lock);
- else
- write_lock(&kvm->mmu_lock);
-
- rcu_read_lock();
-
- return sp;
-}
-
/* Note, the caller is responsible for initializing @sp. */
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
struct kvm_mmu_page *sp, bool shared)
@@ -1445,7 +1403,6 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
{
struct kvm_mmu_page *sp = NULL;
struct tdp_iter iter;
- int ret = 0;
rcu_read_lock();
@@ -1469,17 +1426,31 @@ retry:
continue;
if (!sp) {
- sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
+ rcu_read_unlock();
+
+ if (shared)
+ read_unlock(&kvm->mmu_lock);
+ else
+ write_unlock(&kvm->mmu_lock);
+
+ sp = tdp_mmu_alloc_sp_for_split();
+
+ if (shared)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
+
if (!sp) {
- ret = -ENOMEM;
trace_kvm_mmu_split_huge_page(iter.gfn,
iter.old_spte,
- iter.level, ret);
- break;
+ iter.level, -ENOMEM);
+ return -ENOMEM;
}
- if (iter.yielded)
- continue;
+ rcu_read_lock();
+
+ iter.yielded = true;
+ continue;
}
tdp_mmu_init_child_sp(sp, &iter);
@@ -1500,7 +1471,7 @@ retry:
if (sp)
tdp_mmu_free_sp(sp);
- return ret;
+ return 0;
}
@@ -1801,12 +1772,11 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
*
* WARNING: This function is only intended to be called during fast_page_fault.
*/
-u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
+u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
u64 *spte)
{
struct tdp_iter iter;
struct kvm_mmu *mmu = vcpu->arch.mmu;
- gfn_t gfn = addr >> PAGE_SHIFT;
tdp_ptep_t sptep = NULL;
tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 58b55e61bd33..1b74e058a81c 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -64,7 +64,7 @@ static inline void kvm_tdp_mmu_walk_lockless_end(void)
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
int *root_level);
-u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
+u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
u64 *spte);
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index a67c28a56417..05490b9d8a43 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -19,33 +19,21 @@
#include <asm/mtrr.h>
#include "cpuid.h"
-#include "mmu.h"
-#define IA32_MTRR_DEF_TYPE_E (1ULL << 11)
-#define IA32_MTRR_DEF_TYPE_FE (1ULL << 10)
-#define IA32_MTRR_DEF_TYPE_TYPE_MASK (0xff)
-
-static bool is_mtrr_base_msr(unsigned int msr)
-{
- /* MTRR base MSRs use even numbers, masks use odd numbers. */
- return !(msr & 0x1);
-}
-
-static struct kvm_mtrr_range *var_mtrr_msr_to_range(struct kvm_vcpu *vcpu,
- unsigned int msr)
+static u64 *find_mtrr(struct kvm_vcpu *vcpu, unsigned int msr)
{
- int index = (msr - MTRRphysBase_MSR(0)) / 2;
-
- return &vcpu->arch.mtrr_state.var_ranges[index];
-}
+ int index;
-static bool msr_mtrr_valid(unsigned msr)
-{
switch (msr) {
case MTRRphysBase_MSR(0) ... MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1):
+ index = msr - MTRRphysBase_MSR(0);
+ return &vcpu->arch.mtrr_state.var[index];
case MSR_MTRRfix64K_00000:
+ return &vcpu->arch.mtrr_state.fixed_64k;
case MSR_MTRRfix16K_80000:
case MSR_MTRRfix16K_A0000:
+ index = msr - MSR_MTRRfix16K_80000;
+ return &vcpu->arch.mtrr_state.fixed_16k[index];
case MSR_MTRRfix4K_C0000:
case MSR_MTRRfix4K_C8000:
case MSR_MTRRfix4K_D0000:
@@ -54,10 +42,14 @@ static bool msr_mtrr_valid(unsigned msr)
case MSR_MTRRfix4K_E8000:
case MSR_MTRRfix4K_F0000:
case MSR_MTRRfix4K_F8000:
+ index = msr - MSR_MTRRfix4K_C0000;
+ return &vcpu->arch.mtrr_state.fixed_4k[index];
case MSR_MTRRdefType:
- return true;
+ return &vcpu->arch.mtrr_state.deftype;
+ default:
+ break;
}
- return false;
+ return NULL;
}
static bool valid_mtrr_type(unsigned t)
@@ -70,9 +62,6 @@ static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
int i;
u64 mask;
- if (!msr_mtrr_valid(msr))
- return false;
-
if (msr == MSR_MTRRdefType) {
if (data & ~0xcff)
return false;
@@ -85,8 +74,9 @@ static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
}
/* variable MTRRs */
- WARN_ON(!(msr >= MTRRphysBase_MSR(0) &&
- msr <= MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1)));
+ if (WARN_ON_ONCE(!(msr >= MTRRphysBase_MSR(0) &&
+ msr <= MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1))))
+ return false;
mask = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
if ((msr & 1) == 0) {
@@ -94,309 +84,32 @@ static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if (!valid_mtrr_type(data & 0xff))
return false;
mask |= 0xf00;
- } else
+ } else {
/* MTRR mask */
mask |= 0x7ff;
-
- return (data & mask) == 0;
-}
-
-static bool mtrr_is_enabled(struct kvm_mtrr *mtrr_state)
-{
- return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_E);
-}
-
-static bool fixed_mtrr_is_enabled(struct kvm_mtrr *mtrr_state)
-{
- return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_FE);
-}
-
-static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state)
-{
- return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK;
-}
-
-static u8 mtrr_disabled_type(struct kvm_vcpu *vcpu)
-{
- /*
- * Intel SDM 11.11.2.2: all MTRRs are disabled when
- * IA32_MTRR_DEF_TYPE.E bit is cleared, and the UC
- * memory type is applied to all of physical memory.
- *
- * However, virtual machines can be run with CPUID such that
- * there are no MTRRs. In that case, the firmware will never
- * enable MTRRs and it is obviously undesirable to run the
- * guest entirely with UC memory and we use WB.
- */
- if (guest_cpuid_has(vcpu, X86_FEATURE_MTRR))
- return MTRR_TYPE_UNCACHABLE;
- else
- return MTRR_TYPE_WRBACK;
-}
-
-/*
-* Three terms are used in the following code:
-* - segment, it indicates the address segments covered by fixed MTRRs.
-* - unit, it corresponds to the MSR entry in the segment.
-* - range, a range is covered in one memory cache type.
-*/
-struct fixed_mtrr_segment {
- u64 start;
- u64 end;
-
- int range_shift;
-
- /* the start position in kvm_mtrr.fixed_ranges[]. */
- int range_start;
-};
-
-static struct fixed_mtrr_segment fixed_seg_table[] = {
- /* MSR_MTRRfix64K_00000, 1 unit. 64K fixed mtrr. */
- {
- .start = 0x0,
- .end = 0x80000,
- .range_shift = 16, /* 64K */
- .range_start = 0,
- },
-
- /*
- * MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000, 2 units,
- * 16K fixed mtrr.
- */
- {
- .start = 0x80000,
- .end = 0xc0000,
- .range_shift = 14, /* 16K */
- .range_start = 8,
- },
-
- /*
- * MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000, 8 units,
- * 4K fixed mtrr.
- */
- {
- .start = 0xc0000,
- .end = 0x100000,
- .range_shift = 12, /* 12K */
- .range_start = 24,
- }
-};
-
-/*
- * The size of unit is covered in one MSR, one MSR entry contains
- * 8 ranges so that unit size is always 8 * 2^range_shift.
- */
-static u64 fixed_mtrr_seg_unit_size(int seg)
-{
- return 8 << fixed_seg_table[seg].range_shift;
-}
-
-static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit)
-{
- switch (msr) {
- case MSR_MTRRfix64K_00000:
- *seg = 0;
- *unit = 0;
- break;
- case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000:
- *seg = 1;
- *unit = array_index_nospec(
- msr - MSR_MTRRfix16K_80000,
- MSR_MTRRfix16K_A0000 - MSR_MTRRfix16K_80000 + 1);
- break;
- case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
- *seg = 2;
- *unit = array_index_nospec(
- msr - MSR_MTRRfix4K_C0000,
- MSR_MTRRfix4K_F8000 - MSR_MTRRfix4K_C0000 + 1);
- break;
- default:
- return false;
}
- return true;
-}
-
-static void fixed_mtrr_seg_unit_range(int seg, int unit, u64 *start, u64 *end)
-{
- struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
- u64 unit_size = fixed_mtrr_seg_unit_size(seg);
-
- *start = mtrr_seg->start + unit * unit_size;
- *end = *start + unit_size;
- WARN_ON(*end > mtrr_seg->end);
-}
-
-static int fixed_mtrr_seg_unit_range_index(int seg, int unit)
-{
- struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
-
- WARN_ON(mtrr_seg->start + unit * fixed_mtrr_seg_unit_size(seg)
- > mtrr_seg->end);
-
- /* each unit has 8 ranges. */
- return mtrr_seg->range_start + 8 * unit;
-}
-
-static int fixed_mtrr_seg_end_range_index(int seg)
-{
- struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
- int n;
-
- n = (mtrr_seg->end - mtrr_seg->start) >> mtrr_seg->range_shift;
- return mtrr_seg->range_start + n - 1;
-}
-
-static bool fixed_msr_to_range(u32 msr, u64 *start, u64 *end)
-{
- int seg, unit;
-
- if (!fixed_msr_to_seg_unit(msr, &seg, &unit))
- return false;
-
- fixed_mtrr_seg_unit_range(seg, unit, start, end);
- return true;
-}
-
-static int fixed_msr_to_range_index(u32 msr)
-{
- int seg, unit;
-
- if (!fixed_msr_to_seg_unit(msr, &seg, &unit))
- return -1;
-
- return fixed_mtrr_seg_unit_range_index(seg, unit);
-}
-
-static int fixed_mtrr_addr_to_seg(u64 addr)
-{
- struct fixed_mtrr_segment *mtrr_seg;
- int seg, seg_num = ARRAY_SIZE(fixed_seg_table);
-
- for (seg = 0; seg < seg_num; seg++) {
- mtrr_seg = &fixed_seg_table[seg];
- if (mtrr_seg->start <= addr && addr < mtrr_seg->end)
- return seg;
- }
-
- return -1;
-}
-
-static int fixed_mtrr_addr_seg_to_range_index(u64 addr, int seg)
-{
- struct fixed_mtrr_segment *mtrr_seg;
- int index;
-
- mtrr_seg = &fixed_seg_table[seg];
- index = mtrr_seg->range_start;
- index += (addr - mtrr_seg->start) >> mtrr_seg->range_shift;
- return index;
-}
-
-static u64 fixed_mtrr_range_end_addr(int seg, int index)
-{
- struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
- int pos = index - mtrr_seg->range_start;
-
- return mtrr_seg->start + ((pos + 1) << mtrr_seg->range_shift);
-}
-
-static void var_mtrr_range(struct kvm_mtrr_range *range, u64 *start, u64 *end)
-{
- u64 mask;
-
- *start = range->base & PAGE_MASK;
-
- mask = range->mask & PAGE_MASK;
-
- /* This cannot overflow because writing to the reserved bits of
- * variable MTRRs causes a #GP.
- */
- *end = (*start | ~mask) + 1;
-}
-
-static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
-{
- struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
- gfn_t start, end;
-
- if (!kvm_mmu_honors_guest_mtrrs(vcpu->kvm))
- return;
-
- if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType)
- return;
-
- /* fixed MTRRs. */
- if (fixed_msr_to_range(msr, &start, &end)) {
- if (!fixed_mtrr_is_enabled(mtrr_state))
- return;
- } else if (msr == MSR_MTRRdefType) {
- start = 0x0;
- end = ~0ULL;
- } else {
- /* variable range MTRRs. */
- var_mtrr_range(var_mtrr_msr_to_range(vcpu, msr), &start, &end);
- }
-
- kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end));
-}
-
-static bool var_mtrr_range_is_valid(struct kvm_mtrr_range *range)
-{
- return (range->mask & (1 << 11)) != 0;
-}
-
-static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-{
- struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
- struct kvm_mtrr_range *tmp, *cur;
-
- cur = var_mtrr_msr_to_range(vcpu, msr);
-
- /* remove the entry if it's in the list. */
- if (var_mtrr_range_is_valid(cur))
- list_del(&cur->node);
-
- /*
- * Set all illegal GPA bits in the mask, since those bits must
- * implicitly be 0. The bits are then cleared when reading them.
- */
- if (is_mtrr_base_msr(msr))
- cur->base = data;
- else
- cur->mask = data | kvm_vcpu_reserved_gpa_bits_raw(vcpu);
-
- /* add it to the list if it's enabled. */
- if (var_mtrr_range_is_valid(cur)) {
- list_for_each_entry(tmp, &mtrr_state->head, node)
- if (cur->base >= tmp->base)
- break;
- list_add_tail(&cur->node, &tmp->node);
- }
+ return (data & mask) == 0;
}
int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
- int index;
+ u64 *mtrr;
- if (!kvm_mtrr_valid(vcpu, msr, data))
+ mtrr = find_mtrr(vcpu, msr);
+ if (!mtrr)
return 1;
- index = fixed_msr_to_range_index(msr);
- if (index >= 0)
- *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index] = data;
- else if (msr == MSR_MTRRdefType)
- vcpu->arch.mtrr_state.deftype = data;
- else
- set_var_mtrr_msr(vcpu, msr, data);
+ if (!kvm_mtrr_valid(vcpu, msr, data))
+ return 1;
- update_mtrr(vcpu, msr);
+ *mtrr = data;
return 0;
}
int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
- int index;
+ u64 *mtrr;
/* MSR_MTRRcap is a readonly MSR. */
if (msr == MSR_MTRRcap) {
@@ -410,311 +123,10 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
return 0;
}
- if (!msr_mtrr_valid(msr))
+ mtrr = find_mtrr(vcpu, msr);
+ if (!mtrr)
return 1;
- index = fixed_msr_to_range_index(msr);
- if (index >= 0) {
- *pdata = *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index];
- } else if (msr == MSR_MTRRdefType) {
- *pdata = vcpu->arch.mtrr_state.deftype;
- } else {
- /* Variable MTRRs */
- if (is_mtrr_base_msr(msr))
- *pdata = var_mtrr_msr_to_range(vcpu, msr)->base;
- else
- *pdata = var_mtrr_msr_to_range(vcpu, msr)->mask;
-
- *pdata &= ~kvm_vcpu_reserved_gpa_bits_raw(vcpu);
- }
-
+ *pdata = *mtrr;
return 0;
}
-
-void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu)
-{
- INIT_LIST_HEAD(&vcpu->arch.mtrr_state.head);
-}
-
-struct mtrr_iter {
- /* input fields. */
- struct kvm_mtrr *mtrr_state;
- u64 start;
- u64 end;
-
- /* output fields. */
- int mem_type;
- /* mtrr is completely disabled? */
- bool mtrr_disabled;
- /* [start, end) is not fully covered in MTRRs? */
- bool partial_map;
-
- /* private fields. */
- union {
- /* used for fixed MTRRs. */
- struct {
- int index;
- int seg;
- };
-
- /* used for var MTRRs. */
- struct {
- struct kvm_mtrr_range *range;
- /* max address has been covered in var MTRRs. */
- u64 start_max;
- };
- };
-
- bool fixed;
-};
-
-static bool mtrr_lookup_fixed_start(struct mtrr_iter *iter)
-{
- int seg, index;
-
- if (!fixed_mtrr_is_enabled(iter->mtrr_state))
- return false;
-
- seg = fixed_mtrr_addr_to_seg(iter->start);
- if (seg < 0)
- return false;
-
- iter->fixed = true;
- index = fixed_mtrr_addr_seg_to_range_index(iter->start, seg);
- iter->index = index;
- iter->seg = seg;
- return true;
-}
-
-static bool match_var_range(struct mtrr_iter *iter,
- struct kvm_mtrr_range *range)
-{
- u64 start, end;
-
- var_mtrr_range(range, &start, &end);
- if (!(start >= iter->end || end <= iter->start)) {
- iter->range = range;
-
- /*
- * the function is called when we do kvm_mtrr.head walking.
- * Range has the minimum base address which interleaves
- * [looker->start_max, looker->end).
- */
- iter->partial_map |= iter->start_max < start;
-
- /* update the max address has been covered. */
- iter->start_max = max(iter->start_max, end);
- return true;
- }
-
- return false;
-}
-
-static void __mtrr_lookup_var_next(struct mtrr_iter *iter)
-{
- struct kvm_mtrr *mtrr_state = iter->mtrr_state;
-
- list_for_each_entry_continue(iter->range, &mtrr_state->head, node)
- if (match_var_range(iter, iter->range))
- return;
-
- iter->range = NULL;
- iter->partial_map |= iter->start_max < iter->end;
-}
-
-static void mtrr_lookup_var_start(struct mtrr_iter *iter)
-{
- struct kvm_mtrr *mtrr_state = iter->mtrr_state;
-
- iter->fixed = false;
- iter->start_max = iter->start;
- iter->range = NULL;
- iter->range = list_prepare_entry(iter->range, &mtrr_state->head, node);
-
- __mtrr_lookup_var_next(iter);
-}
-
-static void mtrr_lookup_fixed_next(struct mtrr_iter *iter)
-{
- /* terminate the lookup. */
- if (fixed_mtrr_range_end_addr(iter->seg, iter->index) >= iter->end) {
- iter->fixed = false;
- iter->range = NULL;
- return;
- }
-
- iter->index++;
-
- /* have looked up for all fixed MTRRs. */
- if (iter->index >= ARRAY_SIZE(iter->mtrr_state->fixed_ranges))
- return mtrr_lookup_var_start(iter);
-
- /* switch to next segment. */
- if (iter->index > fixed_mtrr_seg_end_range_index(iter->seg))
- iter->seg++;
-}
-
-static void mtrr_lookup_var_next(struct mtrr_iter *iter)
-{
- __mtrr_lookup_var_next(iter);
-}
-
-static void mtrr_lookup_start(struct mtrr_iter *iter)
-{
- if (!mtrr_is_enabled(iter->mtrr_state)) {
- iter->mtrr_disabled = true;
- return;
- }
-
- if (!mtrr_lookup_fixed_start(iter))
- mtrr_lookup_var_start(iter);
-}
-
-static void mtrr_lookup_init(struct mtrr_iter *iter,
- struct kvm_mtrr *mtrr_state, u64 start, u64 end)
-{
- iter->mtrr_state = mtrr_state;
- iter->start = start;
- iter->end = end;
- iter->mtrr_disabled = false;
- iter->partial_map = false;
- iter->fixed = false;
- iter->range = NULL;
-
- mtrr_lookup_start(iter);
-}
-
-static bool mtrr_lookup_okay(struct mtrr_iter *iter)
-{
- if (iter->fixed) {
- iter->mem_type = iter->mtrr_state->fixed_ranges[iter->index];
- return true;
- }
-
- if (iter->range) {
- iter->mem_type = iter->range->base & 0xff;
- return true;
- }
-
- return false;
-}
-
-static void mtrr_lookup_next(struct mtrr_iter *iter)
-{
- if (iter->fixed)
- mtrr_lookup_fixed_next(iter);
- else
- mtrr_lookup_var_next(iter);
-}
-
-#define mtrr_for_each_mem_type(_iter_, _mtrr_, _gpa_start_, _gpa_end_) \
- for (mtrr_lookup_init(_iter_, _mtrr_, _gpa_start_, _gpa_end_); \
- mtrr_lookup_okay(_iter_); mtrr_lookup_next(_iter_))
-
-u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
- struct mtrr_iter iter;
- u64 start, end;
- int type = -1;
- const int wt_wb_mask = (1 << MTRR_TYPE_WRBACK)
- | (1 << MTRR_TYPE_WRTHROUGH);
-
- start = gfn_to_gpa(gfn);
- end = start + PAGE_SIZE;
-
- mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
- int curr_type = iter.mem_type;
-
- /*
- * Please refer to Intel SDM Volume 3: 11.11.4.1 MTRR
- * Precedences.
- */
-
- if (type == -1) {
- type = curr_type;
- continue;
- }
-
- /*
- * If two or more variable memory ranges match and the
- * memory types are identical, then that memory type is
- * used.
- */
- if (type == curr_type)
- continue;
-
- /*
- * If two or more variable memory ranges match and one of
- * the memory types is UC, the UC memory type used.
- */
- if (curr_type == MTRR_TYPE_UNCACHABLE)
- return MTRR_TYPE_UNCACHABLE;
-
- /*
- * If two or more variable memory ranges match and the
- * memory types are WT and WB, the WT memory type is used.
- */
- if (((1 << type) & wt_wb_mask) &&
- ((1 << curr_type) & wt_wb_mask)) {
- type = MTRR_TYPE_WRTHROUGH;
- continue;
- }
-
- /*
- * For overlaps not defined by the above rules, processor
- * behavior is undefined.
- */
-
- /* We use WB for this undefined behavior. :( */
- return MTRR_TYPE_WRBACK;
- }
-
- if (iter.mtrr_disabled)
- return mtrr_disabled_type(vcpu);
-
- /* not contained in any MTRRs. */
- if (type == -1)
- return mtrr_default_type(mtrr_state);
-
- /*
- * We just check one page, partially covered by MTRRs is
- * impossible.
- */
- WARN_ON(iter.partial_map);
-
- return type;
-}
-EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type);
-
-bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
- int page_num)
-{
- struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
- struct mtrr_iter iter;
- u64 start, end;
- int type = -1;
-
- start = gfn_to_gpa(gfn);
- end = gfn_to_gpa(gfn + page_num);
- mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
- if (type == -1) {
- type = iter.mem_type;
- continue;
- }
-
- if (type != iter.mem_type)
- return false;
- }
-
- if (iter.mtrr_disabled)
- return true;
-
- if (!iter.partial_map)
- return true;
-
- if (type == -1)
- return true;
-
- return type == mtrr_default_type(mtrr_state);
-}
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index a593b03c9aed..47a46283c866 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -34,16 +34,16 @@ EXPORT_SYMBOL_GPL(kvm_pmu_eventsel);
/* Precise Distribution of Instructions Retired (PDIR) */
static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = {
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, NULL),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, NULL),
/* Instruction-Accurate PDIR (PDIR++) */
- X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL),
{}
};
/* Precise Distribution (PDist) */
static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
- X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL),
{}
};
@@ -69,7 +69,7 @@ static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
* code. Each pmc, stored in kvm_pmc.idx field, is unique across
* all perf counters (both gp and fixed). The mapping relationship
* between pmc and perf counters is as the following:
- * * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters
+ * * Intel: [0 .. KVM_MAX_NR_INTEL_GP_COUNTERS-1] <=> gp counters
* [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed
* * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
* and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
@@ -194,7 +194,7 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
attr.sample_period = get_sample_period(pmc, pmc->counter);
if ((attr.config & HSW_IN_TX_CHECKPOINTED) &&
- guest_cpuid_is_intel(pmc->vcpu)) {
+ (boot_cpu_has(X86_FEATURE_RTM) || boot_cpu_has(X86_FEATURE_HLE))) {
/*
* HSW_IN_TX_CHECKPOINTED is not supported with nonzero
* period. Just clear the sample period so at least
@@ -469,11 +469,11 @@ static int reprogram_counter(struct kvm_pmc *pmc)
if (pmc_is_fixed(pmc)) {
fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
pmc->idx - KVM_FIXED_PMC_BASE_IDX);
- if (fixed_ctr_ctrl & 0x1)
+ if (fixed_ctr_ctrl & INTEL_FIXED_0_KERNEL)
eventsel |= ARCH_PERFMON_EVENTSEL_OS;
- if (fixed_ctr_ctrl & 0x2)
+ if (fixed_ctr_ctrl & INTEL_FIXED_0_USER)
eventsel |= ARCH_PERFMON_EVENTSEL_USR;
- if (fixed_ctr_ctrl & 0x8)
+ if (fixed_ctr_ctrl & INTEL_FIXED_0_ENABLE_PMI)
eventsel |= ARCH_PERFMON_EVENTSEL_INT;
new_config = (u64)fixed_ctr_ctrl;
}
@@ -521,9 +521,9 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
}
/*
- * Unused perf_events are only released if the corresponding MSRs
- * weren't accessed during the last vCPU time slice. kvm_arch_sched_in
- * triggers KVM_REQ_PMU if cleanup is needed.
+ * Release unused perf_events if the corresponding guest MSRs weren't
+ * accessed during the last vCPU time slice (need_cleanup is set when
+ * the vCPU is scheduled back in).
*/
if (unlikely(pmu->need_cleanup))
kvm_pmu_cleanup(vcpu);
@@ -542,7 +542,7 @@ int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx)
if (!kvm_pmu_ops.check_rdpmc_early)
return 0;
- return static_call(kvm_x86_pmu_check_rdpmc_early)(vcpu, idx);
+ return kvm_pmu_call(check_rdpmc_early)(vcpu, idx);
}
bool is_vmware_backdoor_pmc(u32 pmc_idx)
@@ -591,12 +591,12 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
if (is_vmware_backdoor_pmc(idx))
return kvm_pmu_rdpmc_vmware(vcpu, idx, data);
- pmc = static_call(kvm_x86_pmu_rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
+ pmc = kvm_pmu_call(rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
if (!pmc)
return 1;
if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) &&
- (static_call(kvm_x86_get_cpl)(vcpu) != 0) &&
+ (kvm_x86_call(get_cpl)(vcpu) != 0) &&
kvm_is_cr0_bit_set(vcpu, X86_CR0_PE))
return 1;
@@ -607,7 +607,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
if (lapic_in_kernel(vcpu)) {
- static_call_cond(kvm_x86_pmu_deliver_pmi)(vcpu);
+ kvm_pmu_call(deliver_pmi)(vcpu);
kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
}
}
@@ -622,14 +622,14 @@ bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
default:
break;
}
- return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) ||
- static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr);
+ return kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr) ||
+ kvm_pmu_call(is_valid_msr)(vcpu, msr);
}
static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- struct kvm_pmc *pmc = static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr);
+ struct kvm_pmc *pmc = kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr);
if (pmc)
__set_bit(pmc->idx, pmu->pmc_in_use);
@@ -654,7 +654,7 @@ int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = 0;
break;
default:
- return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info);
+ return kvm_pmu_call(get_msr)(vcpu, msr_info);
}
return 0;
@@ -681,13 +681,13 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!msr_info->host_initiated)
break;
- if (data & pmu->global_status_mask)
+ if (data & pmu->global_status_rsvd)
return 1;
pmu->global_status = data;
break;
case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
- data &= ~pmu->global_ctrl_mask;
+ data &= ~pmu->global_ctrl_rsvd;
fallthrough;
case MSR_CORE_PERF_GLOBAL_CTRL:
if (!kvm_valid_perf_global_ctrl(pmu, data))
@@ -704,7 +704,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
* GLOBAL_OVF_CTRL, a.k.a. GLOBAL STATUS_RESET, clears bits in
* GLOBAL_STATUS, and so the set of reserved bits is the same.
*/
- if (data & pmu->global_status_mask)
+ if (data & pmu->global_status_rsvd)
return 1;
fallthrough;
case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
@@ -713,7 +713,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
default:
kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
- return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info);
+ return kvm_pmu_call(set_msr)(vcpu, msr_info);
}
return 0;
@@ -740,7 +740,7 @@ static void kvm_pmu_reset(struct kvm_vcpu *vcpu)
pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;
- static_call_cond(kvm_x86_pmu_reset)(vcpu);
+ kvm_pmu_call(reset)(vcpu);
}
@@ -768,17 +768,17 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
pmu->reserved_bits = 0xffffffff00200000ull;
pmu->raw_event_mask = X86_RAW_EVENT_MASK;
- pmu->global_ctrl_mask = ~0ull;
- pmu->global_status_mask = ~0ull;
- pmu->fixed_ctr_ctrl_mask = ~0ull;
- pmu->pebs_enable_mask = ~0ull;
- pmu->pebs_data_cfg_mask = ~0ull;
+ pmu->global_ctrl_rsvd = ~0ull;
+ pmu->global_status_rsvd = ~0ull;
+ pmu->fixed_ctr_ctrl_rsvd = ~0ull;
+ pmu->pebs_enable_rsvd = ~0ull;
+ pmu->pebs_data_cfg_rsvd = ~0ull;
bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
if (!vcpu->kvm->arch.enable_pmu)
return;
- static_call(kvm_x86_pmu_refresh)(vcpu);
+ kvm_pmu_call(refresh)(vcpu);
/*
* At RESET, both Intel and AMD CPUs set all enable bits for general
@@ -796,7 +796,7 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
memset(pmu, 0, sizeof(*pmu));
- static_call(kvm_x86_pmu_init)(vcpu);
+ kvm_pmu_call(init)(vcpu);
kvm_pmu_refresh(vcpu);
}
@@ -818,7 +818,7 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
pmc_stop_counter(pmc);
}
- static_call_cond(kvm_x86_pmu_cleanup)(vcpu);
+ kvm_pmu_call(cleanup)(vcpu);
bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
}
@@ -846,8 +846,8 @@ static inline bool cpl_is_matched(struct kvm_pmc *pmc)
} else {
config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
pmc->idx - KVM_FIXED_PMC_BASE_IDX);
- select_os = config & 0x1;
- select_user = config & 0x2;
+ select_os = config & INTEL_FIXED_0_KERNEL;
+ select_user = config & INTEL_FIXED_0_USER;
}
/*
@@ -857,7 +857,8 @@ static inline bool cpl_is_matched(struct kvm_pmc *pmc)
if (select_os == select_user)
return select_os;
- return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user;
+ return (kvm_x86_call(get_cpl)(pmc->vcpu) == 0) ? select_os :
+ select_user;
}
void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel)
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 4d52b0b539ba..ad89d0bd6005 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -14,7 +14,8 @@
MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)
/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
-#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf)
+#define fixed_ctrl_field(ctrl_reg, idx) \
+ (((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK)
#define VMWARE_BACKDOOR_PMC_HOST_TSC 0x10000
#define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001
@@ -129,7 +130,7 @@ static inline bool pmc_is_fixed(struct kvm_pmc *pmc)
static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu,
u64 data)
{
- return !(pmu->global_ctrl_mask & data);
+ return !(pmu->global_ctrl_rsvd & data);
}
/* returns general purpose PMC with the specified MSR. Note that it can be
@@ -170,7 +171,8 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
if (pmc_is_fixed(pmc))
return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
- pmc->idx - KVM_FIXED_PMC_BASE_IDX) & 0x3;
+ pmc->idx - KVM_FIXED_PMC_BASE_IDX) &
+ (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER);
return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
}
@@ -217,7 +219,7 @@ static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops)
kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp,
pmu_ops->MAX_NR_GP_COUNTERS);
kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed,
- KVM_PMC_MAX_FIXED);
+ KVM_MAX_NR_FIXED_COUNTERS);
kvm_pmu_eventsel.INSTRUCTIONS_RETIRED =
perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS);
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
index d06d43d8d2aa..00e3c27d2a87 100644
--- a/arch/x86/kvm/smm.c
+++ b/arch/x86/kvm/smm.c
@@ -200,11 +200,11 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu,
enter_smm_save_seg_32(vcpu, &smram->tr, &smram->tr_sel, VCPU_SREG_TR);
enter_smm_save_seg_32(vcpu, &smram->ldtr, &smram->ldtr_sel, VCPU_SREG_LDTR);
- static_call(kvm_x86_get_gdt)(vcpu, &dt);
+ kvm_x86_call(get_gdt)(vcpu, &dt);
smram->gdtr.base = dt.address;
smram->gdtr.limit = dt.size;
- static_call(kvm_x86_get_idt)(vcpu, &dt);
+ kvm_x86_call(get_idt)(vcpu, &dt);
smram->idtr.base = dt.address;
smram->idtr.limit = dt.size;
@@ -220,7 +220,7 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu,
smram->smm_revision = 0x00020000;
smram->smbase = vcpu->arch.smbase;
- smram->int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+ smram->int_shadow = kvm_x86_call(get_interrupt_shadow)(vcpu);
}
#ifdef CONFIG_X86_64
@@ -250,13 +250,13 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu,
enter_smm_save_seg_64(vcpu, &smram->tr, VCPU_SREG_TR);
- static_call(kvm_x86_get_idt)(vcpu, &dt);
+ kvm_x86_call(get_idt)(vcpu, &dt);
smram->idtr.limit = dt.size;
smram->idtr.base = dt.address;
enter_smm_save_seg_64(vcpu, &smram->ldtr, VCPU_SREG_LDTR);
- static_call(kvm_x86_get_gdt)(vcpu, &dt);
+ kvm_x86_call(get_gdt)(vcpu, &dt);
smram->gdtr.limit = dt.size;
smram->gdtr.base = dt.address;
@@ -267,7 +267,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu,
enter_smm_save_seg_64(vcpu, &smram->fs, VCPU_SREG_FS);
enter_smm_save_seg_64(vcpu, &smram->gs, VCPU_SREG_GS);
- smram->int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+ smram->int_shadow = kvm_x86_call(get_interrupt_shadow)(vcpu);
}
#endif
@@ -297,7 +297,7 @@ void enter_smm(struct kvm_vcpu *vcpu)
* Kill the VM in the unlikely case of failure, because the VM
* can be in undefined state in this case.
*/
- if (static_call(kvm_x86_enter_smm)(vcpu, &smram))
+ if (kvm_x86_call(enter_smm)(vcpu, &smram))
goto error;
kvm_smm_changed(vcpu, true);
@@ -305,24 +305,24 @@ void enter_smm(struct kvm_vcpu *vcpu)
if (kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, &smram, sizeof(smram)))
goto error;
- if (static_call(kvm_x86_get_nmi_mask)(vcpu))
+ if (kvm_x86_call(get_nmi_mask)(vcpu))
vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
else
- static_call(kvm_x86_set_nmi_mask)(vcpu, true);
+ kvm_x86_call(set_nmi_mask)(vcpu, true);
kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
kvm_rip_write(vcpu, 0x8000);
- static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0);
+ kvm_x86_call(set_interrupt_shadow)(vcpu, 0);
cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
- static_call(kvm_x86_set_cr0)(vcpu, cr0);
+ kvm_x86_call(set_cr0)(vcpu, cr0);
- static_call(kvm_x86_set_cr4)(vcpu, 0);
+ kvm_x86_call(set_cr4)(vcpu, 0);
/* Undocumented: IDT limit is set to zero on entry to SMM. */
dt.address = dt.size = 0;
- static_call(kvm_x86_set_idt)(vcpu, &dt);
+ kvm_x86_call(set_idt)(vcpu, &dt);
if (WARN_ON_ONCE(kvm_set_dr(vcpu, 7, DR7_FIXED_1)))
goto error;
@@ -354,7 +354,7 @@ void enter_smm(struct kvm_vcpu *vcpu)
#ifdef CONFIG_X86_64
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
- if (static_call(kvm_x86_set_efer)(vcpu, 0))
+ if (kvm_x86_call(set_efer)(vcpu, 0))
goto error;
#endif
@@ -479,11 +479,11 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
dt.address = smstate->gdtr.base;
dt.size = smstate->gdtr.limit;
- static_call(kvm_x86_set_gdt)(vcpu, &dt);
+ kvm_x86_call(set_gdt)(vcpu, &dt);
dt.address = smstate->idtr.base;
dt.size = smstate->idtr.limit;
- static_call(kvm_x86_set_idt)(vcpu, &dt);
+ kvm_x86_call(set_idt)(vcpu, &dt);
rsm_load_seg_32(vcpu, &smstate->es, smstate->es_sel, VCPU_SREG_ES);
rsm_load_seg_32(vcpu, &smstate->cs, smstate->cs_sel, VCPU_SREG_CS);
@@ -501,7 +501,7 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
if (r != X86EMUL_CONTINUE)
return r;
- static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0);
+ kvm_x86_call(set_interrupt_shadow)(vcpu, 0);
ctxt->interruptibility = (u8)smstate->int_shadow;
return r;
@@ -535,13 +535,13 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
dt.size = smstate->idtr.limit;
dt.address = smstate->idtr.base;
- static_call(kvm_x86_set_idt)(vcpu, &dt);
+ kvm_x86_call(set_idt)(vcpu, &dt);
rsm_load_seg_64(vcpu, &smstate->ldtr, VCPU_SREG_LDTR);
dt.size = smstate->gdtr.limit;
dt.address = smstate->gdtr.base;
- static_call(kvm_x86_set_gdt)(vcpu, &dt);
+ kvm_x86_call(set_gdt)(vcpu, &dt);
r = rsm_enter_protected_mode(vcpu, smstate->cr0, smstate->cr3, smstate->cr4);
if (r != X86EMUL_CONTINUE)
@@ -554,7 +554,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
rsm_load_seg_64(vcpu, &smstate->fs, VCPU_SREG_FS);
rsm_load_seg_64(vcpu, &smstate->gs, VCPU_SREG_GS);
- static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0);
+ kvm_x86_call(set_interrupt_shadow)(vcpu, 0);
ctxt->interruptibility = (u8)smstate->int_shadow;
return X86EMUL_CONTINUE;
@@ -576,7 +576,7 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
return X86EMUL_UNHANDLEABLE;
if ((vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK) == 0)
- static_call(kvm_x86_set_nmi_mask)(vcpu, false);
+ kvm_x86_call(set_nmi_mask)(vcpu, false);
kvm_smm_changed(vcpu, false);
@@ -628,7 +628,7 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
* state (e.g. enter guest mode) before loading state from the SMM
* state-save area.
*/
- if (static_call(kvm_x86_leave_smm)(vcpu, &smram))
+ if (kvm_x86_call(leave_smm)(vcpu, &smram))
return X86EMUL_UNHANDLEABLE;
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 55b9a6d96bcf..6f704c1037e5 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -1181,7 +1181,7 @@ int svm_allocate_nested(struct vcpu_svm *svm)
if (svm->nested.initialized)
return 0;
- vmcb02_page = snp_safe_alloc_page(&svm->vcpu);
+ vmcb02_page = snp_safe_alloc_page();
if (!vmcb02_page)
return -ENOMEM;
svm->nested.vmcb02.ptr = page_address(vmcb02_page);
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index dfcc38bd97d3..22d5a65b410c 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -199,8 +199,8 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
kvm_pmu_cap.num_counters_gp);
if (pmu->version > 1) {
- pmu->global_ctrl_mask = ~((1ull << pmu->nr_arch_gp_counters) - 1);
- pmu->global_status_mask = pmu->global_ctrl_mask;
+ pmu->global_ctrl_rsvd = ~((1ull << pmu->nr_arch_gp_counters) - 1);
+ pmu->global_status_rsvd = pmu->global_ctrl_rsvd;
}
pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;
@@ -217,10 +217,9 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
int i;
- BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > AMD64_NUM_COUNTERS_CORE);
- BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > INTEL_PMC_MAX_GENERIC);
+ BUILD_BUG_ON(KVM_MAX_NR_AMD_GP_COUNTERS > AMD64_NUM_COUNTERS_CORE);
- for (i = 0; i < KVM_AMD_PMC_MAX_GENERIC ; i++) {
+ for (i = 0; i < KVM_MAX_NR_AMD_GP_COUNTERS; i++) {
pmu->gp_counters[i].type = KVM_PMC_GP;
pmu->gp_counters[i].vcpu = vcpu;
pmu->gp_counters[i].idx = i;
@@ -238,6 +237,6 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = {
.refresh = amd_pmu_refresh,
.init = amd_pmu_init,
.EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT,
- .MAX_NR_GP_COUNTERS = KVM_AMD_PMC_MAX_GENERIC,
+ .MAX_NR_GP_COUNTERS = KVM_MAX_NR_AMD_GP_COUNTERS,
.MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS,
};
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 95095a233a45..a16c873b3232 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -19,12 +19,14 @@
#include <linux/misc_cgroup.h>
#include <linux/processor.h>
#include <linux/trace_events.h>
+#include <uapi/linux/sev-guest.h>
#include <asm/pkru.h>
#include <asm/trapnr.h>
#include <asm/fpu/xcr.h>
#include <asm/fpu/xstate.h>
#include <asm/debugreg.h>
+#include <asm/sev.h>
#include "mmu.h"
#include "x86.h"
@@ -37,7 +39,7 @@
#define GHCB_VERSION_DEFAULT 2ULL
#define GHCB_VERSION_MIN 1ULL
-#define GHCB_HV_FT_SUPPORTED GHCB_HV_FT_SNP
+#define GHCB_HV_FT_SUPPORTED (GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION)
/* enable/disable SEV support */
static bool sev_enabled = true;
@@ -47,6 +49,10 @@ module_param_named(sev, sev_enabled, bool, 0444);
static bool sev_es_enabled = true;
module_param_named(sev_es, sev_es_enabled, bool, 0444);
+/* enable/disable SEV-SNP support */
+static bool sev_snp_enabled = true;
+module_param_named(sev_snp, sev_snp_enabled, bool, 0444);
+
/* enable/disable SEV-ES DebugSwap support */
static bool sev_es_debug_swap_enabled = true;
module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444);
@@ -56,6 +62,23 @@ static u64 sev_supported_vmsa_features;
#define AP_RESET_HOLD_NAE_EVENT 1
#define AP_RESET_HOLD_MSR_PROTO 2
+/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */
+#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0)
+#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8)
+#define SNP_POLICY_MASK_SMT BIT_ULL(16)
+#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17)
+#define SNP_POLICY_MASK_DEBUG BIT_ULL(19)
+#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20)
+
+#define SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \
+ SNP_POLICY_MASK_API_MAJOR | \
+ SNP_POLICY_MASK_SMT | \
+ SNP_POLICY_MASK_RSVD_MBO | \
+ SNP_POLICY_MASK_DEBUG | \
+ SNP_POLICY_MASK_SINGLE_SOCKET)
+
+#define INITIAL_VMSA_GPA 0xFFFFFFFFF000
+
static u8 sev_enc_bit;
static DECLARE_RWSEM(sev_deactivate_lock);
static DEFINE_MUTEX(sev_bitmap_lock);
@@ -66,6 +89,8 @@ static unsigned int nr_asids;
static unsigned long *sev_asid_bitmap;
static unsigned long *sev_reclaim_asid_bitmap;
+static int snp_decommission_context(struct kvm *kvm);
+
struct enc_region {
struct list_head list;
unsigned long npages;
@@ -92,12 +117,17 @@ static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid)
down_write(&sev_deactivate_lock);
wbinvd_on_all_cpus();
- ret = sev_guest_df_flush(&error);
+
+ if (sev_snp_enabled)
+ ret = sev_do_cmd(SEV_CMD_SNP_DF_FLUSH, NULL, &error);
+ else
+ ret = sev_guest_df_flush(&error);
up_write(&sev_deactivate_lock);
if (ret)
- pr_err("SEV: DF_FLUSH failed, ret=%d, error=%#x\n", ret, error);
+ pr_err("SEV%s: DF_FLUSH failed, ret=%d, error=%#x\n",
+ sev_snp_enabled ? "-SNP" : "", ret, error);
return ret;
}
@@ -233,6 +263,53 @@ static void sev_decommission(unsigned int handle)
sev_guest_decommission(&decommission, NULL);
}
+/*
+ * Transition a page to hypervisor-owned/shared state in the RMP table. This
+ * should not fail under normal conditions, but leak the page should that
+ * happen since it will no longer be usable by the host due to RMP protections.
+ */
+static int kvm_rmp_make_shared(struct kvm *kvm, u64 pfn, enum pg_level level)
+{
+ if (KVM_BUG_ON(rmp_make_shared(pfn, level), kvm)) {
+ snp_leak_pages(pfn, page_level_size(level) >> PAGE_SHIFT);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/*
+ * Certain page-states, such as Pre-Guest and Firmware pages (as documented
+ * in Chapter 5 of the SEV-SNP Firmware ABI under "Page States") cannot be
+ * directly transitioned back to normal/hypervisor-owned state via RMPUPDATE
+ * unless they are reclaimed first.
+ *
+ * Until they are reclaimed and subsequently transitioned via RMPUPDATE, they
+ * might not be usable by the host due to being set as immutable or still
+ * being associated with a guest ASID.
+ *
+ * Bug the VM and leak the page if reclaim fails, or if the RMP entry can't be
+ * converted back to shared, as the page is no longer usable due to RMP
+ * protections, and it's infeasible for the guest to continue on.
+ */
+static int snp_page_reclaim(struct kvm *kvm, u64 pfn)
+{
+ struct sev_data_snp_page_reclaim data = {0};
+ int fw_err, rc;
+
+ data.paddr = __sme_set(pfn << PAGE_SHIFT);
+ rc = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &fw_err);
+ if (KVM_BUG(rc, kvm, "Failed to reclaim PFN %llx, rc %d fw_err %d", pfn, rc, fw_err)) {
+ snp_leak_pages(pfn, 1);
+ return -EIO;
+ }
+
+ if (kvm_rmp_make_shared(kvm, pfn, PG_LEVEL_4K))
+ return -EIO;
+
+ return rc;
+}
+
static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
{
struct sev_data_deactivate deactivate;
@@ -250,6 +327,78 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
sev_decommission(handle);
}
+/*
+ * This sets up bounce buffers/firmware pages to handle SNP Guest Request
+ * messages (e.g. attestation requests). See "SNP Guest Request" in the GHCB
+ * 2.0 specification for more details.
+ *
+ * Technically, when an SNP Guest Request is issued, the guest will provide its
+ * own request/response pages, which could in theory be passed along directly
+ * to firmware rather than using bounce pages. However, these pages would need
+ * special care:
+ *
+ * - Both pages are from shared guest memory, so they need to be protected
+ * from migration/etc. occurring while firmware reads/writes to them. At a
+ * minimum, this requires elevating the ref counts and potentially needing
+ * an explicit pinning of the memory. This places additional restrictions
+ * on what type of memory backends userspace can use for shared guest
+ * memory since there is some reliance on using refcounted pages.
+ *
+ * - The response page needs to be switched to Firmware-owned[1] state
+ * before the firmware can write to it, which can lead to potential
+ * host RMP #PFs if the guest is misbehaved and hands the host a
+ * guest page that KVM might write to for other reasons (e.g. virtio
+ * buffers/etc.).
+ *
+ * Both of these issues can be avoided completely by using separately-allocated
+ * bounce pages for both the request/response pages and passing those to
+ * firmware instead. So that's what is being set up here.
+ *
+ * Guest requests rely on message sequence numbers to ensure requests are
+ * issued to firmware in the order the guest issues them, so concurrent guest
+ * requests generally shouldn't happen. But a misbehaved guest could issue
+ * concurrent guest requests in theory, so a mutex is used to serialize
+ * access to the bounce buffers.
+ *
+ * [1] See the "Page States" section of the SEV-SNP Firmware ABI for more
+ * details on Firmware-owned pages, along with "RMP and VMPL Access Checks"
+ * in the APM for details on the related RMP restrictions.
+ */
+static int snp_guest_req_init(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct page *req_page;
+
+ req_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!req_page)
+ return -ENOMEM;
+
+ sev->guest_resp_buf = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!sev->guest_resp_buf) {
+ __free_page(req_page);
+ return -EIO;
+ }
+
+ sev->guest_req_buf = page_address(req_page);
+ mutex_init(&sev->guest_req_mutex);
+
+ return 0;
+}
+
+static void snp_guest_req_cleanup(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+
+ if (sev->guest_resp_buf)
+ snp_free_firmware_page(sev->guest_resp_buf);
+
+ if (sev->guest_req_buf)
+ __free_page(virt_to_page(sev->guest_req_buf));
+
+ sev->guest_req_buf = NULL;
+ sev->guest_resp_buf = NULL;
+}
+
static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
struct kvm_sev_init *data,
unsigned long vm_type)
@@ -288,6 +437,9 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
if (sev->es_active && !sev->ghcb_version)
sev->ghcb_version = GHCB_VERSION_DEFAULT;
+ if (vm_type == KVM_X86_SNP_VM)
+ sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE;
+
ret = sev_asid_new(sev);
if (ret)
goto e_no_asid;
@@ -297,6 +449,10 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
if (ret)
goto e_free;
+ /* This needs to happen after SEV/SNP firmware initialization. */
+ if (vm_type == KVM_X86_SNP_VM && snp_guest_req_init(kvm))
+ goto e_free;
+
INIT_LIST_HEAD(&sev->regions_list);
INIT_LIST_HEAD(&sev->mirror_vms);
sev->need_init = false;
@@ -348,7 +504,8 @@ static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp)
return -EINVAL;
if (kvm->arch.vm_type != KVM_X86_SEV_VM &&
- kvm->arch.vm_type != KVM_X86_SEV_ES_VM)
+ kvm->arch.vm_type != KVM_X86_SEV_ES_VM &&
+ kvm->arch.vm_type != KVM_X86_SNP_VM)
return -EINVAL;
if (copy_from_user(&data, u64_to_user_ptr(argp->data), sizeof(data)))
@@ -1999,6 +2156,410 @@ int sev_dev_get_attr(u32 group, u64 attr, u64 *val)
}
}
+/*
+ * The guest context contains all the information, keys and metadata
+ * associated with the guest that the firmware tracks to implement SEV
+ * and SNP features. The firmware stores the guest context in hypervisor
+ * provide page via the SNP_GCTX_CREATE command.
+ */
+static void *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct sev_data_snp_addr data = {};
+ void *context;
+ int rc;
+
+ /* Allocate memory for context page */
+ context = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT);
+ if (!context)
+ return NULL;
+
+ data.address = __psp_pa(context);
+ rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_GCTX_CREATE, &data, &argp->error);
+ if (rc) {
+ pr_warn("Failed to create SEV-SNP context, rc %d fw_error %d",
+ rc, argp->error);
+ snp_free_firmware_page(context);
+ return NULL;
+ }
+
+ return context;
+}
+
+static int snp_bind_asid(struct kvm *kvm, int *error)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_snp_activate data = {0};
+
+ data.gctx_paddr = __psp_pa(sev->snp_context);
+ data.asid = sev_get_asid(kvm);
+ return sev_issue_cmd(kvm, SEV_CMD_SNP_ACTIVATE, &data, error);
+}
+
+static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_snp_launch_start start = {0};
+ struct kvm_sev_snp_launch_start params;
+ int rc;
+
+ if (!sev_snp_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
+ return -EFAULT;
+
+ /* Don't allow userspace to allocate memory for more than 1 SNP context. */
+ if (sev->snp_context)
+ return -EINVAL;
+
+ sev->snp_context = snp_context_create(kvm, argp);
+ if (!sev->snp_context)
+ return -ENOTTY;
+
+ if (params.flags)
+ return -EINVAL;
+
+ if (params.policy & ~SNP_POLICY_MASK_VALID)
+ return -EINVAL;
+
+ /* Check for policy bits that must be set */
+ if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO) ||
+ !(params.policy & SNP_POLICY_MASK_SMT))
+ return -EINVAL;
+
+ if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET)
+ return -EINVAL;
+
+ start.gctx_paddr = __psp_pa(sev->snp_context);
+ start.policy = params.policy;
+ memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw));
+ rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_START, &start, &argp->error);
+ if (rc) {
+ pr_debug("%s: SEV_CMD_SNP_LAUNCH_START firmware command failed, rc %d\n",
+ __func__, rc);
+ goto e_free_context;
+ }
+
+ sev->fd = argp->sev_fd;
+ rc = snp_bind_asid(kvm, &argp->error);
+ if (rc) {
+ pr_debug("%s: Failed to bind ASID to SEV-SNP context, rc %d\n",
+ __func__, rc);
+ goto e_free_context;
+ }
+
+ return 0;
+
+e_free_context:
+ snp_decommission_context(kvm);
+
+ return rc;
+}
+
+struct sev_gmem_populate_args {
+ __u8 type;
+ int sev_fd;
+ int fw_error;
+};
+
+static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pfn,
+ void __user *src, int order, void *opaque)
+{
+ struct sev_gmem_populate_args *sev_populate_args = opaque;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ int n_private = 0, ret, i;
+ int npages = (1 << order);
+ gfn_t gfn;
+
+ if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src))
+ return -EINVAL;
+
+ for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) {
+ struct sev_data_snp_launch_update fw_args = {0};
+ bool assigned;
+ int level;
+
+ if (!kvm_mem_is_private(kvm, gfn)) {
+ pr_debug("%s: Failed to ensure GFN 0x%llx has private memory attribute set\n",
+ __func__, gfn);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level);
+ if (ret || assigned) {
+ pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n",
+ __func__, gfn, ret, assigned);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (src) {
+ void *vaddr = kmap_local_pfn(pfn + i);
+
+ ret = copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE);
+ if (ret)
+ goto err;
+ kunmap_local(vaddr);
+ }
+
+ ret = rmp_make_private(pfn + i, gfn << PAGE_SHIFT, PG_LEVEL_4K,
+ sev_get_asid(kvm), true);
+ if (ret)
+ goto err;
+
+ n_private++;
+
+ fw_args.gctx_paddr = __psp_pa(sev->snp_context);
+ fw_args.address = __sme_set(pfn_to_hpa(pfn + i));
+ fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K);
+ fw_args.page_type = sev_populate_args->type;
+
+ ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
+ &fw_args, &sev_populate_args->fw_error);
+ if (ret)
+ goto fw_err;
+ }
+
+ return 0;
+
+fw_err:
+ /*
+ * If the firmware command failed handle the reclaim and cleanup of that
+ * PFN specially vs. prior pages which can be cleaned up below without
+ * needing to reclaim in advance.
+ *
+ * Additionally, when invalid CPUID function entries are detected,
+ * firmware writes the expected values into the page and leaves it
+ * unencrypted so it can be used for debugging and error-reporting.
+ *
+ * Copy this page back into the source buffer so userspace can use this
+ * information to provide information on which CPUID leaves/fields
+ * failed CPUID validation.
+ */
+ if (!snp_page_reclaim(kvm, pfn + i) &&
+ sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID &&
+ sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) {
+ void *vaddr = kmap_local_pfn(pfn + i);
+
+ if (copy_to_user(src + i * PAGE_SIZE, vaddr, PAGE_SIZE))
+ pr_debug("Failed to write CPUID page back to userspace\n");
+
+ kunmap_local(vaddr);
+ }
+
+ /* pfn + i is hypervisor-owned now, so skip below cleanup for it. */
+ n_private--;
+
+err:
+ pr_debug("%s: exiting with error ret %d (fw_error %d), restoring %d gmem PFNs to shared.\n",
+ __func__, ret, sev_populate_args->fw_error, n_private);
+ for (i = 0; i < n_private; i++)
+ kvm_rmp_make_shared(kvm, pfn + i, PG_LEVEL_4K);
+
+ return ret;
+}
+
+static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_gmem_populate_args sev_populate_args = {0};
+ struct kvm_sev_snp_launch_update params;
+ struct kvm_memory_slot *memslot;
+ long npages, count;
+ void __user *src;
+ int ret = 0;
+
+ if (!sev_snp_guest(kvm) || !sev->snp_context)
+ return -EINVAL;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
+ return -EFAULT;
+
+ pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d\n", __func__,
+ params.gfn_start, params.len, params.type, params.flags);
+
+ if (!PAGE_ALIGNED(params.len) || params.flags ||
+ (params.type != KVM_SEV_SNP_PAGE_TYPE_NORMAL &&
+ params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO &&
+ params.type != KVM_SEV_SNP_PAGE_TYPE_UNMEASURED &&
+ params.type != KVM_SEV_SNP_PAGE_TYPE_SECRETS &&
+ params.type != KVM_SEV_SNP_PAGE_TYPE_CPUID))
+ return -EINVAL;
+
+ npages = params.len / PAGE_SIZE;
+
+ /*
+ * For each GFN that's being prepared as part of the initial guest
+ * state, the following pre-conditions are verified:
+ *
+ * 1) The backing memslot is a valid private memslot.
+ * 2) The GFN has been set to private via KVM_SET_MEMORY_ATTRIBUTES
+ * beforehand.
+ * 3) The PFN of the guest_memfd has not already been set to private
+ * in the RMP table.
+ *
+ * The KVM MMU relies on kvm->mmu_invalidate_seq to retry nested page
+ * faults if there's a race between a fault and an attribute update via
+ * KVM_SET_MEMORY_ATTRIBUTES, and a similar approach could be utilized
+ * here. However, kvm->slots_lock guards against both this as well as
+ * concurrent memslot updates occurring while these checks are being
+ * performed, so use that here to make it easier to reason about the
+ * initial expected state and better guard against unexpected
+ * situations.
+ */
+ mutex_lock(&kvm->slots_lock);
+
+ memslot = gfn_to_memslot(kvm, params.gfn_start);
+ if (!kvm_slot_can_be_private(memslot)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ sev_populate_args.sev_fd = argp->sev_fd;
+ sev_populate_args.type = params.type;
+ src = params.type == KVM_SEV_SNP_PAGE_TYPE_ZERO ? NULL : u64_to_user_ptr(params.uaddr);
+
+ count = kvm_gmem_populate(kvm, params.gfn_start, src, npages,
+ sev_gmem_post_populate, &sev_populate_args);
+ if (count < 0) {
+ argp->error = sev_populate_args.fw_error;
+ pr_debug("%s: kvm_gmem_populate failed, ret %ld (fw_error %d)\n",
+ __func__, count, argp->error);
+ ret = -EIO;
+ } else {
+ params.gfn_start += count;
+ params.len -= count * PAGE_SIZE;
+ if (params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO)
+ params.uaddr += count * PAGE_SIZE;
+
+ ret = 0;
+ if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params)))
+ ret = -EFAULT;
+ }
+
+out:
+ mutex_unlock(&kvm->slots_lock);
+
+ return ret;
+}
+
+static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_snp_launch_update data = {};
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ int ret;
+
+ data.gctx_paddr = __psp_pa(sev->snp_context);
+ data.page_type = SNP_PAGE_TYPE_VMSA;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;
+
+ ret = sev_es_sync_vmsa(svm);
+ if (ret)
+ return ret;
+
+ /* Transition the VMSA page to a firmware state. */
+ ret = rmp_make_private(pfn, INITIAL_VMSA_GPA, PG_LEVEL_4K, sev->asid, true);
+ if (ret)
+ return ret;
+
+ /* Issue the SNP command to encrypt the VMSA */
+ data.address = __sme_pa(svm->sev_es.vmsa);
+ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
+ &data, &argp->error);
+ if (ret) {
+ snp_page_reclaim(kvm, pfn);
+
+ return ret;
+ }
+
+ svm->vcpu.arch.guest_state_protected = true;
+ /*
+ * SEV-ES (and thus SNP) guest mandates LBR Virtualization to
+ * be _always_ ON. Enable it only after setting
+ * guest_state_protected because KVM_SET_MSRS allows dynamic
+ * toggling of LBRV (for performance reason) on write access to
+ * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set.
+ */
+ svm_enable_lbrv(vcpu);
+ }
+
+ return 0;
+}
+
+static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_snp_launch_finish params;
+ struct sev_data_snp_launch_finish *data;
+ void *id_block = NULL, *id_auth = NULL;
+ int ret;
+
+ if (!sev_snp_guest(kvm))
+ return -ENOTTY;
+
+ if (!sev->snp_context)
+ return -EINVAL;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
+ return -EFAULT;
+
+ if (params.flags)
+ return -EINVAL;
+
+ /* Measure all vCPUs using LAUNCH_UPDATE before finalizing the launch flow. */
+ ret = snp_launch_update_vmsa(kvm, argp);
+ if (ret)
+ return ret;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+ if (!data)
+ return -ENOMEM;
+
+ if (params.id_block_en) {
+ id_block = psp_copy_user_blob(params.id_block_uaddr, KVM_SEV_SNP_ID_BLOCK_SIZE);
+ if (IS_ERR(id_block)) {
+ ret = PTR_ERR(id_block);
+ goto e_free;
+ }
+
+ data->id_block_en = 1;
+ data->id_block_paddr = __sme_pa(id_block);
+
+ id_auth = psp_copy_user_blob(params.id_auth_uaddr, KVM_SEV_SNP_ID_AUTH_SIZE);
+ if (IS_ERR(id_auth)) {
+ ret = PTR_ERR(id_auth);
+ goto e_free_id_block;
+ }
+
+ data->id_auth_paddr = __sme_pa(id_auth);
+
+ if (params.auth_key_en)
+ data->auth_key_en = 1;
+ }
+
+ data->vcek_disabled = params.vcek_disabled;
+
+ memcpy(data->host_data, params.host_data, KVM_SEV_SNP_FINISH_DATA_SIZE);
+ data->gctx_paddr = __psp_pa(sev->snp_context);
+ ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error);
+
+ kfree(id_auth);
+
+e_free_id_block:
+ kfree(id_block);
+
+e_free:
+ kfree(data);
+
+ return ret;
+}
+
int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
{
struct kvm_sev_cmd sev_cmd;
@@ -2022,6 +2583,15 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
goto out;
}
+ /*
+ * Once KVM_SEV_INIT2 initializes a KVM instance as an SNP guest, only
+ * allow the use of SNP-specific commands.
+ */
+ if (sev_snp_guest(kvm) && sev_cmd.id < KVM_SEV_SNP_LAUNCH_START) {
+ r = -EPERM;
+ goto out;
+ }
+
switch (sev_cmd.id) {
case KVM_SEV_ES_INIT:
if (!sev_es_enabled) {
@@ -2086,6 +2656,15 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
case KVM_SEV_RECEIVE_FINISH:
r = sev_receive_finish(kvm, &sev_cmd);
break;
+ case KVM_SEV_SNP_LAUNCH_START:
+ r = snp_launch_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SNP_LAUNCH_UPDATE:
+ r = snp_launch_update(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SNP_LAUNCH_FINISH:
+ r = snp_launch_finish(kvm, &sev_cmd);
+ break;
default:
r = -EINVAL;
goto out;
@@ -2281,6 +2860,31 @@ e_source_fput:
return ret;
}
+static int snp_decommission_context(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_snp_addr data = {};
+ int ret;
+
+ /* If context is not created then do nothing */
+ if (!sev->snp_context)
+ return 0;
+
+ /* Do the decommision, which will unbind the ASID from the SNP context */
+ data.address = __sme_pa(sev->snp_context);
+ down_write(&sev_deactivate_lock);
+ ret = sev_do_cmd(SEV_CMD_SNP_DECOMMISSION, &data, NULL);
+ up_write(&sev_deactivate_lock);
+
+ if (WARN_ONCE(ret, "Failed to release guest context, ret %d", ret))
+ return ret;
+
+ snp_free_firmware_page(sev->snp_context);
+ sev->snp_context = NULL;
+
+ return 0;
+}
+
void sev_vm_destroy(struct kvm *kvm)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
@@ -2322,7 +2926,19 @@ void sev_vm_destroy(struct kvm *kvm)
}
}
- sev_unbind_asid(kvm, sev->handle);
+ if (sev_snp_guest(kvm)) {
+ snp_guest_req_cleanup(kvm);
+
+ /*
+ * Decomission handles unbinding of the ASID. If it fails for
+ * some unexpected reason, just leak the ASID.
+ */
+ if (snp_decommission_context(kvm))
+ return;
+ } else {
+ sev_unbind_asid(kvm, sev->handle);
+ }
+
sev_asid_free(sev);
}
@@ -2336,11 +2952,16 @@ void __init sev_set_cpu_caps(void)
kvm_cpu_cap_set(X86_FEATURE_SEV_ES);
kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_ES_VM);
}
+ if (sev_snp_enabled) {
+ kvm_cpu_cap_set(X86_FEATURE_SEV_SNP);
+ kvm_caps.supported_vm_types |= BIT(KVM_X86_SNP_VM);
+ }
}
void __init sev_hardware_setup(void)
{
unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
+ bool sev_snp_supported = false;
bool sev_es_supported = false;
bool sev_supported = false;
@@ -2427,6 +3048,7 @@ void __init sev_hardware_setup(void)
sev_es_asid_count = min_sev_asid - 1;
WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count));
sev_es_supported = true;
+ sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP);
out:
if (boot_cpu_has(X86_FEATURE_SEV))
@@ -2439,9 +3061,15 @@ out:
pr_info("SEV-ES %s (ASIDs %u - %u)\n",
sev_es_supported ? "enabled" : "disabled",
min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1);
+ if (boot_cpu_has(X86_FEATURE_SEV_SNP))
+ pr_info("SEV-SNP %s (ASIDs %u - %u)\n",
+ sev_snp_supported ? "enabled" : "disabled",
+ min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1);
sev_enabled = sev_supported;
sev_es_enabled = sev_es_supported;
+ sev_snp_enabled = sev_snp_supported;
+
if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) ||
!cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP))
sev_es_debug_swap_enabled = false;
@@ -2520,7 +3148,13 @@ do_wbinvd:
void sev_guest_memory_reclaimed(struct kvm *kvm)
{
- if (!sev_guest(kvm))
+ /*
+ * With SNP+gmem, private/encrypted memory is unreachable via the
+ * hva-based mmu notifiers, so these events are only actually
+ * pertaining to shared pages where there is no need to perform
+ * the WBINVD to flush associated caches.
+ */
+ if (!sev_guest(kvm) || sev_snp_guest(kvm))
return;
wbinvd_on_all_cpus();
@@ -2535,11 +3169,24 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu)
svm = to_svm(vcpu);
+ /*
+ * If it's an SNP guest, then the VMSA was marked in the RMP table as
+ * a guest-owned page. Transition the page to hypervisor state before
+ * releasing it back to the system.
+ */
+ if (sev_snp_guest(vcpu->kvm)) {
+ u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;
+
+ if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K))
+ goto skip_vmsa_free;
+ }
+
if (vcpu->arch.guest_state_protected)
sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa);
__free_page(virt_to_page(svm->sev_es.vmsa));
+skip_vmsa_free:
if (svm->sev_es.ghcb_sa_free)
kvfree(svm->sev_es.ghcb_sa);
}
@@ -2735,6 +3382,13 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
if (!kvm_ghcb_sw_scratch_is_valid(svm))
goto vmgexit_err;
break;
+ case SVM_VMGEXIT_AP_CREATION:
+ if (!sev_snp_guest(vcpu->kvm))
+ goto vmgexit_err;
+ if (lower_32_bits(control->exit_info_1) != SVM_VMGEXIT_AP_DESTROY)
+ if (!kvm_ghcb_rax_is_valid(svm))
+ goto vmgexit_err;
+ break;
case SVM_VMGEXIT_NMI_COMPLETE:
case SVM_VMGEXIT_AP_HLT_LOOP:
case SVM_VMGEXIT_AP_JUMP_TABLE:
@@ -2742,6 +3396,18 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
case SVM_VMGEXIT_HV_FEATURES:
case SVM_VMGEXIT_TERM_REQUEST:
break;
+ case SVM_VMGEXIT_PSC:
+ if (!sev_snp_guest(vcpu->kvm) || !kvm_ghcb_sw_scratch_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_VMGEXIT_GUEST_REQUEST:
+ case SVM_VMGEXIT_EXT_GUEST_REQUEST:
+ if (!sev_snp_guest(vcpu->kvm) ||
+ !PAGE_ALIGNED(control->exit_info_1) ||
+ !PAGE_ALIGNED(control->exit_info_2) ||
+ control->exit_info_1 == control->exit_info_2)
+ goto vmgexit_err;
+ break;
default:
reason = GHCB_ERR_INVALID_EVENT;
goto vmgexit_err;
@@ -2929,6 +3595,534 @@ static void set_ghcb_msr(struct vcpu_svm *svm, u64 value)
svm->vmcb->control.ghcb_gpa = value;
}
+static int snp_rmptable_psmash(kvm_pfn_t pfn)
+{
+ int ret;
+
+ pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1);
+
+ /*
+ * PSMASH_FAIL_INUSE indicates another processor is modifying the
+ * entry, so retry until that's no longer the case.
+ */
+ do {
+ ret = psmash(pfn);
+ } while (ret == PSMASH_FAIL_INUSE);
+
+ return ret;
+}
+
+static int snp_complete_psc_msr(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (vcpu->run->hypercall.ret)
+ set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
+ else
+ set_ghcb_msr(svm, GHCB_MSR_PSC_RESP);
+
+ return 1; /* resume guest */
+}
+
+static int snp_begin_psc_msr(struct vcpu_svm *svm, u64 ghcb_msr)
+{
+ u64 gpa = gfn_to_gpa(GHCB_MSR_PSC_REQ_TO_GFN(ghcb_msr));
+ u8 op = GHCB_MSR_PSC_REQ_TO_OP(ghcb_msr);
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ if (op != SNP_PAGE_STATE_PRIVATE && op != SNP_PAGE_STATE_SHARED) {
+ set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
+ return 1; /* resume guest */
+ }
+
+ if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) {
+ set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
+ return 1; /* resume guest */
+ }
+
+ vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
+ vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
+ vcpu->run->hypercall.args[0] = gpa;
+ vcpu->run->hypercall.args[1] = 1;
+ vcpu->run->hypercall.args[2] = (op == SNP_PAGE_STATE_PRIVATE)
+ ? KVM_MAP_GPA_RANGE_ENCRYPTED
+ : KVM_MAP_GPA_RANGE_DECRYPTED;
+ vcpu->run->hypercall.args[2] |= KVM_MAP_GPA_RANGE_PAGE_SZ_4K;
+
+ vcpu->arch.complete_userspace_io = snp_complete_psc_msr;
+
+ return 0; /* forward request to userspace */
+}
+
+struct psc_buffer {
+ struct psc_hdr hdr;
+ struct psc_entry entries[];
+} __packed;
+
+static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc);
+
+static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret)
+{
+ svm->sev_es.psc_inflight = 0;
+ svm->sev_es.psc_idx = 0;
+ svm->sev_es.psc_2m = false;
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, psc_ret);
+}
+
+static void __snp_complete_one_psc(struct vcpu_svm *svm)
+{
+ struct psc_buffer *psc = svm->sev_es.ghcb_sa;
+ struct psc_entry *entries = psc->entries;
+ struct psc_hdr *hdr = &psc->hdr;
+ __u16 idx;
+
+ /*
+ * Everything in-flight has been processed successfully. Update the
+ * corresponding entries in the guest's PSC buffer and zero out the
+ * count of in-flight PSC entries.
+ */
+ for (idx = svm->sev_es.psc_idx; svm->sev_es.psc_inflight;
+ svm->sev_es.psc_inflight--, idx++) {
+ struct psc_entry *entry = &entries[idx];
+
+ entry->cur_page = entry->pagesize ? 512 : 1;
+ }
+
+ hdr->cur_entry = idx;
+}
+
+static int snp_complete_one_psc(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct psc_buffer *psc = svm->sev_es.ghcb_sa;
+
+ if (vcpu->run->hypercall.ret) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
+ return 1; /* resume guest */
+ }
+
+ __snp_complete_one_psc(svm);
+
+ /* Handle the next range (if any). */
+ return snp_begin_psc(svm, psc);
+}
+
+static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc)
+{
+ struct psc_entry *entries = psc->entries;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct psc_hdr *hdr = &psc->hdr;
+ struct psc_entry entry_start;
+ u16 idx, idx_start, idx_end;
+ int npages;
+ bool huge;
+ u64 gfn;
+
+ if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
+ return 1;
+ }
+
+next_range:
+ /* There should be no other PSCs in-flight at this point. */
+ if (WARN_ON_ONCE(svm->sev_es.psc_inflight)) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
+ return 1;
+ }
+
+ /*
+ * The PSC descriptor buffer can be modified by a misbehaved guest after
+ * validation, so take care to only use validated copies of values used
+ * for things like array indexing.
+ */
+ idx_start = hdr->cur_entry;
+ idx_end = hdr->end_entry;
+
+ if (idx_end >= VMGEXIT_PSC_MAX_COUNT) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR);
+ return 1;
+ }
+
+ /* Find the start of the next range which needs processing. */
+ for (idx = idx_start; idx <= idx_end; idx++, hdr->cur_entry++) {
+ entry_start = entries[idx];
+
+ gfn = entry_start.gfn;
+ huge = entry_start.pagesize;
+ npages = huge ? 512 : 1;
+
+ if (entry_start.cur_page > npages || !IS_ALIGNED(gfn, npages)) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_ENTRY);
+ return 1;
+ }
+
+ if (entry_start.cur_page) {
+ /*
+ * If this is a partially-completed 2M range, force 4K handling
+ * for the remaining pages since they're effectively split at
+ * this point. Subsequent code should ensure this doesn't get
+ * combined with adjacent PSC entries where 2M handling is still
+ * possible.
+ */
+ npages -= entry_start.cur_page;
+ gfn += entry_start.cur_page;
+ huge = false;
+ }
+
+ if (npages)
+ break;
+ }
+
+ if (idx > idx_end) {
+ /* Nothing more to process. */
+ snp_complete_psc(svm, 0);
+ return 1;
+ }
+
+ svm->sev_es.psc_2m = huge;
+ svm->sev_es.psc_idx = idx;
+ svm->sev_es.psc_inflight = 1;
+
+ /*
+ * Find all subsequent PSC entries that contain adjacent GPA
+ * ranges/operations and can be combined into a single
+ * KVM_HC_MAP_GPA_RANGE exit.
+ */
+ while (++idx <= idx_end) {
+ struct psc_entry entry = entries[idx];
+
+ if (entry.operation != entry_start.operation ||
+ entry.gfn != entry_start.gfn + npages ||
+ entry.cur_page || !!entry.pagesize != huge)
+ break;
+
+ svm->sev_es.psc_inflight++;
+ npages += huge ? 512 : 1;
+ }
+
+ switch (entry_start.operation) {
+ case VMGEXIT_PSC_OP_PRIVATE:
+ case VMGEXIT_PSC_OP_SHARED:
+ vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
+ vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
+ vcpu->run->hypercall.args[0] = gfn_to_gpa(gfn);
+ vcpu->run->hypercall.args[1] = npages;
+ vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE
+ ? KVM_MAP_GPA_RANGE_ENCRYPTED
+ : KVM_MAP_GPA_RANGE_DECRYPTED;
+ vcpu->run->hypercall.args[2] |= entry_start.pagesize
+ ? KVM_MAP_GPA_RANGE_PAGE_SZ_2M
+ : KVM_MAP_GPA_RANGE_PAGE_SZ_4K;
+ vcpu->arch.complete_userspace_io = snp_complete_one_psc;
+ return 0; /* forward request to userspace */
+ default:
+ /*
+ * Only shared/private PSC operations are currently supported, so if the
+ * entire range consists of unsupported operations (e.g. SMASH/UNSMASH),
+ * then consider the entire range completed and avoid exiting to
+ * userspace. In theory snp_complete_psc() can always be called directly
+ * at this point to complete the current range and start the next one,
+ * but that could lead to unexpected levels of recursion.
+ */
+ __snp_complete_one_psc(svm);
+ goto next_range;
+ }
+
+ unreachable();
+}
+
+static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ WARN_ON(!mutex_is_locked(&svm->sev_es.snp_vmsa_mutex));
+
+ /* Mark the vCPU as offline and not runnable */
+ vcpu->arch.pv.pv_unhalted = false;
+ vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
+
+ /* Clear use of the VMSA */
+ svm->vmcb->control.vmsa_pa = INVALID_PAGE;
+
+ if (VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) {
+ gfn_t gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa);
+ struct kvm_memory_slot *slot;
+ kvm_pfn_t pfn;
+
+ slot = gfn_to_memslot(vcpu->kvm, gfn);
+ if (!slot)
+ return -EINVAL;
+
+ /*
+ * The new VMSA will be private memory guest memory, so
+ * retrieve the PFN from the gmem backend.
+ */
+ if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, NULL))
+ return -EINVAL;
+
+ /*
+ * From this point forward, the VMSA will always be a
+ * guest-mapped page rather than the initial one allocated
+ * by KVM in svm->sev_es.vmsa. In theory, svm->sev_es.vmsa
+ * could be free'd and cleaned up here, but that involves
+ * cleanups like wbinvd_on_all_cpus() which would ideally
+ * be handled during teardown rather than guest boot.
+ * Deferring that also allows the existing logic for SEV-ES
+ * VMSAs to be re-used with minimal SNP-specific changes.
+ */
+ svm->sev_es.snp_has_guest_vmsa = true;
+
+ /* Use the new VMSA */
+ svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn);
+
+ /* Mark the vCPU as runnable */
+ vcpu->arch.pv.pv_unhalted = false;
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+ svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
+
+ /*
+ * gmem pages aren't currently migratable, but if this ever
+ * changes then care should be taken to ensure
+ * svm->sev_es.vmsa is pinned through some other means.
+ */
+ kvm_release_pfn_clean(pfn);
+ }
+
+ /*
+ * When replacing the VMSA during SEV-SNP AP creation,
+ * mark the VMCB dirty so that full state is always reloaded.
+ */
+ vmcb_mark_all_dirty(svm->vmcb);
+
+ return 0;
+}
+
+/*
+ * Invoked as part of svm_vcpu_reset() processing of an init event.
+ */
+void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+
+ if (!sev_snp_guest(vcpu->kvm))
+ return;
+
+ mutex_lock(&svm->sev_es.snp_vmsa_mutex);
+
+ if (!svm->sev_es.snp_ap_waiting_for_reset)
+ goto unlock;
+
+ svm->sev_es.snp_ap_waiting_for_reset = false;
+
+ ret = __sev_snp_update_protected_guest_state(vcpu);
+ if (ret)
+ vcpu_unimpl(vcpu, "snp: AP state update on init failed\n");
+
+unlock:
+ mutex_unlock(&svm->sev_es.snp_vmsa_mutex);
+}
+
+static int sev_snp_ap_creation(struct vcpu_svm *svm)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_vcpu *target_vcpu;
+ struct vcpu_svm *target_svm;
+ unsigned int request;
+ unsigned int apic_id;
+ bool kick;
+ int ret;
+
+ request = lower_32_bits(svm->vmcb->control.exit_info_1);
+ apic_id = upper_32_bits(svm->vmcb->control.exit_info_1);
+
+ /* Validate the APIC ID */
+ target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, apic_id);
+ if (!target_vcpu) {
+ vcpu_unimpl(vcpu, "vmgexit: invalid AP APIC ID [%#x] from guest\n",
+ apic_id);
+ return -EINVAL;
+ }
+
+ ret = 0;
+
+ target_svm = to_svm(target_vcpu);
+
+ /*
+ * The target vCPU is valid, so the vCPU will be kicked unless the
+ * request is for CREATE_ON_INIT. For any errors at this stage, the
+ * kick will place the vCPU in an non-runnable state.
+ */
+ kick = true;
+
+ mutex_lock(&target_svm->sev_es.snp_vmsa_mutex);
+
+ target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
+ target_svm->sev_es.snp_ap_waiting_for_reset = true;
+
+ /* Interrupt injection mode shouldn't change for AP creation */
+ if (request < SVM_VMGEXIT_AP_DESTROY) {
+ u64 sev_features;
+
+ sev_features = vcpu->arch.regs[VCPU_REGS_RAX];
+ sev_features ^= sev->vmsa_features;
+
+ if (sev_features & SVM_SEV_FEAT_INT_INJ_MODES) {
+ vcpu_unimpl(vcpu, "vmgexit: invalid AP injection mode [%#lx] from guest\n",
+ vcpu->arch.regs[VCPU_REGS_RAX]);
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ switch (request) {
+ case SVM_VMGEXIT_AP_CREATE_ON_INIT:
+ kick = false;
+ fallthrough;
+ case SVM_VMGEXIT_AP_CREATE:
+ if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) {
+ vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n",
+ svm->vmcb->control.exit_info_2);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Malicious guest can RMPADJUST a large page into VMSA which
+ * will hit the SNP erratum where the CPU will incorrectly signal
+ * an RMP violation #PF if a hugepage collides with the RMP entry
+ * of VMSA page, reject the AP CREATE request if VMSA address from
+ * guest is 2M aligned.
+ */
+ if (IS_ALIGNED(svm->vmcb->control.exit_info_2, PMD_SIZE)) {
+ vcpu_unimpl(vcpu,
+ "vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n",
+ svm->vmcb->control.exit_info_2);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2;
+ break;
+ case SVM_VMGEXIT_AP_DESTROY:
+ break;
+ default:
+ vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n",
+ request);
+ ret = -EINVAL;
+ break;
+ }
+
+out:
+ if (kick) {
+ kvm_make_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu);
+ kvm_vcpu_kick(target_vcpu);
+ }
+
+ mutex_unlock(&target_svm->sev_es.snp_vmsa_mutex);
+
+ return ret;
+}
+
+static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
+{
+ struct sev_data_snp_guest_request data = {0};
+ struct kvm *kvm = svm->vcpu.kvm;
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ sev_ret_code fw_err = 0;
+ int ret;
+
+ if (!sev_snp_guest(kvm))
+ return -EINVAL;
+
+ mutex_lock(&sev->guest_req_mutex);
+
+ if (kvm_read_guest(kvm, req_gpa, sev->guest_req_buf, PAGE_SIZE)) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+
+ data.gctx_paddr = __psp_pa(sev->snp_context);
+ data.req_paddr = __psp_pa(sev->guest_req_buf);
+ data.res_paddr = __psp_pa(sev->guest_resp_buf);
+
+ /*
+ * Firmware failures are propagated on to guest, but any other failure
+ * condition along the way should be reported to userspace. E.g. if
+ * the PSP is dead and commands are timing out.
+ */
+ ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err);
+ if (ret && !fw_err)
+ goto out_unlock;
+
+ if (kvm_write_guest(kvm, resp_gpa, sev->guest_resp_buf, PAGE_SIZE)) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, SNP_GUEST_ERR(0, fw_err));
+
+ ret = 1; /* resume guest */
+
+out_unlock:
+ mutex_unlock(&sev->guest_req_mutex);
+ return ret;
+}
+
+static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
+{
+ struct kvm *kvm = svm->vcpu.kvm;
+ u8 msg_type;
+
+ if (!sev_snp_guest(kvm))
+ return -EINVAL;
+
+ if (kvm_read_guest(kvm, req_gpa + offsetof(struct snp_guest_msg_hdr, msg_type),
+ &msg_type, 1))
+ return -EIO;
+
+ /*
+ * As per GHCB spec, requests of type MSG_REPORT_REQ also allow for
+ * additional certificate data to be provided alongside the attestation
+ * report via the guest-provided data pages indicated by RAX/RBX. The
+ * certificate data is optional and requires additional KVM enablement
+ * to provide an interface for userspace to provide it, but KVM still
+ * needs to be able to handle extended guest requests either way. So
+ * provide a stub implementation that will always return an empty
+ * certificate table in the guest-provided data pages.
+ */
+ if (msg_type == SNP_MSG_REPORT_REQ) {
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u64 data_npages;
+ gpa_t data_gpa;
+
+ if (!kvm_ghcb_rax_is_valid(svm) || !kvm_ghcb_rbx_is_valid(svm))
+ goto request_invalid;
+
+ data_gpa = vcpu->arch.regs[VCPU_REGS_RAX];
+ data_npages = vcpu->arch.regs[VCPU_REGS_RBX];
+
+ if (!PAGE_ALIGNED(data_gpa))
+ goto request_invalid;
+
+ /*
+ * As per GHCB spec (see "SNP Extended Guest Request"), the
+ * certificate table is terminated by 24-bytes of zeroes.
+ */
+ if (data_npages && kvm_clear_guest(kvm, data_gpa, 24))
+ return -EIO;
+ }
+
+ return snp_handle_guest_req(svm, req_gpa, resp_gpa);
+
+request_invalid:
+ ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT);
+ return 1; /* resume guest */
+}
+
static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
@@ -3008,6 +4202,38 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP,
GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS);
break;
+ case GHCB_MSR_PREF_GPA_REQ:
+ if (!sev_snp_guest(vcpu->kvm))
+ goto out_terminate;
+
+ set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_NONE, GHCB_MSR_GPA_VALUE_MASK,
+ GHCB_MSR_GPA_VALUE_POS);
+ set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_RESP, GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ case GHCB_MSR_REG_GPA_REQ: {
+ u64 gfn;
+
+ if (!sev_snp_guest(vcpu->kvm))
+ goto out_terminate;
+
+ gfn = get_ghcb_msr_bits(svm, GHCB_MSR_GPA_VALUE_MASK,
+ GHCB_MSR_GPA_VALUE_POS);
+
+ svm->sev_es.ghcb_registered_gpa = gfn_to_gpa(gfn);
+
+ set_ghcb_msr_bits(svm, gfn, GHCB_MSR_GPA_VALUE_MASK,
+ GHCB_MSR_GPA_VALUE_POS);
+ set_ghcb_msr_bits(svm, GHCB_MSR_REG_GPA_RESP, GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ }
+ case GHCB_MSR_PSC_REQ:
+ if (!sev_snp_guest(vcpu->kvm))
+ goto out_terminate;
+
+ ret = snp_begin_psc_msr(svm, control->ghcb_gpa);
+ break;
case GHCB_MSR_TERM_REQ: {
u64 reason_set, reason_code;
@@ -3020,12 +4246,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
pr_info("SEV-ES guest requested termination: %#llx:%#llx\n",
reason_set, reason_code);
- vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
- vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
- vcpu->run->system_event.ndata = 1;
- vcpu->run->system_event.data[0] = control->ghcb_gpa;
-
- return 0;
+ goto out_terminate;
}
default:
/* Error, keep GHCB MSR value as-is */
@@ -3036,6 +4257,14 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
control->ghcb_gpa, ret);
return ret;
+
+out_terminate:
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
+ vcpu->run->system_event.ndata = 1;
+ vcpu->run->system_event.data[0] = control->ghcb_gpa;
+
+ return 0;
}
int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
@@ -3071,6 +4300,13 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
trace_kvm_vmgexit_enter(vcpu->vcpu_id, svm->sev_es.ghcb);
sev_es_sync_from_ghcb(svm);
+
+ /* SEV-SNP guest requires that the GHCB GPA must be registered */
+ if (sev_snp_guest(svm->vcpu.kvm) && !ghcb_gpa_is_registered(svm, ghcb_gpa)) {
+ vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB GPA [%#llx] is not registered.\n", ghcb_gpa);
+ return -EINVAL;
+ }
+
ret = sev_es_validate_vmgexit(svm);
if (ret)
return ret;
@@ -3145,6 +4381,28 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
vcpu->run->system_event.ndata = 1;
vcpu->run->system_event.data[0] = control->ghcb_gpa;
break;
+ case SVM_VMGEXIT_PSC:
+ ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
+ if (ret)
+ break;
+
+ ret = snp_begin_psc(svm, svm->sev_es.ghcb_sa);
+ break;
+ case SVM_VMGEXIT_AP_CREATION:
+ ret = sev_snp_ap_creation(svm);
+ if (ret) {
+ ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT);
+ }
+
+ ret = 1;
+ break;
+ case SVM_VMGEXIT_GUEST_REQUEST:
+ ret = snp_handle_guest_req(svm, control->exit_info_1, control->exit_info_2);
+ break;
+ case SVM_VMGEXIT_EXT_GUEST_REQUEST:
+ ret = snp_handle_ext_guest_req(svm, control->exit_info_1, control->exit_info_2);
+ break;
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
vcpu_unimpl(vcpu,
"vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
@@ -3238,7 +4496,7 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm)
* the VMSA will be NULL if this vCPU is the destination for intrahost
* migration, and will be copied later.
*/
- if (svm->sev_es.vmsa)
+ if (svm->sev_es.vmsa && !svm->sev_es.snp_has_guest_vmsa)
svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);
/* Can't intercept CR register access, HV can't modify CR registers */
@@ -3310,6 +4568,8 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm)
set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version,
GHCB_VERSION_MIN,
sev_enc_bit));
+
+ mutex_init(&svm->sev_es.snp_vmsa_mutex);
}
void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa)
@@ -3331,9 +4591,9 @@ void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_are
* isn't saved by VMRUN, that isn't already saved by VMSAVE (performed
* by common SVM code).
*/
- hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+ hostsa->xcr0 = kvm_host.xcr0;
hostsa->pkru = read_pkru();
- hostsa->xss = host_xss;
+ hostsa->xss = kvm_host.xss;
/*
* If DebugSwap is enabled, debug registers are loaded but NOT saved by
@@ -3389,13 +4649,13 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
}
}
-struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu)
+struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
{
unsigned long pfn;
struct page *p;
if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
- return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ return alloc_pages_node(node, gfp | __GFP_ZERO, 0);
/*
* Allocate an SNP-safe page to workaround the SNP erratum where
@@ -3406,7 +4666,7 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu)
* Allocate one extra page, choose a page which is not
* 2MB-aligned, and free the other.
*/
- p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1);
+ p = alloc_pages_node(node, gfp | __GFP_ZERO, 1);
if (!p)
return NULL;
@@ -3420,3 +4680,271 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu)
return p;
}
+
+void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code)
+{
+ struct kvm_memory_slot *slot;
+ struct kvm *kvm = vcpu->kvm;
+ int order, rmp_level, ret;
+ bool assigned;
+ kvm_pfn_t pfn;
+ gfn_t gfn;
+
+ gfn = gpa >> PAGE_SHIFT;
+
+ /*
+ * The only time RMP faults occur for shared pages is when the guest is
+ * triggering an RMP fault for an implicit page-state change from
+ * shared->private. Implicit page-state changes are forwarded to
+ * userspace via KVM_EXIT_MEMORY_FAULT events, however, so RMP faults
+ * for shared pages should not end up here.
+ */
+ if (!kvm_mem_is_private(kvm, gfn)) {
+ pr_warn_ratelimited("SEV: Unexpected RMP fault for non-private GPA 0x%llx\n",
+ gpa);
+ return;
+ }
+
+ slot = gfn_to_memslot(kvm, gfn);
+ if (!kvm_slot_can_be_private(slot)) {
+ pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n",
+ gpa);
+ return;
+ }
+
+ ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &order);
+ if (ret) {
+ pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n",
+ gpa);
+ return;
+ }
+
+ ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
+ if (ret || !assigned) {
+ pr_warn_ratelimited("SEV: Unexpected RMP fault, no assigned RMP entry found for GPA 0x%llx PFN 0x%llx error %d\n",
+ gpa, pfn, ret);
+ goto out_no_trace;
+ }
+
+ /*
+ * There are 2 cases where a PSMASH may be needed to resolve an #NPF
+ * with PFERR_GUEST_RMP_BIT set:
+ *
+ * 1) RMPADJUST/PVALIDATE can trigger an #NPF with PFERR_GUEST_SIZEM
+ * bit set if the guest issues them with a smaller granularity than
+ * what is indicated by the page-size bit in the 2MB RMP entry for
+ * the PFN that backs the GPA.
+ *
+ * 2) Guest access via NPT can trigger an #NPF if the NPT mapping is
+ * smaller than what is indicated by the 2MB RMP entry for the PFN
+ * that backs the GPA.
+ *
+ * In both these cases, the corresponding 2M RMP entry needs to
+ * be PSMASH'd to 512 4K RMP entries. If the RMP entry is already
+ * split into 4K RMP entries, then this is likely a spurious case which
+ * can occur when there are concurrent accesses by the guest to a 2MB
+ * GPA range that is backed by a 2MB-aligned PFN who's RMP entry is in
+ * the process of being PMASH'd into 4K entries. These cases should
+ * resolve automatically on subsequent accesses, so just ignore them
+ * here.
+ */
+ if (rmp_level == PG_LEVEL_4K)
+ goto out;
+
+ ret = snp_rmptable_psmash(pfn);
+ if (ret) {
+ /*
+ * Look it up again. If it's 4K now then the PSMASH may have
+ * raced with another process and the issue has already resolved
+ * itself.
+ */
+ if (!snp_lookup_rmpentry(pfn, &assigned, &rmp_level) &&
+ assigned && rmp_level == PG_LEVEL_4K)
+ goto out;
+
+ pr_warn_ratelimited("SEV: Unable to split RMP entry for GPA 0x%llx PFN 0x%llx ret %d\n",
+ gpa, pfn, ret);
+ }
+
+ kvm_zap_gfn_range(kvm, gfn, gfn + PTRS_PER_PMD);
+out:
+ trace_kvm_rmp_fault(vcpu, gpa, pfn, error_code, rmp_level, ret);
+out_no_trace:
+ put_page(pfn_to_page(pfn));
+}
+
+static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end)
+{
+ kvm_pfn_t pfn = start;
+
+ while (pfn < end) {
+ int ret, rmp_level;
+ bool assigned;
+
+ ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
+ if (ret) {
+ pr_warn_ratelimited("SEV: Failed to retrieve RMP entry: PFN 0x%llx GFN start 0x%llx GFN end 0x%llx RMP level %d error %d\n",
+ pfn, start, end, rmp_level, ret);
+ return false;
+ }
+
+ if (assigned) {
+ pr_debug("%s: overlap detected, PFN 0x%llx start 0x%llx end 0x%llx RMP level %d\n",
+ __func__, pfn, start, end, rmp_level);
+ return false;
+ }
+
+ pfn++;
+ }
+
+ return true;
+}
+
+static u8 max_level_for_order(int order)
+{
+ if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
+ return PG_LEVEL_2M;
+
+ return PG_LEVEL_4K;
+}
+
+static bool is_large_rmp_possible(struct kvm *kvm, kvm_pfn_t pfn, int order)
+{
+ kvm_pfn_t pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD);
+
+ /*
+ * If this is a large folio, and the entire 2M range containing the
+ * PFN is currently shared, then the entire 2M-aligned range can be
+ * set to private via a single 2M RMP entry.
+ */
+ if (max_level_for_order(order) > PG_LEVEL_4K &&
+ is_pfn_range_shared(pfn_aligned, pfn_aligned + PTRS_PER_PMD))
+ return true;
+
+ return false;
+}
+
+int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ kvm_pfn_t pfn_aligned;
+ gfn_t gfn_aligned;
+ int level, rc;
+ bool assigned;
+
+ if (!sev_snp_guest(kvm))
+ return 0;
+
+ rc = snp_lookup_rmpentry(pfn, &assigned, &level);
+ if (rc) {
+ pr_err_ratelimited("SEV: Failed to look up RMP entry: GFN %llx PFN %llx error %d\n",
+ gfn, pfn, rc);
+ return -ENOENT;
+ }
+
+ if (assigned) {
+ pr_debug("%s: already assigned: gfn %llx pfn %llx max_order %d level %d\n",
+ __func__, gfn, pfn, max_order, level);
+ return 0;
+ }
+
+ if (is_large_rmp_possible(kvm, pfn, max_order)) {
+ level = PG_LEVEL_2M;
+ pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD);
+ gfn_aligned = ALIGN_DOWN(gfn, PTRS_PER_PMD);
+ } else {
+ level = PG_LEVEL_4K;
+ pfn_aligned = pfn;
+ gfn_aligned = gfn;
+ }
+
+ rc = rmp_make_private(pfn_aligned, gfn_to_gpa(gfn_aligned), level, sev->asid, false);
+ if (rc) {
+ pr_err_ratelimited("SEV: Failed to update RMP entry: GFN %llx PFN %llx level %d error %d\n",
+ gfn, pfn, level, rc);
+ return -EINVAL;
+ }
+
+ pr_debug("%s: updated: gfn %llx pfn %llx pfn_aligned %llx max_order %d level %d\n",
+ __func__, gfn, pfn, pfn_aligned, max_order, level);
+
+ return 0;
+}
+
+void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
+{
+ kvm_pfn_t pfn;
+
+ if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
+ return;
+
+ pr_debug("%s: PFN start 0x%llx PFN end 0x%llx\n", __func__, start, end);
+
+ for (pfn = start; pfn < end;) {
+ bool use_2m_update = false;
+ int rc, rmp_level;
+ bool assigned;
+
+ rc = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
+ if (rc || !assigned)
+ goto next_pfn;
+
+ use_2m_update = IS_ALIGNED(pfn, PTRS_PER_PMD) &&
+ end >= (pfn + PTRS_PER_PMD) &&
+ rmp_level > PG_LEVEL_4K;
+
+ /*
+ * If an unaligned PFN corresponds to a 2M region assigned as a
+ * large page in the RMP table, PSMASH the region into individual
+ * 4K RMP entries before attempting to convert a 4K sub-page.
+ */
+ if (!use_2m_update && rmp_level > PG_LEVEL_4K) {
+ /*
+ * This shouldn't fail, but if it does, report it, but
+ * still try to update RMP entry to shared and pray this
+ * was a spurious error that can be addressed later.
+ */
+ rc = snp_rmptable_psmash(pfn);
+ WARN_ONCE(rc, "SEV: Failed to PSMASH RMP entry for PFN 0x%llx error %d\n",
+ pfn, rc);
+ }
+
+ rc = rmp_make_shared(pfn, use_2m_update ? PG_LEVEL_2M : PG_LEVEL_4K);
+ if (WARN_ONCE(rc, "SEV: Failed to update RMP entry for PFN 0x%llx error %d\n",
+ pfn, rc))
+ goto next_pfn;
+
+ /*
+ * SEV-ES avoids host/guest cache coherency issues through
+ * WBINVD hooks issued via MMU notifiers during run-time, and
+ * KVM's VM destroy path at shutdown. Those MMU notifier events
+ * don't cover gmem since there is no requirement to map pages
+ * to a HVA in order to use them for a running guest. While the
+ * shutdown path would still likely cover things for SNP guests,
+ * userspace may also free gmem pages during run-time via
+ * hole-punching operations on the guest_memfd, so flush the
+ * cache entries for these pages before free'ing them back to
+ * the host.
+ */
+ clflush_cache_range(__va(pfn_to_hpa(pfn)),
+ use_2m_update ? PMD_SIZE : PAGE_SIZE);
+next_pfn:
+ pfn += use_2m_update ? PTRS_PER_PMD : 1;
+ cond_resched();
+ }
+}
+
+int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
+{
+ int level, rc;
+ bool assigned;
+
+ if (!sev_snp_guest(kvm))
+ return 0;
+
+ rc = snp_lookup_rmpentry(pfn, &assigned, &level);
+ if (rc || !assigned)
+ return PG_LEVEL_4K;
+
+ return level;
+}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index c95d3900fe56..c115d26844f7 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -53,6 +53,7 @@
#include "svm_onhyperv.h"
MODULE_AUTHOR("Qumranet");
+MODULE_DESCRIPTION("KVM support for SVM (AMD-V) extensions");
MODULE_LICENSE("GPL");
#ifdef MODULE
@@ -570,6 +571,11 @@ static void __svm_write_tsc_multiplier(u64 multiplier)
__this_cpu_write(current_tsc_ratio, multiplier);
}
+static __always_inline struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd)
+{
+ return page_address(sd->save_area) + 0x400;
+}
+
static inline void kvm_cpu_svm_disable(void)
{
uint64_t efer;
@@ -674,12 +680,9 @@ static int svm_hardware_enable(void)
* TSC_AUX field now to avoid a RDMSR on every vCPU run.
*/
if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) {
- struct sev_es_save_area *hostsa;
u32 __maybe_unused msr_hi;
- hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
-
- rdmsr(MSR_TSC_AUX, hostsa->tsc_aux, msr_hi);
+ rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi);
}
return 0;
@@ -704,7 +707,7 @@ static int svm_cpu_init(int cpu)
int ret = -ENOMEM;
memset(sd, 0, sizeof(struct svm_cpu_data));
- sd->save_area = snp_safe_alloc_page(NULL);
+ sd->save_area = snp_safe_alloc_page_node(cpu_to_node(cpu), GFP_KERNEL);
if (!sd->save_area)
return ret;
@@ -1202,7 +1205,7 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (guest_cpuid_is_intel(vcpu)) {
+ if (guest_cpuid_is_intel_compatible(vcpu)) {
/*
* We must intercept SYSENTER_EIP and SYSENTER_ESP
* accesses because the processor only stores 32 bits.
@@ -1404,6 +1407,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
svm->spec_ctrl = 0;
svm->virt_spec_ctrl = 0;
+ if (init_event)
+ sev_snp_init_protected_guest_state(vcpu);
+
init_vmcb(vcpu);
if (!init_event)
@@ -1427,7 +1433,7 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu)
svm = to_svm(vcpu);
err = -ENOMEM;
- vmcb01_page = snp_safe_alloc_page(vcpu);
+ vmcb01_page = snp_safe_alloc_page();
if (!vmcb01_page)
goto out;
@@ -1436,7 +1442,7 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu)
* SEV-ES guests require a separate VMSA page used to contain
* the encrypted register state of the guest.
*/
- vmsa_page = snp_safe_alloc_page(vcpu);
+ vmsa_page = snp_safe_alloc_page();
if (!vmsa_page)
goto error_free_vmcb_page;
}
@@ -1501,11 +1507,6 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
__free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
}
-static struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd)
-{
- return page_address(sd->save_area) + 0x400;
-}
-
static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1551,6 +1552,9 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
struct vcpu_svm *svm = to_svm(vcpu);
struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
+ if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
+ shrink_ple_window(vcpu);
+
if (sd->current_vmcb != svm->vmcb) {
sd->current_vmcb = svm->vmcb;
@@ -2050,6 +2054,7 @@ static int pf_interception(struct kvm_vcpu *vcpu)
static int npf_interception(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ int rc;
u64 fault_address = svm->vmcb->control.exit_info_2;
u64 error_code = svm->vmcb->control.exit_info_1;
@@ -2063,11 +2068,19 @@ static int npf_interception(struct kvm_vcpu *vcpu)
if (WARN_ON_ONCE(error_code & PFERR_SYNTHETIC_MASK))
error_code &= ~PFERR_SYNTHETIC_MASK;
+ if (sev_snp_guest(vcpu->kvm) && (error_code & PFERR_GUEST_ENC_MASK))
+ error_code |= PFERR_PRIVATE_ACCESS;
+
trace_kvm_page_fault(vcpu, fault_address, error_code);
- return kvm_mmu_page_fault(vcpu, fault_address, error_code,
- static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
- svm->vmcb->control.insn_bytes : NULL,
- svm->vmcb->control.insn_len);
+ rc = kvm_mmu_page_fault(vcpu, fault_address, error_code,
+ static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
+ svm->vmcb->control.insn_bytes : NULL,
+ svm->vmcb->control.insn_len);
+
+ if (rc > 0 && error_code & PFERR_GUEST_RMP_MASK)
+ sev_handle_rmp_fault(vcpu, fault_address, error_code);
+
+ return rc;
}
static int db_interception(struct kvm_vcpu *vcpu)
@@ -2875,12 +2888,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_SYSENTER_EIP:
msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
- if (guest_cpuid_is_intel(vcpu))
+ if (guest_cpuid_is_intel_compatible(vcpu))
msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
break;
case MSR_IA32_SYSENTER_ESP:
msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
- if (guest_cpuid_is_intel(vcpu))
+ if (guest_cpuid_is_intel_compatible(vcpu))
msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
break;
case MSR_TSC_AUX:
@@ -3107,11 +3120,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
* 32 bit part of these msrs to support Intel's
* implementation of SYSENTER/SYSEXIT.
*/
- svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
+ svm->sysenter_eip_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0;
break;
case MSR_IA32_SYSENTER_ESP:
svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
- svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
+ svm->sysenter_esp_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0;
break;
case MSR_TSC_AUX:
/*
@@ -4372,11 +4385,11 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV);
/*
- * Intercept VMLOAD if the vCPU mode is Intel in order to emulate that
+ * Intercept VMLOAD if the vCPU model is Intel in order to emulate that
* VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing
* SVM on Intel is bonkers and extremely unlikely to work).
*/
- if (!guest_cpuid_is_intel(vcpu))
+ if (!guest_cpuid_is_intel_compatible(vcpu))
kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER);
@@ -4595,12 +4608,6 @@ static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
vcpu->arch.at_instruction_boundary = true;
}
-static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
-{
- if (!kvm_pause_in_guest(vcpu->kvm))
- shrink_ple_window(vcpu);
-}
-
static void svm_setup_mce(struct kvm_vcpu *vcpu)
{
/* [63:9] are reserved. */
@@ -4937,8 +4944,11 @@ static int svm_vm_init(struct kvm *kvm)
if (type != KVM_X86_DEFAULT_VM &&
type != KVM_X86_SW_PROTECTED_VM) {
- kvm->arch.has_protected_state = (type == KVM_X86_SEV_ES_VM);
+ kvm->arch.has_protected_state =
+ (type == KVM_X86_SEV_ES_VM || type == KVM_X86_SNP_VM);
to_kvm_sev_info(kvm)->need_init = true;
+
+ kvm->arch.has_private_mem = (type == KVM_X86_SNP_VM);
}
if (!pause_filter_count || !pause_filter_thresh)
@@ -4955,7 +4965,7 @@ static int svm_vm_init(struct kvm *kvm)
static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu)
{
- struct page *page = snp_safe_alloc_page(vcpu);
+ struct page *page = snp_safe_alloc_page();
if (!page)
return NULL;
@@ -5060,8 +5070,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.check_intercept = svm_check_intercept,
.handle_exit_irqoff = svm_handle_exit_irqoff,
- .sched_in = svm_sched_in,
-
.nested_ops = &svm_nested_ops,
.deliver_interrupt = svm_deliver_interrupt,
@@ -5095,6 +5103,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
.vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
.alloc_apic_backing_page = svm_alloc_apic_backing_page,
+
+ .gmem_prepare = sev_gmem_prepare,
+ .gmem_invalidate = sev_gmem_invalidate,
+ .private_max_mapping_level = sev_private_max_mapping_level,
};
/*
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 0f1472690b59..76107c7d0595 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -94,6 +94,10 @@ struct kvm_sev_info {
struct list_head mirror_entry; /* Use as a list entry of mirrors */
struct misc_cg *misc_cg; /* For misc cgroup accounting */
atomic_t migration_in_progress;
+ void *snp_context; /* SNP guest context page */
+ void *guest_req_buf; /* Bounce buffer for SNP Guest Request input */
+ void *guest_resp_buf; /* Bounce buffer for SNP Guest Request output */
+ struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */
};
struct kvm_svm {
@@ -209,6 +213,18 @@ struct vcpu_sev_es_state {
u32 ghcb_sa_len;
bool ghcb_sa_sync;
bool ghcb_sa_free;
+
+ /* SNP Page-State-Change buffer entries currently being processed */
+ u16 psc_idx;
+ u16 psc_inflight;
+ bool psc_2m;
+
+ u64 ghcb_registered_gpa;
+
+ struct mutex snp_vmsa_mutex; /* Used to handle concurrent updates of VMSA. */
+ gpa_t snp_vmsa_gpa;
+ bool snp_ap_waiting_for_reset;
+ bool snp_has_guest_vmsa;
};
struct vcpu_svm {
@@ -350,6 +366,23 @@ static __always_inline bool sev_es_guest(struct kvm *kvm)
#endif
}
+static __always_inline bool sev_snp_guest(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_AMD_SEV
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ return (sev->vmsa_features & SVM_SEV_FEAT_SNP_ACTIVE) &&
+ !WARN_ON_ONCE(!sev_es_guest(kvm));
+#else
+ return false;
+#endif
+}
+
+static inline bool ghcb_gpa_is_registered(struct vcpu_svm *svm, u64 val)
+{
+ return svm->sev_es.ghcb_registered_gpa == val;
+}
+
static inline void vmcb_mark_all_dirty(struct vmcb *vmcb)
{
vmcb->control.clean = 0;
@@ -638,7 +671,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
/* avic.c */
#define AVIC_REQUIRED_APICV_INHIBITS \
( \
- BIT(APICV_INHIBIT_REASON_DISABLE) | \
+ BIT(APICV_INHIBIT_REASON_DISABLED) | \
BIT(APICV_INHIBIT_REASON_ABSENT) | \
BIT(APICV_INHIBIT_REASON_HYPERV) | \
BIT(APICV_INHIBIT_REASON_NESTED) | \
@@ -696,7 +729,13 @@ void sev_guest_memory_reclaimed(struct kvm *kvm);
int sev_handle_vmgexit(struct kvm_vcpu *vcpu);
/* These symbols are used in common code and are stubbed below. */
-struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu);
+
+struct page *snp_safe_alloc_page_node(int node, gfp_t gfp);
+static inline struct page *snp_safe_alloc_page(void)
+{
+ return snp_safe_alloc_page_node(numa_node_id(), GFP_KERNEL_ACCOUNT);
+}
+
void sev_free_vcpu(struct kvm_vcpu *vcpu);
void sev_vm_destroy(struct kvm *kvm);
void __init sev_set_cpu_caps(void);
@@ -705,9 +744,20 @@ void sev_hardware_unsetup(void);
int sev_cpu_init(struct svm_cpu_data *sd);
int sev_dev_get_attr(u32 group, u64 attr, u64 *val);
extern unsigned int max_sev_asid;
+void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code);
+void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu);
+int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
+void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
+int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn);
#else
-static inline struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) {
- return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+static inline struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
+{
+ return alloc_pages_node(node, gfp | __GFP_ZERO, 0);
+}
+
+static inline struct page *snp_safe_alloc_page(void)
+{
+ return snp_safe_alloc_page_node(numa_node_id(), GFP_KERNEL_ACCOUNT);
}
static inline void sev_free_vcpu(struct kvm_vcpu *vcpu) {}
@@ -718,6 +768,18 @@ static inline void sev_hardware_unsetup(void) {}
static inline int sev_cpu_init(struct svm_cpu_data *sd) { return 0; }
static inline int sev_dev_get_attr(u32 group, u64 attr, u64 *val) { return -ENXIO; }
#define max_sev_asid 0
+static inline void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) {}
+static inline void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) {}
+static inline int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order)
+{
+ return 0;
+}
+static inline void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) {}
+static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
+{
+ return 0;
+}
+
#endif
/* vmenter.S */
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index e19fed438a67..d3aeffd6ae75 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -314,12 +314,12 @@ TRACE_EVENT(name, \
__entry->guest_rip = kvm_rip_read(vcpu); \
__entry->isa = isa; \
__entry->vcpu_id = vcpu->vcpu_id; \
- static_call(kvm_x86_get_exit_info)(vcpu, \
- &__entry->exit_reason, \
- &__entry->info1, \
- &__entry->info2, \
- &__entry->intr_info, \
- &__entry->error_code); \
+ kvm_x86_call(get_exit_info)(vcpu, \
+ &__entry->exit_reason, \
+ &__entry->info1, \
+ &__entry->info2, \
+ &__entry->intr_info, \
+ &__entry->error_code); \
), \
\
TP_printk("vcpu %u reason %s%s%s rip 0x%lx info1 0x%016llx " \
@@ -828,7 +828,8 @@ TRACE_EVENT(kvm_emulate_insn,
),
TP_fast_assign(
- __entry->csbase = static_call(kvm_x86_get_segment_base)(vcpu, VCPU_SREG_CS);
+ __entry->csbase = kvm_x86_call(get_segment_base)(vcpu,
+ VCPU_SREG_CS);
__entry->len = vcpu->arch.emulate_ctxt->fetch.ptr
- vcpu->arch.emulate_ctxt->fetch.data;
__entry->rip = vcpu->arch.emulate_ctxt->_eip - __entry->len;
@@ -1375,6 +1376,10 @@ TRACE_EVENT(kvm_hv_stimer_cleanup,
__entry->vcpu_id, __entry->timer_index)
);
+#define kvm_print_apicv_inhibit_reasons(inhibits) \
+ (inhibits), (inhibits) ? " " : "", \
+ (inhibits) ? __print_flags(inhibits, "|", APICV_INHIBIT_REASONS) : ""
+
TRACE_EVENT(kvm_apicv_inhibit_changed,
TP_PROTO(int reason, bool set, unsigned long inhibits),
TP_ARGS(reason, set, inhibits),
@@ -1391,9 +1396,10 @@ TRACE_EVENT(kvm_apicv_inhibit_changed,
__entry->inhibits = inhibits;
),
- TP_printk("%s reason=%u, inhibits=0x%lx",
+ TP_printk("%s reason=%u, inhibits=0x%lx%s%s",
__entry->set ? "set" : "cleared",
- __entry->reason, __entry->inhibits)
+ __entry->reason,
+ kvm_print_apicv_inhibit_reasons(__entry->inhibits))
);
TRACE_EVENT(kvm_apicv_accept_irq,
@@ -1834,6 +1840,37 @@ TRACE_EVENT(kvm_vmgexit_msr_protocol_exit,
__entry->vcpu_id, __entry->ghcb_gpa, __entry->result)
);
+/*
+ * Tracepoint for #NPFs due to RMP faults.
+ */
+TRACE_EVENT(kvm_rmp_fault,
+ TP_PROTO(struct kvm_vcpu *vcpu, u64 gpa, u64 pfn, u64 error_code,
+ int rmp_level, int psmash_ret),
+ TP_ARGS(vcpu, gpa, pfn, error_code, rmp_level, psmash_ret),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, vcpu_id)
+ __field(u64, gpa)
+ __field(u64, pfn)
+ __field(u64, error_code)
+ __field(int, rmp_level)
+ __field(int, psmash_ret)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu->vcpu_id;
+ __entry->gpa = gpa;
+ __entry->pfn = pfn;
+ __entry->error_code = error_code;
+ __entry->rmp_level = rmp_level;
+ __entry->psmash_ret = psmash_ret;
+ ),
+
+ TP_printk("vcpu %u gpa %016llx pfn 0x%llx error_code 0x%llx rmp_level %d psmash_ret %d",
+ __entry->vcpu_id, __entry->gpa, __entry->pfn,
+ __entry->error_code, __entry->rmp_level, __entry->psmash_ret)
+);
+
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index d4ed681785fd..0bf35ebe8a1b 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -8,7 +8,7 @@
#include "posted_intr.h"
#define VMX_REQUIRED_APICV_INHIBITS \
- (BIT(APICV_INHIBIT_REASON_DISABLE)| \
+ (BIT(APICV_INHIBIT_REASON_DISABLED) | \
BIT(APICV_INHIBIT_REASON_ABSENT) | \
BIT(APICV_INHIBIT_REASON_HYPERV) | \
BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
@@ -97,7 +97,6 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
.hwapic_irr_update = vmx_hwapic_irr_update,
.hwapic_isr_update = vmx_hwapic_isr_update,
- .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
.sync_pir_to_irr = vmx_sync_pir_to_irr,
.deliver_interrupt = vmx_deliver_interrupt,
.dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
@@ -122,8 +121,6 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.check_intercept = vmx_check_intercept,
.handle_exit_irqoff = vmx_handle_exit_irqoff,
- .sched_in = vmx_sched_in,
-
.cpu_dirty_log_size = PML_ENTITY_NUM,
.update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 643935a0f70a..2392a7ef254d 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -12,6 +12,7 @@
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
+#include "posted_intr.h"
#include "sgx.h"
#include "trace.h"
#include "vmx.h"
@@ -2425,7 +2426,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
if (cpu_has_load_ia32_efer()) {
if (guest_efer & EFER_LMA)
exec_control |= VM_ENTRY_IA32E_MODE;
- if (guest_efer != host_efer)
+ if (guest_efer != kvm_host.efer)
exec_control |= VM_ENTRY_LOAD_IA32_EFER;
}
vm_entry_controls_set(vmx, exec_control);
@@ -2438,7 +2439,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
* bits may be modified by vmx_set_efer() in prepare_vmcs02().
*/
exec_control = __vm_exit_controls_get(vmcs01);
- if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
+ if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer)
exec_control |= VM_EXIT_LOAD_IA32_EFER;
else
exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
@@ -3899,8 +3900,8 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
if (!pi_test_and_clear_on(vmx->nested.pi_desc))
return 0;
- max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
- if (max_irr != 256) {
+ max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
+ if (max_irr > 0) {
vapic_page = vmx->nested.virtual_apic_map.hva;
if (!vapic_page)
goto mmio_needed;
@@ -4031,10 +4032,46 @@ static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
to_vmx(vcpu)->nested.preemption_timer_expired;
}
-static bool vmx_has_nested_events(struct kvm_vcpu *vcpu)
+static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection)
{
- return nested_vmx_preemption_timer_pending(vcpu) ||
- to_vmx(vcpu)->nested.mtf_pending;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ void *vapic = vmx->nested.virtual_apic_map.hva;
+ int max_irr, vppr;
+
+ if (nested_vmx_preemption_timer_pending(vcpu) ||
+ vmx->nested.mtf_pending)
+ return true;
+
+ /*
+ * Virtual Interrupt Delivery doesn't require manual injection. Either
+ * the interrupt is already in GUEST_RVI and will be recognized by CPU
+ * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move
+ * the interrupt from the PIR to RVI prior to entering the guest.
+ */
+ if (for_injection)
+ return false;
+
+ if (!nested_cpu_has_vid(get_vmcs12(vcpu)) ||
+ __vmx_interrupt_blocked(vcpu))
+ return false;
+
+ if (!vapic)
+ return false;
+
+ vppr = *((u32 *)(vapic + APIC_PROCPRI));
+
+ max_irr = vmx_get_rvi();
+ if ((max_irr & 0xf0) > (vppr & 0xf0))
+ return true;
+
+ if (vmx->nested.pi_pending && vmx->nested.pi_desc &&
+ pi_test_on(vmx->nested.pi_desc)) {
+ max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
+ if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0))
+ return true;
+ }
+
+ return false;
}
/*
@@ -4665,7 +4702,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
return vmcs_read64(GUEST_IA32_EFER);
if (cpu_has_load_ia32_efer())
- return host_efer;
+ return kvm_host.efer;
for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
@@ -4676,7 +4713,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
if (efer_msr)
return efer_msr->data;
- return host_efer;
+ return kvm_host.efer;
}
static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index be40474de6e4..83382a4d1d66 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -348,14 +348,14 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
switch (msr) {
case MSR_CORE_PERF_FIXED_CTR_CTRL:
- if (data & pmu->fixed_ctr_ctrl_mask)
+ if (data & pmu->fixed_ctr_ctrl_rsvd)
return 1;
if (pmu->fixed_ctr_ctrl != data)
reprogram_fixed_counters(pmu, data);
break;
case MSR_IA32_PEBS_ENABLE:
- if (data & pmu->pebs_enable_mask)
+ if (data & pmu->pebs_enable_rsvd)
return 1;
if (pmu->pebs_enable != data) {
@@ -371,7 +371,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
pmu->ds_area = data;
break;
case MSR_PEBS_DATA_CFG:
- if (data & pmu->pebs_data_cfg_mask)
+ if (data & pmu->pebs_data_cfg_rsvd)
return 1;
pmu->pebs_data_cfg = data;
@@ -436,8 +436,8 @@ static __always_inline u64 intel_get_fixed_pmc_eventsel(unsigned int index)
};
u64 eventsel;
- BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_PMC_MAX_FIXED);
- BUILD_BUG_ON(index >= KVM_PMC_MAX_FIXED);
+ BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_MAX_NR_INTEL_FIXED_COUTNERS);
+ BUILD_BUG_ON(index >= KVM_MAX_NR_INTEL_FIXED_COUTNERS);
/*
* Yell if perf reports support for a fixed counter but perf doesn't
@@ -448,6 +448,14 @@ static __always_inline u64 intel_get_fixed_pmc_eventsel(unsigned int index)
return eventsel;
}
+static void intel_pmu_enable_fixed_counter_bits(struct kvm_pmu *pmu, u64 bits)
+{
+ int i;
+
+ for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
+ pmu->fixed_ctr_ctrl_rsvd &= ~intel_fixed_bits_by_idx(i, bits);
+}
+
static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -456,8 +464,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
union cpuid10_eax eax;
union cpuid10_edx edx;
u64 perf_capabilities;
- u64 counter_mask;
- int i;
+ u64 counter_rsvd;
memset(&lbr_desc->records, 0, sizeof(lbr_desc->records));
@@ -501,22 +508,24 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
((u64)1 << edx.split.bit_width_fixed) - 1;
}
- for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
- pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4));
- counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) |
+ intel_pmu_enable_fixed_counter_bits(pmu, INTEL_FIXED_0_KERNEL |
+ INTEL_FIXED_0_USER |
+ INTEL_FIXED_0_ENABLE_PMI);
+
+ counter_rsvd = ~(((1ull << pmu->nr_arch_gp_counters) - 1) |
(((1ull << pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX));
- pmu->global_ctrl_mask = counter_mask;
+ pmu->global_ctrl_rsvd = counter_rsvd;
/*
* GLOBAL_STATUS and GLOBAL_OVF_CONTROL (a.k.a. GLOBAL_STATUS_RESET)
* share reserved bit definitions. The kernel just happens to use
* OVF_CTRL for the names.
*/
- pmu->global_status_mask = pmu->global_ctrl_mask
+ pmu->global_status_rsvd = pmu->global_ctrl_rsvd
& ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF |
MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD);
if (vmx_pt_mode_is_host_guest())
- pmu->global_status_mask &=
+ pmu->global_status_rsvd &=
~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI;
entry = kvm_find_cpuid_entry_index(vcpu, 7, 0);
@@ -544,15 +553,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
if (perf_capabilities & PERF_CAP_PEBS_FORMAT) {
if (perf_capabilities & PERF_CAP_PEBS_BASELINE) {
- pmu->pebs_enable_mask = counter_mask;
+ pmu->pebs_enable_rsvd = counter_rsvd;
pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE;
- for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
- pmu->fixed_ctr_ctrl_mask &=
- ~(1ULL << (KVM_FIXED_PMC_BASE_IDX + i * 4));
- }
- pmu->pebs_data_cfg_mask = ~0xff00000full;
+ pmu->pebs_data_cfg_rsvd = ~0xff00000full;
+ intel_pmu_enable_fixed_counter_bits(pmu, ICL_FIXED_0_ADAPTIVE);
} else {
- pmu->pebs_enable_mask =
+ pmu->pebs_enable_rsvd =
~((1ull << pmu->nr_arch_gp_counters) - 1);
}
}
@@ -564,14 +570,14 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
- for (i = 0; i < KVM_INTEL_PMC_MAX_GENERIC; i++) {
+ for (i = 0; i < KVM_MAX_NR_INTEL_GP_COUNTERS; i++) {
pmu->gp_counters[i].type = KVM_PMC_GP;
pmu->gp_counters[i].vcpu = vcpu;
pmu->gp_counters[i].idx = i;
pmu->gp_counters[i].current_config = 0;
}
- for (i = 0; i < KVM_PMC_MAX_FIXED; i++) {
+ for (i = 0; i < KVM_MAX_NR_INTEL_FIXED_COUTNERS; i++) {
pmu->fixed_counters[i].type = KVM_PMC_FIXED;
pmu->fixed_counters[i].vcpu = vcpu;
pmu->fixed_counters[i].idx = i + KVM_FIXED_PMC_BASE_IDX;
@@ -731,6 +737,6 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = {
.deliver_pmi = intel_pmu_deliver_pmi,
.cleanup = intel_pmu_cleanup,
.EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT,
- .MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC,
+ .MAX_NR_GP_COUNTERS = KVM_MAX_NR_INTEL_GP_COUNTERS,
.MIN_NR_GP_COUNTERS = 1,
};
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index 6b2a0226257e..1715d2ab07be 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -1,6 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __KVM_X86_VMX_POSTED_INTR_H
#define __KVM_X86_VMX_POSTED_INTR_H
+
+#include <linux/find.h>
#include <asm/posted_intr.h>
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
@@ -12,4 +14,12 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
void vmx_pi_start_assignment(struct kvm *kvm);
+static inline int pi_find_highest_vector(struct pi_desc *pi_desc)
+{
+ int vec;
+
+ vec = find_last_bit((unsigned long *)pi_desc->pir, 256);
+ return vec < 256 ? vec : -1;
+}
+
#endif /* __KVM_X86_VMX_POSTED_INTR_H */
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 01936013428b..56fd150a6f24 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -188,12 +188,13 @@ struct __packed vmcs12 {
};
/*
- * VMCS12_REVISION is an arbitrary id that should be changed if the content or
- * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
- * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
+ * VMCS12_REVISION is KVM's arbitrary ID for the layout of struct vmcs12. KVM
+ * enumerates this value to L1 via MSR_IA32_VMX_BASIC, and checks the revision
+ * ID during nested VMPTRLD to verify that L1 is loading a VMCS that adhere's
+ * to KVM's virtual CPU definition.
*
- * IMPORTANT: Changing this value will break save/restore compatibility with
- * older kvm releases.
+ * DO NOT change this value, as it will break save/restore compatibility with
+ * older KVM releases.
*/
#define VMCS12_REVISION 0x11e57ed0
@@ -206,7 +207,8 @@ struct __packed vmcs12 {
#define VMCS12_SIZE KVM_STATE_NESTED_VMX_VMCS_SIZE
/*
- * For save/restore compatibility, the vmcs12 field offsets must not change.
+ * For save/restore compatibility, the vmcs12 field offsets must not change,
+ * although appending fields and/or filling gaps is obviously allowed.
*/
#define CHECK_OFFSET(field, loc) \
ASSERT_STRUCT_OFFSET(struct vmcs12, field, loc)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b3c83c06f826..f18c2d8c7476 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -74,6 +74,7 @@
#include "posted_intr.h"
MODULE_AUTHOR("Qumranet");
+MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions");
MODULE_LICENSE("GPL");
#ifdef MODULE
@@ -259,7 +260,7 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
return 0;
}
- if (host_arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
+ if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
return 0;
}
@@ -404,7 +405,7 @@ static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
* and VM-Exit.
*/
vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) &&
- (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
+ (kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
!boot_cpu_has_bug(X86_BUG_MDS) &&
!boot_cpu_has_bug(X86_BUG_TAA);
@@ -1123,12 +1124,12 @@ static bool update_transition_efer(struct vcpu_vmx *vmx)
* atomically, since it's faster than switching it manually.
*/
if (cpu_has_load_ia32_efer() ||
- (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
+ (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) {
if (!(guest_efer & EFER_LMA))
guest_efer &= ~EFER_LME;
- if (guest_efer != host_efer)
+ if (guest_efer != kvm_host.efer)
add_atomic_switch_msr(vmx, MSR_EFER,
- guest_efer, host_efer, false);
+ guest_efer, kvm_host.efer, false);
else
clear_atomic_switch_msr(vmx, MSR_EFER);
return false;
@@ -1141,7 +1142,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx)
clear_atomic_switch_msr(vmx, MSR_EFER);
guest_efer &= ~ignore_bits;
- guest_efer |= host_efer & ignore_bits;
+ guest_efer |= kvm_host.efer & ignore_bits;
vmx->guest_uret_msrs[i].data = guest_efer;
vmx->guest_uret_msrs[i].mask = ~ignore_bits;
@@ -1411,6 +1412,38 @@ static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
}
#endif
+static void grow_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned int old = vmx->ple_window;
+
+ vmx->ple_window = __grow_ple_window(old, ple_window,
+ ple_window_grow,
+ ple_window_max);
+
+ if (vmx->ple_window != old) {
+ vmx->ple_window_dirty = true;
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ vmx->ple_window, old);
+ }
+}
+
+static void shrink_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned int old = vmx->ple_window;
+
+ vmx->ple_window = __shrink_ple_window(old, ple_window,
+ ple_window_shrink,
+ ple_window);
+
+ if (vmx->ple_window != old) {
+ vmx->ple_window_dirty = true;
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ vmx->ple_window, old);
+ }
+}
+
void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
struct loaded_vmcs *buddy)
{
@@ -1486,6 +1519,9 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
+ shrink_ple_window(vcpu);
+
vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
vmx_vcpu_pi_load(vcpu, cpu);
@@ -2525,17 +2561,15 @@ static bool cpu_has_sgx(void)
*/
static bool cpu_has_perf_global_ctrl_bug(void)
{
- if (boot_cpu_data.x86 == 0x6) {
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_NEHALEM_EP: /* AAK155 */
- case INTEL_FAM6_NEHALEM: /* AAP115 */
- case INTEL_FAM6_WESTMERE: /* AAT100 */
- case INTEL_FAM6_WESTMERE_EP: /* BC86,AAY89,BD102 */
- case INTEL_FAM6_NEHALEM_EX: /* BA97 */
- return true;
- default:
- break;
- }
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_NEHALEM_EP: /* AAK155 */
+ case INTEL_NEHALEM: /* AAP115 */
+ case INTEL_WESTMERE: /* AAT100 */
+ case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */
+ case INTEL_NEHALEM_EX: /* BA97 */
+ return true;
+ default:
+ break;
}
return false;
@@ -2834,9 +2868,6 @@ int vmx_hardware_enable(void)
return r;
}
- if (enable_ept)
- ept_sync_global();
-
return 0;
}
@@ -4108,26 +4139,6 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
}
}
-bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- void *vapic_page;
- u32 vppr;
- int rvi;
-
- if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
- !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
- WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
- return false;
-
- rvi = vmx_get_rvi();
-
- vapic_page = vmx->nested.virtual_apic_map.hva;
- vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
-
- return ((rvi & 0xf0) > (vppr & 0xf0));
-}
-
void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4357,7 +4368,7 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
}
if (cpu_has_load_ia32_efer())
- vmcs_write64(HOST_IA32_EFER, host_efer);
+ vmcs_write64(HOST_IA32_EFER, kvm_host.efer);
}
void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
@@ -5052,14 +5063,19 @@ int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !vmx_nmi_blocked(vcpu);
}
+bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
+{
+ return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
+ (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
+}
+
bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
return false;
- return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
- (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
- (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
+ return __vmx_interrupt_blocked(vcpu);
}
int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
@@ -5897,38 +5913,6 @@ int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
return 1;
}
-static void grow_ple_window(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned int old = vmx->ple_window;
-
- vmx->ple_window = __grow_ple_window(old, ple_window,
- ple_window_grow,
- ple_window_max);
-
- if (vmx->ple_window != old) {
- vmx->ple_window_dirty = true;
- trace_kvm_ple_window_update(vcpu->vcpu_id,
- vmx->ple_window, old);
- }
-}
-
-static void shrink_ple_window(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned int old = vmx->ple_window;
-
- vmx->ple_window = __shrink_ple_window(old, ple_window,
- ple_window_shrink,
- ple_window);
-
- if (vmx->ple_window != old) {
- vmx->ple_window_dirty = true;
- trace_kvm_ple_window_update(vcpu->vcpu_id,
- vmx->ple_window, old);
- }
-}
-
/*
* Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
* exiting, so only get here on cpu with PAUSE-Loop-Exiting.
@@ -6677,9 +6661,10 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
bool flush_l1d;
/*
- * Clear the per-vcpu flush bit, it gets set again
- * either from vcpu_run() or from one of the unsafe
- * VMEXIT handlers.
+ * Clear the per-vcpu flush bit, it gets set again if the vCPU
+ * is reloaded, i.e. if the vCPU is scheduled out or if KVM
+ * exits to userspace, or if KVM reaches one of the unsafe
+ * VMEXIT handlers, e.g. if KVM calls into the emulator.
*/
flush_l1d = vcpu->arch.l1tf_flush_l1d;
vcpu->arch.l1tf_flush_l1d = false;
@@ -7665,39 +7650,25 @@ int vmx_vm_init(struct kvm *kvm)
u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
- /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
- * memory aliases with conflicting memory types and sometimes MCEs.
- * We have to be careful as to what are honored and when.
- *
- * For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to
- * UC. The effective memory type is UC or WC depending on guest PAT.
- * This was historically the source of MCEs and we want to be
- * conservative.
- *
- * When there is no need to deal with noncoherent DMA (e.g., no VT-d
- * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The
- * EPT memory type is set to WB. The effective memory type is forced
- * WB.
- *
- * Otherwise, we trust guest. Guest CD/MTRR/PAT are all honored. The
- * EPT memory type is used to emulate guest CD/MTRR.
+ /*
+ * Force UC for host MMIO regions, as allowing the guest to access MMIO
+ * with cacheable accesses will result in Machine Checks.
*/
-
if (is_mmio)
return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
- if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
+ /*
+ * Force WB and ignore guest PAT if the VM does NOT have a non-coherent
+ * device attached and the CPU doesn't support self-snoop. Letting the
+ * guest control memory types on Intel CPUs without self-snoop may
+ * result in unexpected behavior, and so KVM's (historical) ABI is to
+ * trust the guest to behave only as a last resort.
+ */
+ if (!static_cpu_has(X86_FEATURE_SELFSNOOP) &&
+ !kvm_arch_has_noncoherent_dma(vcpu->kvm))
return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
- if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) {
- if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
- return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT;
- else
- return (MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT) |
- VMX_EPT_IPAT_BIT;
- }
-
- return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
+ return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT);
}
static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
@@ -8179,12 +8150,6 @@ void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
}
#endif
-void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
-{
- if (!kvm_pause_in_guest(vcpu->kvm))
- shrink_ple_window(vcpu);
-}
-
void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -8396,18 +8361,16 @@ static void __init vmx_setup_me_spte_mask(void)
u64 me_mask = 0;
/*
- * kvm_get_shadow_phys_bits() returns shadow_phys_bits. Use
- * the former to avoid exposing shadow_phys_bits.
- *
* On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to
- * shadow_phys_bits. On MKTME and/or TDX capable systems,
+ * kvm_host.maxphyaddr. On MKTME and/or TDX capable systems,
* boot_cpu_data.x86_phys_bits holds the actual physical address
- * w/o the KeyID bits, and shadow_phys_bits equals to MAXPHYADDR
- * reported by CPUID. Those bits between are KeyID bits.
+ * w/o the KeyID bits, and kvm_host.maxphyaddr equals to
+ * MAXPHYADDR reported by CPUID. Those bits between are KeyID bits.
*/
- if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits())
+ if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr)
me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
- kvm_get_shadow_phys_bits() - 1);
+ kvm_host.maxphyaddr - 1);
+
/*
* Unlike SME, host kernel doesn't support setting up any
* MKTME KeyID on Intel platforms. No memory encryption
@@ -8629,9 +8592,9 @@ static void __vmx_exit(void)
static void vmx_exit(void)
{
kvm_exit();
+ __vmx_exit();
kvm_x86_vendor_exit();
- __vmx_exit();
}
module_exit(vmx_exit);
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 7b64e271a931..42498fa63abb 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -406,6 +406,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu);
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
bool vmx_nmi_blocked(struct kvm_vcpu *vcpu);
+bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
@@ -727,7 +728,7 @@ static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu)
return true;
return allow_smaller_maxphyaddr &&
- cpuid_maxphyaddr(vcpu) < kvm_get_shadow_phys_bits();
+ cpuid_maxphyaddr(vcpu) < kvm_host.maxphyaddr;
}
static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index 502704596c83..ce3221cd1d01 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -46,10 +46,8 @@ bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu);
void vmx_migrate_timers(struct kvm_vcpu *vcpu);
void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu);
-bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason);
void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr);
void vmx_hwapic_isr_update(int max_isr);
-bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu);
int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu);
void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
int trig_mode, int vector);
@@ -111,8 +109,6 @@ u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu);
u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu);
void vmx_write_tsc_offset(struct kvm_vcpu *vcpu);
void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu);
-void vmx_request_immediate_exit(struct kvm_vcpu *vcpu);
-void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu);
void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
#ifdef CONFIG_X86_64
int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0763a0f72a06..af6c8cf6a37a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -100,6 +100,9 @@
struct kvm_caps kvm_caps __read_mostly;
EXPORT_SYMBOL_GPL(kvm_caps);
+struct kvm_host_values kvm_host __read_mostly;
+EXPORT_SYMBOL_GPL(kvm_host);
+
#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e))
#define emul_to_vcpu(ctxt) \
@@ -220,21 +223,12 @@ static struct kvm_user_return_msrs __percpu *user_return_msrs;
| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
| XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE)
-u64 __read_mostly host_efer;
-EXPORT_SYMBOL_GPL(host_efer);
-
bool __read_mostly allow_smaller_maxphyaddr = 0;
EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
bool __read_mostly enable_apicv = true;
EXPORT_SYMBOL_GPL(enable_apicv);
-u64 __read_mostly host_xss;
-EXPORT_SYMBOL_GPL(host_xss);
-
-u64 __read_mostly host_arch_capabilities;
-EXPORT_SYMBOL_GPL(host_arch_capabilities);
-
const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
KVM_GENERIC_VM_STATS(),
STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
@@ -308,8 +302,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = {
sizeof(kvm_vcpu_stats_desc),
};
-u64 __read_mostly host_xcr0;
-
static struct kmem_cache *x86_emulator_cache;
/*
@@ -833,7 +825,7 @@ EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
*/
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
{
- if (static_call(kvm_x86_get_cpl)(vcpu) <= required_cpl)
+ if (kvm_x86_call(get_cpl)(vcpu) <= required_cpl)
return true;
kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
return false;
@@ -917,7 +909,7 @@ static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
return false;
- return static_call(kvm_x86_is_valid_cr0)(vcpu, cr0);
+ return kvm_x86_call(is_valid_cr0)(vcpu, cr0);
}
void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
@@ -954,11 +946,6 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon
if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
kvm_mmu_reset_context(vcpu);
-
- if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
- kvm_mmu_honors_guest_mtrrs(vcpu->kvm) &&
- !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
- kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
}
EXPORT_SYMBOL_GPL(kvm_post_set_cr0);
@@ -981,7 +968,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if (!is_pae(vcpu))
return 1;
- static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
+ kvm_x86_call(get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
if (cs_l)
return 1;
}
@@ -995,7 +982,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
(is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)))
return 1;
- static_call(kvm_x86_set_cr0)(vcpu, cr0);
+ kvm_x86_call(set_cr0)(vcpu, cr0);
kvm_post_set_cr0(vcpu, old_cr0, cr0);
@@ -1016,11 +1003,11 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
- if (vcpu->arch.xcr0 != host_xcr0)
+ if (vcpu->arch.xcr0 != kvm_host.xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
- vcpu->arch.ia32_xss != host_xss)
+ vcpu->arch.ia32_xss != kvm_host.xss)
wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
}
@@ -1047,12 +1034,12 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
- if (vcpu->arch.xcr0 != host_xcr0)
- xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
+ if (vcpu->arch.xcr0 != kvm_host.xcr0)
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
- vcpu->arch.ia32_xss != host_xss)
- wrmsrl(MSR_IA32_XSS, host_xss);
+ vcpu->arch.ia32_xss != kvm_host.xss)
+ wrmsrl(MSR_IA32_XSS, kvm_host.xss);
}
}
@@ -1113,7 +1100,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
{
/* Note, #UD due to CR4.OSXSAVE=0 has priority over the intercept. */
- if (static_call(kvm_x86_get_cpl)(vcpu) != 0 ||
+ if (kvm_x86_call(get_cpl)(vcpu) != 0 ||
__kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) {
kvm_inject_gp(vcpu, 0);
return 1;
@@ -1138,7 +1125,7 @@ EXPORT_SYMBOL_GPL(__kvm_is_valid_cr4);
static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
return __kvm_is_valid_cr4(vcpu, cr4) &&
- static_call(kvm_x86_is_valid_cr4)(vcpu, cr4);
+ kvm_x86_call(is_valid_cr4)(vcpu, cr4);
}
void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
@@ -1206,7 +1193,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
return 1;
}
- static_call(kvm_x86_set_cr4)(vcpu, cr4);
+ kvm_x86_call(set_cr4)(vcpu, cr4);
kvm_post_set_cr4(vcpu, old_cr4, cr4);
@@ -1345,7 +1332,7 @@ void kvm_update_dr7(struct kvm_vcpu *vcpu)
dr7 = vcpu->arch.guest_debug_dr7;
else
dr7 = vcpu->arch.dr7;
- static_call(kvm_x86_set_dr7)(vcpu, dr7);
+ kvm_x86_call(set_dr7)(vcpu, dr7);
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
if (dr7 & DR7_BP_EN_MASK)
vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
@@ -1461,10 +1448,10 @@ static const u32 msrs_to_save_pmu[] = {
MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
- MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
+ MSR_CORE_PERF_GLOBAL_CTRL,
MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
- /* This part of MSRs should match KVM_INTEL_PMC_MAX_GENERIC. */
+ /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. */
MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
@@ -1477,7 +1464,7 @@ static const u32 msrs_to_save_pmu[] = {
MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
- /* This part of MSRs should match KVM_AMD_PMC_MAX_GENERIC. */
+ /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. */
MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
@@ -1619,7 +1606,7 @@ static bool kvm_is_immutable_feature_msr(u32 msr)
static u64 kvm_get_arch_capabilities(void)
{
- u64 data = host_arch_capabilities & KVM_SUPPORTED_ARCH_CAP;
+ u64 data = kvm_host.arch_capabilities & KVM_SUPPORTED_ARCH_CAP;
/*
* If nx_huge_pages is enabled, KVM's shadow paging will ensure that
@@ -1688,7 +1675,7 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
rdmsrl_safe(msr->index, &msr->data);
break;
default:
- return static_call(kvm_x86_get_msr_feature)(msr);
+ return kvm_x86_call(get_msr_feature)(msr);
}
return 0;
}
@@ -1762,7 +1749,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
efer &= ~EFER_LMA;
efer |= vcpu->arch.efer & EFER_LMA;
- r = static_call(kvm_x86_set_efer)(vcpu, efer);
+ r = kvm_x86_call(set_efer)(vcpu, efer);
if (r) {
WARN_ON(r > 0);
return r;
@@ -1877,11 +1864,11 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
* incomplete and conflicting architectural behavior. Current
* AMD CPUs completely ignore bits 63:32, i.e. they aren't
* reserved and always read as zeros. Enforce Intel's reserved
- * bits check if and only if the guest CPU is Intel, and clear
- * the bits in all other cases. This ensures cross-vendor
- * migration will provide consistent behavior for the guest.
+ * bits check if the guest CPU is Intel compatible, otherwise
+ * clear the bits. This ensures cross-vendor migration will
+ * provide consistent behavior for the guest.
*/
- if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0)
+ if (guest_cpuid_is_intel_compatible(vcpu) && (data >> 32) != 0)
return 1;
data = (u32)data;
@@ -1892,7 +1879,7 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
msr.index = index;
msr.host_initiated = host_initiated;
- return static_call(kvm_x86_set_msr)(vcpu, &msr);
+ return kvm_x86_call(set_msr)(vcpu, &msr);
}
static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
@@ -1934,7 +1921,7 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
msr.index = index;
msr.host_initiated = host_initiated;
- ret = static_call(kvm_x86_get_msr)(vcpu, &msr);
+ ret = kvm_x86_call(get_msr)(vcpu, &msr);
if (!ret)
*data = msr.data;
return ret;
@@ -2002,7 +1989,7 @@ static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
static int complete_fast_msr_access(struct kvm_vcpu *vcpu)
{
- return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error);
+ return kvm_x86_call(complete_emulated_msr)(vcpu, vcpu->run->msr.error);
}
static int complete_fast_rdmsr(struct kvm_vcpu *vcpu)
@@ -2066,7 +2053,7 @@ int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
trace_kvm_msr_read_ex(ecx);
}
- return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
+ return kvm_x86_call(complete_emulated_msr)(vcpu, r);
}
EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
@@ -2091,7 +2078,7 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
trace_kvm_msr_write_ex(ecx, data);
}
- return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
+ return kvm_x86_call(complete_emulated_msr)(vcpu, r);
}
EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
@@ -2616,12 +2603,12 @@ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
if (is_guest_mode(vcpu))
vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
l1_offset,
- static_call(kvm_x86_get_l2_tsc_offset)(vcpu),
- static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
+ kvm_x86_call(get_l2_tsc_offset)(vcpu),
+ kvm_x86_call(get_l2_tsc_multiplier)(vcpu));
else
vcpu->arch.tsc_offset = l1_offset;
- static_call(kvm_x86_write_tsc_offset)(vcpu);
+ kvm_x86_call(write_tsc_offset)(vcpu);
}
static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
@@ -2632,12 +2619,12 @@ static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multipli
if (is_guest_mode(vcpu))
vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
l1_multiplier,
- static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
+ kvm_x86_call(get_l2_tsc_multiplier)(vcpu));
else
vcpu->arch.tsc_scaling_ratio = l1_multiplier;
if (kvm_caps.has_tsc_control)
- static_call(kvm_x86_write_tsc_multiplier)(vcpu);
+ kvm_x86_call(write_tsc_multiplier)(vcpu);
}
static inline bool kvm_check_tsc_unstable(void)
@@ -3610,7 +3597,7 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- static_call(kvm_x86_flush_tlb_all)(vcpu);
+ kvm_x86_call(flush_tlb_all)(vcpu);
/* Flushing all ASIDs flushes the current ASID... */
kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
@@ -3631,7 +3618,7 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
kvm_mmu_sync_prev_roots(vcpu);
}
- static_call(kvm_x86_flush_tlb_guest)(vcpu);
+ kvm_x86_call(flush_tlb_guest)(vcpu);
/*
* Flushing all "guest" TLB is always a superset of Hyper-V's fine
@@ -3644,7 +3631,7 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- static_call(kvm_x86_flush_tlb_current)(vcpu);
+ kvm_x86_call(flush_tlb_current)(vcpu);
}
/*
@@ -4703,8 +4690,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
case KVM_CAP_IRQFD_RESAMPLE:
case KVM_CAP_MEMORY_FAULT_INFO:
+ case KVM_CAP_X86_GUEST_MODE:
r = 1;
break;
+ case KVM_CAP_PRE_FAULT_MEMORY:
+ r = tdp_enabled;
+ break;
+ case KVM_CAP_X86_APIC_BUS_CYCLES_NS:
+ r = APIC_BUS_CYCLE_NS_DEFAULT;
+ break;
case KVM_CAP_EXIT_HYPERCALL:
r = KVM_EXIT_HYPERCALL_VALID_MASK;
break;
@@ -4753,7 +4747,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
* fringe case that is not enabled except via specific settings
* of the module parameters.
*/
- r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE);
+ r = kvm_x86_call(has_emulated_msr)(kvm, MSR_IA32_SMBASE);
break;
case KVM_CAP_NR_VCPUS:
r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS);
@@ -4833,7 +4827,7 @@ static int __kvm_x86_dev_get_attr(struct kvm_device_attr *attr, u64 *val)
{
if (attr->group) {
if (kvm_x86_ops.dev_get_attr)
- return static_call(kvm_x86_dev_get_attr)(attr->group, attr->attr, val);
+ return kvm_x86_call(dev_get_attr)(attr->group, attr->attr, val);
return -ENXIO;
}
@@ -4995,16 +4989,25 @@ static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ vcpu->arch.l1tf_flush_l1d = true;
+
+ if (vcpu->scheduled_out && pmu->version && pmu->event_count) {
+ pmu->need_cleanup = true;
+ kvm_make_request(KVM_REQ_PMU, vcpu);
+ }
+
/* Address WBINVD may be executed by guest */
if (need_emulate_wbinvd(vcpu)) {
- if (static_call(kvm_x86_has_wbinvd_exit)())
+ if (kvm_x86_call(has_wbinvd_exit)())
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
smp_call_function_single(vcpu->cpu,
wbinvd_ipi, NULL, 1);
}
- static_call(kvm_x86_vcpu_load)(vcpu, cpu);
+ kvm_x86_call(vcpu_load)(vcpu, cpu);
/* Save host pkru register if supported */
vcpu->arch.host_pkru = read_pkru();
@@ -5112,14 +5115,14 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
srcu_read_unlock(&vcpu->kvm->srcu, idx);
}
- static_call(kvm_x86_vcpu_put)(vcpu);
+ kvm_x86_call(vcpu_put)(vcpu);
vcpu->arch.last_host_tsc = rdtsc();
}
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
+ kvm_x86_call(sync_pir_to_irr)(vcpu);
return kvm_apic_get_state(vcpu, s);
}
@@ -5236,7 +5239,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
kvm_apic_after_set_mcg_cap(vcpu);
- static_call(kvm_x86_setup_mce)(vcpu);
+ kvm_x86_call(setup_mce)(vcpu);
out:
return r;
}
@@ -5396,11 +5399,11 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
events->interrupt.injected =
vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
events->interrupt.nr = vcpu->arch.interrupt.nr;
- events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+ events->interrupt.shadow = kvm_x86_call(get_interrupt_shadow)(vcpu);
events->nmi.injected = vcpu->arch.nmi_injected;
events->nmi.pending = kvm_get_nr_pending_nmis(vcpu);
- events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu);
+ events->nmi.masked = kvm_x86_call(get_nmi_mask)(vcpu);
/* events->sipi_vector is never valid when reporting to user space */
@@ -5482,8 +5485,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.interrupt.nr = events->interrupt.nr;
vcpu->arch.interrupt.soft = events->interrupt.soft;
if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
- static_call(kvm_x86_set_interrupt_shadow)(vcpu,
- events->interrupt.shadow);
+ kvm_x86_call(set_interrupt_shadow)(vcpu,
+ events->interrupt.shadow);
vcpu->arch.nmi_injected = events->nmi.injected;
if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
@@ -5492,7 +5495,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
if (events->nmi.pending)
kvm_make_request(KVM_REQ_NMI, vcpu);
}
- static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
+ kvm_x86_call(set_nmi_mask)(vcpu, events->nmi.masked);
if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
lapic_in_kernel(vcpu))
@@ -5840,7 +5843,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
if (!kvm_x86_ops.enable_l2_tlb_flush)
return -ENOTTY;
- return static_call(kvm_x86_enable_l2_tlb_flush)(vcpu);
+ return kvm_x86_call(enable_l2_tlb_flush)(vcpu);
case KVM_CAP_HYPERV_ENFORCE_CPUID:
return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]);
@@ -5879,8 +5882,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EINVAL;
if (!lapic_in_kernel(vcpu))
goto out;
- u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
- GFP_KERNEL_ACCOUNT);
+ u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
r = -ENOMEM;
if (!u.lapic)
@@ -6073,7 +6075,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave))
break;
- u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
+ u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
r = -ENOMEM;
if (!u.xsave)
break;
@@ -6104,7 +6106,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
case KVM_GET_XSAVE2: {
int size = vcpu->arch.guest_fpu.uabi_size;
- u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT);
+ u.xsave = kzalloc(size, GFP_KERNEL);
r = -ENOMEM;
if (!u.xsave)
break;
@@ -6122,7 +6124,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
case KVM_GET_XCRS: {
- u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
+ u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
r = -ENOMEM;
if (!u.xcrs)
break;
@@ -6330,14 +6332,14 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
if (addr > (unsigned int)(-3 * PAGE_SIZE))
return -EINVAL;
- ret = static_call(kvm_x86_set_tss_addr)(kvm, addr);
+ ret = kvm_x86_call(set_tss_addr)(kvm, addr);
return ret;
}
static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
u64 ident_addr)
{
- return static_call(kvm_x86_set_identity_map_addr)(kvm, ident_addr);
+ return kvm_x86_call(set_identity_map_addr)(kvm, ident_addr);
}
static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
@@ -6543,9 +6545,6 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
goto split_irqchip_unlock;
if (kvm->created_vcpus)
goto split_irqchip_unlock;
- r = kvm_setup_empty_irq_routing(kvm);
- if (r)
- goto split_irqchip_unlock;
/* Pairs with irqchip_in_kernel. */
smp_wmb();
kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
@@ -6650,14 +6649,14 @@ split_irqchip_unlock:
if (!kvm_x86_ops.vm_copy_enc_context_from)
break;
- r = static_call(kvm_x86_vm_copy_enc_context_from)(kvm, cap->args[0]);
+ r = kvm_x86_call(vm_copy_enc_context_from)(kvm, cap->args[0]);
break;
case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
r = -EINVAL;
if (!kvm_x86_ops.vm_move_enc_context_from)
break;
- r = static_call(kvm_x86_vm_move_enc_context_from)(kvm, cap->args[0]);
+ r = kvm_x86_call(vm_move_enc_context_from)(kvm, cap->args[0]);
break;
case KVM_CAP_EXIT_HYPERCALL:
if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) {
@@ -6692,7 +6691,9 @@ split_irqchip_unlock:
break;
mutex_lock(&kvm->lock);
- if (kvm->arch.max_vcpu_ids == cap->args[0]) {
+ if (kvm->arch.bsp_vcpu_id > cap->args[0]) {
+ ;
+ } else if (kvm->arch.max_vcpu_ids == cap->args[0]) {
r = 0;
} else if (!kvm->arch.max_vcpu_ids) {
kvm->arch.max_vcpu_ids = cap->args[0];
@@ -6745,6 +6746,30 @@ split_irqchip_unlock:
}
mutex_unlock(&kvm->lock);
break;
+ case KVM_CAP_X86_APIC_BUS_CYCLES_NS: {
+ u64 bus_cycle_ns = cap->args[0];
+ u64 unused;
+
+ /*
+ * Guard against overflow in tmict_to_ns(). 128 is the highest
+ * divide value that can be programmed in APIC_TDCR.
+ */
+ r = -EINVAL;
+ if (!bus_cycle_ns ||
+ check_mul_overflow((u64)U32_MAX * 128, bus_cycle_ns, &unused))
+ break;
+
+ r = 0;
+ mutex_lock(&kvm->lock);
+ if (!irqchip_in_kernel(kvm))
+ r = -ENXIO;
+ else if (kvm->created_vcpus)
+ r = -EINVAL;
+ else
+ kvm->arch.apic_bus_cycle_ns = bus_cycle_ns;
+ mutex_unlock(&kvm->lock);
+ break;
+ }
default:
r = -EINVAL;
break;
@@ -7213,6 +7238,9 @@ set_pit2_out:
mutex_lock(&kvm->lock);
if (kvm->created_vcpus)
r = -EBUSY;
+ else if (arg > KVM_MAX_VCPU_IDS ||
+ (kvm->arch.max_vcpu_ids && arg > kvm->arch.max_vcpu_ids))
+ r = -EINVAL;
else
kvm->arch.bsp_vcpu_id = arg;
mutex_unlock(&kvm->lock);
@@ -7289,7 +7317,7 @@ set_pit2_out:
if (!kvm_x86_ops.mem_enc_ioctl)
goto out;
- r = static_call(kvm_x86_mem_enc_ioctl)(kvm, argp);
+ r = kvm_x86_call(mem_enc_ioctl)(kvm, argp);
break;
}
case KVM_MEMORY_ENCRYPT_REG_REGION: {
@@ -7303,7 +7331,7 @@ set_pit2_out:
if (!kvm_x86_ops.mem_enc_register_region)
goto out;
- r = static_call(kvm_x86_mem_enc_register_region)(kvm, &region);
+ r = kvm_x86_call(mem_enc_register_region)(kvm, &region);
break;
}
case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
@@ -7317,7 +7345,7 @@ set_pit2_out:
if (!kvm_x86_ops.mem_enc_unregister_region)
goto out;
- r = static_call(kvm_x86_mem_enc_unregister_region)(kvm, &region);
+ r = kvm_x86_call(mem_enc_unregister_region)(kvm, &region);
break;
}
#ifdef CONFIG_KVM_HYPERV
@@ -7411,17 +7439,20 @@ static void kvm_probe_msr_to_save(u32 msr_index)
intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2))
return;
break;
- case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX:
+ case MSR_ARCH_PERFMON_PERFCTR0 ...
+ MSR_ARCH_PERFMON_PERFCTR0 + KVM_MAX_NR_GP_COUNTERS - 1:
if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >=
kvm_pmu_cap.num_counters_gp)
return;
break;
- case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX:
+ case MSR_ARCH_PERFMON_EVENTSEL0 ...
+ MSR_ARCH_PERFMON_EVENTSEL0 + KVM_MAX_NR_GP_COUNTERS - 1:
if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >=
kvm_pmu_cap.num_counters_gp)
return;
break;
- case MSR_ARCH_PERFMON_FIXED_CTR0 ... MSR_ARCH_PERFMON_FIXED_CTR_MAX:
+ case MSR_ARCH_PERFMON_FIXED_CTR0 ...
+ MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_MAX_NR_FIXED_COUNTERS - 1:
if (msr_index - MSR_ARCH_PERFMON_FIXED_CTR0 >=
kvm_pmu_cap.num_counters_fixed)
return;
@@ -7452,7 +7483,7 @@ static void kvm_init_msr_lists(void)
{
unsigned i;
- BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3,
+ BUILD_BUG_ON_MSG(KVM_MAX_NR_FIXED_COUNTERS != 3,
"Please update the fixed PMCs in msrs_to_save_pmu[]");
num_msrs_to_save = 0;
@@ -7468,7 +7499,8 @@ static void kvm_init_msr_lists(void)
}
for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
- if (!static_call(kvm_x86_has_emulated_msr)(NULL, emulated_msrs_all[i]))
+ if (!kvm_x86_call(has_emulated_msr)(NULL,
+ emulated_msrs_all[i]))
continue;
emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
@@ -7527,13 +7559,13 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
void kvm_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
- static_call(kvm_x86_set_segment)(vcpu, var, seg);
+ kvm_x86_call(set_segment)(vcpu, var, seg);
}
void kvm_get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
- static_call(kvm_x86_get_segment)(vcpu, var, seg);
+ kvm_x86_call(get_segment)(vcpu, var, seg);
}
gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access,
@@ -7556,7 +7588,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
}
EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
@@ -7566,7 +7598,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_WRITE_MASK;
return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
}
@@ -7619,7 +7651,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
unsigned offset;
int ret;
@@ -7644,7 +7676,7 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception)
{
- u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
/*
* FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
@@ -7667,7 +7699,7 @@ static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
if (system)
access |= PFERR_IMPLICIT_ACCESS;
- else if (static_call(kvm_x86_get_cpl)(vcpu) == 3)
+ else if (kvm_x86_call(get_cpl)(vcpu) == 3)
access |= PFERR_USER_MASK;
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
@@ -7712,7 +7744,7 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v
if (system)
access |= PFERR_IMPLICIT_ACCESS;
- else if (static_call(kvm_x86_get_cpl)(vcpu) == 3)
+ else if (kvm_x86_call(get_cpl)(vcpu) == 3)
access |= PFERR_USER_MASK;
return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
@@ -7733,8 +7765,8 @@ EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
static int kvm_check_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
void *insn, int insn_len)
{
- return static_call(kvm_x86_check_emulate_instruction)(vcpu, emul_type,
- insn, insn_len);
+ return kvm_x86_call(check_emulate_instruction)(vcpu, emul_type,
+ insn, insn_len);
}
int handle_ud(struct kvm_vcpu *vcpu)
@@ -7784,8 +7816,8 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
bool write)
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- u64 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0)
- | (write ? PFERR_WRITE_MASK : 0);
+ u64 access = ((kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0)
+ | (write ? PFERR_WRITE_MASK : 0);
/*
* currently PKRU is only applied to ept enabled guest so
@@ -8211,7 +8243,7 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
- return static_call(kvm_x86_get_segment_base)(vcpu, seg);
+ return kvm_x86_call(get_segment_base)(vcpu, seg);
}
static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
@@ -8224,7 +8256,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
if (!need_emulate_wbinvd(vcpu))
return X86EMUL_CONTINUE;
- if (static_call(kvm_x86_has_wbinvd_exit)()) {
+ if (kvm_x86_call(has_wbinvd_exit)()) {
int cpu = get_cpu();
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
@@ -8328,27 +8360,27 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
{
- return static_call(kvm_x86_get_cpl)(emul_to_vcpu(ctxt));
+ return kvm_x86_call(get_cpl)(emul_to_vcpu(ctxt));
}
static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- static_call(kvm_x86_get_gdt)(emul_to_vcpu(ctxt), dt);
+ kvm_x86_call(get_gdt)(emul_to_vcpu(ctxt), dt);
}
static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- static_call(kvm_x86_get_idt)(emul_to_vcpu(ctxt), dt);
+ kvm_x86_call(get_idt)(emul_to_vcpu(ctxt), dt);
}
static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- static_call(kvm_x86_set_gdt)(emul_to_vcpu(ctxt), dt);
+ kvm_x86_call(set_gdt)(emul_to_vcpu(ctxt), dt);
}
static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- static_call(kvm_x86_set_idt)(emul_to_vcpu(ctxt), dt);
+ kvm_x86_call(set_idt)(emul_to_vcpu(ctxt), dt);
}
static unsigned long emulator_get_cached_segment_base(
@@ -8495,8 +8527,8 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
struct x86_instruction_info *info,
enum x86_intercept_stage stage)
{
- return static_call(kvm_x86_check_intercept)(emul_to_vcpu(ctxt), info, stage,
- &ctxt->exception);
+ return kvm_x86_call(check_intercept)(emul_to_vcpu(ctxt), info, stage,
+ &ctxt->exception);
}
static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
@@ -8521,6 +8553,11 @@ static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt)
return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID);
}
+static bool emulator_guest_cpuid_is_intel_compatible(struct x86_emulate_ctxt *ctxt)
+{
+ return guest_cpuid_is_intel_compatible(emul_to_vcpu(ctxt));
+}
+
static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
{
return kvm_register_read_raw(emul_to_vcpu(ctxt), reg);
@@ -8533,7 +8570,7 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon
static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
{
- static_call(kvm_x86_set_nmi_mask)(emul_to_vcpu(ctxt), masked);
+ kvm_x86_call(set_nmi_mask)(emul_to_vcpu(ctxt), masked);
}
static bool emulator_is_smm(struct x86_emulate_ctxt *ctxt)
@@ -8578,7 +8615,8 @@ static gva_t emulator_get_untagged_addr(struct x86_emulate_ctxt *ctxt,
if (!kvm_x86_ops.get_untagged_addr)
return addr;
- return static_call(kvm_x86_get_untagged_addr)(emul_to_vcpu(ctxt), addr, flags);
+ return kvm_x86_call(get_untagged_addr)(emul_to_vcpu(ctxt),
+ addr, flags);
}
static const struct x86_emulate_ops emulate_ops = {
@@ -8619,6 +8657,7 @@ static const struct x86_emulate_ops emulate_ops = {
.guest_has_movbe = emulator_guest_has_movbe,
.guest_has_fxsr = emulator_guest_has_fxsr,
.guest_has_rdpid = emulator_guest_has_rdpid,
+ .guest_cpuid_is_intel_compatible = emulator_guest_cpuid_is_intel_compatible,
.set_nmi_mask = emulator_set_nmi_mask,
.is_smm = emulator_is_smm,
.is_guest_mode = emulator_is_guest_mode,
@@ -8630,7 +8669,7 @@ static const struct x86_emulate_ops emulate_ops = {
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
{
- u32 int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+ u32 int_shadow = kvm_x86_call(get_interrupt_shadow)(vcpu);
/*
* an sti; sti; sequence only disable interrupts for the first
* instruction. So, if the last instruction, be it emulated or
@@ -8641,7 +8680,7 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
if (int_shadow & mask)
mask = 0;
if (unlikely(int_shadow || mask)) {
- static_call(kvm_x86_set_interrupt_shadow)(vcpu, mask);
+ kvm_x86_call(set_interrupt_shadow)(vcpu, mask);
if (!mask)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
@@ -8682,7 +8721,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
int cs_db, cs_l;
- static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
+ kvm_x86_call(get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
ctxt->gpa_available = false;
ctxt->eflags = kvm_get_rflags(vcpu);
@@ -8738,9 +8777,8 @@ static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
*/
memset(&info, 0, sizeof(info));
- static_call(kvm_x86_get_exit_info)(vcpu, (u32 *)&info[0], &info[1],
- &info[2], (u32 *)&info[3],
- (u32 *)&info[4]);
+ kvm_x86_call(get_exit_info)(vcpu, (u32 *)&info[0], &info[1], &info[2],
+ (u32 *)&info[3], (u32 *)&info[4]);
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
@@ -8817,7 +8855,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
kvm_queue_exception(vcpu, UD_VECTOR);
- if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) {
+ if (!is_guest_mode(vcpu) && kvm_x86_call(get_cpl)(vcpu) == 0) {
prepare_emulation_ctxt_failure_exit(vcpu);
return 0;
}
@@ -8975,10 +9013,10 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
- unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
+ unsigned long rflags = kvm_x86_call(get_rflags)(vcpu);
int r;
- r = static_call(kvm_x86_skip_emulated_instruction)(vcpu);
+ r = kvm_x86_call(skip_emulated_instruction)(vcpu);
if (unlikely(!r))
return 0;
@@ -9000,19 +9038,17 @@ EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu)
{
- u32 shadow;
-
if (kvm_get_rflags(vcpu) & X86_EFLAGS_RF)
return true;
/*
- * Intel CPUs inhibit code #DBs when MOV/POP SS blocking is active,
- * but AMD CPUs do not. MOV/POP SS blocking is rare, check that first
- * to avoid the relatively expensive CPUID lookup.
+ * Intel compatible CPUs inhibit code #DBs when MOV/POP SS blocking is
+ * active, but AMD compatible CPUs do not.
*/
- shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
- return (shadow & KVM_X86_SHADOW_INT_MOV_SS) &&
- guest_cpuid_is_intel(vcpu);
+ if (!guest_cpuid_is_intel_compatible(vcpu))
+ return false;
+
+ return kvm_x86_call(get_interrupt_shadow)(vcpu) & KVM_X86_SHADOW_INT_MOV_SS;
}
static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu,
@@ -9284,7 +9320,7 @@ restart:
writeback:
if (writeback) {
- unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
+ unsigned long rflags = kvm_x86_call(get_rflags)(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
@@ -9301,7 +9337,7 @@ writeback:
kvm_rip_write(vcpu, ctxt->eip);
if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
r = kvm_vcpu_do_singlestep(vcpu);
- static_call_cond(kvm_x86_update_emulated_instruction)(vcpu);
+ kvm_x86_call(update_emulated_instruction)(vcpu);
__kvm_set_rflags(vcpu, ctxt->eflags);
}
@@ -9700,7 +9736,7 @@ static int kvm_x86_check_processor_compatibility(void)
__cr4_reserved_bits(cpu_has, &boot_cpu_data))
return -EIO;
- return static_call(kvm_x86_check_processor_compatibility)();
+ return kvm_x86_call(check_processor_compatibility)();
}
static void kvm_x86_check_cpu_compat(void *ret)
@@ -9772,19 +9808,19 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
kvm_caps.supported_mce_cap = MCG_CTL_P | MCG_SER_P;
if (boot_cpu_has(X86_FEATURE_XSAVE)) {
- host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
- kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
+ kvm_host.xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+ kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0;
}
- rdmsrl_safe(MSR_EFER, &host_efer);
+ rdmsrl_safe(MSR_EFER, &kvm_host.efer);
if (boot_cpu_has(X86_FEATURE_XSAVES))
- rdmsrl(MSR_IA32_XSS, host_xss);
+ rdmsrl(MSR_IA32_XSS, kvm_host.xss);
kvm_init_pmu_capability(ops->pmu_ops);
if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, host_arch_capabilities);
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, kvm_host.arch_capabilities);
r = ops->hardware_setup();
if (r != 0)
@@ -9843,7 +9879,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
out_unwind_ops:
kvm_x86_ops.hardware_enable = NULL;
- static_call(kvm_x86_hardware_unsetup)();
+ kvm_x86_call(hardware_unsetup)();
out_mmu_exit:
kvm_mmu_vendor_module_exit();
out_free_percpu:
@@ -9874,7 +9910,7 @@ void kvm_x86_vendor_exit(void)
irq_work_sync(&pvclock_irq_work);
cancel_work_sync(&pvclock_gtod_work);
#endif
- static_call(kvm_x86_hardware_unsetup)();
+ kvm_x86_call(hardware_unsetup)();
kvm_mmu_vendor_module_exit();
free_percpu(user_return_msrs);
kmem_cache_destroy(x86_emulator_cache);
@@ -10000,7 +10036,8 @@ EXPORT_SYMBOL_GPL(kvm_apicv_activated);
bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu)
{
ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons);
- ulong vcpu_reasons = static_call(kvm_x86_vcpu_get_apicv_inhibit_reasons)(vcpu);
+ ulong vcpu_reasons =
+ kvm_x86_call(vcpu_get_apicv_inhibit_reasons)(vcpu);
return (vm_reasons | vcpu_reasons) == 0;
}
@@ -10009,6 +10046,10 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_apicv_activated);
static void set_or_clear_apicv_inhibit(unsigned long *inhibits,
enum kvm_apicv_inhibit reason, bool set)
{
+ const struct trace_print_flags apicv_inhibits[] = { APICV_INHIBIT_REASONS };
+
+ BUILD_BUG_ON(ARRAY_SIZE(apicv_inhibits) != NR_APICV_INHIBIT_REASONS);
+
if (set)
__set_bit(reason, inhibits);
else
@@ -10020,7 +10061,7 @@ static void set_or_clear_apicv_inhibit(unsigned long *inhibits,
static void kvm_apicv_init(struct kvm *kvm)
{
enum kvm_apicv_inhibit reason = enable_apicv ? APICV_INHIBIT_REASON_ABSENT :
- APICV_INHIBIT_REASON_DISABLE;
+ APICV_INHIBIT_REASON_DISABLED;
set_or_clear_apicv_inhibit(&kvm->arch.apicv_inhibit_reasons, reason, true);
@@ -10182,7 +10223,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
a2 = kvm_rdx_read(vcpu);
a3 = kvm_rsi_read(vcpu);
op_64_bit = is_64_bit_hypercall(vcpu);
- cpl = static_call(kvm_x86_get_cpl)(vcpu);
+ cpl = kvm_x86_call(get_cpl)(vcpu);
ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl);
if (nr == KVM_HC_MAP_GPA_RANGE && !ret)
@@ -10214,7 +10255,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
return X86EMUL_PROPAGATE_FAULT;
}
- static_call(kvm_x86_patch_hypercall)(vcpu, instruction);
+ kvm_x86_call(patch_hypercall)(vcpu, instruction);
return emulator_write_emulated(ctxt, rip, instruction, 3,
&ctxt->exception);
@@ -10231,7 +10272,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
{
struct kvm_run *kvm_run = vcpu->run;
- kvm_run->if_flag = static_call(kvm_x86_get_if_flag)(vcpu);
+ kvm_run->if_flag = kvm_x86_call(get_if_flag)(vcpu);
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
@@ -10241,6 +10282,8 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
if (is_smm(vcpu))
kvm_run->flags |= KVM_RUN_X86_SMM;
+ if (is_guest_mode(vcpu))
+ kvm_run->flags |= KVM_RUN_X86_GUEST_MODE;
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@ -10266,7 +10309,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
tpr = kvm_lapic_get_cr8(vcpu);
- static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr);
+ kvm_x86_call(update_cr8_intercept)(vcpu, tpr, max_irr);
}
@@ -10296,7 +10339,7 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu)
vcpu->arch.exception.error_code,
vcpu->arch.exception.injected);
- static_call(kvm_x86_inject_exception)(vcpu);
+ kvm_x86_call(inject_exception)(vcpu);
}
/*
@@ -10382,9 +10425,9 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
else if (kvm_is_exception_pending(vcpu))
; /* see above */
else if (vcpu->arch.nmi_injected)
- static_call(kvm_x86_inject_nmi)(vcpu);
+ kvm_x86_call(inject_nmi)(vcpu);
else if (vcpu->arch.interrupt.injected)
- static_call(kvm_x86_inject_irq)(vcpu, true);
+ kvm_x86_call(inject_irq)(vcpu, true);
/*
* Exceptions that morph to VM-Exits are handled above, and pending
@@ -10469,7 +10512,8 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
*/
#ifdef CONFIG_KVM_SMM
if (vcpu->arch.smi_pending) {
- r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY;
+ r = can_inject ? kvm_x86_call(smi_allowed)(vcpu, true) :
+ -EBUSY;
if (r < 0)
goto out;
if (r) {
@@ -10478,27 +10522,29 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
enter_smm(vcpu);
can_inject = false;
} else
- static_call(kvm_x86_enable_smi_window)(vcpu);
+ kvm_x86_call(enable_smi_window)(vcpu);
}
#endif
if (vcpu->arch.nmi_pending) {
- r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY;
+ r = can_inject ? kvm_x86_call(nmi_allowed)(vcpu, true) :
+ -EBUSY;
if (r < 0)
goto out;
if (r) {
--vcpu->arch.nmi_pending;
vcpu->arch.nmi_injected = true;
- static_call(kvm_x86_inject_nmi)(vcpu);
+ kvm_x86_call(inject_nmi)(vcpu);
can_inject = false;
- WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0);
+ WARN_ON(kvm_x86_call(nmi_allowed)(vcpu, true) < 0);
}
if (vcpu->arch.nmi_pending)
- static_call(kvm_x86_enable_nmi_window)(vcpu);
+ kvm_x86_call(enable_nmi_window)(vcpu);
}
if (kvm_cpu_has_injectable_intr(vcpu)) {
- r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY;
+ r = can_inject ? kvm_x86_call(interrupt_allowed)(vcpu, true) :
+ -EBUSY;
if (r < 0)
goto out;
if (r) {
@@ -10506,17 +10552,17 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
if (!WARN_ON_ONCE(irq == -1)) {
kvm_queue_interrupt(vcpu, irq, false);
- static_call(kvm_x86_inject_irq)(vcpu, false);
- WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
+ kvm_x86_call(inject_irq)(vcpu, false);
+ WARN_ON(kvm_x86_call(interrupt_allowed)(vcpu, true) < 0);
}
}
if (kvm_cpu_has_injectable_intr(vcpu))
- static_call(kvm_x86_enable_irq_window)(vcpu);
+ kvm_x86_call(enable_irq_window)(vcpu);
}
if (is_guest_mode(vcpu) &&
kvm_x86_ops.nested_ops->has_events &&
- kvm_x86_ops.nested_ops->has_events(vcpu))
+ kvm_x86_ops.nested_ops->has_events(vcpu, true))
*req_immediate_exit = true;
/*
@@ -10557,7 +10603,7 @@ static void process_nmi(struct kvm_vcpu *vcpu)
* blocks NMIs). KVM will immediately inject one of the two NMIs, and
* will request an NMI window to handle the second NMI.
*/
- if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
+ if (kvm_x86_call(get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
limit = 1;
else
limit = 2;
@@ -10566,14 +10612,14 @@ static void process_nmi(struct kvm_vcpu *vcpu)
* Adjust the limit to account for pending virtual NMIs, which aren't
* tracked in vcpu->arch.nmi_pending.
*/
- if (static_call(kvm_x86_is_vnmi_pending)(vcpu))
+ if (kvm_x86_call(is_vnmi_pending)(vcpu))
limit--;
vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
if (vcpu->arch.nmi_pending &&
- (static_call(kvm_x86_set_vnmi_pending)(vcpu)))
+ (kvm_x86_call(set_vnmi_pending)(vcpu)))
vcpu->arch.nmi_pending--;
if (vcpu->arch.nmi_pending)
@@ -10584,7 +10630,7 @@ static void process_nmi(struct kvm_vcpu *vcpu)
int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu)
{
return vcpu->arch.nmi_pending +
- static_call(kvm_x86_is_vnmi_pending)(vcpu);
+ kvm_x86_call(is_vnmi_pending)(vcpu);
}
void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
@@ -10618,7 +10664,7 @@ void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
apic->apicv_active = activate;
kvm_apic_update_apicv(vcpu);
- static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu);
+ kvm_x86_call(refresh_apicv_exec_ctrl)(vcpu);
/*
* When APICv gets disabled, we may still have injected interrupts
@@ -10718,7 +10764,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
- static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
+ kvm_x86_call(sync_pir_to_irr)(vcpu);
if (irqchip_split(vcpu->kvm))
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
@@ -10743,17 +10789,17 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
bitmap_or((ulong *)eoi_exit_bitmap,
vcpu->arch.ioapic_handled_vectors,
to_hv_synic(vcpu)->vec_bitmap, 256);
- static_call_cond(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
+ kvm_x86_call(load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
return;
}
#endif
- static_call_cond(kvm_x86_load_eoi_exitmap)(
+ kvm_x86_call(load_eoi_exitmap)(
vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors);
}
void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
{
- static_call_cond(kvm_x86_guest_memory_reclaimed)(kvm);
+ kvm_x86_call(guest_memory_reclaimed)(kvm);
}
static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
@@ -10761,7 +10807,7 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
if (!lapic_in_kernel(vcpu))
return;
- static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu);
+ kvm_x86_call(set_apic_access_page_addr)(vcpu);
}
/*
@@ -10925,10 +10971,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
kvm_check_async_pf_completion(vcpu);
if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
- static_call(kvm_x86_msr_filter_changed)(vcpu);
+ kvm_x86_call(msr_filter_changed)(vcpu);
if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
- static_call(kvm_x86_update_cpu_dirty_logging)(vcpu);
+ kvm_x86_call(update_cpu_dirty_logging)(vcpu);
+
+ if (kvm_check_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) {
+ kvm_vcpu_reset(vcpu, true);
+ if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) {
+ r = 1;
+ goto out;
+ }
+ }
}
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
@@ -10950,7 +11004,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto out;
}
if (req_int_win)
- static_call(kvm_x86_enable_irq_window)(vcpu);
+ kvm_x86_call(enable_irq_window)(vcpu);
if (kvm_lapic_enabled(vcpu)) {
update_cr8_intercept(vcpu);
@@ -10965,7 +11019,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
preempt_disable();
- static_call(kvm_x86_prepare_switch_to_guest)(vcpu);
+ kvm_x86_call(prepare_switch_to_guest)(vcpu);
/*
* Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
@@ -11001,7 +11055,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
* i.e. they can post interrupts even if APICv is temporarily disabled.
*/
if (kvm_lapic_enabled(vcpu))
- static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
+ kvm_x86_call(sync_pir_to_irr)(vcpu);
if (kvm_vcpu_exit_request(vcpu)) {
vcpu->mode = OUTSIDE_GUEST_MODE;
@@ -11045,12 +11099,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
(kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
- exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit);
+ exit_fastpath = kvm_x86_call(vcpu_run)(vcpu,
+ req_immediate_exit);
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;
if (kvm_lapic_enabled(vcpu))
- static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
+ kvm_x86_call(sync_pir_to_irr)(vcpu);
if (unlikely(kvm_vcpu_exit_request(vcpu))) {
exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
@@ -11069,7 +11124,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
*/
if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
- static_call(kvm_x86_sync_dirty_debug_regs)(vcpu);
+ kvm_x86_call(sync_dirty_debug_regs)(vcpu);
kvm_update_dr0123(vcpu);
kvm_update_dr7(vcpu);
}
@@ -11098,7 +11153,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (vcpu->arch.xfd_no_write_intercept)
fpu_sync_guest_vmexit_xfd_state();
- static_call(kvm_x86_handle_exit_irqoff)(vcpu);
+ kvm_x86_call(handle_exit_irqoff)(vcpu);
if (vcpu->arch.guest_fpu.xfd_err)
wrmsrl(MSR_IA32_XFD_ERR, 0);
@@ -11131,6 +11186,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_vcpu_srcu_read_lock(vcpu);
/*
+ * Call this to ensure WC buffers in guest are evicted after each VM
+ * Exit, so that the evicted WC writes can be snooped across all cpus
+ */
+ smp_mb__after_srcu_read_lock();
+
+ /*
* Profile KVM exit RIPs:
*/
if (unlikely(prof_on == KVM_PROFILING)) {
@@ -11144,13 +11205,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (vcpu->arch.apic_attention)
kvm_lapic_sync_from_vapic(vcpu);
- r = static_call(kvm_x86_handle_exit)(vcpu, exit_fastpath);
+ r = kvm_x86_call(handle_exit)(vcpu, exit_fastpath);
return r;
cancel_injection:
if (req_immediate_exit)
kvm_make_request(KVM_REQ_EVENT, vcpu);
- static_call(kvm_x86_cancel_injection)(vcpu);
+ kvm_x86_call(cancel_injection)(vcpu);
if (unlikely(vcpu->arch.apic_attention))
kvm_lapic_sync_from_vapic(vcpu);
out:
@@ -11200,7 +11261,10 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu)
* causes a spurious wakeup from HLT).
*/
if (is_guest_mode(vcpu)) {
- if (kvm_check_nested_events(vcpu) < 0)
+ int r = kvm_check_nested_events(vcpu);
+
+ WARN_ON_ONCE(r == -EBUSY);
+ if (r < 0)
return 0;
}
@@ -11237,7 +11301,6 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
int r;
vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
- vcpu->arch.l1tf_flush_l1d = true;
for (;;) {
/*
@@ -11387,7 +11450,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
kvm_vcpu_srcu_read_lock(vcpu);
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
- if (kvm_run->immediate_exit) {
+ if (!vcpu->wants_to_run) {
r = -EINTR;
goto out;
}
@@ -11465,12 +11528,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
WARN_ON_ONCE(vcpu->mmio_needed);
}
- if (kvm_run->immediate_exit) {
+ if (!vcpu->wants_to_run) {
r = -EINTR;
goto out;
}
- r = static_call(kvm_x86_vcpu_pre_run)(vcpu);
+ r = kvm_x86_call(vcpu_pre_run)(vcpu);
if (r <= 0)
goto out;
@@ -11598,10 +11661,10 @@ static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
- static_call(kvm_x86_get_idt)(vcpu, &dt);
+ kvm_x86_call(get_idt)(vcpu, &dt);
sregs->idt.limit = dt.size;
sregs->idt.base = dt.address;
- static_call(kvm_x86_get_gdt)(vcpu, &dt);
+ kvm_x86_call(get_gdt)(vcpu, &dt);
sregs->gdt.limit = dt.size;
sregs->gdt.base = dt.address;
@@ -11743,7 +11806,13 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
has_error_code, error_code);
- if (ret) {
+
+ /*
+ * Report an error userspace if MMIO is needed, as KVM doesn't support
+ * MMIO during a task switch (or any other complex operation).
+ */
+ if (ret || vcpu->mmio_needed) {
+ vcpu->mmio_needed = false;
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
@@ -11801,27 +11870,27 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
dt.size = sregs->idt.limit;
dt.address = sregs->idt.base;
- static_call(kvm_x86_set_idt)(vcpu, &dt);
+ kvm_x86_call(set_idt)(vcpu, &dt);
dt.size = sregs->gdt.limit;
dt.address = sregs->gdt.base;
- static_call(kvm_x86_set_gdt)(vcpu, &dt);
+ kvm_x86_call(set_gdt)(vcpu, &dt);
vcpu->arch.cr2 = sregs->cr2;
*mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
vcpu->arch.cr3 = sregs->cr3;
kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
- static_call_cond(kvm_x86_post_set_cr3)(vcpu, sregs->cr3);
+ kvm_x86_call(post_set_cr3)(vcpu, sregs->cr3);
kvm_set_cr8(vcpu, sregs->cr8);
*mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
- static_call(kvm_x86_set_efer)(vcpu, sregs->efer);
+ kvm_x86_call(set_efer)(vcpu, sregs->efer);
*mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
- static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
+ kvm_x86_call(set_cr0)(vcpu, sregs->cr0);
*mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
- static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
+ kvm_x86_call(set_cr4)(vcpu, sregs->cr4);
if (update_pdptrs) {
idx = srcu_read_lock(&vcpu->kvm->srcu);
@@ -11999,7 +12068,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
*/
kvm_set_rflags(vcpu, rflags);
- static_call(kvm_x86_update_exception_bitmap)(vcpu);
+ kvm_x86_call(update_exception_bitmap)(vcpu);
kvm_arch_vcpu_guestdbg_update_apicv_inhibit(vcpu->kvm);
@@ -12136,7 +12205,7 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
if (id >= kvm->arch.max_vcpu_ids)
return -EINVAL;
- return static_call(kvm_x86_vcpu_precreate)(kvm);
+ return kvm_x86_call(vcpu_precreate)(kvm);
}
int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
@@ -12207,14 +12276,13 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.hv_root_tdp = INVALID_PAGE;
#endif
- r = static_call(kvm_x86_vcpu_create)(vcpu);
+ r = kvm_x86_call(vcpu_create)(vcpu);
if (r)
goto free_guest_fpu;
vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
kvm_xen_init_vcpu(vcpu);
- kvm_vcpu_mtrr_init(vcpu);
vcpu_load(vcpu);
kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
kvm_vcpu_reset(vcpu, false);
@@ -12265,7 +12333,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
kvmclock_reset(vcpu);
- static_call(kvm_x86_vcpu_free)(vcpu);
+ kvm_x86_call(vcpu_free)(vcpu);
kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
@@ -12383,7 +12451,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1);
kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600);
- static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
+ kvm_x86_call(vcpu_reset)(vcpu, init_event);
kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
kvm_rip_write(vcpu, 0xfff0);
@@ -12402,10 +12470,10 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
else
new_cr0 |= X86_CR0_NW | X86_CR0_CD;
- static_call(kvm_x86_set_cr0)(vcpu, new_cr0);
- static_call(kvm_x86_set_cr4)(vcpu, 0);
- static_call(kvm_x86_set_efer)(vcpu, 0);
- static_call(kvm_x86_update_exception_bitmap)(vcpu);
+ kvm_x86_call(set_cr0)(vcpu, new_cr0);
+ kvm_x86_call(set_cr4)(vcpu, 0);
+ kvm_x86_call(set_efer)(vcpu, 0);
+ kvm_x86_call(update_exception_bitmap)(vcpu);
/*
* On the standard CR0/CR4/EFER modification paths, there are several
@@ -12462,7 +12530,7 @@ int kvm_arch_hardware_enable(void)
if (ret)
return ret;
- ret = static_call(kvm_x86_hardware_enable)();
+ ret = kvm_x86_call(hardware_enable)();
if (ret != 0)
return ret;
@@ -12544,7 +12612,7 @@ int kvm_arch_hardware_enable(void)
void kvm_arch_hardware_disable(void)
{
- static_call(kvm_x86_hardware_disable)();
+ kvm_x86_call(hardware_disable)();
drop_user_return_notifiers();
}
@@ -12558,18 +12626,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
}
-void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
-{
- struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
-
- vcpu->arch.l1tf_flush_l1d = true;
- if (pmu->version && unlikely(pmu->event_count)) {
- pmu->need_cleanup = true;
- kvm_make_request(KVM_REQ_PMU, vcpu);
- }
- static_call(kvm_x86_sched_in)(vcpu, cpu);
-}
-
void kvm_arch_free_vm(struct kvm *kvm)
{
#if IS_ENABLED(CONFIG_HYPERV)
@@ -12597,7 +12653,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm_mmu_init_vm(kvm);
- ret = static_call(kvm_x86_vm_init)(kvm);
+ ret = kvm_x86_call(vm_init)(kvm);
if (ret)
goto out_uninit_mmu;
@@ -12620,6 +12676,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz;
+ kvm->arch.apic_bus_cycle_ns = APIC_BUS_CYCLE_NS_DEFAULT;
kvm->arch.guest_can_read_msr_platform_info = true;
kvm->arch.enable_pmu = enable_pmu;
@@ -12771,7 +12828,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
mutex_unlock(&kvm->slots_lock);
}
kvm_unload_vcpu_mmus(kvm);
- static_call_cond(kvm_x86_vm_destroy)(kvm);
+ kvm_x86_call(vm_destroy)(kvm);
kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
kvm_pic_destroy(kvm);
kvm_ioapic_destroy(kvm);
@@ -13100,12 +13157,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
kvm_arch_free_memslot(kvm, old);
}
-static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
-{
- return (is_guest_mode(vcpu) &&
- static_call(kvm_x86_guest_apic_has_interrupt)(vcpu));
-}
-
static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
{
if (!list_empty_careful(&vcpu->async_pf.done))
@@ -13123,22 +13174,23 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
(vcpu->arch.nmi_pending &&
- static_call(kvm_x86_nmi_allowed)(vcpu, false)))
+ kvm_x86_call(nmi_allowed)(vcpu, false)))
return true;
#ifdef CONFIG_KVM_SMM
if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
(vcpu->arch.smi_pending &&
- static_call(kvm_x86_smi_allowed)(vcpu, false)))
+ kvm_x86_call(smi_allowed)(vcpu, false)))
return true;
#endif
if (kvm_test_request(KVM_REQ_PMI, vcpu))
return true;
- if (kvm_arch_interrupt_allowed(vcpu) &&
- (kvm_cpu_has_interrupt(vcpu) ||
- kvm_guest_apic_has_interrupt(vcpu)))
+ if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu))
+ return true;
+
+ if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu))
return true;
if (kvm_hv_has_stimer_pending(vcpu))
@@ -13146,7 +13198,7 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
if (is_guest_mode(vcpu) &&
kvm_x86_ops.nested_ops->has_events &&
- kvm_x86_ops.nested_ops->has_events(vcpu))
+ kvm_x86_ops.nested_ops->has_events(vcpu, false))
return true;
if (kvm_xen_has_pending_events(vcpu))
@@ -13163,7 +13215,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
{
return kvm_vcpu_apicv_active(vcpu) &&
- static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu);
+ kvm_x86_call(dy_apicv_has_pending_interrupt)(vcpu);
}
bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
@@ -13191,7 +13243,7 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
if (vcpu->arch.guest_state_protected)
return true;
- return static_call(kvm_x86_get_cpl)(vcpu) == 0;
+ return kvm_x86_call(get_cpl)(vcpu) == 0;
}
unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
@@ -13206,7 +13258,7 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
{
- return static_call(kvm_x86_interrupt_allowed)(vcpu, false);
+ return kvm_x86_call(interrupt_allowed)(vcpu, false);
}
unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
@@ -13232,7 +13284,7 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
{
unsigned long rflags;
- rflags = static_call(kvm_x86_get_rflags)(vcpu);
+ rflags = kvm_x86_call(get_rflags)(vcpu);
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
rflags &= ~X86_EFLAGS_TF;
return rflags;
@@ -13244,7 +13296,7 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
rflags |= X86_EFLAGS_TF;
- static_call(kvm_x86_set_rflags)(vcpu, rflags);
+ kvm_x86_call(set_rflags)(vcpu, rflags);
}
void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
@@ -13356,7 +13408,7 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
return false;
if (vcpu->arch.apf.send_user_only &&
- static_call(kvm_x86_get_cpl)(vcpu) == 0)
+ kvm_x86_call(get_cpl)(vcpu) == 0)
return false;
if (is_guest_mode(vcpu)) {
@@ -13467,7 +13519,7 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
void kvm_arch_start_assignment(struct kvm *kvm)
{
if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1)
- static_call_cond(kvm_x86_pi_start_assignment)(kvm);
+ kvm_x86_call(pi_start_assignment)(kvm);
}
EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
@@ -13486,13 +13538,13 @@ EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm)
{
/*
- * Non-coherent DMA assignment and de-assignment will affect
- * whether KVM honors guest MTRRs and cause changes in memtypes
- * in TDP.
- * So, pass %true unconditionally to indicate non-coherent DMA was,
- * or will be involved, and that zapping SPTEs might be necessary.
+ * Non-coherent DMA assignment and de-assignment may affect whether or
+ * not KVM honors guest PAT, and thus may cause changes in EPT SPTEs
+ * due to toggling the "ignore PAT" bit. Zap all SPTEs when the first
+ * (or last) non-coherent device is (un)registered to so that new SPTEs
+ * with the correct "ignore guest PAT" setting are created.
*/
- if (__kvm_mmu_honors_guest_mtrrs(true))
+ if (kvm_mmu_may_ignore_guest_pat())
kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL));
}
@@ -13530,9 +13582,8 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
irqfd->producer = prod;
kvm_arch_start_assignment(irqfd->kvm);
- ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm,
- prod->irq, irqfd->gsi, 1);
-
+ ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
+ prod->irq, irqfd->gsi, 1);
if (ret)
kvm_arch_end_assignment(irqfd->kvm);
@@ -13555,7 +13606,8 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
* when the irq is masked/disabled or the consumer side (KVM
* int this case doesn't want to receive the interrupts.
*/
- ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0);
+ ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
+ prod->irq, irqfd->gsi, 0);
if (ret)
printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
" fails: %d\n", irqfd->consumer.token, ret);
@@ -13566,7 +13618,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set)
{
- return static_call(kvm_x86_pi_update_irte)(kvm, host_irq, guest_irq, set);
+ return kvm_x86_call(pi_update_irte)(kvm, host_irq, guest_irq, set);
}
bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
@@ -13589,6 +13641,24 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
+#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
+bool kvm_arch_gmem_prepare_needed(struct kvm *kvm)
+{
+ return kvm->arch.vm_type == KVM_X86_SNP_VM;
+}
+
+int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order)
+{
+ return kvm_x86_call(gmem_prepare)(kvm, pfn, gfn, max_order);
+}
+#endif
+
+#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE
+void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
+{
+ kvm_x86_call(gmem_invalidate)(start, end);
+}
+#endif
int kvm_spec_ctrl_test_value(u64 value)
{
@@ -13974,6 +14044,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_rmp_fault);
static int __init kvm_x86_init(void)
{
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index d80a4c6b5a38..50596f6f8320 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -33,6 +33,20 @@ struct kvm_caps {
u64 supported_perf_cap;
};
+struct kvm_host_values {
+ /*
+ * The host's raw MAXPHYADDR, i.e. the number of non-reserved physical
+ * address bits irrespective of features that repurpose legal bits,
+ * e.g. MKTME.
+ */
+ u8 maxphyaddr;
+
+ u64 efer;
+ u64 xcr0;
+ u64 xss;
+ u64 arch_capabilities;
+};
+
void kvm_spurious_fault(void);
#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \
@@ -159,7 +173,7 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu)
if (!is_long_mode(vcpu))
return false;
- static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
+ kvm_x86_call(get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
return cs_l;
}
@@ -311,12 +325,8 @@ int handle_ud(struct kvm_vcpu *vcpu);
void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
struct kvm_queued_exception *ex);
-void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu);
-u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
-bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
- int page_num);
bool kvm_vector_hashing_enabled(void);
void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code);
int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
@@ -325,11 +335,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int emulation_type, void *insn, int insn_len);
fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
-extern u64 host_xcr0;
-extern u64 host_xss;
-extern u64 host_arch_capabilities;
-
extern struct kvm_caps kvm_caps;
+extern struct kvm_host_values kvm_host;
extern bool enable_pmu;
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index f65b35a05d91..622fe24da910 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -741,7 +741,7 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
} else {
void __user * hva = u64_to_user_ptr(data->u.shared_info.hva);
- if (!PAGE_ALIGNED(hva) || !access_ok(hva, PAGE_SIZE)) {
+ if (!PAGE_ALIGNED(hva)) {
r = -EINVAL;
} else if (!hva) {
kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
@@ -1270,7 +1270,7 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
instructions[0] = 0xb8;
/* vmcall / vmmcall */
- static_call(kvm_x86_patch_hypercall)(vcpu, instructions + 5);
+ kvm_x86_call(patch_hypercall)(vcpu, instructions + 5);
/* ret */
instructions[8] = 0xc3;
@@ -1650,7 +1650,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
params[5] = (u64)kvm_r9_read(vcpu);
}
#endif
- cpl = static_call(kvm_x86_get_cpl)(vcpu);
+ cpl = kvm_x86_call(get_cpl)(vcpu);
trace_kvm_xen_hypercall(cpl, input, params[0], params[1], params[2],
params[3], params[4], params[5]);
diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
index 1912bee22dd4..9810edbb272d 100644
--- a/drivers/crypto/ccp/sev-dev.c
+++ b/drivers/crypto/ccp/sev-dev.c
@@ -2033,6 +2033,39 @@ static int sev_ioctl_do_snp_set_config(struct sev_issue_cmd *argp, bool writable
return __sev_do_cmd_locked(SEV_CMD_SNP_CONFIG, &config, &argp->error);
}
+static int sev_ioctl_do_snp_vlek_load(struct sev_issue_cmd *argp, bool writable)
+{
+ struct sev_device *sev = psp_master->sev_data;
+ struct sev_user_data_snp_vlek_load input;
+ void *blob;
+ int ret;
+
+ if (!sev->snp_initialized || !argp->data)
+ return -EINVAL;
+
+ if (!writable)
+ return -EPERM;
+
+ if (copy_from_user(&input, u64_to_user_ptr(argp->data), sizeof(input)))
+ return -EFAULT;
+
+ if (input.len != sizeof(input) || input.vlek_wrapped_version != 0)
+ return -EINVAL;
+
+ blob = psp_copy_user_blob(input.vlek_wrapped_address,
+ sizeof(struct sev_user_data_snp_wrapped_vlek_hashstick));
+ if (IS_ERR(blob))
+ return PTR_ERR(blob);
+
+ input.vlek_wrapped_address = __psp_pa(blob);
+
+ ret = __sev_do_cmd_locked(SEV_CMD_SNP_VLEK_LOAD, &input, &argp->error);
+
+ kfree(blob);
+
+ return ret;
+}
+
static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg)
{
void __user *argp = (void __user *)arg;
@@ -2093,6 +2126,9 @@ static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg)
case SNP_SET_CONFIG:
ret = sev_ioctl_do_snp_set_config(&input, writable);
break;
+ case SNP_VLEK_LOAD:
+ ret = sev_ioctl_do_snp_vlek_load(&input, writable);
+ break;
default:
ret = -EINVAL;
goto out;
diff --git a/drivers/virt/coco/sev-guest/sev-guest.c b/drivers/virt/coco/sev-guest/sev-guest.c
index f714009b9ff7..6fc7884ea0a1 100644
--- a/drivers/virt/coco/sev-guest/sev-guest.c
+++ b/drivers/virt/coco/sev-guest/sev-guest.c
@@ -30,8 +30,6 @@
#include <asm/svm.h>
#include <asm/sev.h>
-#include "sev-guest.h"
-
#define DEVICE_NAME "sev-guest"
#define AAD_LEN 48
#define MSG_HDR_VER 1
diff --git a/drivers/virt/coco/sev-guest/sev-guest.h b/drivers/virt/coco/sev-guest/sev-guest.h
deleted file mode 100644
index 21bda26fdb95..000000000000
--- a/drivers/virt/coco/sev-guest/sev-guest.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2021 Advanced Micro Devices, Inc.
- *
- * Author: Brijesh Singh <brijesh.singh@amd.com>
- *
- * SEV-SNP API spec is available at https://developer.amd.com/sev
- */
-
-#ifndef __VIRT_SEVGUEST_H__
-#define __VIRT_SEVGUEST_H__
-
-#include <linux/types.h>
-
-#define MAX_AUTHTAG_LEN 32
-
-/* See SNP spec SNP_GUEST_REQUEST section for the structure */
-enum msg_type {
- SNP_MSG_TYPE_INVALID = 0,
- SNP_MSG_CPUID_REQ,
- SNP_MSG_CPUID_RSP,
- SNP_MSG_KEY_REQ,
- SNP_MSG_KEY_RSP,
- SNP_MSG_REPORT_REQ,
- SNP_MSG_REPORT_RSP,
- SNP_MSG_EXPORT_REQ,
- SNP_MSG_EXPORT_RSP,
- SNP_MSG_IMPORT_REQ,
- SNP_MSG_IMPORT_RSP,
- SNP_MSG_ABSORB_REQ,
- SNP_MSG_ABSORB_RSP,
- SNP_MSG_VMRK_REQ,
- SNP_MSG_VMRK_RSP,
-
- SNP_MSG_TYPE_MAX
-};
-
-enum aead_algo {
- SNP_AEAD_INVALID,
- SNP_AEAD_AES_256_GCM,
-};
-
-struct snp_guest_msg_hdr {
- u8 authtag[MAX_AUTHTAG_LEN];
- u64 msg_seqno;
- u8 rsvd1[8];
- u8 algo;
- u8 hdr_version;
- u16 hdr_sz;
- u8 msg_type;
- u8 msg_version;
- u16 msg_sz;
- u32 rsvd2;
- u8 msg_vmpck;
- u8 rsvd3[35];
-} __packed;
-
-struct snp_guest_msg {
- struct snp_guest_msg_hdr hdr;
- u8 payload[4000];
-} __packed;
-
-#endif /* __VIRT_SEVGUEST_H__ */
diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index c82d56768101..c6d18f50f671 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -212,6 +212,9 @@ bool ffa_device_is_valid(struct ffa_device *ffa_dev) { return false; }
extern const struct bus_type ffa_bus_type;
+/* The FF-A 1.0 partition structure lacks the uuid[4] */
+#define FFA_1_0_PARTITON_INFO_SZ (8)
+
/* FFA transport related */
struct ffa_partition_info {
u16 id;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 692c01e41a18..689e8be873a7 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -378,8 +378,10 @@ struct kvm_vcpu {
bool dy_eligible;
} spin_loop;
#endif
+ bool wants_to_run;
bool preempted;
bool ready;
+ bool scheduled_out;
struct kvm_vcpu_arch arch;
struct kvm_vcpu_stat stat;
char stats_id[KVM_STATS_NAME_SIZE];
@@ -1494,8 +1496,6 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
struct kvm_guest_debug *dbg);
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu);
-void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu);
-
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id);
@@ -1955,8 +1955,6 @@ struct _kvm_stats_desc {
HALT_POLL_HIST_COUNT), \
STATS_DESC_IBOOLEAN(VCPU_GENERIC, blocking)
-extern struct dentry *kvm_debugfs_dir;
-
ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header,
const struct _kvm_stats_desc *desc,
void *stats, size_t size_stats,
@@ -2096,6 +2094,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
const struct kvm_irq_routing_entry *entries,
unsigned nr,
unsigned flags);
+int kvm_init_irq_routing(struct kvm *kvm);
int kvm_set_routing_entry(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue);
@@ -2105,6 +2104,11 @@ void kvm_free_irq_routing(struct kvm *kvm);
static inline void kvm_free_irq_routing(struct kvm *kvm) {}
+static inline int kvm_init_irq_routing(struct kvm *kvm)
+{
+ return 0;
+}
+
#endif
int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi);
@@ -2441,4 +2445,45 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm,
}
#endif /* CONFIG_KVM_PRIVATE_MEM */
+#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
+int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order);
+bool kvm_arch_gmem_prepare_needed(struct kvm *kvm);
+#endif
+
+/**
+ * kvm_gmem_populate() - Populate/prepare a GPA range with guest data
+ *
+ * @kvm: KVM instance
+ * @gfn: starting GFN to be populated
+ * @src: userspace-provided buffer containing data to copy into GFN range
+ * (passed to @post_populate, and incremented on each iteration
+ * if not NULL)
+ * @npages: number of pages to copy from userspace-buffer
+ * @post_populate: callback to issue for each gmem page that backs the GPA
+ * range
+ * @opaque: opaque data to pass to @post_populate callback
+ *
+ * This is primarily intended for cases where a gmem-backed GPA range needs
+ * to be initialized with userspace-provided data prior to being mapped into
+ * the guest as a private page. This should be called with the slots->lock
+ * held so that caller-enforced invariants regarding the expected memory
+ * attributes of the GPA range do not race with KVM_SET_MEMORY_ATTRIBUTES.
+ *
+ * Returns the number of pages that were populated.
+ */
+typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
+ void __user *src, int order, void *opaque);
+
+long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
+ kvm_gmem_populate_cb post_populate, void *opaque);
+
+#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE
+void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
+#endif
+
+#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
+long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
+ struct kvm_pre_fault_memory *range);
+#endif
+
#endif
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index a0a026d2d244..2d72bd89bf7b 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -208,7 +208,8 @@ enum mapping_flags {
AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */
AS_STABLE_WRITES, /* must wait for writeback before modifying
folio contents */
- AS_UNMOVABLE, /* The mapping cannot be moved, ever */
+ AS_INACCESSIBLE, /* Do not attempt direct R/W access to the mapping,
+ including to move the mapping */
};
/**
@@ -309,20 +310,20 @@ static inline void mapping_clear_stable_writes(struct address_space *mapping)
clear_bit(AS_STABLE_WRITES, &mapping->flags);
}
-static inline void mapping_set_unmovable(struct address_space *mapping)
+static inline void mapping_set_inaccessible(struct address_space *mapping)
{
/*
- * It's expected unmovable mappings are also unevictable. Compaction
+ * It's expected inaccessible mappings are also unevictable. Compaction
* migrate scanner (isolate_migratepages_block()) relies on this to
* reduce page locking.
*/
set_bit(AS_UNEVICTABLE, &mapping->flags);
- set_bit(AS_UNMOVABLE, &mapping->flags);
+ set_bit(AS_INACCESSIBLE, &mapping->flags);
}
-static inline bool mapping_unmovable(struct address_space *mapping)
+static inline bool mapping_inaccessible(struct address_space *mapping)
{
- return test_bit(AS_UNMOVABLE, &mapping->flags);
+ return test_bit(AS_INACCESSIBLE, &mapping->flags);
}
static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h
index 3705c2044fc0..903ddfea8585 100644
--- a/include/linux/psp-sev.h
+++ b/include/linux/psp-sev.h
@@ -658,6 +658,7 @@ struct sev_data_snp_launch_update {
* @id_auth_paddr: system physical address of ID block authentication structure
* @id_block_en: indicates whether ID block is present
* @auth_key_en: indicates whether author key is present in authentication structure
+ * @vcek_disabled: indicates whether use of VCEK is allowed for attestation reports
* @rsvd: reserved
* @host_data: host-supplied data for guest, not interpreted by firmware
*/
@@ -667,7 +668,8 @@ struct sev_data_snp_launch_finish {
u64 id_auth_paddr;
u8 id_block_en:1;
u8 auth_key_en:1;
- u64 rsvd:62;
+ u8 vcek_disabled:1;
+ u64 rsvd:61;
u8 host_data[32];
} __packed;
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 6f6cb5fc1242..835bbb2d1f88 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -378,6 +378,20 @@ static inline void smp_mb__after_srcu_read_unlock(void)
/* __srcu_read_unlock has smp_mb() internally so nothing to do here. */
}
+/**
+ * smp_mb__after_srcu_read_lock - ensure full ordering after srcu_read_lock
+ *
+ * Converts the preceding srcu_read_lock into a two-way memory barrier.
+ *
+ * Call this after srcu_read_lock, to guarantee that all memory operations
+ * that occur after smp_mb__after_srcu_read_lock will appear to happen after
+ * the preceding srcu_read_lock.
+ */
+static inline void smp_mb__after_srcu_read_lock(void)
+{
+ /* __srcu_read_lock has smp_mb() internally so nothing to do here. */
+}
+
DEFINE_LOCK_GUARD_1(srcu, struct srcu_struct,
_T->idx = srcu_read_lock(_T->lock),
srcu_read_unlock(_T->lock, _T->idx),
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index d03842abae57..637efc055145 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -192,11 +192,24 @@ struct kvm_xen_exit {
/* Flags that describe what fields in emulation_failure hold valid data. */
#define KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES (1ULL << 0)
+/*
+ * struct kvm_run can be modified by userspace at any time, so KVM must be
+ * careful to avoid TOCTOU bugs. In order to protect KVM, HINT_UNSAFE_IN_KVM()
+ * renames fields in struct kvm_run from <symbol> to <symbol>__unsafe when
+ * compiled into the kernel, ensuring that any use within KVM is obvious and
+ * gets extra scrutiny.
+ */
+#ifdef __KERNEL__
+#define HINT_UNSAFE_IN_KVM(_symbol) _symbol##__unsafe
+#else
+#define HINT_UNSAFE_IN_KVM(_symbol) _symbol
+#endif
+
/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
struct kvm_run {
/* in */
__u8 request_interrupt_window;
- __u8 immediate_exit;
+ __u8 HINT_UNSAFE_IN_KVM(immediate_exit);
__u8 padding1[6];
/* out */
@@ -917,6 +930,9 @@ struct kvm_enable_cap {
#define KVM_CAP_MEMORY_ATTRIBUTES 233
#define KVM_CAP_GUEST_MEMFD 234
#define KVM_CAP_VM_TYPES 235
+#define KVM_CAP_PRE_FAULT_MEMORY 236
+#define KVM_CAP_X86_APIC_BUS_CYCLES_NS 237
+#define KVM_CAP_X86_GUEST_MODE 238
struct kvm_irq_routing_irqchip {
__u32 irqchip;
@@ -1548,4 +1564,13 @@ struct kvm_create_guest_memfd {
__u64 reserved[6];
};
+#define KVM_PRE_FAULT_MEMORY _IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory)
+
+struct kvm_pre_fault_memory {
+ __u64 gpa;
+ __u64 size;
+ __u64 flags;
+ __u64 padding[5];
+};
+
#endif /* __LINUX_KVM_H */
diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h
index b7a2c2ee35b7..2289b7c76c59 100644
--- a/include/uapi/linux/psp-sev.h
+++ b/include/uapi/linux/psp-sev.h
@@ -31,6 +31,7 @@ enum {
SNP_PLATFORM_STATUS,
SNP_COMMIT,
SNP_SET_CONFIG,
+ SNP_VLEK_LOAD,
SEV_MAX,
};
@@ -215,6 +216,32 @@ struct sev_user_data_snp_config {
} __packed;
/**
+ * struct sev_data_snp_vlek_load - SNP_VLEK_LOAD structure
+ *
+ * @len: length of the command buffer read by the PSP
+ * @vlek_wrapped_version: version of wrapped VLEK hashstick (Must be 0h)
+ * @rsvd: reserved
+ * @vlek_wrapped_address: address of a wrapped VLEK hashstick
+ * (struct sev_user_data_snp_wrapped_vlek_hashstick)
+ */
+struct sev_user_data_snp_vlek_load {
+ __u32 len; /* In */
+ __u8 vlek_wrapped_version; /* In */
+ __u8 rsvd[3]; /* In */
+ __u64 vlek_wrapped_address; /* In */
+} __packed;
+
+/**
+ * struct sev_user_data_snp_vlek_wrapped_vlek_hashstick - Wrapped VLEK data
+ *
+ * @data: Opaque data provided by AMD KDS (as described in SEV-SNP Firmware ABI
+ * 1.54, SNP_VLEK_LOAD)
+ */
+struct sev_user_data_snp_wrapped_vlek_hashstick {
+ __u8 data[432]; /* In */
+} __packed;
+
+/**
* struct sev_issue_cmd - SEV ioctl parameters
*
* @cmd: SEV commands to execute
diff --git a/include/uapi/linux/sev-guest.h b/include/uapi/linux/sev-guest.h
index 154a87a1eca9..fcdfea767fca 100644
--- a/include/uapi/linux/sev-guest.h
+++ b/include/uapi/linux/sev-guest.h
@@ -89,6 +89,9 @@ struct snp_ext_report_req {
#define SNP_GUEST_FW_ERR_MASK GENMASK_ULL(31, 0)
#define SNP_GUEST_VMM_ERR_SHIFT 32
#define SNP_GUEST_VMM_ERR(x) (((u64)x) << SNP_GUEST_VMM_ERR_SHIFT)
+#define SNP_GUEST_FW_ERR(x) ((x) & SNP_GUEST_FW_ERR_MASK)
+#define SNP_GUEST_ERR(vmm_err, fw_err) (SNP_GUEST_VMM_ERR(vmm_err) | \
+ SNP_GUEST_FW_ERR(fw_err))
#define SNP_GUEST_VMM_ERR_INVALID_LEN 1
#define SNP_GUEST_VMM_ERR_BUSY 2
diff --git a/mm/compaction.c b/mm/compaction.c
index 739b1bf3d637..6cb901b63482 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1179,22 +1179,22 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (((mode & ISOLATE_ASYNC_MIGRATE) && is_dirty) ||
(mapping && is_unevictable)) {
bool migrate_dirty = true;
- bool is_unmovable;
+ bool is_inaccessible;
/*
* Only folios without mappings or that have
* a ->migrate_folio callback are possible to migrate
* without blocking.
*
- * Folios from unmovable mappings are not migratable.
+ * Folios from inaccessible mappings are not migratable.
*
* However, we can be racing with truncation, which can
* free the mapping that we need to check. Truncation
* holds the folio lock until after the folio is removed
* from the page so holding it ourselves is sufficient.
*
- * To avoid locking the folio just to check unmovable,
- * assume every unmovable folio is also unevictable,
+ * To avoid locking the folio just to check inaccessible,
+ * assume every inaccessible folio is also unevictable,
* which is a cheaper test. If our assumption goes
* wrong, it's not a correctness bug, just potentially
* wasted cycles.
@@ -1207,9 +1207,9 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
migrate_dirty = !mapping ||
mapping->a_ops->migrate_folio;
}
- is_unmovable = mapping && mapping_unmovable(mapping);
+ is_inaccessible = mapping && mapping_inaccessible(mapping);
folio_unlock(folio);
- if (!migrate_dirty || is_unmovable)
+ if (!migrate_dirty || is_inaccessible)
goto isolate_fail_put;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index a8c6f466e33a..ed3aac90cf4f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -978,7 +978,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
if (!mapping)
rc = migrate_folio(mapping, dst, src, mode);
- else if (mapping_unmovable(mapping))
+ else if (mapping_inaccessible(mapping))
rc = -EOPNOTSUPP;
else if (mapping->a_ops->migrate_folio)
/*
diff --git a/mm/truncate.c b/mm/truncate.c
index e99085bf3d34..581977d2356f 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -233,7 +233,8 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
* doing a complex calculation here, and then doing the zeroing
* anyway if the page split fails.
*/
- folio_zero_range(folio, offset, length);
+ if (!mapping_inaccessible(folio->mapping))
+ folio_zero_range(folio, offset, length);
if (folio_has_private(folio))
folio_invalidate(folio, offset, length);
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index d03842abae57..e5af8c692dc0 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -917,6 +917,7 @@ struct kvm_enable_cap {
#define KVM_CAP_MEMORY_ATTRIBUTES 233
#define KVM_CAP_GUEST_MEMFD 234
#define KVM_CAP_VM_TYPES 235
+#define KVM_CAP_PRE_FAULT_MEMORY 236
struct kvm_irq_routing_irqchip {
__u32 irqchip;
@@ -1548,4 +1549,13 @@ struct kvm_create_guest_memfd {
__u64 reserved[6];
};
+#define KVM_PRE_FAULT_MEMORY _IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory)
+
+struct kvm_pre_fault_memory {
+ __u64 gpa;
+ __u64 size;
+ __u64 flags;
+ __u64 padding[5];
+};
+
#endif /* __LINUX_KVM_H */
diff --git a/tools/perf/arch/loongarch/Makefile b/tools/perf/arch/loongarch/Makefile
index 3992a67a87d9..c89d6bb6b184 100644
--- a/tools/perf/arch/loongarch/Makefile
+++ b/tools/perf/arch/loongarch/Makefile
@@ -4,6 +4,7 @@ PERF_HAVE_DWARF_REGS := 1
endif
PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1
PERF_HAVE_JITDUMP := 1
+HAVE_KVM_STAT_SUPPORT := 1
#
# Syscall table generation for perf
diff --git a/tools/perf/arch/loongarch/util/Build b/tools/perf/arch/loongarch/util/Build
index 2386ebbf6dd4..b6b97de48233 100644
--- a/tools/perf/arch/loongarch/util/Build
+++ b/tools/perf/arch/loongarch/util/Build
@@ -1,5 +1,7 @@
+perf-util-y += header.o
perf-util-y += perf_regs.o
perf-util-$(CONFIG_DWARF) += dwarf-regs.o
perf-util-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o
perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
+perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o
diff --git a/tools/perf/arch/loongarch/util/header.c b/tools/perf/arch/loongarch/util/header.c
new file mode 100644
index 000000000000..d962dff55512
--- /dev/null
+++ b/tools/perf/arch/loongarch/util/header.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Implementation of get_cpuid().
+ *
+ * Author: Nikita Shubin <n.shubin@yadro.com>
+ * Bibo Mao <maobibo@loongson.cn>
+ * Huacai Chen <chenhuacai@loongson.cn>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <api/fs/fs.h>
+#include <errno.h>
+#include "util/debug.h"
+#include "util/header.h"
+
+/*
+ * Output example from /proc/cpuinfo
+ * CPU Family : Loongson-64bit
+ * Model Name : Loongson-3C5000
+ * CPU Revision : 0x10
+ * FPU Revision : 0x01
+ */
+#define CPUINFO_MODEL "Model Name"
+#define CPUINFO "/proc/cpuinfo"
+
+static char *_get_field(const char *line)
+{
+ char *line2, *nl;
+
+ line2 = strrchr(line, ' ');
+ if (!line2)
+ return NULL;
+
+ line2++;
+ nl = strrchr(line, '\n');
+ if (!nl)
+ return NULL;
+
+ return strndup(line2, nl - line2);
+}
+
+static char *_get_cpuid(void)
+{
+ unsigned long line_sz;
+ char *line, *model, *cpuid;
+ FILE *file;
+
+ file = fopen(CPUINFO, "r");
+ if (file == NULL)
+ return NULL;
+
+ line = model = cpuid = NULL;
+ while (getline(&line, &line_sz, file) != -1) {
+ if (strncmp(line, CPUINFO_MODEL, strlen(CPUINFO_MODEL)))
+ continue;
+
+ model = _get_field(line);
+ if (!model)
+ goto out_free;
+ break;
+ }
+
+ if (model && (asprintf(&cpuid, "%s", model) < 0))
+ cpuid = NULL;
+
+out_free:
+ fclose(file);
+ free(model);
+ return cpuid;
+}
+
+int get_cpuid(char *buffer, size_t sz)
+{
+ int ret = 0;
+ char *cpuid = _get_cpuid();
+
+ if (!cpuid)
+ return EINVAL;
+
+ if (sz < strlen(cpuid)) {
+ ret = ENOBUFS;
+ goto out_free;
+ }
+
+ scnprintf(buffer, sz, "%s", cpuid);
+
+out_free:
+ free(cpuid);
+ return ret;
+}
+
+char *get_cpuid_str(struct perf_pmu *pmu __maybe_unused)
+{
+ return _get_cpuid();
+}
diff --git a/tools/perf/arch/loongarch/util/kvm-stat.c b/tools/perf/arch/loongarch/util/kvm-stat.c
new file mode 100644
index 000000000000..a7859a3a9a51
--- /dev/null
+++ b/tools/perf/arch/loongarch/util/kvm-stat.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <memory.h>
+#include "util/kvm-stat.h"
+#include "util/parse-events.h"
+#include "util/debug.h"
+#include "util/evsel.h"
+#include "util/evlist.h"
+#include "util/pmus.h"
+
+#define LOONGARCH_EXCEPTION_INT 0
+#define LOONGARCH_EXCEPTION_PIL 1
+#define LOONGARCH_EXCEPTION_PIS 2
+#define LOONGARCH_EXCEPTION_PIF 3
+#define LOONGARCH_EXCEPTION_PME 4
+#define LOONGARCH_EXCEPTION_FPD 15
+#define LOONGARCH_EXCEPTION_SXD 16
+#define LOONGARCH_EXCEPTION_ASXD 17
+#define LOONGARCH_EXCEPTION_GSPR 22
+#define LOONGARCH_EXCEPTION_CPUCFG 100
+#define LOONGARCH_EXCEPTION_CSR 101
+#define LOONGARCH_EXCEPTION_IOCSR 102
+#define LOONGARCH_EXCEPTION_IDLE 103
+#define LOONGARCH_EXCEPTION_OTHERS 104
+#define LOONGARCH_EXCEPTION_HVC 23
+
+#define loongarch_exception_type \
+ {LOONGARCH_EXCEPTION_INT, "Interrupt" }, \
+ {LOONGARCH_EXCEPTION_PIL, "Mem Read" }, \
+ {LOONGARCH_EXCEPTION_PIS, "Mem Store" }, \
+ {LOONGARCH_EXCEPTION_PIF, "Inst Fetch" }, \
+ {LOONGARCH_EXCEPTION_PME, "Mem Modify" }, \
+ {LOONGARCH_EXCEPTION_FPD, "FPU" }, \
+ {LOONGARCH_EXCEPTION_SXD, "LSX" }, \
+ {LOONGARCH_EXCEPTION_ASXD, "LASX" }, \
+ {LOONGARCH_EXCEPTION_GSPR, "Privilege Error" }, \
+ {LOONGARCH_EXCEPTION_HVC, "Hypercall" }, \
+ {LOONGARCH_EXCEPTION_CPUCFG, "CPUCFG" }, \
+ {LOONGARCH_EXCEPTION_CSR, "CSR" }, \
+ {LOONGARCH_EXCEPTION_IOCSR, "IOCSR" }, \
+ {LOONGARCH_EXCEPTION_IDLE, "Idle" }, \
+ {LOONGARCH_EXCEPTION_OTHERS, "Others" }
+
+define_exit_reasons_table(loongarch_exit_reasons, loongarch_exception_type);
+
+const char *vcpu_id_str = "vcpu_id";
+const char *kvm_exit_reason = "reason";
+const char *kvm_entry_trace = "kvm:kvm_enter";
+const char *kvm_reenter_trace = "kvm:kvm_reenter";
+const char *kvm_exit_trace = "kvm:kvm_exit";
+const char *kvm_events_tp[] = {
+ "kvm:kvm_enter",
+ "kvm:kvm_reenter",
+ "kvm:kvm_exit",
+ "kvm:kvm_exit_gspr",
+ NULL,
+};
+
+static bool event_begin(struct evsel *evsel,
+ struct perf_sample *sample, struct event_key *key)
+{
+ return exit_event_begin(evsel, sample, key);
+}
+
+static bool event_end(struct evsel *evsel,
+ struct perf_sample *sample __maybe_unused,
+ struct event_key *key __maybe_unused)
+{
+ /*
+ * LoongArch kvm is different with other architectures
+ *
+ * There is kvm:kvm_reenter or kvm:kvm_enter event adjacent with
+ * kvm:kvm_exit event.
+ * kvm:kvm_enter means returning to vmm and then to guest
+ * kvm:kvm_reenter means returning to guest immediately
+ */
+ return evsel__name_is(evsel, kvm_entry_trace) || evsel__name_is(evsel, kvm_reenter_trace);
+}
+
+static void event_gspr_get_key(struct evsel *evsel,
+ struct perf_sample *sample, struct event_key *key)
+{
+ unsigned int insn;
+
+ key->key = LOONGARCH_EXCEPTION_OTHERS;
+ insn = evsel__intval(evsel, sample, "inst_word");
+
+ switch (insn >> 24) {
+ case 0:
+ /* CPUCFG inst trap */
+ if ((insn >> 10) == 0x1b)
+ key->key = LOONGARCH_EXCEPTION_CPUCFG;
+ break;
+ case 4:
+ /* CSR inst trap */
+ key->key = LOONGARCH_EXCEPTION_CSR;
+ break;
+ case 6:
+ /* IOCSR inst trap */
+ if ((insn >> 15) == 0xc90)
+ key->key = LOONGARCH_EXCEPTION_IOCSR;
+ else if ((insn >> 15) == 0xc91)
+ /* Idle inst trap */
+ key->key = LOONGARCH_EXCEPTION_IDLE;
+ break;
+ default:
+ key->key = LOONGARCH_EXCEPTION_OTHERS;
+ break;
+ }
+}
+
+static struct child_event_ops child_events[] = {
+ { .name = "kvm:kvm_exit_gspr", .get_key = event_gspr_get_key },
+ { NULL, NULL },
+};
+
+static struct kvm_events_ops exit_events = {
+ .is_begin_event = event_begin,
+ .is_end_event = event_end,
+ .child_ops = child_events,
+ .decode_key = exit_event_decode_key,
+ .name = "VM-EXIT"
+};
+
+struct kvm_reg_events_ops kvm_reg_events_ops[] = {
+ { .name = "vmexit", .ops = &exit_events, },
+ { NULL, NULL },
+};
+
+const char * const kvm_skip_events[] = {
+ NULL,
+};
+
+int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
+{
+ kvm->exit_reasons_isa = "loongarch64";
+ kvm->exit_reasons = loongarch_exit_reasons;
+ return 0;
+}
diff --git a/tools/perf/arch/riscv/Makefile b/tools/perf/arch/riscv/Makefile
index a8d25d005207..90c3c476a242 100644
--- a/tools/perf/arch/riscv/Makefile
+++ b/tools/perf/arch/riscv/Makefile
@@ -3,3 +3,4 @@ PERF_HAVE_DWARF_REGS := 1
endif
PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1
PERF_HAVE_JITDUMP := 1
+HAVE_KVM_STAT_SUPPORT := 1
diff --git a/tools/perf/arch/riscv/util/Build b/tools/perf/arch/riscv/util/Build
index 65ec3c66a375..f865cb0489ec 100644
--- a/tools/perf/arch/riscv/util/Build
+++ b/tools/perf/arch/riscv/util/Build
@@ -1,5 +1,6 @@
perf-util-y += perf_regs.o
perf-util-y += header.o
+perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o
perf-util-$(CONFIG_DWARF) += dwarf-regs.o
perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
diff --git a/tools/perf/arch/riscv/util/kvm-stat.c b/tools/perf/arch/riscv/util/kvm-stat.c
new file mode 100644
index 000000000000..491aef449d1a
--- /dev/null
+++ b/tools/perf/arch/riscv/util/kvm-stat.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Arch specific functions for perf kvm stat.
+ *
+ * Copyright 2024 Beijing ESWIN Computing Technology Co., Ltd.
+ *
+ */
+#include <errno.h>
+#include <memory.h>
+#include "../../../util/evsel.h"
+#include "../../../util/kvm-stat.h"
+#include "riscv_exception_types.h"
+#include "debug.h"
+
+define_exit_reasons_table(riscv_exit_reasons, kvm_riscv_exception_class);
+
+const char *vcpu_id_str = "id";
+const char *kvm_exit_reason = "scause";
+const char *kvm_entry_trace = "kvm:kvm_entry";
+const char *kvm_exit_trace = "kvm:kvm_exit";
+
+const char *kvm_events_tp[] = {
+ "kvm:kvm_entry",
+ "kvm:kvm_exit",
+ NULL,
+};
+
+static void event_get_key(struct evsel *evsel,
+ struct perf_sample *sample,
+ struct event_key *key)
+{
+ key->info = 0;
+ key->key = evsel__intval(evsel, sample, kvm_exit_reason);
+ key->exit_reasons = riscv_exit_reasons;
+}
+
+static bool event_begin(struct evsel *evsel,
+ struct perf_sample *sample __maybe_unused,
+ struct event_key *key __maybe_unused)
+{
+ return evsel__name_is(evsel, kvm_entry_trace);
+}
+
+static bool event_end(struct evsel *evsel,
+ struct perf_sample *sample,
+ struct event_key *key)
+{
+ if (evsel__name_is(evsel, kvm_exit_trace)) {
+ event_get_key(evsel, sample, key);
+ return true;
+ }
+ return false;
+}
+
+static struct kvm_events_ops exit_events = {
+ .is_begin_event = event_begin,
+ .is_end_event = event_end,
+ .decode_key = exit_event_decode_key,
+ .name = "VM-EXIT"
+};
+
+struct kvm_reg_events_ops kvm_reg_events_ops[] = {
+ {
+ .name = "vmexit",
+ .ops = &exit_events,
+ },
+ { NULL, NULL },
+};
+
+const char * const kvm_skip_events[] = {
+ NULL,
+};
+
+int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
+{
+ kvm->exit_reasons_isa = "riscv64";
+ return 0;
+}
diff --git a/tools/perf/arch/riscv/util/riscv_exception_types.h b/tools/perf/arch/riscv/util/riscv_exception_types.h
new file mode 100644
index 000000000000..c49b8fa5e847
--- /dev/null
+++ b/tools/perf/arch/riscv/util/riscv_exception_types.h
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef ARCH_PERF_RISCV_EXCEPTION_TYPES_H
+#define ARCH_PERF_RISCV_EXCEPTION_TYPES_H
+
+#define EXC_INST_MISALIGNED 0
+#define EXC_INST_ACCESS 1
+#define EXC_INST_ILLEGAL 2
+#define EXC_BREAKPOINT 3
+#define EXC_LOAD_MISALIGNED 4
+#define EXC_LOAD_ACCESS 5
+#define EXC_STORE_MISALIGNED 6
+#define EXC_STORE_ACCESS 7
+#define EXC_SYSCALL 8
+#define EXC_HYPERVISOR_SYSCALL 9
+#define EXC_SUPERVISOR_SYSCALL 10
+#define EXC_INST_PAGE_FAULT 12
+#define EXC_LOAD_PAGE_FAULT 13
+#define EXC_STORE_PAGE_FAULT 15
+#define EXC_INST_GUEST_PAGE_FAULT 20
+#define EXC_LOAD_GUEST_PAGE_FAULT 21
+#define EXC_VIRTUAL_INST_FAULT 22
+#define EXC_STORE_GUEST_PAGE_FAULT 23
+
+#define EXC(x) {EXC_##x, #x }
+
+#define kvm_riscv_exception_class \
+ EXC(INST_MISALIGNED), EXC(INST_ACCESS), EXC(INST_ILLEGAL), \
+ EXC(BREAKPOINT), EXC(LOAD_MISALIGNED), EXC(LOAD_ACCESS), \
+ EXC(STORE_MISALIGNED), EXC(STORE_ACCESS), EXC(SYSCALL), \
+ EXC(HYPERVISOR_SYSCALL), EXC(SUPERVISOR_SYSCALL), \
+ EXC(INST_PAGE_FAULT), EXC(LOAD_PAGE_FAULT), EXC(STORE_PAGE_FAULT), \
+ EXC(INST_GUEST_PAGE_FAULT), EXC(LOAD_GUEST_PAGE_FAULT), \
+ EXC(VIRTUAL_INST_FAULT), EXC(STORE_GUEST_PAGE_FAULT)
+
+#endif /* ARCH_PERF_RISCV_EXCEPTION_TYPES_H */
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index ac280dcba996..b084ba2262a0 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -112,6 +112,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state
TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test
+TEST_GEN_PROGS_x86_64 += x86_64/apic_bus_clock_test
TEST_GEN_PROGS_x86_64 += x86_64/xapic_ipi_test
TEST_GEN_PROGS_x86_64 += x86_64/xapic_state_test
TEST_GEN_PROGS_x86_64 += x86_64/xcr0_cpuid_test
@@ -145,6 +146,7 @@ TEST_GEN_PROGS_x86_64 += set_memory_region_test
TEST_GEN_PROGS_x86_64 += steal_time
TEST_GEN_PROGS_x86_64 += kvm_binary_stats_test
TEST_GEN_PROGS_x86_64 += system_counter_offset_test
+TEST_GEN_PROGS_x86_64 += pre_fault_memory_test
# Compiled outputs used by test targets
TEST_GEN_PROGS_EXTENDED_x86_64 += x86_64/nx_huge_pages_test
diff --git a/tools/testing/selftests/kvm/aarch64/set_id_regs.c b/tools/testing/selftests/kvm/aarch64/set_id_regs.c
index a7de39fa2a0a..d20981663831 100644
--- a/tools/testing/selftests/kvm/aarch64/set_id_regs.c
+++ b/tools/testing/selftests/kvm/aarch64/set_id_regs.c
@@ -219,6 +219,7 @@ static void guest_code(void)
GUEST_REG_SYNC(SYS_ID_AA64MMFR1_EL1);
GUEST_REG_SYNC(SYS_ID_AA64MMFR2_EL1);
GUEST_REG_SYNC(SYS_ID_AA64ZFR0_EL1);
+ GUEST_REG_SYNC(SYS_CTR_EL0);
GUEST_DONE();
}
@@ -490,11 +491,25 @@ static void test_clidr(struct kvm_vcpu *vcpu)
test_reg_vals[encoding_to_range_idx(SYS_CLIDR_EL1)] = clidr;
}
+static void test_ctr(struct kvm_vcpu *vcpu)
+{
+ u64 ctr;
+
+ vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CTR_EL0), &ctr);
+ ctr &= ~CTR_EL0_DIC_MASK;
+ if (ctr & CTR_EL0_IminLine_MASK)
+ ctr--;
+
+ vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CTR_EL0), ctr);
+ test_reg_vals[encoding_to_range_idx(SYS_CTR_EL0)] = ctr;
+}
+
static void test_vcpu_ftr_id_regs(struct kvm_vcpu *vcpu)
{
u64 val;
test_clidr(vcpu);
+ test_ctr(vcpu);
vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), &val);
val++;
@@ -524,7 +539,9 @@ static void test_reset_preserves_id_regs(struct kvm_vcpu *vcpu)
for (int i = 0; i < ARRAY_SIZE(test_regs); i++)
test_assert_id_reg_unchanged(vcpu, test_regs[i].reg);
+ test_assert_id_reg_unchanged(vcpu, SYS_MPIDR_EL1);
test_assert_id_reg_unchanged(vcpu, SYS_CLIDR_EL1);
+ test_assert_id_reg_unchanged(vcpu, SYS_CTR_EL0);
ksft_test_result_pass("%s\n", __func__);
}
diff --git a/tools/testing/selftests/kvm/include/x86_64/apic.h b/tools/testing/selftests/kvm/include/x86_64/apic.h
index bed316fdecd5..0f268b55fa06 100644
--- a/tools/testing/selftests/kvm/include/x86_64/apic.h
+++ b/tools/testing/selftests/kvm/include/x86_64/apic.h
@@ -60,6 +60,14 @@
#define APIC_VECTOR_MASK 0x000FF
#define APIC_ICR2 0x310
#define SET_APIC_DEST_FIELD(x) ((x) << 24)
+#define APIC_LVTT 0x320
+#define APIC_LVT_TIMER_ONESHOT (0 << 17)
+#define APIC_LVT_TIMER_PERIODIC (1 << 17)
+#define APIC_LVT_TIMER_TSCDEADLINE (2 << 17)
+#define APIC_LVT_MASKED (1 << 16)
+#define APIC_TMICT 0x380
+#define APIC_TMCCT 0x390
+#define APIC_TDCR 0x3E0
void apic_disable(void);
void xapic_enable(void);
diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index c0c7c1fe93f9..a0c1440017bb 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -23,6 +23,7 @@
extern bool host_cpu_is_intel;
extern bool host_cpu_is_amd;
+extern uint64_t guest_tsc_khz;
/* Forced emulation prefix, used to invoke the emulator unconditionally. */
#define KVM_FEP "ud2; .byte 'k', 'v', 'm';"
@@ -816,6 +817,23 @@ static inline void cpu_relax(void)
asm volatile("rep; nop" ::: "memory");
}
+static inline void udelay(unsigned long usec)
+{
+ uint64_t start, now, cycles;
+
+ GUEST_ASSERT(guest_tsc_khz);
+ cycles = guest_tsc_khz / 1000 * usec;
+
+ /*
+ * Deliberately don't PAUSE, a.k.a. cpu_relax(), so that the delay is
+ * as accurate as possible, e.g. doesn't trigger PAUSE-Loop VM-Exits.
+ */
+ start = rdtsc();
+ do {
+ now = rdtsc();
+ } while (now - start < cycles);
+}
+
#define ud2() \
__asm__ __volatile__( \
"ud2\n" \
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index ad00e4761886..56b170b725b3 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -21,6 +21,7 @@
uint32_t guest_random_seed;
struct guest_random_state guest_rng;
+static uint32_t last_guest_seed;
static int vcpu_mmap_sz(void);
@@ -434,7 +435,10 @@ struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus,
slot0 = memslot2region(vm, 0);
ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size);
- pr_info("Random seed: 0x%x\n", guest_random_seed);
+ if (guest_random_seed != last_guest_seed) {
+ pr_info("Random seed: 0x%x\n", guest_random_seed);
+ last_guest_seed = guest_random_seed;
+ }
guest_rng = new_guest_random_state(guest_random_seed);
sync_global_to_guest(vm, guest_rng);
@@ -2319,7 +2323,8 @@ void __attribute((constructor)) kvm_selftest_init(void)
/* Tell stdout not to buffer its content. */
setbuf(stdout, NULL);
- guest_random_seed = random();
+ guest_random_seed = last_guest_seed = random();
+ pr_info("Random seed: 0x%x\n", guest_random_seed);
kvm_selftest_arch_init();
}
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index 594b061aef52..153739f2e201 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -25,6 +25,7 @@ vm_vaddr_t exception_handlers;
bool host_cpu_is_amd;
bool host_cpu_is_intel;
bool is_forced_emulation_enabled;
+uint64_t guest_tsc_khz;
static void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent)
{
@@ -616,6 +617,11 @@ void assert_on_unhandled_exception(struct kvm_vcpu *vcpu)
void kvm_arch_vm_post_create(struct kvm_vm *vm)
{
+ int r;
+
+ TEST_ASSERT(kvm_has_cap(KVM_CAP_GET_TSC_KHZ),
+ "Require KVM_GET_TSC_KHZ to provide udelay() to guest.");
+
vm_create_irqchip(vm);
vm_init_descriptor_tables(vm);
@@ -628,6 +634,11 @@ void kvm_arch_vm_post_create(struct kvm_vm *vm)
vm_sev_ioctl(vm, KVM_SEV_INIT2, &init);
}
+
+ r = __vm_ioctl(vm, KVM_GET_TSC_KHZ, NULL);
+ TEST_ASSERT(r > 0, "KVM_GET_TSC_KHZ did not provide a valid TSC frequency.");
+ guest_tsc_khz = r;
+ sync_global_to_guest(vm, guest_tsc_khz);
}
void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code)
diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c
index 05fcf902e067..49f162573126 100644
--- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c
+++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c
@@ -53,12 +53,6 @@ static void vcpu_worker(struct memstress_vcpu_args *vcpu_args)
}
}
-struct memslot_antagonist_args {
- struct kvm_vm *vm;
- useconds_t delay;
- uint64_t nr_modifications;
-};
-
static void add_remove_memslot(struct kvm_vm *vm, useconds_t delay,
uint64_t nr_modifications)
{
diff --git a/tools/testing/selftests/kvm/pre_fault_memory_test.c b/tools/testing/selftests/kvm/pre_fault_memory_test.c
new file mode 100644
index 000000000000..0350a8896a2f
--- /dev/null
+++ b/tools/testing/selftests/kvm/pre_fault_memory_test.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024, Intel, Inc
+ *
+ * Author:
+ * Isaku Yamahata <isaku.yamahata at gmail.com>
+ */
+#include <linux/sizes.h>
+
+#include <test_util.h>
+#include <kvm_util.h>
+#include <processor.h>
+
+/* Arbitrarily chosen values */
+#define TEST_SIZE (SZ_2M + PAGE_SIZE)
+#define TEST_NPAGES (TEST_SIZE / PAGE_SIZE)
+#define TEST_SLOT 10
+
+static void guest_code(uint64_t base_gpa)
+{
+ volatile uint64_t val __used;
+ int i;
+
+ for (i = 0; i < TEST_NPAGES; i++) {
+ uint64_t *src = (uint64_t *)(base_gpa + i * PAGE_SIZE);
+
+ val = *src;
+ }
+
+ GUEST_DONE();
+}
+
+static void pre_fault_memory(struct kvm_vcpu *vcpu, u64 gpa, u64 size,
+ u64 left)
+{
+ struct kvm_pre_fault_memory range = {
+ .gpa = gpa,
+ .size = size,
+ .flags = 0,
+ };
+ u64 prev;
+ int ret, save_errno;
+
+ do {
+ prev = range.size;
+ ret = __vcpu_ioctl(vcpu, KVM_PRE_FAULT_MEMORY, &range);
+ save_errno = errno;
+ TEST_ASSERT((range.size < prev) ^ (ret < 0),
+ "%sexpecting range.size to change on %s",
+ ret < 0 ? "not " : "",
+ ret < 0 ? "failure" : "success");
+ } while (ret >= 0 ? range.size : save_errno == EINTR);
+
+ TEST_ASSERT(range.size == left,
+ "Completed with %lld bytes left, expected %" PRId64,
+ range.size, left);
+
+ if (left == 0)
+ __TEST_ASSERT_VM_VCPU_IOCTL(!ret, "KVM_PRE_FAULT_MEMORY", ret, vcpu->vm);
+ else
+ /* No memory slot causes RET_PF_EMULATE. it results in -ENOENT. */
+ __TEST_ASSERT_VM_VCPU_IOCTL(ret && save_errno == ENOENT,
+ "KVM_PRE_FAULT_MEMORY", ret, vcpu->vm);
+}
+
+static void __test_pre_fault_memory(unsigned long vm_type, bool private)
+{
+ const struct vm_shape shape = {
+ .mode = VM_MODE_DEFAULT,
+ .type = vm_type,
+ };
+ struct kvm_vcpu *vcpu;
+ struct kvm_run *run;
+ struct kvm_vm *vm;
+ struct ucall uc;
+
+ uint64_t guest_test_phys_mem;
+ uint64_t guest_test_virt_mem;
+ uint64_t alignment, guest_page_size;
+
+ vm = vm_create_shape_with_one_vcpu(shape, &vcpu, guest_code);
+
+ alignment = guest_page_size = vm_guest_mode_params[VM_MODE_DEFAULT].page_size;
+ guest_test_phys_mem = (vm->max_gfn - TEST_NPAGES) * guest_page_size;
+#ifdef __s390x__
+ alignment = max(0x100000UL, guest_page_size);
+#else
+ alignment = SZ_2M;
+#endif
+ guest_test_phys_mem = align_down(guest_test_phys_mem, alignment);
+ guest_test_virt_mem = guest_test_phys_mem & ((1ULL << (vm->va_bits - 1)) - 1);
+
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+ guest_test_phys_mem, TEST_SLOT, TEST_NPAGES,
+ private ? KVM_MEM_GUEST_MEMFD : 0);
+ virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, TEST_NPAGES);
+
+ if (private)
+ vm_mem_set_private(vm, guest_test_phys_mem, TEST_SIZE);
+ pre_fault_memory(vcpu, guest_test_phys_mem, SZ_2M, 0);
+ pre_fault_memory(vcpu, guest_test_phys_mem + SZ_2M, PAGE_SIZE * 2, PAGE_SIZE);
+ pre_fault_memory(vcpu, guest_test_phys_mem + TEST_SIZE, PAGE_SIZE, PAGE_SIZE);
+
+ vcpu_args_set(vcpu, 1, guest_test_virt_mem);
+ vcpu_run(vcpu);
+
+ run = vcpu->run;
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Wanted KVM_EXIT_IO, got exit reason: %u (%s)",
+ run->exit_reason, exit_reason_str(run->exit_reason));
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ break;
+ case UCALL_DONE:
+ break;
+ default:
+ TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd);
+ break;
+ }
+
+ kvm_vm_free(vm);
+}
+
+static void test_pre_fault_memory(unsigned long vm_type, bool private)
+{
+ if (vm_type && !(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(vm_type))) {
+ pr_info("Skipping tests for vm_type 0x%lx\n", vm_type);
+ return;
+ }
+
+ __test_pre_fault_memory(vm_type, private);
+}
+
+int main(int argc, char *argv[])
+{
+ TEST_REQUIRE(kvm_check_cap(KVM_CAP_PRE_FAULT_MEMORY));
+
+ test_pre_fault_memory(0, false);
+#ifdef __x86_64__
+ test_pre_fault_memory(KVM_X86_SW_PROTECTED_VM, false);
+ test_pre_fault_memory(KVM_X86_SW_PROTECTED_VM, true);
+#endif
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/apic_bus_clock_test.c b/tools/testing/selftests/kvm/x86_64/apic_bus_clock_test.c
new file mode 100644
index 000000000000..f8916bb34405
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/apic_bus_clock_test.c
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024 Intel Corporation
+ *
+ * Verify KVM correctly emulates the APIC bus frequency when the VMM configures
+ * the frequency via KVM_CAP_X86_APIC_BUS_CYCLES_NS. Start the APIC timer by
+ * programming TMICT (timer initial count) to the largest value possible (so
+ * that the timer will not expire during the test). Then, after an arbitrary
+ * amount of time has elapsed, verify TMCCT (timer current count) is within 1%
+ * of the expected value based on the time elapsed, the APIC bus frequency, and
+ * the programmed TDCR (timer divide configuration register).
+ */
+
+#include "apic.h"
+#include "test_util.h"
+
+/*
+ * Possible TDCR values with matching divide count. Used to modify APIC
+ * timer frequency.
+ */
+static const struct {
+ const uint32_t tdcr;
+ const uint32_t divide_count;
+} tdcrs[] = {
+ {0x0, 2},
+ {0x1, 4},
+ {0x2, 8},
+ {0x3, 16},
+ {0x8, 32},
+ {0x9, 64},
+ {0xa, 128},
+ {0xb, 1},
+};
+
+static bool is_x2apic;
+
+static void apic_enable(void)
+{
+ if (is_x2apic)
+ x2apic_enable();
+ else
+ xapic_enable();
+}
+
+static uint32_t apic_read_reg(unsigned int reg)
+{
+ return is_x2apic ? x2apic_read_reg(reg) : xapic_read_reg(reg);
+}
+
+static void apic_write_reg(unsigned int reg, uint32_t val)
+{
+ if (is_x2apic)
+ x2apic_write_reg(reg, val);
+ else
+ xapic_write_reg(reg, val);
+}
+
+static void apic_guest_code(uint64_t apic_hz, uint64_t delay_ms)
+{
+ uint64_t tsc_hz = guest_tsc_khz * 1000;
+ const uint32_t tmict = ~0u;
+ uint64_t tsc0, tsc1, freq;
+ uint32_t tmcct;
+ int i;
+
+ apic_enable();
+
+ /*
+ * Setup one-shot timer. The vector does not matter because the
+ * interrupt should not fire.
+ */
+ apic_write_reg(APIC_LVTT, APIC_LVT_TIMER_ONESHOT | APIC_LVT_MASKED);
+
+ for (i = 0; i < ARRAY_SIZE(tdcrs); i++) {
+ apic_write_reg(APIC_TDCR, tdcrs[i].tdcr);
+ apic_write_reg(APIC_TMICT, tmict);
+
+ tsc0 = rdtsc();
+ udelay(delay_ms * 1000);
+ tmcct = apic_read_reg(APIC_TMCCT);
+ tsc1 = rdtsc();
+
+ /*
+ * Stop the timer _after_ reading the current, final count, as
+ * writing the initial counter also modifies the current count.
+ */
+ apic_write_reg(APIC_TMICT, 0);
+
+ freq = (tmict - tmcct) * tdcrs[i].divide_count * tsc_hz / (tsc1 - tsc0);
+ /* Check if measured frequency is within 5% of configured frequency. */
+ __GUEST_ASSERT(freq < apic_hz * 105 / 100 && freq > apic_hz * 95 / 100,
+ "Frequency = %lu (wanted %lu - %lu), bus = %lu, div = %u, tsc = %lu",
+ freq, apic_hz * 95 / 100, apic_hz * 105 / 100,
+ apic_hz, tdcrs[i].divide_count, tsc_hz);
+ }
+
+ GUEST_DONE();
+}
+
+static void test_apic_bus_clock(struct kvm_vcpu *vcpu)
+{
+ bool done = false;
+ struct ucall uc;
+
+ while (!done) {
+ vcpu_run(vcpu);
+
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_DONE:
+ done = true;
+ break;
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ break;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ break;
+ }
+ }
+}
+
+static void run_apic_bus_clock_test(uint64_t apic_hz, uint64_t delay_ms,
+ bool x2apic)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ int ret;
+
+ is_x2apic = x2apic;
+
+ vm = vm_create(1);
+
+ sync_global_to_guest(vm, is_x2apic);
+
+ vm_enable_cap(vm, KVM_CAP_X86_APIC_BUS_CYCLES_NS,
+ NSEC_PER_SEC / apic_hz);
+
+ vcpu = vm_vcpu_add(vm, 0, apic_guest_code);
+ vcpu_args_set(vcpu, 2, apic_hz, delay_ms);
+
+ ret = __vm_enable_cap(vm, KVM_CAP_X86_APIC_BUS_CYCLES_NS,
+ NSEC_PER_SEC / apic_hz);
+ TEST_ASSERT(ret < 0 && errno == EINVAL,
+ "Setting of APIC bus frequency after vCPU is created should fail.");
+
+ if (!is_x2apic)
+ virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA);
+
+ test_apic_bus_clock(vcpu);
+ kvm_vm_free(vm);
+}
+
+static void help(char *name)
+{
+ puts("");
+ printf("usage: %s [-h] [-d delay] [-f APIC bus freq]\n", name);
+ puts("");
+ printf("-d: Delay (in msec) guest uses to measure APIC bus frequency.\n");
+ printf("-f: The APIC bus frequency (in MHz) to be configured for the guest.\n");
+ puts("");
+}
+
+int main(int argc, char *argv[])
+{
+ /*
+ * Arbitrarilty default to 25MHz for the APIC bus frequency, which is
+ * different enough from the default 1GHz to be interesting.
+ */
+ uint64_t apic_hz = 25 * 1000 * 1000;
+ uint64_t delay_ms = 100;
+ int opt;
+
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_X86_APIC_BUS_CYCLES_NS));
+
+ while ((opt = getopt(argc, argv, "d:f:h")) != -1) {
+ switch (opt) {
+ case 'f':
+ apic_hz = atoi_positive("APIC bus frequency", optarg) * 1000 * 1000;
+ break;
+ case 'd':
+ delay_ms = atoi_positive("Delay in milliseconds", optarg);
+ break;
+ case 'h':
+ default:
+ help(argv[0]);
+ exit(KSFT_SKIP);
+ }
+ }
+
+ run_apic_bus_clock_test(apic_hz, delay_ms, false);
+ run_apic_bus_clock_test(apic_hz, delay_ms, true);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c
index 3cc4b86832fe..7e2bfb3c3f3b 100644
--- a/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c
+++ b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c
@@ -26,19 +26,37 @@ int main(int argc, char *argv[])
TEST_ASSERT(ret < 0,
"Setting KVM_CAP_MAX_VCPU_ID beyond KVM cap should fail");
+ /* Test BOOT_CPU_ID interaction (MAX_VCPU_ID cannot be lower) */
+ if (kvm_has_cap(KVM_CAP_SET_BOOT_CPU_ID)) {
+ vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *)MAX_VCPU_ID);
+
+ /* Try setting KVM_CAP_MAX_VCPU_ID below BOOT_CPU_ID */
+ ret = __vm_enable_cap(vm, KVM_CAP_MAX_VCPU_ID, MAX_VCPU_ID - 1);
+ TEST_ASSERT(ret < 0,
+ "Setting KVM_CAP_MAX_VCPU_ID below BOOT_CPU_ID should fail");
+ }
+
/* Set KVM_CAP_MAX_VCPU_ID */
vm_enable_cap(vm, KVM_CAP_MAX_VCPU_ID, MAX_VCPU_ID);
-
/* Try to set KVM_CAP_MAX_VCPU_ID again */
ret = __vm_enable_cap(vm, KVM_CAP_MAX_VCPU_ID, MAX_VCPU_ID + 1);
TEST_ASSERT(ret < 0,
"Setting KVM_CAP_MAX_VCPU_ID multiple times should fail");
- /* Create vCPU with id beyond KVM_CAP_MAX_VCPU_ID cap*/
+ /* Create vCPU with id beyond KVM_CAP_MAX_VCPU_ID cap */
ret = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)MAX_VCPU_ID);
TEST_ASSERT(ret < 0, "Creating vCPU with ID > MAX_VCPU_ID should fail");
+ /* Create vCPU with bits 63:32 != 0, but an otherwise valid id */
+ ret = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)(1L << 32));
+ TEST_ASSERT(ret < 0, "Creating vCPU with ID[63:32] != 0 should fail");
+
+ /* Create vCPU with id within bounds */
+ ret = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)0);
+ TEST_ASSERT(ret >= 0, "Creating vCPU with ID 0 should succeed");
+
+ close(ret);
kvm_vm_free(vm);
return 0;
}
diff --git a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
index 96446134c00b..698cb36989db 100644
--- a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
+++ b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
@@ -7,15 +7,28 @@
#include "pmu.h"
#include "processor.h"
-/* Number of LOOP instructions for the guest measurement payload. */
-#define NUM_BRANCHES 10
+/* Number of iterations of the loop for the guest measurement payload. */
+#define NUM_LOOPS 10
+
+/* Each iteration of the loop retires one branch instruction. */
+#define NUM_BRANCH_INSNS_RETIRED (NUM_LOOPS)
+
+/*
+ * Number of instructions in each loop. 1 CLFLUSH/CLFLUSHOPT/NOP, 1 MFENCE,
+ * 1 LOOP.
+ */
+#define NUM_INSNS_PER_LOOP 3
+
/*
* Number of "extra" instructions that will be counted, i.e. the number of
- * instructions that are needed to set up the loop and then disabled the
- * counter. 1 CLFLUSH/CLFLUSHOPT/NOP, 1 MFENCE, 2 MOV, 2 XOR, 1 WRMSR.
+ * instructions that are needed to set up the loop and then disable the
+ * counter. 2 MOV, 2 XOR, 1 WRMSR.
*/
-#define NUM_EXTRA_INSNS 7
-#define NUM_INSNS_RETIRED (NUM_BRANCHES + NUM_EXTRA_INSNS)
+#define NUM_EXTRA_INSNS 5
+
+/* Total number of instructions retired within the measured section. */
+#define NUM_INSNS_RETIRED (NUM_LOOPS * NUM_INSNS_PER_LOOP + NUM_EXTRA_INSNS)
+
static uint8_t kvm_pmu_version;
static bool kvm_has_perf_caps;
@@ -100,7 +113,7 @@ static void guest_assert_event_count(uint8_t idx,
GUEST_ASSERT_EQ(count, NUM_INSNS_RETIRED);
break;
case INTEL_ARCH_BRANCHES_RETIRED_INDEX:
- GUEST_ASSERT_EQ(count, NUM_BRANCHES);
+ GUEST_ASSERT_EQ(count, NUM_BRANCH_INSNS_RETIRED);
break;
case INTEL_ARCH_LLC_REFERENCES_INDEX:
case INTEL_ARCH_LLC_MISSES_INDEX:
@@ -120,7 +133,7 @@ static void guest_assert_event_count(uint8_t idx,
}
sanity_checks:
- __asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
+ __asm__ __volatile__("loop ." : "+c"((int){NUM_LOOPS}));
GUEST_ASSERT_EQ(_rdpmc(pmc), count);
wrmsr(pmc_msr, 0xdead);
@@ -134,8 +147,8 @@ sanity_checks:
* before the end of the sequence.
*
* If CLFUSH{,OPT} is supported, flush the cacheline containing (at least) the
- * start of the loop to force LLC references and misses, i.e. to allow testing
- * that those events actually count.
+ * CLFUSH{,OPT} instruction on each loop iteration to force LLC references and
+ * misses, i.e. to allow testing that those events actually count.
*
* If forced emulation is enabled (and specified), force emulation on a subset
* of the measured code to verify that KVM correctly emulates instructions and
@@ -145,10 +158,11 @@ sanity_checks:
#define GUEST_MEASURE_EVENT(_msr, _value, clflush, FEP) \
do { \
__asm__ __volatile__("wrmsr\n\t" \
+ " mov $" __stringify(NUM_LOOPS) ", %%ecx\n\t" \
+ "1:\n\t" \
clflush "\n\t" \
"mfence\n\t" \
- "1: mov $" __stringify(NUM_BRANCHES) ", %%ecx\n\t" \
- FEP "loop .\n\t" \
+ FEP "loop 1b\n\t" \
FEP "mov %%edi, %%ecx\n\t" \
FEP "xor %%eax, %%eax\n\t" \
FEP "xor %%edx, %%edx\n\t" \
@@ -163,9 +177,9 @@ do { \
wrmsr(pmc_msr, 0); \
\
if (this_cpu_has(X86_FEATURE_CLFLUSHOPT)) \
- GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflushopt 1f", FEP); \
+ GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflushopt .", FEP); \
else if (this_cpu_has(X86_FEATURE_CLFLUSH)) \
- GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflush 1f", FEP); \
+ GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflush .", FEP); \
else \
GUEST_MEASURE_EVENT(_ctrl_msr, _value, "nop", FEP); \
\
@@ -500,7 +514,7 @@ static void guest_test_fixed_counters(void)
wrmsr(MSR_CORE_PERF_FIXED_CTR0 + i, 0);
wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, FIXED_PMC_CTRL(i, FIXED_PMC_KERNEL));
wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, FIXED_PMC_GLOBAL_CTRL_ENABLE(i));
- __asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
+ __asm__ __volatile__("loop ." : "+c"((int){NUM_LOOPS}));
wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
val = rdmsr(MSR_CORE_PERF_FIXED_CTR0 + i);
diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
index 26b3e7efe5dd..c15513cd74d1 100644
--- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
+++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
@@ -32,8 +32,8 @@ struct __kvm_pmu_event_filter {
/*
* This event list comprises Intel's known architectural events, plus AMD's
- * "retired branch instructions" for Zen1-Zen3 (and* possibly other AMD CPUs).
- * Note, AMD and Intel use the same encoding for instructions retired.
+ * Branch Instructions Retired for Zen CPUs. Note, AMD and Intel use the
+ * same encoding for Instructions Retired.
*/
kvm_static_assert(INTEL_ARCH_INSTRUCTIONS_RETIRED == AMD_ZEN_INSTRUCTIONS_RETIRED);
@@ -353,38 +353,13 @@ static bool use_intel_pmu(void)
kvm_pmu_has(X86_PMU_FEATURE_BRANCH_INSNS_RETIRED);
}
-static bool is_zen1(uint32_t family, uint32_t model)
-{
- return family == 0x17 && model <= 0x0f;
-}
-
-static bool is_zen2(uint32_t family, uint32_t model)
-{
- return family == 0x17 && model >= 0x30 && model <= 0x3f;
-}
-
-static bool is_zen3(uint32_t family, uint32_t model)
-{
- return family == 0x19 && model <= 0x0f;
-}
-
/*
- * Determining AMD support for a PMU event requires consulting the AMD
- * PPR for the CPU or reference material derived therefrom. The AMD
- * test code herein has been verified to work on Zen1, Zen2, and Zen3.
- *
- * Feel free to add more AMD CPUs that are documented to support event
- * select 0xc2 umask 0 as "retired branch instructions."
+ * On AMD, all Family 17h+ CPUs (Zen and its successors) use event encoding
+ * 0xc2,0 for Branch Instructions Retired.
*/
static bool use_amd_pmu(void)
{
- uint32_t family = kvm_cpu_family();
- uint32_t model = kvm_cpu_model();
-
- return host_cpu_is_amd &&
- (is_zen1(family, model) ||
- is_zen2(family, model) ||
- is_zen3(family, model));
+ return host_cpu_is_amd && kvm_cpu_family() >= 0x17;
}
/*
diff --git a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c
index d691d86e5bc3..49913784bc82 100644
--- a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c
+++ b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c
@@ -33,6 +33,20 @@ static void guest_not_bsp_vcpu(void *arg)
GUEST_DONE();
}
+static void test_set_invalid_bsp(struct kvm_vm *vm)
+{
+ unsigned long max_vcpu_id = vm_check_cap(vm, KVM_CAP_MAX_VCPU_ID);
+ int r;
+
+ if (max_vcpu_id) {
+ r = __vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *)(max_vcpu_id + 1));
+ TEST_ASSERT(r == -1 && errno == EINVAL, "BSP with ID > MAX should fail");
+ }
+
+ r = __vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *)(1L << 32));
+ TEST_ASSERT(r == -1 && errno == EINVAL, "BSP with ID[63:32]!=0 should fail");
+}
+
static void test_set_bsp_busy(struct kvm_vcpu *vcpu, const char *msg)
{
int r = __vm_ioctl(vcpu->vm, KVM_SET_BOOT_CPU_ID,
@@ -80,6 +94,8 @@ static struct kvm_vm *create_vm(uint32_t nr_vcpus, uint32_t bsp_vcpu_id,
vm = vm_create(nr_vcpus);
+ test_set_invalid_bsp(vm);
+
vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *)(unsigned long)bsp_vcpu_id);
for (i = 0; i < nr_vcpus; i++)
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 29b73eedfe74..b14e14cdbfb9 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -67,6 +67,9 @@ config HAVE_KVM_INVALID_WAKEUPS
config KVM_GENERIC_DIRTYLOG_READ_PROTECT
bool
+config KVM_GENERIC_PRE_FAULT_MEMORY
+ bool
+
config KVM_COMPAT
def_bool y
depends on KVM && COMPAT && !(S390 || ARM64 || RISCV)
@@ -109,3 +112,11 @@ config KVM_GENERIC_PRIVATE_MEM
select KVM_GENERIC_MEMORY_ATTRIBUTES
select KVM_PRIVATE_MEM
bool
+
+config HAVE_KVM_GMEM_PREPARE
+ bool
+ depends on KVM_PRIVATE_MEM
+
+config HAVE_KVM_GMEM_INVALIDATE
+ bool
+ depends on KVM_PRIVATE_MEM
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 99a63bad0306..0ee4816b079a 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -80,7 +80,6 @@ static void async_pf_execute(struct work_struct *work)
spin_lock(&vcpu->async_pf.lock);
first = list_empty(&vcpu->async_pf.done);
list_add_tail(&apf->link, &vcpu->async_pf.done);
- apf->vcpu = NULL;
spin_unlock(&vcpu->async_pf.lock);
/*
@@ -120,8 +119,6 @@ static void kvm_flush_and_free_async_pf_work(struct kvm_async_pf *work)
void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
{
- spin_lock(&vcpu->async_pf.lock);
-
/* cancel outstanding work queue item */
while (!list_empty(&vcpu->async_pf.queue)) {
struct kvm_async_pf *work =
@@ -129,23 +126,15 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
typeof(*work), queue);
list_del(&work->queue);
- /*
- * We know it's present in vcpu->async_pf.done, do
- * nothing here.
- */
- if (!work->vcpu)
- continue;
-
- spin_unlock(&vcpu->async_pf.lock);
#ifdef CONFIG_KVM_ASYNC_PF_SYNC
flush_work(&work->work);
#else
if (cancel_work_sync(&work->work))
kmem_cache_free(async_pf_cache, work);
#endif
- spin_lock(&vcpu->async_pf.lock);
}
+ spin_lock(&vcpu->async_pf.lock);
while (!list_empty(&vcpu->async_pf.done)) {
struct kvm_async_pf *work =
list_first_entry(&vcpu->async_pf.done,
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 747fe251e445..1c509c351261 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -13,14 +13,50 @@ struct kvm_gmem {
struct list_head entry;
};
-static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
+static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct folio *folio)
+{
+#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
+ struct list_head *gmem_list = &inode->i_mapping->i_private_list;
+ struct kvm_gmem *gmem;
+
+ list_for_each_entry(gmem, gmem_list, entry) {
+ struct kvm_memory_slot *slot;
+ struct kvm *kvm = gmem->kvm;
+ struct page *page;
+ kvm_pfn_t pfn;
+ gfn_t gfn;
+ int rc;
+
+ if (!kvm_arch_gmem_prepare_needed(kvm))
+ continue;
+
+ slot = xa_load(&gmem->bindings, index);
+ if (!slot)
+ continue;
+
+ page = folio_file_page(folio, index);
+ pfn = page_to_pfn(page);
+ gfn = slot->base_gfn + index - slot->gmem.pgoff;
+ rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, compound_order(compound_head(page)));
+ if (rc) {
+ pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
+ index, gfn, pfn, rc);
+ return rc;
+ }
+ }
+
+#endif
+ return 0;
+}
+
+static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool prepare)
{
struct folio *folio;
/* TODO: Support huge pages. */
folio = filemap_grab_folio(inode->i_mapping, index);
- if (IS_ERR_OR_NULL(folio))
- return NULL;
+ if (IS_ERR(folio))
+ return folio;
/*
* Use the up-to-date flag to track whether or not the memory has been
@@ -41,6 +77,15 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
folio_mark_uptodate(folio);
}
+ if (prepare) {
+ int r = kvm_gmem_prepare_folio(inode, index, folio);
+ if (r < 0) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return ERR_PTR(r);
+ }
+ }
+
/*
* Ignore accessed, referenced, and dirty flags. The memory is
* unevictable and there is no storage to write back to.
@@ -145,9 +190,9 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
break;
}
- folio = kvm_gmem_get_folio(inode, index);
- if (!folio) {
- r = -ENOMEM;
+ folio = kvm_gmem_get_folio(inode, index, true);
+ if (IS_ERR(folio)) {
+ r = PTR_ERR(folio);
break;
}
@@ -298,10 +343,24 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol
return MF_DELAYED;
}
+#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE
+static void kvm_gmem_free_folio(struct folio *folio)
+{
+ struct page *page = folio_page(folio, 0);
+ kvm_pfn_t pfn = page_to_pfn(page);
+ int order = folio_order(folio);
+
+ kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
+}
+#endif
+
static const struct address_space_operations kvm_gmem_aops = {
.dirty_folio = noop_dirty_folio,
.migrate_folio = kvm_gmem_migrate_folio,
.error_remove_folio = kvm_gmem_error_folio,
+#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE
+ .free_folio = kvm_gmem_free_folio,
+#endif
};
static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path,
@@ -360,7 +419,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
inode->i_mode |= S_IFREG;
inode->i_size = size;
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
- mapping_set_unmovable(inode->i_mapping);
+ mapping_set_inaccessible(inode->i_mapping);
/* Unmovable mappings are supposed to be marked unevictable as well. */
WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
@@ -482,38 +541,34 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot)
fput(file);
}
-int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
- gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
+static int __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
+ gfn_t gfn, kvm_pfn_t *pfn, int *max_order, bool prepare)
{
pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
- struct kvm_gmem *gmem;
+ struct kvm_gmem *gmem = file->private_data;
struct folio *folio;
struct page *page;
- struct file *file;
int r;
- file = kvm_gmem_get_file(slot);
- if (!file)
+ if (file != slot->gmem.file) {
+ WARN_ON_ONCE(slot->gmem.file);
return -EFAULT;
+ }
gmem = file->private_data;
-
- if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) {
- r = -EIO;
- goto out_fput;
+ if (xa_load(&gmem->bindings, index) != slot) {
+ WARN_ON_ONCE(xa_load(&gmem->bindings, index));
+ return -EIO;
}
- folio = kvm_gmem_get_folio(file_inode(file), index);
- if (!folio) {
- r = -ENOMEM;
- goto out_fput;
- }
+ folio = kvm_gmem_get_folio(file_inode(file), index, prepare);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
if (folio_test_hwpoison(folio)) {
folio_unlock(folio);
folio_put(folio);
- r = -EHWPOISON;
- goto out_fput;
+ return -EHWPOISON;
}
page = folio_file_page(folio, index);
@@ -525,9 +580,78 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
r = 0;
folio_unlock(folio);
-out_fput:
- fput(file);
return r;
}
+
+int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
+{
+ struct file *file = kvm_gmem_get_file(slot);
+ int r;
+
+ if (!file)
+ return -EFAULT;
+
+ r = __kvm_gmem_get_pfn(file, slot, gfn, pfn, max_order, true);
+ fput(file);
+ return r;
+}
EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
+
+long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
+ kvm_gmem_populate_cb post_populate, void *opaque)
+{
+ struct file *file;
+ struct kvm_memory_slot *slot;
+ void __user *p;
+
+ int ret = 0, max_order;
+ long i;
+
+ lockdep_assert_held(&kvm->slots_lock);
+ if (npages < 0)
+ return -EINVAL;
+
+ slot = gfn_to_memslot(kvm, start_gfn);
+ if (!kvm_slot_can_be_private(slot))
+ return -EINVAL;
+
+ file = kvm_gmem_get_file(slot);
+ if (!file)
+ return -EFAULT;
+
+ filemap_invalidate_lock(file->f_mapping);
+
+ npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
+ for (i = 0; i < npages; i += (1 << max_order)) {
+ gfn_t gfn = start_gfn + i;
+ kvm_pfn_t pfn;
+
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ ret = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &max_order, false);
+ if (ret)
+ break;
+
+ if (!IS_ALIGNED(gfn, (1 << max_order)) ||
+ (npages - i) < (1 << max_order))
+ max_order = 0;
+
+ p = src ? src + i * PAGE_SIZE : NULL;
+ ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
+
+ put_page(pfn_to_page(pfn));
+ if (ret)
+ break;
+ }
+
+ filemap_invalidate_unlock(file->f_mapping);
+
+ fput(file);
+ return ret && !i ? ret : i;
+}
+EXPORT_SYMBOL_GPL(kvm_gmem_populate);
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index 1e567d1f6d3d..162d8ed889f2 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -237,3 +237,27 @@ out:
return r;
}
+
+/*
+ * Allocate empty IRQ routing by default so that additional setup isn't needed
+ * when userspace-driven IRQ routing is activated, and so that kvm->irq_routing
+ * is guaranteed to be non-NULL.
+ */
+int kvm_init_irq_routing(struct kvm *kvm)
+{
+ struct kvm_irq_routing_table *new;
+ int chip_size;
+
+ new = kzalloc(struct_size(new, map, 1), GFP_KERNEL_ACCOUNT);
+ if (!new)
+ return -ENOMEM;
+
+ new->nr_rt_entries = 1;
+
+ chip_size = sizeof(int) * KVM_NR_IRQCHIPS * KVM_IRQCHIP_NUM_PINS;
+ memset(new->chip, -1, chip_size);
+
+ RCU_INIT_POINTER(kvm->irq_routing, new);
+
+ return 0;
+}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1192942aef91..d0788d0a72cc 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1,9 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
+ * Kernel-based Virtual Machine (KVM) Hypervisor
*
* Copyright (C) 2006 Qumranet, Inc.
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
@@ -74,6 +71,7 @@
#define ITOA_MAX_LEN 12
MODULE_AUTHOR("Qumranet");
+MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor");
MODULE_LICENSE("GPL");
/* Architectures should define their poll value according to the halt latency */
@@ -91,8 +89,8 @@ unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
module_param(halt_poll_ns_grow_start, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
-/* Default resets per-vcpu halt_poll_ns . */
-unsigned int halt_poll_ns_shrink;
+/* Default halves per-vcpu halt_poll_ns. */
+unsigned int halt_poll_ns_shrink = 2;
module_param(halt_poll_ns_shrink, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
@@ -110,8 +108,7 @@ static struct kmem_cache *kvm_vcpu_cache;
static __read_mostly struct preempt_ops kvm_preempt_ops;
static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
-struct dentry *kvm_debugfs_dir;
-EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
+static struct dentry *kvm_debugfs_dir;
static const struct file_operations stat_fops_per_vm;
@@ -1145,8 +1142,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
{
struct kvm *kvm = kvm_arch_alloc_vm();
struct kvm_memslots *slots;
- int r = -ENOMEM;
- int i, j;
+ int r, i, j;
if (!kvm)
return ERR_PTR(-ENOMEM);
@@ -1183,12 +1179,18 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d",
task_pid_nr(current));
+ r = -ENOMEM;
if (init_srcu_struct(&kvm->srcu))
goto out_err_no_srcu;
if (init_srcu_struct(&kvm->irq_srcu))
goto out_err_no_irq_srcu;
+ r = kvm_init_irq_routing(kvm);
+ if (r)
+ goto out_err_no_irq_routing;
+
refcount_set(&kvm->users_count, 1);
+
for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
for (j = 0; j < 2; j++) {
slots = &kvm->__memslots[i][j];
@@ -1206,6 +1208,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
}
+ r = -ENOMEM;
for (i = 0; i < KVM_NR_BUSES; i++) {
rcu_assign_pointer(kvm->buses[i],
kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
@@ -1267,6 +1270,8 @@ out_err_no_arch_destroy_vm:
WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
for (i = 0; i < KVM_NR_BUSES; i++)
kfree(kvm_get_bus(kvm, i));
+ kvm_free_irq_routing(kvm);
+out_err_no_irq_routing:
cleanup_srcu_struct(&kvm->irq_srcu);
out_err_no_irq_srcu:
cleanup_srcu_struct(&kvm->srcu);
@@ -4202,12 +4207,21 @@ static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
/*
* Creates some virtual cpus. Good luck creating more than one.
*/
-static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
+static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
{
int r;
struct kvm_vcpu *vcpu;
struct page *page;
+ /*
+ * KVM tracks vCPU IDs as 'int', be kind to userspace and reject
+ * too-large values instead of silently truncating.
+ *
+ * Ensure KVM_MAX_VCPU_IDS isn't pushed above INT_MAX without first
+ * changing the storage type (at the very least, IDs should be tracked
+ * as unsigned ints).
+ */
+ BUILD_BUG_ON(KVM_MAX_VCPU_IDS > INT_MAX);
if (id >= KVM_MAX_VCPU_IDS)
return -EINVAL;
@@ -4375,6 +4389,52 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
return fd;
}
+#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
+static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
+ struct kvm_pre_fault_memory *range)
+{
+ int idx;
+ long r;
+ u64 full_size;
+
+ if (range->flags)
+ return -EINVAL;
+
+ if (!PAGE_ALIGNED(range->gpa) ||
+ !PAGE_ALIGNED(range->size) ||
+ range->gpa + range->size <= range->gpa)
+ return -EINVAL;
+
+ vcpu_load(vcpu);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ full_size = range->size;
+ do {
+ if (signal_pending(current)) {
+ r = -EINTR;
+ break;
+ }
+
+ r = kvm_arch_vcpu_pre_fault_memory(vcpu, range);
+ if (WARN_ON_ONCE(r == 0 || r == -EIO))
+ break;
+
+ if (r < 0)
+ break;
+
+ range->size -= r;
+ range->gpa += r;
+ cond_resched();
+ } while (range->size);
+
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ vcpu_put(vcpu);
+
+ /* Return success if at least one page was mapped successfully. */
+ return full_size == range->size ? r : 0;
+}
+#endif
+
static long kvm_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -4421,7 +4481,10 @@ static long kvm_vcpu_ioctl(struct file *filp,
synchronize_rcu();
put_pid(oldpid);
}
+ vcpu->wants_to_run = !READ_ONCE(vcpu->run->immediate_exit__unsafe);
r = kvm_arch_vcpu_ioctl_run(vcpu);
+ vcpu->wants_to_run = false;
+
trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
break;
}
@@ -4575,6 +4638,20 @@ out_free1:
r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
break;
}
+#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
+ case KVM_PRE_FAULT_MEMORY: {
+ struct kvm_pre_fault_memory range;
+
+ r = -EFAULT;
+ if (copy_from_user(&range, argp, sizeof(range)))
+ break;
+ r = kvm_vcpu_pre_fault_memory(vcpu, &range);
+ /* Pass back leftover range. */
+ if (copy_to_user(argp, &range, sizeof(range)))
+ r = -EFAULT;
+ break;
+ }
+#endif
default:
r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
}
@@ -6287,8 +6364,9 @@ static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
WRITE_ONCE(vcpu->ready, false);
__this_cpu_write(kvm_running_vcpu, vcpu);
- kvm_arch_sched_in(vcpu, cpu);
kvm_arch_vcpu_load(vcpu, cpu);
+
+ WRITE_ONCE(vcpu->scheduled_out, false);
}
static void kvm_sched_out(struct preempt_notifier *pn,
@@ -6296,7 +6374,9 @@ static void kvm_sched_out(struct preempt_notifier *pn,
{
struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
- if (current->on_rq) {
+ WRITE_ONCE(vcpu->scheduled_out, true);
+
+ if (current->on_rq && vcpu->wants_to_run) {
WRITE_ONCE(vcpu->preempted, true);
WRITE_ONCE(vcpu->ready, true);
}
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index e3453e869e92..f0039efb9e1e 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -430,6 +430,9 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len)
int kvm_gpc_activate_hva(struct gfn_to_pfn_cache *gpc, unsigned long uhva, unsigned long len)
{
+ if (!access_ok((void __user *)uhva, len))
+ return -EINVAL;
+
return __kvm_gpc_activate(gpc, INVALID_GPA, uhva, len);
}