summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-08-02 16:11:27 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2016-08-02 16:11:27 -0400
commit221bb8a46e230b9824204ae86537183d9991ff2a (patch)
tree92510d72285b2285be7cb87288bf088cb28af4c1
parentf7b32e4c021fd788f13f6785e17efbc3eb05b351 (diff)
parent23528bb21ee2c9b27f3feddd77a2a3351a8df148 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Paolo Bonzini: - ARM: GICv3 ITS emulation and various fixes. Removal of the old VGIC implementation. - s390: support for trapping software breakpoints, nested virtualization (vSIE), the STHYI opcode, initial extensions for CPU model support. - MIPS: support for MIPS64 hosts (32-bit guests only) and lots of cleanups, preliminary to this and the upcoming support for hardware virtualization extensions. - x86: support for execute-only mappings in nested EPT; reduced vmexit latency for TSC deadline timer (by about 30%) on Intel hosts; support for more than 255 vCPUs. - PPC: bugfixes. * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (302 commits) KVM: PPC: Introduce KVM_CAP_PPC_HTM MIPS: Select HAVE_KVM for MIPS64_R{2,6} MIPS: KVM: Reset CP0_PageMask during host TLB flush MIPS: KVM: Fix ptr->int cast via KVM_GUEST_KSEGX() MIPS: KVM: Sign extend MFC0/RDHWR results MIPS: KVM: Fix 64-bit big endian dynamic translation MIPS: KVM: Fail if ebase doesn't fit in CP0_EBase MIPS: KVM: Use 64-bit CP0_EBase when appropriate MIPS: KVM: Set CP0_Status.KX on MIPS64 MIPS: KVM: Make entry code MIPS64 friendly MIPS: KVM: Use kmap instead of CKSEG0ADDR() MIPS: KVM: Use virt_to_phys() to get commpage PFN MIPS: Fix definition of KSEGX() for 64-bit KVM: VMX: Add VMCS to CPU's loaded VMCSs before VMPTRLD kvm: x86: nVMX: maintain internal copy of current VMCS KVM: PPC: Book3S HV: Save/restore TM state in H_CEDE KVM: PPC: Book3S HV: Pull out TM state save/restore into separate procedures KVM: arm64: vgic-its: Simplify MAPI error handling KVM: arm64: vgic-its: Make vgic_its_cmd_handle_mapi similar to other handlers KVM: arm64: vgic-its: Turn device_id validation into generic ID validation ...
-rw-r--r--Documentation/virtual/kvm/api.txt82
-rw-r--r--Documentation/virtual/kvm/devices/arm-vgic.txt25
-rw-r--r--Documentation/virtual/kvm/devices/vm.txt87
-rw-r--r--Documentation/virtual/kvm/locking.txt4
-rw-r--r--arch/arm/include/asm/kvm_asm.h2
-rw-r--r--arch/arm/include/asm/kvm_host.h27
-rw-r--r--arch/arm/include/asm/kvm_hyp.h3
-rw-r--r--arch/arm/include/asm/kvm_mmu.h15
-rw-r--r--arch/arm/include/asm/pgtable.h4
-rw-r--r--arch/arm/include/asm/virt.h4
-rw-r--r--arch/arm/kvm/Kconfig7
-rw-r--r--arch/arm/kvm/Makefile6
-rw-r--r--arch/arm/kvm/arm.c46
-rw-r--r--arch/arm/kvm/emulate.c2
-rw-r--r--arch/arm/kvm/guest.c2
-rw-r--r--arch/arm/kvm/init.S56
-rw-r--r--arch/arm/kvm/mmu.c142
-rw-r--r--arch/arm/kvm/reset.c2
-rw-r--r--arch/arm64/include/asm/cpufeature.h3
-rw-r--r--arch/arm64/include/asm/kvm_arm.h2
-rw-r--r--arch/arm64/include/asm/kvm_host.h19
-rw-r--r--arch/arm64/include/asm/kvm_hyp.h23
-rw-r--r--arch/arm64/include/asm/kvm_mmu.h96
-rw-r--r--arch/arm64/include/asm/pgtable-hwdef.h1
-rw-r--r--arch/arm64/include/asm/pgtable-prot.h4
-rw-r--r--arch/arm64/include/asm/virt.h4
-rw-r--r--arch/arm64/include/uapi/asm/kvm.h2
-rw-r--r--arch/arm64/kernel/cpufeature.c19
-rw-r--r--arch/arm64/kvm/Kconfig8
-rw-r--r--arch/arm64/kvm/Makefile9
-rw-r--r--arch/arm64/kvm/guest.c2
-rw-r--r--arch/arm64/kvm/hyp-init.S61
-rw-r--r--arch/arm64/kvm/hyp/entry.S19
-rw-r--r--arch/arm64/kvm/hyp/hyp-entry.S15
-rw-r--r--arch/arm64/kvm/hyp/switch.c11
-rw-r--r--arch/arm64/kvm/reset.c38
-rw-r--r--arch/arm64/kvm/sys_regs.c4
-rw-r--r--arch/mips/Kconfig2
-rw-r--r--arch/mips/include/asm/addrspace.h2
-rw-r--r--arch/mips/include/asm/kvm_host.h315
-rw-r--r--arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h2
-rw-r--r--arch/mips/include/asm/mipsregs.h21
-rw-r--r--arch/mips/include/asm/setup.h1
-rw-r--r--arch/mips/include/asm/uasm.h7
-rw-r--r--arch/mips/include/uapi/asm/inst.h114
-rw-r--r--arch/mips/kernel/asm-offsets.c70
-rw-r--r--arch/mips/kernel/branch.c8
-rw-r--r--arch/mips/kernel/traps.c23
-rw-r--r--arch/mips/kvm/Kconfig1
-rw-r--r--arch/mips/kvm/Makefile3
-rw-r--r--arch/mips/kvm/commpage.c2
-rw-r--r--arch/mips/kvm/dyntrans.c182
-rw-r--r--arch/mips/kvm/emulate.c485
-rw-r--r--arch/mips/kvm/entry.c701
-rw-r--r--arch/mips/kvm/fpu.S7
-rw-r--r--arch/mips/kvm/interrupt.c12
-rw-r--r--arch/mips/kvm/interrupt.h14
-rw-r--r--arch/mips/kvm/locore.S605
-rw-r--r--arch/mips/kvm/mips.c367
-rw-r--r--arch/mips/kvm/mmu.c375
-rw-r--r--arch/mips/kvm/stats.c21
-rw-r--r--arch/mips/kvm/tlb.c498
-rw-r--r--arch/mips/kvm/trace.h236
-rw-r--r--arch/mips/kvm/trap_emul.c178
-rw-r--r--arch/mips/math-emu/cp1emu.c8
-rw-r--r--arch/mips/mm/c-r4k.c2
-rw-r--r--arch/mips/mm/uasm-micromips.c13
-rw-r--r--arch/mips/mm/uasm-mips.c11
-rw-r--r--arch/mips/mm/uasm.c24
-rw-r--r--arch/powerpc/include/asm/hmi.h45
-rw-r--r--arch/powerpc/include/asm/paca.h6
-rw-r--r--arch/powerpc/kernel/Makefile2
-rw-r--r--arch/powerpc/kernel/exceptions-64s.S4
-rw-r--r--arch/powerpc/kernel/hmi.c56
-rw-r--r--arch/powerpc/kernel/idle_book3s.S4
-rw-r--r--arch/powerpc/kernel/traps.c5
-rw-r--r--arch/powerpc/kvm/book3s_hv.c41
-rw-r--r--arch/powerpc/kvm/book3s_hv_ras.c176
-rw-r--r--arch/powerpc/kvm/book3s_hv_rmhandlers.S527
-rw-r--r--arch/powerpc/kvm/book3s_pr.c16
-rw-r--r--arch/powerpc/kvm/booke.c4
-rw-r--r--arch/powerpc/kvm/emulate.c1
-rw-r--r--arch/powerpc/kvm/mpic.c3
-rw-r--r--arch/powerpc/kvm/powerpc.c6
-rw-r--r--arch/powerpc/platforms/powernv/opal-wrappers.S2
-rw-r--r--arch/s390/hypfs/hypfs_diag.c377
-rw-r--r--arch/s390/include/asm/cpacf.h10
-rw-r--r--arch/s390/include/asm/diag.h149
-rw-r--r--arch/s390/include/asm/gmap.h82
-rw-r--r--arch/s390/include/asm/kvm_host.h33
-rw-r--r--arch/s390/include/asm/mmu.h11
-rw-r--r--arch/s390/include/asm/mmu_context.h3
-rw-r--r--arch/s390/include/asm/page.h9
-rw-r--r--arch/s390/include/asm/pgalloc.h2
-rw-r--r--arch/s390/include/asm/pgtable.h17
-rw-r--r--arch/s390/include/asm/processor.h2
-rw-r--r--arch/s390/include/asm/sclp.h23
-rw-r--r--arch/s390/include/uapi/asm/kvm.h41
-rw-r--r--arch/s390/include/uapi/asm/sie.h1
-rw-r--r--arch/s390/kernel/diag.c39
-rw-r--r--arch/s390/kvm/Makefile2
-rw-r--r--arch/s390/kvm/diag.c5
-rw-r--r--arch/s390/kvm/gaccess.c387
-rw-r--r--arch/s390/kvm/gaccess.h3
-rw-r--r--arch/s390/kvm/guestdbg.c19
-rw-r--r--arch/s390/kvm/intercept.c33
-rw-r--r--arch/s390/kvm/interrupt.c34
-rw-r--r--arch/s390/kvm/kvm-s390.c404
-rw-r--r--arch/s390/kvm/kvm-s390.h22
-rw-r--r--arch/s390/kvm/priv.c226
-rw-r--r--arch/s390/kvm/sigp.c10
-rw-r--r--arch/s390/kvm/sthyi.c471
-rw-r--r--arch/s390/kvm/trace.h49
-rw-r--r--arch/s390/kvm/vsie.c1091
-rw-r--r--arch/s390/mm/fault.c2
-rw-r--r--arch/s390/mm/gmap.c1574
-rw-r--r--arch/s390/mm/pgalloc.c39
-rw-r--r--arch/s390/mm/pgtable.c209
-rw-r--r--arch/x86/include/asm/kvm_host.h31
-rw-r--r--arch/x86/include/asm/svm.h1
-rw-r--r--arch/x86/include/asm/virtext.h8
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/i8254.c4
-rw-r--r--arch/x86/kvm/iommu.c2
-rw-r--r--arch/x86/kvm/irq_comm.c49
-rw-r--r--arch/x86/kvm/lapic.c539
-rw-r--r--arch/x86/kvm/lapic.h19
-rw-r--r--arch/x86/kvm/mmu.c29
-rw-r--r--arch/x86/kvm/mmu.h5
-rw-r--r--arch/x86/kvm/paging_tmpl.h10
-rw-r--r--arch/x86/kvm/pmu_intel.c2
-rw-r--r--arch/x86/kvm/svm.c8
-rw-r--r--arch/x86/kvm/trace.h15
-rw-r--r--arch/x86/kvm/vmx.c386
-rw-r--r--arch/x86/kvm/x86.c175
-rw-r--r--drivers/s390/char/sclp_early.c12
-rw-r--r--drivers/s390/char/sclp_ocf.c23
-rw-r--r--include/kvm/arm_vgic.h438
-rw-r--r--include/kvm/vgic/vgic.h246
-rw-r--r--include/linux/context_tracking.h38
-rw-r--r--include/linux/irqchip/arm-gic-v3.h213
-rw-r--r--include/linux/kvm_host.h58
-rw-r--r--include/linux/page_ref.h9
-rw-r--r--include/trace/events/kvm.h5
-rw-r--r--include/uapi/linux/kvm.h13
-rw-r--r--mm/gup.c1
-rw-r--r--virt/kvm/Kconfig3
-rw-r--r--virt/kvm/arm/hyp/vgic-v2-sr.c15
-rw-r--r--virt/kvm/arm/vgic-v2-emul.c856
-rw-r--r--virt/kvm/arm/vgic-v2.c274
-rw-r--r--virt/kvm/arm/vgic-v3-emul.c1074
-rw-r--r--virt/kvm/arm/vgic-v3.c279
-rw-r--r--virt/kvm/arm/vgic.c2417
-rw-r--r--virt/kvm/arm/vgic.h140
-rw-r--r--virt/kvm/arm/vgic/vgic-init.c9
-rw-r--r--virt/kvm/arm/vgic/vgic-its.c1500
-rw-r--r--virt/kvm/arm/vgic/vgic-kvm-device.c22
-rw-r--r--virt/kvm/arm/vgic/vgic-mmio-v2.c10
-rw-r--r--virt/kvm/arm/vgic/vgic-mmio-v3.c247
-rw-r--r--virt/kvm/arm/vgic/vgic-mmio.c64
-rw-r--r--virt/kvm/arm/vgic/vgic-mmio.h31
-rw-r--r--virt/kvm/arm/vgic/vgic-v2.c12
-rw-r--r--virt/kvm/arm/vgic/vgic-v3.c29
-rw-r--r--virt/kvm/arm/vgic/vgic.c119
-rw-r--r--virt/kvm/arm/vgic/vgic.h38
-rw-r--r--virt/kvm/irqchip.c7
-rw-r--r--virt/kvm/kvm_main.c110
167 files changed, 11766 insertions, 9273 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index a4482cce4bae..5237e1b2fd66 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1482,6 +1482,11 @@ struct kvm_irq_routing_msi {
__u32 pad;
};
+On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS
+feature of KVM_CAP_X2APIC_API capability is enabled. If it is enabled,
+address_hi bits 31-8 provide bits 31-8 of the destination id. Bits 7-0 of
+address_hi must be zero.
+
struct kvm_irq_routing_s390_adapter {
__u64 ind_addr;
__u64 summary_addr;
@@ -1583,6 +1588,17 @@ struct kvm_lapic_state {
Reads the Local APIC registers and copies them into the input argument. The
data format and layout are the same as documented in the architecture manual.
+If KVM_X2APIC_API_USE_32BIT_IDS feature of KVM_CAP_X2APIC_API is
+enabled, then the format of APIC_ID register depends on the APIC mode
+(reported by MSR_IA32_APICBASE) of its VCPU. x2APIC stores APIC ID in
+the APIC_ID register (bytes 32-35). xAPIC only allows an 8-bit APIC ID
+which is stored in bits 31-24 of the APIC register, or equivalently in
+byte 35 of struct kvm_lapic_state's regs field. KVM_GET_LAPIC must then
+be called after MSR_IA32_APICBASE has been set with KVM_SET_MSR.
+
+If KVM_X2APIC_API_USE_32BIT_IDS feature is disabled, struct kvm_lapic_state
+always uses xAPIC format.
+
4.58 KVM_SET_LAPIC
@@ -1600,6 +1616,10 @@ struct kvm_lapic_state {
Copies the input argument into the Local APIC registers. The data format
and layout are the same as documented in the architecture manual.
+The format of the APIC ID register (bytes 32-35 of struct kvm_lapic_state's
+regs field) depends on the state of the KVM_CAP_X2APIC_API capability.
+See the note in KVM_GET_LAPIC.
+
4.59 KVM_IOEVENTFD
@@ -2032,6 +2052,12 @@ registers, find a list below:
MIPS | KVM_REG_MIPS_CP0_CONFIG5 | 32
MIPS | KVM_REG_MIPS_CP0_CONFIG7 | 32
MIPS | KVM_REG_MIPS_CP0_ERROREPC | 64
+ MIPS | KVM_REG_MIPS_CP0_KSCRATCH1 | 64
+ MIPS | KVM_REG_MIPS_CP0_KSCRATCH2 | 64
+ MIPS | KVM_REG_MIPS_CP0_KSCRATCH3 | 64
+ MIPS | KVM_REG_MIPS_CP0_KSCRATCH4 | 64
+ MIPS | KVM_REG_MIPS_CP0_KSCRATCH5 | 64
+ MIPS | KVM_REG_MIPS_CP0_KSCRATCH6 | 64
MIPS | KVM_REG_MIPS_COUNT_CTL | 64
MIPS | KVM_REG_MIPS_COUNT_RESUME | 64
MIPS | KVM_REG_MIPS_COUNT_HZ | 64
@@ -2156,7 +2182,7 @@ after pausing the vcpu, but before it is resumed.
4.71 KVM_SIGNAL_MSI
Capability: KVM_CAP_SIGNAL_MSI
-Architectures: x86
+Architectures: x86 arm64
Type: vm ioctl
Parameters: struct kvm_msi (in)
Returns: >0 on delivery, 0 if guest blocked the MSI, and -1 on error
@@ -2169,10 +2195,22 @@ struct kvm_msi {
__u32 address_hi;
__u32 data;
__u32 flags;
- __u8 pad[16];
+ __u32 devid;
+ __u8 pad[12];
};
-No flags are defined so far. The corresponding field must be 0.
+flags: KVM_MSI_VALID_DEVID: devid contains a valid value
+devid: If KVM_MSI_VALID_DEVID is set, contains a unique device identifier
+ for the device that wrote the MSI message.
+ For PCI, this is usually a BFD identifier in the lower 16 bits.
+
+The per-VM KVM_CAP_MSI_DEVID capability advertises the need to provide
+the device ID. If this capability is not set, userland cannot rely on
+the kernel to allow the KVM_MSI_VALID_DEVID flag being set.
+
+On x86, address_hi is ignored unless the KVM_CAP_X2APIC_API capability is
+enabled. If it is enabled, address_hi bits 31-8 provide bits 31-8 of the
+destination id. Bits 7-0 of address_hi must be zero.
4.71 KVM_CREATE_PIT2
@@ -2520,6 +2558,7 @@ Parameters: struct kvm_device_attr
Returns: 0 on success, -1 on error
Errors:
ENXIO: The group or attribute is unknown/unsupported for this device
+ or hardware support is missing.
EPERM: The attribute cannot (currently) be accessed this way
(e.g. read-only attribute, or attribute that only makes
sense when the device is in a different state)
@@ -2547,6 +2586,7 @@ Parameters: struct kvm_device_attr
Returns: 0 on success, -1 on error
Errors:
ENXIO: The group or attribute is unknown/unsupported for this device
+ or hardware support is missing.
Tests whether a device supports a particular attribute. A successful
return indicates the attribute is implemented. It does not necessarily
@@ -3803,6 +3843,42 @@ Allows use of runtime-instrumentation introduced with zEC12 processor.
Will return -EINVAL if the machine does not support runtime-instrumentation.
Will return -EBUSY if a VCPU has already been created.
+7.7 KVM_CAP_X2APIC_API
+
+Architectures: x86
+Parameters: args[0] - features that should be enabled
+Returns: 0 on success, -EINVAL when args[0] contains invalid features
+
+Valid feature flags in args[0] are
+
+#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0)
+#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1)
+
+Enabling KVM_X2APIC_API_USE_32BIT_IDS changes the behavior of
+KVM_SET_GSI_ROUTING, KVM_SIGNAL_MSI, KVM_SET_LAPIC, and KVM_GET_LAPIC,
+allowing the use of 32-bit APIC IDs. See KVM_CAP_X2APIC_API in their
+respective sections.
+
+KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK must be enabled for x2APIC to work
+in logical mode or with more than 255 VCPUs. Otherwise, KVM treats 0xff
+as a broadcast even in x2APIC mode in order to support physical x2APIC
+without interrupt remapping. This is undesirable in logical mode,
+where 0xff represents CPUs 0-7 in cluster 0.
+
+7.8 KVM_CAP_S390_USER_INSTR0
+
+Architectures: s390
+Parameters: none
+
+With this capability enabled, all illegal instructions 0x0000 (2 bytes) will
+be intercepted and forwarded to user space. User space can use this
+mechanism e.g. to realize 2-byte software breakpoints. The kernel will
+not inject an operating exception for these instructions, user space has
+to take care of that.
+
+This capability can be enabled dynamically even if VCPUs were already
+created and are running.
+
8. Other capabilities.
----------------------
diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt b/Documentation/virtual/kvm/devices/arm-vgic.txt
index 59541d49e15c..89182f80cc7f 100644
--- a/Documentation/virtual/kvm/devices/arm-vgic.txt
+++ b/Documentation/virtual/kvm/devices/arm-vgic.txt
@@ -4,16 +4,22 @@ ARM Virtual Generic Interrupt Controller (VGIC)
Device types supported:
KVM_DEV_TYPE_ARM_VGIC_V2 ARM Generic Interrupt Controller v2.0
KVM_DEV_TYPE_ARM_VGIC_V3 ARM Generic Interrupt Controller v3.0
+ KVM_DEV_TYPE_ARM_VGIC_ITS ARM Interrupt Translation Service Controller
-Only one VGIC instance may be instantiated through either this API or the
-legacy KVM_CREATE_IRQCHIP api. The created VGIC will act as the VM interrupt
-controller, requiring emulated user-space devices to inject interrupts to the
-VGIC instead of directly to CPUs.
+Only one VGIC instance of the V2/V3 types above may be instantiated through
+either this API or the legacy KVM_CREATE_IRQCHIP api. The created VGIC will
+act as the VM interrupt controller, requiring emulated user-space devices to
+inject interrupts to the VGIC instead of directly to CPUs.
Creating a guest GICv3 device requires a host GICv3 as well.
GICv3 implementations with hardware compatibility support allow a guest GICv2
as well.
+Creating a virtual ITS controller requires a host GICv3 (but does not depend
+on having physical ITS controllers).
+There can be multiple ITS controllers per guest, each of them has to have
+a separate, non-overlapping MMIO region.
+
Groups:
KVM_DEV_ARM_VGIC_GRP_ADDR
Attributes:
@@ -39,6 +45,13 @@ Groups:
Only valid for KVM_DEV_TYPE_ARM_VGIC_V3.
This address needs to be 64K aligned.
+ KVM_VGIC_V3_ADDR_TYPE_ITS (rw, 64-bit)
+ Base address in the guest physical address space of the GICv3 ITS
+ control register frame. The ITS allows MSI(-X) interrupts to be
+ injected into guests. This extension is optional. If the kernel
+ does not support the ITS, the call returns -ENODEV.
+ Only valid for KVM_DEV_TYPE_ARM_VGIC_ITS.
+ This address needs to be 64K aligned and the region covers 128K.
KVM_DEV_ARM_VGIC_GRP_DIST_REGS
Attributes:
@@ -109,8 +122,8 @@ Groups:
KVM_DEV_ARM_VGIC_GRP_CTRL
Attributes:
KVM_DEV_ARM_VGIC_CTRL_INIT
- request the initialization of the VGIC, no additional parameter in
- kvm_device_attr.addr.
+ request the initialization of the VGIC or ITS, no additional parameter
+ in kvm_device_attr.addr.
Errors:
-ENXIO: VGIC not properly configured as required prior to calling
this attribute
diff --git a/Documentation/virtual/kvm/devices/vm.txt b/Documentation/virtual/kvm/devices/vm.txt
index a9ea8774a45f..b6cda49f2ba4 100644
--- a/Documentation/virtual/kvm/devices/vm.txt
+++ b/Documentation/virtual/kvm/devices/vm.txt
@@ -20,7 +20,8 @@ Enables Collaborative Memory Management Assist (CMMA) for the virtual machine.
1.2. ATTRIBUTE: KVM_S390_VM_MEM_CLR_CMMA
Parameters: none
-Returns: 0
+Returns: -EINVAL if CMMA was not enabled
+ 0 otherwise
Clear the CMMA status for all guest pages, so any pages the guest marked
as unused are again used any may not be reclaimed by the host.
@@ -85,6 +86,90 @@ Returns: -EBUSY in case 1 or more vcpus are already activated (only in write
-ENOMEM if not enough memory is available to process the ioctl
0 in case of success
+2.3. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE_FEAT (r/o)
+
+Allows user space to retrieve available cpu features. A feature is available if
+provided by the hardware and supported by kvm. In theory, cpu features could
+even be completely emulated by kvm.
+
+struct kvm_s390_vm_cpu_feat {
+ __u64 feat[16]; # Bitmap (1 = feature available), MSB 0 bit numbering
+};
+
+Parameters: address of a buffer to load the feature list from.
+Returns: -EFAULT if the given address is not accessible from kernel space.
+ 0 in case of success.
+
+2.4. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR_FEAT (r/w)
+
+Allows user space to retrieve or change enabled cpu features for all VCPUs of a
+VM. Features that are not available cannot be enabled.
+
+See 2.3. for a description of the parameter struct.
+
+Parameters: address of a buffer to store/load the feature list from.
+Returns: -EFAULT if the given address is not accessible from kernel space.
+ -EINVAL if a cpu feature that is not available is to be enabled.
+ -EBUSY if at least one VCPU has already been defined.
+ 0 in case of success.
+
+2.5. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE_SUBFUNC (r/o)
+
+Allows user space to retrieve available cpu subfunctions without any filtering
+done by a set IBC. These subfunctions are indicated to the guest VCPU via
+query or "test bit" subfunctions and used e.g. by cpacf functions, plo and ptff.
+
+A subfunction block is only valid if KVM_S390_VM_CPU_MACHINE contains the
+STFL(E) bit introducing the affected instruction. If the affected instruction
+indicates subfunctions via a "query subfunction", the response block is
+contained in the returned struct. If the affected instruction
+indicates subfunctions via a "test bit" mechanism, the subfunction codes are
+contained in the returned struct in MSB 0 bit numbering.
+
+struct kvm_s390_vm_cpu_subfunc {
+ u8 plo[32]; # always valid (ESA/390 feature)
+ u8 ptff[16]; # valid with TOD-clock steering
+ u8 kmac[16]; # valid with Message-Security-Assist
+ u8 kmc[16]; # valid with Message-Security-Assist
+ u8 km[16]; # valid with Message-Security-Assist
+ u8 kimd[16]; # valid with Message-Security-Assist
+ u8 klmd[16]; # valid with Message-Security-Assist
+ u8 pckmo[16]; # valid with Message-Security-Assist-Extension 3
+ u8 kmctr[16]; # valid with Message-Security-Assist-Extension 4
+ u8 kmf[16]; # valid with Message-Security-Assist-Extension 4
+ u8 kmo[16]; # valid with Message-Security-Assist-Extension 4
+ u8 pcc[16]; # valid with Message-Security-Assist-Extension 4
+ u8 ppno[16]; # valid with Message-Security-Assist-Extension 5
+ u8 reserved[1824]; # reserved for future instructions
+};
+
+Parameters: address of a buffer to load the subfunction blocks from.
+Returns: -EFAULT if the given address is not accessible from kernel space.
+ 0 in case of success.
+
+2.6. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR_SUBFUNC (r/w)
+
+Allows user space to retrieve or change cpu subfunctions to be indicated for
+all VCPUs of a VM. This attribute will only be available if kernel and
+hardware support are in place.
+
+The kernel uses the configured subfunction blocks for indication to
+the guest. A subfunction block will only be used if the associated STFL(E) bit
+has not been disabled by user space (so the instruction to be queried is
+actually available for the guest).
+
+As long as no data has been written, a read will fail. The IBC will be used
+to determine available subfunctions in this case, this will guarantee backward
+compatibility.
+
+See 2.5. for a description of the parameter struct.
+
+Parameters: address of a buffer to store/load the subfunction blocks from.
+Returns: -EFAULT if the given address is not accessible from kernel space.
+ -EINVAL when reading, if there was no write yet.
+ -EBUSY if at least one VCPU has already been defined.
+ 0 in case of success.
+
3. GROUP: KVM_S390_VM_TOD
Architectures: s390
diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt
index 19f94a6b9bb0..f2491a8c68b4 100644
--- a/Documentation/virtual/kvm/locking.txt
+++ b/Documentation/virtual/kvm/locking.txt
@@ -89,7 +89,7 @@ In mmu_spte_clear_track_bits():
old_spte = *spte;
/* 'if' condition is satisfied. */
- if (old_spte.Accssed == 1 &&
+ if (old_spte.Accessed == 1 &&
old_spte.W == 0)
spte = 0ull;
on fast page fault path:
@@ -102,7 +102,7 @@ In mmu_spte_clear_track_bits():
old_spte = xchg(spte, 0ull)
- if (old_spte.Accssed == 1)
+ if (old_spte.Accessed == 1)
kvm_set_pfn_accessed(spte.pfn);
if (old_spte.Dirty == 1)
kvm_set_pfn_dirty(spte.pfn);
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
index 3d5a5cd071bd..58faff5f1eb2 100644
--- a/arch/arm/include/asm/kvm_asm.h
+++ b/arch/arm/include/asm/kvm_asm.h
@@ -66,6 +66,8 @@ extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
extern void __init_stage2_translation(void);
+
+extern void __kvm_hyp_reset(unsigned long);
#endif
#endif /* __ARM_KVM_ASM_H__ */
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 96387d477e91..de338d93d11b 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -241,8 +241,7 @@ int kvm_arm_coproc_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *);
int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
int exception_index);
-static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
- phys_addr_t pgd_ptr,
+static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
unsigned long hyp_stack_ptr,
unsigned long vector_ptr)
{
@@ -251,18 +250,13 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
* code. The init code doesn't need to preserve these
* registers as r0-r3 are already callee saved according to
* the AAPCS.
- * Note that we slightly misuse the prototype by casing the
+ * Note that we slightly misuse the prototype by casting the
* stack pointer to a void *.
- *
- * We don't have enough registers to perform the full init in
- * one go. Install the boot PGD first, and then install the
- * runtime PGD, stack pointer and vectors. The PGDs are always
- * passed as the third argument, in order to be passed into
- * r2-r3 to the init code (yes, this is compliant with the
- * PCS!).
- */
- kvm_call_hyp(NULL, 0, boot_pgd_ptr);
+ * The PGDs are always passed as the third argument, in order
+ * to be passed into r2-r3 to the init code (yes, this is
+ * compliant with the PCS!).
+ */
kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr);
}
@@ -272,16 +266,13 @@ static inline void __cpu_init_stage2(void)
kvm_call_hyp(__init_stage2_translation);
}
-static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr,
+static inline void __cpu_reset_hyp_mode(unsigned long vector_ptr,
phys_addr_t phys_idmap_start)
{
- /*
- * TODO
- * kvm_call_reset(boot_pgd_ptr, phys_idmap_start);
- */
+ kvm_call_hyp((void *)virt_to_idmap(__kvm_hyp_reset), vector_ptr);
}
-static inline int kvm_arch_dev_ioctl_check_extension(long ext)
+static inline int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
{
return 0;
}
diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h
index f0e860761380..6eaff28f2ff3 100644
--- a/arch/arm/include/asm/kvm_hyp.h
+++ b/arch/arm/include/asm/kvm_hyp.h
@@ -25,9 +25,6 @@
#define __hyp_text __section(.hyp.text) notrace
-#define kern_hyp_va(v) (v)
-#define hyp_kern_va(v) (v)
-
#define __ACCESS_CP15(CRn, Op1, CRm, Op2) \
"mrc", "mcr", __stringify(p15, Op1, %0, CRn, CRm, Op2), u32
#define __ACCESS_CP15_64(Op1, CRm) \
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index f9a65061130b..3bb803d6814b 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -26,16 +26,7 @@
* We directly use the kernel VA for the HYP, as we can directly share
* the mapping (HTTBR "covers" TTBR1).
*/
-#define HYP_PAGE_OFFSET_MASK UL(~0)
-#define HYP_PAGE_OFFSET PAGE_OFFSET
-#define KERN_TO_HYP(kva) (kva)
-
-/*
- * Our virtual mapping for the boot-time MMU-enable code. Must be
- * shared across all the page-tables. Conveniently, we use the vectors
- * page, where no kernel data will ever be shared with HYP.
- */
-#define TRAMPOLINE_VA UL(CONFIG_VECTORS_BASE)
+#define kern_hyp_va(kva) (kva)
/*
* KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation levels.
@@ -49,9 +40,8 @@
#include <asm/pgalloc.h>
#include <asm/stage2_pgtable.h>
-int create_hyp_mappings(void *from, void *to);
+int create_hyp_mappings(void *from, void *to, pgprot_t prot);
int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
-void free_boot_hyp_pgd(void);
void free_hyp_pgds(void);
void stage2_unmap_vm(struct kvm *kvm);
@@ -65,7 +55,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run);
void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
phys_addr_t kvm_mmu_get_httbr(void);
-phys_addr_t kvm_mmu_get_boot_httbr(void);
phys_addr_t kvm_get_idmap_vector(void);
phys_addr_t kvm_get_idmap_start(void);
int kvm_mmu_init(void);
diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index d62204060cbe..a8d656d9aec7 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -97,7 +97,9 @@ extern pgprot_t pgprot_s2_device;
#define PAGE_READONLY_EXEC _MOD_PROT(pgprot_user, L_PTE_USER | L_PTE_RDONLY)
#define PAGE_KERNEL _MOD_PROT(pgprot_kernel, L_PTE_XN)
#define PAGE_KERNEL_EXEC pgprot_kernel
-#define PAGE_HYP _MOD_PROT(pgprot_kernel, L_PTE_HYP)
+#define PAGE_HYP _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_XN)
+#define PAGE_HYP_EXEC _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY)
+#define PAGE_HYP_RO _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY | L_PTE_XN)
#define PAGE_HYP_DEVICE _MOD_PROT(pgprot_hyp_device, L_PTE_HYP)
#define PAGE_S2 _MOD_PROT(pgprot_s2, L_PTE_S2_RDONLY)
#define PAGE_S2_DEVICE _MOD_PROT(pgprot_s2_device, L_PTE_S2_RDONLY)
diff --git a/arch/arm/include/asm/virt.h b/arch/arm/include/asm/virt.h
index d4ceaf5f299b..a2e75b84e2ae 100644
--- a/arch/arm/include/asm/virt.h
+++ b/arch/arm/include/asm/virt.h
@@ -80,6 +80,10 @@ static inline bool is_kernel_in_hyp_mode(void)
return false;
}
+/* The section containing the hypervisor idmap text */
+extern char __hyp_idmap_text_start[];
+extern char __hyp_idmap_text_end[];
+
/* The section containing the hypervisor text */
extern char __hyp_text_start[];
extern char __hyp_text_end[];
diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
index 02abfff68ee5..95a000515e43 100644
--- a/arch/arm/kvm/Kconfig
+++ b/arch/arm/kvm/Kconfig
@@ -46,13 +46,6 @@ config KVM_ARM_HOST
---help---
Provides host support for ARM processors.
-config KVM_NEW_VGIC
- bool "New VGIC implementation"
- depends on KVM
- default y
- ---help---
- uses the new VGIC implementation
-
source drivers/vhost/Kconfig
endif # VIRTUALIZATION
diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile
index a596b58f6d37..5e28df80dca7 100644
--- a/arch/arm/kvm/Makefile
+++ b/arch/arm/kvm/Makefile
@@ -22,7 +22,6 @@ obj-y += kvm-arm.o init.o interrupts.o
obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o
-ifeq ($(CONFIG_KVM_NEW_VGIC),y)
obj-y += $(KVM)/arm/vgic/vgic.o
obj-y += $(KVM)/arm/vgic/vgic-init.o
obj-y += $(KVM)/arm/vgic/vgic-irqfd.o
@@ -30,9 +29,4 @@ obj-y += $(KVM)/arm/vgic/vgic-v2.o
obj-y += $(KVM)/arm/vgic/vgic-mmio.o
obj-y += $(KVM)/arm/vgic/vgic-mmio-v2.o
obj-y += $(KVM)/arm/vgic/vgic-kvm-device.o
-else
-obj-y += $(KVM)/arm/vgic.o
-obj-y += $(KVM)/arm/vgic-v2.o
-obj-y += $(KVM)/arm/vgic-v2-emul.o
-endif
obj-y += $(KVM)/arm/arch_timer.o
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index f1bde7c4e736..d94bb9093ead 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -20,6 +20,7 @@
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/kvm_host.h>
+#include <linux/list.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/fs.h>
@@ -122,7 +123,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (ret)
goto out_fail_alloc;
- ret = create_hyp_mappings(kvm, kvm + 1);
+ ret = create_hyp_mappings(kvm, kvm + 1, PAGE_HYP);
if (ret)
goto out_free_stage2_pgd;
@@ -201,7 +202,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = KVM_MAX_VCPUS;
break;
default:
- r = kvm_arch_dev_ioctl_check_extension(ext);
+ r = kvm_arch_dev_ioctl_check_extension(kvm, ext);
break;
}
return r;
@@ -239,7 +240,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
if (err)
goto free_vcpu;
- err = create_hyp_mappings(vcpu, vcpu + 1);
+ err = create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP);
if (err)
goto vcpu_uninit;
@@ -377,7 +378,7 @@ void force_vm_exit(const cpumask_t *mask)
/**
* need_new_vmid_gen - check that the VMID is still valid
- * @kvm: The VM's VMID to checkt
+ * @kvm: The VM's VMID to check
*
* return true if there is a new generation of VMIDs being used
*
@@ -616,7 +617,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
* Enter the guest
*/
trace_kvm_entry(*vcpu_pc(vcpu));
- __kvm_guest_enter();
+ guest_enter_irqoff();
vcpu->mode = IN_GUEST_MODE;
ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);
@@ -642,14 +643,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
local_irq_enable();
/*
- * We do local_irq_enable() before calling kvm_guest_exit() so
+ * We do local_irq_enable() before calling guest_exit() so
* that if a timer interrupt hits while running the guest we
* account that tick as being spent in the guest. We enable
- * preemption after calling kvm_guest_exit() so that if we get
+ * preemption after calling guest_exit() so that if we get
* preempted we make sure ticks after that is not counted as
* guest time.
*/
- kvm_guest_exit();
+ guest_exit();
trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
/*
@@ -1039,7 +1040,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
static void cpu_init_hyp_mode(void *dummy)
{
- phys_addr_t boot_pgd_ptr;
phys_addr_t pgd_ptr;
unsigned long hyp_stack_ptr;
unsigned long stack_page;
@@ -1048,13 +1048,12 @@ static void cpu_init_hyp_mode(void *dummy)
/* Switch from the HYP stub to our own HYP init vector */
__hyp_set_vectors(kvm_get_idmap_vector());
- boot_pgd_ptr = kvm_mmu_get_boot_httbr();
pgd_ptr = kvm_mmu_get_httbr();
stack_page = __this_cpu_read(kvm_arm_hyp_stack_page);
hyp_stack_ptr = stack_page + PAGE_SIZE;
vector_ptr = (unsigned long)kvm_ksym_ref(__kvm_hyp_vector);
- __cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr);
+ __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr);
__cpu_init_stage2();
kvm_arm_init_debug();
@@ -1076,15 +1075,9 @@ static void cpu_hyp_reinit(void)
static void cpu_hyp_reset(void)
{
- phys_addr_t boot_pgd_ptr;
- phys_addr_t phys_idmap_start;
-
- if (!is_kernel_in_hyp_mode()) {
- boot_pgd_ptr = kvm_mmu_get_boot_httbr();
- phys_idmap_start = kvm_get_idmap_start();
-
- __cpu_reset_hyp_mode(boot_pgd_ptr, phys_idmap_start);
- }
+ if (!is_kernel_in_hyp_mode())
+ __cpu_reset_hyp_mode(hyp_default_vectors,
+ kvm_get_idmap_start());
}
static void _kvm_arch_hardware_enable(void *discard)
@@ -1294,14 +1287,14 @@ static int init_hyp_mode(void)
* Map the Hyp-code called directly from the host
*/
err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
- kvm_ksym_ref(__hyp_text_end));
+ kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC);
if (err) {
kvm_err("Cannot map world-switch code\n");
goto out_err;
}
err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
- kvm_ksym_ref(__end_rodata));
+ kvm_ksym_ref(__end_rodata), PAGE_HYP_RO);
if (err) {
kvm_err("Cannot map rodata section\n");
goto out_err;
@@ -1312,7 +1305,8 @@ static int init_hyp_mode(void)
*/
for_each_possible_cpu(cpu) {
char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu);
- err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE);
+ err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE,
+ PAGE_HYP);
if (err) {
kvm_err("Cannot map hyp stack\n");
@@ -1324,7 +1318,7 @@ static int init_hyp_mode(void)
kvm_cpu_context_t *cpu_ctxt;
cpu_ctxt = per_cpu_ptr(kvm_host_cpu_state, cpu);
- err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1);
+ err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1, PAGE_HYP);
if (err) {
kvm_err("Cannot map host CPU state: %d\n", err);
@@ -1332,10 +1326,6 @@ static int init_hyp_mode(void)
}
}
-#ifndef CONFIG_HOTPLUG_CPU
- free_boot_hyp_pgd();
-#endif
-
/* set size of VMID supported by CPU */
kvm_vmid_bits = kvm_get_vmid_bits();
kvm_info("%d-bit VMID\n", kvm_vmid_bits);
diff --git a/arch/arm/kvm/emulate.c b/arch/arm/kvm/emulate.c
index a494def3f195..af93e3ffc9f3 100644
--- a/arch/arm/kvm/emulate.c
+++ b/arch/arm/kvm/emulate.c
@@ -210,7 +210,7 @@ bool kvm_condition_valid(struct kvm_vcpu *vcpu)
* @vcpu: The VCPU pointer
*
* When exceptions occur while instructions are executed in Thumb IF-THEN
- * blocks, the ITSTATE field of the CPSR is not advanved (updated), so we have
+ * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have
* to do this little bit of work manually. The fields map like this:
*
* IT[7:0] -> CPSR[26:25],CPSR[15:10]
diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c
index 9093ed0f8b2a..9aca92074f85 100644
--- a/arch/arm/kvm/guest.c
+++ b/arch/arm/kvm/guest.c
@@ -182,7 +182,7 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu)
/**
* kvm_arm_copy_reg_indices - get indices of all registers.
*
- * We do core registers right here, then we apppend coproc regs.
+ * We do core registers right here, then we append coproc regs.
*/
int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
{
diff --git a/arch/arm/kvm/init.S b/arch/arm/kvm/init.S
index 1f9ae17476f9..bf89c919efc1 100644
--- a/arch/arm/kvm/init.S
+++ b/arch/arm/kvm/init.S
@@ -32,23 +32,13 @@
* r2,r3 = Hypervisor pgd pointer
*
* The init scenario is:
- * - We jump in HYP with four parameters: boot HYP pgd, runtime HYP pgd,
- * runtime stack, runtime vectors
- * - Enable the MMU with the boot pgd
- * - Jump to a target into the trampoline page (remember, this is the same
- * physical page!)
- * - Now switch to the runtime pgd (same VA, and still the same physical
- * page!)
+ * - We jump in HYP with 3 parameters: runtime HYP pgd, runtime stack,
+ * runtime vectors
* - Invalidate TLBs
* - Set stack and vectors
+ * - Setup the page tables
+ * - Enable the MMU
* - Profit! (or eret, if you only care about the code).
- *
- * As we only have four registers available to pass parameters (and we
- * need six), we split the init in two phases:
- * - Phase 1: r0 = 0, r1 = 0, r2,r3 contain the boot PGD.
- * Provides the basic HYP init, and enable the MMU.
- * - Phase 2: r0 = ToS, r1 = vectors, r2,r3 contain the runtime PGD.
- * Switches to the runtime PGD, set stack and vectors.
*/
.text
@@ -68,8 +58,11 @@ __kvm_hyp_init:
W(b) .
__do_hyp_init:
- cmp r0, #0 @ We have a SP?
- bne phase2 @ Yes, second stage init
+ @ Set stack pointer
+ mov sp, r0
+
+ @ Set HVBAR to point to the HYP vectors
+ mcr p15, 4, r1, c12, c0, 0 @ HVBAR
@ Set the HTTBR to point to the hypervisor PGD pointer passed
mcrr p15, 4, rr_lo_hi(r2, r3), c2
@@ -114,34 +107,25 @@ __do_hyp_init:
THUMB( ldr r2, =(HSCTLR_M | HSCTLR_A | HSCTLR_TE) )
orr r1, r1, r2
orr r0, r0, r1
- isb
mcr p15, 4, r0, c1, c0, 0 @ HSCR
+ isb
- @ End of init phase-1
eret
-phase2:
- @ Set stack pointer
- mov sp, r0
-
- @ Set HVBAR to point to the HYP vectors
- mcr p15, 4, r1, c12, c0, 0 @ HVBAR
-
- @ Jump to the trampoline page
- ldr r0, =TRAMPOLINE_VA
- adr r1, target
- bfi r0, r1, #0, #PAGE_SHIFT
- ret r0
+ @ r0 : stub vectors address
+ENTRY(__kvm_hyp_reset)
+ /* We're now in idmap, disable MMU */
+ mrc p15, 4, r1, c1, c0, 0 @ HSCTLR
+ ldr r2, =(HSCTLR_M | HSCTLR_A | HSCTLR_C | HSCTLR_I)
+ bic r1, r1, r2
+ mcr p15, 4, r1, c1, c0, 0 @ HSCTLR
-target: @ We're now in the trampoline code, switch page tables
- mcrr p15, 4, rr_lo_hi(r2, r3), c2
+ /* Install stub vectors */
+ mcr p15, 4, r0, c12, c0, 0 @ HVBAR
isb
- @ Invalidate the old TLBs
- mcr p15, 4, r0, c8, c7, 0 @ TLBIALLH
- dsb ish
-
eret
+ENDPROC(__kvm_hyp_reset)
.ltorg
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 45c43aecb8f2..bda27b6b1aa2 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -32,8 +32,6 @@
#include "trace.h"
-extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[];
-
static pgd_t *boot_hyp_pgd;
static pgd_t *hyp_pgd;
static pgd_t *merged_hyp_pgd;
@@ -484,28 +482,6 @@ static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
}
/**
- * free_boot_hyp_pgd - free HYP boot page tables
- *
- * Free the HYP boot page tables. The bounce page is also freed.
- */
-void free_boot_hyp_pgd(void)
-{
- mutex_lock(&kvm_hyp_pgd_mutex);
-
- if (boot_hyp_pgd) {
- unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
- unmap_hyp_range(boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
- free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
- boot_hyp_pgd = NULL;
- }
-
- if (hyp_pgd)
- unmap_hyp_range(hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
-
- mutex_unlock(&kvm_hyp_pgd_mutex);
-}
-
-/**
* free_hyp_pgds - free Hyp-mode page tables
*
* Assumes hyp_pgd is a page table used strictly in Hyp-mode and
@@ -519,15 +495,20 @@ void free_hyp_pgds(void)
{
unsigned long addr;
- free_boot_hyp_pgd();
-
mutex_lock(&kvm_hyp_pgd_mutex);
+ if (boot_hyp_pgd) {
+ unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
+ free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
+ boot_hyp_pgd = NULL;
+ }
+
if (hyp_pgd) {
+ unmap_hyp_range(hyp_pgd, hyp_idmap_start, PAGE_SIZE);
for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
- unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
+ unmap_hyp_range(hyp_pgd, kern_hyp_va(addr), PGDIR_SIZE);
for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
- unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
+ unmap_hyp_range(hyp_pgd, kern_hyp_va(addr), PGDIR_SIZE);
free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
hyp_pgd = NULL;
@@ -679,17 +660,18 @@ static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
* create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
* @from: The virtual kernel start address of the range
* @to: The virtual kernel end address of the range (exclusive)
+ * @prot: The protection to be applied to this range
*
* The same virtual address as the kernel virtual address is also used
* in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
* physical pages.
*/
-int create_hyp_mappings(void *from, void *to)
+int create_hyp_mappings(void *from, void *to, pgprot_t prot)
{
phys_addr_t phys_addr;
unsigned long virt_addr;
- unsigned long start = KERN_TO_HYP((unsigned long)from);
- unsigned long end = KERN_TO_HYP((unsigned long)to);
+ unsigned long start = kern_hyp_va((unsigned long)from);
+ unsigned long end = kern_hyp_va((unsigned long)to);
if (is_kernel_in_hyp_mode())
return 0;
@@ -704,7 +686,7 @@ int create_hyp_mappings(void *from, void *to)
err = __create_hyp_mappings(hyp_pgd, virt_addr,
virt_addr + PAGE_SIZE,
__phys_to_pfn(phys_addr),
- PAGE_HYP);
+ prot);
if (err)
return err;
}
@@ -723,8 +705,8 @@ int create_hyp_mappings(void *from, void *to)
*/
int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
{
- unsigned long start = KERN_TO_HYP((unsigned long)from);
- unsigned long end = KERN_TO_HYP((unsigned long)to);
+ unsigned long start = kern_hyp_va((unsigned long)from);
+ unsigned long end = kern_hyp_va((unsigned long)to);
if (is_kernel_in_hyp_mode())
return 0;
@@ -1687,14 +1669,6 @@ phys_addr_t kvm_mmu_get_httbr(void)
return virt_to_phys(hyp_pgd);
}
-phys_addr_t kvm_mmu_get_boot_httbr(void)
-{
- if (__kvm_cpu_uses_extended_idmap())
- return virt_to_phys(merged_hyp_pgd);
- else
- return virt_to_phys(boot_hyp_pgd);
-}
-
phys_addr_t kvm_get_idmap_vector(void)
{
return hyp_idmap_vector;
@@ -1705,6 +1679,22 @@ phys_addr_t kvm_get_idmap_start(void)
return hyp_idmap_start;
}
+static int kvm_map_idmap_text(pgd_t *pgd)
+{
+ int err;
+
+ /* Create the idmap in the boot page tables */
+ err = __create_hyp_mappings(pgd,
+ hyp_idmap_start, hyp_idmap_end,
+ __phys_to_pfn(hyp_idmap_start),
+ PAGE_HYP_EXEC);
+ if (err)
+ kvm_err("Failed to idmap %lx-%lx\n",
+ hyp_idmap_start, hyp_idmap_end);
+
+ return err;
+}
+
int kvm_mmu_init(void)
{
int err;
@@ -1719,28 +1709,41 @@ int kvm_mmu_init(void)
*/
BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
- hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
- boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
+ kvm_info("IDMAP page: %lx\n", hyp_idmap_start);
+ kvm_info("HYP VA range: %lx:%lx\n",
+ kern_hyp_va(PAGE_OFFSET), kern_hyp_va(~0UL));
- if (!hyp_pgd || !boot_hyp_pgd) {
- kvm_err("Hyp mode PGD not allocated\n");
- err = -ENOMEM;
+ if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
+ hyp_idmap_start < kern_hyp_va(~0UL)) {
+ /*
+ * The idmap page is intersecting with the VA space,
+ * it is not safe to continue further.
+ */
+ kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
+ err = -EINVAL;
goto out;
}
- /* Create the idmap in the boot page tables */
- err = __create_hyp_mappings(boot_hyp_pgd,
- hyp_idmap_start, hyp_idmap_end,
- __phys_to_pfn(hyp_idmap_start),
- PAGE_HYP);
-
- if (err) {
- kvm_err("Failed to idmap %lx-%lx\n",
- hyp_idmap_start, hyp_idmap_end);
+ hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
+ if (!hyp_pgd) {
+ kvm_err("Hyp mode PGD not allocated\n");
+ err = -ENOMEM;
goto out;
}
if (__kvm_cpu_uses_extended_idmap()) {
+ boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ hyp_pgd_order);
+ if (!boot_hyp_pgd) {
+ kvm_err("Hyp boot PGD not allocated\n");
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = kvm_map_idmap_text(boot_hyp_pgd);
+ if (err)
+ goto out;
+
merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
if (!merged_hyp_pgd) {
kvm_err("Failed to allocate extra HYP pgd\n");
@@ -1748,29 +1751,10 @@ int kvm_mmu_init(void)
}
__kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
hyp_idmap_start);
- return 0;
- }
-
- /* Map the very same page at the trampoline VA */
- err = __create_hyp_mappings(boot_hyp_pgd,
- TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
- __phys_to_pfn(hyp_idmap_start),
- PAGE_HYP);
- if (err) {
- kvm_err("Failed to map trampoline @%lx into boot HYP pgd\n",
- TRAMPOLINE_VA);
- goto out;
- }
-
- /* Map the same page again into the runtime page tables */
- err = __create_hyp_mappings(hyp_pgd,
- TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
- __phys_to_pfn(hyp_idmap_start),
- PAGE_HYP);
- if (err) {
- kvm_err("Failed to map trampoline @%lx into runtime HYP pgd\n",
- TRAMPOLINE_VA);
- goto out;
+ } else {
+ err = kvm_map_idmap_text(hyp_pgd);
+ if (err)
+ goto out;
}
return 0;
diff --git a/arch/arm/kvm/reset.c b/arch/arm/kvm/reset.c
index 0048b5a62a50..4b5e802e57d1 100644
--- a/arch/arm/kvm/reset.c
+++ b/arch/arm/kvm/reset.c
@@ -52,7 +52,7 @@ static const struct kvm_irq_level cortexa_vtimer_irq = {
* @vcpu: The VCPU pointer
*
* This function finds the right table above and sets the registers on the
- * virtual CPU struct to their architectually defined reset values.
+ * virtual CPU struct to their architecturally defined reset values.
*/
int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
{
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 49dd1bd3ea50..7099f26e3702 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -36,8 +36,9 @@
#define ARM64_HAS_VIRT_HOST_EXTN 11
#define ARM64_WORKAROUND_CAVIUM_27456 12
#define ARM64_HAS_32BIT_EL0 13
+#define ARM64_HYP_OFFSET_LOW 14
-#define ARM64_NCAPS 14
+#define ARM64_NCAPS 15
#ifndef __ASSEMBLY__
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 2cdb6b551ac6..4b5c977af465 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -178,7 +178,7 @@
/* Hyp System Trap Register */
#define HSTR_EL2_T(x) (1 << x)
-/* Hyp Coproccessor Trap Register Shifts */
+/* Hyp Coprocessor Trap Register Shifts */
#define CPTR_EL2_TFP_SHIFT 10
/* Hyp Coprocessor Trap Register */
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 49095fc4b482..3eda975837d0 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -47,8 +47,7 @@
int __attribute_const__ kvm_target_cpu(void);
int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
-int kvm_arch_dev_ioctl_check_extension(long ext);
-unsigned long kvm_hyp_reset_entry(void);
+int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext);
void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start);
struct kvm_arch {
@@ -348,8 +347,7 @@ int kvm_perf_teardown(void);
struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
-static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
- phys_addr_t pgd_ptr,
+static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
unsigned long hyp_stack_ptr,
unsigned long vector_ptr)
{
@@ -357,19 +355,14 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
* Call initialization code, and switch to the full blown
* HYP code.
*/
- __kvm_call_hyp((void *)boot_pgd_ptr, pgd_ptr,
- hyp_stack_ptr, vector_ptr);
+ __kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr);
}
-static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr,
+void __kvm_hyp_teardown(void);
+static inline void __cpu_reset_hyp_mode(unsigned long vector_ptr,
phys_addr_t phys_idmap_start)
{
- /*
- * Call reset code, and switch back to stub hyp vectors.
- * Uses __kvm_call_hyp() to avoid kaslr's kvm_ksym_ref() translation.
- */
- __kvm_call_hyp((void *)kvm_hyp_reset_entry(),
- boot_pgd_ptr, phys_idmap_start);
+ kvm_call_hyp(__kvm_hyp_teardown, phys_idmap_start);
}
static inline void kvm_arch_hardware_unsetup(void) {}
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 44eaff70da6a..cff510574fae 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -25,29 +25,6 @@
#define __hyp_text __section(.hyp.text) notrace
-static inline unsigned long __kern_hyp_va(unsigned long v)
-{
- asm volatile(ALTERNATIVE("and %0, %0, %1",
- "nop",
- ARM64_HAS_VIRT_HOST_EXTN)
- : "+r" (v) : "i" (HYP_PAGE_OFFSET_MASK));
- return v;
-}
-
-#define kern_hyp_va(v) (typeof(v))(__kern_hyp_va((unsigned long)(v)))
-
-static inline unsigned long __hyp_kern_va(unsigned long v)
-{
- u64 offset = PAGE_OFFSET - HYP_PAGE_OFFSET;
- asm volatile(ALTERNATIVE("add %0, %0, %1",
- "nop",
- ARM64_HAS_VIRT_HOST_EXTN)
- : "+r" (v) : "r" (offset));
- return v;
-}
-
-#define hyp_kern_va(v) (typeof(v))(__hyp_kern_va((unsigned long)(v)))
-
#define read_sysreg_elx(r,nvh,vh) \
({ \
u64 reg; \
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index f05ac27d033e..b6bb83400cd8 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -29,21 +29,48 @@
*
* Instead, give the HYP mode its own VA region at a fixed offset from
* the kernel by just masking the top bits (which are all ones for a
- * kernel address).
+ * kernel address). We need to find out how many bits to mask.
*
- * ARMv8.1 (using VHE) does have a TTBR1_EL2, and doesn't use these
- * macros (the entire kernel runs at EL2).
+ * We want to build a set of page tables that cover both parts of the
+ * idmap (the trampoline page used to initialize EL2), and our normal
+ * runtime VA space, at the same time.
+ *
+ * Given that the kernel uses VA_BITS for its entire address space,
+ * and that half of that space (VA_BITS - 1) is used for the linear
+ * mapping, we can also limit the EL2 space to (VA_BITS - 1).
+ *
+ * The main question is "Within the VA_BITS space, does EL2 use the
+ * top or the bottom half of that space to shadow the kernel's linear
+ * mapping?". As we need to idmap the trampoline page, this is
+ * determined by the range in which this page lives.
+ *
+ * If the page is in the bottom half, we have to use the top half. If
+ * the page is in the top half, we have to use the bottom half:
+ *
+ * T = __virt_to_phys(__hyp_idmap_text_start)
+ * if (T & BIT(VA_BITS - 1))
+ * HYP_VA_MIN = 0 //idmap in upper half
+ * else
+ * HYP_VA_MIN = 1 << (VA_BITS - 1)
+ * HYP_VA_MAX = HYP_VA_MIN + (1 << (VA_BITS - 1)) - 1
+ *
+ * This of course assumes that the trampoline page exists within the
+ * VA_BITS range. If it doesn't, then it means we're in the odd case
+ * where the kernel idmap (as well as HYP) uses more levels than the
+ * kernel runtime page tables (as seen when the kernel is configured
+ * for 4k pages, 39bits VA, and yet memory lives just above that
+ * limit, forcing the idmap to use 4 levels of page tables while the
+ * kernel itself only uses 3). In this particular case, it doesn't
+ * matter which side of VA_BITS we use, as we're guaranteed not to
+ * conflict with anything.
+ *
+ * When using VHE, there are no separate hyp mappings and all KVM
+ * functionality is already mapped as part of the main kernel
+ * mappings, and none of this applies in that case.
*/
-#define HYP_PAGE_OFFSET_SHIFT VA_BITS
-#define HYP_PAGE_OFFSET_MASK ((UL(1) << HYP_PAGE_OFFSET_SHIFT) - 1)
-#define HYP_PAGE_OFFSET (PAGE_OFFSET & HYP_PAGE_OFFSET_MASK)
-/*
- * Our virtual mapping for the idmap-ed MMU-enable code. Must be
- * shared across all the page-tables. Conveniently, we use the last
- * possible page, where no kernel mapping will ever exist.
- */
-#define TRAMPOLINE_VA (HYP_PAGE_OFFSET_MASK & PAGE_MASK)
+#define HYP_PAGE_OFFSET_HIGH_MASK ((UL(1) << VA_BITS) - 1)
+#define HYP_PAGE_OFFSET_LOW_MASK ((UL(1) << (VA_BITS - 1)) - 1)
#ifdef __ASSEMBLY__
@@ -53,13 +80,33 @@
/*
* Convert a kernel VA into a HYP VA.
* reg: VA to be converted.
+ *
+ * This generates the following sequences:
+ * - High mask:
+ * and x0, x0, #HYP_PAGE_OFFSET_HIGH_MASK
+ * nop
+ * - Low mask:
+ * and x0, x0, #HYP_PAGE_OFFSET_HIGH_MASK
+ * and x0, x0, #HYP_PAGE_OFFSET_LOW_MASK
+ * - VHE:
+ * nop
+ * nop
+ *
+ * The "low mask" version works because the mask is a strict subset of
+ * the "high mask", hence performing the first mask for nothing.
+ * Should be completely invisible on any viable CPU.
*/
.macro kern_hyp_va reg
-alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
- and \reg, \reg, #HYP_PAGE_OFFSET_MASK
+alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
+ and \reg, \reg, #HYP_PAGE_OFFSET_HIGH_MASK
alternative_else
nop
alternative_endif
+alternative_if_not ARM64_HYP_OFFSET_LOW
+ nop
+alternative_else
+ and \reg, \reg, #HYP_PAGE_OFFSET_LOW_MASK
+alternative_endif
.endm
#else
@@ -70,7 +117,22 @@ alternative_endif
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
-#define KERN_TO_HYP(kva) ((unsigned long)kva - PAGE_OFFSET + HYP_PAGE_OFFSET)
+static inline unsigned long __kern_hyp_va(unsigned long v)
+{
+ asm volatile(ALTERNATIVE("and %0, %0, %1",
+ "nop",
+ ARM64_HAS_VIRT_HOST_EXTN)
+ : "+r" (v)
+ : "i" (HYP_PAGE_OFFSET_HIGH_MASK));
+ asm volatile(ALTERNATIVE("nop",
+ "and %0, %0, %1",
+ ARM64_HYP_OFFSET_LOW)
+ : "+r" (v)
+ : "i" (HYP_PAGE_OFFSET_LOW_MASK));
+ return v;
+}
+
+#define kern_hyp_va(v) (typeof(v))(__kern_hyp_va((unsigned long)(v)))
/*
* We currently only support a 40bit IPA.
@@ -81,9 +143,8 @@ alternative_endif
#include <asm/stage2_pgtable.h>
-int create_hyp_mappings(void *from, void *to);
+int create_hyp_mappings(void *from, void *to, pgprot_t prot);
int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
-void free_boot_hyp_pgd(void);
void free_hyp_pgds(void);
void stage2_unmap_vm(struct kvm *kvm);
@@ -97,7 +158,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run);
void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
phys_addr_t kvm_mmu_get_httbr(void);
-phys_addr_t kvm_mmu_get_boot_httbr(void);
phys_addr_t kvm_get_idmap_vector(void);
phys_addr_t kvm_get_idmap_start(void);
int kvm_mmu_init(void);
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index 2813748e2f24..c3ae239db3ee 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -164,6 +164,7 @@
#define PTE_CONT (_AT(pteval_t, 1) << 52) /* Contiguous range */
#define PTE_PXN (_AT(pteval_t, 1) << 53) /* Privileged XN */
#define PTE_UXN (_AT(pteval_t, 1) << 54) /* User XN */
+#define PTE_HYP_XN (_AT(pteval_t, 1) << 54) /* HYP XN */
/*
* AttrIndx[2:0] encoding (mapping attributes defined in the MAIR* registers).
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index 29fcb33ab401..39f5252673f7 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -55,7 +55,9 @@
#define PAGE_KERNEL_EXEC __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE)
#define PAGE_KERNEL_EXEC_CONT __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_CONT)
-#define PAGE_HYP __pgprot(_PAGE_DEFAULT | PTE_HYP)
+#define PAGE_HYP __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_HYP_XN)
+#define PAGE_HYP_EXEC __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY)
+#define PAGE_HYP_RO __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN)
#define PAGE_HYP_DEVICE __pgprot(PROT_DEVICE_nGnRE | PTE_HYP)
#define PAGE_S2 __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY)
diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h
index bbc6a8cf83f1..1788545f25bc 100644
--- a/arch/arm64/include/asm/virt.h
+++ b/arch/arm64/include/asm/virt.h
@@ -87,6 +87,10 @@ extern void verify_cpu_run_el(void);
static inline void verify_cpu_run_el(void) {}
#endif
+/* The section containing the hypervisor idmap text */
+extern char __hyp_idmap_text_start[];
+extern char __hyp_idmap_text_end[];
+
/* The section containing the hypervisor text */
extern char __hyp_text_start[];
extern char __hyp_text_end[];
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index f209ea151dca..3051f86a9b5f 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -87,9 +87,11 @@ struct kvm_regs {
/* Supported VGICv3 address types */
#define KVM_VGIC_V3_ADDR_TYPE_DIST 2
#define KVM_VGIC_V3_ADDR_TYPE_REDIST 3
+#define KVM_VGIC_ITS_ADDR_TYPE 4
#define KVM_VGIC_V3_DIST_SIZE SZ_64K
#define KVM_VGIC_V3_REDIST_SIZE (2 * SZ_64K)
+#define KVM_VGIC_V3_ITS_SIZE (2 * SZ_64K)
#define KVM_ARM_VCPU_POWER_OFF 0 /* CPU is started in OFF state */
#define KVM_ARM_VCPU_EL1_32BIT 1 /* CPU running a 32bit VM */
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 916d27ad79c1..62272eac1352 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -726,6 +726,19 @@ static bool runs_at_el2(const struct arm64_cpu_capabilities *entry, int __unused
return is_kernel_in_hyp_mode();
}
+static bool hyp_offset_low(const struct arm64_cpu_capabilities *entry,
+ int __unused)
+{
+ phys_addr_t idmap_addr = virt_to_phys(__hyp_idmap_text_start);
+
+ /*
+ * Activate the lower HYP offset only if:
+ * - the idmap doesn't clash with it,
+ * - the kernel is not running at EL2.
+ */
+ return idmap_addr > GENMASK(VA_BITS - 2, 0) && !is_kernel_in_hyp_mode();
+}
+
static const struct arm64_cpu_capabilities arm64_features[] = {
{
.desc = "GIC system register CPU interface",
@@ -803,6 +816,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.field_pos = ID_AA64PFR0_EL0_SHIFT,
.min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT,
},
+ {
+ .desc = "Reduced HYP mapping offset",
+ .capability = ARM64_HYP_OFFSET_LOW,
+ .def_scope = SCOPE_SYSTEM,
+ .matches = hyp_offset_low,
+ },
{},
};
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index c4f26ef91e77..9d2eff0b3ad3 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -36,6 +36,7 @@ config KVM
select HAVE_KVM_IRQFD
select KVM_ARM_VGIC_V3
select KVM_ARM_PMU if HW_PERF_EVENTS
+ select HAVE_KVM_MSI
---help---
Support hosting virtualized guest machines.
We don't support KVM with 16K page tables yet, due to the multiple
@@ -54,13 +55,6 @@ config KVM_ARM_PMU
Adds support for a virtual Performance Monitoring Unit (PMU) in
virtual machines.
-config KVM_NEW_VGIC
- bool "New VGIC implementation"
- depends on KVM
- default y
- ---help---
- uses the new VGIC implementation
-
source drivers/vhost/Kconfig
endif # VIRTUALIZATION
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index a7a958ca29d5..a5b96642a9cb 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -20,7 +20,6 @@ kvm-$(CONFIG_KVM_ARM_HOST) += emulate.o inject_fault.o regmap.o
kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o
-ifeq ($(CONFIG_KVM_NEW_VGIC),y)
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-init.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-irqfd.o
@@ -30,12 +29,6 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v2.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v3.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-kvm-device.o
-else
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2-emul.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3-emul.o
-endif
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-its.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o
kvm-$(CONFIG_KVM_ARM_PMU) += $(KVM)/arm/pmu.o
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 32fad75bb9ff..3f9e15722473 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -211,7 +211,7 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu)
/**
* kvm_arm_copy_reg_indices - get indices of all registers.
*
- * We do core registers right here, then we apppend system regs.
+ * We do core registers right here, then we append system regs.
*/
int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
{
diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S
index a873a6d8be90..6b29d3d9e1f2 100644
--- a/arch/arm64/kvm/hyp-init.S
+++ b/arch/arm64/kvm/hyp-init.S
@@ -53,10 +53,9 @@ __invalid:
b .
/*
- * x0: HYP boot pgd
- * x1: HYP pgd
- * x2: HYP stack
- * x3: HYP vectors
+ * x0: HYP pgd
+ * x1: HYP stack
+ * x2: HYP vectors
*/
__do_hyp_init:
@@ -110,71 +109,27 @@ __do_hyp_init:
msr sctlr_el2, x4
isb
- /* Skip the trampoline dance if we merged the boot and runtime PGDs */
- cmp x0, x1
- b.eq merged
-
- /* MMU is now enabled. Get ready for the trampoline dance */
- ldr x4, =TRAMPOLINE_VA
- adr x5, target
- bfi x4, x5, #0, #PAGE_SHIFT
- br x4
-
-target: /* We're now in the trampoline code, switch page tables */
- msr ttbr0_el2, x1
- isb
-
- /* Invalidate the old TLBs */
- tlbi alle2
- dsb sy
-
-merged:
/* Set the stack and new vectors */
+ kern_hyp_va x1
+ mov sp, x1
kern_hyp_va x2
- mov sp, x2
- kern_hyp_va x3
- msr vbar_el2, x3
+ msr vbar_el2, x2
/* Hello, World! */
eret
ENDPROC(__kvm_hyp_init)
/*
- * Reset kvm back to the hyp stub. This is the trampoline dance in
- * reverse. If kvm used an extended idmap, __extended_idmap_trampoline
- * calls this code directly in the idmap. In this case switching to the
- * boot tables is a no-op.
- *
- * x0: HYP boot pgd
- * x1: HYP phys_idmap_start
+ * Reset kvm back to the hyp stub.
*/
ENTRY(__kvm_hyp_reset)
- /* We're in trampoline code in VA, switch back to boot page tables */
- msr ttbr0_el2, x0
- isb
-
- /* Ensure the PA branch doesn't find a stale tlb entry or stale code. */
- ic iallu
- tlbi alle2
- dsb sy
- isb
-
- /* Branch into PA space */
- adr x0, 1f
- bfi x1, x0, #0, #PAGE_SHIFT
- br x1
-
/* We're now in idmap, disable MMU */
-1: mrs x0, sctlr_el2
+ mrs x0, sctlr_el2
ldr x1, =SCTLR_ELx_FLAGS
bic x0, x0, x1 // Clear SCTL_M and etc
msr sctlr_el2, x0
isb
- /* Invalidate the old TLBs */
- tlbi alle2
- dsb sy
-
/* Install stub vectors */
adr_l x0, __hyp_stub_vectors
msr vbar_el2, x0
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index 70254a65bd5b..ce9e5e5f28cf 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -164,22 +164,3 @@ alternative_endif
eret
ENDPROC(__fpsimd_guest_restore)
-
-/*
- * When using the extended idmap, we don't have a trampoline page we can use
- * while we switch pages tables during __kvm_hyp_reset. Accessing the idmap
- * directly would be ideal, but if we're using the extended idmap then the
- * idmap is located above HYP_PAGE_OFFSET, and the address will be masked by
- * kvm_call_hyp using kern_hyp_va.
- *
- * x0: HYP boot pgd
- * x1: HYP phys_idmap_start
- */
-ENTRY(__extended_idmap_trampoline)
- mov x4, x1
- adr_l x3, __kvm_hyp_reset
-
- /* insert __kvm_hyp_reset()s offset into phys_idmap_start */
- bfi x4, x3, #0, #PAGE_SHIFT
- br x4
-ENDPROC(__extended_idmap_trampoline)
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index 2d87f36d5cb4..f6d9694ae3b1 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -62,6 +62,21 @@ ENTRY(__vhe_hyp_call)
isb
ret
ENDPROC(__vhe_hyp_call)
+
+/*
+ * Compute the idmap address of __kvm_hyp_reset based on the idmap
+ * start passed as a parameter, and jump there.
+ *
+ * x0: HYP phys_idmap_start
+ */
+ENTRY(__kvm_hyp_teardown)
+ mov x4, x0
+ adr_l x3, __kvm_hyp_reset
+
+ /* insert __kvm_hyp_reset()s offset into phys_idmap_start */
+ bfi x4, x3, #0, #PAGE_SHIFT
+ br x4
+ENDPROC(__kvm_hyp_teardown)
el1_sync: // Guest trapped into EL2
save_x0_to_x3
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index 4373997d1a70..ae7855f16ec2 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -299,9 +299,16 @@ static const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%
static void __hyp_text __hyp_call_panic_nvhe(u64 spsr, u64 elr, u64 par)
{
- unsigned long str_va = (unsigned long)__hyp_panic_string;
+ unsigned long str_va;
- __hyp_do_panic(hyp_kern_va(str_va),
+ /*
+ * Force the panic string to be loaded from the literal pool,
+ * making sure it is a kernel address and not a PC-relative
+ * reference.
+ */
+ asm volatile("ldr %0, =__hyp_panic_string" : "=r" (str_va));
+
+ __hyp_do_panic(str_va,
spsr, elr,
read_sysreg(esr_el2), read_sysreg_el2(far),
read_sysreg(hpfar_el2), par,
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index b1ad730e1567..5bc460884639 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -65,7 +65,7 @@ static bool cpu_has_32bit_el1(void)
* We currently assume that the number of HW registers is uniform
* across all CPUs (see cpuinfo_sanity_check).
*/
-int kvm_arch_dev_ioctl_check_extension(long ext)
+int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
{
int r;
@@ -86,6 +86,12 @@ int kvm_arch_dev_ioctl_check_extension(long ext)
case KVM_CAP_VCPU_ATTRIBUTES:
r = 1;
break;
+ case KVM_CAP_MSI_DEVID:
+ if (!kvm)
+ r = -EINVAL;
+ else
+ r = kvm->arch.vgic.msis_require_devid;
+ break;
default:
r = 0;
}
@@ -98,7 +104,7 @@ int kvm_arch_dev_ioctl_check_extension(long ext)
* @vcpu: The VCPU pointer
*
* This function finds the right table above and sets the registers on
- * the virtual CPU struct to their architectually defined reset
+ * the virtual CPU struct to their architecturally defined reset
* values.
*/
int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
@@ -132,31 +138,3 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
/* Reset timer */
return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
}
-
-extern char __hyp_idmap_text_start[];
-
-unsigned long kvm_hyp_reset_entry(void)
-{
- if (!__kvm_cpu_uses_extended_idmap()) {
- unsigned long offset;
-
- /*
- * Find the address of __kvm_hyp_reset() in the trampoline page.
- * This is present in the running page tables, and the boot page
- * tables, so we call the code here to start the trampoline
- * dance in reverse.
- */
- offset = (unsigned long)__kvm_hyp_reset
- - ((unsigned long)__hyp_idmap_text_start & PAGE_MASK);
-
- return TRAMPOLINE_VA + offset;
- } else {
- /*
- * KVM is running with merged page tables, which don't have the
- * trampoline page mapped. We know the idmap is still mapped,
- * but can't be called into directly. Use
- * __extended_idmap_trampoline to do the call.
- */
- return (unsigned long)kvm_ksym_ref(__extended_idmap_trampoline);
- }
-}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index a57d650f552c..b0b225ceca18 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1546,7 +1546,7 @@ static void unhandled_cp_access(struct kvm_vcpu *vcpu,
struct sys_reg_params *params)
{
u8 hsr_ec = kvm_vcpu_trap_get_class(vcpu);
- int cp;
+ int cp = -1;
switch(hsr_ec) {
case ESR_ELx_EC_CP15_32:
@@ -1558,7 +1558,7 @@ static void unhandled_cp_access(struct kvm_vcpu *vcpu,
cp = 14;
break;
default:
- WARN_ON((cp = -1));
+ WARN_ON(1);
}
kvm_err("Unsupported guest CP%d access at: %08lx\n",
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index ac91939b9b75..29867139851e 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -1488,6 +1488,7 @@ config CPU_MIPS64_R2
select CPU_SUPPORTS_HIGHMEM
select CPU_SUPPORTS_HUGEPAGES
select CPU_SUPPORTS_MSA
+ select HAVE_KVM
help
Choose this option to build a kernel for release 2 or later of the
MIPS64 architecture. Many modern embedded systems with a 64-bit
@@ -1505,6 +1506,7 @@ config CPU_MIPS64_R6
select CPU_SUPPORTS_MSA
select GENERIC_CSUM
select MIPS_O32_FP64_SUPPORT if MIPS32_O32
+ select HAVE_KVM
help
Choose this option to build a kernel for release 6 or later of the
MIPS64 architecture. New MIPS processors, starting with the Warrior
diff --git a/arch/mips/include/asm/addrspace.h b/arch/mips/include/asm/addrspace.h
index 3b0e51d5a613..c5b04e752e97 100644
--- a/arch/mips/include/asm/addrspace.h
+++ b/arch/mips/include/asm/addrspace.h
@@ -45,7 +45,7 @@
/*
* Returns the kernel segment base of a given address
*/
-#define KSEGX(a) ((_ACAST32_ (a)) & 0xe0000000)
+#define KSEGX(a) ((_ACAST32_(a)) & _ACAST32_(0xe0000000))
/*
* Returns the physical address of a CKSEGx / XKPHYS address
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 36a391d289aa..b54bcadd8aec 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -19,6 +19,9 @@
#include <linux/threads.h>
#include <linux/spinlock.h>
+#include <asm/inst.h>
+#include <asm/mipsregs.h>
+
/* MIPS KVM register ids */
#define MIPS_CP0_32(_R, _S) \
(KVM_REG_MIPS_CP0 | KVM_REG_SIZE_U32 | (8 * (_R) + (_S)))
@@ -53,6 +56,12 @@
#define KVM_REG_MIPS_CP0_CONFIG7 MIPS_CP0_32(16, 7)
#define KVM_REG_MIPS_CP0_XCONTEXT MIPS_CP0_64(20, 0)
#define KVM_REG_MIPS_CP0_ERROREPC MIPS_CP0_64(30, 0)
+#define KVM_REG_MIPS_CP0_KSCRATCH1 MIPS_CP0_64(31, 2)
+#define KVM_REG_MIPS_CP0_KSCRATCH2 MIPS_CP0_64(31, 3)
+#define KVM_REG_MIPS_CP0_KSCRATCH3 MIPS_CP0_64(31, 4)
+#define KVM_REG_MIPS_CP0_KSCRATCH4 MIPS_CP0_64(31, 5)
+#define KVM_REG_MIPS_CP0_KSCRATCH5 MIPS_CP0_64(31, 6)
+#define KVM_REG_MIPS_CP0_KSCRATCH6 MIPS_CP0_64(31, 7)
#define KVM_MAX_VCPUS 1
@@ -65,8 +74,14 @@
-/* Special address that contains the comm page, used for reducing # of traps */
-#define KVM_GUEST_COMMPAGE_ADDR 0x0
+/*
+ * Special address that contains the comm page, used for reducing # of traps
+ * This needs to be within 32Kb of 0x0 (so the zero register can be used), but
+ * preferably not at 0x0 so that most kernel NULL pointer dereferences can be
+ * caught.
+ */
+#define KVM_GUEST_COMMPAGE_ADDR ((PAGE_SIZE > 0x8000) ? 0 : \
+ (0x8000 - PAGE_SIZE))
#define KVM_GUEST_KERNEL_MODE(vcpu) ((kvm_read_c0_guest_status(vcpu->arch.cop0) & (ST0_EXL | ST0_ERL)) || \
((kvm_read_c0_guest_status(vcpu->arch.cop0) & KSU_USER) == 0))
@@ -93,9 +108,6 @@
#define KVM_INVALID_ADDR 0xdeadbeef
extern atomic_t kvm_mips_instance;
-extern kvm_pfn_t (*kvm_mips_gfn_to_pfn)(struct kvm *kvm, gfn_t gfn);
-extern void (*kvm_mips_release_pfn_clean)(kvm_pfn_t pfn);
-extern bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn);
struct kvm_vm_stat {
u32 remote_tlb_flush;
@@ -126,28 +138,6 @@ struct kvm_vcpu_stat {
u32 halt_wakeup;
};
-enum kvm_mips_exit_types {
- WAIT_EXITS,
- CACHE_EXITS,
- SIGNAL_EXITS,
- INT_EXITS,
- COP_UNUSABLE_EXITS,
- TLBMOD_EXITS,
- TLBMISS_LD_EXITS,
- TLBMISS_ST_EXITS,
- ADDRERR_ST_EXITS,
- ADDRERR_LD_EXITS,
- SYSCALL_EXITS,
- RESVD_INST_EXITS,
- BREAK_INST_EXITS,
- TRAP_INST_EXITS,
- MSA_FPE_EXITS,
- FPE_EXITS,
- MSA_DISABLED_EXITS,
- FLUSH_DCACHE_EXITS,
- MAX_KVM_MIPS_EXIT_TYPES
-};
-
struct kvm_arch_memory_slot {
};
@@ -215,73 +205,6 @@ struct mips_coproc {
#define MIPS_CP0_CONFIG4_SEL 4
#define MIPS_CP0_CONFIG5_SEL 5
-/* Config0 register bits */
-#define CP0C0_M 31
-#define CP0C0_K23 28
-#define CP0C0_KU 25
-#define CP0C0_MDU 20
-#define CP0C0_MM 17
-#define CP0C0_BM 16
-#define CP0C0_BE 15
-#define CP0C0_AT 13
-#define CP0C0_AR 10
-#define CP0C0_MT 7
-#define CP0C0_VI 3
-#define CP0C0_K0 0
-
-/* Config1 register bits */
-#define CP0C1_M 31
-#define CP0C1_MMU 25
-#define CP0C1_IS 22
-#define CP0C1_IL 19
-#define CP0C1_IA 16
-#define CP0C1_DS 13
-#define CP0C1_DL 10
-#define CP0C1_DA 7
-#define CP0C1_C2 6
-#define CP0C1_MD 5
-#define CP0C1_PC 4
-#define CP0C1_WR 3
-#define CP0C1_CA 2
-#define CP0C1_EP 1
-#define CP0C1_FP 0
-
-/* Config2 Register bits */
-#define CP0C2_M 31
-#define CP0C2_TU 28
-#define CP0C2_TS 24
-#define CP0C2_TL 20
-#define CP0C2_TA 16
-#define CP0C2_SU 12
-#define CP0C2_SS 8
-#define CP0C2_SL 4
-#define CP0C2_SA 0
-
-/* Config3 Register bits */
-#define CP0C3_M 31
-#define CP0C3_ISA_ON_EXC 16
-#define CP0C3_ULRI 13
-#define CP0C3_DSPP 10
-#define CP0C3_LPA 7
-#define CP0C3_VEIC 6
-#define CP0C3_VInt 5
-#define CP0C3_SP 4
-#define CP0C3_MT 2
-#define CP0C3_SM 1
-#define CP0C3_TL 0
-
-/* MMU types, the first four entries have the same layout as the
- CP0C0_MT field. */
-enum mips_mmu_types {
- MMU_TYPE_NONE,
- MMU_TYPE_R4000,
- MMU_TYPE_RESERVED,
- MMU_TYPE_FMT,
- MMU_TYPE_R3000,
- MMU_TYPE_R6000,
- MMU_TYPE_R8000
-};
-
/* Resume Flags */
#define RESUME_FLAG_DR (1<<0) /* Reload guest nonvolatile state? */
#define RESUME_FLAG_HOST (1<<1) /* Resume host? */
@@ -298,11 +221,6 @@ enum emulation_result {
EMULATE_PRIV_FAIL,
};
-#define MIPS3_PG_G 0x00000001 /* Global; ignore ASID if in lo0 & lo1 */
-#define MIPS3_PG_V 0x00000002 /* Valid */
-#define MIPS3_PG_NV 0x00000000
-#define MIPS3_PG_D 0x00000004 /* Dirty */
-
#define mips3_paddr_to_tlbpfn(x) \
(((unsigned long)(x) >> MIPS3_PG_SHIFT) & MIPS3_PG_FRAME)
#define mips3_tlbpfn_to_paddr(x) \
@@ -313,13 +231,11 @@ enum emulation_result {
#define VPN2_MASK 0xffffe000
#define KVM_ENTRYHI_ASID MIPS_ENTRYHI_ASID
-#define TLB_IS_GLOBAL(x) (((x).tlb_lo0 & MIPS3_PG_G) && \
- ((x).tlb_lo1 & MIPS3_PG_G))
+#define TLB_IS_GLOBAL(x) ((x).tlb_lo[0] & (x).tlb_lo[1] & ENTRYLO_G)
#define TLB_VPN2(x) ((x).tlb_hi & VPN2_MASK)
#define TLB_ASID(x) ((x).tlb_hi & KVM_ENTRYHI_ASID)
-#define TLB_IS_VALID(x, va) (((va) & (1 << PAGE_SHIFT)) \
- ? ((x).tlb_lo1 & MIPS3_PG_V) \
- : ((x).tlb_lo0 & MIPS3_PG_V))
+#define TLB_LO_IDX(x, va) (((va) >> PAGE_SHIFT) & 1)
+#define TLB_IS_VALID(x, va) ((x).tlb_lo[TLB_LO_IDX(x, va)] & ENTRYLO_V)
#define TLB_HI_VPN2_HIT(x, y) ((TLB_VPN2(x) & ~(x).tlb_mask) == \
((y) & VPN2_MASK & ~(x).tlb_mask))
#define TLB_HI_ASID_HIT(x, y) (TLB_IS_GLOBAL(x) || \
@@ -328,26 +244,23 @@ enum emulation_result {
struct kvm_mips_tlb {
long tlb_mask;
long tlb_hi;
- long tlb_lo0;
- long tlb_lo1;
+ long tlb_lo[2];
};
-#define KVM_MIPS_FPU_FPU 0x1
-#define KVM_MIPS_FPU_MSA 0x2
+#define KVM_MIPS_AUX_FPU 0x1
+#define KVM_MIPS_AUX_MSA 0x2
#define KVM_MIPS_GUEST_TLB_SIZE 64
struct kvm_vcpu_arch {
- void *host_ebase, *guest_ebase;
+ void *guest_ebase;
int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu);
unsigned long host_stack;
unsigned long host_gp;
/* Host CP0 registers used when handling exits from guest */
unsigned long host_cp0_badvaddr;
- unsigned long host_cp0_cause;
unsigned long host_cp0_epc;
- unsigned long host_cp0_entryhi;
- uint32_t guest_inst;
+ u32 host_cp0_cause;
/* GPRS */
unsigned long gprs[32];
@@ -357,8 +270,8 @@ struct kvm_vcpu_arch {
/* FPU State */
struct mips_fpu_struct fpu;
- /* Which FPU state is loaded (KVM_MIPS_FPU_*) */
- unsigned int fpu_inuse;
+ /* Which auxiliary state is loaded (KVM_MIPS_AUX_*) */
+ unsigned int aux_inuse;
/* COP0 State */
struct mips_coproc *cop0;
@@ -370,11 +283,11 @@ struct kvm_vcpu_arch {
struct hrtimer comparecount_timer;
/* Count timer control KVM register */
- uint32_t count_ctl;
+ u32 count_ctl;
/* Count bias from the raw time */
- uint32_t count_bias;
+ u32 count_bias;
/* Frequency of timer in Hz */
- uint32_t count_hz;
+ u32 count_hz;
/* Dynamic nanosecond bias (multiple of count_period) to avoid overflow */
s64 count_dyn_bias;
/* Resume time */
@@ -388,7 +301,7 @@ struct kvm_vcpu_arch {
/* Bitmask of pending exceptions to be cleared */
unsigned long pending_exceptions_clr;
- unsigned long pending_load_cause;
+ u32 pending_load_cause;
/* Save/Restore the entryhi register when are are preempted/scheduled back in */
unsigned long preempt_entryhi;
@@ -397,8 +310,8 @@ struct kvm_vcpu_arch {
struct kvm_mips_tlb guest_tlb[KVM_MIPS_GUEST_TLB_SIZE];
/* Cached guest kernel/user ASIDs */
- uint32_t guest_user_asid[NR_CPUS];
- uint32_t guest_kernel_asid[NR_CPUS];
+ u32 guest_user_asid[NR_CPUS];
+ u32 guest_kernel_asid[NR_CPUS];
struct mm_struct guest_kernel_mm, guest_user_mm;
int last_sched_cpu;
@@ -408,6 +321,7 @@ struct kvm_vcpu_arch {
u8 fpu_enabled;
u8 msa_enabled;
+ u8 kscratch_enabled;
};
@@ -461,6 +375,18 @@ struct kvm_vcpu_arch {
#define kvm_write_c0_guest_config7(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][7] = (val))
#define kvm_read_c0_guest_errorepc(cop0) (cop0->reg[MIPS_CP0_ERROR_PC][0])
#define kvm_write_c0_guest_errorepc(cop0, val) (cop0->reg[MIPS_CP0_ERROR_PC][0] = (val))
+#define kvm_read_c0_guest_kscratch1(cop0) (cop0->reg[MIPS_CP0_DESAVE][2])
+#define kvm_read_c0_guest_kscratch2(cop0) (cop0->reg[MIPS_CP0_DESAVE][3])
+#define kvm_read_c0_guest_kscratch3(cop0) (cop0->reg[MIPS_CP0_DESAVE][4])
+#define kvm_read_c0_guest_kscratch4(cop0) (cop0->reg[MIPS_CP0_DESAVE][5])
+#define kvm_read_c0_guest_kscratch5(cop0) (cop0->reg[MIPS_CP0_DESAVE][6])
+#define kvm_read_c0_guest_kscratch6(cop0) (cop0->reg[MIPS_CP0_DESAVE][7])
+#define kvm_write_c0_guest_kscratch1(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][2] = (val))
+#define kvm_write_c0_guest_kscratch2(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][3] = (val))
+#define kvm_write_c0_guest_kscratch3(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][4] = (val))
+#define kvm_write_c0_guest_kscratch4(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][5] = (val))
+#define kvm_write_c0_guest_kscratch5(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][6] = (val))
+#define kvm_write_c0_guest_kscratch6(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][7] = (val))
/*
* Some of the guest registers may be modified asynchronously (e.g. from a
@@ -474,7 +400,7 @@ static inline void _kvm_atomic_set_c0_guest_reg(unsigned long *reg,
unsigned long temp;
do {
__asm__ __volatile__(
- " .set mips3 \n"
+ " .set "MIPS_ISA_ARCH_LEVEL" \n"
" " __LL "%0, %1 \n"
" or %0, %2 \n"
" " __SC "%0, %1 \n"
@@ -490,7 +416,7 @@ static inline void _kvm_atomic_clear_c0_guest_reg(unsigned long *reg,
unsigned long temp;
do {
__asm__ __volatile__(
- " .set mips3 \n"
+ " .set "MIPS_ISA_ARCH_LEVEL" \n"
" " __LL "%0, %1 \n"
" and %0, %2 \n"
" " __SC "%0, %1 \n"
@@ -507,7 +433,7 @@ static inline void _kvm_atomic_change_c0_guest_reg(unsigned long *reg,
unsigned long temp;
do {
__asm__ __volatile__(
- " .set mips3 \n"
+ " .set "MIPS_ISA_ARCH_LEVEL" \n"
" " __LL "%0, %1 \n"
" and %0, %2 \n"
" or %0, %3 \n"
@@ -542,7 +468,7 @@ static inline void _kvm_atomic_change_c0_guest_reg(unsigned long *reg,
static inline bool kvm_mips_guest_can_have_fpu(struct kvm_vcpu_arch *vcpu)
{
- return (!__builtin_constant_p(cpu_has_fpu) || cpu_has_fpu) &&
+ return (!__builtin_constant_p(raw_cpu_has_fpu) || raw_cpu_has_fpu) &&
vcpu->fpu_enabled;
}
@@ -589,9 +515,11 @@ struct kvm_mips_callbacks {
void (*dequeue_io_int)(struct kvm_vcpu *vcpu,
struct kvm_mips_interrupt *irq);
int (*irq_deliver)(struct kvm_vcpu *vcpu, unsigned int priority,
- uint32_t cause);
+ u32 cause);
int (*irq_clear)(struct kvm_vcpu *vcpu, unsigned int priority,
- uint32_t cause);
+ u32 cause);
+ unsigned long (*num_regs)(struct kvm_vcpu *vcpu);
+ int (*copy_reg_indices)(struct kvm_vcpu *vcpu, u64 __user *indices);
int (*get_one_reg)(struct kvm_vcpu *vcpu,
const struct kvm_one_reg *reg, s64 *v);
int (*set_one_reg)(struct kvm_vcpu *vcpu,
@@ -605,8 +533,13 @@ int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks);
/* Debug: dump vcpu state */
int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu);
-/* Trampoline ASM routine to start running in "Guest" context */
-extern int __kvm_mips_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu);
+extern int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu);
+
+/* Building of entry/exception code */
+int kvm_mips_entry_setup(void);
+void *kvm_mips_build_vcpu_run(void *addr);
+void *kvm_mips_build_exception(void *addr, void *handler);
+void *kvm_mips_build_exit(void *addr);
/* FPU/MSA context management */
void __kvm_save_fpu(struct kvm_vcpu_arch *vcpu);
@@ -622,11 +555,11 @@ void kvm_drop_fpu(struct kvm_vcpu *vcpu);
void kvm_lose_fpu(struct kvm_vcpu *vcpu);
/* TLB handling */
-uint32_t kvm_get_kernel_asid(struct kvm_vcpu *vcpu);
+u32 kvm_get_kernel_asid(struct kvm_vcpu *vcpu);
-uint32_t kvm_get_user_asid(struct kvm_vcpu *vcpu);
+u32 kvm_get_user_asid(struct kvm_vcpu *vcpu);
-uint32_t kvm_get_commpage_asid (struct kvm_vcpu *vcpu);
+u32 kvm_get_commpage_asid (struct kvm_vcpu *vcpu);
extern int kvm_mips_handle_kseg0_tlb_fault(unsigned long badbaddr,
struct kvm_vcpu *vcpu);
@@ -635,22 +568,24 @@ extern int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
struct kvm_vcpu *vcpu);
extern int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
- struct kvm_mips_tlb *tlb,
- unsigned long *hpa0,
- unsigned long *hpa1);
+ struct kvm_mips_tlb *tlb);
-extern enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_handle_tlbmiss(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_handle_tlbmod(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
extern void kvm_mips_dump_host_tlbs(void);
extern void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu);
+extern int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi,
+ unsigned long entrylo0,
+ unsigned long entrylo1,
+ int flush_dcache_mask);
extern void kvm_mips_flush_host_tlb(int skip_kseg0);
extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi);
@@ -667,90 +602,90 @@ extern void kvm_mips_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
extern void kvm_mips_vcpu_put(struct kvm_vcpu *vcpu);
/* Emulation */
-uint32_t kvm_get_inst(uint32_t *opc, struct kvm_vcpu *vcpu);
-enum emulation_result update_pc(struct kvm_vcpu *vcpu, uint32_t cause);
+u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu);
+enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause);
-extern enum emulation_result kvm_mips_emulate_inst(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_inst(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_emulate_syscall(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_syscall(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_emulate_tlbmiss_ld(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_emulate_tlbinv_ld(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_emulate_tlbmiss_st(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_emulate_tlbinv_st(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_emulate_tlbmod(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_tlbmod(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_emulate_fpu_exc(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_handle_ri(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_handle_ri(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_emulate_ri_exc(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_ri_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_bp_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_trap_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
extern enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu,
struct kvm_run *run);
-uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu);
-void kvm_mips_write_count(struct kvm_vcpu *vcpu, uint32_t count);
-void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare, bool ack);
+u32 kvm_mips_read_count(struct kvm_vcpu *vcpu);
+void kvm_mips_write_count(struct kvm_vcpu *vcpu, u32 count);
+void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack);
void kvm_mips_init_count(struct kvm_vcpu *vcpu);
int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl);
int kvm_mips_set_count_resume(struct kvm_vcpu *vcpu, s64 count_resume);
@@ -759,27 +694,27 @@ void kvm_mips_count_enable_cause(struct kvm_vcpu *vcpu);
void kvm_mips_count_disable_cause(struct kvm_vcpu *vcpu);
enum hrtimer_restart kvm_mips_count_timeout(struct kvm_vcpu *vcpu);
-enum emulation_result kvm_mips_check_privilege(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_check_privilege(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-enum emulation_result kvm_mips_emulate_cache(uint32_t inst,
- uint32_t *opc,
- uint32_t cause,
+enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst,
+ u32 *opc,
+ u32 cause,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-enum emulation_result kvm_mips_emulate_CP0(uint32_t inst,
- uint32_t *opc,
- uint32_t cause,
+enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
+ u32 *opc,
+ u32 cause,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-enum emulation_result kvm_mips_emulate_store(uint32_t inst,
- uint32_t cause,
+enum emulation_result kvm_mips_emulate_store(union mips_instruction inst,
+ u32 cause,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-enum emulation_result kvm_mips_emulate_load(uint32_t inst,
- uint32_t cause,
+enum emulation_result kvm_mips_emulate_load(union mips_instruction inst,
+ u32 cause,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
@@ -789,13 +724,13 @@ unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu);
unsigned int kvm_mips_config5_wrmask(struct kvm_vcpu *vcpu);
/* Dynamic binary translation */
-extern int kvm_mips_trans_cache_index(uint32_t inst, uint32_t *opc,
- struct kvm_vcpu *vcpu);
-extern int kvm_mips_trans_cache_va(uint32_t inst, uint32_t *opc,
+extern int kvm_mips_trans_cache_index(union mips_instruction inst,
+ u32 *opc, struct kvm_vcpu *vcpu);
+extern int kvm_mips_trans_cache_va(union mips_instruction inst, u32 *opc,
struct kvm_vcpu *vcpu);
-extern int kvm_mips_trans_mfc0(uint32_t inst, uint32_t *opc,
+extern int kvm_mips_trans_mfc0(union mips_instruction inst, u32 *opc,
struct kvm_vcpu *vcpu);
-extern int kvm_mips_trans_mtc0(uint32_t inst, uint32_t *opc,
+extern int kvm_mips_trans_mtc0(union mips_instruction inst, u32 *opc,
struct kvm_vcpu *vcpu);
/* Misc */
diff --git a/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h b/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h
index d68e685cde60..bd8b9bbe1771 100644
--- a/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h
+++ b/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h
@@ -55,7 +55,7 @@
#define cpu_has_mipsmt 0
#define cpu_has_vint 0
#define cpu_has_veic 0
-#define cpu_hwrena_impl_bits 0xc0000000
+#define cpu_hwrena_impl_bits (MIPS_HWRENA_IMPL1 | MIPS_HWRENA_IMPL2)
#define cpu_has_wsbh 1
#define cpu_has_rixi (cpu_data[0].cputype != CPU_CAVIUM_OCTEON)
diff --git a/arch/mips/include/asm/mipsregs.h b/arch/mips/include/asm/mipsregs.h
index e1ca65c62f6a..def9d8d13f6e 100644
--- a/arch/mips/include/asm/mipsregs.h
+++ b/arch/mips/include/asm/mipsregs.h
@@ -53,7 +53,7 @@
#define CP0_SEGCTL2 $5, 4
#define CP0_WIRED $6
#define CP0_INFO $7
-#define CP0_HWRENA $7, 0
+#define CP0_HWRENA $7
#define CP0_BADVADDR $8
#define CP0_BADINSTR $8, 1
#define CP0_COUNT $9
@@ -533,6 +533,7 @@
#define TX49_CONF_CWFON (_ULCAST_(1) << 27)
/* Bits specific to the MIPS32/64 PRA. */
+#define MIPS_CONF_VI (_ULCAST_(1) << 3)
#define MIPS_CONF_MT (_ULCAST_(7) << 7)
#define MIPS_CONF_MT_TLB (_ULCAST_(1) << 7)
#define MIPS_CONF_MT_FTLB (_ULCAST_(4) << 7)
@@ -853,6 +854,24 @@
#define MIPS_CDMMBASE_ADDR_SHIFT 11
#define MIPS_CDMMBASE_ADDR_START 15
+/* RDHWR register numbers */
+#define MIPS_HWR_CPUNUM 0 /* CPU number */
+#define MIPS_HWR_SYNCISTEP 1 /* SYNCI step size */
+#define MIPS_HWR_CC 2 /* Cycle counter */
+#define MIPS_HWR_CCRES 3 /* Cycle counter resolution */
+#define MIPS_HWR_ULR 29 /* UserLocal */
+#define MIPS_HWR_IMPL1 30 /* Implementation dependent */
+#define MIPS_HWR_IMPL2 31 /* Implementation dependent */
+
+/* Bits in HWREna register */
+#define MIPS_HWRENA_CPUNUM (_ULCAST_(1) << MIPS_HWR_CPUNUM)
+#define MIPS_HWRENA_SYNCISTEP (_ULCAST_(1) << MIPS_HWR_SYNCISTEP)
+#define MIPS_HWRENA_CC (_ULCAST_(1) << MIPS_HWR_CC)
+#define MIPS_HWRENA_CCRES (_ULCAST_(1) << MIPS_HWR_CCRES)
+#define MIPS_HWRENA_ULR (_ULCAST_(1) << MIPS_HWR_ULR)
+#define MIPS_HWRENA_IMPL1 (_ULCAST_(1) << MIPS_HWR_IMPL1)
+#define MIPS_HWRENA_IMPL2 (_ULCAST_(1) << MIPS_HWR_IMPL2)
+
/*
* Bitfields in the TX39 family CP0 Configuration Register 3
*/
diff --git a/arch/mips/include/asm/setup.h b/arch/mips/include/asm/setup.h
index d7bfdeba9e84..4f5279a8308d 100644
--- a/arch/mips/include/asm/setup.h
+++ b/arch/mips/include/asm/setup.h
@@ -21,6 +21,7 @@ extern void *set_vi_handler(int n, vi_handler_t addr);
extern void *set_except_vector(int n, void *addr);
extern unsigned long ebase;
+extern unsigned int hwrena;
extern void per_cpu_trap_init(bool);
extern void cpu_cache_init(void);
diff --git a/arch/mips/include/asm/uasm.h b/arch/mips/include/asm/uasm.h
index b6ecfeee4dbe..f7929f65f7ca 100644
--- a/arch/mips/include/asm/uasm.h
+++ b/arch/mips/include/asm/uasm.h
@@ -104,8 +104,13 @@ Ip_u1s2(_bltz);
Ip_u1s2(_bltzl);
Ip_u1u2s3(_bne);
Ip_u2s3u1(_cache);
+Ip_u1u2(_cfc1);
+Ip_u2u1(_cfcmsa);
+Ip_u1u2(_ctc1);
+Ip_u2u1(_ctcmsa);
Ip_u2u1s3(_daddiu);
Ip_u3u1u2(_daddu);
+Ip_u1(_di);
Ip_u2u1msbu3(_dins);
Ip_u2u1msbu3(_dinsm);
Ip_u1u2(_divu);
@@ -141,6 +146,8 @@ Ip_u1(_mfhi);
Ip_u1(_mflo);
Ip_u1u2u3(_mtc0);
Ip_u1u2u3(_mthc0);
+Ip_u1(_mthi);
+Ip_u1(_mtlo);
Ip_u3u1u2(_mul);
Ip_u3u1u2(_or);
Ip_u2u1u3(_ori);
diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h
index 8051f9aa1379..77429d1622b3 100644
--- a/arch/mips/include/uapi/asm/inst.h
+++ b/arch/mips/include/uapi/asm/inst.h
@@ -21,20 +21,20 @@
enum major_op {
spec_op, bcond_op, j_op, jal_op,
beq_op, bne_op, blez_op, bgtz_op,
- addi_op, cbcond0_op = addi_op, addiu_op, slti_op, sltiu_op,
+ addi_op, pop10_op = addi_op, addiu_op, slti_op, sltiu_op,
andi_op, ori_op, xori_op, lui_op,
cop0_op, cop1_op, cop2_op, cop1x_op,
beql_op, bnel_op, blezl_op, bgtzl_op,
- daddi_op, cbcond1_op = daddi_op, daddiu_op, ldl_op, ldr_op,
+ daddi_op, pop30_op = daddi_op, daddiu_op, ldl_op, ldr_op,
spec2_op, jalx_op, mdmx_op, msa_op = mdmx_op, spec3_op,
lb_op, lh_op, lwl_op, lw_op,
lbu_op, lhu_op, lwr_op, lwu_op,
sb_op, sh_op, swl_op, sw_op,
sdl_op, sdr_op, swr_op, cache_op,
ll_op, lwc1_op, lwc2_op, bc6_op = lwc2_op, pref_op,
- lld_op, ldc1_op, ldc2_op, beqzcjic_op = ldc2_op, ld_op,
+ lld_op, ldc1_op, ldc2_op, pop66_op = ldc2_op, ld_op,
sc_op, swc1_op, swc2_op, balc6_op = swc2_op, major_3b_op,
- scd_op, sdc1_op, sdc2_op, bnezcjialc_op = sdc2_op, sd_op
+ scd_op, sdc1_op, sdc2_op, pop76_op = sdc2_op, sd_op
};
/*
@@ -93,6 +93,50 @@ enum spec3_op {
};
/*
+ * Bits 10-6 minor opcode for r6 spec mult/div encodings
+ */
+enum mult_op {
+ mult_mult_op = 0x0,
+ mult_mul_op = 0x2,
+ mult_muh_op = 0x3,
+};
+enum multu_op {
+ multu_multu_op = 0x0,
+ multu_mulu_op = 0x2,
+ multu_muhu_op = 0x3,
+};
+enum div_op {
+ div_div_op = 0x0,
+ div_div6_op = 0x2,
+ div_mod_op = 0x3,
+};
+enum divu_op {
+ divu_divu_op = 0x0,
+ divu_divu6_op = 0x2,
+ divu_modu_op = 0x3,
+};
+enum dmult_op {
+ dmult_dmult_op = 0x0,
+ dmult_dmul_op = 0x2,
+ dmult_dmuh_op = 0x3,
+};
+enum dmultu_op {
+ dmultu_dmultu_op = 0x0,
+ dmultu_dmulu_op = 0x2,
+ dmultu_dmuhu_op = 0x3,
+};
+enum ddiv_op {
+ ddiv_ddiv_op = 0x0,
+ ddiv_ddiv6_op = 0x2,
+ ddiv_dmod_op = 0x3,
+};
+enum ddivu_op {
+ ddivu_ddivu_op = 0x0,
+ ddivu_ddivu6_op = 0x2,
+ ddivu_dmodu_op = 0x3,
+};
+
+/*
* rt field of bcond opcodes.
*/
enum rt_op {
@@ -103,7 +147,7 @@ enum rt_op {
bltzal_op, bgezal_op, bltzall_op, bgezall_op,
rt_op_0x14, rt_op_0x15, rt_op_0x16, rt_op_0x17,
rt_op_0x18, rt_op_0x19, rt_op_0x1a, rt_op_0x1b,
- bposge32_op, rt_op_0x1d, rt_op_0x1e, rt_op_0x1f
+ bposge32_op, rt_op_0x1d, rt_op_0x1e, synci_op
};
/*
@@ -238,6 +282,21 @@ enum bshfl_func {
};
/*
+ * MSA minor opcodes.
+ */
+enum msa_func {
+ msa_elm_op = 0x19,
+};
+
+/*
+ * MSA ELM opcodes.
+ */
+enum msa_elm {
+ msa_ctc_op = 0x3e,
+ msa_cfc_op = 0x7e,
+};
+
+/*
* func field for MSA MI10 format.
*/
enum msa_mi10_func {
@@ -264,7 +323,7 @@ enum mm_major_op {
mm_pool32b_op, mm_pool16b_op, mm_lhu16_op, mm_andi16_op,
mm_addiu32_op, mm_lhu32_op, mm_sh32_op, mm_lh32_op,
mm_pool32i_op, mm_pool16c_op, mm_lwsp16_op, mm_pool16d_op,
- mm_ori32_op, mm_pool32f_op, mm_reserved1_op, mm_reserved2_op,
+ mm_ori32_op, mm_pool32f_op, mm_pool32s_op, mm_reserved2_op,
mm_pool32c_op, mm_lwgp16_op, mm_lw16_op, mm_pool16e_op,
mm_xori32_op, mm_jals32_op, mm_addiupc_op, mm_reserved3_op,
mm_reserved4_op, mm_pool16f_op, mm_sb16_op, mm_beqz16_op,
@@ -360,7 +419,10 @@ enum mm_32axf_minor_op {
mm_mflo32_op = 0x075,
mm_jalrhb_op = 0x07c,
mm_tlbwi_op = 0x08d,
+ mm_mthi32_op = 0x0b5,
mm_tlbwr_op = 0x0cd,
+ mm_mtlo32_op = 0x0f5,
+ mm_di_op = 0x11d,
mm_jalrs_op = 0x13c,
mm_jalrshb_op = 0x17c,
mm_sync_op = 0x1ad,
@@ -479,6 +541,13 @@ enum mm_32f_73_minor_op {
};
/*
+ * (microMIPS) POOL32S minor opcodes.
+ */
+enum mm_32s_minor_op {
+ mm_32s_elm_op = 0x16,
+};
+
+/*
* (microMIPS) POOL16C minor opcodes.
*/
enum mm_16c_minor_op {
@@ -586,6 +655,36 @@ struct r_format { /* Register format */
;))))))
};
+struct c0r_format { /* C0 register format */
+ __BITFIELD_FIELD(unsigned int opcode : 6,
+ __BITFIELD_FIELD(unsigned int rs : 5,
+ __BITFIELD_FIELD(unsigned int rt : 5,
+ __BITFIELD_FIELD(unsigned int rd : 5,
+ __BITFIELD_FIELD(unsigned int z: 8,
+ __BITFIELD_FIELD(unsigned int sel : 3,
+ ;))))))
+};
+
+struct mfmc0_format { /* MFMC0 register format */
+ __BITFIELD_FIELD(unsigned int opcode : 6,
+ __BITFIELD_FIELD(unsigned int rs : 5,
+ __BITFIELD_FIELD(unsigned int rt : 5,
+ __BITFIELD_FIELD(unsigned int rd : 5,
+ __BITFIELD_FIELD(unsigned int re : 5,
+ __BITFIELD_FIELD(unsigned int sc : 1,
+ __BITFIELD_FIELD(unsigned int : 2,
+ __BITFIELD_FIELD(unsigned int sel : 3,
+ ;))))))))
+};
+
+struct co_format { /* C0 CO format */
+ __BITFIELD_FIELD(unsigned int opcode : 6,
+ __BITFIELD_FIELD(unsigned int co : 1,
+ __BITFIELD_FIELD(unsigned int code : 19,
+ __BITFIELD_FIELD(unsigned int func : 6,
+ ;))))
+};
+
struct p_format { /* Performance counter format (R10000) */
__BITFIELD_FIELD(unsigned int opcode : 6,
__BITFIELD_FIELD(unsigned int rs : 5,
@@ -937,6 +1036,9 @@ union mips_instruction {
struct u_format u_format;
struct c_format c_format;
struct r_format r_format;
+ struct c0r_format c0r_format;
+ struct mfmc0_format mfmc0_format;
+ struct co_format co_format;
struct p_format p_format;
struct f_format f_format;
struct ma_format ma_format;
diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c
index 1ea973b2abb1..fae2f9447792 100644
--- a/arch/mips/kernel/asm-offsets.c
+++ b/arch/mips/kernel/asm-offsets.c
@@ -339,71 +339,9 @@ void output_pm_defines(void)
}
#endif
-void output_cpuinfo_defines(void)
-{
- COMMENT(" MIPS cpuinfo offsets. ");
- DEFINE(CPUINFO_SIZE, sizeof(struct cpuinfo_mips));
-#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE
- OFFSET(CPUINFO_ASID_MASK, cpuinfo_mips, asid_mask);
-#endif
-}
-
void output_kvm_defines(void)
{
COMMENT(" KVM/MIPS Specfic offsets. ");
- DEFINE(VCPU_ARCH_SIZE, sizeof(struct kvm_vcpu_arch));
- OFFSET(VCPU_RUN, kvm_vcpu, run);
- OFFSET(VCPU_HOST_ARCH, kvm_vcpu, arch);
-
- OFFSET(VCPU_HOST_EBASE, kvm_vcpu_arch, host_ebase);
- OFFSET(VCPU_GUEST_EBASE, kvm_vcpu_arch, guest_ebase);
-
- OFFSET(VCPU_HOST_STACK, kvm_vcpu_arch, host_stack);
- OFFSET(VCPU_HOST_GP, kvm_vcpu_arch, host_gp);
-
- OFFSET(VCPU_HOST_CP0_BADVADDR, kvm_vcpu_arch, host_cp0_badvaddr);
- OFFSET(VCPU_HOST_CP0_CAUSE, kvm_vcpu_arch, host_cp0_cause);
- OFFSET(VCPU_HOST_EPC, kvm_vcpu_arch, host_cp0_epc);
- OFFSET(VCPU_HOST_ENTRYHI, kvm_vcpu_arch, host_cp0_entryhi);
-
- OFFSET(VCPU_GUEST_INST, kvm_vcpu_arch, guest_inst);
-
- OFFSET(VCPU_R0, kvm_vcpu_arch, gprs[0]);
- OFFSET(VCPU_R1, kvm_vcpu_arch, gprs[1]);
- OFFSET(VCPU_R2, kvm_vcpu_arch, gprs[2]);
- OFFSET(VCPU_R3, kvm_vcpu_arch, gprs[3]);
- OFFSET(VCPU_R4, kvm_vcpu_arch, gprs[4]);
- OFFSET(VCPU_R5, kvm_vcpu_arch, gprs[5]);
- OFFSET(VCPU_R6, kvm_vcpu_arch, gprs[6]);
- OFFSET(VCPU_R7, kvm_vcpu_arch, gprs[7]);
- OFFSET(VCPU_R8, kvm_vcpu_arch, gprs[8]);
- OFFSET(VCPU_R9, kvm_vcpu_arch, gprs[9]);
- OFFSET(VCPU_R10, kvm_vcpu_arch, gprs[10]);
- OFFSET(VCPU_R11, kvm_vcpu_arch, gprs[11]);
- OFFSET(VCPU_R12, kvm_vcpu_arch, gprs[12]);
- OFFSET(VCPU_R13, kvm_vcpu_arch, gprs[13]);
- OFFSET(VCPU_R14, kvm_vcpu_arch, gprs[14]);
- OFFSET(VCPU_R15, kvm_vcpu_arch, gprs[15]);
- OFFSET(VCPU_R16, kvm_vcpu_arch, gprs[16]);
- OFFSET(VCPU_R17, kvm_vcpu_arch, gprs[17]);
- OFFSET(VCPU_R18, kvm_vcpu_arch, gprs[18]);
- OFFSET(VCPU_R19, kvm_vcpu_arch, gprs[19]);
- OFFSET(VCPU_R20, kvm_vcpu_arch, gprs[20]);
- OFFSET(VCPU_R21, kvm_vcpu_arch, gprs[21]);
- OFFSET(VCPU_R22, kvm_vcpu_arch, gprs[22]);
- OFFSET(VCPU_R23, kvm_vcpu_arch, gprs[23]);
- OFFSET(VCPU_R24, kvm_vcpu_arch, gprs[24]);
- OFFSET(VCPU_R25, kvm_vcpu_arch, gprs[25]);
- OFFSET(VCPU_R26, kvm_vcpu_arch, gprs[26]);
- OFFSET(VCPU_R27, kvm_vcpu_arch, gprs[27]);
- OFFSET(VCPU_R28, kvm_vcpu_arch, gprs[28]);
- OFFSET(VCPU_R29, kvm_vcpu_arch, gprs[29]);
- OFFSET(VCPU_R30, kvm_vcpu_arch, gprs[30]);
- OFFSET(VCPU_R31, kvm_vcpu_arch, gprs[31]);
- OFFSET(VCPU_LO, kvm_vcpu_arch, lo);
- OFFSET(VCPU_HI, kvm_vcpu_arch, hi);
- OFFSET(VCPU_PC, kvm_vcpu_arch, pc);
- BLANK();
OFFSET(VCPU_FPR0, kvm_vcpu_arch, fpu.fpr[0]);
OFFSET(VCPU_FPR1, kvm_vcpu_arch, fpu.fpr[1]);
@@ -441,14 +379,6 @@ void output_kvm_defines(void)
OFFSET(VCPU_FCR31, kvm_vcpu_arch, fpu.fcr31);
OFFSET(VCPU_MSA_CSR, kvm_vcpu_arch, fpu.msacsr);
BLANK();
-
- OFFSET(VCPU_COP0, kvm_vcpu_arch, cop0);
- OFFSET(VCPU_GUEST_KERNEL_ASID, kvm_vcpu_arch, guest_kernel_asid);
- OFFSET(VCPU_GUEST_USER_ASID, kvm_vcpu_arch, guest_user_asid);
-
- OFFSET(COP0_TLB_HI, mips_coproc, reg[MIPS_CP0_TLB_HI][0]);
- OFFSET(COP0_STATUS, mips_coproc, reg[MIPS_CP0_STATUS][0]);
- BLANK();
}
#ifdef CONFIG_MIPS_CPS
diff --git a/arch/mips/kernel/branch.c b/arch/mips/kernel/branch.c
index 6dc3f1fdaccc..46c227fc98f5 100644
--- a/arch/mips/kernel/branch.c
+++ b/arch/mips/kernel/branch.c
@@ -790,7 +790,7 @@ int __compute_return_epc_for_insn(struct pt_regs *regs,
epc += 4 + (insn.i_format.simmediate << 2);
regs->cp0_epc = epc;
break;
- case beqzcjic_op:
+ case pop66_op:
if (!cpu_has_mips_r6) {
ret = -SIGILL;
break;
@@ -798,7 +798,7 @@ int __compute_return_epc_for_insn(struct pt_regs *regs,
/* Compact branch: BEQZC || JIC */
regs->cp0_epc += 8;
break;
- case bnezcjialc_op:
+ case pop76_op:
if (!cpu_has_mips_r6) {
ret = -SIGILL;
break;
@@ -809,8 +809,8 @@ int __compute_return_epc_for_insn(struct pt_regs *regs,
regs->cp0_epc += 8;
break;
#endif
- case cbcond0_op:
- case cbcond1_op:
+ case pop10_op:
+ case pop30_op:
/* Only valid for MIPS R6 */
if (!cpu_has_mips_r6) {
ret = -SIGILL;
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index 4a1712b5abdf..6fb4704bd156 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -619,17 +619,17 @@ static int simulate_rdhwr(struct pt_regs *regs, int rd, int rt)
perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS,
1, regs, 0);
switch (rd) {
- case 0: /* CPU number */
+ case MIPS_HWR_CPUNUM: /* CPU number */
regs->regs[rt] = smp_processor_id();
return 0;
- case 1: /* SYNCI length */
+ case MIPS_HWR_SYNCISTEP: /* SYNCI length */
regs->regs[rt] = min(current_cpu_data.dcache.linesz,
current_cpu_data.icache.linesz);
return 0;
- case 2: /* Read count register */
+ case MIPS_HWR_CC: /* Read count register */
regs->regs[rt] = read_c0_count();
return 0;
- case 3: /* Count register resolution */
+ case MIPS_HWR_CCRES: /* Count register resolution */
switch (current_cpu_type()) {
case CPU_20KC:
case CPU_25KF:
@@ -639,7 +639,7 @@ static int simulate_rdhwr(struct pt_regs *regs, int rd, int rt)
regs->regs[rt] = 2;
}
return 0;
- case 29:
+ case MIPS_HWR_ULR: /* Read UserLocal register */
regs->regs[rt] = ti->tp_value;
return 0;
default:
@@ -1859,6 +1859,7 @@ void __noreturn nmi_exception_handler(struct pt_regs *regs)
#define VECTORSPACING 0x100 /* for EI/VI mode */
unsigned long ebase;
+EXPORT_SYMBOL_GPL(ebase);
unsigned long exception_handlers[32];
unsigned long vi_handlers[64];
@@ -2063,16 +2064,22 @@ static void configure_status(void)
status_set);
}
+unsigned int hwrena;
+EXPORT_SYMBOL_GPL(hwrena);
+
/* configure HWRENA register */
static void configure_hwrena(void)
{
- unsigned int hwrena = cpu_hwrena_impl_bits;
+ hwrena = cpu_hwrena_impl_bits;
if (cpu_has_mips_r2_r6)
- hwrena |= 0x0000000f;
+ hwrena |= MIPS_HWRENA_CPUNUM |
+ MIPS_HWRENA_SYNCISTEP |
+ MIPS_HWRENA_CC |
+ MIPS_HWRENA_CCRES;
if (!noulri && cpu_has_userlocal)
- hwrena |= (1 << 29);
+ hwrena |= MIPS_HWRENA_ULR;
if (hwrena)
write_c0_hwrena(hwrena);
diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig
index 2ae12825529f..7c56d6b124d1 100644
--- a/arch/mips/kvm/Kconfig
+++ b/arch/mips/kvm/Kconfig
@@ -17,6 +17,7 @@ if VIRTUALIZATION
config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
depends on HAVE_KVM
+ select EXPORT_UASM
select PREEMPT_NOTIFIERS
select ANON_INODES
select KVM_MMIO
diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile
index 637ebbebd549..847429de780d 100644
--- a/arch/mips/kvm/Makefile
+++ b/arch/mips/kvm/Makefile
@@ -7,9 +7,10 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/mips/kvm
common-objs-$(CONFIG_CPU_HAS_MSA) += msa.o
-kvm-objs := $(common-objs-y) mips.o emulate.o locore.o \
+kvm-objs := $(common-objs-y) mips.o emulate.o entry.o \
interrupt.o stats.o commpage.o \
dyntrans.o trap_emul.o fpu.o
+kvm-objs += mmu.o
obj-$(CONFIG_KVM) += kvm.o
obj-y += callback.o tlb.o
diff --git a/arch/mips/kvm/commpage.c b/arch/mips/kvm/commpage.c
index 2d6e976d1add..a36b77e1705c 100644
--- a/arch/mips/kvm/commpage.c
+++ b/arch/mips/kvm/commpage.c
@@ -4,7 +4,7 @@
* for more details.
*
* commpage, currently used for Virtual COP0 registers.
- * Mapped into the guest kernel @ 0x0.
+ * Mapped into the guest kernel @ KVM_GUEST_COMMPAGE_ADDR.
*
* Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved.
* Authors: Sanjay Lal <sanjayl@kymasys.com>
diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c
index f1527a465c1b..d280894915ed 100644
--- a/arch/mips/kvm/dyntrans.c
+++ b/arch/mips/kvm/dyntrans.c
@@ -11,6 +11,7 @@
#include <linux/errno.h>
#include <linux/err.h>
+#include <linux/highmem.h>
#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
@@ -20,125 +21,114 @@
#include "commpage.h"
-#define SYNCI_TEMPLATE 0x041f0000
-#define SYNCI_BASE(x) (((x) >> 21) & 0x1f)
-#define SYNCI_OFFSET ((x) & 0xffff)
+/**
+ * kvm_mips_trans_replace() - Replace trapping instruction in guest memory.
+ * @vcpu: Virtual CPU.
+ * @opc: PC of instruction to replace.
+ * @replace: Instruction to write
+ */
+static int kvm_mips_trans_replace(struct kvm_vcpu *vcpu, u32 *opc,
+ union mips_instruction replace)
+{
+ unsigned long paddr, flags;
+ void *vaddr;
+
+ if (KVM_GUEST_KSEGX((unsigned long)opc) == KVM_GUEST_KSEG0) {
+ paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu,
+ (unsigned long)opc);
+ vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr)));
+ vaddr += paddr & ~PAGE_MASK;
+ memcpy(vaddr, (void *)&replace, sizeof(u32));
+ local_flush_icache_range((unsigned long)vaddr,
+ (unsigned long)vaddr + 32);
+ kunmap_atomic(vaddr);
+ } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
+ local_irq_save(flags);
+ memcpy((void *)opc, (void *)&replace, sizeof(u32));
+ local_flush_icache_range((unsigned long)opc,
+ (unsigned long)opc + 32);
+ local_irq_restore(flags);
+ } else {
+ kvm_err("%s: Invalid address: %p\n", __func__, opc);
+ return -EFAULT;
+ }
-#define LW_TEMPLATE 0x8c000000
-#define CLEAR_TEMPLATE 0x00000020
-#define SW_TEMPLATE 0xac000000
+ return 0;
+}
-int kvm_mips_trans_cache_index(uint32_t inst, uint32_t *opc,
+int kvm_mips_trans_cache_index(union mips_instruction inst, u32 *opc,
struct kvm_vcpu *vcpu)
{
- int result = 0;
- unsigned long kseg0_opc;
- uint32_t synci_inst = 0x0;
+ union mips_instruction nop_inst = { 0 };
/* Replace the CACHE instruction, with a NOP */
- kseg0_opc =
- CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
- (vcpu, (unsigned long) opc));
- memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(uint32_t));
- local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
-
- return result;
+ return kvm_mips_trans_replace(vcpu, opc, nop_inst);
}
/*
* Address based CACHE instructions are transformed into synci(s). A little
* heavy for just D-cache invalidates, but avoids an expensive trap
*/
-int kvm_mips_trans_cache_va(uint32_t inst, uint32_t *opc,
+int kvm_mips_trans_cache_va(union mips_instruction inst, u32 *opc,
struct kvm_vcpu *vcpu)
{
- int result = 0;
- unsigned long kseg0_opc;
- uint32_t synci_inst = SYNCI_TEMPLATE, base, offset;
-
- base = (inst >> 21) & 0x1f;
- offset = inst & 0xffff;
- synci_inst |= (base << 21);
- synci_inst |= offset;
-
- kseg0_opc =
- CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
- (vcpu, (unsigned long) opc));
- memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(uint32_t));
- local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
-
- return result;
+ union mips_instruction synci_inst = { 0 };
+
+ synci_inst.i_format.opcode = bcond_op;
+ synci_inst.i_format.rs = inst.i_format.rs;
+ synci_inst.i_format.rt = synci_op;
+ if (cpu_has_mips_r6)
+ synci_inst.i_format.simmediate = inst.spec3_format.simmediate;
+ else
+ synci_inst.i_format.simmediate = inst.i_format.simmediate;
+
+ return kvm_mips_trans_replace(vcpu, opc, synci_inst);
}
-int kvm_mips_trans_mfc0(uint32_t inst, uint32_t *opc, struct kvm_vcpu *vcpu)
+int kvm_mips_trans_mfc0(union mips_instruction inst, u32 *opc,
+ struct kvm_vcpu *vcpu)
{
- int32_t rt, rd, sel;
- uint32_t mfc0_inst;
- unsigned long kseg0_opc, flags;
-
- rt = (inst >> 16) & 0x1f;
- rd = (inst >> 11) & 0x1f;
- sel = inst & 0x7;
+ union mips_instruction mfc0_inst = { 0 };
+ u32 rd, sel;
- if ((rd == MIPS_CP0_ERRCTL) && (sel == 0)) {
- mfc0_inst = CLEAR_TEMPLATE;
- mfc0_inst |= ((rt & 0x1f) << 16);
- } else {
- mfc0_inst = LW_TEMPLATE;
- mfc0_inst |= ((rt & 0x1f) << 16);
- mfc0_inst |= offsetof(struct kvm_mips_commpage,
- cop0.reg[rd][sel]);
- }
+ rd = inst.c0r_format.rd;
+ sel = inst.c0r_format.sel;
- if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) {
- kseg0_opc =
- CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
- (vcpu, (unsigned long) opc));
- memcpy((void *)kseg0_opc, (void *)&mfc0_inst, sizeof(uint32_t));
- local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
- } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
- local_irq_save(flags);
- memcpy((void *)opc, (void *)&mfc0_inst, sizeof(uint32_t));
- local_flush_icache_range((unsigned long)opc,
- (unsigned long)opc + 32);
- local_irq_restore(flags);
+ if (rd == MIPS_CP0_ERRCTL && sel == 0) {
+ mfc0_inst.r_format.opcode = spec_op;
+ mfc0_inst.r_format.rd = inst.c0r_format.rt;
+ mfc0_inst.r_format.func = add_op;
} else {
- kvm_err("%s: Invalid address: %p\n", __func__, opc);
- return -EFAULT;
+ mfc0_inst.i_format.opcode = lw_op;
+ mfc0_inst.i_format.rt = inst.c0r_format.rt;
+ mfc0_inst.i_format.simmediate = KVM_GUEST_COMMPAGE_ADDR |
+ offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]);
+#ifdef CONFIG_CPU_BIG_ENDIAN
+ if (sizeof(vcpu->arch.cop0->reg[0][0]) == 8)
+ mfc0_inst.i_format.simmediate |= 4;
+#endif
}
- return 0;
+ return kvm_mips_trans_replace(vcpu, opc, mfc0_inst);
}
-int kvm_mips_trans_mtc0(uint32_t inst, uint32_t *opc, struct kvm_vcpu *vcpu)
+int kvm_mips_trans_mtc0(union mips_instruction inst, u32 *opc,
+ struct kvm_vcpu *vcpu)
{
- int32_t rt, rd, sel;
- uint32_t mtc0_inst = SW_TEMPLATE;
- unsigned long kseg0_opc, flags;
-
- rt = (inst >> 16) & 0x1f;
- rd = (inst >> 11) & 0x1f;
- sel = inst & 0x7;
-
- mtc0_inst |= ((rt & 0x1f) << 16);
- mtc0_inst |= offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]);
-
- if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) {
- kseg0_opc =
- CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
- (vcpu, (unsigned long) opc));
- memcpy((void *)kseg0_opc, (void *)&mtc0_inst, sizeof(uint32_t));
- local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
- } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
- local_irq_save(flags);
- memcpy((void *)opc, (void *)&mtc0_inst, sizeof(uint32_t));
- local_flush_icache_range((unsigned long)opc,
- (unsigned long)opc + 32);
- local_irq_restore(flags);
- } else {
- kvm_err("%s: Invalid address: %p\n", __func__, opc);
- return -EFAULT;
- }
-
- return 0;
+ union mips_instruction mtc0_inst = { 0 };
+ u32 rd, sel;
+
+ rd = inst.c0r_format.rd;
+ sel = inst.c0r_format.sel;
+
+ mtc0_inst.i_format.opcode = sw_op;
+ mtc0_inst.i_format.rt = inst.c0r_format.rt;
+ mtc0_inst.i_format.simmediate = KVM_GUEST_COMMPAGE_ADDR |
+ offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]);
+#ifdef CONFIG_CPU_BIG_ENDIAN
+ if (sizeof(vcpu->arch.cop0->reg[0][0]) == 8)
+ mtc0_inst.i_format.simmediate |= 4;
+#endif
+
+ return kvm_mips_trans_replace(vcpu, opc, mtc0_inst);
}
diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index 645c8a1982a7..6eb52b9c9818 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -52,7 +52,7 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
goto unaligned;
/* Read the instruction */
- insn.word = kvm_get_inst((uint32_t *) epc, vcpu);
+ insn.word = kvm_get_inst((u32 *) epc, vcpu);
if (insn.word == KVM_INVALID_INST)
return KVM_INVALID_INST;
@@ -161,9 +161,12 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
nextpc = epc;
break;
- case blez_op: /* not really i_format */
- case blezl_op:
- /* rt field assumed to be zero */
+ case blez_op: /* POP06 */
+#ifndef CONFIG_CPU_MIPSR6
+ case blezl_op: /* removed in R6 */
+#endif
+ if (insn.i_format.rt != 0)
+ goto compact_branch;
if ((long)arch->gprs[insn.i_format.rs] <= 0)
epc = epc + 4 + (insn.i_format.simmediate << 2);
else
@@ -171,9 +174,12 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
nextpc = epc;
break;
- case bgtz_op:
- case bgtzl_op:
- /* rt field assumed to be zero */
+ case bgtz_op: /* POP07 */
+#ifndef CONFIG_CPU_MIPSR6
+ case bgtzl_op: /* removed in R6 */
+#endif
+ if (insn.i_format.rt != 0)
+ goto compact_branch;
if ((long)arch->gprs[insn.i_format.rs] > 0)
epc = epc + 4 + (insn.i_format.simmediate << 2);
else
@@ -185,6 +191,40 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
case cop1_op:
kvm_err("%s: unsupported cop1_op\n", __func__);
break;
+
+#ifdef CONFIG_CPU_MIPSR6
+ /* R6 added the following compact branches with forbidden slots */
+ case blezl_op: /* POP26 */
+ case bgtzl_op: /* POP27 */
+ /* only rt == 0 isn't compact branch */
+ if (insn.i_format.rt != 0)
+ goto compact_branch;
+ break;
+ case pop10_op:
+ case pop30_op:
+ /* only rs == rt == 0 is reserved, rest are compact branches */
+ if (insn.i_format.rs != 0 || insn.i_format.rt != 0)
+ goto compact_branch;
+ break;
+ case pop66_op:
+ case pop76_op:
+ /* only rs == 0 isn't compact branch */
+ if (insn.i_format.rs != 0)
+ goto compact_branch;
+ break;
+compact_branch:
+ /*
+ * If we've hit an exception on the forbidden slot, then
+ * the branch must not have been taken.
+ */
+ epc += 8;
+ nextpc = epc;
+ break;
+#else
+compact_branch:
+ /* Compact branches not supported before R6 */
+ break;
+#endif
}
return nextpc;
@@ -198,7 +238,7 @@ sigill:
return nextpc;
}
-enum emulation_result update_pc(struct kvm_vcpu *vcpu, uint32_t cause)
+enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause)
{
unsigned long branch_pc;
enum emulation_result er = EMULATE_DONE;
@@ -243,7 +283,7 @@ static inline int kvm_mips_count_disabled(struct kvm_vcpu *vcpu)
*
* Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is running).
*/
-static uint32_t kvm_mips_ktime_to_count(struct kvm_vcpu *vcpu, ktime_t now)
+static u32 kvm_mips_ktime_to_count(struct kvm_vcpu *vcpu, ktime_t now)
{
s64 now_ns, periods;
u64 delta;
@@ -300,11 +340,11 @@ static inline ktime_t kvm_mips_count_time(struct kvm_vcpu *vcpu)
*
* Returns: The current value of the guest CP0_Count register.
*/
-static uint32_t kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now)
+static u32 kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
ktime_t expires, threshold;
- uint32_t count, compare;
+ u32 count, compare;
int running;
/* Calculate the biased and scaled guest CP0_Count */
@@ -315,7 +355,7 @@ static uint32_t kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now)
* Find whether CP0_Count has reached the closest timer interrupt. If
* not, we shouldn't inject it.
*/
- if ((int32_t)(count - compare) < 0)
+ if ((s32)(count - compare) < 0)
return count;
/*
@@ -360,7 +400,7 @@ static uint32_t kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now)
*
* Returns: The current guest CP0_Count value.
*/
-uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu)
+u32 kvm_mips_read_count(struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
@@ -387,8 +427,7 @@ uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu)
*
* Returns: The ktime at the point of freeze.
*/
-static ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu,
- uint32_t *count)
+static ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu, u32 *count)
{
ktime_t now;
@@ -419,16 +458,16 @@ static ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu,
* Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is running).
*/
static void kvm_mips_resume_hrtimer(struct kvm_vcpu *vcpu,
- ktime_t now, uint32_t count)
+ ktime_t now, u32 count)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
- uint32_t compare;
+ u32 compare;
u64 delta;
ktime_t expire;
/* Calculate timeout (wrap 0 to 2^32) */
compare = kvm_read_c0_guest_compare(cop0);
- delta = (u64)(uint32_t)(compare - count - 1) + 1;
+ delta = (u64)(u32)(compare - count - 1) + 1;
delta = div_u64(delta * NSEC_PER_SEC, vcpu->arch.count_hz);
expire = ktime_add_ns(now, delta);
@@ -444,7 +483,7 @@ static void kvm_mips_resume_hrtimer(struct kvm_vcpu *vcpu,
*
* Sets the CP0_Count value and updates the timer accordingly.
*/
-void kvm_mips_write_count(struct kvm_vcpu *vcpu, uint32_t count)
+void kvm_mips_write_count(struct kvm_vcpu *vcpu, u32 count)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
ktime_t now;
@@ -538,13 +577,13 @@ int kvm_mips_set_count_hz(struct kvm_vcpu *vcpu, s64 count_hz)
* If @ack, atomically acknowledge any pending timer interrupt, otherwise ensure
* any pending timer interrupt is preserved.
*/
-void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare, bool ack)
+void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
int dc;
u32 old_compare = kvm_read_c0_guest_compare(cop0);
ktime_t now;
- uint32_t count;
+ u32 count;
/* if unchanged, must just be an ack */
if (old_compare == compare) {
@@ -585,7 +624,7 @@ void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare, bool ack)
static ktime_t kvm_mips_count_disable(struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
- uint32_t count;
+ u32 count;
ktime_t now;
/* Stop hrtimer */
@@ -632,7 +671,7 @@ void kvm_mips_count_disable_cause(struct kvm_vcpu *vcpu)
void kvm_mips_count_enable_cause(struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
- uint32_t count;
+ u32 count;
kvm_clear_c0_guest_cause(cop0, CAUSEF_DC);
@@ -661,7 +700,7 @@ int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl)
s64 changed = count_ctl ^ vcpu->arch.count_ctl;
s64 delta;
ktime_t expire, now;
- uint32_t count, compare;
+ u32 count, compare;
/* Only allow defined bits to be changed */
if (changed & ~(s64)(KVM_REG_MIPS_COUNT_CTL_DC))
@@ -687,7 +726,7 @@ int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl)
*/
count = kvm_read_c0_guest_count(cop0);
compare = kvm_read_c0_guest_compare(cop0);
- delta = (u64)(uint32_t)(compare - count - 1) + 1;
+ delta = (u64)(u32)(compare - count - 1) + 1;
delta = div_u64(delta * NSEC_PER_SEC,
vcpu->arch.count_hz);
expire = ktime_add_ns(vcpu->arch.count_resume, delta);
@@ -776,7 +815,7 @@ enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu)
vcpu->arch.pending_exceptions);
++vcpu->stat.wait_exits;
- trace_kvm_exit(vcpu, WAIT_EXITS);
+ trace_kvm_exit(vcpu, KVM_TRACE_EXIT_WAIT);
if (!vcpu->arch.pending_exceptions) {
vcpu->arch.wait = 1;
kvm_vcpu_block(vcpu);
@@ -801,9 +840,9 @@ enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu)
enum emulation_result kvm_mips_emul_tlbr(struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
- uint32_t pc = vcpu->arch.pc;
+ unsigned long pc = vcpu->arch.pc;
- kvm_err("[%#x] COP0_TLBR [%ld]\n", pc, kvm_read_c0_guest_index(cop0));
+ kvm_err("[%#lx] COP0_TLBR [%ld]\n", pc, kvm_read_c0_guest_index(cop0));
return EMULATE_FAIL;
}
@@ -813,11 +852,11 @@ enum emulation_result kvm_mips_emul_tlbwi(struct kvm_vcpu *vcpu)
struct mips_coproc *cop0 = vcpu->arch.cop0;
int index = kvm_read_c0_guest_index(cop0);
struct kvm_mips_tlb *tlb = NULL;
- uint32_t pc = vcpu->arch.pc;
+ unsigned long pc = vcpu->arch.pc;
if (index < 0 || index >= KVM_MIPS_GUEST_TLB_SIZE) {
kvm_debug("%s: illegal index: %d\n", __func__, index);
- kvm_debug("[%#x] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n",
+ kvm_debug("[%#lx] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n",
pc, index, kvm_read_c0_guest_entryhi(cop0),
kvm_read_c0_guest_entrylo0(cop0),
kvm_read_c0_guest_entrylo1(cop0),
@@ -834,10 +873,10 @@ enum emulation_result kvm_mips_emul_tlbwi(struct kvm_vcpu *vcpu)
tlb->tlb_mask = kvm_read_c0_guest_pagemask(cop0);
tlb->tlb_hi = kvm_read_c0_guest_entryhi(cop0);
- tlb->tlb_lo0 = kvm_read_c0_guest_entrylo0(cop0);
- tlb->tlb_lo1 = kvm_read_c0_guest_entrylo1(cop0);
+ tlb->tlb_lo[0] = kvm_read_c0_guest_entrylo0(cop0);
+ tlb->tlb_lo[1] = kvm_read_c0_guest_entrylo1(cop0);
- kvm_debug("[%#x] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n",
+ kvm_debug("[%#lx] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n",
pc, index, kvm_read_c0_guest_entryhi(cop0),
kvm_read_c0_guest_entrylo0(cop0),
kvm_read_c0_guest_entrylo1(cop0),
@@ -851,7 +890,7 @@ enum emulation_result kvm_mips_emul_tlbwr(struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
struct kvm_mips_tlb *tlb = NULL;
- uint32_t pc = vcpu->arch.pc;
+ unsigned long pc = vcpu->arch.pc;
int index;
get_random_bytes(&index, sizeof(index));
@@ -867,10 +906,10 @@ enum emulation_result kvm_mips_emul_tlbwr(struct kvm_vcpu *vcpu)
tlb->tlb_mask = kvm_read_c0_guest_pagemask(cop0);
tlb->tlb_hi = kvm_read_c0_guest_entryhi(cop0);
- tlb->tlb_lo0 = kvm_read_c0_guest_entrylo0(cop0);
- tlb->tlb_lo1 = kvm_read_c0_guest_entrylo1(cop0);
+ tlb->tlb_lo[0] = kvm_read_c0_guest_entrylo0(cop0);
+ tlb->tlb_lo[1] = kvm_read_c0_guest_entrylo1(cop0);
- kvm_debug("[%#x] COP0_TLBWR[%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx)\n",
+ kvm_debug("[%#lx] COP0_TLBWR[%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx)\n",
pc, index, kvm_read_c0_guest_entryhi(cop0),
kvm_read_c0_guest_entrylo0(cop0),
kvm_read_c0_guest_entrylo1(cop0));
@@ -882,14 +921,14 @@ enum emulation_result kvm_mips_emul_tlbp(struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
long entryhi = kvm_read_c0_guest_entryhi(cop0);
- uint32_t pc = vcpu->arch.pc;
+ unsigned long pc = vcpu->arch.pc;
int index = -1;
index = kvm_mips_guest_tlb_lookup(vcpu, entryhi);
kvm_write_c0_guest_index(cop0, index);
- kvm_debug("[%#x] COP0_TLBP (entryhi: %#lx), index: %d\n", pc, entryhi,
+ kvm_debug("[%#lx] COP0_TLBP (entryhi: %#lx), index: %d\n", pc, entryhi,
index);
return EMULATE_DONE;
@@ -922,8 +961,8 @@ unsigned int kvm_mips_config1_wrmask(struct kvm_vcpu *vcpu)
*/
unsigned int kvm_mips_config3_wrmask(struct kvm_vcpu *vcpu)
{
- /* Config4 is optional */
- unsigned int mask = MIPS_CONF_M;
+ /* Config4 and ULRI are optional */
+ unsigned int mask = MIPS_CONF_M | MIPS_CONF3_ULRI;
/* Permit MSA to be present if MSA is supported */
if (kvm_mips_guest_can_have_msa(&vcpu->arch))
@@ -942,7 +981,12 @@ unsigned int kvm_mips_config3_wrmask(struct kvm_vcpu *vcpu)
unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu)
{
/* Config5 is optional */
- return MIPS_CONF_M;
+ unsigned int mask = MIPS_CONF_M;
+
+ /* KScrExist */
+ mask |= (unsigned int)vcpu->arch.kscratch_enabled << 16;
+
+ return mask;
}
/**
@@ -973,14 +1017,14 @@ unsigned int kvm_mips_config5_wrmask(struct kvm_vcpu *vcpu)
return mask;
}
-enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
- uint32_t cause, struct kvm_run *run,
+enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
+ u32 *opc, u32 cause,
+ struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
enum emulation_result er = EMULATE_DONE;
- int32_t rt, rd, copz, sel, co_bit, op;
- uint32_t pc = vcpu->arch.pc;
+ u32 rt, rd, sel;
unsigned long curr_pc;
/*
@@ -992,16 +1036,8 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
if (er == EMULATE_FAIL)
return er;
- copz = (inst >> 21) & 0x1f;
- rt = (inst >> 16) & 0x1f;
- rd = (inst >> 11) & 0x1f;
- sel = inst & 0x7;
- co_bit = (inst >> 25) & 1;
-
- if (co_bit) {
- op = (inst) & 0xff;
-
- switch (op) {
+ if (inst.co_format.co) {
+ switch (inst.co_format.func) {
case tlbr_op: /* Read indexed TLB entry */
er = kvm_mips_emul_tlbr(vcpu);
break;
@@ -1020,47 +1056,58 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
case eret_op:
er = kvm_mips_emul_eret(vcpu);
goto dont_update_pc;
- break;
case wait_op:
er = kvm_mips_emul_wait(vcpu);
break;
}
} else {
- switch (copz) {
+ rt = inst.c0r_format.rt;
+ rd = inst.c0r_format.rd;
+ sel = inst.c0r_format.sel;
+
+ switch (inst.c0r_format.rs) {
case mfc_op:
#ifdef CONFIG_KVM_MIPS_DEBUG_COP0_COUNTERS
cop0->stat[rd][sel]++;
#endif
/* Get reg */
if ((rd == MIPS_CP0_COUNT) && (sel == 0)) {
- vcpu->arch.gprs[rt] = kvm_mips_read_count(vcpu);
+ vcpu->arch.gprs[rt] =
+ (s32)kvm_mips_read_count(vcpu);
} else if ((rd == MIPS_CP0_ERRCTL) && (sel == 0)) {
vcpu->arch.gprs[rt] = 0x0;
#ifdef CONFIG_KVM_MIPS_DYN_TRANS
kvm_mips_trans_mfc0(inst, opc, vcpu);
#endif
} else {
- vcpu->arch.gprs[rt] = cop0->reg[rd][sel];
+ vcpu->arch.gprs[rt] = (s32)cop0->reg[rd][sel];
#ifdef CONFIG_KVM_MIPS_DYN_TRANS
kvm_mips_trans_mfc0(inst, opc, vcpu);
#endif
}
- kvm_debug
- ("[%#x] MFCz[%d][%d], vcpu->arch.gprs[%d]: %#lx\n",
- pc, rd, sel, rt, vcpu->arch.gprs[rt]);
-
+ trace_kvm_hwr(vcpu, KVM_TRACE_MFC0,
+ KVM_TRACE_COP0(rd, sel),
+ vcpu->arch.gprs[rt]);
break;
case dmfc_op:
vcpu->arch.gprs[rt] = cop0->reg[rd][sel];
+
+ trace_kvm_hwr(vcpu, KVM_TRACE_DMFC0,
+ KVM_TRACE_COP0(rd, sel),
+ vcpu->arch.gprs[rt]);
break;
case mtc_op:
#ifdef CONFIG_KVM_MIPS_DEBUG_COP0_COUNTERS
cop0->stat[rd][sel]++;
#endif
+ trace_kvm_hwr(vcpu, KVM_TRACE_MTC0,
+ KVM_TRACE_COP0(rd, sel),
+ vcpu->arch.gprs[rt]);
+
if ((rd == MIPS_CP0_TLB_INDEX)
&& (vcpu->arch.gprs[rt] >=
KVM_MIPS_GUEST_TLB_SIZE)) {
@@ -1078,16 +1125,15 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
kvm_err("MTCz, cop0->reg[EBASE]: %#lx\n",
kvm_read_c0_guest_ebase(cop0));
} else if (rd == MIPS_CP0_TLB_HI && sel == 0) {
- uint32_t nasid =
+ u32 nasid =
vcpu->arch.gprs[rt] & KVM_ENTRYHI_ASID;
if ((KSEGX(vcpu->arch.gprs[rt]) != CKSEG0) &&
((kvm_read_c0_guest_entryhi(cop0) &
KVM_ENTRYHI_ASID) != nasid)) {
- kvm_debug("MTCz, change ASID from %#lx to %#lx\n",
+ trace_kvm_asid_change(vcpu,
kvm_read_c0_guest_entryhi(cop0)
- & KVM_ENTRYHI_ASID,
- vcpu->arch.gprs[rt]
- & KVM_ENTRYHI_ASID);
+ & KVM_ENTRYHI_ASID,
+ nasid);
/* Blow away the shadow host TLBs */
kvm_mips_flush_host_tlb(1);
@@ -1100,10 +1146,6 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
kvm_mips_write_count(vcpu, vcpu->arch.gprs[rt]);
goto done;
} else if ((rd == MIPS_CP0_COMPARE) && (sel == 0)) {
- kvm_debug("[%#x] MTCz, COMPARE %#lx <- %#lx\n",
- pc, kvm_read_c0_guest_compare(cop0),
- vcpu->arch.gprs[rt]);
-
/* If we are writing to COMPARE */
/* Clear pending timer interrupt, if any */
kvm_mips_write_compare(vcpu,
@@ -1155,7 +1197,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
* it first.
*/
if (change & ST0_CU1 && !(val & ST0_FR) &&
- vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA)
+ vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA)
kvm_lose_fpu(vcpu);
/*
@@ -1166,7 +1208,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
* the near future.
*/
if (change & ST0_CU1 &&
- vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU)
+ vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU)
change_c0_status(ST0_CU1, val);
preempt_enable();
@@ -1201,7 +1243,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
* context is already loaded.
*/
if (change & MIPS_CONF5_FRE &&
- vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU)
+ vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU)
change_c0_config5(MIPS_CONF5_FRE, val);
/*
@@ -1211,7 +1253,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
* quickly enabled again in the near future.
*/
if (change & MIPS_CONF5_MSAEN &&
- vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA)
+ vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA)
change_c0_config5(MIPS_CONF5_MSAEN,
val);
@@ -1219,7 +1261,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
kvm_write_c0_guest_config5(cop0, val);
} else if ((rd == MIPS_CP0_CAUSE) && (sel == 0)) {
- uint32_t old_cause, new_cause;
+ u32 old_cause, new_cause;
old_cause = kvm_read_c0_guest_cause(cop0);
new_cause = vcpu->arch.gprs[rt];
@@ -1233,20 +1275,30 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
else
kvm_mips_count_enable_cause(vcpu);
}
+ } else if ((rd == MIPS_CP0_HWRENA) && (sel == 0)) {
+ u32 mask = MIPS_HWRENA_CPUNUM |
+ MIPS_HWRENA_SYNCISTEP |
+ MIPS_HWRENA_CC |
+ MIPS_HWRENA_CCRES;
+
+ if (kvm_read_c0_guest_config3(cop0) &
+ MIPS_CONF3_ULRI)
+ mask |= MIPS_HWRENA_ULR;
+ cop0->reg[rd][sel] = vcpu->arch.gprs[rt] & mask;
} else {
cop0->reg[rd][sel] = vcpu->arch.gprs[rt];
#ifdef CONFIG_KVM_MIPS_DYN_TRANS
kvm_mips_trans_mtc0(inst, opc, vcpu);
#endif
}
-
- kvm_debug("[%#x] MTCz, cop0->reg[%d][%d]: %#lx\n", pc,
- rd, sel, cop0->reg[rd][sel]);
break;
case dmtc_op:
kvm_err("!!!!!!![%#lx]dmtc_op: rt: %d, rd: %d, sel: %d!!!!!!\n",
vcpu->arch.pc, rt, rd, sel);
+ trace_kvm_hwr(vcpu, KVM_TRACE_DMTC0,
+ KVM_TRACE_COP0(rd, sel),
+ vcpu->arch.gprs[rt]);
er = EMULATE_FAIL;
break;
@@ -1258,7 +1310,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
vcpu->arch.gprs[rt] =
kvm_read_c0_guest_status(cop0);
/* EI */
- if (inst & 0x20) {
+ if (inst.mfmc0_format.sc) {
kvm_debug("[%#lx] mfmc0_op: EI\n",
vcpu->arch.pc);
kvm_set_c0_guest_status(cop0, ST0_IE);
@@ -1272,9 +1324,8 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
case wrpgpr_op:
{
- uint32_t css =
- cop0->reg[MIPS_CP0_STATUS][2] & 0xf;
- uint32_t pss =
+ u32 css = cop0->reg[MIPS_CP0_STATUS][2] & 0xf;
+ u32 pss =
(cop0->reg[MIPS_CP0_STATUS][2] >> 6) & 0xf;
/*
* We don't support any shadow register sets, so
@@ -1291,7 +1342,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
break;
default:
kvm_err("[%#lx]MachEmulateCP0: unsupported COP0, copz: 0x%x\n",
- vcpu->arch.pc, copz);
+ vcpu->arch.pc, inst.c0r_format.rs);
er = EMULATE_FAIL;
break;
}
@@ -1312,13 +1363,14 @@ dont_update_pc:
return er;
}
-enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause,
+enum emulation_result kvm_mips_emulate_store(union mips_instruction inst,
+ u32 cause,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
enum emulation_result er = EMULATE_DO_MMIO;
- int32_t op, base, rt, offset;
- uint32_t bytes;
+ u32 rt;
+ u32 bytes;
void *data = run->mmio.data;
unsigned long curr_pc;
@@ -1331,12 +1383,9 @@ enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause,
if (er == EMULATE_FAIL)
return er;
- rt = (inst >> 16) & 0x1f;
- base = (inst >> 21) & 0x1f;
- offset = inst & 0xffff;
- op = (inst >> 26) & 0x3f;
+ rt = inst.i_format.rt;
- switch (op) {
+ switch (inst.i_format.opcode) {
case sb_op:
bytes = 1;
if (bytes > sizeof(run->mmio.data)) {
@@ -1357,7 +1406,7 @@ enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause,
*(u8 *) data = vcpu->arch.gprs[rt];
kvm_debug("OP_SB: eaddr: %#lx, gpr: %#lx, data: %#x\n",
vcpu->arch.host_cp0_badvaddr, vcpu->arch.gprs[rt],
- *(uint8_t *) data);
+ *(u8 *) data);
break;
@@ -1379,11 +1428,11 @@ enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause,
run->mmio.is_write = 1;
vcpu->mmio_needed = 1;
vcpu->mmio_is_write = 1;
- *(uint32_t *) data = vcpu->arch.gprs[rt];
+ *(u32 *) data = vcpu->arch.gprs[rt];
kvm_debug("[%#lx] OP_SW: eaddr: %#lx, gpr: %#lx, data: %#x\n",
vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr,
- vcpu->arch.gprs[rt], *(uint32_t *) data);
+ vcpu->arch.gprs[rt], *(u32 *) data);
break;
case sh_op:
@@ -1404,15 +1453,16 @@ enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause,
run->mmio.is_write = 1;
vcpu->mmio_needed = 1;
vcpu->mmio_is_write = 1;
- *(uint16_t *) data = vcpu->arch.gprs[rt];
+ *(u16 *) data = vcpu->arch.gprs[rt];
kvm_debug("[%#lx] OP_SH: eaddr: %#lx, gpr: %#lx, data: %#x\n",
vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr,
- vcpu->arch.gprs[rt], *(uint32_t *) data);
+ vcpu->arch.gprs[rt], *(u32 *) data);
break;
default:
- kvm_err("Store not yet supported");
+ kvm_err("Store not yet supported (inst=0x%08x)\n",
+ inst.word);
er = EMULATE_FAIL;
break;
}
@@ -1424,18 +1474,16 @@ enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause,
return er;
}
-enum emulation_result kvm_mips_emulate_load(uint32_t inst, uint32_t cause,
- struct kvm_run *run,
+enum emulation_result kvm_mips_emulate_load(union mips_instruction inst,
+ u32 cause, struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
enum emulation_result er = EMULATE_DO_MMIO;
- int32_t op, base, rt, offset;
- uint32_t bytes;
+ u32 op, rt;
+ u32 bytes;
- rt = (inst >> 16) & 0x1f;
- base = (inst >> 21) & 0x1f;
- offset = inst & 0xffff;
- op = (inst >> 26) & 0x3f;
+ rt = inst.i_format.rt;
+ op = inst.i_format.opcode;
vcpu->arch.pending_load_cause = cause;
vcpu->arch.io_gpr = rt;
@@ -1521,7 +1569,8 @@ enum emulation_result kvm_mips_emulate_load(uint32_t inst, uint32_t cause,
break;
default:
- kvm_err("Load not yet supported");
+ kvm_err("Load not yet supported (inst=0x%08x)\n",
+ inst.word);
er = EMULATE_FAIL;
break;
}
@@ -1529,40 +1578,15 @@ enum emulation_result kvm_mips_emulate_load(uint32_t inst, uint32_t cause,
return er;
}
-int kvm_mips_sync_icache(unsigned long va, struct kvm_vcpu *vcpu)
-{
- unsigned long offset = (va & ~PAGE_MASK);
- struct kvm *kvm = vcpu->kvm;
- unsigned long pa;
- gfn_t gfn;
- kvm_pfn_t pfn;
-
- gfn = va >> PAGE_SHIFT;
-
- if (gfn >= kvm->arch.guest_pmap_npages) {
- kvm_err("%s: Invalid gfn: %#llx\n", __func__, gfn);
- kvm_mips_dump_host_tlbs();
- kvm_arch_vcpu_dump_regs(vcpu);
- return -1;
- }
- pfn = kvm->arch.guest_pmap[gfn];
- pa = (pfn << PAGE_SHIFT) | offset;
-
- kvm_debug("%s: va: %#lx, unmapped: %#x\n", __func__, va,
- CKSEG0ADDR(pa));
-
- local_flush_icache_range(CKSEG0ADDR(pa), 32);
- return 0;
-}
-
-enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc,
- uint32_t cause,
+enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst,
+ u32 *opc, u32 cause,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
enum emulation_result er = EMULATE_DONE;
- int32_t offset, cache, op_inst, op, base;
+ u32 cache, op_inst, op, base;
+ s16 offset;
struct kvm_vcpu_arch *arch = &vcpu->arch;
unsigned long va;
unsigned long curr_pc;
@@ -1576,9 +1600,12 @@ enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc,
if (er == EMULATE_FAIL)
return er;
- base = (inst >> 21) & 0x1f;
- op_inst = (inst >> 16) & 0x1f;
- offset = (int16_t)inst;
+ base = inst.i_format.rs;
+ op_inst = inst.i_format.rt;
+ if (cpu_has_mips_r6)
+ offset = inst.spec3_format.simmediate;
+ else
+ offset = inst.i_format.simmediate;
cache = op_inst & CacheOp_Cache;
op = op_inst & CacheOp_Op;
@@ -1634,7 +1661,6 @@ enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc,
(cop0) & KVM_ENTRYHI_ASID));
if (index < 0) {
- vcpu->arch.host_cp0_entryhi = (va & VPN2_MASK);
vcpu->arch.host_cp0_badvaddr = va;
vcpu->arch.pc = curr_pc;
er = kvm_mips_emulate_tlbmiss_ld(cause, NULL, run,
@@ -1659,9 +1685,7 @@ enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc,
* We fault an entry from the guest tlb to the
* shadow host TLB
*/
- kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb,
- NULL,
- NULL);
+ kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb);
}
}
} else {
@@ -1714,20 +1738,20 @@ dont_update_pc:
return er;
}
-enum emulation_result kvm_mips_emulate_inst(unsigned long cause, uint32_t *opc,
+enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
+ union mips_instruction inst;
enum emulation_result er = EMULATE_DONE;
- uint32_t inst;
/* Fetch the instruction. */
if (cause & CAUSEF_BD)
opc += 1;
- inst = kvm_get_inst(opc, vcpu);
+ inst.word = kvm_get_inst(opc, vcpu);
- switch (((union mips_instruction)inst).r_format.opcode) {
+ switch (inst.r_format.opcode) {
case cop0_op:
er = kvm_mips_emulate_CP0(inst, opc, cause, run, vcpu);
break;
@@ -1744,15 +1768,31 @@ enum emulation_result kvm_mips_emulate_inst(unsigned long cause, uint32_t *opc,
er = kvm_mips_emulate_load(inst, cause, run, vcpu);
break;
+#ifndef CONFIG_CPU_MIPSR6
case cache_op:
++vcpu->stat.cache_exits;
- trace_kvm_exit(vcpu, CACHE_EXITS);
+ trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE);
er = kvm_mips_emulate_cache(inst, opc, cause, run, vcpu);
break;
+#else
+ case spec3_op:
+ switch (inst.spec3_format.func) {
+ case cache6_op:
+ ++vcpu->stat.cache_exits;
+ trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE);
+ er = kvm_mips_emulate_cache(inst, opc, cause, run,
+ vcpu);
+ break;
+ default:
+ goto unknown;
+ };
+ break;
+unknown:
+#endif
default:
kvm_err("Instruction emulation not supported (%p/%#x)\n", opc,
- inst);
+ inst.word);
kvm_arch_vcpu_dump_regs(vcpu);
er = EMULATE_FAIL;
break;
@@ -1761,8 +1801,8 @@ enum emulation_result kvm_mips_emulate_inst(unsigned long cause, uint32_t *opc,
return er;
}
-enum emulation_result kvm_mips_emulate_syscall(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_emulate_syscall(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -1796,8 +1836,8 @@ enum emulation_result kvm_mips_emulate_syscall(unsigned long cause,
return er;
}
-enum emulation_result kvm_mips_emulate_tlbmiss_ld(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -1842,8 +1882,8 @@ enum emulation_result kvm_mips_emulate_tlbmiss_ld(unsigned long cause,
return EMULATE_DONE;
}
-enum emulation_result kvm_mips_emulate_tlbinv_ld(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -1888,8 +1928,8 @@ enum emulation_result kvm_mips_emulate_tlbinv_ld(unsigned long cause,
return EMULATE_DONE;
}
-enum emulation_result kvm_mips_emulate_tlbmiss_st(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -1932,8 +1972,8 @@ enum emulation_result kvm_mips_emulate_tlbmiss_st(unsigned long cause,
return EMULATE_DONE;
}
-enum emulation_result kvm_mips_emulate_tlbinv_st(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -1977,7 +2017,7 @@ enum emulation_result kvm_mips_emulate_tlbinv_st(unsigned long cause,
}
/* TLBMOD: store into address matching TLB with Dirty bit off */
-enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause, uint32_t *opc,
+enum emulation_result kvm_mips_handle_tlbmod(u32 cause, u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -2005,8 +2045,8 @@ enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause, uint32_t *opc,
return er;
}
-enum emulation_result kvm_mips_emulate_tlbmod(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_emulate_tlbmod(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -2048,8 +2088,8 @@ enum emulation_result kvm_mips_emulate_tlbmod(unsigned long cause,
return EMULATE_DONE;
}
-enum emulation_result kvm_mips_emulate_fpu_exc(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -2077,8 +2117,8 @@ enum emulation_result kvm_mips_emulate_fpu_exc(unsigned long cause,
return EMULATE_DONE;
}
-enum emulation_result kvm_mips_emulate_ri_exc(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_emulate_ri_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -2112,8 +2152,8 @@ enum emulation_result kvm_mips_emulate_ri_exc(unsigned long cause,
return er;
}
-enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_emulate_bp_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -2147,8 +2187,8 @@ enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause,
return er;
}
-enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_emulate_trap_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -2182,8 +2222,8 @@ enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause,
return er;
}
-enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -2217,8 +2257,8 @@ enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause,
return er;
}
-enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -2252,8 +2292,8 @@ enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause,
return er;
}
-enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -2287,22 +2327,7 @@ enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause,
return er;
}
-/* ll/sc, rdhwr, sync emulation */
-
-#define OPCODE 0xfc000000
-#define BASE 0x03e00000
-#define RT 0x001f0000
-#define OFFSET 0x0000ffff
-#define LL 0xc0000000
-#define SC 0xe0000000
-#define SPEC0 0x00000000
-#define SPEC3 0x7c000000
-#define RD 0x0000f800
-#define FUNC 0x0000003f
-#define SYNC 0x0000000f
-#define RDHWR 0x0000003b
-
-enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
+enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -2310,7 +2335,7 @@ enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
struct kvm_vcpu_arch *arch = &vcpu->arch;
enum emulation_result er = EMULATE_DONE;
unsigned long curr_pc;
- uint32_t inst;
+ union mips_instruction inst;
/*
* Update PC and hold onto current PC in case there is
@@ -2325,17 +2350,22 @@ enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
if (cause & CAUSEF_BD)
opc += 1;
- inst = kvm_get_inst(opc, vcpu);
+ inst.word = kvm_get_inst(opc, vcpu);
- if (inst == KVM_INVALID_INST) {
+ if (inst.word == KVM_INVALID_INST) {
kvm_err("%s: Cannot get inst @ %p\n", __func__, opc);
return EMULATE_FAIL;
}
- if ((inst & OPCODE) == SPEC3 && (inst & FUNC) == RDHWR) {
+ if (inst.r_format.opcode == spec3_op &&
+ inst.r_format.func == rdhwr_op &&
+ inst.r_format.rs == 0 &&
+ (inst.r_format.re >> 3) == 0) {
int usermode = !KVM_GUEST_KERNEL_MODE(vcpu);
- int rd = (inst & RD) >> 11;
- int rt = (inst & RT) >> 16;
+ int rd = inst.r_format.rd;
+ int rt = inst.r_format.rt;
+ int sel = inst.r_format.re & 0x7;
+
/* If usermode, check RDHWR rd is allowed by guest HWREna */
if (usermode && !(kvm_read_c0_guest_hwrena(cop0) & BIT(rd))) {
kvm_debug("RDHWR %#x disallowed by HWREna @ %p\n",
@@ -2343,17 +2373,17 @@ enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
goto emulate_ri;
}
switch (rd) {
- case 0: /* CPU number */
- arch->gprs[rt] = 0;
+ case MIPS_HWR_CPUNUM: /* CPU number */
+ arch->gprs[rt] = vcpu->vcpu_id;
break;
- case 1: /* SYNCI length */
+ case MIPS_HWR_SYNCISTEP: /* SYNCI length */
arch->gprs[rt] = min(current_cpu_data.dcache.linesz,
current_cpu_data.icache.linesz);
break;
- case 2: /* Read count register */
- arch->gprs[rt] = kvm_mips_read_count(vcpu);
+ case MIPS_HWR_CC: /* Read count register */
+ arch->gprs[rt] = (s32)kvm_mips_read_count(vcpu);
break;
- case 3: /* Count register resolution */
+ case MIPS_HWR_CCRES: /* Count register resolution */
switch (current_cpu_data.cputype) {
case CPU_20KC:
case CPU_25KF:
@@ -2363,7 +2393,7 @@ enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
arch->gprs[rt] = 2;
}
break;
- case 29:
+ case MIPS_HWR_ULR: /* Read UserLocal register */
arch->gprs[rt] = kvm_read_c0_guest_userlocal(cop0);
break;
@@ -2371,8 +2401,12 @@ enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
kvm_debug("RDHWR %#x not supported @ %p\n", rd, opc);
goto emulate_ri;
}
+
+ trace_kvm_hwr(vcpu, KVM_TRACE_RDHWR, KVM_TRACE_HWR(rd, sel),
+ vcpu->arch.gprs[rt]);
} else {
- kvm_debug("Emulate RI not supported @ %p: %#x\n", opc, inst);
+ kvm_debug("Emulate RI not supported @ %p: %#x\n",
+ opc, inst.word);
goto emulate_ri;
}
@@ -2405,19 +2439,19 @@ enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu,
switch (run->mmio.len) {
case 4:
- *gpr = *(int32_t *) run->mmio.data;
+ *gpr = *(s32 *) run->mmio.data;
break;
case 2:
if (vcpu->mmio_needed == 2)
- *gpr = *(int16_t *) run->mmio.data;
+ *gpr = *(s16 *) run->mmio.data;
else
- *gpr = *(uint16_t *)run->mmio.data;
+ *gpr = *(u16 *)run->mmio.data;
break;
case 1:
if (vcpu->mmio_needed == 2)
- *gpr = *(int8_t *) run->mmio.data;
+ *gpr = *(s8 *) run->mmio.data;
else
*gpr = *(u8 *) run->mmio.data;
break;
@@ -2432,12 +2466,12 @@ done:
return er;
}
-static enum emulation_result kvm_mips_emulate_exc(unsigned long cause,
- uint32_t *opc,
+static enum emulation_result kvm_mips_emulate_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
- uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
+ u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
struct mips_coproc *cop0 = vcpu->arch.cop0;
struct kvm_vcpu_arch *arch = &vcpu->arch;
enum emulation_result er = EMULATE_DONE;
@@ -2470,13 +2504,13 @@ static enum emulation_result kvm_mips_emulate_exc(unsigned long cause,
return er;
}
-enum emulation_result kvm_mips_check_privilege(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_check_privilege(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
enum emulation_result er = EMULATE_DONE;
- uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
+ u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
int usermode = !KVM_GUEST_KERNEL_MODE(vcpu);
@@ -2566,18 +2600,18 @@ enum emulation_result kvm_mips_check_privilege(unsigned long cause,
* (2) TLB entry is present in the Guest TLB but not in the shadow, in this
* case we inject the TLB from the Guest TLB into the shadow host TLB
*/
-enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_handle_tlbmiss(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
enum emulation_result er = EMULATE_DONE;
- uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
+ u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
unsigned long va = vcpu->arch.host_cp0_badvaddr;
int index;
- kvm_debug("kvm_mips_handle_tlbmiss: badvaddr: %#lx, entryhi: %#lx\n",
- vcpu->arch.host_cp0_badvaddr, vcpu->arch.host_cp0_entryhi);
+ kvm_debug("kvm_mips_handle_tlbmiss: badvaddr: %#lx\n",
+ vcpu->arch.host_cp0_badvaddr);
/*
* KVM would not have got the exception if this entry was valid in the
@@ -2620,13 +2654,12 @@ enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause,
}
} else {
kvm_debug("Injecting hi: %#lx, lo0: %#lx, lo1: %#lx into shadow host TLB\n",
- tlb->tlb_hi, tlb->tlb_lo0, tlb->tlb_lo1);
+ tlb->tlb_hi, tlb->tlb_lo[0], tlb->tlb_lo[1]);
/*
* OK we have a Guest TLB entry, now inject it into the
* shadow host TLB
*/
- kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, NULL,
- NULL);
+ kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb);
}
}
diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c
new file mode 100644
index 000000000000..6a02b3a3fa65
--- /dev/null
+++ b/arch/mips/kvm/entry.c
@@ -0,0 +1,701 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Generation of main entry point for the guest, exception handling.
+ *
+ * Copyright (C) 2012 MIPS Technologies, Inc.
+ * Authors: Sanjay Lal <sanjayl@kymasys.com>
+ *
+ * Copyright (C) 2016 Imagination Technologies Ltd.
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/msa.h>
+#include <asm/setup.h>
+#include <asm/uasm.h>
+
+/* Register names */
+#define ZERO 0
+#define AT 1
+#define V0 2
+#define V1 3
+#define A0 4
+#define A1 5
+
+#if _MIPS_SIM == _MIPS_SIM_ABI32
+#define T0 8
+#define T1 9
+#define T2 10
+#define T3 11
+#endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */
+
+#if _MIPS_SIM == _MIPS_SIM_ABI64 || _MIPS_SIM == _MIPS_SIM_NABI32
+#define T0 12
+#define T1 13
+#define T2 14
+#define T3 15
+#endif /* _MIPS_SIM == _MIPS_SIM_ABI64 || _MIPS_SIM == _MIPS_SIM_NABI32 */
+
+#define S0 16
+#define S1 17
+#define T9 25
+#define K0 26
+#define K1 27
+#define GP 28
+#define SP 29
+#define RA 31
+
+/* Some CP0 registers */
+#define C0_HWRENA 7, 0
+#define C0_BADVADDR 8, 0
+#define C0_ENTRYHI 10, 0
+#define C0_STATUS 12, 0
+#define C0_CAUSE 13, 0
+#define C0_EPC 14, 0
+#define C0_EBASE 15, 1
+#define C0_CONFIG5 16, 5
+#define C0_DDATA_LO 28, 3
+#define C0_ERROREPC 30, 0
+
+#define CALLFRAME_SIZ 32
+
+#ifdef CONFIG_64BIT
+#define ST0_KX_IF_64 ST0_KX
+#else
+#define ST0_KX_IF_64 0
+#endif
+
+static unsigned int scratch_vcpu[2] = { C0_DDATA_LO };
+static unsigned int scratch_tmp[2] = { C0_ERROREPC };
+
+enum label_id {
+ label_fpu_1 = 1,
+ label_msa_1,
+ label_return_to_host,
+ label_kernel_asid,
+ label_exit_common,
+};
+
+UASM_L_LA(_fpu_1)
+UASM_L_LA(_msa_1)
+UASM_L_LA(_return_to_host)
+UASM_L_LA(_kernel_asid)
+UASM_L_LA(_exit_common)
+
+static void *kvm_mips_build_enter_guest(void *addr);
+static void *kvm_mips_build_ret_from_exit(void *addr);
+static void *kvm_mips_build_ret_to_guest(void *addr);
+static void *kvm_mips_build_ret_to_host(void *addr);
+
+/**
+ * kvm_mips_entry_setup() - Perform global setup for entry code.
+ *
+ * Perform global setup for entry code, such as choosing a scratch register.
+ *
+ * Returns: 0 on success.
+ * -errno on failure.
+ */
+int kvm_mips_entry_setup(void)
+{
+ /*
+ * We prefer to use KScratchN registers if they are available over the
+ * defaults above, which may not work on all cores.
+ */
+ unsigned int kscratch_mask = cpu_data[0].kscratch_mask & 0xfc;
+
+ /* Pick a scratch register for storing VCPU */
+ if (kscratch_mask) {
+ scratch_vcpu[0] = 31;
+ scratch_vcpu[1] = ffs(kscratch_mask) - 1;
+ kscratch_mask &= ~BIT(scratch_vcpu[1]);
+ }
+
+ /* Pick a scratch register to use as a temp for saving state */
+ if (kscratch_mask) {
+ scratch_tmp[0] = 31;
+ scratch_tmp[1] = ffs(kscratch_mask) - 1;
+ kscratch_mask &= ~BIT(scratch_tmp[1]);
+ }
+
+ return 0;
+}
+
+static void kvm_mips_build_save_scratch(u32 **p, unsigned int tmp,
+ unsigned int frame)
+{
+ /* Save the VCPU scratch register value in cp0_epc of the stack frame */
+ UASM_i_MFC0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]);
+ UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame);
+
+ /* Save the temp scratch register value in cp0_cause of stack frame */
+ if (scratch_tmp[0] == 31) {
+ UASM_i_MFC0(p, tmp, scratch_tmp[0], scratch_tmp[1]);
+ UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame);
+ }
+}
+
+static void kvm_mips_build_restore_scratch(u32 **p, unsigned int tmp,
+ unsigned int frame)
+{
+ /*
+ * Restore host scratch register values saved by
+ * kvm_mips_build_save_scratch().
+ */
+ UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame);
+ UASM_i_MTC0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]);
+
+ if (scratch_tmp[0] == 31) {
+ UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame);
+ UASM_i_MTC0(p, tmp, scratch_tmp[0], scratch_tmp[1]);
+ }
+}
+
+/**
+ * build_set_exc_base() - Assemble code to write exception base address.
+ * @p: Code buffer pointer.
+ * @reg: Source register (generated code may set WG bit in @reg).
+ *
+ * Assemble code to modify the exception base address in the EBase register,
+ * using the appropriately sized access and setting the WG bit if necessary.
+ */
+static inline void build_set_exc_base(u32 **p, unsigned int reg)
+{
+ if (cpu_has_ebase_wg) {
+ /* Set WG so that all the bits get written */
+ uasm_i_ori(p, reg, reg, MIPS_EBASE_WG);
+ UASM_i_MTC0(p, reg, C0_EBASE);
+ } else {
+ uasm_i_mtc0(p, reg, C0_EBASE);
+ }
+}
+
+/**
+ * kvm_mips_build_vcpu_run() - Assemble function to start running a guest VCPU.
+ * @addr: Address to start writing code.
+ *
+ * Assemble the start of the vcpu_run function to run a guest VCPU. The function
+ * conforms to the following prototype:
+ *
+ * int vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu);
+ *
+ * The exit from the guest and return to the caller is handled by the code
+ * generated by kvm_mips_build_ret_to_host().
+ *
+ * Returns: Next address after end of written function.
+ */
+void *kvm_mips_build_vcpu_run(void *addr)
+{
+ u32 *p = addr;
+ unsigned int i;
+
+ /*
+ * A0: run
+ * A1: vcpu
+ */
+
+ /* k0/k1 not being used in host kernel context */
+ UASM_i_ADDIU(&p, K1, SP, -(int)sizeof(struct pt_regs));
+ for (i = 16; i < 32; ++i) {
+ if (i == 24)
+ i = 28;
+ UASM_i_SW(&p, i, offsetof(struct pt_regs, regs[i]), K1);
+ }
+
+ /* Save host status */
+ uasm_i_mfc0(&p, V0, C0_STATUS);
+ UASM_i_SW(&p, V0, offsetof(struct pt_regs, cp0_status), K1);
+
+ /* Save scratch registers, will be used to store pointer to vcpu etc */
+ kvm_mips_build_save_scratch(&p, V1, K1);
+
+ /* VCPU scratch register has pointer to vcpu */
+ UASM_i_MTC0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]);
+
+ /* Offset into vcpu->arch */
+ UASM_i_ADDIU(&p, K1, A1, offsetof(struct kvm_vcpu, arch));
+
+ /*
+ * Save the host stack to VCPU, used for exception processing
+ * when we exit from the Guest
+ */
+ UASM_i_SW(&p, SP, offsetof(struct kvm_vcpu_arch, host_stack), K1);
+
+ /* Save the kernel gp as well */
+ UASM_i_SW(&p, GP, offsetof(struct kvm_vcpu_arch, host_gp), K1);
+
+ /*
+ * Setup status register for running the guest in UM, interrupts
+ * are disabled
+ */
+ UASM_i_LA(&p, K0, ST0_EXL | KSU_USER | ST0_BEV | ST0_KX_IF_64);
+ uasm_i_mtc0(&p, K0, C0_STATUS);
+ uasm_i_ehb(&p);
+
+ /* load up the new EBASE */
+ UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1);
+ build_set_exc_base(&p, K0);
+
+ /*
+ * Now that the new EBASE has been loaded, unset BEV, set
+ * interrupt mask as it was but make sure that timer interrupts
+ * are enabled
+ */
+ uasm_i_addiu(&p, K0, ZERO, ST0_EXL | KSU_USER | ST0_IE | ST0_KX_IF_64);
+ uasm_i_andi(&p, V0, V0, ST0_IM);
+ uasm_i_or(&p, K0, K0, V0);
+ uasm_i_mtc0(&p, K0, C0_STATUS);
+ uasm_i_ehb(&p);
+
+ p = kvm_mips_build_enter_guest(p);
+
+ return p;
+}
+
+/**
+ * kvm_mips_build_enter_guest() - Assemble code to resume guest execution.
+ * @addr: Address to start writing code.
+ *
+ * Assemble the code to resume guest execution. This code is common between the
+ * initial entry into the guest from the host, and returning from the exit
+ * handler back to the guest.
+ *
+ * Returns: Next address after end of written function.
+ */
+static void *kvm_mips_build_enter_guest(void *addr)
+{
+ u32 *p = addr;
+ unsigned int i;
+ struct uasm_label labels[2];
+ struct uasm_reloc relocs[2];
+ struct uasm_label *l = labels;
+ struct uasm_reloc *r = relocs;
+
+ memset(labels, 0, sizeof(labels));
+ memset(relocs, 0, sizeof(relocs));
+
+ /* Set Guest EPC */
+ UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, pc), K1);
+ UASM_i_MTC0(&p, T0, C0_EPC);
+
+ /* Set the ASID for the Guest Kernel */
+ UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, cop0), K1);
+ UASM_i_LW(&p, T0, offsetof(struct mips_coproc, reg[MIPS_CP0_STATUS][0]),
+ T0);
+ uasm_i_andi(&p, T0, T0, KSU_USER | ST0_ERL | ST0_EXL);
+ uasm_i_xori(&p, T0, T0, KSU_USER);
+ uasm_il_bnez(&p, &r, T0, label_kernel_asid);
+ UASM_i_ADDIU(&p, T1, K1,
+ offsetof(struct kvm_vcpu_arch, guest_kernel_asid));
+ /* else user */
+ UASM_i_ADDIU(&p, T1, K1,
+ offsetof(struct kvm_vcpu_arch, guest_user_asid));
+ uasm_l_kernel_asid(&l, p);
+
+ /* t1: contains the base of the ASID array, need to get the cpu id */
+ /* smp_processor_id */
+ uasm_i_lw(&p, T2, offsetof(struct thread_info, cpu), GP);
+ /* x4 */
+ uasm_i_sll(&p, T2, T2, 2);
+ UASM_i_ADDU(&p, T3, T1, T2);
+ uasm_i_lw(&p, K0, 0, T3);
+#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE
+ /* x sizeof(struct cpuinfo_mips)/4 */
+ uasm_i_addiu(&p, T3, ZERO, sizeof(struct cpuinfo_mips)/4);
+ uasm_i_mul(&p, T2, T2, T3);
+
+ UASM_i_LA_mostly(&p, AT, (long)&cpu_data[0].asid_mask);
+ UASM_i_ADDU(&p, AT, AT, T2);
+ UASM_i_LW(&p, T2, uasm_rel_lo((long)&cpu_data[0].asid_mask), AT);
+ uasm_i_and(&p, K0, K0, T2);
+#else
+ uasm_i_andi(&p, K0, K0, MIPS_ENTRYHI_ASID);
+#endif
+ uasm_i_mtc0(&p, K0, C0_ENTRYHI);
+ uasm_i_ehb(&p);
+
+ /* Disable RDHWR access */
+ uasm_i_mtc0(&p, ZERO, C0_HWRENA);
+
+ /* load the guest context from VCPU and return */
+ for (i = 1; i < 32; ++i) {
+ /* Guest k0/k1 loaded later */
+ if (i == K0 || i == K1)
+ continue;
+ UASM_i_LW(&p, i, offsetof(struct kvm_vcpu_arch, gprs[i]), K1);
+ }
+
+#ifndef CONFIG_CPU_MIPSR6
+ /* Restore hi/lo */
+ UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, hi), K1);
+ uasm_i_mthi(&p, K0);
+
+ UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, lo), K1);
+ uasm_i_mtlo(&p, K0);
+#endif
+
+ /* Restore the guest's k0/k1 registers */
+ UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1);
+ UASM_i_LW(&p, K1, offsetof(struct kvm_vcpu_arch, gprs[K1]), K1);
+
+ /* Jump to guest */
+ uasm_i_eret(&p);
+
+ uasm_resolve_relocs(relocs, labels);
+
+ return p;
+}
+
+/**
+ * kvm_mips_build_exception() - Assemble first level guest exception handler.
+ * @addr: Address to start writing code.
+ * @handler: Address of common handler (within range of @addr).
+ *
+ * Assemble exception vector code for guest execution. The generated vector will
+ * branch to the common exception handler generated by kvm_mips_build_exit().
+ *
+ * Returns: Next address after end of written function.
+ */
+void *kvm_mips_build_exception(void *addr, void *handler)
+{
+ u32 *p = addr;
+ struct uasm_label labels[2];
+ struct uasm_reloc relocs[2];
+ struct uasm_label *l = labels;
+ struct uasm_reloc *r = relocs;
+
+ memset(labels, 0, sizeof(labels));
+ memset(relocs, 0, sizeof(relocs));
+
+ /* Save guest k1 into scratch register */
+ UASM_i_MTC0(&p, K1, scratch_tmp[0], scratch_tmp[1]);
+
+ /* Get the VCPU pointer from the VCPU scratch register */
+ UASM_i_MFC0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]);
+ UASM_i_ADDIU(&p, K1, K1, offsetof(struct kvm_vcpu, arch));
+
+ /* Save guest k0 into VCPU structure */
+ UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1);
+
+ /* Branch to the common handler */
+ uasm_il_b(&p, &r, label_exit_common);
+ uasm_i_nop(&p);
+
+ uasm_l_exit_common(&l, handler);
+ uasm_resolve_relocs(relocs, labels);
+
+ return p;
+}
+
+/**
+ * kvm_mips_build_exit() - Assemble common guest exit handler.
+ * @addr: Address to start writing code.
+ *
+ * Assemble the generic guest exit handling code. This is called by the
+ * exception vectors (generated by kvm_mips_build_exception()), and calls
+ * kvm_mips_handle_exit(), then either resumes the guest or returns to the host
+ * depending on the return value.
+ *
+ * Returns: Next address after end of written function.
+ */
+void *kvm_mips_build_exit(void *addr)
+{
+ u32 *p = addr;
+ unsigned int i;
+ struct uasm_label labels[3];
+ struct uasm_reloc relocs[3];
+ struct uasm_label *l = labels;
+ struct uasm_reloc *r = relocs;
+
+ memset(labels, 0, sizeof(labels));
+ memset(relocs, 0, sizeof(relocs));
+
+ /*
+ * Generic Guest exception handler. We end up here when the guest
+ * does something that causes a trap to kernel mode.
+ *
+ * Both k0/k1 registers will have already been saved (k0 into the vcpu
+ * structure, and k1 into the scratch_tmp register).
+ *
+ * The k1 register will already contain the kvm_vcpu_arch pointer.
+ */
+
+ /* Start saving Guest context to VCPU */
+ for (i = 0; i < 32; ++i) {
+ /* Guest k0/k1 saved later */
+ if (i == K0 || i == K1)
+ continue;
+ UASM_i_SW(&p, i, offsetof(struct kvm_vcpu_arch, gprs[i]), K1);
+ }
+
+#ifndef CONFIG_CPU_MIPSR6
+ /* We need to save hi/lo and restore them on the way out */
+ uasm_i_mfhi(&p, T0);
+ UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, hi), K1);
+
+ uasm_i_mflo(&p, T0);
+ UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, lo), K1);
+#endif
+
+ /* Finally save guest k1 to VCPU */
+ uasm_i_ehb(&p);
+ UASM_i_MFC0(&p, T0, scratch_tmp[0], scratch_tmp[1]);
+ UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, gprs[K1]), K1);
+
+ /* Now that context has been saved, we can use other registers */
+
+ /* Restore vcpu */
+ UASM_i_MFC0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]);
+ uasm_i_move(&p, S1, A1);
+
+ /* Restore run (vcpu->run) */
+ UASM_i_LW(&p, A0, offsetof(struct kvm_vcpu, run), A1);
+ /* Save pointer to run in s0, will be saved by the compiler */
+ uasm_i_move(&p, S0, A0);
+
+ /*
+ * Save Host level EPC, BadVaddr and Cause to VCPU, useful to process
+ * the exception
+ */
+ UASM_i_MFC0(&p, K0, C0_EPC);
+ UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, pc), K1);
+
+ UASM_i_MFC0(&p, K0, C0_BADVADDR);
+ UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, host_cp0_badvaddr),
+ K1);
+
+ uasm_i_mfc0(&p, K0, C0_CAUSE);
+ uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch, host_cp0_cause), K1);
+
+ /* Now restore the host state just enough to run the handlers */
+
+ /* Switch EBASE to the one used by Linux */
+ /* load up the host EBASE */
+ uasm_i_mfc0(&p, V0, C0_STATUS);
+
+ uasm_i_lui(&p, AT, ST0_BEV >> 16);
+ uasm_i_or(&p, K0, V0, AT);
+
+ uasm_i_mtc0(&p, K0, C0_STATUS);
+ uasm_i_ehb(&p);
+
+ UASM_i_LA_mostly(&p, K0, (long)&ebase);
+ UASM_i_LW(&p, K0, uasm_rel_lo((long)&ebase), K0);
+ build_set_exc_base(&p, K0);
+
+ if (raw_cpu_has_fpu) {
+ /*
+ * If FPU is enabled, save FCR31 and clear it so that later
+ * ctc1's don't trigger FPE for pending exceptions.
+ */
+ uasm_i_lui(&p, AT, ST0_CU1 >> 16);
+ uasm_i_and(&p, V1, V0, AT);
+ uasm_il_beqz(&p, &r, V1, label_fpu_1);
+ uasm_i_nop(&p);
+ uasm_i_cfc1(&p, T0, 31);
+ uasm_i_sw(&p, T0, offsetof(struct kvm_vcpu_arch, fpu.fcr31),
+ K1);
+ uasm_i_ctc1(&p, ZERO, 31);
+ uasm_l_fpu_1(&l, p);
+ }
+
+ if (cpu_has_msa) {
+ /*
+ * If MSA is enabled, save MSACSR and clear it so that later
+ * instructions don't trigger MSAFPE for pending exceptions.
+ */
+ uasm_i_mfc0(&p, T0, C0_CONFIG5);
+ uasm_i_ext(&p, T0, T0, 27, 1); /* MIPS_CONF5_MSAEN */
+ uasm_il_beqz(&p, &r, T0, label_msa_1);
+ uasm_i_nop(&p);
+ uasm_i_cfcmsa(&p, T0, MSA_CSR);
+ uasm_i_sw(&p, T0, offsetof(struct kvm_vcpu_arch, fpu.msacsr),
+ K1);
+ uasm_i_ctcmsa(&p, MSA_CSR, ZERO);
+ uasm_l_msa_1(&l, p);
+ }
+
+ /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */
+ uasm_i_addiu(&p, AT, ZERO, ~(ST0_EXL | KSU_USER | ST0_IE));
+ uasm_i_and(&p, V0, V0, AT);
+ uasm_i_lui(&p, AT, ST0_CU0 >> 16);
+ uasm_i_or(&p, V0, V0, AT);
+ uasm_i_mtc0(&p, V0, C0_STATUS);
+ uasm_i_ehb(&p);
+
+ /* Load up host GP */
+ UASM_i_LW(&p, GP, offsetof(struct kvm_vcpu_arch, host_gp), K1);
+
+ /* Need a stack before we can jump to "C" */
+ UASM_i_LW(&p, SP, offsetof(struct kvm_vcpu_arch, host_stack), K1);
+
+ /* Saved host state */
+ UASM_i_ADDIU(&p, SP, SP, -(int)sizeof(struct pt_regs));
+
+ /*
+ * XXXKYMA do we need to load the host ASID, maybe not because the
+ * kernel entries are marked GLOBAL, need to verify
+ */
+
+ /* Restore host scratch registers, as we'll have clobbered them */
+ kvm_mips_build_restore_scratch(&p, K0, SP);
+
+ /* Restore RDHWR access */
+ UASM_i_LA_mostly(&p, K0, (long)&hwrena);
+ uasm_i_lw(&p, K0, uasm_rel_lo((long)&hwrena), K0);
+ uasm_i_mtc0(&p, K0, C0_HWRENA);
+
+ /* Jump to handler */
+ /*
+ * XXXKYMA: not sure if this is safe, how large is the stack??
+ * Now jump to the kvm_mips_handle_exit() to see if we can deal
+ * with this in the kernel
+ */
+ UASM_i_LA(&p, T9, (unsigned long)kvm_mips_handle_exit);
+ uasm_i_jalr(&p, RA, T9);
+ UASM_i_ADDIU(&p, SP, SP, -CALLFRAME_SIZ);
+
+ uasm_resolve_relocs(relocs, labels);
+
+ p = kvm_mips_build_ret_from_exit(p);
+
+ return p;
+}
+
+/**
+ * kvm_mips_build_ret_from_exit() - Assemble guest exit return handler.
+ * @addr: Address to start writing code.
+ *
+ * Assemble the code to handle the return from kvm_mips_handle_exit(), either
+ * resuming the guest or returning to the host depending on the return value.
+ *
+ * Returns: Next address after end of written function.
+ */
+static void *kvm_mips_build_ret_from_exit(void *addr)
+{
+ u32 *p = addr;
+ struct uasm_label labels[2];
+ struct uasm_reloc relocs[2];
+ struct uasm_label *l = labels;
+ struct uasm_reloc *r = relocs;
+
+ memset(labels, 0, sizeof(labels));
+ memset(relocs, 0, sizeof(relocs));
+
+ /* Return from handler Make sure interrupts are disabled */
+ uasm_i_di(&p, ZERO);
+ uasm_i_ehb(&p);
+
+ /*
+ * XXXKYMA: k0/k1 could have been blown away if we processed
+ * an exception while we were handling the exception from the
+ * guest, reload k1
+ */
+
+ uasm_i_move(&p, K1, S1);
+ UASM_i_ADDIU(&p, K1, K1, offsetof(struct kvm_vcpu, arch));
+
+ /*
+ * Check return value, should tell us if we are returning to the
+ * host (handle I/O etc)or resuming the guest
+ */
+ uasm_i_andi(&p, T0, V0, RESUME_HOST);
+ uasm_il_bnez(&p, &r, T0, label_return_to_host);
+ uasm_i_nop(&p);
+
+ p = kvm_mips_build_ret_to_guest(p);
+
+ uasm_l_return_to_host(&l, p);
+ p = kvm_mips_build_ret_to_host(p);
+
+ uasm_resolve_relocs(relocs, labels);
+
+ return p;
+}
+
+/**
+ * kvm_mips_build_ret_to_guest() - Assemble code to return to the guest.
+ * @addr: Address to start writing code.
+ *
+ * Assemble the code to handle return from the guest exit handler
+ * (kvm_mips_handle_exit()) back to the guest.
+ *
+ * Returns: Next address after end of written function.
+ */
+static void *kvm_mips_build_ret_to_guest(void *addr)
+{
+ u32 *p = addr;
+
+ /* Put the saved pointer to vcpu (s1) back into the scratch register */
+ UASM_i_MTC0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]);
+
+ /* Load up the Guest EBASE to minimize the window where BEV is set */
+ UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1);
+
+ /* Switch EBASE back to the one used by KVM */
+ uasm_i_mfc0(&p, V1, C0_STATUS);
+ uasm_i_lui(&p, AT, ST0_BEV >> 16);
+ uasm_i_or(&p, K0, V1, AT);
+ uasm_i_mtc0(&p, K0, C0_STATUS);
+ uasm_i_ehb(&p);
+ build_set_exc_base(&p, T0);
+
+ /* Setup status register for running guest in UM */
+ uasm_i_ori(&p, V1, V1, ST0_EXL | KSU_USER | ST0_IE);
+ UASM_i_LA(&p, AT, ~(ST0_CU0 | ST0_MX));
+ uasm_i_and(&p, V1, V1, AT);
+ uasm_i_mtc0(&p, V1, C0_STATUS);
+ uasm_i_ehb(&p);
+
+ p = kvm_mips_build_enter_guest(p);
+
+ return p;
+}
+
+/**
+ * kvm_mips_build_ret_to_host() - Assemble code to return to the host.
+ * @addr: Address to start writing code.
+ *
+ * Assemble the code to handle return from the guest exit handler
+ * (kvm_mips_handle_exit()) back to the host, i.e. to the caller of the vcpu_run
+ * function generated by kvm_mips_build_vcpu_run().
+ *
+ * Returns: Next address after end of written function.
+ */
+static void *kvm_mips_build_ret_to_host(void *addr)
+{
+ u32 *p = addr;
+ unsigned int i;
+
+ /* EBASE is already pointing to Linux */
+ UASM_i_LW(&p, K1, offsetof(struct kvm_vcpu_arch, host_stack), K1);
+ UASM_i_ADDIU(&p, K1, K1, -(int)sizeof(struct pt_regs));
+
+ /*
+ * r2/v0 is the return code, shift it down by 2 (arithmetic)
+ * to recover the err code
+ */
+ uasm_i_sra(&p, K0, V0, 2);
+ uasm_i_move(&p, V0, K0);
+
+ /* Load context saved on the host stack */
+ for (i = 16; i < 31; ++i) {
+ if (i == 24)
+ i = 28;
+ UASM_i_LW(&p, i, offsetof(struct pt_regs, regs[i]), K1);
+ }
+
+ /* Restore RDHWR access */
+ UASM_i_LA_mostly(&p, K0, (long)&hwrena);
+ uasm_i_lw(&p, K0, uasm_rel_lo((long)&hwrena), K0);
+ uasm_i_mtc0(&p, K0, C0_HWRENA);
+
+ /* Restore RA, which is the address we will return to */
+ UASM_i_LW(&p, RA, offsetof(struct pt_regs, regs[RA]), K1);
+ uasm_i_jr(&p, RA);
+ uasm_i_nop(&p);
+
+ return p;
+}
+
diff --git a/arch/mips/kvm/fpu.S b/arch/mips/kvm/fpu.S
index 531fbf5131c0..16f17c6390dd 100644
--- a/arch/mips/kvm/fpu.S
+++ b/arch/mips/kvm/fpu.S
@@ -14,13 +14,16 @@
#include <asm/mipsregs.h>
#include <asm/regdef.h>
+/* preprocessor replaces the fp in ".set fp=64" with $30 otherwise */
+#undef fp
+
.set noreorder
.set noat
LEAF(__kvm_save_fpu)
.set push
- .set mips64r2
SET_HARDFLOAT
+ .set fp=64
mfc0 t0, CP0_STATUS
sll t0, t0, 5 # is Status.FR set?
bgez t0, 1f # no: skip odd doubles
@@ -63,8 +66,8 @@ LEAF(__kvm_save_fpu)
LEAF(__kvm_restore_fpu)
.set push
- .set mips64r2
SET_HARDFLOAT
+ .set fp=64
mfc0 t0, CP0_STATUS
sll t0, t0, 5 # is Status.FR set?
bgez t0, 1f # no: skip odd doubles
diff --git a/arch/mips/kvm/interrupt.c b/arch/mips/kvm/interrupt.c
index 95f790663b0c..ad28dac6b7e9 100644
--- a/arch/mips/kvm/interrupt.c
+++ b/arch/mips/kvm/interrupt.c
@@ -22,12 +22,12 @@
#include "interrupt.h"
-void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, uint32_t priority)
+void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, unsigned int priority)
{
set_bit(priority, &vcpu->arch.pending_exceptions);
}
-void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, uint32_t priority)
+void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, unsigned int priority)
{
clear_bit(priority, &vcpu->arch.pending_exceptions);
}
@@ -114,10 +114,10 @@ void kvm_mips_dequeue_io_int_cb(struct kvm_vcpu *vcpu,
/* Deliver the interrupt of the corresponding priority, if possible. */
int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority,
- uint32_t cause)
+ u32 cause)
{
int allowed = 0;
- uint32_t exccode;
+ u32 exccode;
struct kvm_vcpu_arch *arch = &vcpu->arch;
struct mips_coproc *cop0 = vcpu->arch.cop0;
@@ -196,12 +196,12 @@ int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority,
}
int kvm_mips_irq_clear_cb(struct kvm_vcpu *vcpu, unsigned int priority,
- uint32_t cause)
+ u32 cause)
{
return 1;
}
-void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, uint32_t cause)
+void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, u32 cause)
{
unsigned long *pending = &vcpu->arch.pending_exceptions;
unsigned long *pending_clr = &vcpu->arch.pending_exceptions_clr;
diff --git a/arch/mips/kvm/interrupt.h b/arch/mips/kvm/interrupt.h
index 2143884709e4..fb118a2c8379 100644
--- a/arch/mips/kvm/interrupt.h
+++ b/arch/mips/kvm/interrupt.h
@@ -28,17 +28,13 @@
#define MIPS_EXC_MAX 12
/* XXXSL More to follow */
-extern char __kvm_mips_vcpu_run_end[];
-extern char mips32_exception[], mips32_exceptionEnd[];
-extern char mips32_GuestException[], mips32_GuestExceptionEnd[];
-
#define C_TI (_ULCAST_(1) << 30)
#define KVM_MIPS_IRQ_DELIVER_ALL_AT_ONCE (0)
#define KVM_MIPS_IRQ_CLEAR_ALL_AT_ONCE (0)
-void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, uint32_t priority);
-void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, uint32_t priority);
+void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, unsigned int priority);
+void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, unsigned int priority);
int kvm_mips_pending_timer(struct kvm_vcpu *vcpu);
void kvm_mips_queue_timer_int_cb(struct kvm_vcpu *vcpu);
@@ -48,7 +44,7 @@ void kvm_mips_queue_io_int_cb(struct kvm_vcpu *vcpu,
void kvm_mips_dequeue_io_int_cb(struct kvm_vcpu *vcpu,
struct kvm_mips_interrupt *irq);
int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority,
- uint32_t cause);
+ u32 cause);
int kvm_mips_irq_clear_cb(struct kvm_vcpu *vcpu, unsigned int priority,
- uint32_t cause);
-void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, uint32_t cause);
+ u32 cause);
+void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, u32 cause);
diff --git a/arch/mips/kvm/locore.S b/arch/mips/kvm/locore.S
deleted file mode 100644
index 828fcfc1cd7f..000000000000
--- a/arch/mips/kvm/locore.S
+++ /dev/null
@@ -1,605 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Main entry point for the guest, exception handling.
- *
- * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved.
- * Authors: Sanjay Lal <sanjayl@kymasys.com>
- */
-
-#include <asm/asm.h>
-#include <asm/asmmacro.h>
-#include <asm/regdef.h>
-#include <asm/mipsregs.h>
-#include <asm/stackframe.h>
-#include <asm/asm-offsets.h>
-
-#define _C_LABEL(x) x
-#define MIPSX(name) mips32_ ## name
-#define CALLFRAME_SIZ 32
-
-/*
- * VECTOR
- * exception vector entrypoint
- */
-#define VECTOR(x, regmask) \
- .ent _C_LABEL(x),0; \
- EXPORT(x);
-
-#define VECTOR_END(x) \
- EXPORT(x);
-
-/* Overload, Danger Will Robinson!! */
-#define PT_HOST_USERLOCAL PT_EPC
-
-#define CP0_DDATA_LO $28,3
-
-/* Resume Flags */
-#define RESUME_FLAG_HOST (1<<1) /* Resume host? */
-
-#define RESUME_GUEST 0
-#define RESUME_HOST RESUME_FLAG_HOST
-
-/*
- * __kvm_mips_vcpu_run: entry point to the guest
- * a0: run
- * a1: vcpu
- */
- .set noreorder
-
-FEXPORT(__kvm_mips_vcpu_run)
- /* k0/k1 not being used in host kernel context */
- INT_ADDIU k1, sp, -PT_SIZE
- LONG_S $16, PT_R16(k1)
- LONG_S $17, PT_R17(k1)
- LONG_S $18, PT_R18(k1)
- LONG_S $19, PT_R19(k1)
- LONG_S $20, PT_R20(k1)
- LONG_S $21, PT_R21(k1)
- LONG_S $22, PT_R22(k1)
- LONG_S $23, PT_R23(k1)
-
- LONG_S $28, PT_R28(k1)
- LONG_S $29, PT_R29(k1)
- LONG_S $30, PT_R30(k1)
- LONG_S $31, PT_R31(k1)
-
- /* Save hi/lo */
- mflo v0
- LONG_S v0, PT_LO(k1)
- mfhi v1
- LONG_S v1, PT_HI(k1)
-
- /* Save host status */
- mfc0 v0, CP0_STATUS
- LONG_S v0, PT_STATUS(k1)
-
- /* Save DDATA_LO, will be used to store pointer to vcpu */
- mfc0 v1, CP0_DDATA_LO
- LONG_S v1, PT_HOST_USERLOCAL(k1)
-
- /* DDATA_LO has pointer to vcpu */
- mtc0 a1, CP0_DDATA_LO
-
- /* Offset into vcpu->arch */
- INT_ADDIU k1, a1, VCPU_HOST_ARCH
-
- /*
- * Save the host stack to VCPU, used for exception processing
- * when we exit from the Guest
- */
- LONG_S sp, VCPU_HOST_STACK(k1)
-
- /* Save the kernel gp as well */
- LONG_S gp, VCPU_HOST_GP(k1)
-
- /*
- * Setup status register for running the guest in UM, interrupts
- * are disabled
- */
- li k0, (ST0_EXL | KSU_USER | ST0_BEV)
- mtc0 k0, CP0_STATUS
- ehb
-
- /* load up the new EBASE */
- LONG_L k0, VCPU_GUEST_EBASE(k1)
- mtc0 k0, CP0_EBASE
-
- /*
- * Now that the new EBASE has been loaded, unset BEV, set
- * interrupt mask as it was but make sure that timer interrupts
- * are enabled
- */
- li k0, (ST0_EXL | KSU_USER | ST0_IE)
- andi v0, v0, ST0_IM
- or k0, k0, v0
- mtc0 k0, CP0_STATUS
- ehb
-
- /* Set Guest EPC */
- LONG_L t0, VCPU_PC(k1)
- mtc0 t0, CP0_EPC
-
-FEXPORT(__kvm_mips_load_asid)
- /* Set the ASID for the Guest Kernel */
- PTR_L t0, VCPU_COP0(k1)
- LONG_L t0, COP0_STATUS(t0)
- andi t0, KSU_USER | ST0_ERL | ST0_EXL
- xori t0, KSU_USER
- bnez t0, 1f /* If kernel */
- INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID /* (BD) */
- INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID /* else user */
-1:
- /* t1: contains the base of the ASID array, need to get the cpu id */
- LONG_L t2, TI_CPU($28) /* smp_processor_id */
- INT_SLL t2, t2, 2 /* x4 */
- REG_ADDU t3, t1, t2
- LONG_L k0, (t3)
-#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE
- li t3, CPUINFO_SIZE/4
- mul t2, t2, t3 /* x sizeof(struct cpuinfo_mips)/4 */
- LONG_L t2, (cpu_data + CPUINFO_ASID_MASK)(t2)
- and k0, k0, t2
-#else
- andi k0, k0, MIPS_ENTRYHI_ASID
-#endif
- mtc0 k0, CP0_ENTRYHI
- ehb
-
- /* Disable RDHWR access */
- mtc0 zero, CP0_HWRENA
-
- .set noat
- /* Now load up the Guest Context from VCPU */
- LONG_L $1, VCPU_R1(k1)
- LONG_L $2, VCPU_R2(k1)
- LONG_L $3, VCPU_R3(k1)
-
- LONG_L $4, VCPU_R4(k1)
- LONG_L $5, VCPU_R5(k1)
- LONG_L $6, VCPU_R6(k1)
- LONG_L $7, VCPU_R7(k1)
-
- LONG_L $8, VCPU_R8(k1)
- LONG_L $9, VCPU_R9(k1)
- LONG_L $10, VCPU_R10(k1)
- LONG_L $11, VCPU_R11(k1)
- LONG_L $12, VCPU_R12(k1)
- LONG_L $13, VCPU_R13(k1)
- LONG_L $14, VCPU_R14(k1)
- LONG_L $15, VCPU_R15(k1)
- LONG_L $16, VCPU_R16(k1)
- LONG_L $17, VCPU_R17(k1)
- LONG_L $18, VCPU_R18(k1)
- LONG_L $19, VCPU_R19(k1)
- LONG_L $20, VCPU_R20(k1)
- LONG_L $21, VCPU_R21(k1)
- LONG_L $22, VCPU_R22(k1)
- LONG_L $23, VCPU_R23(k1)
- LONG_L $24, VCPU_R24(k1)
- LONG_L $25, VCPU_R25(k1)
-
- /* k0/k1 loaded up later */
-
- LONG_L $28, VCPU_R28(k1)
- LONG_L $29, VCPU_R29(k1)
- LONG_L $30, VCPU_R30(k1)
- LONG_L $31, VCPU_R31(k1)
-
- /* Restore hi/lo */
- LONG_L k0, VCPU_LO(k1)
- mtlo k0
-
- LONG_L k0, VCPU_HI(k1)
- mthi k0
-
-FEXPORT(__kvm_mips_load_k0k1)
- /* Restore the guest's k0/k1 registers */
- LONG_L k0, VCPU_R26(k1)
- LONG_L k1, VCPU_R27(k1)
-
- /* Jump to guest */
- eret
-EXPORT(__kvm_mips_vcpu_run_end)
-
-VECTOR(MIPSX(exception), unknown)
-/* Find out what mode we came from and jump to the proper handler. */
- mtc0 k0, CP0_ERROREPC #01: Save guest k0
- ehb #02:
-
- mfc0 k0, CP0_EBASE #02: Get EBASE
- INT_SRL k0, k0, 10 #03: Get rid of CPUNum
- INT_SLL k0, k0, 10 #04
- LONG_S k1, 0x3000(k0) #05: Save k1 @ offset 0x3000
- INT_ADDIU k0, k0, 0x2000 #06: Exception handler is
- # installed @ offset 0x2000
- j k0 #07: jump to the function
- nop #08: branch delay slot
-VECTOR_END(MIPSX(exceptionEnd))
-.end MIPSX(exception)
-
-/*
- * Generic Guest exception handler. We end up here when the guest
- * does something that causes a trap to kernel mode.
- */
-NESTED (MIPSX(GuestException), CALLFRAME_SIZ, ra)
- /* Get the VCPU pointer from DDTATA_LO */
- mfc0 k1, CP0_DDATA_LO
- INT_ADDIU k1, k1, VCPU_HOST_ARCH
-
- /* Start saving Guest context to VCPU */
- LONG_S $0, VCPU_R0(k1)
- LONG_S $1, VCPU_R1(k1)
- LONG_S $2, VCPU_R2(k1)
- LONG_S $3, VCPU_R3(k1)
- LONG_S $4, VCPU_R4(k1)
- LONG_S $5, VCPU_R5(k1)
- LONG_S $6, VCPU_R6(k1)
- LONG_S $7, VCPU_R7(k1)
- LONG_S $8, VCPU_R8(k1)
- LONG_S $9, VCPU_R9(k1)
- LONG_S $10, VCPU_R10(k1)
- LONG_S $11, VCPU_R11(k1)
- LONG_S $12, VCPU_R12(k1)
- LONG_S $13, VCPU_R13(k1)
- LONG_S $14, VCPU_R14(k1)
- LONG_S $15, VCPU_R15(k1)
- LONG_S $16, VCPU_R16(k1)
- LONG_S $17, VCPU_R17(k1)
- LONG_S $18, VCPU_R18(k1)
- LONG_S $19, VCPU_R19(k1)
- LONG_S $20, VCPU_R20(k1)
- LONG_S $21, VCPU_R21(k1)
- LONG_S $22, VCPU_R22(k1)
- LONG_S $23, VCPU_R23(k1)
- LONG_S $24, VCPU_R24(k1)
- LONG_S $25, VCPU_R25(k1)
-
- /* Guest k0/k1 saved later */
-
- LONG_S $28, VCPU_R28(k1)
- LONG_S $29, VCPU_R29(k1)
- LONG_S $30, VCPU_R30(k1)
- LONG_S $31, VCPU_R31(k1)
-
- .set at
-
- /* We need to save hi/lo and restore them on the way out */
- mfhi t0
- LONG_S t0, VCPU_HI(k1)
-
- mflo t0
- LONG_S t0, VCPU_LO(k1)
-
- /* Finally save guest k0/k1 to VCPU */
- mfc0 t0, CP0_ERROREPC
- LONG_S t0, VCPU_R26(k1)
-
- /* Get GUEST k1 and save it in VCPU */
- PTR_LI t1, ~0x2ff
- mfc0 t0, CP0_EBASE
- and t0, t0, t1
- LONG_L t0, 0x3000(t0)
- LONG_S t0, VCPU_R27(k1)
-
- /* Now that context has been saved, we can use other registers */
-
- /* Restore vcpu */
- mfc0 a1, CP0_DDATA_LO
- move s1, a1
-
- /* Restore run (vcpu->run) */
- LONG_L a0, VCPU_RUN(a1)
- /* Save pointer to run in s0, will be saved by the compiler */
- move s0, a0
-
- /*
- * Save Host level EPC, BadVaddr and Cause to VCPU, useful to
- * process the exception
- */
- mfc0 k0,CP0_EPC
- LONG_S k0, VCPU_PC(k1)
-
- mfc0 k0, CP0_BADVADDR
- LONG_S k0, VCPU_HOST_CP0_BADVADDR(k1)
-
- mfc0 k0, CP0_CAUSE
- LONG_S k0, VCPU_HOST_CP0_CAUSE(k1)
-
- mfc0 k0, CP0_ENTRYHI
- LONG_S k0, VCPU_HOST_ENTRYHI(k1)
-
- /* Now restore the host state just enough to run the handlers */
-
- /* Switch EBASE to the one used by Linux */
- /* load up the host EBASE */
- mfc0 v0, CP0_STATUS
-
- or k0, v0, ST0_BEV
-
- mtc0 k0, CP0_STATUS
- ehb
-
- LONG_L k0, VCPU_HOST_EBASE(k1)
- mtc0 k0,CP0_EBASE
-
- /*
- * If FPU is enabled, save FCR31 and clear it so that later ctc1's don't
- * trigger FPE for pending exceptions.
- */
- and v1, v0, ST0_CU1
- beqz v1, 1f
- nop
- .set push
- SET_HARDFLOAT
- cfc1 t0, fcr31
- sw t0, VCPU_FCR31(k1)
- ctc1 zero,fcr31
- .set pop
-1:
-
-#ifdef CONFIG_CPU_HAS_MSA
- /*
- * If MSA is enabled, save MSACSR and clear it so that later
- * instructions don't trigger MSAFPE for pending exceptions.
- */
- mfc0 t0, CP0_CONFIG3
- ext t0, t0, 28, 1 /* MIPS_CONF3_MSAP */
- beqz t0, 1f
- nop
- mfc0 t0, CP0_CONFIG5
- ext t0, t0, 27, 1 /* MIPS_CONF5_MSAEN */
- beqz t0, 1f
- nop
- _cfcmsa t0, MSA_CSR
- sw t0, VCPU_MSA_CSR(k1)
- _ctcmsa MSA_CSR, zero
-1:
-#endif
-
- /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */
- and v0, v0, ~(ST0_EXL | KSU_USER | ST0_IE)
- or v0, v0, ST0_CU0
- mtc0 v0, CP0_STATUS
- ehb
-
- /* Load up host GP */
- LONG_L gp, VCPU_HOST_GP(k1)
-
- /* Need a stack before we can jump to "C" */
- LONG_L sp, VCPU_HOST_STACK(k1)
-
- /* Saved host state */
- INT_ADDIU sp, sp, -PT_SIZE
-
- /*
- * XXXKYMA do we need to load the host ASID, maybe not because the
- * kernel entries are marked GLOBAL, need to verify
- */
-
- /* Restore host DDATA_LO */
- LONG_L k0, PT_HOST_USERLOCAL(sp)
- mtc0 k0, CP0_DDATA_LO
-
- /* Restore RDHWR access */
- PTR_LI k0, 0x2000000F
- mtc0 k0, CP0_HWRENA
-
- /* Jump to handler */
-FEXPORT(__kvm_mips_jump_to_handler)
- /*
- * XXXKYMA: not sure if this is safe, how large is the stack??
- * Now jump to the kvm_mips_handle_exit() to see if we can deal
- * with this in the kernel
- */
- PTR_LA t9, kvm_mips_handle_exit
- jalr.hb t9
- INT_ADDIU sp, sp, -CALLFRAME_SIZ /* BD Slot */
-
- /* Return from handler Make sure interrupts are disabled */
- di
- ehb
-
- /*
- * XXXKYMA: k0/k1 could have been blown away if we processed
- * an exception while we were handling the exception from the
- * guest, reload k1
- */
-
- move k1, s1
- INT_ADDIU k1, k1, VCPU_HOST_ARCH
-
- /*
- * Check return value, should tell us if we are returning to the
- * host (handle I/O etc)or resuming the guest
- */
- andi t0, v0, RESUME_HOST
- bnez t0, __kvm_mips_return_to_host
- nop
-
-__kvm_mips_return_to_guest:
- /* Put the saved pointer to vcpu (s1) back into the DDATA_LO Register */
- mtc0 s1, CP0_DDATA_LO
-
- /* Load up the Guest EBASE to minimize the window where BEV is set */
- LONG_L t0, VCPU_GUEST_EBASE(k1)
-
- /* Switch EBASE back to the one used by KVM */
- mfc0 v1, CP0_STATUS
- or k0, v1, ST0_BEV
- mtc0 k0, CP0_STATUS
- ehb
- mtc0 t0, CP0_EBASE
-
- /* Setup status register for running guest in UM */
- or v1, v1, (ST0_EXL | KSU_USER | ST0_IE)
- and v1, v1, ~(ST0_CU0 | ST0_MX)
- mtc0 v1, CP0_STATUS
- ehb
-
- /* Set Guest EPC */
- LONG_L t0, VCPU_PC(k1)
- mtc0 t0, CP0_EPC
-
- /* Set the ASID for the Guest Kernel */
- PTR_L t0, VCPU_COP0(k1)
- LONG_L t0, COP0_STATUS(t0)
- andi t0, KSU_USER | ST0_ERL | ST0_EXL
- xori t0, KSU_USER
- bnez t0, 1f /* If kernel */
- INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID /* (BD) */
- INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID /* else user */
-1:
- /* t1: contains the base of the ASID array, need to get the cpu id */
- LONG_L t2, TI_CPU($28) /* smp_processor_id */
- INT_SLL t2, t2, 2 /* x4 */
- REG_ADDU t3, t1, t2
- LONG_L k0, (t3)
-#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE
- li t3, CPUINFO_SIZE/4
- mul t2, t2, t3 /* x sizeof(struct cpuinfo_mips)/4 */
- LONG_L t2, (cpu_data + CPUINFO_ASID_MASK)(t2)
- and k0, k0, t2
-#else
- andi k0, k0, MIPS_ENTRYHI_ASID
-#endif
- mtc0 k0, CP0_ENTRYHI
- ehb
-
- /* Disable RDHWR access */
- mtc0 zero, CP0_HWRENA
-
- .set noat
- /* load the guest context from VCPU and return */
- LONG_L $0, VCPU_R0(k1)
- LONG_L $1, VCPU_R1(k1)
- LONG_L $2, VCPU_R2(k1)
- LONG_L $3, VCPU_R3(k1)
- LONG_L $4, VCPU_R4(k1)
- LONG_L $5, VCPU_R5(k1)
- LONG_L $6, VCPU_R6(k1)
- LONG_L $7, VCPU_R7(k1)
- LONG_L $8, VCPU_R8(k1)
- LONG_L $9, VCPU_R9(k1)
- LONG_L $10, VCPU_R10(k1)
- LONG_L $11, VCPU_R11(k1)
- LONG_L $12, VCPU_R12(k1)
- LONG_L $13, VCPU_R13(k1)
- LONG_L $14, VCPU_R14(k1)
- LONG_L $15, VCPU_R15(k1)
- LONG_L $16, VCPU_R16(k1)
- LONG_L $17, VCPU_R17(k1)
- LONG_L $18, VCPU_R18(k1)
- LONG_L $19, VCPU_R19(k1)
- LONG_L $20, VCPU_R20(k1)
- LONG_L $21, VCPU_R21(k1)
- LONG_L $22, VCPU_R22(k1)
- LONG_L $23, VCPU_R23(k1)
- LONG_L $24, VCPU_R24(k1)
- LONG_L $25, VCPU_R25(k1)
-
- /* $/k1 loaded later */
- LONG_L $28, VCPU_R28(k1)
- LONG_L $29, VCPU_R29(k1)
- LONG_L $30, VCPU_R30(k1)
- LONG_L $31, VCPU_R31(k1)
-
-FEXPORT(__kvm_mips_skip_guest_restore)
- LONG_L k0, VCPU_HI(k1)
- mthi k0
-
- LONG_L k0, VCPU_LO(k1)
- mtlo k0
-
- LONG_L k0, VCPU_R26(k1)
- LONG_L k1, VCPU_R27(k1)
-
- eret
- .set at
-
-__kvm_mips_return_to_host:
- /* EBASE is already pointing to Linux */
- LONG_L k1, VCPU_HOST_STACK(k1)
- INT_ADDIU k1,k1, -PT_SIZE
-
- /* Restore host DDATA_LO */
- LONG_L k0, PT_HOST_USERLOCAL(k1)
- mtc0 k0, CP0_DDATA_LO
-
- /*
- * r2/v0 is the return code, shift it down by 2 (arithmetic)
- * to recover the err code
- */
- INT_SRA k0, v0, 2
- move $2, k0
-
- /* Load context saved on the host stack */
- LONG_L $16, PT_R16(k1)
- LONG_L $17, PT_R17(k1)
- LONG_L $18, PT_R18(k1)
- LONG_L $19, PT_R19(k1)
- LONG_L $20, PT_R20(k1)
- LONG_L $21, PT_R21(k1)
- LONG_L $22, PT_R22(k1)
- LONG_L $23, PT_R23(k1)
-
- LONG_L $28, PT_R28(k1)
- LONG_L $29, PT_R29(k1)
- LONG_L $30, PT_R30(k1)
-
- LONG_L k0, PT_HI(k1)
- mthi k0
-
- LONG_L k0, PT_LO(k1)
- mtlo k0
-
- /* Restore RDHWR access */
- PTR_LI k0, 0x2000000F
- mtc0 k0, CP0_HWRENA
-
- /* Restore RA, which is the address we will return to */
- LONG_L ra, PT_R31(k1)
- j ra
- nop
-
-VECTOR_END(MIPSX(GuestExceptionEnd))
-.end MIPSX(GuestException)
-
-MIPSX(exceptions):
- ####
- ##### The exception handlers.
- #####
- .word _C_LABEL(MIPSX(GuestException)) # 0
- .word _C_LABEL(MIPSX(GuestException)) # 1
- .word _C_LABEL(MIPSX(GuestException)) # 2
- .word _C_LABEL(MIPSX(GuestException)) # 3
- .word _C_LABEL(MIPSX(GuestException)) # 4
- .word _C_LABEL(MIPSX(GuestException)) # 5
- .word _C_LABEL(MIPSX(GuestException)) # 6
- .word _C_LABEL(MIPSX(GuestException)) # 7
- .word _C_LABEL(MIPSX(GuestException)) # 8
- .word _C_LABEL(MIPSX(GuestException)) # 9
- .word _C_LABEL(MIPSX(GuestException)) # 10
- .word _C_LABEL(MIPSX(GuestException)) # 11
- .word _C_LABEL(MIPSX(GuestException)) # 12
- .word _C_LABEL(MIPSX(GuestException)) # 13
- .word _C_LABEL(MIPSX(GuestException)) # 14
- .word _C_LABEL(MIPSX(GuestException)) # 15
- .word _C_LABEL(MIPSX(GuestException)) # 16
- .word _C_LABEL(MIPSX(GuestException)) # 17
- .word _C_LABEL(MIPSX(GuestException)) # 18
- .word _C_LABEL(MIPSX(GuestException)) # 19
- .word _C_LABEL(MIPSX(GuestException)) # 20
- .word _C_LABEL(MIPSX(GuestException)) # 21
- .word _C_LABEL(MIPSX(GuestException)) # 22
- .word _C_LABEL(MIPSX(GuestException)) # 23
- .word _C_LABEL(MIPSX(GuestException)) # 24
- .word _C_LABEL(MIPSX(GuestException)) # 25
- .word _C_LABEL(MIPSX(GuestException)) # 26
- .word _C_LABEL(MIPSX(GuestException)) # 27
- .word _C_LABEL(MIPSX(GuestException)) # 28
- .word _C_LABEL(MIPSX(GuestException)) # 29
- .word _C_LABEL(MIPSX(GuestException)) # 30
- .word _C_LABEL(MIPSX(GuestException)) # 31
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 44da5259f390..a6ea084b4d9d 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -9,6 +9,7 @@
* Authors: Sanjay Lal <sanjayl@kymasys.com>
*/
+#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/kdebug.h>
@@ -147,7 +148,7 @@ void kvm_mips_free_vcpus(struct kvm *kvm)
/* Put the pages we reserved for the guest pmap */
for (i = 0; i < kvm->arch.guest_pmap_npages; i++) {
if (kvm->arch.guest_pmap[i] != KVM_INVALID_PAGE)
- kvm_mips_release_pfn_clean(kvm->arch.guest_pmap[i]);
+ kvm_release_pfn_clean(kvm->arch.guest_pmap[i]);
}
kfree(kvm->arch.guest_pmap);
@@ -244,10 +245,27 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
}
}
+static inline void dump_handler(const char *symbol, void *start, void *end)
+{
+ u32 *p;
+
+ pr_debug("LEAF(%s)\n", symbol);
+
+ pr_debug("\t.set push\n");
+ pr_debug("\t.set noreorder\n");
+
+ for (p = start; p < (u32 *)end; ++p)
+ pr_debug("\t.word\t0x%08x\t\t# %p\n", *p, p);
+
+ pr_debug("\t.set\tpop\n");
+
+ pr_debug("\tEND(%s)\n", symbol);
+}
+
struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
{
- int err, size, offset;
- void *gebase;
+ int err, size;
+ void *gebase, *p, *handler;
int i;
struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
@@ -273,9 +291,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
else
size = 0x4000;
- /* Save Linux EBASE */
- vcpu->arch.host_ebase = (void *)read_c0_ebase();
-
gebase = kzalloc(ALIGN(size, PAGE_SIZE), GFP_KERNEL);
if (!gebase) {
@@ -285,44 +300,53 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
kvm_debug("Allocated %d bytes for KVM Exception Handlers @ %p\n",
ALIGN(size, PAGE_SIZE), gebase);
+ /*
+ * Check new ebase actually fits in CP0_EBase. The lack of a write gate
+ * limits us to the low 512MB of physical address space. If the memory
+ * we allocate is out of range, just give up now.
+ */
+ if (!cpu_has_ebase_wg && virt_to_phys(gebase) >= 0x20000000) {
+ kvm_err("CP0_EBase.WG required for guest exception base %pK\n",
+ gebase);
+ err = -ENOMEM;
+ goto out_free_gebase;
+ }
+
/* Save new ebase */
vcpu->arch.guest_ebase = gebase;
- /* Copy L1 Guest Exception handler to correct offset */
+ /* Build guest exception vectors dynamically in unmapped memory */
+ handler = gebase + 0x2000;
/* TLB Refill, EXL = 0 */
- memcpy(gebase, mips32_exception,
- mips32_exceptionEnd - mips32_exception);
+ kvm_mips_build_exception(gebase, handler);
/* General Exception Entry point */
- memcpy(gebase + 0x180, mips32_exception,
- mips32_exceptionEnd - mips32_exception);
+ kvm_mips_build_exception(gebase + 0x180, handler);
/* For vectored interrupts poke the exception code @ all offsets 0-7 */
for (i = 0; i < 8; i++) {
kvm_debug("L1 Vectored handler @ %p\n",
gebase + 0x200 + (i * VECTORSPACING));
- memcpy(gebase + 0x200 + (i * VECTORSPACING), mips32_exception,
- mips32_exceptionEnd - mips32_exception);
+ kvm_mips_build_exception(gebase + 0x200 + i * VECTORSPACING,
+ handler);
}
- /* General handler, relocate to unmapped space for sanity's sake */
- offset = 0x2000;
- kvm_debug("Installing KVM Exception handlers @ %p, %#x bytes\n",
- gebase + offset,
- mips32_GuestExceptionEnd - mips32_GuestException);
+ /* General exit handler */
+ p = handler;
+ p = kvm_mips_build_exit(p);
- memcpy(gebase + offset, mips32_GuestException,
- mips32_GuestExceptionEnd - mips32_GuestException);
+ /* Guest entry routine */
+ vcpu->arch.vcpu_run = p;
+ p = kvm_mips_build_vcpu_run(p);
-#ifdef MODULE
- offset += mips32_GuestExceptionEnd - mips32_GuestException;
- memcpy(gebase + offset, (char *)__kvm_mips_vcpu_run,
- __kvm_mips_vcpu_run_end - (char *)__kvm_mips_vcpu_run);
- vcpu->arch.vcpu_run = gebase + offset;
-#else
- vcpu->arch.vcpu_run = __kvm_mips_vcpu_run;
-#endif
+ /* Dump the generated code */
+ pr_debug("#include <asm/asm.h>\n");
+ pr_debug("#include <asm/regdef.h>\n");
+ pr_debug("\n");
+ dump_handler("kvm_vcpu_run", vcpu->arch.vcpu_run, p);
+ dump_handler("kvm_gen_exc", gebase + 0x180, gebase + 0x200);
+ dump_handler("kvm_exit", gebase + 0x2000, vcpu->arch.vcpu_run);
/* Invalidate the icache for these ranges */
local_flush_icache_range((unsigned long)gebase,
@@ -408,17 +432,19 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
kvm_mips_deliver_interrupts(vcpu,
kvm_read_c0_guest_cause(vcpu->arch.cop0));
- __kvm_guest_enter();
+ guest_enter_irqoff();
/* Disable hardware page table walking while in guest */
htw_stop();
+ trace_kvm_enter(vcpu);
r = vcpu->arch.vcpu_run(run, vcpu);
+ trace_kvm_out(vcpu);
/* Re-enable HTW before enabling interrupts */
htw_start();
- __kvm_guest_exit();
+ guest_exit_irqoff();
local_irq_enable();
if (vcpu->sigset_active)
@@ -507,8 +533,10 @@ static u64 kvm_mips_get_one_regs[] = {
KVM_REG_MIPS_R30,
KVM_REG_MIPS_R31,
+#ifndef CONFIG_CPU_MIPSR6
KVM_REG_MIPS_HI,
KVM_REG_MIPS_LO,
+#endif
KVM_REG_MIPS_PC,
KVM_REG_MIPS_CP0_INDEX,
@@ -539,6 +567,104 @@ static u64 kvm_mips_get_one_regs[] = {
KVM_REG_MIPS_COUNT_HZ,
};
+static u64 kvm_mips_get_one_regs_fpu[] = {
+ KVM_REG_MIPS_FCR_IR,
+ KVM_REG_MIPS_FCR_CSR,
+};
+
+static u64 kvm_mips_get_one_regs_msa[] = {
+ KVM_REG_MIPS_MSA_IR,
+ KVM_REG_MIPS_MSA_CSR,
+};
+
+static u64 kvm_mips_get_one_regs_kscratch[] = {
+ KVM_REG_MIPS_CP0_KSCRATCH1,
+ KVM_REG_MIPS_CP0_KSCRATCH2,
+ KVM_REG_MIPS_CP0_KSCRATCH3,
+ KVM_REG_MIPS_CP0_KSCRATCH4,
+ KVM_REG_MIPS_CP0_KSCRATCH5,
+ KVM_REG_MIPS_CP0_KSCRATCH6,
+};
+
+static unsigned long kvm_mips_num_regs(struct kvm_vcpu *vcpu)
+{
+ unsigned long ret;
+
+ ret = ARRAY_SIZE(kvm_mips_get_one_regs);
+ if (kvm_mips_guest_can_have_fpu(&vcpu->arch)) {
+ ret += ARRAY_SIZE(kvm_mips_get_one_regs_fpu) + 48;
+ /* odd doubles */
+ if (boot_cpu_data.fpu_id & MIPS_FPIR_F64)
+ ret += 16;
+ }
+ if (kvm_mips_guest_can_have_msa(&vcpu->arch))
+ ret += ARRAY_SIZE(kvm_mips_get_one_regs_msa) + 32;
+ ret += __arch_hweight8(vcpu->arch.kscratch_enabled);
+ ret += kvm_mips_callbacks->num_regs(vcpu);
+
+ return ret;
+}
+
+static int kvm_mips_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices)
+{
+ u64 index;
+ unsigned int i;
+
+ if (copy_to_user(indices, kvm_mips_get_one_regs,
+ sizeof(kvm_mips_get_one_regs)))
+ return -EFAULT;
+ indices += ARRAY_SIZE(kvm_mips_get_one_regs);
+
+ if (kvm_mips_guest_can_have_fpu(&vcpu->arch)) {
+ if (copy_to_user(indices, kvm_mips_get_one_regs_fpu,
+ sizeof(kvm_mips_get_one_regs_fpu)))
+ return -EFAULT;
+ indices += ARRAY_SIZE(kvm_mips_get_one_regs_fpu);
+
+ for (i = 0; i < 32; ++i) {
+ index = KVM_REG_MIPS_FPR_32(i);
+ if (copy_to_user(indices, &index, sizeof(index)))
+ return -EFAULT;
+ ++indices;
+
+ /* skip odd doubles if no F64 */
+ if (i & 1 && !(boot_cpu_data.fpu_id & MIPS_FPIR_F64))
+ continue;
+
+ index = KVM_REG_MIPS_FPR_64(i);
+ if (copy_to_user(indices, &index, sizeof(index)))
+ return -EFAULT;
+ ++indices;
+ }
+ }
+
+ if (kvm_mips_guest_can_have_msa(&vcpu->arch)) {
+ if (copy_to_user(indices, kvm_mips_get_one_regs_msa,
+ sizeof(kvm_mips_get_one_regs_msa)))
+ return -EFAULT;
+ indices += ARRAY_SIZE(kvm_mips_get_one_regs_msa);
+
+ for (i = 0; i < 32; ++i) {
+ index = KVM_REG_MIPS_VEC_128(i);
+ if (copy_to_user(indices, &index, sizeof(index)))
+ return -EFAULT;
+ ++indices;
+ }
+ }
+
+ for (i = 0; i < 6; ++i) {
+ if (!(vcpu->arch.kscratch_enabled & BIT(i + 2)))
+ continue;
+
+ if (copy_to_user(indices, &kvm_mips_get_one_regs_kscratch[i],
+ sizeof(kvm_mips_get_one_regs_kscratch[i])))
+ return -EFAULT;
+ ++indices;
+ }
+
+ return kvm_mips_callbacks->copy_reg_indices(vcpu, indices);
+}
+
static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
const struct kvm_one_reg *reg)
{
@@ -554,12 +680,14 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
case KVM_REG_MIPS_R0 ... KVM_REG_MIPS_R31:
v = (long)vcpu->arch.gprs[reg->id - KVM_REG_MIPS_R0];
break;
+#ifndef CONFIG_CPU_MIPSR6
case KVM_REG_MIPS_HI:
v = (long)vcpu->arch.hi;
break;
case KVM_REG_MIPS_LO:
v = (long)vcpu->arch.lo;
break;
+#endif
case KVM_REG_MIPS_PC:
v = (long)vcpu->arch.pc;
break;
@@ -688,17 +816,37 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
case KVM_REG_MIPS_CP0_ERROREPC:
v = (long)kvm_read_c0_guest_errorepc(cop0);
break;
+ case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6:
+ idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2;
+ if (!(vcpu->arch.kscratch_enabled & BIT(idx)))
+ return -EINVAL;
+ switch (idx) {
+ case 2:
+ v = (long)kvm_read_c0_guest_kscratch1(cop0);
+ break;
+ case 3:
+ v = (long)kvm_read_c0_guest_kscratch2(cop0);
+ break;
+ case 4:
+ v = (long)kvm_read_c0_guest_kscratch3(cop0);
+ break;
+ case 5:
+ v = (long)kvm_read_c0_guest_kscratch4(cop0);
+ break;
+ case 6:
+ v = (long)kvm_read_c0_guest_kscratch5(cop0);
+ break;
+ case 7:
+ v = (long)kvm_read_c0_guest_kscratch6(cop0);
+ break;
+ }
+ break;
/* registers to be handled specially */
- case KVM_REG_MIPS_CP0_COUNT:
- case KVM_REG_MIPS_COUNT_CTL:
- case KVM_REG_MIPS_COUNT_RESUME:
- case KVM_REG_MIPS_COUNT_HZ:
+ default:
ret = kvm_mips_callbacks->get_one_reg(vcpu, reg, &v);
if (ret)
return ret;
break;
- default:
- return -EINVAL;
}
if ((reg->id & KVM_REG_SIZE_MASK) == KVM_REG_SIZE_U64) {
u64 __user *uaddr64 = (u64 __user *)(long)reg->addr;
@@ -755,12 +903,14 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu,
case KVM_REG_MIPS_R1 ... KVM_REG_MIPS_R31:
vcpu->arch.gprs[reg->id - KVM_REG_MIPS_R0] = v;
break;
+#ifndef CONFIG_CPU_MIPSR6
case KVM_REG_MIPS_HI:
vcpu->arch.hi = v;
break;
case KVM_REG_MIPS_LO:
vcpu->arch.lo = v;
break;
+#endif
case KVM_REG_MIPS_PC:
vcpu->arch.pc = v;
break;
@@ -859,22 +1009,34 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu,
case KVM_REG_MIPS_CP0_ERROREPC:
kvm_write_c0_guest_errorepc(cop0, v);
break;
+ case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6:
+ idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2;
+ if (!(vcpu->arch.kscratch_enabled & BIT(idx)))
+ return -EINVAL;
+ switch (idx) {
+ case 2:
+ kvm_write_c0_guest_kscratch1(cop0, v);
+ break;
+ case 3:
+ kvm_write_c0_guest_kscratch2(cop0, v);
+ break;
+ case 4:
+ kvm_write_c0_guest_kscratch3(cop0, v);
+ break;
+ case 5:
+ kvm_write_c0_guest_kscratch4(cop0, v);
+ break;
+ case 6:
+ kvm_write_c0_guest_kscratch5(cop0, v);
+ break;
+ case 7:
+ kvm_write_c0_guest_kscratch6(cop0, v);
+ break;
+ }
+ break;
/* registers to be handled specially */
- case KVM_REG_MIPS_CP0_COUNT:
- case KVM_REG_MIPS_CP0_COMPARE:
- case KVM_REG_MIPS_CP0_CAUSE:
- case KVM_REG_MIPS_CP0_CONFIG:
- case KVM_REG_MIPS_CP0_CONFIG1:
- case KVM_REG_MIPS_CP0_CONFIG2:
- case KVM_REG_MIPS_CP0_CONFIG3:
- case KVM_REG_MIPS_CP0_CONFIG4:
- case KVM_REG_MIPS_CP0_CONFIG5:
- case KVM_REG_MIPS_COUNT_CTL:
- case KVM_REG_MIPS_COUNT_RESUME:
- case KVM_REG_MIPS_COUNT_HZ:
- return kvm_mips_callbacks->set_one_reg(vcpu, reg, v);
default:
- return -EINVAL;
+ return kvm_mips_callbacks->set_one_reg(vcpu, reg, v);
}
return 0;
}
@@ -927,23 +1089,18 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl,
}
case KVM_GET_REG_LIST: {
struct kvm_reg_list __user *user_list = argp;
- u64 __user *reg_dest;
struct kvm_reg_list reg_list;
unsigned n;
if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
return -EFAULT;
n = reg_list.n;
- reg_list.n = ARRAY_SIZE(kvm_mips_get_one_regs);
+ reg_list.n = kvm_mips_num_regs(vcpu);
if (copy_to_user(user_list, &reg_list, sizeof(reg_list)))
return -EFAULT;
if (n < reg_list.n)
return -E2BIG;
- reg_dest = user_list->reg;
- if (copy_to_user(reg_dest, kvm_mips_get_one_regs,
- sizeof(kvm_mips_get_one_regs)))
- return -EFAULT;
- return 0;
+ return kvm_mips_copy_reg_indices(vcpu, user_list->reg);
}
case KVM_NMI:
/* Treat the NMI as a CPU reset */
@@ -1222,7 +1379,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
static void kvm_mips_set_c0_status(void)
{
- uint32_t status = read_c0_status();
+ u32 status = read_c0_status();
if (cpu_has_dsp)
status |= (ST0_MX);
@@ -1236,9 +1393,9 @@ static void kvm_mips_set_c0_status(void)
*/
int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
{
- uint32_t cause = vcpu->arch.host_cp0_cause;
- uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
- uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+ u32 cause = vcpu->arch.host_cp0_cause;
+ u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
+ u32 __user *opc = (u32 __user *) vcpu->arch.pc;
unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
@@ -1260,6 +1417,7 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
kvm_debug("kvm_mips_handle_exit: cause: %#x, PC: %p, kvm_run: %p, kvm_vcpu: %p\n",
cause, opc, run, vcpu);
+ trace_kvm_exit(vcpu, exccode);
/*
* Do a privilege check, if in UM most of these exit conditions end up
@@ -1279,7 +1437,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
kvm_debug("[%d]EXCCODE_INT @ %p\n", vcpu->vcpu_id, opc);
++vcpu->stat.int_exits;
- trace_kvm_exit(vcpu, INT_EXITS);
if (need_resched())
cond_resched();
@@ -1291,7 +1448,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
kvm_debug("EXCCODE_CPU: @ PC: %p\n", opc);
++vcpu->stat.cop_unusable_exits;
- trace_kvm_exit(vcpu, COP_UNUSABLE_EXITS);
ret = kvm_mips_callbacks->handle_cop_unusable(vcpu);
/* XXXKYMA: Might need to return to user space */
if (run->exit_reason == KVM_EXIT_IRQ_WINDOW_OPEN)
@@ -1300,7 +1456,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
case EXCCODE_MOD:
++vcpu->stat.tlbmod_exits;
- trace_kvm_exit(vcpu, TLBMOD_EXITS);
ret = kvm_mips_callbacks->handle_tlb_mod(vcpu);
break;
@@ -1310,7 +1465,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
badvaddr);
++vcpu->stat.tlbmiss_st_exits;
- trace_kvm_exit(vcpu, TLBMISS_ST_EXITS);
ret = kvm_mips_callbacks->handle_tlb_st_miss(vcpu);
break;
@@ -1319,61 +1473,51 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
cause, opc, badvaddr);
++vcpu->stat.tlbmiss_ld_exits;
- trace_kvm_exit(vcpu, TLBMISS_LD_EXITS);
ret = kvm_mips_callbacks->handle_tlb_ld_miss(vcpu);
break;
case EXCCODE_ADES:
++vcpu->stat.addrerr_st_exits;
- trace_kvm_exit(vcpu, ADDRERR_ST_EXITS);
ret = kvm_mips_callbacks->handle_addr_err_st(vcpu);
break;
case EXCCODE_ADEL:
++vcpu->stat.addrerr_ld_exits;
- trace_kvm_exit(vcpu, ADDRERR_LD_EXITS);
ret = kvm_mips_callbacks->handle_addr_err_ld(vcpu);
break;
case EXCCODE_SYS:
++vcpu->stat.syscall_exits;
- trace_kvm_exit(vcpu, SYSCALL_EXITS);
ret = kvm_mips_callbacks->handle_syscall(vcpu);
break;
case EXCCODE_RI:
++vcpu->stat.resvd_inst_exits;
- trace_kvm_exit(vcpu, RESVD_INST_EXITS);
ret = kvm_mips_callbacks->handle_res_inst(vcpu);
break;
case EXCCODE_BP:
++vcpu->stat.break_inst_exits;
- trace_kvm_exit(vcpu, BREAK_INST_EXITS);
ret = kvm_mips_callbacks->handle_break(vcpu);
break;
case EXCCODE_TR:
++vcpu->stat.trap_inst_exits;
- trace_kvm_exit(vcpu, TRAP_INST_EXITS);
ret = kvm_mips_callbacks->handle_trap(vcpu);
break;
case EXCCODE_MSAFPE:
++vcpu->stat.msa_fpe_exits;
- trace_kvm_exit(vcpu, MSA_FPE_EXITS);
ret = kvm_mips_callbacks->handle_msa_fpe(vcpu);
break;
case EXCCODE_FPE:
++vcpu->stat.fpe_exits;
- trace_kvm_exit(vcpu, FPE_EXITS);
ret = kvm_mips_callbacks->handle_fpe(vcpu);
break;
case EXCCODE_MSADIS:
++vcpu->stat.msa_disabled_exits;
- trace_kvm_exit(vcpu, MSA_DISABLED_EXITS);
ret = kvm_mips_callbacks->handle_msa_disabled(vcpu);
break;
@@ -1400,11 +1544,13 @@ skip_emul:
run->exit_reason = KVM_EXIT_INTR;
ret = (-EINTR << 2) | RESUME_HOST;
++vcpu->stat.signal_exits;
- trace_kvm_exit(vcpu, SIGNAL_EXITS);
+ trace_kvm_exit(vcpu, KVM_TRACE_EXIT_SIGNAL);
}
}
if (ret == RESUME_GUEST) {
+ trace_kvm_reenter(vcpu);
+
/*
* If FPU / MSA are enabled (i.e. the guest's FPU / MSA context
* is live), restore FCR31 / MSACSR.
@@ -1450,7 +1596,7 @@ void kvm_own_fpu(struct kvm_vcpu *vcpu)
* not to clobber the status register directly via the commpage.
*/
if (cpu_has_msa && sr & ST0_CU1 && !(sr & ST0_FR) &&
- vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA)
+ vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA)
kvm_lose_fpu(vcpu);
/*
@@ -1465,9 +1611,12 @@ void kvm_own_fpu(struct kvm_vcpu *vcpu)
enable_fpu_hazard();
/* If guest FPU state not active, restore it now */
- if (!(vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU)) {
+ if (!(vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU)) {
__kvm_restore_fpu(&vcpu->arch);
- vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_FPU;
+ vcpu->arch.aux_inuse |= KVM_MIPS_AUX_FPU;
+ trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE, KVM_TRACE_AUX_FPU);
+ } else {
+ trace_kvm_aux(vcpu, KVM_TRACE_AUX_ENABLE, KVM_TRACE_AUX_FPU);
}
preempt_enable();
@@ -1494,8 +1643,8 @@ void kvm_own_msa(struct kvm_vcpu *vcpu)
* interacts with MSA state, so play it safe and save it first.
*/
if (!(sr & ST0_FR) &&
- (vcpu->arch.fpu_inuse & (KVM_MIPS_FPU_FPU |
- KVM_MIPS_FPU_MSA)) == KVM_MIPS_FPU_FPU)
+ (vcpu->arch.aux_inuse & (KVM_MIPS_AUX_FPU |
+ KVM_MIPS_AUX_MSA)) == KVM_MIPS_AUX_FPU)
kvm_lose_fpu(vcpu);
change_c0_status(ST0_CU1 | ST0_FR, sr);
@@ -1509,22 +1658,26 @@ void kvm_own_msa(struct kvm_vcpu *vcpu)
set_c0_config5(MIPS_CONF5_MSAEN);
enable_fpu_hazard();
- switch (vcpu->arch.fpu_inuse & (KVM_MIPS_FPU_FPU | KVM_MIPS_FPU_MSA)) {
- case KVM_MIPS_FPU_FPU:
+ switch (vcpu->arch.aux_inuse & (KVM_MIPS_AUX_FPU | KVM_MIPS_AUX_MSA)) {
+ case KVM_MIPS_AUX_FPU:
/*
* Guest FPU state already loaded, only restore upper MSA state
*/
__kvm_restore_msa_upper(&vcpu->arch);
- vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_MSA;
+ vcpu->arch.aux_inuse |= KVM_MIPS_AUX_MSA;
+ trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE, KVM_TRACE_AUX_MSA);
break;
case 0:
/* Neither FPU or MSA already active, restore full MSA state */
__kvm_restore_msa(&vcpu->arch);
- vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_MSA;
+ vcpu->arch.aux_inuse |= KVM_MIPS_AUX_MSA;
if (kvm_mips_guest_has_fpu(&vcpu->arch))
- vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_FPU;
+ vcpu->arch.aux_inuse |= KVM_MIPS_AUX_FPU;
+ trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE,
+ KVM_TRACE_AUX_FPU_MSA);
break;
default:
+ trace_kvm_aux(vcpu, KVM_TRACE_AUX_ENABLE, KVM_TRACE_AUX_MSA);
break;
}
@@ -1536,13 +1689,15 @@ void kvm_own_msa(struct kvm_vcpu *vcpu)
void kvm_drop_fpu(struct kvm_vcpu *vcpu)
{
preempt_disable();
- if (cpu_has_msa && vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA) {
+ if (cpu_has_msa && vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) {
disable_msa();
- vcpu->arch.fpu_inuse &= ~KVM_MIPS_FPU_MSA;
+ trace_kvm_aux(vcpu, KVM_TRACE_AUX_DISCARD, KVM_TRACE_AUX_MSA);
+ vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_MSA;
}
- if (vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) {
+ if (vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) {
clear_c0_status(ST0_CU1 | ST0_FR);
- vcpu->arch.fpu_inuse &= ~KVM_MIPS_FPU_FPU;
+ trace_kvm_aux(vcpu, KVM_TRACE_AUX_DISCARD, KVM_TRACE_AUX_FPU);
+ vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_FPU;
}
preempt_enable();
}
@@ -1558,25 +1713,27 @@ void kvm_lose_fpu(struct kvm_vcpu *vcpu)
*/
preempt_disable();
- if (cpu_has_msa && vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA) {
+ if (cpu_has_msa && vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) {
set_c0_config5(MIPS_CONF5_MSAEN);
enable_fpu_hazard();
__kvm_save_msa(&vcpu->arch);
+ trace_kvm_aux(vcpu, KVM_TRACE_AUX_SAVE, KVM_TRACE_AUX_FPU_MSA);
/* Disable MSA & FPU */
disable_msa();
- if (vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) {
+ if (vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) {
clear_c0_status(ST0_CU1 | ST0_FR);
disable_fpu_hazard();
}
- vcpu->arch.fpu_inuse &= ~(KVM_MIPS_FPU_FPU | KVM_MIPS_FPU_MSA);
- } else if (vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) {
+ vcpu->arch.aux_inuse &= ~(KVM_MIPS_AUX_FPU | KVM_MIPS_AUX_MSA);
+ } else if (vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) {
set_c0_status(ST0_CU1);
enable_fpu_hazard();
__kvm_save_fpu(&vcpu->arch);
- vcpu->arch.fpu_inuse &= ~KVM_MIPS_FPU_FPU;
+ vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_FPU;
+ trace_kvm_aux(vcpu, KVM_TRACE_AUX_SAVE, KVM_TRACE_AUX_FPU);
/* Disable FPU */
clear_c0_status(ST0_CU1 | ST0_FR);
@@ -1638,6 +1795,10 @@ static int __init kvm_mips_init(void)
{
int ret;
+ ret = kvm_mips_entry_setup();
+ if (ret)
+ return ret;
+
ret = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
if (ret)
@@ -1645,18 +1806,6 @@ static int __init kvm_mips_init(void)
register_die_notifier(&kvm_mips_csr_die_notifier);
- /*
- * On MIPS, kernel modules are executed from "mapped space", which
- * requires TLBs. The TLB handling code is statically linked with
- * the rest of the kernel (tlb.c) to avoid the possibility of
- * double faulting. The issue is that the TLB code references
- * routines that are part of the the KVM module, which are only
- * available once the module is loaded.
- */
- kvm_mips_gfn_to_pfn = gfn_to_pfn;
- kvm_mips_release_pfn_clean = kvm_release_pfn_clean;
- kvm_mips_is_error_pfn = is_error_pfn;
-
return 0;
}
@@ -1664,10 +1813,6 @@ static void __exit kvm_mips_exit(void)
{
kvm_exit();
- kvm_mips_gfn_to_pfn = NULL;
- kvm_mips_release_pfn_clean = NULL;
- kvm_mips_is_error_pfn = NULL;
-
unregister_die_notifier(&kvm_mips_csr_die_notifier);
}
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
new file mode 100644
index 000000000000..57319ee57c4f
--- /dev/null
+++ b/arch/mips/kvm/mmu.c
@@ -0,0 +1,375 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * KVM/MIPS MMU handling in the KVM module.
+ *
+ * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved.
+ * Authors: Sanjay Lal <sanjayl@kymasys.com>
+ */
+
+#include <linux/highmem.h>
+#include <linux/kvm_host.h>
+#include <asm/mmu_context.h>
+
+static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
+{
+ int cpu = smp_processor_id();
+
+ return vcpu->arch.guest_kernel_asid[cpu] &
+ cpu_asid_mask(&cpu_data[cpu]);
+}
+
+static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
+{
+ int cpu = smp_processor_id();
+
+ return vcpu->arch.guest_user_asid[cpu] &
+ cpu_asid_mask(&cpu_data[cpu]);
+}
+
+static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn)
+{
+ int srcu_idx, err = 0;
+ kvm_pfn_t pfn;
+
+ if (kvm->arch.guest_pmap[gfn] != KVM_INVALID_PAGE)
+ return 0;
+
+ srcu_idx = srcu_read_lock(&kvm->srcu);
+ pfn = gfn_to_pfn(kvm, gfn);
+
+ if (is_error_pfn(pfn)) {
+ kvm_err("Couldn't get pfn for gfn %#llx!\n", gfn);
+ err = -EFAULT;
+ goto out;
+ }
+
+ kvm->arch.guest_pmap[gfn] = pfn;
+out:
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+ return err;
+}
+
+/* Translate guest KSEG0 addresses to Host PA */
+unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu,
+ unsigned long gva)
+{
+ gfn_t gfn;
+ unsigned long offset = gva & ~PAGE_MASK;
+ struct kvm *kvm = vcpu->kvm;
+
+ if (KVM_GUEST_KSEGX(gva) != KVM_GUEST_KSEG0) {
+ kvm_err("%s/%p: Invalid gva: %#lx\n", __func__,
+ __builtin_return_address(0), gva);
+ return KVM_INVALID_PAGE;
+ }
+
+ gfn = (KVM_GUEST_CPHYSADDR(gva) >> PAGE_SHIFT);
+
+ if (gfn >= kvm->arch.guest_pmap_npages) {
+ kvm_err("%s: Invalid gfn: %#llx, GVA: %#lx\n", __func__, gfn,
+ gva);
+ return KVM_INVALID_PAGE;
+ }
+
+ if (kvm_mips_map_page(vcpu->kvm, gfn) < 0)
+ return KVM_INVALID_ADDR;
+
+ return (kvm->arch.guest_pmap[gfn] << PAGE_SHIFT) + offset;
+}
+
+/* XXXKYMA: Must be called with interrupts disabled */
+int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
+ struct kvm_vcpu *vcpu)
+{
+ gfn_t gfn;
+ kvm_pfn_t pfn0, pfn1;
+ unsigned long vaddr = 0;
+ unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
+ struct kvm *kvm = vcpu->kvm;
+ const int flush_dcache_mask = 0;
+ int ret;
+
+ if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) {
+ kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr);
+ kvm_mips_dump_host_tlbs();
+ return -1;
+ }
+
+ gfn = (KVM_GUEST_CPHYSADDR(badvaddr) >> PAGE_SHIFT);
+ if (gfn >= kvm->arch.guest_pmap_npages) {
+ kvm_err("%s: Invalid gfn: %#llx, BadVaddr: %#lx\n", __func__,
+ gfn, badvaddr);
+ kvm_mips_dump_host_tlbs();
+ return -1;
+ }
+ vaddr = badvaddr & (PAGE_MASK << 1);
+
+ if (kvm_mips_map_page(vcpu->kvm, gfn) < 0)
+ return -1;
+
+ if (kvm_mips_map_page(vcpu->kvm, gfn ^ 0x1) < 0)
+ return -1;
+
+ pfn0 = kvm->arch.guest_pmap[gfn & ~0x1];
+ pfn1 = kvm->arch.guest_pmap[gfn | 0x1];
+
+ entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) |
+ ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
+ ENTRYLO_D | ENTRYLO_V;
+ entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) |
+ ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
+ ENTRYLO_D | ENTRYLO_V;
+
+ preempt_disable();
+ entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu));
+ ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
+ flush_dcache_mask);
+ preempt_enable();
+
+ return ret;
+}
+
+int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
+ struct kvm_mips_tlb *tlb)
+{
+ unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
+ struct kvm *kvm = vcpu->kvm;
+ kvm_pfn_t pfn0, pfn1;
+ int ret;
+
+ if ((tlb->tlb_hi & VPN2_MASK) == 0) {
+ pfn0 = 0;
+ pfn1 = 0;
+ } else {
+ if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo[0])
+ >> PAGE_SHIFT) < 0)
+ return -1;
+
+ if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo[1])
+ >> PAGE_SHIFT) < 0)
+ return -1;
+
+ pfn0 = kvm->arch.guest_pmap[
+ mips3_tlbpfn_to_paddr(tlb->tlb_lo[0]) >> PAGE_SHIFT];
+ pfn1 = kvm->arch.guest_pmap[
+ mips3_tlbpfn_to_paddr(tlb->tlb_lo[1]) >> PAGE_SHIFT];
+ }
+
+ /* Get attributes from the Guest TLB */
+ entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) |
+ ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
+ (tlb->tlb_lo[0] & ENTRYLO_D) |
+ (tlb->tlb_lo[0] & ENTRYLO_V);
+ entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) |
+ ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
+ (tlb->tlb_lo[1] & ENTRYLO_D) |
+ (tlb->tlb_lo[1] & ENTRYLO_V);
+
+ kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc,
+ tlb->tlb_lo[0], tlb->tlb_lo[1]);
+
+ preempt_disable();
+ entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ?
+ kvm_mips_get_kernel_asid(vcpu) :
+ kvm_mips_get_user_asid(vcpu));
+ ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
+ tlb->tlb_mask);
+ preempt_enable();
+
+ return ret;
+}
+
+void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu,
+ struct kvm_vcpu *vcpu)
+{
+ unsigned long asid = asid_cache(cpu);
+
+ asid += cpu_asid_inc();
+ if (!(asid & cpu_asid_mask(&cpu_data[cpu]))) {
+ if (cpu_has_vtag_icache)
+ flush_icache_all();
+
+ kvm_local_flush_tlb_all(); /* start new asid cycle */
+
+ if (!asid) /* fix version if needed */
+ asid = asid_first_version(cpu);
+ }
+
+ cpu_context(cpu, mm) = asid_cache(cpu) = asid;
+}
+
+/**
+ * kvm_mips_migrate_count() - Migrate timer.
+ * @vcpu: Virtual CPU.
+ *
+ * Migrate CP0_Count hrtimer to the current CPU by cancelling and restarting it
+ * if it was running prior to being cancelled.
+ *
+ * Must be called when the VCPU is migrated to a different CPU to ensure that
+ * timer expiry during guest execution interrupts the guest and causes the
+ * interrupt to be delivered in a timely manner.
+ */
+static void kvm_mips_migrate_count(struct kvm_vcpu *vcpu)
+{
+ if (hrtimer_cancel(&vcpu->arch.comparecount_timer))
+ hrtimer_restart(&vcpu->arch.comparecount_timer);
+}
+
+/* Restore ASID once we are scheduled back after preemption */
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]);
+ unsigned long flags;
+ int newasid = 0;
+
+ kvm_debug("%s: vcpu %p, cpu: %d\n", __func__, vcpu, cpu);
+
+ /* Allocate new kernel and user ASIDs if needed */
+
+ local_irq_save(flags);
+
+ if ((vcpu->arch.guest_kernel_asid[cpu] ^ asid_cache(cpu)) &
+ asid_version_mask(cpu)) {
+ kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu);
+ vcpu->arch.guest_kernel_asid[cpu] =
+ vcpu->arch.guest_kernel_mm.context.asid[cpu];
+ kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu);
+ vcpu->arch.guest_user_asid[cpu] =
+ vcpu->arch.guest_user_mm.context.asid[cpu];
+ newasid++;
+
+ kvm_debug("[%d]: cpu_context: %#lx\n", cpu,
+ cpu_context(cpu, current->mm));
+ kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n",
+ cpu, vcpu->arch.guest_kernel_asid[cpu]);
+ kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu,
+ vcpu->arch.guest_user_asid[cpu]);
+ }
+
+ if (vcpu->arch.last_sched_cpu != cpu) {
+ kvm_debug("[%d->%d]KVM VCPU[%d] switch\n",
+ vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id);
+ /*
+ * Migrate the timer interrupt to the current CPU so that it
+ * always interrupts the guest and synchronously triggers a
+ * guest timer interrupt.
+ */
+ kvm_mips_migrate_count(vcpu);
+ }
+
+ if (!newasid) {
+ /*
+ * If we preempted while the guest was executing, then reload
+ * the pre-empted ASID
+ */
+ if (current->flags & PF_VCPU) {
+ write_c0_entryhi(vcpu->arch.
+ preempt_entryhi & asid_mask);
+ ehb();
+ }
+ } else {
+ /* New ASIDs were allocated for the VM */
+
+ /*
+ * Were we in guest context? If so then the pre-empted ASID is
+ * no longer valid, we need to set it to what it should be based
+ * on the mode of the Guest (Kernel/User)
+ */
+ if (current->flags & PF_VCPU) {
+ if (KVM_GUEST_KERNEL_MODE(vcpu))
+ write_c0_entryhi(vcpu->arch.
+ guest_kernel_asid[cpu] &
+ asid_mask);
+ else
+ write_c0_entryhi(vcpu->arch.
+ guest_user_asid[cpu] &
+ asid_mask);
+ ehb();
+ }
+ }
+
+ /* restore guest state to registers */
+ kvm_mips_callbacks->vcpu_set_regs(vcpu);
+
+ local_irq_restore(flags);
+
+}
+
+/* ASID can change if another task is scheduled during preemption */
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ unsigned long flags;
+ int cpu;
+
+ local_irq_save(flags);
+
+ cpu = smp_processor_id();
+
+ vcpu->arch.preempt_entryhi = read_c0_entryhi();
+ vcpu->arch.last_sched_cpu = cpu;
+
+ /* save guest state in registers */
+ kvm_mips_callbacks->vcpu_get_regs(vcpu);
+
+ if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) &
+ asid_version_mask(cpu))) {
+ kvm_debug("%s: Dropping MMU Context: %#lx\n", __func__,
+ cpu_context(cpu, current->mm));
+ drop_mmu_context(current->mm, cpu);
+ }
+ write_c0_entryhi(cpu_asid(cpu, current->mm));
+ ehb();
+
+ local_irq_restore(flags);
+}
+
+u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu)
+{
+ struct mips_coproc *cop0 = vcpu->arch.cop0;
+ unsigned long paddr, flags, vpn2, asid;
+ unsigned long va = (unsigned long)opc;
+ void *vaddr;
+ u32 inst;
+ int index;
+
+ if (KVM_GUEST_KSEGX(va) < KVM_GUEST_KSEG0 ||
+ KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG23) {
+ local_irq_save(flags);
+ index = kvm_mips_host_tlb_lookup(vcpu, va);
+ if (index >= 0) {
+ inst = *(opc);
+ } else {
+ vpn2 = va & VPN2_MASK;
+ asid = kvm_read_c0_guest_entryhi(cop0) &
+ KVM_ENTRYHI_ASID;
+ index = kvm_mips_guest_tlb_lookup(vcpu, vpn2 | asid);
+ if (index < 0) {
+ kvm_err("%s: get_user_failed for %p, vcpu: %p, ASID: %#lx\n",
+ __func__, opc, vcpu, read_c0_entryhi());
+ kvm_mips_dump_host_tlbs();
+ kvm_mips_dump_guest_tlbs(vcpu);
+ local_irq_restore(flags);
+ return KVM_INVALID_INST;
+ }
+ kvm_mips_handle_mapped_seg_tlb_fault(vcpu,
+ &vcpu->arch.
+ guest_tlb[index]);
+ inst = *(opc);
+ }
+ local_irq_restore(flags);
+ } else if (KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG0) {
+ paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu, va);
+ vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr)));
+ vaddr += paddr & ~PAGE_MASK;
+ inst = *(u32 *)vaddr;
+ kunmap_atomic(vaddr);
+ } else {
+ kvm_err("%s: illegal address: %p\n", __func__, opc);
+ return KVM_INVALID_INST;
+ }
+
+ return inst;
+}
diff --git a/arch/mips/kvm/stats.c b/arch/mips/kvm/stats.c
index 888bb67070ac..53f851a61554 100644
--- a/arch/mips/kvm/stats.c
+++ b/arch/mips/kvm/stats.c
@@ -11,27 +11,6 @@
#include <linux/kvm_host.h>
-char *kvm_mips_exit_types_str[MAX_KVM_MIPS_EXIT_TYPES] = {
- "WAIT",
- "CACHE",
- "Signal",
- "Interrupt",
- "COP0/1 Unusable",
- "TLB Mod",
- "TLB Miss (LD)",
- "TLB Miss (ST)",
- "Address Err (ST)",
- "Address Error (LD)",
- "System Call",
- "Reserved Inst",
- "Break Inst",
- "Trap Inst",
- "MSA FPE",
- "FPE",
- "MSA Disabled",
- "D-Cache Flushes",
-};
-
char *kvm_cop0_str[N_MIPS_COPROC_REGS] = {
"Index",
"Random",
diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c
index ed021ae7867a..254377d8e0b9 100644
--- a/arch/mips/kvm/tlb.c
+++ b/arch/mips/kvm/tlb.c
@@ -14,7 +14,7 @@
#include <linux/smp.h>
#include <linux/mm.h>
#include <linux/delay.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/kvm_host.h>
#include <linux/srcu.h>
@@ -24,6 +24,7 @@
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/tlb.h>
+#include <asm/tlbdebug.h>
#undef CONFIG_MIPS_MT
#include <asm/r4kcache.h>
@@ -32,22 +33,10 @@
#define KVM_GUEST_PC_TLB 0
#define KVM_GUEST_SP_TLB 1
-#define PRIx64 "llx"
-
atomic_t kvm_mips_instance;
EXPORT_SYMBOL_GPL(kvm_mips_instance);
-/* These function pointers are initialized once the KVM module is loaded */
-kvm_pfn_t (*kvm_mips_gfn_to_pfn)(struct kvm *kvm, gfn_t gfn);
-EXPORT_SYMBOL_GPL(kvm_mips_gfn_to_pfn);
-
-void (*kvm_mips_release_pfn_clean)(kvm_pfn_t pfn);
-EXPORT_SYMBOL_GPL(kvm_mips_release_pfn_clean);
-
-bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn);
-EXPORT_SYMBOL_GPL(kvm_mips_is_error_pfn);
-
-uint32_t kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
+static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
{
int cpu = smp_processor_id();
@@ -55,7 +44,7 @@ uint32_t kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
cpu_asid_mask(&cpu_data[cpu]);
}
-uint32_t kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
+static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
{
int cpu = smp_processor_id();
@@ -63,7 +52,7 @@ uint32_t kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
cpu_asid_mask(&cpu_data[cpu]);
}
-inline uint32_t kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu)
+inline u32 kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu)
{
return vcpu->kvm->arch.commpage_tlb;
}
@@ -72,50 +61,15 @@ inline uint32_t kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu)
void kvm_mips_dump_host_tlbs(void)
{
- unsigned long old_entryhi;
- unsigned long old_pagemask;
- struct kvm_mips_tlb tlb;
unsigned long flags;
- int i;
local_irq_save(flags);
- old_entryhi = read_c0_entryhi();
- old_pagemask = read_c0_pagemask();
-
kvm_info("HOST TLBs:\n");
- kvm_info("ASID: %#lx\n", read_c0_entryhi() &
- cpu_asid_mask(&current_cpu_data));
-
- for (i = 0; i < current_cpu_data.tlbsize; i++) {
- write_c0_index(i);
- mtc0_tlbw_hazard();
-
- tlb_read();
- tlbw_use_hazard();
+ dump_tlb_regs();
+ pr_info("\n");
+ dump_tlb_all();
- tlb.tlb_hi = read_c0_entryhi();
- tlb.tlb_lo0 = read_c0_entrylo0();
- tlb.tlb_lo1 = read_c0_entrylo1();
- tlb.tlb_mask = read_c0_pagemask();
-
- kvm_info("TLB%c%3d Hi 0x%08lx ",
- (tlb.tlb_lo0 | tlb.tlb_lo1) & MIPS3_PG_V ? ' ' : '*',
- i, tlb.tlb_hi);
- kvm_info("Lo0=0x%09" PRIx64 " %c%c attr %lx ",
- (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo0),
- (tlb.tlb_lo0 & MIPS3_PG_D) ? 'D' : ' ',
- (tlb.tlb_lo0 & MIPS3_PG_G) ? 'G' : ' ',
- (tlb.tlb_lo0 >> 3) & 7);
- kvm_info("Lo1=0x%09" PRIx64 " %c%c attr %lx sz=%lx\n",
- (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo1),
- (tlb.tlb_lo1 & MIPS3_PG_D) ? 'D' : ' ',
- (tlb.tlb_lo1 & MIPS3_PG_G) ? 'G' : ' ',
- (tlb.tlb_lo1 >> 3) & 7, tlb.tlb_mask);
- }
- write_c0_entryhi(old_entryhi);
- write_c0_pagemask(old_pagemask);
- mtc0_tlbw_hazard();
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(kvm_mips_dump_host_tlbs);
@@ -132,74 +86,24 @@ void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu)
for (i = 0; i < KVM_MIPS_GUEST_TLB_SIZE; i++) {
tlb = vcpu->arch.guest_tlb[i];
kvm_info("TLB%c%3d Hi 0x%08lx ",
- (tlb.tlb_lo0 | tlb.tlb_lo1) & MIPS3_PG_V ? ' ' : '*',
+ (tlb.tlb_lo[0] | tlb.tlb_lo[1]) & ENTRYLO_V
+ ? ' ' : '*',
i, tlb.tlb_hi);
- kvm_info("Lo0=0x%09" PRIx64 " %c%c attr %lx ",
- (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo0),
- (tlb.tlb_lo0 & MIPS3_PG_D) ? 'D' : ' ',
- (tlb.tlb_lo0 & MIPS3_PG_G) ? 'G' : ' ',
- (tlb.tlb_lo0 >> 3) & 7);
- kvm_info("Lo1=0x%09" PRIx64 " %c%c attr %lx sz=%lx\n",
- (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo1),
- (tlb.tlb_lo1 & MIPS3_PG_D) ? 'D' : ' ',
- (tlb.tlb_lo1 & MIPS3_PG_G) ? 'G' : ' ',
- (tlb.tlb_lo1 >> 3) & 7, tlb.tlb_mask);
+ kvm_info("Lo0=0x%09llx %c%c attr %lx ",
+ (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo[0]),
+ (tlb.tlb_lo[0] & ENTRYLO_D) ? 'D' : ' ',
+ (tlb.tlb_lo[0] & ENTRYLO_G) ? 'G' : ' ',
+ (tlb.tlb_lo[0] & ENTRYLO_C) >> ENTRYLO_C_SHIFT);
+ kvm_info("Lo1=0x%09llx %c%c attr %lx sz=%lx\n",
+ (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo[1]),
+ (tlb.tlb_lo[1] & ENTRYLO_D) ? 'D' : ' ',
+ (tlb.tlb_lo[1] & ENTRYLO_G) ? 'G' : ' ',
+ (tlb.tlb_lo[1] & ENTRYLO_C) >> ENTRYLO_C_SHIFT,
+ tlb.tlb_mask);
}
}
EXPORT_SYMBOL_GPL(kvm_mips_dump_guest_tlbs);
-static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn)
-{
- int srcu_idx, err = 0;
- kvm_pfn_t pfn;
-
- if (kvm->arch.guest_pmap[gfn] != KVM_INVALID_PAGE)
- return 0;
-
- srcu_idx = srcu_read_lock(&kvm->srcu);
- pfn = kvm_mips_gfn_to_pfn(kvm, gfn);
-
- if (kvm_mips_is_error_pfn(pfn)) {
- kvm_err("Couldn't get pfn for gfn %#" PRIx64 "!\n", gfn);
- err = -EFAULT;
- goto out;
- }
-
- kvm->arch.guest_pmap[gfn] = pfn;
-out:
- srcu_read_unlock(&kvm->srcu, srcu_idx);
- return err;
-}
-
-/* Translate guest KSEG0 addresses to Host PA */
-unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu,
- unsigned long gva)
-{
- gfn_t gfn;
- uint32_t offset = gva & ~PAGE_MASK;
- struct kvm *kvm = vcpu->kvm;
-
- if (KVM_GUEST_KSEGX(gva) != KVM_GUEST_KSEG0) {
- kvm_err("%s/%p: Invalid gva: %#lx\n", __func__,
- __builtin_return_address(0), gva);
- return KVM_INVALID_PAGE;
- }
-
- gfn = (KVM_GUEST_CPHYSADDR(gva) >> PAGE_SHIFT);
-
- if (gfn >= kvm->arch.guest_pmap_npages) {
- kvm_err("%s: Invalid gfn: %#llx, GVA: %#lx\n", __func__, gfn,
- gva);
- return KVM_INVALID_PAGE;
- }
-
- if (kvm_mips_map_page(vcpu->kvm, gfn) < 0)
- return KVM_INVALID_ADDR;
-
- return (kvm->arch.guest_pmap[gfn] << PAGE_SHIFT) + offset;
-}
-EXPORT_SYMBOL_GPL(kvm_mips_translate_guest_kseg0_to_hpa);
-
/* XXXKYMA: Must be called with interrupts disabled */
/* set flush_dcache_mask == 0 if no dcache flush required */
int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi,
@@ -243,12 +147,12 @@ int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi,
/* Flush D-cache */
if (flush_dcache_mask) {
- if (entrylo0 & MIPS3_PG_V) {
+ if (entrylo0 & ENTRYLO_V) {
++vcpu->stat.flush_dcache_exits;
flush_data_cache_page((entryhi & VPN2_MASK) &
~flush_dcache_mask);
}
- if (entrylo1 & MIPS3_PG_V) {
+ if (entrylo1 & ENTRYLO_V) {
++vcpu->stat.flush_dcache_exits;
flush_data_cache_page(((entryhi & VPN2_MASK) &
~flush_dcache_mask) |
@@ -259,96 +163,35 @@ int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi,
/* Restore old ASID */
write_c0_entryhi(old_entryhi);
mtc0_tlbw_hazard();
- tlbw_use_hazard();
local_irq_restore(flags);
return 0;
}
-
-/* XXXKYMA: Must be called with interrupts disabled */
-int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
- struct kvm_vcpu *vcpu)
-{
- gfn_t gfn;
- kvm_pfn_t pfn0, pfn1;
- unsigned long vaddr = 0;
- unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
- int even;
- struct kvm *kvm = vcpu->kvm;
- const int flush_dcache_mask = 0;
- int ret;
-
- if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) {
- kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr);
- kvm_mips_dump_host_tlbs();
- return -1;
- }
-
- gfn = (KVM_GUEST_CPHYSADDR(badvaddr) >> PAGE_SHIFT);
- if (gfn >= kvm->arch.guest_pmap_npages) {
- kvm_err("%s: Invalid gfn: %#llx, BadVaddr: %#lx\n", __func__,
- gfn, badvaddr);
- kvm_mips_dump_host_tlbs();
- return -1;
- }
- even = !(gfn & 0x1);
- vaddr = badvaddr & (PAGE_MASK << 1);
-
- if (kvm_mips_map_page(vcpu->kvm, gfn) < 0)
- return -1;
-
- if (kvm_mips_map_page(vcpu->kvm, gfn ^ 0x1) < 0)
- return -1;
-
- if (even) {
- pfn0 = kvm->arch.guest_pmap[gfn];
- pfn1 = kvm->arch.guest_pmap[gfn ^ 0x1];
- } else {
- pfn0 = kvm->arch.guest_pmap[gfn ^ 0x1];
- pfn1 = kvm->arch.guest_pmap[gfn];
- }
-
- entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
- (1 << 2) | (0x1 << 1);
- entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) |
- (1 << 2) | (0x1 << 1);
-
- preempt_disable();
- entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu));
- ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
- flush_dcache_mask);
- preempt_enable();
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(kvm_mips_handle_kseg0_tlb_fault);
+EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_write);
int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
struct kvm_vcpu *vcpu)
{
- kvm_pfn_t pfn0, pfn1;
+ kvm_pfn_t pfn;
unsigned long flags, old_entryhi = 0, vaddr = 0;
- unsigned long entrylo0 = 0, entrylo1 = 0;
+ unsigned long entrylo[2] = { 0, 0 };
+ unsigned int pair_idx;
- pfn0 = CPHYSADDR(vcpu->arch.kseg0_commpage) >> PAGE_SHIFT;
- pfn1 = 0;
- entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
- (1 << 2) | (0x1 << 1);
- entrylo1 = 0;
+ pfn = PFN_DOWN(virt_to_phys(vcpu->arch.kseg0_commpage));
+ pair_idx = (badvaddr >> PAGE_SHIFT) & 1;
+ entrylo[pair_idx] = mips3_paddr_to_tlbpfn(pfn << PAGE_SHIFT) |
+ ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
+ ENTRYLO_D | ENTRYLO_V;
local_irq_save(flags);
old_entryhi = read_c0_entryhi();
vaddr = badvaddr & (PAGE_MASK << 1);
write_c0_entryhi(vaddr | kvm_mips_get_kernel_asid(vcpu));
- mtc0_tlbw_hazard();
- write_c0_entrylo0(entrylo0);
- mtc0_tlbw_hazard();
- write_c0_entrylo1(entrylo1);
- mtc0_tlbw_hazard();
+ write_c0_entrylo0(entrylo[0]);
+ write_c0_entrylo1(entrylo[1]);
write_c0_index(kvm_mips_get_commpage_asid(vcpu));
mtc0_tlbw_hazard();
tlb_write_indexed();
- mtc0_tlbw_hazard();
tlbw_use_hazard();
kvm_debug("@ %#lx idx: %2d [entryhi(R): %#lx] entrylo0 (R): 0x%08lx, entrylo1(R): 0x%08lx\n",
@@ -358,68 +201,12 @@ int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
/* Restore old ASID */
write_c0_entryhi(old_entryhi);
mtc0_tlbw_hazard();
- tlbw_use_hazard();
local_irq_restore(flags);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_mips_handle_commpage_tlb_fault);
-int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
- struct kvm_mips_tlb *tlb,
- unsigned long *hpa0,
- unsigned long *hpa1)
-{
- unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
- struct kvm *kvm = vcpu->kvm;
- kvm_pfn_t pfn0, pfn1;
- int ret;
-
- if ((tlb->tlb_hi & VPN2_MASK) == 0) {
- pfn0 = 0;
- pfn1 = 0;
- } else {
- if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo0)
- >> PAGE_SHIFT) < 0)
- return -1;
-
- if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo1)
- >> PAGE_SHIFT) < 0)
- return -1;
-
- pfn0 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb->tlb_lo0)
- >> PAGE_SHIFT];
- pfn1 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb->tlb_lo1)
- >> PAGE_SHIFT];
- }
-
- if (hpa0)
- *hpa0 = pfn0 << PAGE_SHIFT;
-
- if (hpa1)
- *hpa1 = pfn1 << PAGE_SHIFT;
-
- /* Get attributes from the Guest TLB */
- entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
- (tlb->tlb_lo0 & MIPS3_PG_D) | (tlb->tlb_lo0 & MIPS3_PG_V);
- entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) |
- (tlb->tlb_lo1 & MIPS3_PG_D) | (tlb->tlb_lo1 & MIPS3_PG_V);
-
- kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc,
- tlb->tlb_lo0, tlb->tlb_lo1);
-
- preempt_disable();
- entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ?
- kvm_mips_get_kernel_asid(vcpu) :
- kvm_mips_get_user_asid(vcpu));
- ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
- tlb->tlb_mask);
- preempt_enable();
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(kvm_mips_handle_mapped_seg_tlb_fault);
-
int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi)
{
int i;
@@ -435,7 +222,7 @@ int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi)
}
kvm_debug("%s: entryhi: %#lx, index: %d lo0: %#lx, lo1: %#lx\n",
- __func__, entryhi, index, tlb[i].tlb_lo0, tlb[i].tlb_lo1);
+ __func__, entryhi, index, tlb[i].tlb_lo[0], tlb[i].tlb_lo[1]);
return index;
}
@@ -467,7 +254,6 @@ int kvm_mips_host_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long vaddr)
/* Restore old ASID */
write_c0_entryhi(old_entryhi);
mtc0_tlbw_hazard();
- tlbw_use_hazard();
local_irq_restore(flags);
@@ -498,21 +284,16 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va)
if (idx > 0) {
write_c0_entryhi(UNIQUE_ENTRYHI(idx));
- mtc0_tlbw_hazard();
-
write_c0_entrylo0(0);
- mtc0_tlbw_hazard();
-
write_c0_entrylo1(0);
mtc0_tlbw_hazard();
tlb_write_indexed();
- mtc0_tlbw_hazard();
+ tlbw_use_hazard();
}
write_c0_entryhi(old_entryhi);
mtc0_tlbw_hazard();
- tlbw_use_hazard();
local_irq_restore(flags);
@@ -540,61 +321,39 @@ void kvm_mips_flush_host_tlb(int skip_kseg0)
/* Blast 'em all away. */
for (entry = 0; entry < maxentry; entry++) {
write_c0_index(entry);
- mtc0_tlbw_hazard();
if (skip_kseg0) {
+ mtc0_tlbr_hazard();
tlb_read();
- tlbw_use_hazard();
+ tlb_read_hazard();
entryhi = read_c0_entryhi();
/* Don't blow away guest kernel entries */
if (KVM_GUEST_KSEGX(entryhi) == KVM_GUEST_KSEG0)
continue;
+
+ write_c0_pagemask(old_pagemask);
}
/* Make sure all entries differ. */
write_c0_entryhi(UNIQUE_ENTRYHI(entry));
- mtc0_tlbw_hazard();
write_c0_entrylo0(0);
- mtc0_tlbw_hazard();
write_c0_entrylo1(0);
mtc0_tlbw_hazard();
tlb_write_indexed();
- mtc0_tlbw_hazard();
+ tlbw_use_hazard();
}
- tlbw_use_hazard();
-
write_c0_entryhi(old_entryhi);
write_c0_pagemask(old_pagemask);
mtc0_tlbw_hazard();
- tlbw_use_hazard();
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(kvm_mips_flush_host_tlb);
-void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu,
- struct kvm_vcpu *vcpu)
-{
- unsigned long asid = asid_cache(cpu);
-
- asid += cpu_asid_inc();
- if (!(asid & cpu_asid_mask(&cpu_data[cpu]))) {
- if (cpu_has_vtag_icache)
- flush_icache_all();
-
- kvm_local_flush_tlb_all(); /* start new asid cycle */
-
- if (!asid) /* fix version if needed */
- asid = asid_first_version(cpu);
- }
-
- cpu_context(cpu, mm) = asid_cache(cpu) = asid;
-}
-
void kvm_local_flush_tlb_all(void)
{
unsigned long flags;
@@ -614,185 +373,12 @@ void kvm_local_flush_tlb_all(void)
write_c0_index(entry);
mtc0_tlbw_hazard();
tlb_write_indexed();
+ tlbw_use_hazard();
entry++;
}
- tlbw_use_hazard();
write_c0_entryhi(old_ctx);
mtc0_tlbw_hazard();
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(kvm_local_flush_tlb_all);
-
-/**
- * kvm_mips_migrate_count() - Migrate timer.
- * @vcpu: Virtual CPU.
- *
- * Migrate CP0_Count hrtimer to the current CPU by cancelling and restarting it
- * if it was running prior to being cancelled.
- *
- * Must be called when the VCPU is migrated to a different CPU to ensure that
- * timer expiry during guest execution interrupts the guest and causes the
- * interrupt to be delivered in a timely manner.
- */
-static void kvm_mips_migrate_count(struct kvm_vcpu *vcpu)
-{
- if (hrtimer_cancel(&vcpu->arch.comparecount_timer))
- hrtimer_restart(&vcpu->arch.comparecount_timer);
-}
-
-/* Restore ASID once we are scheduled back after preemption */
-void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
- unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]);
- unsigned long flags;
- int newasid = 0;
-
- kvm_debug("%s: vcpu %p, cpu: %d\n", __func__, vcpu, cpu);
-
- /* Allocate new kernel and user ASIDs if needed */
-
- local_irq_save(flags);
-
- if ((vcpu->arch.guest_kernel_asid[cpu] ^ asid_cache(cpu)) &
- asid_version_mask(cpu)) {
- kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu);
- vcpu->arch.guest_kernel_asid[cpu] =
- vcpu->arch.guest_kernel_mm.context.asid[cpu];
- kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu);
- vcpu->arch.guest_user_asid[cpu] =
- vcpu->arch.guest_user_mm.context.asid[cpu];
- newasid++;
-
- kvm_debug("[%d]: cpu_context: %#lx\n", cpu,
- cpu_context(cpu, current->mm));
- kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n",
- cpu, vcpu->arch.guest_kernel_asid[cpu]);
- kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu,
- vcpu->arch.guest_user_asid[cpu]);
- }
-
- if (vcpu->arch.last_sched_cpu != cpu) {
- kvm_debug("[%d->%d]KVM VCPU[%d] switch\n",
- vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id);
- /*
- * Migrate the timer interrupt to the current CPU so that it
- * always interrupts the guest and synchronously triggers a
- * guest timer interrupt.
- */
- kvm_mips_migrate_count(vcpu);
- }
-
- if (!newasid) {
- /*
- * If we preempted while the guest was executing, then reload
- * the pre-empted ASID
- */
- if (current->flags & PF_VCPU) {
- write_c0_entryhi(vcpu->arch.
- preempt_entryhi & asid_mask);
- ehb();
- }
- } else {
- /* New ASIDs were allocated for the VM */
-
- /*
- * Were we in guest context? If so then the pre-empted ASID is
- * no longer valid, we need to set it to what it should be based
- * on the mode of the Guest (Kernel/User)
- */
- if (current->flags & PF_VCPU) {
- if (KVM_GUEST_KERNEL_MODE(vcpu))
- write_c0_entryhi(vcpu->arch.
- guest_kernel_asid[cpu] &
- asid_mask);
- else
- write_c0_entryhi(vcpu->arch.
- guest_user_asid[cpu] &
- asid_mask);
- ehb();
- }
- }
-
- /* restore guest state to registers */
- kvm_mips_callbacks->vcpu_set_regs(vcpu);
-
- local_irq_restore(flags);
-
-}
-EXPORT_SYMBOL_GPL(kvm_arch_vcpu_load);
-
-/* ASID can change if another task is scheduled during preemption */
-void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
-{
- unsigned long flags;
- uint32_t cpu;
-
- local_irq_save(flags);
-
- cpu = smp_processor_id();
-
- vcpu->arch.preempt_entryhi = read_c0_entryhi();
- vcpu->arch.last_sched_cpu = cpu;
-
- /* save guest state in registers */
- kvm_mips_callbacks->vcpu_get_regs(vcpu);
-
- if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) &
- asid_version_mask(cpu))) {
- kvm_debug("%s: Dropping MMU Context: %#lx\n", __func__,
- cpu_context(cpu, current->mm));
- drop_mmu_context(current->mm, cpu);
- }
- write_c0_entryhi(cpu_asid(cpu, current->mm));
- ehb();
-
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(kvm_arch_vcpu_put);
-
-uint32_t kvm_get_inst(uint32_t *opc, struct kvm_vcpu *vcpu)
-{
- struct mips_coproc *cop0 = vcpu->arch.cop0;
- unsigned long paddr, flags, vpn2, asid;
- uint32_t inst;
- int index;
-
- if (KVM_GUEST_KSEGX((unsigned long) opc) < KVM_GUEST_KSEG0 ||
- KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
- local_irq_save(flags);
- index = kvm_mips_host_tlb_lookup(vcpu, (unsigned long) opc);
- if (index >= 0) {
- inst = *(opc);
- } else {
- vpn2 = (unsigned long) opc & VPN2_MASK;
- asid = kvm_read_c0_guest_entryhi(cop0) &
- KVM_ENTRYHI_ASID;
- index = kvm_mips_guest_tlb_lookup(vcpu, vpn2 | asid);
- if (index < 0) {
- kvm_err("%s: get_user_failed for %p, vcpu: %p, ASID: %#lx\n",
- __func__, opc, vcpu, read_c0_entryhi());
- kvm_mips_dump_host_tlbs();
- local_irq_restore(flags);
- return KVM_INVALID_INST;
- }
- kvm_mips_handle_mapped_seg_tlb_fault(vcpu,
- &vcpu->arch.
- guest_tlb[index],
- NULL, NULL);
- inst = *(opc);
- }
- local_irq_restore(flags);
- } else if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) {
- paddr =
- kvm_mips_translate_guest_kseg0_to_hpa(vcpu,
- (unsigned long) opc);
- inst = *(uint32_t *) CKSEG0ADDR(paddr);
- } else {
- kvm_err("%s: illegal address: %p\n", __func__, opc);
- return KVM_INVALID_INST;
- }
-
- return inst;
-}
-EXPORT_SYMBOL_GPL(kvm_get_inst);
diff --git a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h
index bd6437f67dc0..c858cf168078 100644
--- a/arch/mips/kvm/trace.h
+++ b/arch/mips/kvm/trace.h
@@ -17,8 +17,75 @@
#define TRACE_INCLUDE_PATH .
#define TRACE_INCLUDE_FILE trace
-/* Tracepoints for VM eists */
-extern char *kvm_mips_exit_types_str[MAX_KVM_MIPS_EXIT_TYPES];
+/*
+ * Tracepoints for VM enters
+ */
+DECLARE_EVENT_CLASS(kvm_transition,
+ TP_PROTO(struct kvm_vcpu *vcpu),
+ TP_ARGS(vcpu),
+ TP_STRUCT__entry(
+ __field(unsigned long, pc)
+ ),
+
+ TP_fast_assign(
+ __entry->pc = vcpu->arch.pc;
+ ),
+
+ TP_printk("PC: 0x%08lx",
+ __entry->pc)
+);
+
+DEFINE_EVENT(kvm_transition, kvm_enter,
+ TP_PROTO(struct kvm_vcpu *vcpu),
+ TP_ARGS(vcpu));
+
+DEFINE_EVENT(kvm_transition, kvm_reenter,
+ TP_PROTO(struct kvm_vcpu *vcpu),
+ TP_ARGS(vcpu));
+
+DEFINE_EVENT(kvm_transition, kvm_out,
+ TP_PROTO(struct kvm_vcpu *vcpu),
+ TP_ARGS(vcpu));
+
+/* The first 32 exit reasons correspond to Cause.ExcCode */
+#define KVM_TRACE_EXIT_INT 0
+#define KVM_TRACE_EXIT_TLBMOD 1
+#define KVM_TRACE_EXIT_TLBMISS_LD 2
+#define KVM_TRACE_EXIT_TLBMISS_ST 3
+#define KVM_TRACE_EXIT_ADDRERR_LD 4
+#define KVM_TRACE_EXIT_ADDRERR_ST 5
+#define KVM_TRACE_EXIT_SYSCALL 8
+#define KVM_TRACE_EXIT_BREAK_INST 9
+#define KVM_TRACE_EXIT_RESVD_INST 10
+#define KVM_TRACE_EXIT_COP_UNUSABLE 11
+#define KVM_TRACE_EXIT_TRAP_INST 13
+#define KVM_TRACE_EXIT_MSA_FPE 14
+#define KVM_TRACE_EXIT_FPE 15
+#define KVM_TRACE_EXIT_MSA_DISABLED 21
+/* Further exit reasons */
+#define KVM_TRACE_EXIT_WAIT 32
+#define KVM_TRACE_EXIT_CACHE 33
+#define KVM_TRACE_EXIT_SIGNAL 34
+
+/* Tracepoints for VM exits */
+#define kvm_trace_symbol_exit_types \
+ { KVM_TRACE_EXIT_INT, "Interrupt" }, \
+ { KVM_TRACE_EXIT_TLBMOD, "TLB Mod" }, \
+ { KVM_TRACE_EXIT_TLBMISS_LD, "TLB Miss (LD)" }, \
+ { KVM_TRACE_EXIT_TLBMISS_ST, "TLB Miss (ST)" }, \
+ { KVM_TRACE_EXIT_ADDRERR_LD, "Address Error (LD)" }, \
+ { KVM_TRACE_EXIT_ADDRERR_ST, "Address Err (ST)" }, \
+ { KVM_TRACE_EXIT_SYSCALL, "System Call" }, \
+ { KVM_TRACE_EXIT_BREAK_INST, "Break Inst" }, \
+ { KVM_TRACE_EXIT_RESVD_INST, "Reserved Inst" }, \
+ { KVM_TRACE_EXIT_COP_UNUSABLE, "COP0/1 Unusable" }, \
+ { KVM_TRACE_EXIT_TRAP_INST, "Trap Inst" }, \
+ { KVM_TRACE_EXIT_MSA_FPE, "MSA FPE" }, \
+ { KVM_TRACE_EXIT_FPE, "FPE" }, \
+ { KVM_TRACE_EXIT_MSA_DISABLED, "MSA Disabled" }, \
+ { KVM_TRACE_EXIT_WAIT, "WAIT" }, \
+ { KVM_TRACE_EXIT_CACHE, "CACHE" }, \
+ { KVM_TRACE_EXIT_SIGNAL, "Signal" }
TRACE_EVENT(kvm_exit,
TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason),
@@ -34,10 +101,173 @@ TRACE_EVENT(kvm_exit,
),
TP_printk("[%s]PC: 0x%08lx",
- kvm_mips_exit_types_str[__entry->reason],
+ __print_symbolic(__entry->reason,
+ kvm_trace_symbol_exit_types),
__entry->pc)
);
+#define KVM_TRACE_MFC0 0
+#define KVM_TRACE_MTC0 1
+#define KVM_TRACE_DMFC0 2
+#define KVM_TRACE_DMTC0 3
+#define KVM_TRACE_RDHWR 4
+
+#define KVM_TRACE_HWR_COP0 0
+#define KVM_TRACE_HWR_HWR 1
+
+#define KVM_TRACE_COP0(REG, SEL) ((KVM_TRACE_HWR_COP0 << 8) | \
+ ((REG) << 3) | (SEL))
+#define KVM_TRACE_HWR(REG, SEL) ((KVM_TRACE_HWR_HWR << 8) | \
+ ((REG) << 3) | (SEL))
+
+#define kvm_trace_symbol_hwr_ops \
+ { KVM_TRACE_MFC0, "MFC0" }, \
+ { KVM_TRACE_MTC0, "MTC0" }, \
+ { KVM_TRACE_DMFC0, "DMFC0" }, \
+ { KVM_TRACE_DMTC0, "DMTC0" }, \
+ { KVM_TRACE_RDHWR, "RDHWR" }
+
+#define kvm_trace_symbol_hwr_cop \
+ { KVM_TRACE_HWR_COP0, "COP0" }, \
+ { KVM_TRACE_HWR_HWR, "HWR" }
+
+#define kvm_trace_symbol_hwr_regs \
+ { KVM_TRACE_COP0( 0, 0), "Index" }, \
+ { KVM_TRACE_COP0( 2, 0), "EntryLo0" }, \
+ { KVM_TRACE_COP0( 3, 0), "EntryLo1" }, \
+ { KVM_TRACE_COP0( 4, 0), "Context" }, \
+ { KVM_TRACE_COP0( 4, 2), "UserLocal" }, \
+ { KVM_TRACE_COP0( 5, 0), "PageMask" }, \
+ { KVM_TRACE_COP0( 6, 0), "Wired" }, \
+ { KVM_TRACE_COP0( 7, 0), "HWREna" }, \
+ { KVM_TRACE_COP0( 8, 0), "BadVAddr" }, \
+ { KVM_TRACE_COP0( 9, 0), "Count" }, \
+ { KVM_TRACE_COP0(10, 0), "EntryHi" }, \
+ { KVM_TRACE_COP0(11, 0), "Compare" }, \
+ { KVM_TRACE_COP0(12, 0), "Status" }, \
+ { KVM_TRACE_COP0(12, 1), "IntCtl" }, \
+ { KVM_TRACE_COP0(12, 2), "SRSCtl" }, \
+ { KVM_TRACE_COP0(13, 0), "Cause" }, \
+ { KVM_TRACE_COP0(14, 0), "EPC" }, \
+ { KVM_TRACE_COP0(15, 0), "PRId" }, \
+ { KVM_TRACE_COP0(15, 1), "EBase" }, \
+ { KVM_TRACE_COP0(16, 0), "Config" }, \
+ { KVM_TRACE_COP0(16, 1), "Config1" }, \
+ { KVM_TRACE_COP0(16, 2), "Config2" }, \
+ { KVM_TRACE_COP0(16, 3), "Config3" }, \
+ { KVM_TRACE_COP0(16, 4), "Config4" }, \
+ { KVM_TRACE_COP0(16, 5), "Config5" }, \
+ { KVM_TRACE_COP0(16, 7), "Config7" }, \
+ { KVM_TRACE_COP0(26, 0), "ECC" }, \
+ { KVM_TRACE_COP0(30, 0), "ErrorEPC" }, \
+ { KVM_TRACE_COP0(31, 2), "KScratch1" }, \
+ { KVM_TRACE_COP0(31, 3), "KScratch2" }, \
+ { KVM_TRACE_COP0(31, 4), "KScratch3" }, \
+ { KVM_TRACE_COP0(31, 5), "KScratch4" }, \
+ { KVM_TRACE_COP0(31, 6), "KScratch5" }, \
+ { KVM_TRACE_COP0(31, 7), "KScratch6" }, \
+ { KVM_TRACE_HWR( 0, 0), "CPUNum" }, \
+ { KVM_TRACE_HWR( 1, 0), "SYNCI_Step" }, \
+ { KVM_TRACE_HWR( 2, 0), "CC" }, \
+ { KVM_TRACE_HWR( 3, 0), "CCRes" }, \
+ { KVM_TRACE_HWR(29, 0), "ULR" }
+
+TRACE_EVENT(kvm_hwr,
+ TP_PROTO(struct kvm_vcpu *vcpu, unsigned int op, unsigned int reg,
+ unsigned long val),
+ TP_ARGS(vcpu, op, reg, val),
+ TP_STRUCT__entry(
+ __field(unsigned long, val)
+ __field(u16, reg)
+ __field(u8, op)
+ ),
+
+ TP_fast_assign(
+ __entry->val = val;
+ __entry->reg = reg;
+ __entry->op = op;
+ ),
+
+ TP_printk("%s %s (%s:%u:%u) 0x%08lx",
+ __print_symbolic(__entry->op,
+ kvm_trace_symbol_hwr_ops),
+ __print_symbolic(__entry->reg,
+ kvm_trace_symbol_hwr_regs),
+ __print_symbolic(__entry->reg >> 8,
+ kvm_trace_symbol_hwr_cop),
+ (__entry->reg >> 3) & 0x1f,
+ __entry->reg & 0x7,
+ __entry->val)
+);
+
+#define KVM_TRACE_AUX_RESTORE 0
+#define KVM_TRACE_AUX_SAVE 1
+#define KVM_TRACE_AUX_ENABLE 2
+#define KVM_TRACE_AUX_DISABLE 3
+#define KVM_TRACE_AUX_DISCARD 4
+
+#define KVM_TRACE_AUX_FPU 1
+#define KVM_TRACE_AUX_MSA 2
+#define KVM_TRACE_AUX_FPU_MSA 3
+
+#define kvm_trace_symbol_aux_op \
+ { KVM_TRACE_AUX_RESTORE, "restore" }, \
+ { KVM_TRACE_AUX_SAVE, "save" }, \
+ { KVM_TRACE_AUX_ENABLE, "enable" }, \
+ { KVM_TRACE_AUX_DISABLE, "disable" }, \
+ { KVM_TRACE_AUX_DISCARD, "discard" }
+
+#define kvm_trace_symbol_aux_state \
+ { KVM_TRACE_AUX_FPU, "FPU" }, \
+ { KVM_TRACE_AUX_MSA, "MSA" }, \
+ { KVM_TRACE_AUX_FPU_MSA, "FPU & MSA" }
+
+TRACE_EVENT(kvm_aux,
+ TP_PROTO(struct kvm_vcpu *vcpu, unsigned int op,
+ unsigned int state),
+ TP_ARGS(vcpu, op, state),
+ TP_STRUCT__entry(
+ __field(unsigned long, pc)
+ __field(u8, op)
+ __field(u8, state)
+ ),
+
+ TP_fast_assign(
+ __entry->pc = vcpu->arch.pc;
+ __entry->op = op;
+ __entry->state = state;
+ ),
+
+ TP_printk("%s %s PC: 0x%08lx",
+ __print_symbolic(__entry->op,
+ kvm_trace_symbol_aux_op),
+ __print_symbolic(__entry->state,
+ kvm_trace_symbol_aux_state),
+ __entry->pc)
+);
+
+TRACE_EVENT(kvm_asid_change,
+ TP_PROTO(struct kvm_vcpu *vcpu, unsigned int old_asid,
+ unsigned int new_asid),
+ TP_ARGS(vcpu, old_asid, new_asid),
+ TP_STRUCT__entry(
+ __field(unsigned long, pc)
+ __field(u8, old_asid)
+ __field(u8, new_asid)
+ ),
+
+ TP_fast_assign(
+ __entry->pc = vcpu->arch.pc;
+ __entry->old_asid = old_asid;
+ __entry->new_asid = new_asid;
+ ),
+
+ TP_printk("PC: 0x%08lx old: 0x%02x new: 0x%02x",
+ __entry->pc,
+ __entry->old_asid,
+ __entry->new_asid)
+);
+
#endif /* _TRACE_KVM_H */
/* This part must be outside protection */
diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c
index 6ba0fafcecbc..091553942bcb 100644
--- a/arch/mips/kvm/trap_emul.c
+++ b/arch/mips/kvm/trap_emul.c
@@ -21,7 +21,7 @@
static gpa_t kvm_trap_emul_gva_to_gpa_cb(gva_t gva)
{
gpa_t gpa;
- uint32_t kseg = KSEGX(gva);
+ gva_t kseg = KSEGX(gva);
if ((kseg == CKSEG0) || (kseg == CKSEG1))
gpa = CPHYSADDR(gva);
@@ -40,8 +40,8 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
struct kvm_run *run = vcpu->run;
- uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
- unsigned long cause = vcpu->arch.host_cp0_cause;
+ u32 __user *opc = (u32 __user *) vcpu->arch.pc;
+ u32 cause = vcpu->arch.host_cp0_cause;
enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
@@ -87,15 +87,15 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu)
static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
- uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+ u32 __user *opc = (u32 __user *) vcpu->arch.pc;
unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
- unsigned long cause = vcpu->arch.host_cp0_cause;
+ u32 cause = vcpu->arch.host_cp0_cause;
enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0
|| KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) {
- kvm_debug("USER/KSEG23 ADDR TLB MOD fault: cause %#lx, PC: %p, BadVaddr: %#lx\n",
+ kvm_debug("USER/KSEG23 ADDR TLB MOD fault: cause %#x, PC: %p, BadVaddr: %#lx\n",
cause, opc, badvaddr);
er = kvm_mips_handle_tlbmod(cause, opc, run, vcpu);
@@ -111,14 +111,14 @@ static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu)
* when we are not using HIGHMEM. Need to address this in a
* HIGHMEM kernel
*/
- kvm_err("TLB MOD fault not handled, cause %#lx, PC: %p, BadVaddr: %#lx\n",
+ kvm_err("TLB MOD fault not handled, cause %#x, PC: %p, BadVaddr: %#lx\n",
cause, opc, badvaddr);
kvm_mips_dump_host_tlbs();
kvm_arch_vcpu_dump_regs(vcpu);
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
ret = RESUME_HOST;
} else {
- kvm_err("Illegal TLB Mod fault address , cause %#lx, PC: %p, BadVaddr: %#lx\n",
+ kvm_err("Illegal TLB Mod fault address , cause %#x, PC: %p, BadVaddr: %#lx\n",
cause, opc, badvaddr);
kvm_mips_dump_host_tlbs();
kvm_arch_vcpu_dump_regs(vcpu);
@@ -128,59 +128,12 @@ static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu)
return ret;
}
-static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu)
-{
- struct kvm_run *run = vcpu->run;
- uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
- unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
- unsigned long cause = vcpu->arch.host_cp0_cause;
- enum emulation_result er = EMULATE_DONE;
- int ret = RESUME_GUEST;
-
- if (((badvaddr & PAGE_MASK) == KVM_GUEST_COMMPAGE_ADDR)
- && KVM_GUEST_KERNEL_MODE(vcpu)) {
- if (kvm_mips_handle_commpage_tlb_fault(badvaddr, vcpu) < 0) {
- run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- ret = RESUME_HOST;
- }
- } else if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0
- || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) {
- kvm_debug("USER ADDR TLB LD fault: cause %#lx, PC: %p, BadVaddr: %#lx\n",
- cause, opc, badvaddr);
- er = kvm_mips_handle_tlbmiss(cause, opc, run, vcpu);
- if (er == EMULATE_DONE)
- ret = RESUME_GUEST;
- else {
- run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- ret = RESUME_HOST;
- }
- } else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) {
- /*
- * All KSEG0 faults are handled by KVM, as the guest kernel does
- * not expect to ever get them
- */
- if (kvm_mips_handle_kseg0_tlb_fault
- (vcpu->arch.host_cp0_badvaddr, vcpu) < 0) {
- run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- ret = RESUME_HOST;
- }
- } else {
- kvm_err("Illegal TLB LD fault address , cause %#lx, PC: %p, BadVaddr: %#lx\n",
- cause, opc, badvaddr);
- kvm_mips_dump_host_tlbs();
- kvm_arch_vcpu_dump_regs(vcpu);
- run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- ret = RESUME_HOST;
- }
- return ret;
-}
-
-static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
+static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store)
{
struct kvm_run *run = vcpu->run;
- uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+ u32 __user *opc = (u32 __user *) vcpu->arch.pc;
unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
- unsigned long cause = vcpu->arch.host_cp0_cause;
+ u32 cause = vcpu->arch.host_cp0_cause;
enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
@@ -192,8 +145,8 @@ static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
}
} else if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0
|| KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) {
- kvm_debug("USER ADDR TLB ST fault: PC: %#lx, BadVaddr: %#lx\n",
- vcpu->arch.pc, badvaddr);
+ kvm_debug("USER ADDR TLB %s fault: cause %#x, PC: %p, BadVaddr: %#lx\n",
+ store ? "ST" : "LD", cause, opc, badvaddr);
/*
* User Address (UA) fault, this could happen if
@@ -213,14 +166,18 @@ static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
ret = RESUME_HOST;
}
} else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) {
+ /*
+ * All KSEG0 faults are handled by KVM, as the guest kernel does
+ * not expect to ever get them
+ */
if (kvm_mips_handle_kseg0_tlb_fault
(vcpu->arch.host_cp0_badvaddr, vcpu) < 0) {
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
ret = RESUME_HOST;
}
} else {
- kvm_err("Illegal TLB ST fault address , cause %#lx, PC: %p, BadVaddr: %#lx\n",
- cause, opc, badvaddr);
+ kvm_err("Illegal TLB %s fault address , cause %#x, PC: %p, BadVaddr: %#lx\n",
+ store ? "ST" : "LD", cause, opc, badvaddr);
kvm_mips_dump_host_tlbs();
kvm_arch_vcpu_dump_regs(vcpu);
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -229,12 +186,22 @@ static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
return ret;
}
+static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu)
+{
+ return kvm_trap_emul_handle_tlb_miss(vcpu, true);
+}
+
+static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
+{
+ return kvm_trap_emul_handle_tlb_miss(vcpu, false);
+}
+
static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
- uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+ u32 __user *opc = (u32 __user *) vcpu->arch.pc;
unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
- unsigned long cause = vcpu->arch.host_cp0_cause;
+ u32 cause = vcpu->arch.host_cp0_cause;
enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
@@ -251,7 +218,7 @@ static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu)
ret = RESUME_HOST;
}
} else {
- kvm_err("Address Error (STORE): cause %#lx, PC: %p, BadVaddr: %#lx\n",
+ kvm_err("Address Error (STORE): cause %#x, PC: %p, BadVaddr: %#lx\n",
cause, opc, badvaddr);
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
ret = RESUME_HOST;
@@ -262,9 +229,9 @@ static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu)
static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
- uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+ u32 __user *opc = (u32 __user *) vcpu->arch.pc;
unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
- unsigned long cause = vcpu->arch.host_cp0_cause;
+ u32 cause = vcpu->arch.host_cp0_cause;
enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
@@ -280,7 +247,7 @@ static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu)
ret = RESUME_HOST;
}
} else {
- kvm_err("Address Error (LOAD): cause %#lx, PC: %p, BadVaddr: %#lx\n",
+ kvm_err("Address Error (LOAD): cause %#x, PC: %p, BadVaddr: %#lx\n",
cause, opc, badvaddr);
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
ret = RESUME_HOST;
@@ -292,8 +259,8 @@ static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu)
static int kvm_trap_emul_handle_syscall(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
- uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
- unsigned long cause = vcpu->arch.host_cp0_cause;
+ u32 __user *opc = (u32 __user *) vcpu->arch.pc;
+ u32 cause = vcpu->arch.host_cp0_cause;
enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
@@ -310,8 +277,8 @@ static int kvm_trap_emul_handle_syscall(struct kvm_vcpu *vcpu)
static int kvm_trap_emul_handle_res_inst(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
- uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
- unsigned long cause = vcpu->arch.host_cp0_cause;
+ u32 __user *opc = (u32 __user *) vcpu->arch.pc;
+ u32 cause = vcpu->arch.host_cp0_cause;
enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
@@ -328,8 +295,8 @@ static int kvm_trap_emul_handle_res_inst(struct kvm_vcpu *vcpu)
static int kvm_trap_emul_handle_break(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
- uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
- unsigned long cause = vcpu->arch.host_cp0_cause;
+ u32 __user *opc = (u32 __user *) vcpu->arch.pc;
+ u32 cause = vcpu->arch.host_cp0_cause;
enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
@@ -346,8 +313,8 @@ static int kvm_trap_emul_handle_break(struct kvm_vcpu *vcpu)
static int kvm_trap_emul_handle_trap(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
- uint32_t __user *opc = (uint32_t __user *)vcpu->arch.pc;
- unsigned long cause = vcpu->arch.host_cp0_cause;
+ u32 __user *opc = (u32 __user *)vcpu->arch.pc;
+ u32 cause = vcpu->arch.host_cp0_cause;
enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
@@ -364,8 +331,8 @@ static int kvm_trap_emul_handle_trap(struct kvm_vcpu *vcpu)
static int kvm_trap_emul_handle_msa_fpe(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
- uint32_t __user *opc = (uint32_t __user *)vcpu->arch.pc;
- unsigned long cause = vcpu->arch.host_cp0_cause;
+ u32 __user *opc = (u32 __user *)vcpu->arch.pc;
+ u32 cause = vcpu->arch.host_cp0_cause;
enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
@@ -382,8 +349,8 @@ static int kvm_trap_emul_handle_msa_fpe(struct kvm_vcpu *vcpu)
static int kvm_trap_emul_handle_fpe(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
- uint32_t __user *opc = (uint32_t __user *)vcpu->arch.pc;
- unsigned long cause = vcpu->arch.host_cp0_cause;
+ u32 __user *opc = (u32 __user *)vcpu->arch.pc;
+ u32 cause = vcpu->arch.host_cp0_cause;
enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
@@ -407,8 +374,8 @@ static int kvm_trap_emul_handle_msa_disabled(struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
struct kvm_run *run = vcpu->run;
- uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
- unsigned long cause = vcpu->arch.host_cp0_cause;
+ u32 __user *opc = (u32 __user *) vcpu->arch.pc;
+ u32 cause = vcpu->arch.host_cp0_cause;
enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
@@ -451,24 +418,41 @@ static int kvm_trap_emul_vm_init(struct kvm *kvm)
static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu)
{
+ vcpu->arch.kscratch_enabled = 0xfc;
+
return 0;
}
static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
- uint32_t config1;
+ u32 config, config1;
int vcpu_id = vcpu->vcpu_id;
/*
* Arch specific stuff, set up config registers properly so that the
- * guest will come up as expected, for now we simulate a MIPS 24kc
+ * guest will come up as expected
*/
+#ifndef CONFIG_CPU_MIPSR6
+ /* r2-r5, simulate a MIPS 24kc */
kvm_write_c0_guest_prid(cop0, 0x00019300);
- /* Have config1, Cacheable, noncoherent, write-back, write allocate */
- kvm_write_c0_guest_config(cop0, MIPS_CONF_M | (0x3 << CP0C0_K0) |
- (0x1 << CP0C0_AR) |
- (MMU_TYPE_R4000 << CP0C0_MT));
+#else
+ /* r6+, simulate a generic QEMU machine */
+ kvm_write_c0_guest_prid(cop0, 0x00010000);
+#endif
+ /*
+ * Have config1, Cacheable, noncoherent, write-back, write allocate.
+ * Endianness, arch revision & virtually tagged icache should match
+ * host.
+ */
+ config = read_c0_config() & MIPS_CONF_AR;
+ config |= MIPS_CONF_M | CONF_CM_CACHABLE_NONCOHERENT | MIPS_CONF_MT_TLB;
+#ifdef CONFIG_CPU_BIG_ENDIAN
+ config |= CONF_BE;
+#endif
+ if (cpu_has_vtag_icache)
+ config |= MIPS_CONF_VI;
+ kvm_write_c0_guest_config(cop0, config);
/* Read the cache characteristics from the host Config1 Register */
config1 = (read_c0_config1() & ~0x7f);
@@ -478,9 +462,8 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
config1 |= ((KVM_MIPS_GUEST_TLB_SIZE - 1) << 25);
/* We unset some bits that we aren't emulating */
- config1 &=
- ~((1 << CP0C1_C2) | (1 << CP0C1_MD) | (1 << CP0C1_PC) |
- (1 << CP0C1_WR) | (1 << CP0C1_CA));
+ config1 &= ~(MIPS_CONF1_C2 | MIPS_CONF1_MD | MIPS_CONF1_PC |
+ MIPS_CONF1_WR | MIPS_CONF1_CA);
kvm_write_c0_guest_config1(cop0, config1);
/* Have config3, no tertiary/secondary caches implemented */
@@ -511,6 +494,17 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
return 0;
}
+static unsigned long kvm_trap_emul_num_regs(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
+static int kvm_trap_emul_copy_reg_indices(struct kvm_vcpu *vcpu,
+ u64 __user *indices)
+{
+ return 0;
+}
+
static int kvm_trap_emul_get_one_reg(struct kvm_vcpu *vcpu,
const struct kvm_one_reg *reg,
s64 *v)
@@ -660,6 +654,8 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = {
.dequeue_io_int = kvm_mips_dequeue_io_int_cb,
.irq_deliver = kvm_mips_irq_deliver_cb,
.irq_clear = kvm_mips_irq_clear_cb,
+ .num_regs = kvm_trap_emul_num_regs,
+ .copy_reg_indices = kvm_trap_emul_copy_reg_indices,
.get_one_reg = kvm_trap_emul_get_one_reg,
.set_one_reg = kvm_trap_emul_set_one_reg,
.vcpu_get_regs = kvm_trap_emul_vcpu_get_regs,
diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c
index d96e912b9d44..6dc07fba187f 100644
--- a/arch/mips/math-emu/cp1emu.c
+++ b/arch/mips/math-emu/cp1emu.c
@@ -627,8 +627,8 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn,
dec_insn.pc_inc +
dec_insn.next_pc_inc;
return 1;
- case cbcond0_op:
- case cbcond1_op:
+ case pop10_op:
+ case pop30_op:
if (!cpu_has_mips_r6)
break;
if (insn.i_format.rt && !insn.i_format.rs)
@@ -683,14 +683,14 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn,
dec_insn.next_pc_inc;
return 1;
- case beqzcjic_op:
+ case pop66_op:
if (!cpu_has_mips_r6)
break;
*contpc = regs->cp0_epc + dec_insn.pc_inc +
dec_insn.next_pc_inc;
return 1;
- case bnezcjialc_op:
+ case pop76_op:
if (!cpu_has_mips_r6)
break;
if (!insn.i_format.rs)
diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c
index ef7f925dd1b0..7a9c345e87e5 100644
--- a/arch/mips/mm/c-r4k.c
+++ b/arch/mips/mm/c-r4k.c
@@ -1206,7 +1206,7 @@ static void probe_pcache(void)
c->icache.linesz;
c->icache.waybit = __ffs(icache_size/c->icache.ways);
- if (config & 0x8) /* VI bit */
+ if (config & MIPS_CONF_VI)
c->icache.flags |= MIPS_CACHE_VTAG;
/*
diff --git a/arch/mips/mm/uasm-micromips.c b/arch/mips/mm/uasm-micromips.c
index d78178daea4b..277cf52d80e1 100644
--- a/arch/mips/mm/uasm-micromips.c
+++ b/arch/mips/mm/uasm-micromips.c
@@ -53,8 +53,13 @@ static struct insn insn_table_MM[] = {
{ insn_bltzl, 0, 0 },
{ insn_bne, M(mm_bne32_op, 0, 0, 0, 0, 0), RT | RS | BIMM },
{ insn_cache, M(mm_pool32b_op, 0, 0, mm_cache_func, 0, 0), RT | RS | SIMM },
+ { insn_cfc1, M(mm_pool32f_op, 0, 0, 0, mm_cfc1_op, mm_32f_73_op), RT | RS },
+ { insn_cfcmsa, M(mm_pool32s_op, 0, msa_cfc_op, 0, 0, mm_32s_elm_op), RD | RE },
+ { insn_ctc1, M(mm_pool32f_op, 0, 0, 0, mm_ctc1_op, mm_32f_73_op), RT | RS },
+ { insn_ctcmsa, M(mm_pool32s_op, 0, msa_ctc_op, 0, 0, mm_32s_elm_op), RD | RE },
{ insn_daddu, 0, 0 },
{ insn_daddiu, 0, 0 },
+ { insn_di, M(mm_pool32a_op, 0, 0, 0, mm_di_op, mm_pool32axf_op), RS },
{ insn_divu, M(mm_pool32a_op, 0, 0, 0, mm_divu_op, mm_pool32axf_op), RT | RS },
{ insn_dmfc0, 0, 0 },
{ insn_dmtc0, 0, 0 },
@@ -84,6 +89,8 @@ static struct insn insn_table_MM[] = {
{ insn_mfhi, M(mm_pool32a_op, 0, 0, 0, mm_mfhi32_op, mm_pool32axf_op), RS },
{ insn_mflo, M(mm_pool32a_op, 0, 0, 0, mm_mflo32_op, mm_pool32axf_op), RS },
{ insn_mtc0, M(mm_pool32a_op, 0, 0, 0, mm_mtc0_op, mm_pool32axf_op), RT | RS | RD },
+ { insn_mthi, M(mm_pool32a_op, 0, 0, 0, mm_mthi32_op, mm_pool32axf_op), RS },
+ { insn_mtlo, M(mm_pool32a_op, 0, 0, 0, mm_mtlo32_op, mm_pool32axf_op), RS },
{ insn_mul, M(mm_pool32a_op, 0, 0, 0, 0, mm_mul_op), RT | RS | RD },
{ insn_or, M(mm_pool32a_op, 0, 0, 0, 0, mm_or32_op), RT | RS | RD },
{ insn_ori, M(mm_ori32_op, 0, 0, 0, 0, 0), RT | RS | UIMM },
@@ -166,13 +173,15 @@ static void build_insn(u32 **buf, enum opcode opc, ...)
op = ip->match;
va_start(ap, opc);
if (ip->fields & RS) {
- if (opc == insn_mfc0 || opc == insn_mtc0)
+ if (opc == insn_mfc0 || opc == insn_mtc0 ||
+ opc == insn_cfc1 || opc == insn_ctc1)
op |= build_rt(va_arg(ap, u32));
else
op |= build_rs(va_arg(ap, u32));
}
if (ip->fields & RT) {
- if (opc == insn_mfc0 || opc == insn_mtc0)
+ if (opc == insn_mfc0 || opc == insn_mtc0 ||
+ opc == insn_cfc1 || opc == insn_ctc1)
op |= build_rs(va_arg(ap, u32));
else
op |= build_rt(va_arg(ap, u32));
diff --git a/arch/mips/mm/uasm-mips.c b/arch/mips/mm/uasm-mips.c
index 9c2220a45189..cec524167822 100644
--- a/arch/mips/mm/uasm-mips.c
+++ b/arch/mips/mm/uasm-mips.c
@@ -67,9 +67,14 @@ static struct insn insn_table[] = {
#else
{ insn_cache, M6(cache_op, 0, 0, 0, cache6_op), RS | RT | SIMM9 },
#endif
+ { insn_cfc1, M(cop1_op, cfc_op, 0, 0, 0, 0), RT | RD },
+ { insn_cfcmsa, M(msa_op, 0, msa_cfc_op, 0, 0, msa_elm_op), RD | RE },
+ { insn_ctc1, M(cop1_op, ctc_op, 0, 0, 0, 0), RT | RD },
+ { insn_ctcmsa, M(msa_op, 0, msa_ctc_op, 0, 0, msa_elm_op), RD | RE },
{ insn_daddiu, M(daddiu_op, 0, 0, 0, 0, 0), RS | RT | SIMM },
{ insn_daddu, M(spec_op, 0, 0, 0, 0, daddu_op), RS | RT | RD },
{ insn_dinsm, M(spec3_op, 0, 0, 0, 0, dinsm_op), RS | RT | RD | RE },
+ { insn_di, M(cop0_op, mfmc0_op, 0, 12, 0, 0), RT },
{ insn_dins, M(spec3_op, 0, 0, 0, 0, dins_op), RS | RT | RD | RE },
{ insn_divu, M(spec_op, 0, 0, 0, 0, divu_op), RS | RT },
{ insn_dmfc0, M(cop0_op, dmfc_op, 0, 0, 0, 0), RT | RD | SET},
@@ -114,7 +119,13 @@ static struct insn insn_table[] = {
{ insn_mflo, M(spec_op, 0, 0, 0, 0, mflo_op), RD },
{ insn_mtc0, M(cop0_op, mtc_op, 0, 0, 0, 0), RT | RD | SET},
{ insn_mthc0, M(cop0_op, mthc0_op, 0, 0, 0, 0), RT | RD | SET},
+ { insn_mthi, M(spec_op, 0, 0, 0, 0, mthi_op), RS },
+ { insn_mtlo, M(spec_op, 0, 0, 0, 0, mtlo_op), RS },
+#ifndef CONFIG_CPU_MIPSR6
{ insn_mul, M(spec2_op, 0, 0, 0, 0, mul_op), RS | RT | RD},
+#else
+ { insn_mul, M(spec_op, 0, 0, 0, mult_mul_op, mult_op), RS | RT | RD},
+#endif
{ insn_ori, M(ori_op, 0, 0, 0, 0, 0), RS | RT | UIMM },
{ insn_or, M(spec_op, 0, 0, 0, 0, or_op), RS | RT | RD },
#ifndef CONFIG_CPU_MIPSR6
diff --git a/arch/mips/mm/uasm.c b/arch/mips/mm/uasm.c
index ad718debc35a..3e0282d301d6 100644
--- a/arch/mips/mm/uasm.c
+++ b/arch/mips/mm/uasm.c
@@ -49,18 +49,19 @@ enum opcode {
insn_invalid,
insn_addiu, insn_addu, insn_and, insn_andi, insn_bbit0, insn_bbit1,
insn_beq, insn_beql, insn_bgez, insn_bgezl, insn_bltz, insn_bltzl,
- insn_bne, insn_cache, insn_daddiu, insn_daddu, insn_dins, insn_dinsm,
- insn_divu, insn_dmfc0, insn_dmtc0, insn_drotr, insn_drotr32, insn_dsll,
+ insn_bne, insn_cache, insn_cfc1, insn_cfcmsa, insn_ctc1, insn_ctcmsa,
+ insn_daddiu, insn_daddu, insn_di, insn_dins, insn_dinsm, insn_divu,
+ insn_dmfc0, insn_dmtc0, insn_drotr, insn_drotr32, insn_dsll,
insn_dsll32, insn_dsra, insn_dsrl, insn_dsrl32, insn_dsubu, insn_eret,
insn_ext, insn_ins, insn_j, insn_jal, insn_jalr, insn_jr, insn_lb,
insn_ld, insn_ldx, insn_lh, insn_ll, insn_lld, insn_lui, insn_lw,
insn_lwx, insn_mfc0, insn_mfhc0, insn_mfhi, insn_mflo, insn_mtc0,
- insn_mthc0, insn_mul, insn_or, insn_ori, insn_pref, insn_rfe,
- insn_rotr, insn_sc, insn_scd, insn_sd, insn_sll, insn_sllv, insn_slt,
- insn_sltiu, insn_sltu, insn_sra, insn_srl, insn_srlv, insn_subu,
- insn_sw, insn_sync, insn_syscall, insn_tlbp, insn_tlbr, insn_tlbwi,
- insn_tlbwr, insn_wait, insn_wsbh, insn_xor, insn_xori, insn_yield,
- insn_lddir, insn_ldpte,
+ insn_mthc0, insn_mthi, insn_mtlo, insn_mul, insn_or, insn_ori,
+ insn_pref, insn_rfe, insn_rotr, insn_sc, insn_scd, insn_sd, insn_sll,
+ insn_sllv, insn_slt, insn_sltiu, insn_sltu, insn_sra, insn_srl,
+ insn_srlv, insn_subu, insn_sw, insn_sync, insn_syscall, insn_tlbp,
+ insn_tlbr, insn_tlbwi, insn_tlbwr, insn_wait, insn_wsbh, insn_xor,
+ insn_xori, insn_yield, insn_lddir, insn_ldpte,
};
struct insn {
@@ -268,10 +269,15 @@ I_u1s2(_bltz)
I_u1s2(_bltzl)
I_u1u2s3(_bne)
I_u2s3u1(_cache)
+I_u1u2(_cfc1)
+I_u2u1(_cfcmsa)
+I_u1u2(_ctc1)
+I_u2u1(_ctcmsa)
I_u1u2u3(_dmfc0)
I_u1u2u3(_dmtc0)
I_u2u1s3(_daddiu)
I_u3u1u2(_daddu)
+I_u1(_di);
I_u1u2(_divu)
I_u2u1u3(_dsll)
I_u2u1u3(_dsll32)
@@ -301,6 +307,8 @@ I_u1(_mfhi)
I_u1(_mflo)
I_u1u2u3(_mtc0)
I_u1u2u3(_mthc0)
+I_u1(_mthi)
+I_u1(_mtlo)
I_u3u1u2(_mul)
I_u2u1u3(_ori)
I_u3u1u2(_or)
diff --git a/arch/powerpc/include/asm/hmi.h b/arch/powerpc/include/asm/hmi.h
new file mode 100644
index 000000000000..88b4901ac4ee
--- /dev/null
+++ b/arch/powerpc/include/asm/hmi.h
@@ -0,0 +1,45 @@
+/*
+ * Hypervisor Maintenance Interrupt header file.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.
+ *
+ * Copyright 2015 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#ifndef __ASM_PPC64_HMI_H__
+#define __ASM_PPC64_HMI_H__
+
+#ifdef CONFIG_PPC_BOOK3S_64
+
+#define CORE_TB_RESYNC_REQ_BIT 63
+#define MAX_SUBCORE_PER_CORE 4
+
+/*
+ * sibling_subcore_state structure is used to co-ordinate all threads
+ * during HMI to avoid TB corruption. This structure is allocated once
+ * per each core and shared by all threads on that core.
+ */
+struct sibling_subcore_state {
+ unsigned long flags;
+ u8 in_guest[MAX_SUBCORE_PER_CORE];
+};
+
+extern void wait_for_subcore_guest_exit(void);
+extern void wait_for_tb_resync(void);
+#else
+static inline void wait_for_subcore_guest_exit(void) { }
+static inline void wait_for_tb_resync(void) { }
+#endif
+#endif /* __ASM_PPC64_HMI_H__ */
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index ad171e979ab0..148303e7771f 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -26,6 +26,7 @@
#include <asm/kvm_book3s_asm.h>
#endif
#include <asm/accounting.h>
+#include <asm/hmi.h>
register struct paca_struct *local_paca asm("r13");
@@ -182,6 +183,11 @@ struct paca_struct {
*/
u16 in_mce;
u8 hmi_event_available; /* HMI event is available */
+ /*
+ * Bitmap for sibling subcore status. See kvm/book3s_hv_ras.c for
+ * more details
+ */
+ struct sibling_subcore_state *sibling_subcore_state;
#endif
/* Stuff for accurate time accounting */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index fe4c075bcf50..b2027a5cf508 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -41,7 +41,7 @@ obj-$(CONFIG_VDSO32) += vdso32/
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o
obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o
-obj-$(CONFIG_PPC_BOOK3S_64) += mce.o mce_power.o
+obj-$(CONFIG_PPC_BOOK3S_64) += mce.o mce_power.o hmi.o
obj-$(CONFIG_PPC_BOOK3E_64) += exceptions-64e.o idle_book3e.o
obj-$(CONFIG_PPC64) += vdso64/
obj-$(CONFIG_ALTIVEC) += vecemu.o
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 6200e4925d26..694def6c9d61 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -671,6 +671,8 @@ BEGIN_FTR_SECTION
beq h_doorbell_common
cmpwi r3,0xea0
beq h_virt_irq_common
+ cmpwi r3,0xe60
+ beq hmi_exception_common
FTR_SECTION_ELSE
cmpwi r3,0xa00
beq doorbell_super_common
@@ -1172,7 +1174,7 @@ fwnmi_data_area:
.globl hmi_exception_early
hmi_exception_early:
- EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0xe60)
+ EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST, 0xe62)
mr r10,r1 /* Save r1 */
ld r1,PACAEMERGSP(r13) /* Use emergency stack */
subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */
diff --git a/arch/powerpc/kernel/hmi.c b/arch/powerpc/kernel/hmi.c
new file mode 100644
index 000000000000..e3f738eb1cac
--- /dev/null
+++ b/arch/powerpc/kernel/hmi.c
@@ -0,0 +1,56 @@
+/*
+ * Hypervisor Maintenance Interrupt (HMI) handling.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.
+ *
+ * Copyright 2015 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <asm/paca.h>
+#include <asm/hmi.h>
+
+void wait_for_subcore_guest_exit(void)
+{
+ int i;
+
+ /*
+ * NULL bitmap pointer indicates that KVM module hasn't
+ * been loaded yet and hence no guests are running.
+ * If no KVM is in use, no need to co-ordinate among threads
+ * as all of them will always be in host and no one is going
+ * to modify TB other than the opal hmi handler.
+ * Hence, just return from here.
+ */
+ if (!local_paca->sibling_subcore_state)
+ return;
+
+ for (i = 0; i < MAX_SUBCORE_PER_CORE; i++)
+ while (local_paca->sibling_subcore_state->in_guest[i])
+ cpu_relax();
+}
+
+void wait_for_tb_resync(void)
+{
+ if (!local_paca->sibling_subcore_state)
+ return;
+
+ while (test_bit(CORE_TB_RESYNC_REQ_BIT,
+ &local_paca->sibling_subcore_state->flags))
+ cpu_relax();
+}
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 335eb6cedae5..8a56a51fc0cb 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -336,7 +336,9 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \
ld r2,PACATOC(r13); \
ld r1,PACAR1(r13); \
std r3,ORIG_GPR3(r1); /* Save original r3 */ \
- bl opal_rm_handle_hmi; \
+ li r3,0; /* NULL argument */ \
+ bl hmi_exception_realmode; \
+ nop; \
ld r3,ORIG_GPR3(r1); /* Restore original r3 */ \
20: nop;
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index f7e2f2e318bd..2cb589264cb7 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -61,6 +61,7 @@
#include <asm/tm.h>
#include <asm/debug.h>
#include <asm/asm-prototypes.h>
+#include <asm/hmi.h>
#include <sysdev/fsl_pci.h>
#if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
@@ -308,9 +309,13 @@ long hmi_exception_realmode(struct pt_regs *regs)
{
__this_cpu_inc(irq_stat.hmi_exceptions);
+ wait_for_subcore_guest_exit();
+
if (ppc_md.hmi_exception_early)
ppc_md.hmi_exception_early(regs);
+ wait_for_tb_resync();
+
return 0;
}
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index e20beae5ca7a..2fd5580c8f6e 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -52,6 +52,7 @@
#include <asm/switch_to.h>
#include <asm/smp.h>
#include <asm/dbell.h>
+#include <asm/hmi.h>
#include <linux/gfp.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
@@ -2522,7 +2523,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list)
spin_unlock(&pvc->lock);
- kvm_guest_enter();
+ guest_enter();
srcu_idx = srcu_read_lock(&vc->kvm->srcu);
@@ -2570,7 +2571,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
/* make sure updates to secondary vcpu structs are visible now */
smp_mb();
- kvm_guest_exit();
+ guest_exit();
for (sub = 0; sub < core_info.n_subcores; ++sub)
list_for_each_entry_safe(pvc, vcnext, &core_info.vcs[sub],
@@ -3401,6 +3402,38 @@ static struct kvmppc_ops kvm_ops_hv = {
.hcall_implemented = kvmppc_hcall_impl_hv,
};
+static int kvm_init_subcore_bitmap(void)
+{
+ int i, j;
+ int nr_cores = cpu_nr_cores();
+ struct sibling_subcore_state *sibling_subcore_state;
+
+ for (i = 0; i < nr_cores; i++) {
+ int first_cpu = i * threads_per_core;
+ int node = cpu_to_node(first_cpu);
+
+ /* Ignore if it is already allocated. */
+ if (paca[first_cpu].sibling_subcore_state)
+ continue;
+
+ sibling_subcore_state =
+ kmalloc_node(sizeof(struct sibling_subcore_state),
+ GFP_KERNEL, node);
+ if (!sibling_subcore_state)
+ return -ENOMEM;
+
+ memset(sibling_subcore_state, 0,
+ sizeof(struct sibling_subcore_state));
+
+ for (j = 0; j < threads_per_core; j++) {
+ int cpu = first_cpu + j;
+
+ paca[cpu].sibling_subcore_state = sibling_subcore_state;
+ }
+ }
+ return 0;
+}
+
static int kvmppc_book3s_init_hv(void)
{
int r;
@@ -3411,6 +3444,10 @@ static int kvmppc_book3s_init_hv(void)
if (r < 0)
return -ENODEV;
+ r = kvm_init_subcore_bitmap();
+ if (r)
+ return r;
+
kvm_ops_hv.owner = THIS_MODULE;
kvmppc_hv_ops = &kvm_ops_hv;
diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c
index 93b5f5c9b445..0fa70a9618d7 100644
--- a/arch/powerpc/kvm/book3s_hv_ras.c
+++ b/arch/powerpc/kvm/book3s_hv_ras.c
@@ -13,6 +13,9 @@
#include <linux/kernel.h>
#include <asm/opal.h>
#include <asm/mce.h>
+#include <asm/machdep.h>
+#include <asm/cputhreads.h>
+#include <asm/hmi.h>
/* SRR1 bits for machine check on POWER7 */
#define SRR1_MC_LDSTERR (1ul << (63-42))
@@ -140,3 +143,176 @@ long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu)
{
return kvmppc_realmode_mc_power7(vcpu);
}
+
+/* Check if dynamic split is in force and return subcore size accordingly. */
+static inline int kvmppc_cur_subcore_size(void)
+{
+ if (local_paca->kvm_hstate.kvm_split_mode)
+ return local_paca->kvm_hstate.kvm_split_mode->subcore_size;
+
+ return threads_per_subcore;
+}
+
+void kvmppc_subcore_enter_guest(void)
+{
+ int thread_id, subcore_id;
+
+ thread_id = cpu_thread_in_core(local_paca->paca_index);
+ subcore_id = thread_id / kvmppc_cur_subcore_size();
+
+ local_paca->sibling_subcore_state->in_guest[subcore_id] = 1;
+}
+
+void kvmppc_subcore_exit_guest(void)
+{
+ int thread_id, subcore_id;
+
+ thread_id = cpu_thread_in_core(local_paca->paca_index);
+ subcore_id = thread_id / kvmppc_cur_subcore_size();
+
+ local_paca->sibling_subcore_state->in_guest[subcore_id] = 0;
+}
+
+static bool kvmppc_tb_resync_required(void)
+{
+ if (test_and_set_bit(CORE_TB_RESYNC_REQ_BIT,
+ &local_paca->sibling_subcore_state->flags))
+ return false;
+
+ return true;
+}
+
+static void kvmppc_tb_resync_done(void)
+{
+ clear_bit(CORE_TB_RESYNC_REQ_BIT,
+ &local_paca->sibling_subcore_state->flags);
+}
+
+/*
+ * kvmppc_realmode_hmi_handler() is called only by primary thread during
+ * guest exit path.
+ *
+ * There are multiple reasons why HMI could occur, one of them is
+ * Timebase (TB) error. If this HMI is due to TB error, then TB would
+ * have been in stopped state. The opal hmi handler Will fix it and
+ * restore the TB value with host timebase value. For HMI caused due
+ * to non-TB errors, opal hmi handler will not touch/restore TB register
+ * and hence there won't be any change in TB value.
+ *
+ * Since we are not sure about the cause of this HMI, we can't be sure
+ * about the content of TB register whether it holds guest or host timebase
+ * value. Hence the idea is to resync the TB on every HMI, so that we
+ * know about the exact state of the TB value. Resync TB call will
+ * restore TB to host timebase.
+ *
+ * Things to consider:
+ * - On TB error, HMI interrupt is reported on all the threads of the core
+ * that has encountered TB error irrespective of split-core mode.
+ * - The very first thread on the core that get chance to fix TB error
+ * would rsync the TB with local chipTOD value.
+ * - The resync TB is a core level action i.e. it will sync all the TBs
+ * in that core independent of split-core mode. This means if we trigger
+ * TB sync from a thread from one subcore, it would affect TB values of
+ * sibling subcores of the same core.
+ *
+ * All threads need to co-ordinate before making opal hmi handler.
+ * All threads will use sibling_subcore_state->in_guest[] (shared by all
+ * threads in the core) in paca which holds information about whether
+ * sibling subcores are in Guest mode or host mode. The in_guest[] array
+ * is of size MAX_SUBCORE_PER_CORE=4, indexed using subcore id to set/unset
+ * subcore status. Only primary threads from each subcore is responsible
+ * to set/unset its designated array element while entering/exiting the
+ * guset.
+ *
+ * After invoking opal hmi handler call, one of the thread (of entire core)
+ * will need to resync the TB. Bit 63 from subcore state bitmap flags
+ * (sibling_subcore_state->flags) will be used to co-ordinate between
+ * primary threads to decide who takes up the responsibility.
+ *
+ * This is what we do:
+ * - Primary thread from each subcore tries to set resync required bit[63]
+ * of paca->sibling_subcore_state->flags.
+ * - The first primary thread that is able to set the flag takes the
+ * responsibility of TB resync. (Let us call it as thread leader)
+ * - All other threads which are in host will call
+ * wait_for_subcore_guest_exit() and wait for in_guest[0-3] from
+ * paca->sibling_subcore_state to get cleared.
+ * - All the primary thread will clear its subcore status from subcore
+ * state in_guest[] array respectively.
+ * - Once all primary threads clear in_guest[0-3], all of them will invoke
+ * opal hmi handler.
+ * - Now all threads will wait for TB resync to complete by invoking
+ * wait_for_tb_resync() except the thread leader.
+ * - Thread leader will do a TB resync by invoking opal_resync_timebase()
+ * call and the it will clear the resync required bit.
+ * - All other threads will now come out of resync wait loop and proceed
+ * with individual execution.
+ * - On return of this function, primary thread will signal all
+ * secondary threads to proceed.
+ * - All secondary threads will eventually call opal hmi handler on
+ * their exit path.
+ */
+
+long kvmppc_realmode_hmi_handler(void)
+{
+ int ptid = local_paca->kvm_hstate.ptid;
+ bool resync_req;
+
+ /* This is only called on primary thread. */
+ BUG_ON(ptid != 0);
+ __this_cpu_inc(irq_stat.hmi_exceptions);
+
+ /*
+ * By now primary thread has already completed guest->host
+ * partition switch but haven't signaled secondaries yet.
+ * All the secondary threads on this subcore is waiting
+ * for primary thread to signal them to go ahead.
+ *
+ * For threads from subcore which isn't in guest, they all will
+ * wait until all other subcores on this core exit the guest.
+ *
+ * Now set the resync required bit. If you are the first to
+ * set this bit then kvmppc_tb_resync_required() function will
+ * return true. For rest all other subcores
+ * kvmppc_tb_resync_required() will return false.
+ *
+ * If resync_req == true, then this thread is responsible to
+ * initiate TB resync after hmi handler has completed.
+ * All other threads on this core will wait until this thread
+ * clears the resync required bit flag.
+ */
+ resync_req = kvmppc_tb_resync_required();
+
+ /* Reset the subcore status to indicate it has exited guest */
+ kvmppc_subcore_exit_guest();
+
+ /*
+ * Wait for other subcores on this core to exit the guest.
+ * All the primary threads and threads from subcore that are
+ * not in guest will wait here until all subcores are out
+ * of guest context.
+ */
+ wait_for_subcore_guest_exit();
+
+ /*
+ * At this point we are sure that primary threads from each
+ * subcore on this core have completed guest->host partition
+ * switch. Now it is safe to call HMI handler.
+ */
+ if (ppc_md.hmi_exception_early)
+ ppc_md.hmi_exception_early(NULL);
+
+ /*
+ * Check if this thread is responsible to resync TB.
+ * All other threads will wait until this thread completes the
+ * TB resync.
+ */
+ if (resync_req) {
+ opal_resync_timebase();
+ /* Reset TB resync req bit */
+ kvmppc_tb_resync_done();
+ } else {
+ wait_for_tb_resync();
+ }
+ return 0;
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 86f0cae37a85..975655573844 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -29,6 +29,7 @@
#include <asm/kvm_book3s_asm.h>
#include <asm/book3s/64/mmu-hash.h>
#include <asm/tm.h>
+#include <asm/opal.h>
#define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM)
@@ -373,6 +374,18 @@ kvm_secondary_got_guest:
lwsync
std r0, HSTATE_KVM_VCORE(r13)
+ /*
+ * All secondaries exiting guest will fall through this path.
+ * Before proceeding, just check for HMI interrupt and
+ * invoke opal hmi handler. By now we are sure that the
+ * primary thread on this core/subcore has already made partition
+ * switch/TB resync and we are good to call opal hmi handler.
+ */
+ cmpwi r12, BOOK3S_INTERRUPT_HMI
+ bne kvm_no_guest
+
+ li r3,0 /* NULL argument */
+ bl hmi_exception_realmode
/*
* At this point we have finished executing in the guest.
* We need to wait for hwthread_req to become zero, since
@@ -428,6 +441,22 @@ kvm_no_guest:
*/
kvm_unsplit_nap:
/*
+ * When secondaries are napping in kvm_unsplit_nap() with
+ * hwthread_req = 1, HMI goes ignored even though subcores are
+ * already exited the guest. Hence HMI keeps waking up secondaries
+ * from nap in a loop and secondaries always go back to nap since
+ * no vcore is assigned to them. This makes impossible for primary
+ * thread to get hold of secondary threads resulting into a soft
+ * lockup in KVM path.
+ *
+ * Let us check if HMI is pending and handle it before we go to nap.
+ */
+ cmpwi r12, BOOK3S_INTERRUPT_HMI
+ bne 55f
+ li r3, 0 /* NULL argument */
+ bl hmi_exception_realmode
+55:
+ /*
* Ensure that secondary doesn't nap when it has
* its vcore pointer set.
*/
@@ -601,6 +630,11 @@ BEGIN_FTR_SECTION
mtspr SPRN_DPDES, r8
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+ /* Mark the subcore state as inside guest */
+ bl kvmppc_subcore_enter_guest
+ nop
+ ld r5, HSTATE_KVM_VCORE(r13)
+ ld r4, HSTATE_KVM_VCPU(r13)
li r0,1
stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */
@@ -655,112 +689,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
BEGIN_FTR_SECTION
- b skip_tm
-END_FTR_SECTION_IFCLR(CPU_FTR_TM)
-
- /* Turn on TM/FP/VSX/VMX so we can restore them. */
- mfmsr r5
- li r6, MSR_TM >> 32
- sldi r6, r6, 32
- or r5, r5, r6
- ori r5, r5, MSR_FP
- oris r5, r5, (MSR_VEC | MSR_VSX)@h
- mtmsrd r5
-
- /*
- * The user may change these outside of a transaction, so they must
- * always be context switched.
- */
- ld r5, VCPU_TFHAR(r4)
- ld r6, VCPU_TFIAR(r4)
- ld r7, VCPU_TEXASR(r4)
- mtspr SPRN_TFHAR, r5
- mtspr SPRN_TFIAR, r6
- mtspr SPRN_TEXASR, r7
-
- ld r5, VCPU_MSR(r4)
- rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
- beq skip_tm /* TM not active in guest */
-
- /* Make sure the failure summary is set, otherwise we'll program check
- * when we trechkpt. It's possible that this might have been not set
- * on a kvmppc_set_one_reg() call but we shouldn't let this crash the
- * host.
- */
- oris r7, r7, (TEXASR_FS)@h
- mtspr SPRN_TEXASR, r7
-
- /*
- * We need to load up the checkpointed state for the guest.
- * We need to do this early as it will blow away any GPRs, VSRs and
- * some SPRs.
- */
-
- mr r31, r4
- addi r3, r31, VCPU_FPRS_TM
- bl load_fp_state
- addi r3, r31, VCPU_VRS_TM
- bl load_vr_state
- mr r4, r31
- lwz r7, VCPU_VRSAVE_TM(r4)
- mtspr SPRN_VRSAVE, r7
-
- ld r5, VCPU_LR_TM(r4)
- lwz r6, VCPU_CR_TM(r4)
- ld r7, VCPU_CTR_TM(r4)
- ld r8, VCPU_AMR_TM(r4)
- ld r9, VCPU_TAR_TM(r4)
- mtlr r5
- mtcr r6
- mtctr r7
- mtspr SPRN_AMR, r8
- mtspr SPRN_TAR, r9
-
- /*
- * Load up PPR and DSCR values but don't put them in the actual SPRs
- * till the last moment to avoid running with userspace PPR and DSCR for
- * too long.
- */
- ld r29, VCPU_DSCR_TM(r4)
- ld r30, VCPU_PPR_TM(r4)
-
- std r2, PACATMSCRATCH(r13) /* Save TOC */
-
- /* Clear the MSR RI since r1, r13 are all going to be foobar. */
- li r5, 0
- mtmsrd r5, 1
-
- /* Load GPRs r0-r28 */
- reg = 0
- .rept 29
- ld reg, VCPU_GPRS_TM(reg)(r31)
- reg = reg + 1
- .endr
-
- mtspr SPRN_DSCR, r29
- mtspr SPRN_PPR, r30
-
- /* Load final GPRs */
- ld 29, VCPU_GPRS_TM(29)(r31)
- ld 30, VCPU_GPRS_TM(30)(r31)
- ld 31, VCPU_GPRS_TM(31)(r31)
-
- /* TM checkpointed state is now setup. All GPRs are now volatile. */
- TRECHKPT
-
- /* Now let's get back the state we need. */
- HMT_MEDIUM
- GET_PACA(r13)
- ld r29, HSTATE_DSCR(r13)
- mtspr SPRN_DSCR, r29
- ld r4, HSTATE_KVM_VCPU(r13)
- ld r1, HSTATE_HOST_R1(r13)
- ld r2, PACATMSCRATCH(r13)
-
- /* Set the MSR RI since we have our registers back. */
- li r5, MSR_RI
- mtmsrd r5, 1
-skip_tm:
+ bl kvmppc_restore_tm
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
#endif
/* Load guest PMU registers */
@@ -841,12 +771,6 @@ BEGIN_FTR_SECTION
/* Skip next section on POWER7 */
b 8f
END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
- /* Turn on TM so we can access TFHAR/TFIAR/TEXASR */
- mfmsr r8
- li r0, 1
- rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG
- mtmsrd r8
-
/* Load up POWER8-specific registers */
ld r5, VCPU_IAMR(r4)
lwz r6, VCPU_PSPB(r4)
@@ -1436,106 +1360,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
BEGIN_FTR_SECTION
- b 2f
-END_FTR_SECTION_IFCLR(CPU_FTR_TM)
- /* Turn on TM. */
- mfmsr r8
- li r0, 1
- rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG
- mtmsrd r8
-
- ld r5, VCPU_MSR(r9)
- rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
- beq 1f /* TM not active in guest. */
-
- li r3, TM_CAUSE_KVM_RESCHED
-
- /* Clear the MSR RI since r1, r13 are all going to be foobar. */
- li r5, 0
- mtmsrd r5, 1
-
- /* All GPRs are volatile at this point. */
- TRECLAIM(R3)
-
- /* Temporarily store r13 and r9 so we have some regs to play with */
- SET_SCRATCH0(r13)
- GET_PACA(r13)
- std r9, PACATMSCRATCH(r13)
- ld r9, HSTATE_KVM_VCPU(r13)
-
- /* Get a few more GPRs free. */
- std r29, VCPU_GPRS_TM(29)(r9)
- std r30, VCPU_GPRS_TM(30)(r9)
- std r31, VCPU_GPRS_TM(31)(r9)
-
- /* Save away PPR and DSCR soon so don't run with user values. */
- mfspr r31, SPRN_PPR
- HMT_MEDIUM
- mfspr r30, SPRN_DSCR
- ld r29, HSTATE_DSCR(r13)
- mtspr SPRN_DSCR, r29
-
- /* Save all but r9, r13 & r29-r31 */
- reg = 0
- .rept 29
- .if (reg != 9) && (reg != 13)
- std reg, VCPU_GPRS_TM(reg)(r9)
- .endif
- reg = reg + 1
- .endr
- /* ... now save r13 */
- GET_SCRATCH0(r4)
- std r4, VCPU_GPRS_TM(13)(r9)
- /* ... and save r9 */
- ld r4, PACATMSCRATCH(r13)
- std r4, VCPU_GPRS_TM(9)(r9)
-
- /* Reload stack pointer and TOC. */
- ld r1, HSTATE_HOST_R1(r13)
- ld r2, PACATOC(r13)
-
- /* Set MSR RI now we have r1 and r13 back. */
- li r5, MSR_RI
- mtmsrd r5, 1
-
- /* Save away checkpinted SPRs. */
- std r31, VCPU_PPR_TM(r9)
- std r30, VCPU_DSCR_TM(r9)
- mflr r5
- mfcr r6
- mfctr r7
- mfspr r8, SPRN_AMR
- mfspr r10, SPRN_TAR
- std r5, VCPU_LR_TM(r9)
- stw r6, VCPU_CR_TM(r9)
- std r7, VCPU_CTR_TM(r9)
- std r8, VCPU_AMR_TM(r9)
- std r10, VCPU_TAR_TM(r9)
-
- /* Restore r12 as trap number. */
- lwz r12, VCPU_TRAP(r9)
-
- /* Save FP/VSX. */
- addi r3, r9, VCPU_FPRS_TM
- bl store_fp_state
- addi r3, r9, VCPU_VRS_TM
- bl store_vr_state
- mfspr r6, SPRN_VRSAVE
- stw r6, VCPU_VRSAVE_TM(r9)
-1:
- /*
- * We need to save these SPRs after the treclaim so that the software
- * error code is recorded correctly in the TEXASR. Also the user may
- * change these outside of a transaction, so they must always be
- * context switched.
- */
- mfspr r5, SPRN_TFHAR
- mfspr r6, SPRN_TFIAR
- mfspr r7, SPRN_TEXASR
- std r5, VCPU_TFHAR(r9)
- std r6, VCPU_TFIAR(r9)
- std r7, VCPU_TEXASR(r9)
-2:
+ bl kvmppc_save_tm
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
#endif
/* Increment yield count if they have a VPA */
@@ -1683,6 +1509,23 @@ BEGIN_FTR_SECTION
mtspr SPRN_DPDES, r8
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+ /* If HMI, call kvmppc_realmode_hmi_handler() */
+ cmpwi r12, BOOK3S_INTERRUPT_HMI
+ bne 27f
+ bl kvmppc_realmode_hmi_handler
+ nop
+ li r12, BOOK3S_INTERRUPT_HMI
+ /*
+ * At this point kvmppc_realmode_hmi_handler would have resync-ed
+ * the TB. Hence it is not required to subtract guest timebase
+ * offset from timebase. So, skip it.
+ *
+ * Also, do not call kvmppc_subcore_exit_guest() because it has
+ * been invoked as part of kvmppc_realmode_hmi_handler().
+ */
+ b 30f
+
+27:
/* Subtract timebase offset from timebase */
ld r8,VCORE_TB_OFFSET(r5)
cmpdi r8,0
@@ -1698,8 +1541,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
addis r8,r8,0x100 /* if so, increment upper 40 bits */
mtspr SPRN_TBU40,r8
+17: bl kvmppc_subcore_exit_guest
+ nop
+30: ld r5,HSTATE_KVM_VCORE(r13)
+ ld r4,VCORE_KVM(r5) /* pointer to struct kvm */
+
/* Reset PCR */
-17: ld r0, VCORE_PCR(r5)
+ ld r0, VCORE_PCR(r5)
cmpdi r0, 0
beq 18f
li r0, 0
@@ -2245,6 +2093,13 @@ _GLOBAL(kvmppc_h_cede) /* r3 = vcpu pointer, r11 = msr, r13 = paca */
/* save FP state */
bl kvmppc_save_fp
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+BEGIN_FTR_SECTION
+ ld r9, HSTATE_KVM_VCPU(r13)
+ bl kvmppc_save_tm
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
+#endif
+
/*
* Set DEC to the smaller of DEC and HDEC, so that we wake
* no later than the end of our timeslice (HDEC interrupts
@@ -2321,6 +2176,12 @@ kvm_end_cede:
bl kvmhv_accumulate_time
#endif
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+BEGIN_FTR_SECTION
+ bl kvmppc_restore_tm
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
+#endif
+
/* load up FP state */
bl kvmppc_load_fp
@@ -2461,6 +2322,8 @@ BEGIN_FTR_SECTION
cmpwi r6, 3 /* hypervisor doorbell? */
beq 3f
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+ cmpwi r6, 0xa /* Hypervisor maintenance ? */
+ beq 4f
li r3, 1 /* anything else, return 1 */
0: blr
@@ -2482,6 +2345,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
li r3, -1
blr
+ /* Woken up due to Hypervisor maintenance interrupt */
+4: li r12, BOOK3S_INTERRUPT_HMI
+ li r3, 1
+ blr
+
/*
* Determine what sort of external interrupt is pending (if any).
* Returns:
@@ -2631,6 +2499,239 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
mr r4,r31
blr
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+/*
+ * Save transactional state and TM-related registers.
+ * Called with r9 pointing to the vcpu struct.
+ * This can modify all checkpointed registers, but
+ * restores r1, r2 and r9 (vcpu pointer) before exit.
+ */
+kvmppc_save_tm:
+ mflr r0
+ std r0, PPC_LR_STKOFF(r1)
+
+ /* Turn on TM. */
+ mfmsr r8
+ li r0, 1
+ rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG
+ mtmsrd r8
+
+ ld r5, VCPU_MSR(r9)
+ rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
+ beq 1f /* TM not active in guest. */
+
+ std r1, HSTATE_HOST_R1(r13)
+ li r3, TM_CAUSE_KVM_RESCHED
+
+ /* Clear the MSR RI since r1, r13 are all going to be foobar. */
+ li r5, 0
+ mtmsrd r5, 1
+
+ /* All GPRs are volatile at this point. */
+ TRECLAIM(R3)
+
+ /* Temporarily store r13 and r9 so we have some regs to play with */
+ SET_SCRATCH0(r13)
+ GET_PACA(r13)
+ std r9, PACATMSCRATCH(r13)
+ ld r9, HSTATE_KVM_VCPU(r13)
+
+ /* Get a few more GPRs free. */
+ std r29, VCPU_GPRS_TM(29)(r9)
+ std r30, VCPU_GPRS_TM(30)(r9)
+ std r31, VCPU_GPRS_TM(31)(r9)
+
+ /* Save away PPR and DSCR soon so don't run with user values. */
+ mfspr r31, SPRN_PPR
+ HMT_MEDIUM
+ mfspr r30, SPRN_DSCR
+ ld r29, HSTATE_DSCR(r13)
+ mtspr SPRN_DSCR, r29
+
+ /* Save all but r9, r13 & r29-r31 */
+ reg = 0
+ .rept 29
+ .if (reg != 9) && (reg != 13)
+ std reg, VCPU_GPRS_TM(reg)(r9)
+ .endif
+ reg = reg + 1
+ .endr
+ /* ... now save r13 */
+ GET_SCRATCH0(r4)
+ std r4, VCPU_GPRS_TM(13)(r9)
+ /* ... and save r9 */
+ ld r4, PACATMSCRATCH(r13)
+ std r4, VCPU_GPRS_TM(9)(r9)
+
+ /* Reload stack pointer and TOC. */
+ ld r1, HSTATE_HOST_R1(r13)
+ ld r2, PACATOC(r13)
+
+ /* Set MSR RI now we have r1 and r13 back. */
+ li r5, MSR_RI
+ mtmsrd r5, 1
+
+ /* Save away checkpinted SPRs. */
+ std r31, VCPU_PPR_TM(r9)
+ std r30, VCPU_DSCR_TM(r9)
+ mflr r5
+ mfcr r6
+ mfctr r7
+ mfspr r8, SPRN_AMR
+ mfspr r10, SPRN_TAR
+ std r5, VCPU_LR_TM(r9)
+ stw r6, VCPU_CR_TM(r9)
+ std r7, VCPU_CTR_TM(r9)
+ std r8, VCPU_AMR_TM(r9)
+ std r10, VCPU_TAR_TM(r9)
+
+ /* Restore r12 as trap number. */
+ lwz r12, VCPU_TRAP(r9)
+
+ /* Save FP/VSX. */
+ addi r3, r9, VCPU_FPRS_TM
+ bl store_fp_state
+ addi r3, r9, VCPU_VRS_TM
+ bl store_vr_state
+ mfspr r6, SPRN_VRSAVE
+ stw r6, VCPU_VRSAVE_TM(r9)
+1:
+ /*
+ * We need to save these SPRs after the treclaim so that the software
+ * error code is recorded correctly in the TEXASR. Also the user may
+ * change these outside of a transaction, so they must always be
+ * context switched.
+ */
+ mfspr r5, SPRN_TFHAR
+ mfspr r6, SPRN_TFIAR
+ mfspr r7, SPRN_TEXASR
+ std r5, VCPU_TFHAR(r9)
+ std r6, VCPU_TFIAR(r9)
+ std r7, VCPU_TEXASR(r9)
+
+ ld r0, PPC_LR_STKOFF(r1)
+ mtlr r0
+ blr
+
+/*
+ * Restore transactional state and TM-related registers.
+ * Called with r4 pointing to the vcpu struct.
+ * This potentially modifies all checkpointed registers.
+ * It restores r1, r2, r4 from the PACA.
+ */
+kvmppc_restore_tm:
+ mflr r0
+ std r0, PPC_LR_STKOFF(r1)
+
+ /* Turn on TM/FP/VSX/VMX so we can restore them. */
+ mfmsr r5
+ li r6, MSR_TM >> 32
+ sldi r6, r6, 32
+ or r5, r5, r6
+ ori r5, r5, MSR_FP
+ oris r5, r5, (MSR_VEC | MSR_VSX)@h
+ mtmsrd r5
+
+ /*
+ * The user may change these outside of a transaction, so they must
+ * always be context switched.
+ */
+ ld r5, VCPU_TFHAR(r4)
+ ld r6, VCPU_TFIAR(r4)
+ ld r7, VCPU_TEXASR(r4)
+ mtspr SPRN_TFHAR, r5
+ mtspr SPRN_TFIAR, r6
+ mtspr SPRN_TEXASR, r7
+
+ ld r5, VCPU_MSR(r4)
+ rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
+ beqlr /* TM not active in guest */
+ std r1, HSTATE_HOST_R1(r13)
+
+ /* Make sure the failure summary is set, otherwise we'll program check
+ * when we trechkpt. It's possible that this might have been not set
+ * on a kvmppc_set_one_reg() call but we shouldn't let this crash the
+ * host.
+ */
+ oris r7, r7, (TEXASR_FS)@h
+ mtspr SPRN_TEXASR, r7
+
+ /*
+ * We need to load up the checkpointed state for the guest.
+ * We need to do this early as it will blow away any GPRs, VSRs and
+ * some SPRs.
+ */
+
+ mr r31, r4
+ addi r3, r31, VCPU_FPRS_TM
+ bl load_fp_state
+ addi r3, r31, VCPU_VRS_TM
+ bl load_vr_state
+ mr r4, r31
+ lwz r7, VCPU_VRSAVE_TM(r4)
+ mtspr SPRN_VRSAVE, r7
+
+ ld r5, VCPU_LR_TM(r4)
+ lwz r6, VCPU_CR_TM(r4)
+ ld r7, VCPU_CTR_TM(r4)
+ ld r8, VCPU_AMR_TM(r4)
+ ld r9, VCPU_TAR_TM(r4)
+ mtlr r5
+ mtcr r6
+ mtctr r7
+ mtspr SPRN_AMR, r8
+ mtspr SPRN_TAR, r9
+
+ /*
+ * Load up PPR and DSCR values but don't put them in the actual SPRs
+ * till the last moment to avoid running with userspace PPR and DSCR for
+ * too long.
+ */
+ ld r29, VCPU_DSCR_TM(r4)
+ ld r30, VCPU_PPR_TM(r4)
+
+ std r2, PACATMSCRATCH(r13) /* Save TOC */
+
+ /* Clear the MSR RI since r1, r13 are all going to be foobar. */
+ li r5, 0
+ mtmsrd r5, 1
+
+ /* Load GPRs r0-r28 */
+ reg = 0
+ .rept 29
+ ld reg, VCPU_GPRS_TM(reg)(r31)
+ reg = reg + 1
+ .endr
+
+ mtspr SPRN_DSCR, r29
+ mtspr SPRN_PPR, r30
+
+ /* Load final GPRs */
+ ld 29, VCPU_GPRS_TM(29)(r31)
+ ld 30, VCPU_GPRS_TM(30)(r31)
+ ld 31, VCPU_GPRS_TM(31)(r31)
+
+ /* TM checkpointed state is now setup. All GPRs are now volatile. */
+ TRECHKPT
+
+ /* Now let's get back the state we need. */
+ HMT_MEDIUM
+ GET_PACA(r13)
+ ld r29, HSTATE_DSCR(r13)
+ mtspr SPRN_DSCR, r29
+ ld r4, HSTATE_KVM_VCPU(r13)
+ ld r1, HSTATE_HOST_R1(r13)
+ ld r2, PACATMSCRATCH(r13)
+
+ /* Set the MSR RI since we have our registers back. */
+ li r5, MSR_RI
+ mtmsrd r5, 1
+
+ ld r0, PPC_LR_STKOFF(r1)
+ mtlr r0
+ blr
+#endif
+
/*
* We come here if we get any exception or interrupt while we are
* executing host real mode code while in guest MMU context.
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index c4f7d6b86b9e..e76f79a45988 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -914,7 +914,7 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
/* We get here with MSR.EE=1 */
trace_kvm_exit(exit_nr, vcpu);
- kvm_guest_exit();
+ guest_exit();
switch (exit_nr) {
case BOOK3S_INTERRUPT_INST_STORAGE:
@@ -1049,7 +1049,17 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
int emul;
program_interrupt:
- flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
+ /*
+ * shadow_srr1 only contains valid flags if we came here via
+ * a program exception. The other exceptions (emulation assist,
+ * FP unavailable, etc.) do not provide flags in SRR1, so use
+ * an illegal-instruction exception when injecting a program
+ * interrupt into the guest.
+ */
+ if (exit_nr == BOOK3S_INTERRUPT_PROGRAM)
+ flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
+ else
+ flags = SRR1_PROGILL;
emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst);
if (emul != EMULATE_DONE) {
@@ -1531,7 +1541,7 @@ static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
kvmppc_clear_debug(vcpu);
- /* No need for kvm_guest_exit. It's done in handle_exit.
+ /* No need for guest_exit. It's done in handle_exit.
We also get here with interrupts enabled. */
/* Make sure we save the guest FPU/Altivec/VSX state */
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 4afae695899a..02b4672f7347 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -776,7 +776,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
ret = __kvmppc_vcpu_run(kvm_run, vcpu);
- /* No need for kvm_guest_exit. It's done in handle_exit.
+ /* No need for guest_exit. It's done in handle_exit.
We also get here with interrupts enabled. */
/* Switch back to user space debug context */
@@ -1012,7 +1012,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
}
trace_kvm_exit(exit_nr, vcpu);
- __kvm_guest_exit();
+ guest_exit_irqoff();
local_irq_enable();
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 5cc2e7af3a7b..b379146de55b 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -302,7 +302,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
advance = 0;
printk(KERN_ERR "Couldn't emulate instruction 0x%08x "
"(op %d xop %d)\n", inst, get_op(inst), get_xop(inst));
- kvmppc_core_queue_program(vcpu, 0);
}
}
diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
index 6249cdc834d1..ed38f8114118 100644
--- a/arch/powerpc/kvm/mpic.c
+++ b/arch/powerpc/kvm/mpic.c
@@ -1823,7 +1823,8 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
return 0;
}
-int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+int kvm_set_routing_entry(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
{
int r = -EINVAL;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 02416fea7653..6ce40dd6fe51 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -119,7 +119,7 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
continue;
}
- __kvm_guest_enter();
+ guest_enter_irqoff();
return 1;
}
@@ -588,6 +588,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = 1;
break;
#endif
+ case KVM_CAP_PPC_HTM:
+ r = cpu_has_feature(CPU_FTR_TM_COMP) &&
+ is_kvmppc_hv_enabled(kvm);
+ break;
default:
r = 0;
break;
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index cf928bba4d9a..3d29d40eb0e9 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -64,7 +64,6 @@ END_FTR_SECTION(0, 1); \
OPAL_BRANCH(opal_tracepoint_entry) \
mfcr r12; \
stw r12,8(r1); \
- std r1,PACAR1(r13); \
li r11,0; \
mfmsr r12; \
ori r11,r11,MSR_EE; \
@@ -127,7 +126,6 @@ opal_tracepoint_entry:
mfcr r12
std r11,16(r1)
stw r12,8(r1)
- std r1,PACAR1(r13)
li r11,0
mfmsr r12
ori r11,r11,MSR_EE
diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c
index 67d43a0eabb4..28f03ca60100 100644
--- a/arch/s390/hypfs/hypfs_diag.c
+++ b/arch/s390/hypfs/hypfs_diag.c
@@ -19,29 +19,10 @@
#include <asm/ebcdic.h>
#include "hypfs.h"
-#define LPAR_NAME_LEN 8 /* lpar name len in diag 204 data */
-#define CPU_NAME_LEN 16 /* type name len of cpus in diag224 name table */
#define TMP_SIZE 64 /* size of temporary buffers */
#define DBFS_D204_HDR_VERSION 0
-/* diag 204 subcodes */
-enum diag204_sc {
- SUBC_STIB4 = 4,
- SUBC_RSI = 5,
- SUBC_STIB6 = 6,
- SUBC_STIB7 = 7
-};
-
-/* The two available diag 204 data formats */
-enum diag204_format {
- INFO_SIMPLE = 0,
- INFO_EXT = 0x00010000
-};
-
-/* bit is set in flags, when physical cpu info is included in diag 204 data */
-#define LPAR_PHYS_FLG 0x80
-
static char *diag224_cpu_names; /* diag 224 name table */
static enum diag204_sc diag204_store_sc; /* used subcode for store */
static enum diag204_format diag204_info_type; /* used diag 204 data format */
@@ -53,7 +34,7 @@ static int diag204_buf_pages; /* number of pages for diag204 data */
static struct dentry *dbfs_d204_file;
/*
- * DIAG 204 data structures and member access functions.
+ * DIAG 204 member access functions.
*
* Since we have two different diag 204 data formats for old and new s390
* machines, we do not access the structs directly, but use getter functions for
@@ -62,304 +43,173 @@ static struct dentry *dbfs_d204_file;
/* Time information block */
-struct info_blk_hdr {
- __u8 npar;
- __u8 flags;
- __u16 tslice;
- __u16 phys_cpus;
- __u16 this_part;
- __u64 curtod;
-} __attribute__ ((packed));
-
-struct x_info_blk_hdr {
- __u8 npar;
- __u8 flags;
- __u16 tslice;
- __u16 phys_cpus;
- __u16 this_part;
- __u64 curtod1;
- __u64 curtod2;
- char reserved[40];
-} __attribute__ ((packed));
-
static inline int info_blk_hdr__size(enum diag204_format type)
{
- if (type == INFO_SIMPLE)
- return sizeof(struct info_blk_hdr);
- else /* INFO_EXT */
- return sizeof(struct x_info_blk_hdr);
+ if (type == DIAG204_INFO_SIMPLE)
+ return sizeof(struct diag204_info_blk_hdr);
+ else /* DIAG204_INFO_EXT */
+ return sizeof(struct diag204_x_info_blk_hdr);
}
static inline __u8 info_blk_hdr__npar(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct info_blk_hdr *)hdr)->npar;
- else /* INFO_EXT */
- return ((struct x_info_blk_hdr *)hdr)->npar;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_info_blk_hdr *)hdr)->npar;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_info_blk_hdr *)hdr)->npar;
}
static inline __u8 info_blk_hdr__flags(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct info_blk_hdr *)hdr)->flags;
- else /* INFO_EXT */
- return ((struct x_info_blk_hdr *)hdr)->flags;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_info_blk_hdr *)hdr)->flags;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_info_blk_hdr *)hdr)->flags;
}
static inline __u16 info_blk_hdr__pcpus(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct info_blk_hdr *)hdr)->phys_cpus;
- else /* INFO_EXT */
- return ((struct x_info_blk_hdr *)hdr)->phys_cpus;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_info_blk_hdr *)hdr)->phys_cpus;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_info_blk_hdr *)hdr)->phys_cpus;
}
/* Partition header */
-struct part_hdr {
- __u8 pn;
- __u8 cpus;
- char reserved[6];
- char part_name[LPAR_NAME_LEN];
-} __attribute__ ((packed));
-
-struct x_part_hdr {
- __u8 pn;
- __u8 cpus;
- __u8 rcpus;
- __u8 pflag;
- __u32 mlu;
- char part_name[LPAR_NAME_LEN];
- char lpc_name[8];
- char os_name[8];
- __u64 online_cs;
- __u64 online_es;
- __u8 upid;
- char reserved1[3];
- __u32 group_mlu;
- char group_name[8];
- char reserved2[32];
-} __attribute__ ((packed));
-
static inline int part_hdr__size(enum diag204_format type)
{
- if (type == INFO_SIMPLE)
- return sizeof(struct part_hdr);
- else /* INFO_EXT */
- return sizeof(struct x_part_hdr);
+ if (type == DIAG204_INFO_SIMPLE)
+ return sizeof(struct diag204_part_hdr);
+ else /* DIAG204_INFO_EXT */
+ return sizeof(struct diag204_x_part_hdr);
}
static inline __u8 part_hdr__rcpus(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct part_hdr *)hdr)->cpus;
- else /* INFO_EXT */
- return ((struct x_part_hdr *)hdr)->rcpus;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_part_hdr *)hdr)->cpus;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_part_hdr *)hdr)->rcpus;
}
static inline void part_hdr__part_name(enum diag204_format type, void *hdr,
char *name)
{
- if (type == INFO_SIMPLE)
- memcpy(name, ((struct part_hdr *)hdr)->part_name,
- LPAR_NAME_LEN);
- else /* INFO_EXT */
- memcpy(name, ((struct x_part_hdr *)hdr)->part_name,
- LPAR_NAME_LEN);
- EBCASC(name, LPAR_NAME_LEN);
- name[LPAR_NAME_LEN] = 0;
+ if (type == DIAG204_INFO_SIMPLE)
+ memcpy(name, ((struct diag204_part_hdr *)hdr)->part_name,
+ DIAG204_LPAR_NAME_LEN);
+ else /* DIAG204_INFO_EXT */
+ memcpy(name, ((struct diag204_x_part_hdr *)hdr)->part_name,
+ DIAG204_LPAR_NAME_LEN);
+ EBCASC(name, DIAG204_LPAR_NAME_LEN);
+ name[DIAG204_LPAR_NAME_LEN] = 0;
strim(name);
}
-struct cpu_info {
- __u16 cpu_addr;
- char reserved1[2];
- __u8 ctidx;
- __u8 cflag;
- __u16 weight;
- __u64 acc_time;
- __u64 lp_time;
-} __attribute__ ((packed));
-
-struct x_cpu_info {
- __u16 cpu_addr;
- char reserved1[2];
- __u8 ctidx;
- __u8 cflag;
- __u16 weight;
- __u64 acc_time;
- __u64 lp_time;
- __u16 min_weight;
- __u16 cur_weight;
- __u16 max_weight;
- char reseved2[2];
- __u64 online_time;
- __u64 wait_time;
- __u32 pma_weight;
- __u32 polar_weight;
- char reserved3[40];
-} __attribute__ ((packed));
-
/* CPU info block */
static inline int cpu_info__size(enum diag204_format type)
{
- if (type == INFO_SIMPLE)
- return sizeof(struct cpu_info);
- else /* INFO_EXT */
- return sizeof(struct x_cpu_info);
+ if (type == DIAG204_INFO_SIMPLE)
+ return sizeof(struct diag204_cpu_info);
+ else /* DIAG204_INFO_EXT */
+ return sizeof(struct diag204_x_cpu_info);
}
static inline __u8 cpu_info__ctidx(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct cpu_info *)hdr)->ctidx;
- else /* INFO_EXT */
- return ((struct x_cpu_info *)hdr)->ctidx;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_cpu_info *)hdr)->ctidx;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_cpu_info *)hdr)->ctidx;
}
static inline __u16 cpu_info__cpu_addr(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct cpu_info *)hdr)->cpu_addr;
- else /* INFO_EXT */
- return ((struct x_cpu_info *)hdr)->cpu_addr;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_cpu_info *)hdr)->cpu_addr;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_cpu_info *)hdr)->cpu_addr;
}
static inline __u64 cpu_info__acc_time(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct cpu_info *)hdr)->acc_time;
- else /* INFO_EXT */
- return ((struct x_cpu_info *)hdr)->acc_time;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_cpu_info *)hdr)->acc_time;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_cpu_info *)hdr)->acc_time;
}
static inline __u64 cpu_info__lp_time(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct cpu_info *)hdr)->lp_time;
- else /* INFO_EXT */
- return ((struct x_cpu_info *)hdr)->lp_time;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_cpu_info *)hdr)->lp_time;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_cpu_info *)hdr)->lp_time;
}
static inline __u64 cpu_info__online_time(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
+ if (type == DIAG204_INFO_SIMPLE)
return 0; /* online_time not available in simple info */
- else /* INFO_EXT */
- return ((struct x_cpu_info *)hdr)->online_time;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_cpu_info *)hdr)->online_time;
}
/* Physical header */
-struct phys_hdr {
- char reserved1[1];
- __u8 cpus;
- char reserved2[6];
- char mgm_name[8];
-} __attribute__ ((packed));
-
-struct x_phys_hdr {
- char reserved1[1];
- __u8 cpus;
- char reserved2[6];
- char mgm_name[8];
- char reserved3[80];
-} __attribute__ ((packed));
-
static inline int phys_hdr__size(enum diag204_format type)
{
- if (type == INFO_SIMPLE)
- return sizeof(struct phys_hdr);
- else /* INFO_EXT */
- return sizeof(struct x_phys_hdr);
+ if (type == DIAG204_INFO_SIMPLE)
+ return sizeof(struct diag204_phys_hdr);
+ else /* DIAG204_INFO_EXT */
+ return sizeof(struct diag204_x_phys_hdr);
}
static inline __u8 phys_hdr__cpus(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct phys_hdr *)hdr)->cpus;
- else /* INFO_EXT */
- return ((struct x_phys_hdr *)hdr)->cpus;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_phys_hdr *)hdr)->cpus;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_phys_hdr *)hdr)->cpus;
}
/* Physical CPU info block */
-struct phys_cpu {
- __u16 cpu_addr;
- char reserved1[2];
- __u8 ctidx;
- char reserved2[3];
- __u64 mgm_time;
- char reserved3[8];
-} __attribute__ ((packed));
-
-struct x_phys_cpu {
- __u16 cpu_addr;
- char reserved1[2];
- __u8 ctidx;
- char reserved2[3];
- __u64 mgm_time;
- char reserved3[80];
-} __attribute__ ((packed));
-
static inline int phys_cpu__size(enum diag204_format type)
{
- if (type == INFO_SIMPLE)
- return sizeof(struct phys_cpu);
- else /* INFO_EXT */
- return sizeof(struct x_phys_cpu);
+ if (type == DIAG204_INFO_SIMPLE)
+ return sizeof(struct diag204_phys_cpu);
+ else /* DIAG204_INFO_EXT */
+ return sizeof(struct diag204_x_phys_cpu);
}
static inline __u16 phys_cpu__cpu_addr(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct phys_cpu *)hdr)->cpu_addr;
- else /* INFO_EXT */
- return ((struct x_phys_cpu *)hdr)->cpu_addr;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_phys_cpu *)hdr)->cpu_addr;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_phys_cpu *)hdr)->cpu_addr;
}
static inline __u64 phys_cpu__mgm_time(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct phys_cpu *)hdr)->mgm_time;
- else /* INFO_EXT */
- return ((struct x_phys_cpu *)hdr)->mgm_time;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_phys_cpu *)hdr)->mgm_time;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_phys_cpu *)hdr)->mgm_time;
}
static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct phys_cpu *)hdr)->ctidx;
- else /* INFO_EXT */
- return ((struct x_phys_cpu *)hdr)->ctidx;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_phys_cpu *)hdr)->ctidx;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_phys_cpu *)hdr)->ctidx;
}
/* Diagnose 204 functions */
-
-static inline int __diag204(unsigned long *subcode, unsigned long size, void *addr)
-{
- register unsigned long _subcode asm("0") = *subcode;
- register unsigned long _size asm("1") = size;
-
- asm volatile(
- " diag %2,%0,0x204\n"
- "0: nopr %%r7\n"
- EX_TABLE(0b,0b)
- : "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory");
- *subcode = _subcode;
- return _size;
-}
-
-static int diag204(unsigned long subcode, unsigned long size, void *addr)
-{
- diag_stat_inc(DIAG_STAT_X204);
- size = __diag204(&subcode, size, addr);
- if (subcode)
- return -1;
- return size;
-}
-
/*
* For the old diag subcode 4 with simple data format we have to use real
* memory. If we use subcode 6 or 7 with extended data format, we can (and
@@ -411,12 +261,12 @@ static void *diag204_get_buffer(enum diag204_format fmt, int *pages)
*pages = diag204_buf_pages;
return diag204_buf;
}
- if (fmt == INFO_SIMPLE) {
+ if (fmt == DIAG204_INFO_SIMPLE) {
*pages = 1;
return diag204_alloc_rbuf();
- } else {/* INFO_EXT */
- *pages = diag204((unsigned long)SUBC_RSI |
- (unsigned long)INFO_EXT, 0, NULL);
+ } else {/* DIAG204_INFO_EXT */
+ *pages = diag204((unsigned long)DIAG204_SUBC_RSI |
+ (unsigned long)DIAG204_INFO_EXT, 0, NULL);
if (*pages <= 0)
return ERR_PTR(-ENOSYS);
else
@@ -443,18 +293,18 @@ static int diag204_probe(void)
void *buf;
int pages, rc;
- buf = diag204_get_buffer(INFO_EXT, &pages);
+ buf = diag204_get_buffer(DIAG204_INFO_EXT, &pages);
if (!IS_ERR(buf)) {
- if (diag204((unsigned long)SUBC_STIB7 |
- (unsigned long)INFO_EXT, pages, buf) >= 0) {
- diag204_store_sc = SUBC_STIB7;
- diag204_info_type = INFO_EXT;
+ if (diag204((unsigned long)DIAG204_SUBC_STIB7 |
+ (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) {
+ diag204_store_sc = DIAG204_SUBC_STIB7;
+ diag204_info_type = DIAG204_INFO_EXT;
goto out;
}
- if (diag204((unsigned long)SUBC_STIB6 |
- (unsigned long)INFO_EXT, pages, buf) >= 0) {
- diag204_store_sc = SUBC_STIB6;
- diag204_info_type = INFO_EXT;
+ if (diag204((unsigned long)DIAG204_SUBC_STIB6 |
+ (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) {
+ diag204_store_sc = DIAG204_SUBC_STIB6;
+ diag204_info_type = DIAG204_INFO_EXT;
goto out;
}
diag204_free_buffer();
@@ -462,15 +312,15 @@ static int diag204_probe(void)
/* subcodes 6 and 7 failed, now try subcode 4 */
- buf = diag204_get_buffer(INFO_SIMPLE, &pages);
+ buf = diag204_get_buffer(DIAG204_INFO_SIMPLE, &pages);
if (IS_ERR(buf)) {
rc = PTR_ERR(buf);
goto fail_alloc;
}
- if (diag204((unsigned long)SUBC_STIB4 |
- (unsigned long)INFO_SIMPLE, pages, buf) >= 0) {
- diag204_store_sc = SUBC_STIB4;
- diag204_info_type = INFO_SIMPLE;
+ if (diag204((unsigned long)DIAG204_SUBC_STIB4 |
+ (unsigned long)DIAG204_INFO_SIMPLE, pages, buf) >= 0) {
+ diag204_store_sc = DIAG204_SUBC_STIB4;
+ diag204_info_type = DIAG204_INFO_SIMPLE;
goto out;
} else {
rc = -ENOSYS;
@@ -510,20 +360,6 @@ out:
/* Diagnose 224 functions */
-static int diag224(void *ptr)
-{
- int rc = -EOPNOTSUPP;
-
- diag_stat_inc(DIAG_STAT_X224);
- asm volatile(
- " diag %1,%2,0x224\n"
- "0: lhi %0,0x0\n"
- "1:\n"
- EX_TABLE(0b,1b)
- : "+d" (rc) :"d" (0), "d" (ptr) : "memory");
- return rc;
-}
-
static int diag224_get_name_table(void)
{
/* memory must be below 2GB */
@@ -545,9 +381,9 @@ static void diag224_delete_name_table(void)
static int diag224_idx2name(int index, char *name)
{
- memcpy(name, diag224_cpu_names + ((index + 1) * CPU_NAME_LEN),
- CPU_NAME_LEN);
- name[CPU_NAME_LEN] = 0;
+ memcpy(name, diag224_cpu_names + ((index + 1) * DIAG204_CPU_NAME_LEN),
+ DIAG204_CPU_NAME_LEN);
+ name[DIAG204_CPU_NAME_LEN] = 0;
strim(name);
return 0;
}
@@ -603,7 +439,7 @@ __init int hypfs_diag_init(void)
pr_err("The hardware system does not support hypfs\n");
return -ENODATA;
}
- if (diag204_info_type == INFO_EXT) {
+ if (diag204_info_type == DIAG204_INFO_EXT) {
rc = hypfs_dbfs_create_file(&dbfs_file_d204);
if (rc)
return rc;
@@ -651,7 +487,7 @@ static int hypfs_create_cpu_files(struct dentry *cpus_dir, void *cpu_info)
cpu_info__lp_time(diag204_info_type, cpu_info));
if (IS_ERR(rc))
return PTR_ERR(rc);
- if (diag204_info_type == INFO_EXT) {
+ if (diag204_info_type == DIAG204_INFO_EXT) {
rc = hypfs_create_u64(cpu_dir, "onlinetime",
cpu_info__online_time(diag204_info_type,
cpu_info));
@@ -667,12 +503,12 @@ static void *hypfs_create_lpar_files(struct dentry *systems_dir, void *part_hdr)
{
struct dentry *cpus_dir;
struct dentry *lpar_dir;
- char lpar_name[LPAR_NAME_LEN + 1];
+ char lpar_name[DIAG204_LPAR_NAME_LEN + 1];
void *cpu_info;
int i;
part_hdr__part_name(diag204_info_type, part_hdr, lpar_name);
- lpar_name[LPAR_NAME_LEN] = 0;
+ lpar_name[DIAG204_LPAR_NAME_LEN] = 0;
lpar_dir = hypfs_mkdir(systems_dir, lpar_name);
if (IS_ERR(lpar_dir))
return lpar_dir;
@@ -755,7 +591,8 @@ int hypfs_diag_create_files(struct dentry *root)
goto err_out;
}
}
- if (info_blk_hdr__flags(diag204_info_type, time_hdr) & LPAR_PHYS_FLG) {
+ if (info_blk_hdr__flags(diag204_info_type, time_hdr) &
+ DIAG204_LPAR_PHYS_FLG) {
ptr = hypfs_create_phys_files(root, part_hdr);
if (IS_ERR(ptr)) {
rc = PTR_ERR(ptr);
diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h
index 1a82cf26ee11..d28621de8e0b 100644
--- a/arch/s390/include/asm/cpacf.h
+++ b/arch/s390/include/asm/cpacf.h
@@ -20,6 +20,9 @@
#define CPACF_KMC 0xb92f /* MSA */
#define CPACF_KIMD 0xb93e /* MSA */
#define CPACF_KLMD 0xb93f /* MSA */
+#define CPACF_PCKMO 0xb928 /* MSA3 */
+#define CPACF_KMF 0xb92a /* MSA4 */
+#define CPACF_KMO 0xb92b /* MSA4 */
#define CPACF_PCC 0xb92c /* MSA4 */
#define CPACF_KMCTR 0xb92d /* MSA4 */
#define CPACF_PPNO 0xb93c /* MSA5 */
@@ -136,6 +139,7 @@ static inline void __cpacf_query(unsigned int opcode, unsigned char *status)
register unsigned long r1 asm("1") = (unsigned long) status;
asm volatile(
+ " spm 0\n" /* pckmo doesn't change the cc */
/* Parameter registers are ignored, but may not be 0 */
"0: .insn rrf,%[opc] << 16,2,2,2,0\n"
" brc 1,0b\n" /* handle partial completion */
@@ -157,6 +161,12 @@ static inline int cpacf_query(unsigned int opcode, unsigned int func)
if (!test_facility(17)) /* check for MSA */
return 0;
break;
+ case CPACF_PCKMO:
+ if (!test_facility(76)) /* check for MSA3 */
+ return 0;
+ break;
+ case CPACF_KMF:
+ case CPACF_KMO:
case CPACF_PCC:
case CPACF_KMCTR:
if (!test_facility(77)) /* check for MSA4 */
diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h
index 86cae09e076a..8acf482162ed 100644
--- a/arch/s390/include/asm/diag.h
+++ b/arch/s390/include/asm/diag.h
@@ -78,4 +78,153 @@ struct diag210 {
extern int diag210(struct diag210 *addr);
+/* bit is set in flags, when physical cpu info is included in diag 204 data */
+#define DIAG204_LPAR_PHYS_FLG 0x80
+#define DIAG204_LPAR_NAME_LEN 8 /* lpar name len in diag 204 data */
+#define DIAG204_CPU_NAME_LEN 16 /* type name len of cpus in diag224 name table */
+
+/* diag 204 subcodes */
+enum diag204_sc {
+ DIAG204_SUBC_STIB4 = 4,
+ DIAG204_SUBC_RSI = 5,
+ DIAG204_SUBC_STIB6 = 6,
+ DIAG204_SUBC_STIB7 = 7
+};
+
+/* The two available diag 204 data formats */
+enum diag204_format {
+ DIAG204_INFO_SIMPLE = 0,
+ DIAG204_INFO_EXT = 0x00010000
+};
+
+enum diag204_cpu_flags {
+ DIAG204_CPU_ONLINE = 0x20,
+ DIAG204_CPU_CAPPED = 0x40,
+};
+
+struct diag204_info_blk_hdr {
+ __u8 npar;
+ __u8 flags;
+ __u16 tslice;
+ __u16 phys_cpus;
+ __u16 this_part;
+ __u64 curtod;
+} __packed;
+
+struct diag204_x_info_blk_hdr {
+ __u8 npar;
+ __u8 flags;
+ __u16 tslice;
+ __u16 phys_cpus;
+ __u16 this_part;
+ __u64 curtod1;
+ __u64 curtod2;
+ char reserved[40];
+} __packed;
+
+struct diag204_part_hdr {
+ __u8 pn;
+ __u8 cpus;
+ char reserved[6];
+ char part_name[DIAG204_LPAR_NAME_LEN];
+} __packed;
+
+struct diag204_x_part_hdr {
+ __u8 pn;
+ __u8 cpus;
+ __u8 rcpus;
+ __u8 pflag;
+ __u32 mlu;
+ char part_name[DIAG204_LPAR_NAME_LEN];
+ char lpc_name[8];
+ char os_name[8];
+ __u64 online_cs;
+ __u64 online_es;
+ __u8 upid;
+ __u8 reserved:3;
+ __u8 mtid:5;
+ char reserved1[2];
+ __u32 group_mlu;
+ char group_name[8];
+ char hardware_group_name[8];
+ char reserved2[24];
+} __packed;
+
+struct diag204_cpu_info {
+ __u16 cpu_addr;
+ char reserved1[2];
+ __u8 ctidx;
+ __u8 cflag;
+ __u16 weight;
+ __u64 acc_time;
+ __u64 lp_time;
+} __packed;
+
+struct diag204_x_cpu_info {
+ __u16 cpu_addr;
+ char reserved1[2];
+ __u8 ctidx;
+ __u8 cflag;
+ __u16 weight;
+ __u64 acc_time;
+ __u64 lp_time;
+ __u16 min_weight;
+ __u16 cur_weight;
+ __u16 max_weight;
+ char reseved2[2];
+ __u64 online_time;
+ __u64 wait_time;
+ __u32 pma_weight;
+ __u32 polar_weight;
+ __u32 cpu_type_cap;
+ __u32 group_cpu_type_cap;
+ char reserved3[32];
+} __packed;
+
+struct diag204_phys_hdr {
+ char reserved1[1];
+ __u8 cpus;
+ char reserved2[6];
+ char mgm_name[8];
+} __packed;
+
+struct diag204_x_phys_hdr {
+ char reserved1[1];
+ __u8 cpus;
+ char reserved2[6];
+ char mgm_name[8];
+ char reserved3[80];
+} __packed;
+
+struct diag204_phys_cpu {
+ __u16 cpu_addr;
+ char reserved1[2];
+ __u8 ctidx;
+ char reserved2[3];
+ __u64 mgm_time;
+ char reserved3[8];
+} __packed;
+
+struct diag204_x_phys_cpu {
+ __u16 cpu_addr;
+ char reserved1[2];
+ __u8 ctidx;
+ char reserved2[1];
+ __u16 weight;
+ __u64 mgm_time;
+ char reserved3[80];
+} __packed;
+
+struct diag204_x_part_block {
+ struct diag204_x_part_hdr hdr;
+ struct diag204_x_cpu_info cpus[];
+} __packed;
+
+struct diag204_x_phys_block {
+ struct diag204_x_phys_hdr hdr;
+ struct diag204_x_phys_cpu cpus[];
+} __packed;
+
+int diag204(unsigned long subcode, unsigned long size, void *addr);
+int diag224(void *ptr);
#endif /* _ASM_S390_DIAG_H */
diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index d054c1b07a3c..741ddba0bf11 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -10,14 +10,25 @@
/**
* struct gmap_struct - guest address space
+ * @list: list head for the mm->context gmap list
* @crst_list: list of all crst tables used in the guest address space
* @mm: pointer to the parent mm_struct
* @guest_to_host: radix tree with guest to host address translation
* @host_to_guest: radix tree with pointer to segment table entries
* @guest_table_lock: spinlock to protect all entries in the guest page table
+ * @ref_count: reference counter for the gmap structure
* @table: pointer to the page directory
* @asce: address space control element for gmap page table
* @pfault_enabled: defines if pfaults are applicable for the guest
+ * @host_to_rmap: radix tree with gmap_rmap lists
+ * @children: list of shadow gmap structures
+ * @pt_list: list of all page tables used in the shadow guest address space
+ * @shadow_lock: spinlock to protect the shadow gmap list
+ * @parent: pointer to the parent gmap for shadow guest address spaces
+ * @orig_asce: ASCE for which the shadow page table has been created
+ * @edat_level: edat level to be used for the shadow translation
+ * @removed: flag to indicate if a shadow guest address space has been removed
+ * @initialized: flag to indicate if a shadow guest address space can be used
*/
struct gmap {
struct list_head list;
@@ -26,26 +37,64 @@ struct gmap {
struct radix_tree_root guest_to_host;
struct radix_tree_root host_to_guest;
spinlock_t guest_table_lock;
+ atomic_t ref_count;
unsigned long *table;
unsigned long asce;
unsigned long asce_end;
void *private;
bool pfault_enabled;
+ /* Additional data for shadow guest address spaces */
+ struct radix_tree_root host_to_rmap;
+ struct list_head children;
+ struct list_head pt_list;
+ spinlock_t shadow_lock;
+ struct gmap *parent;
+ unsigned long orig_asce;
+ int edat_level;
+ bool removed;
+ bool initialized;
};
/**
+ * struct gmap_rmap - reverse mapping for shadow page table entries
+ * @next: pointer to next rmap in the list
+ * @raddr: virtual rmap address in the shadow guest address space
+ */
+struct gmap_rmap {
+ struct gmap_rmap *next;
+ unsigned long raddr;
+};
+
+#define gmap_for_each_rmap(pos, head) \
+ for (pos = (head); pos; pos = pos->next)
+
+#define gmap_for_each_rmap_safe(pos, n, head) \
+ for (pos = (head); n = pos ? pos->next : NULL, pos; pos = n)
+
+/**
* struct gmap_notifier - notify function block for page invalidation
* @notifier_call: address of callback function
*/
struct gmap_notifier {
struct list_head list;
- void (*notifier_call)(struct gmap *gmap, unsigned long gaddr);
+ struct rcu_head rcu;
+ void (*notifier_call)(struct gmap *gmap, unsigned long start,
+ unsigned long end);
};
-struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit);
-void gmap_free(struct gmap *gmap);
+static inline int gmap_is_shadow(struct gmap *gmap)
+{
+ return !!gmap->parent;
+}
+
+struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit);
+void gmap_remove(struct gmap *gmap);
+struct gmap *gmap_get(struct gmap *gmap);
+void gmap_put(struct gmap *gmap);
+
void gmap_enable(struct gmap *gmap);
void gmap_disable(struct gmap *gmap);
+struct gmap *gmap_get_enabled(void);
int gmap_map_segment(struct gmap *gmap, unsigned long from,
unsigned long to, unsigned long len);
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len);
@@ -57,8 +106,29 @@ void gmap_discard(struct gmap *, unsigned long from, unsigned long to);
void __gmap_zap(struct gmap *, unsigned long gaddr);
void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr);
-void gmap_register_ipte_notifier(struct gmap_notifier *);
-void gmap_unregister_ipte_notifier(struct gmap_notifier *);
-int gmap_ipte_notify(struct gmap *, unsigned long start, unsigned long len);
+int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val);
+
+struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
+ int edat_level);
+int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level);
+int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
+ int fake);
+int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
+ int fake);
+int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
+ int fake);
+int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
+ int fake);
+int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
+ unsigned long *pgt, int *dat_protection, int *fake);
+int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte);
+
+void gmap_register_pte_notifier(struct gmap_notifier *);
+void gmap_unregister_pte_notifier(struct gmap_notifier *);
+void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *,
+ unsigned long bits);
+
+int gmap_mprotect_notify(struct gmap *, unsigned long start,
+ unsigned long len, int prot);
#endif /* _ASM_S390_GMAP_H */
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index ac82e8eb936d..8e5daf7a76ce 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -43,6 +43,7 @@
/* s390-specific vcpu->requests bit members */
#define KVM_REQ_ENABLE_IBS 8
#define KVM_REQ_DISABLE_IBS 9
+#define KVM_REQ_ICPT_OPEREXC 10
#define SIGP_CTRL_C 0x80
#define SIGP_CTRL_SCN_MASK 0x3f
@@ -145,7 +146,7 @@ struct kvm_s390_sie_block {
__u64 cputm; /* 0x0028 */
__u64 ckc; /* 0x0030 */
__u64 epoch; /* 0x0038 */
- __u8 reserved40[4]; /* 0x0040 */
+ __u32 svcc; /* 0x0040 */
#define LCTL_CR0 0x8000
#define LCTL_CR6 0x0200
#define LCTL_CR9 0x0040
@@ -154,6 +155,7 @@ struct kvm_s390_sie_block {
#define LCTL_CR14 0x0002
__u16 lctl; /* 0x0044 */
__s16 icpua; /* 0x0046 */
+#define ICTL_OPEREXC 0x80000000
#define ICTL_PINT 0x20000000
#define ICTL_LPSW 0x00400000
#define ICTL_STCTL 0x00040000
@@ -166,6 +168,9 @@ struct kvm_s390_sie_block {
#define ICPT_INST 0x04
#define ICPT_PROGI 0x08
#define ICPT_INSTPROGI 0x0C
+#define ICPT_EXTINT 0x14
+#define ICPT_VALIDITY 0x20
+#define ICPT_STOP 0x28
#define ICPT_OPEREXC 0x2C
#define ICPT_PARTEXEC 0x38
#define ICPT_IOINST 0x40
@@ -185,7 +190,9 @@ struct kvm_s390_sie_block {
__u32 scaol; /* 0x0064 */
__u8 reserved68[4]; /* 0x0068 */
__u32 todpr; /* 0x006c */
- __u8 reserved70[32]; /* 0x0070 */
+ __u8 reserved70[16]; /* 0x0070 */
+ __u64 mso; /* 0x0080 */
+ __u64 msl; /* 0x0088 */
psw_t gpsw; /* 0x0090 */
__u64 gg14; /* 0x00a0 */
__u64 gg15; /* 0x00a8 */
@@ -223,7 +230,7 @@ struct kvm_s390_sie_block {
__u8 reserved1e6[2]; /* 0x01e6 */
__u64 itdba; /* 0x01e8 */
__u64 riccbd; /* 0x01f0 */
- __u8 reserved1f8[8]; /* 0x01f8 */
+ __u64 gvrd; /* 0x01f8 */
} __attribute__((packed));
struct kvm_s390_itdb {
@@ -256,6 +263,7 @@ struct kvm_vcpu_stat {
u32 instruction_stctg;
u32 exit_program_interruption;
u32 exit_instr_and_program;
+ u32 exit_operation_exception;
u32 deliver_external_call;
u32 deliver_emergency_signal;
u32 deliver_service_signal;
@@ -278,7 +286,9 @@ struct kvm_vcpu_stat {
u32 instruction_stsi;
u32 instruction_stfl;
u32 instruction_tprot;
+ u32 instruction_sie;
u32 instruction_essa;
+ u32 instruction_sthyi;
u32 instruction_sigp_sense;
u32 instruction_sigp_sense_running;
u32 instruction_sigp_external_call;
@@ -541,12 +551,16 @@ struct kvm_guestdbg_info_arch {
struct kvm_vcpu_arch {
struct kvm_s390_sie_block *sie_block;
+ /* if vsie is active, currently executed shadow sie control block */
+ struct kvm_s390_sie_block *vsie_block;
unsigned int host_acrs[NUM_ACRS];
struct fpu host_fpregs;
struct kvm_s390_local_interrupt local_int;
struct hrtimer ckc_timer;
struct kvm_s390_pgm_info pgm;
struct gmap *gmap;
+ /* backup location for the currently enabled gmap when scheduled out */
+ struct gmap *enabled_gmap;
struct kvm_guestdbg_info_arch guestdbg;
unsigned long pfault_token;
unsigned long pfault_select;
@@ -631,6 +645,14 @@ struct sie_page2 {
u8 reserved900[0x1000 - 0x900]; /* 0x0900 */
} __packed;
+struct kvm_s390_vsie {
+ struct mutex mutex;
+ struct radix_tree_root addr_to_page;
+ int page_count;
+ int next;
+ struct page *pages[KVM_MAX_VCPUS];
+};
+
struct kvm_arch{
void *sca;
int use_esca;
@@ -646,15 +668,20 @@ struct kvm_arch{
int user_cpu_state_ctrl;
int user_sigp;
int user_stsi;
+ int user_instr0;
struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS];
wait_queue_head_t ipte_wq;
int ipte_lock_count;
struct mutex ipte_mutex;
+ struct ratelimit_state sthyi_limit;
spinlock_t start_stop_lock;
struct sie_page2 *sie_page2;
struct kvm_s390_cpu_model model;
struct kvm_s390_crypto crypto;
+ struct kvm_s390_vsie vsie;
u64 epoch;
+ /* subset of available cpu features enabled by user space */
+ DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
};
#define KVM_HVA_ERR_BAD (-1UL)
diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h
index 18226437a832..6d39329c894b 100644
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -8,8 +8,9 @@ typedef struct {
cpumask_t cpu_attach_mask;
atomic_t flush_count;
unsigned int flush_mm;
- spinlock_t list_lock;
+ spinlock_t pgtable_lock;
struct list_head pgtable_list;
+ spinlock_t gmap_lock;
struct list_head gmap_list;
unsigned long asce;
unsigned long asce_limit;
@@ -22,9 +23,11 @@ typedef struct {
unsigned int use_skey:1;
} mm_context_t;
-#define INIT_MM_CONTEXT(name) \
- .context.list_lock = __SPIN_LOCK_UNLOCKED(name.context.list_lock), \
- .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \
+#define INIT_MM_CONTEXT(name) \
+ .context.pgtable_lock = \
+ __SPIN_LOCK_UNLOCKED(name.context.pgtable_lock), \
+ .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \
+ .context.gmap_lock = __SPIN_LOCK_UNLOCKED(name.context.gmap_lock), \
.context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list),
static inline int tprot(unsigned long addr)
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index f77c638bf397..c6a088c91aee 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -15,8 +15,9 @@
static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm)
{
- spin_lock_init(&mm->context.list_lock);
+ spin_lock_init(&mm->context.pgtable_lock);
INIT_LIST_HEAD(&mm->context.pgtable_list);
+ spin_lock_init(&mm->context.gmap_lock);
INIT_LIST_HEAD(&mm->context.gmap_list);
cpumask_clear(&mm->context.cpu_attach_mask);
atomic_set(&mm->context.flush_count, 0);
diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index b2146c4119b2..69b8a41fca84 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -111,13 +111,14 @@ static inline unsigned char page_get_storage_key(unsigned long addr)
static inline int page_reset_referenced(unsigned long addr)
{
- unsigned int ipm;
+ int cc;
asm volatile(
" rrbe 0,%1\n"
" ipm %0\n"
- : "=d" (ipm) : "a" (addr) : "cc");
- return !!(ipm & 0x20000000);
+ " srl %0,28\n"
+ : "=d" (cc) : "a" (addr) : "cc");
+ return cc;
}
/* Bits int the storage key */
@@ -148,6 +149,8 @@ static inline int devmem_is_allowed(unsigned long pfn)
#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
#define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT)
#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
+#define pfn_to_virt(pfn) __va((pfn) << PAGE_SHIFT)
+#define page_to_virt(page) pfn_to_virt(page_to_pfn(page))
#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \
VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index da34cb6b1f3b..f4eb9843eed4 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -19,8 +19,10 @@ unsigned long *crst_table_alloc(struct mm_struct *);
void crst_table_free(struct mm_struct *, unsigned long *);
unsigned long *page_table_alloc(struct mm_struct *);
+struct page *page_table_alloc_pgste(struct mm_struct *mm);
void page_table_free(struct mm_struct *, unsigned long *);
void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long);
+void page_table_free_pgste(struct page *page);
extern int page_table_allocate_pgste;
static inline void clear_table(unsigned long *s, unsigned long val, size_t n)
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 48d383af078f..72c7f60bfe83 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -277,6 +277,7 @@ static inline int is_module_addr(void *addr)
/* Bits in the region table entry */
#define _REGION_ENTRY_ORIGIN ~0xfffUL/* region/segment table origin */
#define _REGION_ENTRY_PROTECT 0x200 /* region protection bit */
+#define _REGION_ENTRY_OFFSET 0xc0 /* region table offset */
#define _REGION_ENTRY_INVALID 0x20 /* invalid region table entry */
#define _REGION_ENTRY_TYPE_MASK 0x0c /* region/segment table type mask */
#define _REGION_ENTRY_TYPE_R1 0x0c /* region first table type */
@@ -364,6 +365,7 @@ static inline int is_module_addr(void *addr)
#define PGSTE_GC_BIT 0x0002000000000000UL
#define PGSTE_UC_BIT 0x0000800000000000UL /* user dirty (migration) */
#define PGSTE_IN_BIT 0x0000400000000000UL /* IPTE notify bit */
+#define PGSTE_VSIE_BIT 0x0000200000000000UL /* ref'd in a shadow table */
/* Guest Page State used for virtualization */
#define _PGSTE_GPS_ZERO 0x0000000080000000UL
@@ -1002,15 +1004,26 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t entry);
void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-void ptep_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+void ptep_notify(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned long bits);
+int ptep_force_prot(struct mm_struct *mm, unsigned long gaddr,
+ pte_t *ptep, int prot, unsigned long bit);
void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
pte_t *ptep , int reset);
void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
+ pte_t *sptep, pte_t *tptep, pte_t pte);
+void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep);
bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address);
int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
unsigned char key, bool nq);
-unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr);
+int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+ unsigned char key, unsigned char *oldkey,
+ bool nq, bool mr, bool mc);
+int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr);
+int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+ unsigned char *key);
/*
* Certain architectures need to do special things when PTEs
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 09529202ea77..03323175de30 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -112,6 +112,8 @@ struct thread_struct {
unsigned long ksp; /* kernel stack pointer */
mm_segment_t mm_segment;
unsigned long gmap_addr; /* address of last gmap fault. */
+ unsigned int gmap_write_flag; /* gmap fault write indication */
+ unsigned int gmap_int_code; /* int code of last gmap fault */
unsigned int gmap_pfault; /* signal of a pending guest pfault */
struct per_regs per_user; /* User specified PER registers */
struct per_event per_event; /* Cause of the last PER trap */
diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index e4f6f73afe2f..2ad9c204b1a2 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -32,12 +32,19 @@ struct sclp_core_entry {
u8 reserved0;
u8 : 4;
u8 sief2 : 1;
- u8 : 3;
- u8 : 3;
+ u8 skey : 1;
+ u8 : 2;
+ u8 : 2;
+ u8 gpere : 1;
u8 siif : 1;
u8 sigpif : 1;
u8 : 3;
- u8 reserved2[10];
+ u8 reserved2[3];
+ u8 : 2;
+ u8 ib : 1;
+ u8 cei : 1;
+ u8 : 4;
+ u8 reserved3[6];
u8 type;
u8 reserved1;
} __attribute__((packed));
@@ -59,6 +66,15 @@ struct sclp_info {
unsigned char has_hvs : 1;
unsigned char has_esca : 1;
unsigned char has_sief2 : 1;
+ unsigned char has_64bscao : 1;
+ unsigned char has_gpere : 1;
+ unsigned char has_cmma : 1;
+ unsigned char has_gsls : 1;
+ unsigned char has_ib : 1;
+ unsigned char has_cei : 1;
+ unsigned char has_pfmfi : 1;
+ unsigned char has_ibs : 1;
+ unsigned char has_skey : 1;
unsigned int ibc;
unsigned int mtid;
unsigned int mtid_cp;
@@ -101,5 +117,6 @@ int memcpy_hsa_kernel(void *dest, unsigned long src, size_t count);
int memcpy_hsa_user(void __user *dest, unsigned long src, size_t count);
void sclp_early_detect(void);
void _sclp_print_early(const char *);
+void sclp_ocf_cpc_name_copy(char *dst);
#endif /* _ASM_S390_SCLP_H */
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index 3b8e99ef9d58..a2ffec4139ad 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -93,6 +93,47 @@ struct kvm_s390_vm_cpu_machine {
__u64 fac_list[256];
};
+#define KVM_S390_VM_CPU_PROCESSOR_FEAT 2
+#define KVM_S390_VM_CPU_MACHINE_FEAT 3
+
+#define KVM_S390_VM_CPU_FEAT_NR_BITS 1024
+#define KVM_S390_VM_CPU_FEAT_ESOP 0
+#define KVM_S390_VM_CPU_FEAT_SIEF2 1
+#define KVM_S390_VM_CPU_FEAT_64BSCAO 2
+#define KVM_S390_VM_CPU_FEAT_SIIF 3
+#define KVM_S390_VM_CPU_FEAT_GPERE 4
+#define KVM_S390_VM_CPU_FEAT_GSLS 5
+#define KVM_S390_VM_CPU_FEAT_IB 6
+#define KVM_S390_VM_CPU_FEAT_CEI 7
+#define KVM_S390_VM_CPU_FEAT_IBS 8
+#define KVM_S390_VM_CPU_FEAT_SKEY 9
+#define KVM_S390_VM_CPU_FEAT_CMMA 10
+#define KVM_S390_VM_CPU_FEAT_PFMFI 11
+#define KVM_S390_VM_CPU_FEAT_SIGPIF 12
+struct kvm_s390_vm_cpu_feat {
+ __u64 feat[16];
+};
+
+#define KVM_S390_VM_CPU_PROCESSOR_SUBFUNC 4
+#define KVM_S390_VM_CPU_MACHINE_SUBFUNC 5
+/* for "test bit" instructions MSB 0 bit ordering, for "query" raw blocks */
+struct kvm_s390_vm_cpu_subfunc {
+ __u8 plo[32]; /* always */
+ __u8 ptff[16]; /* with TOD-clock steering */
+ __u8 kmac[16]; /* with MSA */
+ __u8 kmc[16]; /* with MSA */
+ __u8 km[16]; /* with MSA */
+ __u8 kimd[16]; /* with MSA */
+ __u8 klmd[16]; /* with MSA */
+ __u8 pckmo[16]; /* with MSA3 */
+ __u8 kmctr[16]; /* with MSA4 */
+ __u8 kmf[16]; /* with MSA4 */
+ __u8 kmo[16]; /* with MSA4 */
+ __u8 pcc[16]; /* with MSA4 */
+ __u8 ppno[16]; /* with MSA5 */
+ __u8 reserved[1824];
+};
+
/* kvm attributes for crypto */
#define KVM_S390_VM_CRYPTO_ENABLE_AES_KW 0
#define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW 1
diff --git a/arch/s390/include/uapi/asm/sie.h b/arch/s390/include/uapi/asm/sie.h
index 8fb5d4a6dd25..3ac634368939 100644
--- a/arch/s390/include/uapi/asm/sie.h
+++ b/arch/s390/include/uapi/asm/sie.h
@@ -140,6 +140,7 @@
exit_code_ipa0(0xB2, 0x4c, "TAR"), \
exit_code_ipa0(0xB2, 0x50, "CSP"), \
exit_code_ipa0(0xB2, 0x54, "MVPG"), \
+ exit_code_ipa0(0xB2, 0x56, "STHYI"), \
exit_code_ipa0(0xB2, 0x58, "BSG"), \
exit_code_ipa0(0xB2, 0x5a, "BSA"), \
exit_code_ipa0(0xB2, 0x5f, "CHSC"), \
diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c
index 48b37b8357e6..a97354c8c667 100644
--- a/arch/s390/kernel/diag.c
+++ b/arch/s390/kernel/diag.c
@@ -162,6 +162,30 @@ int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode)
}
EXPORT_SYMBOL(diag14);
+static inline int __diag204(unsigned long *subcode, unsigned long size, void *addr)
+{
+ register unsigned long _subcode asm("0") = *subcode;
+ register unsigned long _size asm("1") = size;
+
+ asm volatile(
+ " diag %2,%0,0x204\n"
+ "0: nopr %%r7\n"
+ EX_TABLE(0b,0b)
+ : "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory");
+ *subcode = _subcode;
+ return _size;
+}
+
+int diag204(unsigned long subcode, unsigned long size, void *addr)
+{
+ diag_stat_inc(DIAG_STAT_X204);
+ size = __diag204(&subcode, size, addr);
+ if (subcode)
+ return -1;
+ return size;
+}
+EXPORT_SYMBOL(diag204);
+
/*
* Diagnose 210: Get information about a virtual device
*/
@@ -196,3 +220,18 @@ int diag210(struct diag210 *addr)
return ccode;
}
EXPORT_SYMBOL(diag210);
+
+int diag224(void *ptr)
+{
+ int rc = -EOPNOTSUPP;
+
+ diag_stat_inc(DIAG_STAT_X224);
+ asm volatile(
+ " diag %1,%2,0x224\n"
+ "0: lhi %0,0x0\n"
+ "1:\n"
+ EX_TABLE(0b,1b)
+ : "+d" (rc) :"d" (0), "d" (ptr) : "memory");
+ return rc;
+}
+EXPORT_SYMBOL(diag224);
diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
index d42fa38c2429..09a9e6dfc09f 100644
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -12,6 +12,6 @@ common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o $(KVM)/irqch
ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o
-kvm-objs += diag.o gaccess.o guestdbg.o
+kvm-objs += diag.o gaccess.o guestdbg.o sthyi.o vsie.o
obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 1ea4095b67d7..ce865bd4f81d 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -212,6 +212,11 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu)
(vcpu->run->s.regs.gprs[1] != KVM_S390_VIRTIO_CCW_NOTIFY))
return -EOPNOTSUPP;
+ VCPU_EVENT(vcpu, 4, "diag 0x500 schid 0x%8.8x queue 0x%x cookie 0x%llx",
+ (u32) vcpu->run->s.regs.gprs[2],
+ (u32) vcpu->run->s.regs.gprs[3],
+ vcpu->run->s.regs.gprs[4]);
+
/*
* The layout is as follows:
* - gpr 2 contains the subchannel id (passed as addr)
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 66938d283b77..54200208bf24 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -8,6 +8,7 @@
#include <linux/vmalloc.h>
#include <linux/err.h>
#include <asm/pgtable.h>
+#include <asm/gmap.h>
#include "kvm-s390.h"
#include "gaccess.h"
#include <asm/switch_to.h>
@@ -476,18 +477,73 @@ enum {
FSI_FETCH = 2 /* Exception was due to fetch operation */
};
-static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
- ar_t ar, enum gacc_mode mode)
+enum prot_type {
+ PROT_TYPE_LA = 0,
+ PROT_TYPE_KEYC = 1,
+ PROT_TYPE_ALC = 2,
+ PROT_TYPE_DAT = 3,
+};
+
+static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
+ ar_t ar, enum gacc_mode mode, enum prot_type prot)
{
- int rc;
- struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw);
struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
- struct trans_exc_code_bits *tec_bits;
+ struct trans_exc_code_bits *tec;
memset(pgm, 0, sizeof(*pgm));
- tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
- tec_bits->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
- tec_bits->as = psw.as;
+ pgm->code = code;
+ tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
+
+ switch (code) {
+ case PGM_ASCE_TYPE:
+ case PGM_PAGE_TRANSLATION:
+ case PGM_REGION_FIRST_TRANS:
+ case PGM_REGION_SECOND_TRANS:
+ case PGM_REGION_THIRD_TRANS:
+ case PGM_SEGMENT_TRANSLATION:
+ /*
+ * op_access_id only applies to MOVE_PAGE -> set bit 61
+ * exc_access_id has to be set to 0 for some instructions. Both
+ * cases have to be handled by the caller. We can always store
+ * exc_access_id, as it is undefined for non-ar cases.
+ */
+ tec->addr = gva >> PAGE_SHIFT;
+ tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
+ tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as;
+ /* FALL THROUGH */
+ case PGM_ALEN_TRANSLATION:
+ case PGM_ALE_SEQUENCE:
+ case PGM_ASTE_VALIDITY:
+ case PGM_ASTE_SEQUENCE:
+ case PGM_EXTENDED_AUTHORITY:
+ pgm->exc_access_id = ar;
+ break;
+ case PGM_PROTECTION:
+ switch (prot) {
+ case PROT_TYPE_ALC:
+ tec->b60 = 1;
+ /* FALL THROUGH */
+ case PROT_TYPE_DAT:
+ tec->b61 = 1;
+ tec->addr = gva >> PAGE_SHIFT;
+ tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
+ tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as;
+ /* exc_access_id is undefined for most cases */
+ pgm->exc_access_id = ar;
+ break;
+ default: /* LA and KEYC set b61 to 0, other params undefined */
+ break;
+ }
+ break;
+ }
+ return code;
+}
+
+static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
+ unsigned long ga, ar_t ar, enum gacc_mode mode)
+{
+ int rc;
+ struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw);
if (!psw.t) {
asce->val = 0;
@@ -510,21 +566,8 @@ static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
return 0;
case PSW_AS_ACCREG:
rc = ar_translation(vcpu, asce, ar, mode);
- switch (rc) {
- case PGM_ALEN_TRANSLATION:
- case PGM_ALE_SEQUENCE:
- case PGM_ASTE_VALIDITY:
- case PGM_ASTE_SEQUENCE:
- case PGM_EXTENDED_AUTHORITY:
- vcpu->arch.pgm.exc_access_id = ar;
- break;
- case PGM_PROTECTION:
- tec_bits->b60 = 1;
- tec_bits->b61 = 1;
- break;
- }
if (rc > 0)
- pgm->code = rc;
+ return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_ALC);
return rc;
}
return 0;
@@ -729,40 +772,31 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu,
return 1;
}
-static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga,
+static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar,
unsigned long *pages, unsigned long nr_pages,
const union asce asce, enum gacc_mode mode)
{
- struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
psw_t *psw = &vcpu->arch.sie_block->gpsw;
- struct trans_exc_code_bits *tec_bits;
- int lap_enabled, rc;
+ int lap_enabled, rc = 0;
- tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
lap_enabled = low_address_protection_enabled(vcpu, asce);
while (nr_pages) {
ga = kvm_s390_logical_to_effective(vcpu, ga);
- tec_bits->addr = ga >> PAGE_SHIFT;
- if (mode == GACC_STORE && lap_enabled && is_low_address(ga)) {
- pgm->code = PGM_PROTECTION;
- return pgm->code;
- }
+ if (mode == GACC_STORE && lap_enabled && is_low_address(ga))
+ return trans_exc(vcpu, PGM_PROTECTION, ga, ar, mode,
+ PROT_TYPE_LA);
ga &= PAGE_MASK;
if (psw_bits(*psw).t) {
rc = guest_translate(vcpu, ga, pages, asce, mode);
if (rc < 0)
return rc;
- if (rc == PGM_PROTECTION)
- tec_bits->b61 = 1;
- if (rc)
- pgm->code = rc;
} else {
*pages = kvm_s390_real_to_abs(vcpu, ga);
if (kvm_is_error_gpa(vcpu->kvm, *pages))
- pgm->code = PGM_ADDRESSING;
+ rc = PGM_ADDRESSING;
}
- if (pgm->code)
- return pgm->code;
+ if (rc)
+ return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_DAT);
ga += PAGE_SIZE;
pages++;
nr_pages--;
@@ -783,7 +817,8 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
if (!len)
return 0;
- rc = get_vcpu_asce(vcpu, &asce, ar, mode);
+ ga = kvm_s390_logical_to_effective(vcpu, ga);
+ rc = get_vcpu_asce(vcpu, &asce, ga, ar, mode);
if (rc)
return rc;
nr_pages = (((ga & ~PAGE_MASK) + len - 1) >> PAGE_SHIFT) + 1;
@@ -795,7 +830,7 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
need_ipte_lock = psw_bits(*psw).t && !asce.r;
if (need_ipte_lock)
ipte_lock(vcpu);
- rc = guest_page_range(vcpu, ga, pages, nr_pages, asce, mode);
+ rc = guest_page_range(vcpu, ga, ar, pages, nr_pages, asce, mode);
for (idx = 0; idx < nr_pages && !rc; idx++) {
gpa = *(pages + idx) + (ga & ~PAGE_MASK);
_len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len);
@@ -846,37 +881,28 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
unsigned long *gpa, enum gacc_mode mode)
{
- struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
psw_t *psw = &vcpu->arch.sie_block->gpsw;
- struct trans_exc_code_bits *tec;
union asce asce;
int rc;
gva = kvm_s390_logical_to_effective(vcpu, gva);
- tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
- rc = get_vcpu_asce(vcpu, &asce, ar, mode);
- tec->addr = gva >> PAGE_SHIFT;
+ rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode);
if (rc)
return rc;
if (is_low_address(gva) && low_address_protection_enabled(vcpu, asce)) {
- if (mode == GACC_STORE) {
- rc = pgm->code = PGM_PROTECTION;
- return rc;
- }
+ if (mode == GACC_STORE)
+ return trans_exc(vcpu, PGM_PROTECTION, gva, 0,
+ mode, PROT_TYPE_LA);
}
if (psw_bits(*psw).t && !asce.r) { /* Use DAT? */
rc = guest_translate(vcpu, gva, gpa, asce, mode);
- if (rc > 0) {
- if (rc == PGM_PROTECTION)
- tec->b61 = 1;
- pgm->code = rc;
- }
+ if (rc > 0)
+ return trans_exc(vcpu, rc, gva, 0, mode, PROT_TYPE_DAT);
} else {
- rc = 0;
*gpa = kvm_s390_real_to_abs(vcpu, gva);
if (kvm_is_error_gpa(vcpu->kvm, *gpa))
- rc = pgm->code = PGM_ADDRESSING;
+ return trans_exc(vcpu, rc, gva, PGM_ADDRESSING, mode, 0);
}
return rc;
@@ -915,20 +941,247 @@ int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
*/
int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra)
{
- struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
- psw_t *psw = &vcpu->arch.sie_block->gpsw;
- struct trans_exc_code_bits *tec_bits;
union ctlreg0 ctlreg0 = {.val = vcpu->arch.sie_block->gcr[0]};
if (!ctlreg0.lap || !is_low_address(gra))
return 0;
+ return trans_exc(vcpu, PGM_PROTECTION, gra, 0, GACC_STORE, PROT_TYPE_LA);
+}
- memset(pgm, 0, sizeof(*pgm));
- tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
- tec_bits->fsi = FSI_STORE;
- tec_bits->as = psw_bits(*psw).as;
- tec_bits->addr = gra >> PAGE_SHIFT;
- pgm->code = PGM_PROTECTION;
+/**
+ * kvm_s390_shadow_tables - walk the guest page table and create shadow tables
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @pgt: pointer to the page table address result
+ * @fake: pgt references contiguous guest memory block, not a pgtable
+ */
+static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
+ unsigned long *pgt, int *dat_protection,
+ int *fake)
+{
+ struct gmap *parent;
+ union asce asce;
+ union vaddress vaddr;
+ unsigned long ptr;
+ int rc;
+
+ *fake = 0;
+ *dat_protection = 0;
+ parent = sg->parent;
+ vaddr.addr = saddr;
+ asce.val = sg->orig_asce;
+ ptr = asce.origin * 4096;
+ if (asce.r) {
+ *fake = 1;
+ asce.dt = ASCE_TYPE_REGION1;
+ }
+ switch (asce.dt) {
+ case ASCE_TYPE_REGION1:
+ if (vaddr.rfx01 > asce.tl && !asce.r)
+ return PGM_REGION_FIRST_TRANS;
+ break;
+ case ASCE_TYPE_REGION2:
+ if (vaddr.rfx)
+ return PGM_ASCE_TYPE;
+ if (vaddr.rsx01 > asce.tl)
+ return PGM_REGION_SECOND_TRANS;
+ break;
+ case ASCE_TYPE_REGION3:
+ if (vaddr.rfx || vaddr.rsx)
+ return PGM_ASCE_TYPE;
+ if (vaddr.rtx01 > asce.tl)
+ return PGM_REGION_THIRD_TRANS;
+ break;
+ case ASCE_TYPE_SEGMENT:
+ if (vaddr.rfx || vaddr.rsx || vaddr.rtx)
+ return PGM_ASCE_TYPE;
+ if (vaddr.sx01 > asce.tl)
+ return PGM_SEGMENT_TRANSLATION;
+ break;
+ }
+
+ switch (asce.dt) {
+ case ASCE_TYPE_REGION1: {
+ union region1_table_entry rfte;
- return pgm->code;
+ if (*fake) {
+ /* offset in 16EB guest memory block */
+ ptr = ptr + ((unsigned long) vaddr.rsx << 53UL);
+ rfte.val = ptr;
+ goto shadow_r2t;
+ }
+ rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val);
+ if (rc)
+ return rc;
+ if (rfte.i)
+ return PGM_REGION_FIRST_TRANS;
+ if (rfte.tt != TABLE_TYPE_REGION1)
+ return PGM_TRANSLATION_SPEC;
+ if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl)
+ return PGM_REGION_SECOND_TRANS;
+ if (sg->edat_level >= 1)
+ *dat_protection |= rfte.p;
+ ptr = rfte.rto << 12UL;
+shadow_r2t:
+ rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake);
+ if (rc)
+ return rc;
+ /* fallthrough */
+ }
+ case ASCE_TYPE_REGION2: {
+ union region2_table_entry rste;
+
+ if (*fake) {
+ /* offset in 8PB guest memory block */
+ ptr = ptr + ((unsigned long) vaddr.rtx << 42UL);
+ rste.val = ptr;
+ goto shadow_r3t;
+ }
+ rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val);
+ if (rc)
+ return rc;
+ if (rste.i)
+ return PGM_REGION_SECOND_TRANS;
+ if (rste.tt != TABLE_TYPE_REGION2)
+ return PGM_TRANSLATION_SPEC;
+ if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl)
+ return PGM_REGION_THIRD_TRANS;
+ if (sg->edat_level >= 1)
+ *dat_protection |= rste.p;
+ ptr = rste.rto << 12UL;
+shadow_r3t:
+ rste.p |= *dat_protection;
+ rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake);
+ if (rc)
+ return rc;
+ /* fallthrough */
+ }
+ case ASCE_TYPE_REGION3: {
+ union region3_table_entry rtte;
+
+ if (*fake) {
+ /* offset in 4TB guest memory block */
+ ptr = ptr + ((unsigned long) vaddr.sx << 31UL);
+ rtte.val = ptr;
+ goto shadow_sgt;
+ }
+ rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val);
+ if (rc)
+ return rc;
+ if (rtte.i)
+ return PGM_REGION_THIRD_TRANS;
+ if (rtte.tt != TABLE_TYPE_REGION3)
+ return PGM_TRANSLATION_SPEC;
+ if (rtte.cr && asce.p && sg->edat_level >= 2)
+ return PGM_TRANSLATION_SPEC;
+ if (rtte.fc && sg->edat_level >= 2) {
+ *dat_protection |= rtte.fc0.p;
+ *fake = 1;
+ ptr = rtte.fc1.rfaa << 31UL;
+ rtte.val = ptr;
+ goto shadow_sgt;
+ }
+ if (vaddr.sx01 < rtte.fc0.tf || vaddr.sx01 > rtte.fc0.tl)
+ return PGM_SEGMENT_TRANSLATION;
+ if (sg->edat_level >= 1)
+ *dat_protection |= rtte.fc0.p;
+ ptr = rtte.fc0.sto << 12UL;
+shadow_sgt:
+ rtte.fc0.p |= *dat_protection;
+ rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake);
+ if (rc)
+ return rc;
+ /* fallthrough */
+ }
+ case ASCE_TYPE_SEGMENT: {
+ union segment_table_entry ste;
+
+ if (*fake) {
+ /* offset in 2G guest memory block */
+ ptr = ptr + ((unsigned long) vaddr.sx << 20UL);
+ ste.val = ptr;
+ goto shadow_pgt;
+ }
+ rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val);
+ if (rc)
+ return rc;
+ if (ste.i)
+ return PGM_SEGMENT_TRANSLATION;
+ if (ste.tt != TABLE_TYPE_SEGMENT)
+ return PGM_TRANSLATION_SPEC;
+ if (ste.cs && asce.p)
+ return PGM_TRANSLATION_SPEC;
+ *dat_protection |= ste.fc0.p;
+ if (ste.fc && sg->edat_level >= 1) {
+ *fake = 1;
+ ptr = ste.fc1.sfaa << 20UL;
+ ste.val = ptr;
+ goto shadow_pgt;
+ }
+ ptr = ste.fc0.pto << 11UL;
+shadow_pgt:
+ ste.fc0.p |= *dat_protection;
+ rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake);
+ if (rc)
+ return rc;
+ }
+ }
+ /* Return the parent address of the page table */
+ *pgt = ptr;
+ return 0;
+}
+
+/**
+ * kvm_s390_shadow_fault - handle fault on a shadow page table
+ * @vcpu: virtual cpu
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ *
+ * Returns: - 0 if the shadow fault was successfully resolved
+ * - > 0 (pgm exception code) on exceptions while faulting
+ * - -EAGAIN if the caller can retry immediately
+ * - -EFAULT when accessing invalid guest addresses
+ * - -ENOMEM if out of memory
+ */
+int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
+ unsigned long saddr)
+{
+ union vaddress vaddr;
+ union page_table_entry pte;
+ unsigned long pgt;
+ int dat_protection, fake;
+ int rc;
+
+ down_read(&sg->mm->mmap_sem);
+ /*
+ * We don't want any guest-2 tables to change - so the parent
+ * tables/pointers we read stay valid - unshadowing is however
+ * always possible - only guest_table_lock protects us.
+ */
+ ipte_lock(vcpu);
+
+ rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake);
+ if (rc)
+ rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection,
+ &fake);
+
+ vaddr.addr = saddr;
+ if (fake) {
+ /* offset in 1MB guest memory block */
+ pte.val = pgt + ((unsigned long) vaddr.px << 12UL);
+ goto shadow_page;
+ }
+ if (!rc)
+ rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val);
+ if (!rc && pte.i)
+ rc = PGM_PAGE_TRANSLATION;
+ if (!rc && (pte.z || (pte.co && sg->edat_level < 1)))
+ rc = PGM_TRANSLATION_SPEC;
+shadow_page:
+ pte.p |= dat_protection;
+ if (!rc)
+ rc = gmap_shadow_page(sg, saddr, __pte(pte.val));
+ ipte_unlock(vcpu);
+ up_read(&sg->mm->mmap_sem);
+ return rc;
}
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index df0a79dd8159..8756569ad938 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -361,4 +361,7 @@ void ipte_unlock(struct kvm_vcpu *vcpu);
int ipte_lock_held(struct kvm_vcpu *vcpu);
int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
+int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow,
+ unsigned long saddr);
+
#endif /* __KVM_S390_GACCESS_H */
diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c
index e8c6843b9600..31a05330d11c 100644
--- a/arch/s390/kvm/guestdbg.c
+++ b/arch/s390/kvm/guestdbg.c
@@ -439,6 +439,23 @@ exit_required:
#define guest_per_enabled(vcpu) \
(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER)
+int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu)
+{
+ const u8 ilen = kvm_s390_get_ilen(vcpu);
+ struct kvm_s390_pgm_info pgm_info = {
+ .code = PGM_PER,
+ .per_code = PER_EVENT_IFETCH >> 24,
+ .per_address = __rewind_psw(vcpu->arch.sie_block->gpsw, ilen),
+ };
+
+ /*
+ * The PSW points to the next instruction, therefore the intercepted
+ * instruction generated a PER i-fetch event. PER address therefore
+ * points at the previous PSW address (could be an EXECUTE function).
+ */
+ return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
+}
+
static void filter_guest_per_event(struct kvm_vcpu *vcpu)
{
u32 perc = vcpu->arch.sie_block->perc << 24;
@@ -465,7 +482,7 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu)
guest_perc &= ~PER_EVENT_IFETCH;
/* All other PER events will be given to the guest */
- /* TODO: Check alterated address/address space */
+ /* TODO: Check altered address/address space */
vcpu->arch.sie_block->perc = guest_perc >> 24;
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 252157181302..dfd0ca2638fa 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -351,8 +351,26 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu)
return -EOPNOTSUPP;
}
+static int handle_operexc(struct kvm_vcpu *vcpu)
+{
+ vcpu->stat.exit_operation_exception++;
+ trace_kvm_s390_handle_operexc(vcpu, vcpu->arch.sie_block->ipa,
+ vcpu->arch.sie_block->ipb);
+
+ if (vcpu->arch.sie_block->ipa == 0xb256 &&
+ test_kvm_facility(vcpu->kvm, 74))
+ return handle_sthyi(vcpu);
+
+ if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0)
+ return -EOPNOTSUPP;
+
+ return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
+}
+
int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
{
+ int rc, per_rc = 0;
+
if (kvm_is_ucontrol(vcpu->kvm))
return -EOPNOTSUPP;
@@ -361,7 +379,8 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
case 0x18:
return handle_noop(vcpu);
case 0x04:
- return handle_instruction(vcpu);
+ rc = handle_instruction(vcpu);
+ break;
case 0x08:
return handle_prog(vcpu);
case 0x14:
@@ -372,9 +391,19 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
return handle_validity(vcpu);
case 0x28:
return handle_stop(vcpu);
+ case 0x2c:
+ rc = handle_operexc(vcpu);
+ break;
case 0x38:
- return handle_partial_execution(vcpu);
+ rc = handle_partial_execution(vcpu);
+ break;
default:
return -EOPNOTSUPP;
}
+
+ /* process PER, also if the instrution is processed in user space */
+ if (vcpu->arch.sie_block->icptstatus & 0x02 &&
+ (!rc || rc == -EOPNOTSUPP))
+ per_rc = kvm_s390_handle_per_ifetch_icpt(vcpu);
+ return per_rc ? per_rc : rc;
}
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 5a80af740d3e..24524c0f3ef8 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -28,9 +28,6 @@
#include "gaccess.h"
#include "trace-s390.h"
-#define IOINT_SCHID_MASK 0x0000ffff
-#define IOINT_SSID_MASK 0x00030000
-#define IOINT_CSSID_MASK 0x03fc0000
#define PFAULT_INIT 0x0600
#define PFAULT_DONE 0x0680
#define VIRTIO_PARAM 0x0d00
@@ -821,7 +818,14 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu,
struct kvm_s390_interrupt_info,
list);
if (inti) {
- VCPU_EVENT(vcpu, 4, "deliver: I/O 0x%llx", inti->type);
+ if (inti->type & KVM_S390_INT_IO_AI_MASK)
+ VCPU_EVENT(vcpu, 4, "%s", "deliver: I/O (AI)");
+ else
+ VCPU_EVENT(vcpu, 4, "deliver: I/O %x ss %x schid %04x",
+ inti->io.subchannel_id >> 8,
+ inti->io.subchannel_id >> 1 & 0x3,
+ inti->io.subchannel_nr);
+
vcpu->stat.deliver_io_int++;
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
inti->type,
@@ -991,6 +995,11 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
swake_up(&vcpu->wq);
vcpu->stat.halt_wakeup++;
}
+ /*
+ * The VCPU might not be sleeping but is executing the VSIE. Let's
+ * kick it, so it leaves the SIE to process the request.
+ */
+ kvm_s390_vsie_kick(vcpu);
}
enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer)
@@ -1415,6 +1424,13 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
}
fi->counters[FIRQ_CNTR_IO] += 1;
+ if (inti->type & KVM_S390_INT_IO_AI_MASK)
+ VM_EVENT(kvm, 4, "%s", "inject: I/O (AI)");
+ else
+ VM_EVENT(kvm, 4, "inject: I/O %x ss %x schid %04x",
+ inti->io.subchannel_id >> 8,
+ inti->io.subchannel_id >> 1 & 0x3,
+ inti->io.subchannel_nr);
isc = int_word_to_isc(inti->io.io_int_word);
list = &fi->lists[FIRQ_LIST_IO_ISC_0 + isc];
list_add_tail(&inti->list, list);
@@ -1531,13 +1547,6 @@ int kvm_s390_inject_vm(struct kvm *kvm,
inti->mchk.mcic = s390int->parm64;
break;
case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
- if (inti->type & KVM_S390_INT_IO_AI_MASK)
- VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)");
- else
- VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x",
- s390int->type & IOINT_CSSID_MASK,
- s390int->type & IOINT_SSID_MASK,
- s390int->type & IOINT_SCHID_MASK);
inti->io.subchannel_id = s390int->parm >> 16;
inti->io.subchannel_nr = s390int->parm & 0x0000ffffu;
inti->io.io_int_parm = s390int->parm64 >> 32;
@@ -2237,7 +2246,8 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e,
return ret;
}
-int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+int kvm_set_routing_entry(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
{
int ret;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 6f5c344cd785..3f3ae4865d57 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -21,11 +21,13 @@
#include <linux/init.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
+#include <linux/mman.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/vmalloc.h>
+#include <linux/bitmap.h>
#include <asm/asm-offsets.h>
#include <asm/lowcore.h>
#include <asm/stp.h>
@@ -35,6 +37,8 @@
#include <asm/switch_to.h>
#include <asm/isc.h>
#include <asm/sclp.h>
+#include <asm/cpacf.h>
+#include <asm/timex.h>
#include "kvm-s390.h"
#include "gaccess.h"
@@ -64,6 +68,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "exit_pei", VCPU_STAT(exit_pei) },
{ "exit_program_interruption", VCPU_STAT(exit_program_interruption) },
{ "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) },
+ { "exit_operation_exception", VCPU_STAT(exit_operation_exception) },
{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
@@ -94,6 +99,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "instruction_stsi", VCPU_STAT(instruction_stsi) },
{ "instruction_stfl", VCPU_STAT(instruction_stfl) },
{ "instruction_tprot", VCPU_STAT(instruction_tprot) },
+ { "instruction_sthyi", VCPU_STAT(instruction_sthyi) },
+ { "instruction_sie", VCPU_STAT(instruction_sie) },
{ "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) },
{ "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) },
{ "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) },
@@ -119,6 +126,11 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ NULL }
};
+/* allow nested virtualization in KVM (if enabled by user space) */
+static int nested;
+module_param(nested, int, S_IRUGO);
+MODULE_PARM_DESC(nested, "Nested virtualization support");
+
/* upper facilities limit for kvm */
unsigned long kvm_s390_fac_list_mask[16] = {
0xffe6000000000000UL,
@@ -131,7 +143,13 @@ unsigned long kvm_s390_fac_list_mask_size(void)
return ARRAY_SIZE(kvm_s390_fac_list_mask);
}
+/* available cpu features supported by kvm */
+static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
+/* available subfunctions indicated via query / "test bit" */
+static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc;
+
static struct gmap_notifier gmap_notifier;
+static struct gmap_notifier vsie_gmap_notifier;
debug_info_t *kvm_s390_dbf;
/* Section: not file related */
@@ -141,7 +159,8 @@ int kvm_arch_hardware_enable(void)
return 0;
}
-static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address);
+static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
+ unsigned long end);
/*
* This callback is executed during stop_machine(). All CPUs are therefore
@@ -163,6 +182,8 @@ static int kvm_clock_sync(struct notifier_block *notifier, unsigned long val,
vcpu->arch.sie_block->epoch -= *delta;
if (vcpu->arch.cputm_enabled)
vcpu->arch.cputm_start += *delta;
+ if (vcpu->arch.vsie_block)
+ vcpu->arch.vsie_block->epoch -= *delta;
}
}
return NOTIFY_OK;
@@ -175,7 +196,9 @@ static struct notifier_block kvm_clock_notifier = {
int kvm_arch_hardware_setup(void)
{
gmap_notifier.notifier_call = kvm_gmap_notifier;
- gmap_register_ipte_notifier(&gmap_notifier);
+ gmap_register_pte_notifier(&gmap_notifier);
+ vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier;
+ gmap_register_pte_notifier(&vsie_gmap_notifier);
atomic_notifier_chain_register(&s390_epoch_delta_notifier,
&kvm_clock_notifier);
return 0;
@@ -183,11 +206,109 @@ int kvm_arch_hardware_setup(void)
void kvm_arch_hardware_unsetup(void)
{
- gmap_unregister_ipte_notifier(&gmap_notifier);
+ gmap_unregister_pte_notifier(&gmap_notifier);
+ gmap_unregister_pte_notifier(&vsie_gmap_notifier);
atomic_notifier_chain_unregister(&s390_epoch_delta_notifier,
&kvm_clock_notifier);
}
+static void allow_cpu_feat(unsigned long nr)
+{
+ set_bit_inv(nr, kvm_s390_available_cpu_feat);
+}
+
+static inline int plo_test_bit(unsigned char nr)
+{
+ register unsigned long r0 asm("0") = (unsigned long) nr | 0x100;
+ int cc = 3; /* subfunction not available */
+
+ asm volatile(
+ /* Parameter registers are ignored for "test bit" */
+ " plo 0,0,0,0(0)\n"
+ " ipm %0\n"
+ " srl %0,28\n"
+ : "=d" (cc)
+ : "d" (r0)
+ : "cc");
+ return cc == 0;
+}
+
+static void kvm_s390_cpu_feat_init(void)
+{
+ int i;
+
+ for (i = 0; i < 256; ++i) {
+ if (plo_test_bit(i))
+ kvm_s390_available_subfunc.plo[i >> 3] |= 0x80 >> (i & 7);
+ }
+
+ if (test_facility(28)) /* TOD-clock steering */
+ ptff(kvm_s390_available_subfunc.ptff,
+ sizeof(kvm_s390_available_subfunc.ptff),
+ PTFF_QAF);
+
+ if (test_facility(17)) { /* MSA */
+ __cpacf_query(CPACF_KMAC, kvm_s390_available_subfunc.kmac);
+ __cpacf_query(CPACF_KMC, kvm_s390_available_subfunc.kmc);
+ __cpacf_query(CPACF_KM, kvm_s390_available_subfunc.km);
+ __cpacf_query(CPACF_KIMD, kvm_s390_available_subfunc.kimd);
+ __cpacf_query(CPACF_KLMD, kvm_s390_available_subfunc.klmd);
+ }
+ if (test_facility(76)) /* MSA3 */
+ __cpacf_query(CPACF_PCKMO, kvm_s390_available_subfunc.pckmo);
+ if (test_facility(77)) { /* MSA4 */
+ __cpacf_query(CPACF_KMCTR, kvm_s390_available_subfunc.kmctr);
+ __cpacf_query(CPACF_KMF, kvm_s390_available_subfunc.kmf);
+ __cpacf_query(CPACF_KMO, kvm_s390_available_subfunc.kmo);
+ __cpacf_query(CPACF_PCC, kvm_s390_available_subfunc.pcc);
+ }
+ if (test_facility(57)) /* MSA5 */
+ __cpacf_query(CPACF_PPNO, kvm_s390_available_subfunc.ppno);
+
+ if (MACHINE_HAS_ESOP)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
+ /*
+ * We need SIE support, ESOP (PROT_READ protection for gmap_shadow),
+ * 64bit SCAO (SCA passthrough) and IDTE (for gmap_shadow unshadowing).
+ */
+ if (!sclp.has_sief2 || !MACHINE_HAS_ESOP || !sclp.has_64bscao ||
+ !test_facility(3) || !nested)
+ return;
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2);
+ if (sclp.has_64bscao)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_64BSCAO);
+ if (sclp.has_siif)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIIF);
+ if (sclp.has_gpere)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GPERE);
+ if (sclp.has_gsls)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GSLS);
+ if (sclp.has_ib)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IB);
+ if (sclp.has_cei)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_CEI);
+ if (sclp.has_ibs)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IBS);
+ /*
+ * KVM_S390_VM_CPU_FEAT_SKEY: Wrong shadow of PTE.I bits will make
+ * all skey handling functions read/set the skey from the PGSTE
+ * instead of the real storage key.
+ *
+ * KVM_S390_VM_CPU_FEAT_CMMA: Wrong shadow of PTE.I bits will make
+ * pages being detected as preserved although they are resident.
+ *
+ * KVM_S390_VM_CPU_FEAT_PFMFI: Wrong shadow of PTE.I bits will
+ * have the same effect as for KVM_S390_VM_CPU_FEAT_SKEY.
+ *
+ * For KVM_S390_VM_CPU_FEAT_SKEY, KVM_S390_VM_CPU_FEAT_CMMA and
+ * KVM_S390_VM_CPU_FEAT_PFMFI, all PTE.I and PGSTE bits have to be
+ * correctly shadowed. We can do that for the PGSTE but not for PTE.I.
+ *
+ * KVM_S390_VM_CPU_FEAT_SIGPIF: Wrong SCB addresses in the SCA. We
+ * cannot easily shadow the SCA because of the ipte lock.
+ */
+}
+
int kvm_arch_init(void *opaque)
{
kvm_s390_dbf = debug_register("kvm-trace", 32, 1, 7 * sizeof(long));
@@ -199,6 +320,8 @@ int kvm_arch_init(void *opaque)
return -ENOMEM;
}
+ kvm_s390_cpu_feat_init();
+
/* Register floating interrupt controller interface. */
return kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC);
}
@@ -244,6 +367,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_USER_STSI:
case KVM_CAP_S390_SKEYS:
case KVM_CAP_S390_IRQ_STATE:
+ case KVM_CAP_S390_USER_INSTR0:
r = 1;
break;
case KVM_CAP_S390_MEM_OP:
@@ -251,8 +375,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
break;
case KVM_CAP_NR_VCPUS:
case KVM_CAP_MAX_VCPUS:
- r = sclp.has_esca ? KVM_S390_ESCA_CPU_SLOTS
- : KVM_S390_BSCA_CPU_SLOTS;
+ r = KVM_S390_BSCA_CPU_SLOTS;
+ if (sclp.has_esca && sclp.has_64bscao)
+ r = KVM_S390_ESCA_CPU_SLOTS;
break;
case KVM_CAP_NR_MEMSLOTS:
r = KVM_USER_MEM_SLOTS;
@@ -335,6 +460,16 @@ out:
return r;
}
+static void icpt_operexc_on_all_vcpus(struct kvm *kvm)
+{
+ unsigned int i;
+ struct kvm_vcpu *vcpu;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ kvm_s390_sync_request(KVM_REQ_ICPT_OPEREXC, vcpu);
+ }
+}
+
static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
{
int r;
@@ -355,7 +490,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
break;
case KVM_CAP_S390_VECTOR_REGISTERS:
mutex_lock(&kvm->lock);
- if (atomic_read(&kvm->online_vcpus)) {
+ if (kvm->created_vcpus) {
r = -EBUSY;
} else if (MACHINE_HAS_VX) {
set_kvm_facility(kvm->arch.model.fac_mask, 129);
@@ -370,7 +505,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
case KVM_CAP_S390_RI:
r = -EINVAL;
mutex_lock(&kvm->lock);
- if (atomic_read(&kvm->online_vcpus)) {
+ if (kvm->created_vcpus) {
r = -EBUSY;
} else if (test_facility(64)) {
set_kvm_facility(kvm->arch.model.fac_mask, 64);
@@ -386,6 +521,12 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
kvm->arch.user_stsi = 1;
r = 0;
break;
+ case KVM_CAP_S390_USER_INSTR0:
+ VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_INSTR0");
+ kvm->arch.user_instr0 = 1;
+ icpt_operexc_on_all_vcpus(kvm);
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
@@ -418,21 +559,23 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
unsigned int idx;
switch (attr->attr) {
case KVM_S390_VM_MEM_ENABLE_CMMA:
- /* enable CMMA only for z10 and later (EDAT_1) */
- ret = -EINVAL;
- if (!MACHINE_IS_LPAR || !MACHINE_HAS_EDAT1)
+ ret = -ENXIO;
+ if (!sclp.has_cmma)
break;
ret = -EBUSY;
VM_EVENT(kvm, 3, "%s", "ENABLE: CMMA support");
mutex_lock(&kvm->lock);
- if (atomic_read(&kvm->online_vcpus) == 0) {
+ if (!kvm->created_vcpus) {
kvm->arch.use_cmma = 1;
ret = 0;
}
mutex_unlock(&kvm->lock);
break;
case KVM_S390_VM_MEM_CLR_CMMA:
+ ret = -ENXIO;
+ if (!sclp.has_cmma)
+ break;
ret = -EINVAL;
if (!kvm->arch.use_cmma)
break;
@@ -461,20 +604,20 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
if (!new_limit)
return -EINVAL;
- /* gmap_alloc takes last usable address */
+ /* gmap_create takes last usable address */
if (new_limit != KVM_S390_NO_MEM_LIMIT)
new_limit -= 1;
ret = -EBUSY;
mutex_lock(&kvm->lock);
- if (atomic_read(&kvm->online_vcpus) == 0) {
- /* gmap_alloc will round the limit up */
- struct gmap *new = gmap_alloc(current->mm, new_limit);
+ if (!kvm->created_vcpus) {
+ /* gmap_create will round the limit up */
+ struct gmap *new = gmap_create(current->mm, new_limit);
if (!new) {
ret = -ENOMEM;
} else {
- gmap_free(kvm->arch.gmap);
+ gmap_remove(kvm->arch.gmap);
new->private = kvm;
kvm->arch.gmap = new;
ret = 0;
@@ -644,7 +787,7 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr)
int ret = 0;
mutex_lock(&kvm->lock);
- if (atomic_read(&kvm->online_vcpus)) {
+ if (kvm->created_vcpus) {
ret = -EBUSY;
goto out;
}
@@ -676,6 +819,39 @@ out:
return ret;
}
+static int kvm_s390_set_processor_feat(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{
+ struct kvm_s390_vm_cpu_feat data;
+ int ret = -EBUSY;
+
+ if (copy_from_user(&data, (void __user *)attr->addr, sizeof(data)))
+ return -EFAULT;
+ if (!bitmap_subset((unsigned long *) data.feat,
+ kvm_s390_available_cpu_feat,
+ KVM_S390_VM_CPU_FEAT_NR_BITS))
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+ if (!atomic_read(&kvm->online_vcpus)) {
+ bitmap_copy(kvm->arch.cpu_feat, (unsigned long *) data.feat,
+ KVM_S390_VM_CPU_FEAT_NR_BITS);
+ ret = 0;
+ }
+ mutex_unlock(&kvm->lock);
+ return ret;
+}
+
+static int kvm_s390_set_processor_subfunc(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{
+ /*
+ * Once supported by kernel + hw, we have to store the subfunctions
+ * in kvm->arch and remember that user space configured them.
+ */
+ return -ENXIO;
+}
+
static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
{
int ret = -ENXIO;
@@ -684,6 +860,12 @@ static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
case KVM_S390_VM_CPU_PROCESSOR:
ret = kvm_s390_set_processor(kvm, attr);
break;
+ case KVM_S390_VM_CPU_PROCESSOR_FEAT:
+ ret = kvm_s390_set_processor_feat(kvm, attr);
+ break;
+ case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
+ ret = kvm_s390_set_processor_subfunc(kvm, attr);
+ break;
}
return ret;
}
@@ -732,6 +914,50 @@ out:
return ret;
}
+static int kvm_s390_get_processor_feat(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{
+ struct kvm_s390_vm_cpu_feat data;
+
+ bitmap_copy((unsigned long *) data.feat, kvm->arch.cpu_feat,
+ KVM_S390_VM_CPU_FEAT_NR_BITS);
+ if (copy_to_user((void __user *)attr->addr, &data, sizeof(data)))
+ return -EFAULT;
+ return 0;
+}
+
+static int kvm_s390_get_machine_feat(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{
+ struct kvm_s390_vm_cpu_feat data;
+
+ bitmap_copy((unsigned long *) data.feat,
+ kvm_s390_available_cpu_feat,
+ KVM_S390_VM_CPU_FEAT_NR_BITS);
+ if (copy_to_user((void __user *)attr->addr, &data, sizeof(data)))
+ return -EFAULT;
+ return 0;
+}
+
+static int kvm_s390_get_processor_subfunc(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{
+ /*
+ * Once we can actually configure subfunctions (kernel + hw support),
+ * we have to check if they were already set by user space, if so copy
+ * them from kvm->arch.
+ */
+ return -ENXIO;
+}
+
+static int kvm_s390_get_machine_subfunc(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{
+ if (copy_to_user((void __user *)attr->addr, &kvm_s390_available_subfunc,
+ sizeof(struct kvm_s390_vm_cpu_subfunc)))
+ return -EFAULT;
+ return 0;
+}
static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
{
int ret = -ENXIO;
@@ -743,6 +969,18 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
case KVM_S390_VM_CPU_MACHINE:
ret = kvm_s390_get_machine(kvm, attr);
break;
+ case KVM_S390_VM_CPU_PROCESSOR_FEAT:
+ ret = kvm_s390_get_processor_feat(kvm, attr);
+ break;
+ case KVM_S390_VM_CPU_MACHINE_FEAT:
+ ret = kvm_s390_get_machine_feat(kvm, attr);
+ break;
+ case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
+ ret = kvm_s390_get_processor_subfunc(kvm, attr);
+ break;
+ case KVM_S390_VM_CPU_MACHINE_SUBFUNC:
+ ret = kvm_s390_get_machine_subfunc(kvm, attr);
+ break;
}
return ret;
}
@@ -803,6 +1041,8 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
switch (attr->attr) {
case KVM_S390_VM_MEM_ENABLE_CMMA:
case KVM_S390_VM_MEM_CLR_CMMA:
+ ret = sclp.has_cmma ? 0 : -ENXIO;
+ break;
case KVM_S390_VM_MEM_LIMIT_SIZE:
ret = 0;
break;
@@ -826,8 +1066,13 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
switch (attr->attr) {
case KVM_S390_VM_CPU_PROCESSOR:
case KVM_S390_VM_CPU_MACHINE:
+ case KVM_S390_VM_CPU_PROCESSOR_FEAT:
+ case KVM_S390_VM_CPU_MACHINE_FEAT:
+ case KVM_S390_VM_CPU_MACHINE_SUBFUNC:
ret = 0;
break;
+ /* configuring subfunctions is not supported yet */
+ case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
default:
ret = -ENXIO;
break;
@@ -858,7 +1103,6 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
{
uint8_t *keys;
uint64_t hva;
- unsigned long curkey;
int i, r = 0;
if (args->flags != 0)
@@ -879,26 +1123,27 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
if (!keys)
return -ENOMEM;
+ down_read(&current->mm->mmap_sem);
for (i = 0; i < args->count; i++) {
hva = gfn_to_hva(kvm, args->start_gfn + i);
if (kvm_is_error_hva(hva)) {
r = -EFAULT;
- goto out;
+ break;
}
- curkey = get_guest_storage_key(current->mm, hva);
- if (IS_ERR_VALUE(curkey)) {
- r = curkey;
- goto out;
- }
- keys[i] = curkey;
+ r = get_guest_storage_key(current->mm, hva, &keys[i]);
+ if (r)
+ break;
+ }
+ up_read(&current->mm->mmap_sem);
+
+ if (!r) {
+ r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys,
+ sizeof(uint8_t) * args->count);
+ if (r)
+ r = -EFAULT;
}
- r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys,
- sizeof(uint8_t) * args->count);
- if (r)
- r = -EFAULT;
-out:
kvfree(keys);
return r;
}
@@ -935,24 +1180,25 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
if (r)
goto out;
+ down_read(&current->mm->mmap_sem);
for (i = 0; i < args->count; i++) {
hva = gfn_to_hva(kvm, args->start_gfn + i);
if (kvm_is_error_hva(hva)) {
r = -EFAULT;
- goto out;
+ break;
}
/* Lowest order bit is reserved */
if (keys[i] & 0x01) {
r = -EINVAL;
- goto out;
+ break;
}
- r = set_guest_storage_key(current->mm, hva,
- (unsigned long)keys[i], 0);
+ r = set_guest_storage_key(current->mm, hva, keys[i], 0);
if (r)
- goto out;
+ break;
}
+ up_read(&current->mm->mmap_sem);
out:
kvfree(keys);
return r;
@@ -1129,6 +1375,7 @@ static void sca_dispose(struct kvm *kvm)
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
+ gfp_t alloc_flags = GFP_KERNEL;
int i, rc;
char debug_name[16];
static unsigned long sca_offset;
@@ -1150,9 +1397,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
rc = -ENOMEM;
+ ratelimit_state_init(&kvm->arch.sthyi_limit, 5 * HZ, 500);
+
kvm->arch.use_esca = 0; /* start with basic SCA */
+ if (!sclp.has_64bscao)
+ alloc_flags |= GFP_DMA;
rwlock_init(&kvm->arch.sca_lock);
- kvm->arch.sca = (struct bsca_block *) get_zeroed_page(GFP_KERNEL);
+ kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags);
if (!kvm->arch.sca)
goto out_err;
spin_lock(&kvm_lock);
@@ -1189,6 +1440,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
memcpy(kvm->arch.model.fac_list, kvm->arch.model.fac_mask,
S390_ARCH_FAC_LIST_SIZE_BYTE);
+ set_kvm_facility(kvm->arch.model.fac_mask, 74);
+ set_kvm_facility(kvm->arch.model.fac_list, 74);
+
kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid();
kvm->arch.model.ibc = sclp.ibc & 0x0fff;
@@ -1212,7 +1466,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
else
kvm->arch.mem_limit = min_t(unsigned long, TASK_MAX_SIZE,
sclp.hamax + 1);
- kvm->arch.gmap = gmap_alloc(current->mm, kvm->arch.mem_limit - 1);
+ kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1);
if (!kvm->arch.gmap)
goto out_err;
kvm->arch.gmap->private = kvm;
@@ -1224,6 +1478,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm->arch.epoch = 0;
spin_lock_init(&kvm->arch.start_stop_lock);
+ kvm_s390_vsie_init(kvm);
KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid);
return 0;
@@ -1245,7 +1500,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
sca_del_vcpu(vcpu);
if (kvm_is_ucontrol(vcpu->kvm))
- gmap_free(vcpu->arch.gmap);
+ gmap_remove(vcpu->arch.gmap);
if (vcpu->kvm->arch.use_cmma)
kvm_s390_vcpu_unsetup_cmma(vcpu);
@@ -1278,16 +1533,17 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
debug_unregister(kvm->arch.dbf);
free_page((unsigned long)kvm->arch.sie_page2);
if (!kvm_is_ucontrol(kvm))
- gmap_free(kvm->arch.gmap);
+ gmap_remove(kvm->arch.gmap);
kvm_s390_destroy_adapters(kvm);
kvm_s390_clear_float_irqs(kvm);
+ kvm_s390_vsie_destroy(kvm);
KVM_EVENT(3, "vm 0x%pK destroyed", kvm);
}
/* Section: vcpu related */
static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu)
{
- vcpu->arch.gmap = gmap_alloc(current->mm, -1UL);
+ vcpu->arch.gmap = gmap_create(current->mm, -1UL);
if (!vcpu->arch.gmap)
return -ENOMEM;
vcpu->arch.gmap->private = vcpu->kvm;
@@ -1396,7 +1652,7 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id)
if (id < KVM_S390_BSCA_CPU_SLOTS)
return true;
- if (!sclp.has_esca)
+ if (!sclp.has_esca || !sclp.has_64bscao)
return false;
mutex_lock(&kvm->lock);
@@ -1537,7 +1793,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
save_access_regs(vcpu->arch.host_acrs);
restore_access_regs(vcpu->run->s.regs.acrs);
- gmap_enable(vcpu->arch.gmap);
+ gmap_enable(vcpu->arch.enabled_gmap);
atomic_or(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
__start_cpu_timer_accounting(vcpu);
@@ -1550,7 +1806,8 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
__stop_cpu_timer_accounting(vcpu);
atomic_andnot(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
- gmap_disable(vcpu->arch.gmap);
+ vcpu->arch.enabled_gmap = gmap_get_enabled();
+ gmap_disable(vcpu->arch.enabled_gmap);
/* Save guest register state */
save_fpu_regs();
@@ -1599,7 +1856,10 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
vcpu->arch.gmap = vcpu->kvm->arch.gmap;
sca_add_vcpu(vcpu);
}
-
+ if (test_kvm_facility(vcpu->kvm, 74) || vcpu->kvm->arch.user_instr0)
+ vcpu->arch.sie_block->ictl |= ICTL_OPEREXC;
+ /* make vcpu_load load the right gmap on the first trigger */
+ vcpu->arch.enabled_gmap = vcpu->arch.gmap;
}
static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu)
@@ -1658,15 +1918,21 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
kvm_s390_vcpu_setup_model(vcpu);
- vcpu->arch.sie_block->ecb = 0x02;
+ /* pgste_set_pte has special handling for !MACHINE_HAS_ESOP */
+ if (MACHINE_HAS_ESOP)
+ vcpu->arch.sie_block->ecb |= 0x02;
if (test_kvm_facility(vcpu->kvm, 9))
vcpu->arch.sie_block->ecb |= 0x04;
- if (test_kvm_facility(vcpu->kvm, 50) && test_kvm_facility(vcpu->kvm, 73))
+ if (test_kvm_facility(vcpu->kvm, 73))
vcpu->arch.sie_block->ecb |= 0x10;
- if (test_kvm_facility(vcpu->kvm, 8))
+ if (test_kvm_facility(vcpu->kvm, 8) && sclp.has_pfmfi)
vcpu->arch.sie_block->ecb2 |= 0x08;
- vcpu->arch.sie_block->eca = 0xC1002000U;
+ vcpu->arch.sie_block->eca = 0x1002000U;
+ if (sclp.has_cei)
+ vcpu->arch.sie_block->eca |= 0x80000000U;
+ if (sclp.has_ib)
+ vcpu->arch.sie_block->eca |= 0x40000000U;
if (sclp.has_siif)
vcpu->arch.sie_block->eca |= 1;
if (sclp.has_sigpif)
@@ -1716,6 +1982,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
vcpu->arch.sie_block = &sie_page->sie_block;
vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb;
+ /* the real guest size will always be smaller than msl */
+ vcpu->arch.sie_block->mso = 0;
+ vcpu->arch.sie_block->msl = sclp.hamax;
+
vcpu->arch.sie_block->icpua = id;
spin_lock_init(&vcpu->arch.local_int.lock);
vcpu->arch.local_int.float_int = &kvm->arch.float_int;
@@ -1784,16 +2054,25 @@ void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu)
kvm_s390_vcpu_request(vcpu);
}
-static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address)
+static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
+ unsigned long end)
{
- int i;
struct kvm *kvm = gmap->private;
struct kvm_vcpu *vcpu;
+ unsigned long prefix;
+ int i;
+ if (gmap_is_shadow(gmap))
+ return;
+ if (start >= 1UL << 31)
+ /* We are only interested in prefix pages */
+ return;
kvm_for_each_vcpu(i, vcpu, kvm) {
/* match against both prefix pages */
- if (kvm_s390_get_prefix(vcpu) == (address & ~0x1000UL)) {
- VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address);
+ prefix = kvm_s390_get_prefix(vcpu);
+ if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) {
+ VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx",
+ start, end);
kvm_s390_sync_request(KVM_REQ_MMU_RELOAD, vcpu);
}
}
@@ -2002,6 +2281,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
if (dbg->control & ~VALID_GUESTDBG_FLAGS)
return -EINVAL;
+ if (!sclp.has_gpere)
+ return -EINVAL;
if (dbg->control & KVM_GUESTDBG_ENABLE) {
vcpu->guest_debug = dbg->control;
@@ -2070,16 +2351,16 @@ retry:
return 0;
/*
* We use MMU_RELOAD just to re-arm the ipte notifier for the
- * guest prefix page. gmap_ipte_notify will wait on the ptl lock.
+ * guest prefix page. gmap_mprotect_notify will wait on the ptl lock.
* This ensures that the ipte instruction for this request has
* already finished. We might race against a second unmapper that
* wants to set the blocking bit. Lets just retry the request loop.
*/
if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) {
int rc;
- rc = gmap_ipte_notify(vcpu->arch.gmap,
- kvm_s390_get_prefix(vcpu),
- PAGE_SIZE * 2);
+ rc = gmap_mprotect_notify(vcpu->arch.gmap,
+ kvm_s390_get_prefix(vcpu),
+ PAGE_SIZE * 2, PROT_WRITE);
if (rc)
return rc;
goto retry;
@@ -2108,6 +2389,11 @@ retry:
goto retry;
}
+ if (kvm_check_request(KVM_REQ_ICPT_OPEREXC, vcpu)) {
+ vcpu->arch.sie_block->ictl |= ICTL_OPEREXC;
+ goto retry;
+ }
+
/* nothing to do, just clear the request */
clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
@@ -2362,14 +2648,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
* guest_enter and guest_exit should be no uaccess.
*/
local_irq_disable();
- __kvm_guest_enter();
+ guest_enter_irqoff();
__disable_cpu_timer_accounting(vcpu);
local_irq_enable();
exit_reason = sie64a(vcpu->arch.sie_block,
vcpu->run->s.regs.gprs);
local_irq_disable();
__enable_cpu_timer_accounting(vcpu);
- __kvm_guest_exit();
+ guest_exit_irqoff();
local_irq_enable();
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
@@ -2598,6 +2884,8 @@ static void __disable_ibs_on_all_vcpus(struct kvm *kvm)
static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
{
+ if (!sclp.has_ibs)
+ return;
kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu);
kvm_s390_sync_request(KVM_REQ_ENABLE_IBS, vcpu);
}
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 8621ab00ec8e..b8432862a817 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -56,7 +56,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu)
static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
{
- return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_WAIT;
+ return test_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask);
}
static inline int kvm_is_ucontrol(struct kvm *kvm)
@@ -175,6 +175,12 @@ static inline int set_kvm_facility(u64 *fac_list, unsigned long nr)
return 0;
}
+static inline int test_kvm_cpu_feat(struct kvm *kvm, unsigned long nr)
+{
+ WARN_ON_ONCE(nr >= KVM_S390_VM_CPU_FEAT_NR_BITS);
+ return test_bit_inv(nr, kvm->arch.cpu_feat);
+}
+
/* are cpu states controlled by user space */
static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm)
{
@@ -232,6 +238,8 @@ static inline void kvm_s390_forward_psw(struct kvm_vcpu *vcpu, int ilen)
}
static inline void kvm_s390_retry_instr(struct kvm_vcpu *vcpu)
{
+ /* don't inject PER events if we re-execute the instruction */
+ vcpu->arch.sie_block->icptstatus &= ~0x02;
kvm_s390_rewind_psw(vcpu, kvm_s390_get_ilen(vcpu));
}
@@ -246,10 +254,21 @@ int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu);
int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu);
int kvm_s390_handle_eb(struct kvm_vcpu *vcpu);
+/* implemented in vsie.c */
+int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu);
+void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu);
+void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
+ unsigned long end);
+void kvm_s390_vsie_init(struct kvm *kvm);
+void kvm_s390_vsie_destroy(struct kvm *kvm);
+
/* implemented in sigp.c */
int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);
+/* implemented in sthyi.c */
+int handle_sthyi(struct kvm_vcpu *vcpu);
+
/* implemented in kvm-s390.c */
void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod);
long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
@@ -360,6 +379,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
struct kvm_guest_debug *dbg);
void kvm_s390_clear_bp_data(struct kvm_vcpu *vcpu);
void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu);
+int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu);
void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu);
/* support for Basic/Extended SCA handling */
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 95916fa7c670..46160388e996 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -27,6 +27,7 @@
#include <asm/io.h>
#include <asm/ptrace.h>
#include <asm/compat.h>
+#include <asm/sclp.h>
#include "gaccess.h"
#include "kvm-s390.h"
#include "trace.h"
@@ -152,30 +153,166 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
static int __skey_check_enable(struct kvm_vcpu *vcpu)
{
int rc = 0;
+
+ trace_kvm_s390_skey_related_inst(vcpu);
if (!(vcpu->arch.sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)))
return rc;
rc = s390_enable_skey();
- VCPU_EVENT(vcpu, 3, "%s", "enabling storage keys for guest");
- trace_kvm_s390_skey_related_inst(vcpu);
- vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE);
+ VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc);
+ if (!rc)
+ vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE);
return rc;
}
-
-static int handle_skey(struct kvm_vcpu *vcpu)
+static int try_handle_skey(struct kvm_vcpu *vcpu)
{
- int rc = __skey_check_enable(vcpu);
+ int rc;
+ vcpu->stat.instruction_storage_key++;
+ rc = __skey_check_enable(vcpu);
if (rc)
return rc;
- vcpu->stat.instruction_storage_key++;
-
+ if (sclp.has_skey) {
+ /* with storage-key facility, SIE interprets it for us */
+ kvm_s390_retry_instr(vcpu);
+ VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation");
+ return -EAGAIN;
+ }
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+ return 0;
+}
- kvm_s390_retry_instr(vcpu);
- VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation");
+static int handle_iske(struct kvm_vcpu *vcpu)
+{
+ unsigned long addr;
+ unsigned char key;
+ int reg1, reg2;
+ int rc;
+
+ rc = try_handle_skey(vcpu);
+ if (rc)
+ return rc != -EAGAIN ? rc : 0;
+
+ kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+
+ addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+ addr = kvm_s390_logical_to_effective(vcpu, addr);
+ addr = kvm_s390_real_to_abs(vcpu, addr);
+ addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr));
+ if (kvm_is_error_hva(addr))
+ return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+ down_read(&current->mm->mmap_sem);
+ rc = get_guest_storage_key(current->mm, addr, &key);
+ up_read(&current->mm->mmap_sem);
+ if (rc)
+ return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+ vcpu->run->s.regs.gprs[reg1] &= ~0xff;
+ vcpu->run->s.regs.gprs[reg1] |= key;
+ return 0;
+}
+
+static int handle_rrbe(struct kvm_vcpu *vcpu)
+{
+ unsigned long addr;
+ int reg1, reg2;
+ int rc;
+
+ rc = try_handle_skey(vcpu);
+ if (rc)
+ return rc != -EAGAIN ? rc : 0;
+
+ kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+
+ addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+ addr = kvm_s390_logical_to_effective(vcpu, addr);
+ addr = kvm_s390_real_to_abs(vcpu, addr);
+ addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr));
+ if (kvm_is_error_hva(addr))
+ return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+ down_read(&current->mm->mmap_sem);
+ rc = reset_guest_reference_bit(current->mm, addr);
+ up_read(&current->mm->mmap_sem);
+ if (rc < 0)
+ return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+ kvm_s390_set_psw_cc(vcpu, rc);
+ return 0;
+}
+
+#define SSKE_NQ 0x8
+#define SSKE_MR 0x4
+#define SSKE_MC 0x2
+#define SSKE_MB 0x1
+static int handle_sske(struct kvm_vcpu *vcpu)
+{
+ unsigned char m3 = vcpu->arch.sie_block->ipb >> 28;
+ unsigned long start, end;
+ unsigned char key, oldkey;
+ int reg1, reg2;
+ int rc;
+
+ rc = try_handle_skey(vcpu);
+ if (rc)
+ return rc != -EAGAIN ? rc : 0;
+
+ if (!test_kvm_facility(vcpu->kvm, 8))
+ m3 &= ~SSKE_MB;
+ if (!test_kvm_facility(vcpu->kvm, 10))
+ m3 &= ~(SSKE_MC | SSKE_MR);
+ if (!test_kvm_facility(vcpu->kvm, 14))
+ m3 &= ~SSKE_NQ;
+
+ kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+
+ key = vcpu->run->s.regs.gprs[reg1] & 0xfe;
+ start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+ start = kvm_s390_logical_to_effective(vcpu, start);
+ if (m3 & SSKE_MB) {
+ /* start already designates an absolute address */
+ end = (start + (1UL << 20)) & ~((1UL << 20) - 1);
+ } else {
+ start = kvm_s390_real_to_abs(vcpu, start);
+ end = start + PAGE_SIZE;
+ }
+
+ while (start != end) {
+ unsigned long addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
+
+ if (kvm_is_error_hva(addr))
+ return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+ down_read(&current->mm->mmap_sem);
+ rc = cond_set_guest_storage_key(current->mm, addr, key, &oldkey,
+ m3 & SSKE_NQ, m3 & SSKE_MR,
+ m3 & SSKE_MC);
+ up_read(&current->mm->mmap_sem);
+ if (rc < 0)
+ return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+ start += PAGE_SIZE;
+ };
+
+ if (m3 & (SSKE_MC | SSKE_MR)) {
+ if (m3 & SSKE_MB) {
+ /* skey in reg1 is unpredictable */
+ kvm_s390_set_psw_cc(vcpu, 3);
+ } else {
+ kvm_s390_set_psw_cc(vcpu, rc);
+ vcpu->run->s.regs.gprs[reg1] &= ~0xff00UL;
+ vcpu->run->s.regs.gprs[reg1] |= (u64) oldkey << 8;
+ }
+ }
+ if (m3 & SSKE_MB) {
+ if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_64BIT)
+ vcpu->run->s.regs.gprs[reg2] &= ~PAGE_MASK;
+ else
+ vcpu->run->s.regs.gprs[reg2] &= ~0xfffff000UL;
+ end = kvm_s390_logical_to_effective(vcpu, end);
+ vcpu->run->s.regs.gprs[reg2] |= end;
+ }
return 0;
}
@@ -582,10 +719,11 @@ static const intercept_handler_t b2_handlers[256] = {
[0x10] = handle_set_prefix,
[0x11] = handle_store_prefix,
[0x12] = handle_store_cpu_address,
+ [0x14] = kvm_s390_handle_vsie,
[0x21] = handle_ipte_interlock,
- [0x29] = handle_skey,
- [0x2a] = handle_skey,
- [0x2b] = handle_skey,
+ [0x29] = handle_iske,
+ [0x2a] = handle_rrbe,
+ [0x2b] = handle_sske,
[0x2c] = handle_test_block,
[0x30] = handle_io_inst,
[0x31] = handle_io_inst,
@@ -654,8 +792,10 @@ static int handle_epsw(struct kvm_vcpu *vcpu)
static int handle_pfmf(struct kvm_vcpu *vcpu)
{
+ bool mr = false, mc = false, nq;
int reg1, reg2;
unsigned long start, end;
+ unsigned char key;
vcpu->stat.instruction_pfmf++;
@@ -675,15 +815,27 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
!test_kvm_facility(vcpu->kvm, 14))
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
- /* No support for conditional-SSKE */
- if (vcpu->run->s.regs.gprs[reg1] & (PFMF_MR | PFMF_MC))
- return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+ /* Only provide conditional-SSKE support if enabled for the guest */
+ if (vcpu->run->s.regs.gprs[reg1] & PFMF_SK &&
+ test_kvm_facility(vcpu->kvm, 10)) {
+ mr = vcpu->run->s.regs.gprs[reg1] & PFMF_MR;
+ mc = vcpu->run->s.regs.gprs[reg1] & PFMF_MC;
+ }
+ nq = vcpu->run->s.regs.gprs[reg1] & PFMF_NQ;
+ key = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY;
start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
start = kvm_s390_logical_to_effective(vcpu, start);
+ if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
+ if (kvm_s390_check_low_addr_prot_real(vcpu, start))
+ return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
+ }
+
switch (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) {
case 0x00000000:
+ /* only 4k frames specify a real address */
+ start = kvm_s390_real_to_abs(vcpu, start);
end = (start + (1UL << 12)) & ~((1UL << 12) - 1);
break;
case 0x00001000:
@@ -701,20 +853,11 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
}
- if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
- if (kvm_s390_check_low_addr_prot_real(vcpu, start))
- return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
- }
-
- while (start < end) {
- unsigned long useraddr, abs_addr;
+ while (start != end) {
+ unsigned long useraddr;
/* Translate guest address to host address */
- if ((vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) == 0)
- abs_addr = kvm_s390_real_to_abs(vcpu, start);
- else
- abs_addr = start;
- useraddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(abs_addr));
+ useraddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
if (kvm_is_error_hva(useraddr))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
@@ -728,16 +871,25 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
if (rc)
return rc;
- if (set_guest_storage_key(current->mm, useraddr,
- vcpu->run->s.regs.gprs[reg1] & PFMF_KEY,
- vcpu->run->s.regs.gprs[reg1] & PFMF_NQ))
+ down_read(&current->mm->mmap_sem);
+ rc = cond_set_guest_storage_key(current->mm, useraddr,
+ key, NULL, nq, mr, mc);
+ up_read(&current->mm->mmap_sem);
+ if (rc < 0)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
}
start += PAGE_SIZE;
}
- if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC)
- vcpu->run->s.regs.gprs[reg2] = end;
+ if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) {
+ if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_64BIT) {
+ vcpu->run->s.regs.gprs[reg2] = end;
+ } else {
+ vcpu->run->s.regs.gprs[reg2] &= ~0xffffffffUL;
+ end = kvm_s390_logical_to_effective(vcpu, end);
+ vcpu->run->s.regs.gprs[reg2] |= end;
+ }
+ }
return 0;
}
@@ -1033,7 +1185,15 @@ static int handle_sckpf(struct kvm_vcpu *vcpu)
return 0;
}
+static int handle_ptff(struct kvm_vcpu *vcpu)
+{
+ /* we don't emulate any control instructions yet */
+ kvm_s390_set_psw_cc(vcpu, 3);
+ return 0;
+}
+
static const intercept_handler_t x01_handlers[256] = {
+ [0x04] = handle_ptff,
[0x07] = handle_sckpf,
};
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 28ea0cab1f1b..1a252f537081 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -77,18 +77,18 @@ static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu,
const u64 psw_int_mask = PSW_MASK_IO | PSW_MASK_EXT;
u16 p_asn, s_asn;
psw_t *psw;
- u32 flags;
+ bool idle;
- flags = atomic_read(&dst_vcpu->arch.sie_block->cpuflags);
+ idle = is_vcpu_idle(vcpu);
psw = &dst_vcpu->arch.sie_block->gpsw;
p_asn = dst_vcpu->arch.sie_block->gcr[4] & 0xffff; /* Primary ASN */
s_asn = dst_vcpu->arch.sie_block->gcr[3] & 0xffff; /* Secondary ASN */
/* Inject the emergency signal? */
- if (!(flags & CPUSTAT_STOPPED)
+ if (!is_vcpu_stopped(vcpu)
|| (psw->mask & psw_int_mask) != psw_int_mask
- || ((flags & CPUSTAT_WAIT) && psw->addr != 0)
- || (!(flags & CPUSTAT_WAIT) && (asn == p_asn || asn == s_asn))) {
+ || (idle && psw->addr != 0)
+ || (!idle && (asn == p_asn || asn == s_asn))) {
return __inject_sigp_emergency(vcpu, dst_vcpu);
} else {
*reg &= 0xffffffff00000000UL;
diff --git a/arch/s390/kvm/sthyi.c b/arch/s390/kvm/sthyi.c
new file mode 100644
index 000000000000..bd98b7d25200
--- /dev/null
+++ b/arch/s390/kvm/sthyi.c
@@ -0,0 +1,471 @@
+/*
+ * store hypervisor information instruction emulation functions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ * Copyright IBM Corp. 2016
+ * Author(s): Janosch Frank <frankja@linux.vnet.ibm.com>
+ */
+#include <linux/kvm_host.h>
+#include <linux/errno.h>
+#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
+#include <linux/ratelimit.h>
+
+#include <asm/kvm_host.h>
+#include <asm/asm-offsets.h>
+#include <asm/sclp.h>
+#include <asm/diag.h>
+#include <asm/sysinfo.h>
+#include <asm/ebcdic.h>
+
+#include "kvm-s390.h"
+#include "gaccess.h"
+#include "trace.h"
+
+#define DED_WEIGHT 0xffff
+/*
+ * CP and IFL as EBCDIC strings, SP/0x40 determines the end of string
+ * as they are justified with spaces.
+ */
+#define CP 0xc3d7404040404040UL
+#define IFL 0xc9c6d34040404040UL
+
+enum hdr_flags {
+ HDR_NOT_LPAR = 0x10,
+ HDR_STACK_INCM = 0x20,
+ HDR_STSI_UNAV = 0x40,
+ HDR_PERF_UNAV = 0x80,
+};
+
+enum mac_validity {
+ MAC_NAME_VLD = 0x20,
+ MAC_ID_VLD = 0x40,
+ MAC_CNT_VLD = 0x80,
+};
+
+enum par_flag {
+ PAR_MT_EN = 0x80,
+};
+
+enum par_validity {
+ PAR_GRP_VLD = 0x08,
+ PAR_ID_VLD = 0x10,
+ PAR_ABS_VLD = 0x20,
+ PAR_WGHT_VLD = 0x40,
+ PAR_PCNT_VLD = 0x80,
+};
+
+struct hdr_sctn {
+ u8 infhflg1;
+ u8 infhflg2; /* reserved */
+ u8 infhval1; /* reserved */
+ u8 infhval2; /* reserved */
+ u8 reserved[3];
+ u8 infhygct;
+ u16 infhtotl;
+ u16 infhdln;
+ u16 infmoff;
+ u16 infmlen;
+ u16 infpoff;
+ u16 infplen;
+ u16 infhoff1;
+ u16 infhlen1;
+ u16 infgoff1;
+ u16 infglen1;
+ u16 infhoff2;
+ u16 infhlen2;
+ u16 infgoff2;
+ u16 infglen2;
+ u16 infhoff3;
+ u16 infhlen3;
+ u16 infgoff3;
+ u16 infglen3;
+ u8 reserved2[4];
+} __packed;
+
+struct mac_sctn {
+ u8 infmflg1; /* reserved */
+ u8 infmflg2; /* reserved */
+ u8 infmval1;
+ u8 infmval2; /* reserved */
+ u16 infmscps;
+ u16 infmdcps;
+ u16 infmsifl;
+ u16 infmdifl;
+ char infmname[8];
+ char infmtype[4];
+ char infmmanu[16];
+ char infmseq[16];
+ char infmpman[4];
+ u8 reserved[4];
+} __packed;
+
+struct par_sctn {
+ u8 infpflg1;
+ u8 infpflg2; /* reserved */
+ u8 infpval1;
+ u8 infpval2; /* reserved */
+ u16 infppnum;
+ u16 infpscps;
+ u16 infpdcps;
+ u16 infpsifl;
+ u16 infpdifl;
+ u16 reserved;
+ char infppnam[8];
+ u32 infpwbcp;
+ u32 infpabcp;
+ u32 infpwbif;
+ u32 infpabif;
+ char infplgnm[8];
+ u32 infplgcp;
+ u32 infplgif;
+} __packed;
+
+struct sthyi_sctns {
+ struct hdr_sctn hdr;
+ struct mac_sctn mac;
+ struct par_sctn par;
+} __packed;
+
+struct cpu_inf {
+ u64 lpar_cap;
+ u64 lpar_grp_cap;
+ u64 lpar_weight;
+ u64 all_weight;
+ int cpu_num_ded;
+ int cpu_num_shd;
+};
+
+struct lpar_cpu_inf {
+ struct cpu_inf cp;
+ struct cpu_inf ifl;
+};
+
+static inline u64 cpu_id(u8 ctidx, void *diag224_buf)
+{
+ return *((u64 *)(diag224_buf + (ctidx + 1) * DIAG204_CPU_NAME_LEN));
+}
+
+/*
+ * Scales the cpu capping from the lpar range to the one expected in
+ * sthyi data.
+ *
+ * diag204 reports a cap in hundredths of processor units.
+ * z/VM's range for one core is 0 - 0x10000.
+ */
+static u32 scale_cap(u32 in)
+{
+ return (0x10000 * in) / 100;
+}
+
+static void fill_hdr(struct sthyi_sctns *sctns)
+{
+ sctns->hdr.infhdln = sizeof(sctns->hdr);
+ sctns->hdr.infmoff = sizeof(sctns->hdr);
+ sctns->hdr.infmlen = sizeof(sctns->mac);
+ sctns->hdr.infplen = sizeof(sctns->par);
+ sctns->hdr.infpoff = sctns->hdr.infhdln + sctns->hdr.infmlen;
+ sctns->hdr.infhtotl = sctns->hdr.infpoff + sctns->hdr.infplen;
+}
+
+static void fill_stsi_mac(struct sthyi_sctns *sctns,
+ struct sysinfo_1_1_1 *sysinfo)
+{
+ if (stsi(sysinfo, 1, 1, 1))
+ return;
+
+ sclp_ocf_cpc_name_copy(sctns->mac.infmname);
+
+ memcpy(sctns->mac.infmtype, sysinfo->type, sizeof(sctns->mac.infmtype));
+ memcpy(sctns->mac.infmmanu, sysinfo->manufacturer, sizeof(sctns->mac.infmmanu));
+ memcpy(sctns->mac.infmpman, sysinfo->plant, sizeof(sctns->mac.infmpman));
+ memcpy(sctns->mac.infmseq, sysinfo->sequence, sizeof(sctns->mac.infmseq));
+
+ sctns->mac.infmval1 |= MAC_ID_VLD | MAC_NAME_VLD;
+}
+
+static void fill_stsi_par(struct sthyi_sctns *sctns,
+ struct sysinfo_2_2_2 *sysinfo)
+{
+ if (stsi(sysinfo, 2, 2, 2))
+ return;
+
+ sctns->par.infppnum = sysinfo->lpar_number;
+ memcpy(sctns->par.infppnam, sysinfo->name, sizeof(sctns->par.infppnam));
+
+ sctns->par.infpval1 |= PAR_ID_VLD;
+}
+
+static void fill_stsi(struct sthyi_sctns *sctns)
+{
+ void *sysinfo;
+
+ /* Errors are handled through the validity bits in the response. */
+ sysinfo = (void *)__get_free_page(GFP_KERNEL);
+ if (!sysinfo)
+ return;
+
+ fill_stsi_mac(sctns, sysinfo);
+ fill_stsi_par(sctns, sysinfo);
+
+ free_pages((unsigned long)sysinfo, 0);
+}
+
+static void fill_diag_mac(struct sthyi_sctns *sctns,
+ struct diag204_x_phys_block *block,
+ void *diag224_buf)
+{
+ int i;
+
+ for (i = 0; i < block->hdr.cpus; i++) {
+ switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) {
+ case CP:
+ if (block->cpus[i].weight == DED_WEIGHT)
+ sctns->mac.infmdcps++;
+ else
+ sctns->mac.infmscps++;
+ break;
+ case IFL:
+ if (block->cpus[i].weight == DED_WEIGHT)
+ sctns->mac.infmdifl++;
+ else
+ sctns->mac.infmsifl++;
+ break;
+ }
+ }
+ sctns->mac.infmval1 |= MAC_CNT_VLD;
+}
+
+/* Returns a pointer to the the next partition block. */
+static struct diag204_x_part_block *lpar_cpu_inf(struct lpar_cpu_inf *part_inf,
+ bool this_lpar,
+ void *diag224_buf,
+ struct diag204_x_part_block *block)
+{
+ int i, capped = 0, weight_cp = 0, weight_ifl = 0;
+ struct cpu_inf *cpu_inf;
+
+ for (i = 0; i < block->hdr.rcpus; i++) {
+ if (!(block->cpus[i].cflag & DIAG204_CPU_ONLINE))
+ continue;
+
+ switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) {
+ case CP:
+ cpu_inf = &part_inf->cp;
+ if (block->cpus[i].cur_weight < DED_WEIGHT)
+ weight_cp |= block->cpus[i].cur_weight;
+ break;
+ case IFL:
+ cpu_inf = &part_inf->ifl;
+ if (block->cpus[i].cur_weight < DED_WEIGHT)
+ weight_ifl |= block->cpus[i].cur_weight;
+ break;
+ default:
+ continue;
+ }
+
+ if (!this_lpar)
+ continue;
+
+ capped |= block->cpus[i].cflag & DIAG204_CPU_CAPPED;
+ cpu_inf->lpar_cap |= block->cpus[i].cpu_type_cap;
+ cpu_inf->lpar_grp_cap |= block->cpus[i].group_cpu_type_cap;
+
+ if (block->cpus[i].weight == DED_WEIGHT)
+ cpu_inf->cpu_num_ded += 1;
+ else
+ cpu_inf->cpu_num_shd += 1;
+ }
+
+ if (this_lpar && capped) {
+ part_inf->cp.lpar_weight = weight_cp;
+ part_inf->ifl.lpar_weight = weight_ifl;
+ }
+ part_inf->cp.all_weight += weight_cp;
+ part_inf->ifl.all_weight += weight_ifl;
+ return (struct diag204_x_part_block *)&block->cpus[i];
+}
+
+static void fill_diag(struct sthyi_sctns *sctns)
+{
+ int i, r, pages;
+ bool this_lpar;
+ void *diag204_buf;
+ void *diag224_buf = NULL;
+ struct diag204_x_info_blk_hdr *ti_hdr;
+ struct diag204_x_part_block *part_block;
+ struct diag204_x_phys_block *phys_block;
+ struct lpar_cpu_inf lpar_inf = {};
+
+ /* Errors are handled through the validity bits in the response. */
+ pages = diag204((unsigned long)DIAG204_SUBC_RSI |
+ (unsigned long)DIAG204_INFO_EXT, 0, NULL);
+ if (pages <= 0)
+ return;
+
+ diag204_buf = vmalloc(PAGE_SIZE * pages);
+ if (!diag204_buf)
+ return;
+
+ r = diag204((unsigned long)DIAG204_SUBC_STIB7 |
+ (unsigned long)DIAG204_INFO_EXT, pages, diag204_buf);
+ if (r < 0)
+ goto out;
+
+ diag224_buf = kmalloc(PAGE_SIZE, GFP_KERNEL | GFP_DMA);
+ if (!diag224_buf || diag224(diag224_buf))
+ goto out;
+
+ ti_hdr = diag204_buf;
+ part_block = diag204_buf + sizeof(*ti_hdr);
+
+ for (i = 0; i < ti_hdr->npar; i++) {
+ /*
+ * For the calling lpar we also need to get the cpu
+ * caps and weights. The time information block header
+ * specifies the offset to the partition block of the
+ * caller lpar, so we know when we process its data.
+ */
+ this_lpar = (void *)part_block - diag204_buf == ti_hdr->this_part;
+ part_block = lpar_cpu_inf(&lpar_inf, this_lpar, diag224_buf,
+ part_block);
+ }
+
+ phys_block = (struct diag204_x_phys_block *)part_block;
+ part_block = diag204_buf + ti_hdr->this_part;
+ if (part_block->hdr.mtid)
+ sctns->par.infpflg1 = PAR_MT_EN;
+
+ sctns->par.infpval1 |= PAR_GRP_VLD;
+ sctns->par.infplgcp = scale_cap(lpar_inf.cp.lpar_grp_cap);
+ sctns->par.infplgif = scale_cap(lpar_inf.ifl.lpar_grp_cap);
+ memcpy(sctns->par.infplgnm, part_block->hdr.hardware_group_name,
+ sizeof(sctns->par.infplgnm));
+
+ sctns->par.infpscps = lpar_inf.cp.cpu_num_shd;
+ sctns->par.infpdcps = lpar_inf.cp.cpu_num_ded;
+ sctns->par.infpsifl = lpar_inf.ifl.cpu_num_shd;
+ sctns->par.infpdifl = lpar_inf.ifl.cpu_num_ded;
+ sctns->par.infpval1 |= PAR_PCNT_VLD;
+
+ sctns->par.infpabcp = scale_cap(lpar_inf.cp.lpar_cap);
+ sctns->par.infpabif = scale_cap(lpar_inf.ifl.lpar_cap);
+ sctns->par.infpval1 |= PAR_ABS_VLD;
+
+ /*
+ * Everything below needs global performance data to be
+ * meaningful.
+ */
+ if (!(ti_hdr->flags & DIAG204_LPAR_PHYS_FLG)) {
+ sctns->hdr.infhflg1 |= HDR_PERF_UNAV;
+ goto out;
+ }
+
+ fill_diag_mac(sctns, phys_block, diag224_buf);
+
+ if (lpar_inf.cp.lpar_weight) {
+ sctns->par.infpwbcp = sctns->mac.infmscps * 0x10000 *
+ lpar_inf.cp.lpar_weight / lpar_inf.cp.all_weight;
+ }
+
+ if (lpar_inf.ifl.lpar_weight) {
+ sctns->par.infpwbif = sctns->mac.infmsifl * 0x10000 *
+ lpar_inf.ifl.lpar_weight / lpar_inf.ifl.all_weight;
+ }
+ sctns->par.infpval1 |= PAR_WGHT_VLD;
+
+out:
+ kfree(diag224_buf);
+ vfree(diag204_buf);
+}
+
+static int sthyi(u64 vaddr)
+{
+ register u64 code asm("0") = 0;
+ register u64 addr asm("2") = vaddr;
+ int cc;
+
+ asm volatile(
+ ".insn rre,0xB2560000,%[code],%[addr]\n"
+ "ipm %[cc]\n"
+ "srl %[cc],28\n"
+ : [cc] "=d" (cc)
+ : [code] "d" (code), [addr] "a" (addr)
+ : "memory", "cc");
+ return cc;
+}
+
+int handle_sthyi(struct kvm_vcpu *vcpu)
+{
+ int reg1, reg2, r = 0;
+ u64 code, addr, cc = 0;
+ struct sthyi_sctns *sctns = NULL;
+
+ /*
+ * STHYI requires extensive locking in the higher hypervisors
+ * and is very computational/memory expensive. Therefore we
+ * ratelimit the executions per VM.
+ */
+ if (!__ratelimit(&vcpu->kvm->arch.sthyi_limit)) {
+ kvm_s390_retry_instr(vcpu);
+ return 0;
+ }
+
+ kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+ code = vcpu->run->s.regs.gprs[reg1];
+ addr = vcpu->run->s.regs.gprs[reg2];
+
+ vcpu->stat.instruction_sthyi++;
+ VCPU_EVENT(vcpu, 3, "STHYI: fc: %llu addr: 0x%016llx", code, addr);
+ trace_kvm_s390_handle_sthyi(vcpu, code, addr);
+
+ if (reg1 == reg2 || reg1 & 1 || reg2 & 1 || addr & ~PAGE_MASK)
+ return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+ if (code & 0xffff) {
+ cc = 3;
+ goto out;
+ }
+
+ /*
+ * If the page has not yet been faulted in, we want to do that
+ * now and not after all the expensive calculations.
+ */
+ r = write_guest(vcpu, addr, reg2, &cc, 1);
+ if (r)
+ return kvm_s390_inject_prog_cond(vcpu, r);
+
+ sctns = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!sctns)
+ return -ENOMEM;
+
+ /*
+ * If we are a guest, we don't want to emulate an emulated
+ * instruction. We ask the hypervisor to provide the data.
+ */
+ if (test_facility(74)) {
+ cc = sthyi((u64)sctns);
+ goto out;
+ }
+
+ fill_hdr(sctns);
+ fill_stsi(sctns);
+ fill_diag(sctns);
+
+out:
+ if (!cc) {
+ r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE);
+ if (r) {
+ free_page((unsigned long)sctns);
+ return kvm_s390_inject_prog_cond(vcpu, r);
+ }
+ }
+
+ free_page((unsigned long)sctns);
+ vcpu->run->s.regs.gprs[reg2 + 1] = cc ? 4 : 0;
+ kvm_s390_set_psw_cc(vcpu, cc);
+ return r;
+}
diff --git a/arch/s390/kvm/trace.h b/arch/s390/kvm/trace.h
index 916834d7a73a..4fc9d4e5be89 100644
--- a/arch/s390/kvm/trace.h
+++ b/arch/s390/kvm/trace.h
@@ -41,7 +41,7 @@ TRACE_EVENT(kvm_s390_skey_related_inst,
TP_fast_assign(
VCPU_ASSIGN_COMMON
),
- VCPU_TP_PRINTK("%s", "first instruction related to skeys on vcpu")
+ VCPU_TP_PRINTK("%s", "storage key related instruction")
);
TRACE_EVENT(kvm_s390_major_guest_pfault,
@@ -185,8 +185,10 @@ TRACE_EVENT(kvm_s390_intercept_prog,
__entry->code = code;
),
- VCPU_TP_PRINTK("intercepted program interruption %04x",
- __entry->code)
+ VCPU_TP_PRINTK("intercepted program interruption %04x (%s)",
+ __entry->code,
+ __print_symbolic(__entry->code,
+ icpt_prog_codes))
);
/*
@@ -412,6 +414,47 @@ TRACE_EVENT(kvm_s390_handle_stsi,
__entry->addr)
);
+TRACE_EVENT(kvm_s390_handle_operexc,
+ TP_PROTO(VCPU_PROTO_COMMON, __u16 ipa, __u32 ipb),
+ TP_ARGS(VCPU_ARGS_COMMON, ipa, ipb),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(__u64, instruction)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->instruction = ((__u64)ipa << 48) |
+ ((__u64)ipb << 16);
+ ),
+
+ VCPU_TP_PRINTK("operation exception on instruction %016llx (%s)",
+ __entry->instruction,
+ __print_symbolic(icpt_insn_decoder(__entry->instruction),
+ icpt_insn_codes))
+ );
+
+TRACE_EVENT(kvm_s390_handle_sthyi,
+ TP_PROTO(VCPU_PROTO_COMMON, u64 code, u64 addr),
+ TP_ARGS(VCPU_ARGS_COMMON, code, addr),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(u64, code)
+ __field(u64, addr)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->code = code;
+ __entry->addr = addr;
+ ),
+
+ VCPU_TP_PRINTK("STHYI fc: %llu addr: %016llx",
+ __entry->code, __entry->addr)
+ );
+
#endif /* _TRACE_KVM_H */
/* This part must be outside protection */
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
new file mode 100644
index 000000000000..c106488b4137
--- /dev/null
+++ b/arch/s390/kvm/vsie.c
@@ -0,0 +1,1091 @@
+/*
+ * kvm nested virtualization support for s390x
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ * Author(s): David Hildenbrand <dahi@linux.vnet.ibm.com>
+ */
+#include <linux/vmalloc.h>
+#include <linux/kvm_host.h>
+#include <linux/bug.h>
+#include <linux/list.h>
+#include <linux/bitmap.h>
+#include <asm/gmap.h>
+#include <asm/mmu_context.h>
+#include <asm/sclp.h>
+#include <asm/nmi.h>
+#include <asm/dis.h>
+#include "kvm-s390.h"
+#include "gaccess.h"
+
+struct vsie_page {
+ struct kvm_s390_sie_block scb_s; /* 0x0000 */
+ /* the pinned originial scb */
+ struct kvm_s390_sie_block *scb_o; /* 0x0200 */
+ /* the shadow gmap in use by the vsie_page */
+ struct gmap *gmap; /* 0x0208 */
+ /* address of the last reported fault to guest2 */
+ unsigned long fault_addr; /* 0x0210 */
+ __u8 reserved[0x0700 - 0x0218]; /* 0x0218 */
+ struct kvm_s390_crypto_cb crycb; /* 0x0700 */
+ __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */
+} __packed;
+
+/* trigger a validity icpt for the given scb */
+static int set_validity_icpt(struct kvm_s390_sie_block *scb,
+ __u16 reason_code)
+{
+ scb->ipa = 0x1000;
+ scb->ipb = ((__u32) reason_code) << 16;
+ scb->icptcode = ICPT_VALIDITY;
+ return 1;
+}
+
+/* mark the prefix as unmapped, this will block the VSIE */
+static void prefix_unmapped(struct vsie_page *vsie_page)
+{
+ atomic_or(PROG_REQUEST, &vsie_page->scb_s.prog20);
+}
+
+/* mark the prefix as unmapped and wait until the VSIE has been left */
+static void prefix_unmapped_sync(struct vsie_page *vsie_page)
+{
+ prefix_unmapped(vsie_page);
+ if (vsie_page->scb_s.prog0c & PROG_IN_SIE)
+ atomic_or(CPUSTAT_STOP_INT, &vsie_page->scb_s.cpuflags);
+ while (vsie_page->scb_s.prog0c & PROG_IN_SIE)
+ cpu_relax();
+}
+
+/* mark the prefix as mapped, this will allow the VSIE to run */
+static void prefix_mapped(struct vsie_page *vsie_page)
+{
+ atomic_andnot(PROG_REQUEST, &vsie_page->scb_s.prog20);
+}
+
+/* test if the prefix is mapped into the gmap shadow */
+static int prefix_is_mapped(struct vsie_page *vsie_page)
+{
+ return !(atomic_read(&vsie_page->scb_s.prog20) & PROG_REQUEST);
+}
+
+/* copy the updated intervention request bits into the shadow scb */
+static void update_intervention_requests(struct vsie_page *vsie_page)
+{
+ const int bits = CPUSTAT_STOP_INT | CPUSTAT_IO_INT | CPUSTAT_EXT_INT;
+ int cpuflags;
+
+ cpuflags = atomic_read(&vsie_page->scb_o->cpuflags);
+ atomic_andnot(bits, &vsie_page->scb_s.cpuflags);
+ atomic_or(cpuflags & bits, &vsie_page->scb_s.cpuflags);
+}
+
+/* shadow (filter and validate) the cpuflags */
+static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ int newflags, cpuflags = atomic_read(&scb_o->cpuflags);
+
+ /* we don't allow ESA/390 guests */
+ if (!(cpuflags & CPUSTAT_ZARCH))
+ return set_validity_icpt(scb_s, 0x0001U);
+
+ if (cpuflags & (CPUSTAT_RRF | CPUSTAT_MCDS))
+ return set_validity_icpt(scb_s, 0x0001U);
+ else if (cpuflags & (CPUSTAT_SLSV | CPUSTAT_SLSR))
+ return set_validity_icpt(scb_s, 0x0007U);
+
+ /* intervention requests will be set later */
+ newflags = CPUSTAT_ZARCH;
+ if (cpuflags & CPUSTAT_GED && test_kvm_facility(vcpu->kvm, 8))
+ newflags |= CPUSTAT_GED;
+ if (cpuflags & CPUSTAT_GED2 && test_kvm_facility(vcpu->kvm, 78)) {
+ if (cpuflags & CPUSTAT_GED)
+ return set_validity_icpt(scb_s, 0x0001U);
+ newflags |= CPUSTAT_GED2;
+ }
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GPERE))
+ newflags |= cpuflags & CPUSTAT_P;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GSLS))
+ newflags |= cpuflags & CPUSTAT_SM;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IBS))
+ newflags |= cpuflags & CPUSTAT_IBS;
+
+ atomic_set(&scb_s->cpuflags, newflags);
+ return 0;
+}
+
+/*
+ * Create a shadow copy of the crycb block and setup key wrapping, if
+ * requested for guest 3 and enabled for guest 2.
+ *
+ * We only accept format-1 (no AP in g2), but convert it into format-2
+ * There is nothing to do for format-0.
+ *
+ * Returns: - 0 if shadowed or nothing to do
+ * - > 0 if control has to be given to guest 2
+ */
+static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ u32 crycb_addr = scb_o->crycbd & 0x7ffffff8U;
+ unsigned long *b1, *b2;
+ u8 ecb3_flags;
+
+ scb_s->crycbd = 0;
+ if (!(scb_o->crycbd & vcpu->arch.sie_block->crycbd & CRYCB_FORMAT1))
+ return 0;
+ /* format-1 is supported with message-security-assist extension 3 */
+ if (!test_kvm_facility(vcpu->kvm, 76))
+ return 0;
+ /* we may only allow it if enabled for guest 2 */
+ ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 &
+ (ECB3_AES | ECB3_DEA);
+ if (!ecb3_flags)
+ return 0;
+
+ if ((crycb_addr & PAGE_MASK) != ((crycb_addr + 128) & PAGE_MASK))
+ return set_validity_icpt(scb_s, 0x003CU);
+ else if (!crycb_addr)
+ return set_validity_icpt(scb_s, 0x0039U);
+
+ /* copy only the wrapping keys */
+ if (read_guest_real(vcpu, crycb_addr + 72, &vsie_page->crycb, 56))
+ return set_validity_icpt(scb_s, 0x0035U);
+
+ scb_s->ecb3 |= ecb3_flags;
+ scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT1 |
+ CRYCB_FORMAT2;
+
+ /* xor both blocks in one run */
+ b1 = (unsigned long *) vsie_page->crycb.dea_wrapping_key_mask;
+ b2 = (unsigned long *)
+ vcpu->kvm->arch.crypto.crycb->dea_wrapping_key_mask;
+ /* as 56%8 == 0, bitmap_xor won't overwrite any data */
+ bitmap_xor(b1, b1, b2, BITS_PER_BYTE * 56);
+ return 0;
+}
+
+/* shadow (round up/down) the ibc to avoid validity icpt */
+static void prepare_ibc(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ __u64 min_ibc = (sclp.ibc >> 16) & 0x0fffU;
+
+ scb_s->ibc = 0;
+ /* ibc installed in g2 and requested for g3 */
+ if (vcpu->kvm->arch.model.ibc && (scb_o->ibc & 0x0fffU)) {
+ scb_s->ibc = scb_o->ibc & 0x0fffU;
+ /* takte care of the minimum ibc level of the machine */
+ if (scb_s->ibc < min_ibc)
+ scb_s->ibc = min_ibc;
+ /* take care of the maximum ibc level set for the guest */
+ if (scb_s->ibc > vcpu->kvm->arch.model.ibc)
+ scb_s->ibc = vcpu->kvm->arch.model.ibc;
+ }
+}
+
+/* unshadow the scb, copying parameters back to the real scb */
+static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+
+ /* interception */
+ scb_o->icptcode = scb_s->icptcode;
+ scb_o->icptstatus = scb_s->icptstatus;
+ scb_o->ipa = scb_s->ipa;
+ scb_o->ipb = scb_s->ipb;
+ scb_o->gbea = scb_s->gbea;
+
+ /* timer */
+ scb_o->cputm = scb_s->cputm;
+ scb_o->ckc = scb_s->ckc;
+ scb_o->todpr = scb_s->todpr;
+
+ /* guest state */
+ scb_o->gpsw = scb_s->gpsw;
+ scb_o->gg14 = scb_s->gg14;
+ scb_o->gg15 = scb_s->gg15;
+ memcpy(scb_o->gcr, scb_s->gcr, 128);
+ scb_o->pp = scb_s->pp;
+
+ /* interrupt intercept */
+ switch (scb_s->icptcode) {
+ case ICPT_PROGI:
+ case ICPT_INSTPROGI:
+ case ICPT_EXTINT:
+ memcpy((void *)((u64)scb_o + 0xc0),
+ (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0);
+ break;
+ case ICPT_PARTEXEC:
+ /* MVPG only */
+ memcpy((void *)((u64)scb_o + 0xc0),
+ (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0);
+ break;
+ }
+
+ if (scb_s->ihcpu != 0xffffU)
+ scb_o->ihcpu = scb_s->ihcpu;
+}
+
+/*
+ * Setup the shadow scb by copying and checking the relevant parts of the g2
+ * provided scb.
+ *
+ * Returns: - 0 if the scb has been shadowed
+ * - > 0 if control has to be given to guest 2
+ */
+static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ bool had_tx = scb_s->ecb & 0x10U;
+ unsigned long new_mso = 0;
+ int rc;
+
+ /* make sure we don't have any leftovers when reusing the scb */
+ scb_s->icptcode = 0;
+ scb_s->eca = 0;
+ scb_s->ecb = 0;
+ scb_s->ecb2 = 0;
+ scb_s->ecb3 = 0;
+ scb_s->ecd = 0;
+ scb_s->fac = 0;
+
+ rc = prepare_cpuflags(vcpu, vsie_page);
+ if (rc)
+ goto out;
+
+ /* timer */
+ scb_s->cputm = scb_o->cputm;
+ scb_s->ckc = scb_o->ckc;
+ scb_s->todpr = scb_o->todpr;
+ scb_s->epoch = scb_o->epoch;
+
+ /* guest state */
+ scb_s->gpsw = scb_o->gpsw;
+ scb_s->gg14 = scb_o->gg14;
+ scb_s->gg15 = scb_o->gg15;
+ memcpy(scb_s->gcr, scb_o->gcr, 128);
+ scb_s->pp = scb_o->pp;
+
+ /* interception / execution handling */
+ scb_s->gbea = scb_o->gbea;
+ scb_s->lctl = scb_o->lctl;
+ scb_s->svcc = scb_o->svcc;
+ scb_s->ictl = scb_o->ictl;
+ /*
+ * SKEY handling functions can't deal with false setting of PTE invalid
+ * bits. Therefore we cannot provide interpretation and would later
+ * have to provide own emulation handlers.
+ */
+ scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
+ scb_s->icpua = scb_o->icpua;
+
+ if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM))
+ new_mso = scb_o->mso & 0xfffffffffff00000UL;
+ /* if the hva of the prefix changes, we have to remap the prefix */
+ if (scb_s->mso != new_mso || scb_s->prefix != scb_o->prefix)
+ prefix_unmapped(vsie_page);
+ /* SIE will do mso/msl validity and exception checks for us */
+ scb_s->msl = scb_o->msl & 0xfffffffffff00000UL;
+ scb_s->mso = new_mso;
+ scb_s->prefix = scb_o->prefix;
+
+ /* We have to definetly flush the tlb if this scb never ran */
+ if (scb_s->ihcpu != 0xffffU)
+ scb_s->ihcpu = scb_o->ihcpu;
+
+ /* MVPG and Protection Exception Interpretation are always available */
+ scb_s->eca |= scb_o->eca & 0x01002000U;
+ /* Host-protection-interruption introduced with ESOP */
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP))
+ scb_s->ecb |= scb_o->ecb & 0x02U;
+ /* transactional execution */
+ if (test_kvm_facility(vcpu->kvm, 73)) {
+ /* remap the prefix is tx is toggled on */
+ if ((scb_o->ecb & 0x10U) && !had_tx)
+ prefix_unmapped(vsie_page);
+ scb_s->ecb |= scb_o->ecb & 0x10U;
+ }
+ /* SIMD */
+ if (test_kvm_facility(vcpu->kvm, 129)) {
+ scb_s->eca |= scb_o->eca & 0x00020000U;
+ scb_s->ecd |= scb_o->ecd & 0x20000000U;
+ }
+ /* Run-time-Instrumentation */
+ if (test_kvm_facility(vcpu->kvm, 64))
+ scb_s->ecb3 |= scb_o->ecb3 & 0x01U;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF))
+ scb_s->eca |= scb_o->eca & 0x00000001U;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB))
+ scb_s->eca |= scb_o->eca & 0x40000000U;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI))
+ scb_s->eca |= scb_o->eca & 0x80000000U;
+
+ prepare_ibc(vcpu, vsie_page);
+ rc = shadow_crycb(vcpu, vsie_page);
+out:
+ if (rc)
+ unshadow_scb(vcpu, vsie_page);
+ return rc;
+}
+
+void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
+ unsigned long end)
+{
+ struct kvm *kvm = gmap->private;
+ struct vsie_page *cur;
+ unsigned long prefix;
+ struct page *page;
+ int i;
+
+ if (!gmap_is_shadow(gmap))
+ return;
+ if (start >= 1UL << 31)
+ /* We are only interested in prefix pages */
+ return;
+
+ /*
+ * Only new shadow blocks are added to the list during runtime,
+ * therefore we can safely reference them all the time.
+ */
+ for (i = 0; i < kvm->arch.vsie.page_count; i++) {
+ page = READ_ONCE(kvm->arch.vsie.pages[i]);
+ if (!page)
+ continue;
+ cur = page_to_virt(page);
+ if (READ_ONCE(cur->gmap) != gmap)
+ continue;
+ prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT;
+ /* with mso/msl, the prefix lies at an offset */
+ prefix += cur->scb_s.mso;
+ if (prefix <= end && start <= prefix + 2 * PAGE_SIZE - 1)
+ prefix_unmapped_sync(cur);
+ }
+}
+
+/*
+ * Map the first prefix page and if tx is enabled also the second prefix page.
+ *
+ * The prefix will be protected, a gmap notifier will inform about unmaps.
+ * The shadow scb must not be executed until the prefix is remapped, this is
+ * guaranteed by properly handling PROG_REQUEST.
+ *
+ * Returns: - 0 on if successfully mapped or already mapped
+ * - > 0 if control has to be given to guest 2
+ * - -EAGAIN if the caller can retry immediately
+ * - -ENOMEM if out of memory
+ */
+static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ u64 prefix = scb_s->prefix << GUEST_PREFIX_SHIFT;
+ int rc;
+
+ if (prefix_is_mapped(vsie_page))
+ return 0;
+
+ /* mark it as mapped so we can catch any concurrent unmappers */
+ prefix_mapped(vsie_page);
+
+ /* with mso/msl, the prefix lies at offset *mso* */
+ prefix += scb_s->mso;
+
+ rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix);
+ if (!rc && (scb_s->ecb & 0x10U))
+ rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
+ prefix + PAGE_SIZE);
+ /*
+ * We don't have to mprotect, we will be called for all unshadows.
+ * SIE will detect if protection applies and trigger a validity.
+ */
+ if (rc)
+ prefix_unmapped(vsie_page);
+ if (rc > 0 || rc == -EFAULT)
+ rc = set_validity_icpt(scb_s, 0x0037U);
+ return rc;
+}
+
+/*
+ * Pin the guest page given by gpa and set hpa to the pinned host address.
+ * Will always be pinned writable.
+ *
+ * Returns: - 0 on success
+ * - -EINVAL if the gpa is not valid guest storage
+ * - -ENOMEM if out of memory
+ */
+static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa)
+{
+ struct page *page;
+ hva_t hva;
+ int rc;
+
+ hva = gfn_to_hva(kvm, gpa_to_gfn(gpa));
+ if (kvm_is_error_hva(hva))
+ return -EINVAL;
+ rc = get_user_pages_fast(hva, 1, 1, &page);
+ if (rc < 0)
+ return rc;
+ else if (rc != 1)
+ return -ENOMEM;
+ *hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK);
+ return 0;
+}
+
+/* Unpins a page previously pinned via pin_guest_page, marking it as dirty. */
+static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa)
+{
+ struct page *page;
+
+ page = virt_to_page(hpa);
+ set_page_dirty_lock(page);
+ put_page(page);
+ /* mark the page always as dirty for migration */
+ mark_page_dirty(kvm, gpa_to_gfn(gpa));
+}
+
+/* unpin all blocks previously pinned by pin_blocks(), marking them dirty */
+static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ hpa_t hpa;
+ gpa_t gpa;
+
+ hpa = (u64) scb_s->scaoh << 32 | scb_s->scaol;
+ if (hpa) {
+ gpa = scb_o->scaol & ~0xfUL;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO))
+ gpa |= (u64) scb_o->scaoh << 32;
+ unpin_guest_page(vcpu->kvm, gpa, hpa);
+ scb_s->scaol = 0;
+ scb_s->scaoh = 0;
+ }
+
+ hpa = scb_s->itdba;
+ if (hpa) {
+ gpa = scb_o->itdba & ~0xffUL;
+ unpin_guest_page(vcpu->kvm, gpa, hpa);
+ scb_s->itdba = 0;
+ }
+
+ hpa = scb_s->gvrd;
+ if (hpa) {
+ gpa = scb_o->gvrd & ~0x1ffUL;
+ unpin_guest_page(vcpu->kvm, gpa, hpa);
+ scb_s->gvrd = 0;
+ }
+
+ hpa = scb_s->riccbd;
+ if (hpa) {
+ gpa = scb_o->riccbd & ~0x3fUL;
+ unpin_guest_page(vcpu->kvm, gpa, hpa);
+ scb_s->riccbd = 0;
+ }
+}
+
+/*
+ * Instead of shadowing some blocks, we can simply forward them because the
+ * addresses in the scb are 64 bit long.
+ *
+ * This works as long as the data lies in one page. If blocks ever exceed one
+ * page, we have to fall back to shadowing.
+ *
+ * As we reuse the sca, the vcpu pointers contained in it are invalid. We must
+ * therefore not enable any facilities that access these pointers (e.g. SIGPIF).
+ *
+ * Returns: - 0 if all blocks were pinned.
+ * - > 0 if control has to be given to guest 2
+ * - -ENOMEM if out of memory
+ */
+static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ hpa_t hpa;
+ gpa_t gpa;
+ int rc = 0;
+
+ gpa = scb_o->scaol & ~0xfUL;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO))
+ gpa |= (u64) scb_o->scaoh << 32;
+ if (gpa) {
+ if (!(gpa & ~0x1fffUL))
+ rc = set_validity_icpt(scb_s, 0x0038U);
+ else if ((gpa & ~0x1fffUL) == kvm_s390_get_prefix(vcpu))
+ rc = set_validity_icpt(scb_s, 0x0011U);
+ else if ((gpa & PAGE_MASK) !=
+ ((gpa + sizeof(struct bsca_block) - 1) & PAGE_MASK))
+ rc = set_validity_icpt(scb_s, 0x003bU);
+ if (!rc) {
+ rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+ if (rc == -EINVAL)
+ rc = set_validity_icpt(scb_s, 0x0034U);
+ }
+ if (rc)
+ goto unpin;
+ scb_s->scaoh = (u32)((u64)hpa >> 32);
+ scb_s->scaol = (u32)(u64)hpa;
+ }
+
+ gpa = scb_o->itdba & ~0xffUL;
+ if (gpa && (scb_s->ecb & 0x10U)) {
+ if (!(gpa & ~0x1fffU)) {
+ rc = set_validity_icpt(scb_s, 0x0080U);
+ goto unpin;
+ }
+ /* 256 bytes cannot cross page boundaries */
+ rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+ if (rc == -EINVAL)
+ rc = set_validity_icpt(scb_s, 0x0080U);
+ if (rc)
+ goto unpin;
+ scb_s->itdba = hpa;
+ }
+
+ gpa = scb_o->gvrd & ~0x1ffUL;
+ if (gpa && (scb_s->eca & 0x00020000U) &&
+ !(scb_s->ecd & 0x20000000U)) {
+ if (!(gpa & ~0x1fffUL)) {
+ rc = set_validity_icpt(scb_s, 0x1310U);
+ goto unpin;
+ }
+ /*
+ * 512 bytes vector registers cannot cross page boundaries
+ * if this block gets bigger, we have to shadow it.
+ */
+ rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+ if (rc == -EINVAL)
+ rc = set_validity_icpt(scb_s, 0x1310U);
+ if (rc)
+ goto unpin;
+ scb_s->gvrd = hpa;
+ }
+
+ gpa = scb_o->riccbd & ~0x3fUL;
+ if (gpa && (scb_s->ecb3 & 0x01U)) {
+ if (!(gpa & ~0x1fffUL)) {
+ rc = set_validity_icpt(scb_s, 0x0043U);
+ goto unpin;
+ }
+ /* 64 bytes cannot cross page boundaries */
+ rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+ if (rc == -EINVAL)
+ rc = set_validity_icpt(scb_s, 0x0043U);
+ /* Validity 0x0044 will be checked by SIE */
+ if (rc)
+ goto unpin;
+ scb_s->gvrd = hpa;
+ }
+ return 0;
+unpin:
+ unpin_blocks(vcpu, vsie_page);
+ return rc;
+}
+
+/* unpin the scb provided by guest 2, marking it as dirty */
+static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
+ gpa_t gpa)
+{
+ hpa_t hpa = (hpa_t) vsie_page->scb_o;
+
+ if (hpa)
+ unpin_guest_page(vcpu->kvm, gpa, hpa);
+ vsie_page->scb_o = NULL;
+}
+
+/*
+ * Pin the scb at gpa provided by guest 2 at vsie_page->scb_o.
+ *
+ * Returns: - 0 if the scb was pinned.
+ * - > 0 if control has to be given to guest 2
+ * - -ENOMEM if out of memory
+ */
+static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
+ gpa_t gpa)
+{
+ hpa_t hpa;
+ int rc;
+
+ rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+ if (rc == -EINVAL) {
+ rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+ if (!rc)
+ rc = 1;
+ }
+ if (!rc)
+ vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa;
+ return rc;
+}
+
+/*
+ * Inject a fault into guest 2.
+ *
+ * Returns: - > 0 if control has to be given to guest 2
+ * < 0 if an error occurred during injection.
+ */
+static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr,
+ bool write_flag)
+{
+ struct kvm_s390_pgm_info pgm = {
+ .code = code,
+ .trans_exc_code =
+ /* 0-51: virtual address */
+ (vaddr & 0xfffffffffffff000UL) |
+ /* 52-53: store / fetch */
+ (((unsigned int) !write_flag) + 1) << 10,
+ /* 62-63: asce id (alway primary == 0) */
+ .exc_access_id = 0, /* always primary */
+ .op_access_id = 0, /* not MVPG */
+ };
+ int rc;
+
+ if (code == PGM_PROTECTION)
+ pgm.trans_exc_code |= 0x4UL;
+
+ rc = kvm_s390_inject_prog_irq(vcpu, &pgm);
+ return rc ? rc : 1;
+}
+
+/*
+ * Handle a fault during vsie execution on a gmap shadow.
+ *
+ * Returns: - 0 if the fault was resolved
+ * - > 0 if control has to be given to guest 2
+ * - < 0 if an error occurred
+ */
+static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ int rc;
+
+ if (current->thread.gmap_int_code == PGM_PROTECTION)
+ /* we can directly forward all protection exceptions */
+ return inject_fault(vcpu, PGM_PROTECTION,
+ current->thread.gmap_addr, 1);
+
+ rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
+ current->thread.gmap_addr);
+ if (rc > 0) {
+ rc = inject_fault(vcpu, rc,
+ current->thread.gmap_addr,
+ current->thread.gmap_write_flag);
+ if (rc >= 0)
+ vsie_page->fault_addr = current->thread.gmap_addr;
+ }
+ return rc;
+}
+
+/*
+ * Retry the previous fault that required guest 2 intervention. This avoids
+ * one superfluous SIE re-entry and direct exit.
+ *
+ * Will ignore any errors. The next SIE fault will do proper fault handling.
+ */
+static void handle_last_fault(struct kvm_vcpu *vcpu,
+ struct vsie_page *vsie_page)
+{
+ if (vsie_page->fault_addr)
+ kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
+ vsie_page->fault_addr);
+ vsie_page->fault_addr = 0;
+}
+
+static inline void clear_vsie_icpt(struct vsie_page *vsie_page)
+{
+ vsie_page->scb_s.icptcode = 0;
+}
+
+/* rewind the psw and clear the vsie icpt, so we can retry execution */
+static void retry_vsie_icpt(struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ int ilen = insn_length(scb_s->ipa >> 8);
+
+ /* take care of EXECUTE instructions */
+ if (scb_s->icptstatus & 1) {
+ ilen = (scb_s->icptstatus >> 4) & 0x6;
+ if (!ilen)
+ ilen = 4;
+ }
+ scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, ilen);
+ clear_vsie_icpt(vsie_page);
+}
+
+/*
+ * Try to shadow + enable the guest 2 provided facility list.
+ * Retry instruction execution if enabled for and provided by guest 2.
+ *
+ * Returns: - 0 if handled (retry or guest 2 icpt)
+ * - > 0 if control has to be given to guest 2
+ */
+static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ __u32 fac = vsie_page->scb_o->fac & 0x7ffffff8U;
+
+ if (fac && test_kvm_facility(vcpu->kvm, 7)) {
+ retry_vsie_icpt(vsie_page);
+ if (read_guest_real(vcpu, fac, &vsie_page->fac,
+ sizeof(vsie_page->fac)))
+ return set_validity_icpt(scb_s, 0x1090U);
+ scb_s->fac = (__u32)(__u64) &vsie_page->fac;
+ }
+ return 0;
+}
+
+/*
+ * Run the vsie on a shadow scb and a shadow gmap, without any further
+ * sanity checks, handling SIE faults.
+ *
+ * Returns: - 0 everything went fine
+ * - > 0 if control has to be given to guest 2
+ * - < 0 if an error occurred
+ */
+static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ int rc;
+
+ handle_last_fault(vcpu, vsie_page);
+
+ if (need_resched())
+ schedule();
+ if (test_cpu_flag(CIF_MCCK_PENDING))
+ s390_handle_mcck();
+
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ local_irq_disable();
+ guest_enter_irqoff();
+ local_irq_enable();
+
+ rc = sie64a(scb_s, vcpu->run->s.regs.gprs);
+
+ local_irq_disable();
+ guest_exit_irqoff();
+ local_irq_enable();
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ if (rc > 0)
+ rc = 0; /* we could still have an icpt */
+ else if (rc == -EFAULT)
+ return handle_fault(vcpu, vsie_page);
+
+ switch (scb_s->icptcode) {
+ case ICPT_INST:
+ if (scb_s->ipa == 0xb2b0)
+ rc = handle_stfle(vcpu, vsie_page);
+ break;
+ case ICPT_STOP:
+ /* stop not requested by g2 - must have been a kick */
+ if (!(atomic_read(&scb_o->cpuflags) & CPUSTAT_STOP_INT))
+ clear_vsie_icpt(vsie_page);
+ break;
+ case ICPT_VALIDITY:
+ if ((scb_s->ipa & 0xf000) != 0xf000)
+ scb_s->ipa += 0x1000;
+ break;
+ }
+ return rc;
+}
+
+static void release_gmap_shadow(struct vsie_page *vsie_page)
+{
+ if (vsie_page->gmap)
+ gmap_put(vsie_page->gmap);
+ WRITE_ONCE(vsie_page->gmap, NULL);
+ prefix_unmapped(vsie_page);
+}
+
+static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
+ struct vsie_page *vsie_page)
+{
+ unsigned long asce;
+ union ctlreg0 cr0;
+ struct gmap *gmap;
+ int edat;
+
+ asce = vcpu->arch.sie_block->gcr[1];
+ cr0.val = vcpu->arch.sie_block->gcr[0];
+ edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
+ edat += edat && test_kvm_facility(vcpu->kvm, 78);
+
+ /*
+ * ASCE or EDAT could have changed since last icpt, or the gmap
+ * we're holding has been unshadowed. If the gmap is still valid,
+ * we can safely reuse it.
+ */
+ if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat))
+ return 0;
+
+ /* release the old shadow - if any, and mark the prefix as unmapped */
+ release_gmap_shadow(vsie_page);
+ gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
+ if (IS_ERR(gmap))
+ return PTR_ERR(gmap);
+ gmap->private = vcpu->kvm;
+ WRITE_ONCE(vsie_page->gmap, gmap);
+ return 0;
+}
+
+/*
+ * Register the shadow scb at the VCPU, e.g. for kicking out of vsie.
+ */
+static void register_shadow_scb(struct kvm_vcpu *vcpu,
+ struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+
+ WRITE_ONCE(vcpu->arch.vsie_block, &vsie_page->scb_s);
+ /*
+ * External calls have to lead to a kick of the vcpu and
+ * therefore the vsie -> Simulate Wait state.
+ */
+ atomic_or(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
+ /*
+ * We have to adjust the g3 epoch by the g2 epoch. The epoch will
+ * automatically be adjusted on tod clock changes via kvm_sync_clock.
+ */
+ preempt_disable();
+ scb_s->epoch += vcpu->kvm->arch.epoch;
+ preempt_enable();
+}
+
+/*
+ * Unregister a shadow scb from a VCPU.
+ */
+static void unregister_shadow_scb(struct kvm_vcpu *vcpu)
+{
+ atomic_andnot(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
+ WRITE_ONCE(vcpu->arch.vsie_block, NULL);
+}
+
+/*
+ * Run the vsie on a shadowed scb, managing the gmap shadow, handling
+ * prefix pages and faults.
+ *
+ * Returns: - 0 if no errors occurred
+ * - > 0 if control has to be given to guest 2
+ * - -ENOMEM if out of memory
+ */
+static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ int rc = 0;
+
+ while (1) {
+ rc = acquire_gmap_shadow(vcpu, vsie_page);
+ if (!rc)
+ rc = map_prefix(vcpu, vsie_page);
+ if (!rc) {
+ gmap_enable(vsie_page->gmap);
+ update_intervention_requests(vsie_page);
+ rc = do_vsie_run(vcpu, vsie_page);
+ gmap_enable(vcpu->arch.gmap);
+ }
+ atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20);
+
+ if (rc == -EAGAIN)
+ rc = 0;
+ if (rc || scb_s->icptcode || signal_pending(current) ||
+ kvm_s390_vcpu_has_irq(vcpu, 0))
+ break;
+ };
+
+ if (rc == -EFAULT) {
+ /*
+ * Addressing exceptions are always presentes as intercepts.
+ * As addressing exceptions are suppressing and our guest 3 PSW
+ * points at the responsible instruction, we have to
+ * forward the PSW and set the ilc. If we can't read guest 3
+ * instruction, we can use an arbitrary ilc. Let's always use
+ * ilen = 4 for now, so we can avoid reading in guest 3 virtual
+ * memory. (we could also fake the shadow so the hardware
+ * handles it).
+ */
+ scb_s->icptcode = ICPT_PROGI;
+ scb_s->iprcc = PGM_ADDRESSING;
+ scb_s->pgmilc = 4;
+ scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, 4);
+ }
+ return rc;
+}
+
+/*
+ * Get or create a vsie page for a scb address.
+ *
+ * Returns: - address of a vsie page (cached or new one)
+ * - NULL if the same scb address is already used by another VCPU
+ * - ERR_PTR(-ENOMEM) if out of memory
+ */
+static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
+{
+ struct vsie_page *vsie_page;
+ struct page *page;
+ int nr_vcpus;
+
+ rcu_read_lock();
+ page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9);
+ rcu_read_unlock();
+ if (page) {
+ if (page_ref_inc_return(page) == 2)
+ return page_to_virt(page);
+ page_ref_dec(page);
+ }
+
+ /*
+ * We want at least #online_vcpus shadows, so every VCPU can execute
+ * the VSIE in parallel.
+ */
+ nr_vcpus = atomic_read(&kvm->online_vcpus);
+
+ mutex_lock(&kvm->arch.vsie.mutex);
+ if (kvm->arch.vsie.page_count < nr_vcpus) {
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO | GFP_DMA);
+ if (!page) {
+ mutex_unlock(&kvm->arch.vsie.mutex);
+ return ERR_PTR(-ENOMEM);
+ }
+ page_ref_inc(page);
+ kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = page;
+ kvm->arch.vsie.page_count++;
+ } else {
+ /* reuse an existing entry that belongs to nobody */
+ while (true) {
+ page = kvm->arch.vsie.pages[kvm->arch.vsie.next];
+ if (page_ref_inc_return(page) == 2)
+ break;
+ page_ref_dec(page);
+ kvm->arch.vsie.next++;
+ kvm->arch.vsie.next %= nr_vcpus;
+ }
+ radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9);
+ }
+ page->index = addr;
+ /* double use of the same address */
+ if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, page)) {
+ page_ref_dec(page);
+ mutex_unlock(&kvm->arch.vsie.mutex);
+ return NULL;
+ }
+ mutex_unlock(&kvm->arch.vsie.mutex);
+
+ vsie_page = page_to_virt(page);
+ memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block));
+ release_gmap_shadow(vsie_page);
+ vsie_page->fault_addr = 0;
+ vsie_page->scb_s.ihcpu = 0xffffU;
+ return vsie_page;
+}
+
+/* put a vsie page acquired via get_vsie_page */
+static void put_vsie_page(struct kvm *kvm, struct vsie_page *vsie_page)
+{
+ struct page *page = pfn_to_page(__pa(vsie_page) >> PAGE_SHIFT);
+
+ page_ref_dec(page);
+}
+
+int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu)
+{
+ struct vsie_page *vsie_page;
+ unsigned long scb_addr;
+ int rc;
+
+ vcpu->stat.instruction_sie++;
+ if (!test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIEF2))
+ return -EOPNOTSUPP;
+ if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+ return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
+ BUILD_BUG_ON(sizeof(struct vsie_page) != 4096);
+ scb_addr = kvm_s390_get_base_disp_s(vcpu, NULL);
+
+ /* 512 byte alignment */
+ if (unlikely(scb_addr & 0x1ffUL))
+ return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+ if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0))
+ return 0;
+
+ vsie_page = get_vsie_page(vcpu->kvm, scb_addr);
+ if (IS_ERR(vsie_page))
+ return PTR_ERR(vsie_page);
+ else if (!vsie_page)
+ /* double use of sie control block - simply do nothing */
+ return 0;
+
+ rc = pin_scb(vcpu, vsie_page, scb_addr);
+ if (rc)
+ goto out_put;
+ rc = shadow_scb(vcpu, vsie_page);
+ if (rc)
+ goto out_unpin_scb;
+ rc = pin_blocks(vcpu, vsie_page);
+ if (rc)
+ goto out_unshadow;
+ register_shadow_scb(vcpu, vsie_page);
+ rc = vsie_run(vcpu, vsie_page);
+ unregister_shadow_scb(vcpu);
+ unpin_blocks(vcpu, vsie_page);
+out_unshadow:
+ unshadow_scb(vcpu, vsie_page);
+out_unpin_scb:
+ unpin_scb(vcpu, vsie_page, scb_addr);
+out_put:
+ put_vsie_page(vcpu->kvm, vsie_page);
+
+ return rc < 0 ? rc : 0;
+}
+
+/* Init the vsie data structures. To be called when a vm is initialized. */
+void kvm_s390_vsie_init(struct kvm *kvm)
+{
+ mutex_init(&kvm->arch.vsie.mutex);
+ INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL);
+}
+
+/* Destroy the vsie data structures. To be called when a vm is destroyed. */
+void kvm_s390_vsie_destroy(struct kvm *kvm)
+{
+ struct vsie_page *vsie_page;
+ struct page *page;
+ int i;
+
+ mutex_lock(&kvm->arch.vsie.mutex);
+ for (i = 0; i < kvm->arch.vsie.page_count; i++) {
+ page = kvm->arch.vsie.pages[i];
+ kvm->arch.vsie.pages[i] = NULL;
+ vsie_page = page_to_virt(page);
+ release_gmap_shadow(vsie_page);
+ /* free the radix tree entry */
+ radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9);
+ __free_page(page);
+ }
+ kvm->arch.vsie.page_count = 0;
+ mutex_unlock(&kvm->arch.vsie.mutex);
+}
+
+void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu)
+{
+ struct kvm_s390_sie_block *scb = READ_ONCE(vcpu->arch.vsie_block);
+
+ /*
+ * Even if the VCPU lets go of the shadow sie block reference, it is
+ * still valid in the cache. So we can safely kick it.
+ */
+ if (scb) {
+ atomic_or(PROG_BLOCK_SIE, &scb->prog20);
+ if (scb->prog0c & PROG_IN_SIE)
+ atomic_or(CPUSTAT_STOP_INT, &scb->cpuflags);
+ }
+}
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 25783dc3c813..a58bca62a93b 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -418,6 +418,8 @@ static inline int do_exception(struct pt_regs *regs, int access)
(struct gmap *) S390_lowcore.gmap : NULL;
if (gmap) {
current->thread.gmap_addr = address;
+ current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE);
+ current->thread.gmap_int_code = regs->int_code & 0xffff;
address = __gmap_translate(gmap, address);
if (address == -EFAULT) {
fault = VM_FAULT_BADMAP;
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 063c721ec0dc..2ce6bb3bab32 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -20,14 +20,16 @@
#include <asm/gmap.h>
#include <asm/tlb.h>
+#define GMAP_SHADOW_FAKE_TABLE 1ULL
+
/**
- * gmap_alloc - allocate a guest address space
+ * gmap_alloc - allocate and initialize a guest address space
* @mm: pointer to the parent mm_struct
* @limit: maximum address of the gmap address space
*
* Returns a guest address space structure.
*/
-struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
+static struct gmap *gmap_alloc(unsigned long limit)
{
struct gmap *gmap;
struct page *page;
@@ -55,10 +57,14 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
if (!gmap)
goto out;
INIT_LIST_HEAD(&gmap->crst_list);
+ INIT_LIST_HEAD(&gmap->children);
+ INIT_LIST_HEAD(&gmap->pt_list);
INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
+ INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC);
spin_lock_init(&gmap->guest_table_lock);
- gmap->mm = mm;
+ spin_lock_init(&gmap->shadow_lock);
+ atomic_set(&gmap->ref_count, 1);
page = alloc_pages(GFP_KERNEL, 2);
if (!page)
goto out_free;
@@ -70,9 +76,6 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
gmap->asce = atype | _ASCE_TABLE_LENGTH |
_ASCE_USER_BITS | __pa(table);
gmap->asce_end = limit;
- down_write(&mm->mmap_sem);
- list_add(&gmap->list, &mm->context.gmap_list);
- up_write(&mm->mmap_sem);
return gmap;
out_free:
@@ -80,7 +83,28 @@ out_free:
out:
return NULL;
}
-EXPORT_SYMBOL_GPL(gmap_alloc);
+
+/**
+ * gmap_create - create a guest address space
+ * @mm: pointer to the parent mm_struct
+ * @limit: maximum size of the gmap address space
+ *
+ * Returns a guest address space structure.
+ */
+struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit)
+{
+ struct gmap *gmap;
+
+ gmap = gmap_alloc(limit);
+ if (!gmap)
+ return NULL;
+ gmap->mm = mm;
+ spin_lock(&mm->context.gmap_lock);
+ list_add_rcu(&gmap->list, &mm->context.gmap_list);
+ spin_unlock(&mm->context.gmap_lock);
+ return gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_create);
static void gmap_flush_tlb(struct gmap *gmap)
{
@@ -114,31 +138,117 @@ static void gmap_radix_tree_free(struct radix_tree_root *root)
} while (nr > 0);
}
+static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
+{
+ struct gmap_rmap *rmap, *rnext, *head;
+ struct radix_tree_iter iter;
+ unsigned long indices[16];
+ unsigned long index;
+ void **slot;
+ int i, nr;
+
+ /* A radix tree is freed by deleting all of its entries */
+ index = 0;
+ do {
+ nr = 0;
+ radix_tree_for_each_slot(slot, root, &iter, index) {
+ indices[nr] = iter.index;
+ if (++nr == 16)
+ break;
+ }
+ for (i = 0; i < nr; i++) {
+ index = indices[i];
+ head = radix_tree_delete(root, index);
+ gmap_for_each_rmap_safe(rmap, rnext, head)
+ kfree(rmap);
+ }
+ } while (nr > 0);
+}
+
/**
* gmap_free - free a guest address space
* @gmap: pointer to the guest address space structure
+ *
+ * No locks required. There are no references to this gmap anymore.
*/
-void gmap_free(struct gmap *gmap)
+static void gmap_free(struct gmap *gmap)
{
struct page *page, *next;
- /* Flush tlb. */
- if (MACHINE_HAS_IDTE)
- __tlb_flush_idte(gmap->asce);
- else
- __tlb_flush_global();
-
+ /* Flush tlb of all gmaps (if not already done for shadows) */
+ if (!(gmap_is_shadow(gmap) && gmap->removed))
+ gmap_flush_tlb(gmap);
/* Free all segment & region tables. */
list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
__free_pages(page, 2);
gmap_radix_tree_free(&gmap->guest_to_host);
gmap_radix_tree_free(&gmap->host_to_guest);
- down_write(&gmap->mm->mmap_sem);
- list_del(&gmap->list);
- up_write(&gmap->mm->mmap_sem);
+
+ /* Free additional data for a shadow gmap */
+ if (gmap_is_shadow(gmap)) {
+ /* Free all page tables. */
+ list_for_each_entry_safe(page, next, &gmap->pt_list, lru)
+ page_table_free_pgste(page);
+ gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
+ /* Release reference to the parent */
+ gmap_put(gmap->parent);
+ }
+
kfree(gmap);
}
-EXPORT_SYMBOL_GPL(gmap_free);
+
+/**
+ * gmap_get - increase reference counter for guest address space
+ * @gmap: pointer to the guest address space structure
+ *
+ * Returns the gmap pointer
+ */
+struct gmap *gmap_get(struct gmap *gmap)
+{
+ atomic_inc(&gmap->ref_count);
+ return gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_get);
+
+/**
+ * gmap_put - decrease reference counter for guest address space
+ * @gmap: pointer to the guest address space structure
+ *
+ * If the reference counter reaches zero the guest address space is freed.
+ */
+void gmap_put(struct gmap *gmap)
+{
+ if (atomic_dec_return(&gmap->ref_count) == 0)
+ gmap_free(gmap);
+}
+EXPORT_SYMBOL_GPL(gmap_put);
+
+/**
+ * gmap_remove - remove a guest address space but do not free it yet
+ * @gmap: pointer to the guest address space structure
+ */
+void gmap_remove(struct gmap *gmap)
+{
+ struct gmap *sg, *next;
+
+ /* Remove all shadow gmaps linked to this gmap */
+ if (!list_empty(&gmap->children)) {
+ spin_lock(&gmap->shadow_lock);
+ list_for_each_entry_safe(sg, next, &gmap->children, list) {
+ list_del(&sg->list);
+ gmap_put(sg);
+ }
+ spin_unlock(&gmap->shadow_lock);
+ }
+ /* Remove gmap from the pre-mm list */
+ spin_lock(&gmap->mm->context.gmap_lock);
+ list_del_rcu(&gmap->list);
+ spin_unlock(&gmap->mm->context.gmap_lock);
+ synchronize_rcu();
+ /* Put reference */
+ gmap_put(gmap);
+}
+EXPORT_SYMBOL_GPL(gmap_remove);
/**
* gmap_enable - switch primary space to the guest address space
@@ -160,6 +270,17 @@ void gmap_disable(struct gmap *gmap)
}
EXPORT_SYMBOL_GPL(gmap_disable);
+/**
+ * gmap_get_enabled - get a pointer to the currently enabled gmap
+ *
+ * Returns a pointer to the currently enabled gmap. 0 if none is enabled.
+ */
+struct gmap *gmap_get_enabled(void)
+{
+ return (struct gmap *) S390_lowcore.gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_get_enabled);
+
/*
* gmap_alloc_table is assumed to be called with mmap_sem held
*/
@@ -175,7 +296,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
return -ENOMEM;
new = (unsigned long *) page_to_phys(page);
crst_table_init(new, init);
- spin_lock(&gmap->mm->page_table_lock);
+ spin_lock(&gmap->guest_table_lock);
if (*table & _REGION_ENTRY_INVALID) {
list_add(&page->lru, &gmap->crst_list);
*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
@@ -183,7 +304,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
page->index = gaddr;
page = NULL;
}
- spin_unlock(&gmap->mm->page_table_lock);
+ spin_unlock(&gmap->guest_table_lock);
if (page)
__free_pages(page, 2);
return 0;
@@ -219,6 +340,7 @@ static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
unsigned long *entry;
int flush = 0;
+ BUG_ON(gmap_is_shadow(gmap));
spin_lock(&gmap->guest_table_lock);
entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
if (entry) {
@@ -258,6 +380,7 @@ int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
unsigned long off;
int flush;
+ BUG_ON(gmap_is_shadow(gmap));
if ((to | len) & (PMD_SIZE - 1))
return -EINVAL;
if (len == 0 || to + len < to)
@@ -289,6 +412,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from,
unsigned long off;
int flush;
+ BUG_ON(gmap_is_shadow(gmap));
if ((from | to | len) & (PMD_SIZE - 1))
return -EINVAL;
if (len == 0 || from + len < from || to + len < to ||
@@ -326,6 +450,8 @@ EXPORT_SYMBOL_GPL(gmap_map_segment);
* This function does not establish potentially missing page table entries.
* The mmap_sem of the mm that belongs to the address space must be held
* when this function gets called.
+ *
+ * Note: Can also be called for shadow gmaps.
*/
unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
@@ -333,6 +459,7 @@ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
vmaddr = (unsigned long)
radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
+ /* Note: guest_to_host is empty for a shadow gmap */
return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
}
EXPORT_SYMBOL_GPL(__gmap_translate);
@@ -369,11 +496,13 @@ void gmap_unlink(struct mm_struct *mm, unsigned long *table,
struct gmap *gmap;
int flush;
- list_for_each_entry(gmap, &mm->context.gmap_list, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
if (flush)
gmap_flush_tlb(gmap);
}
+ rcu_read_unlock();
}
/**
@@ -397,6 +526,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
pmd_t *pmd;
int rc;
+ BUG_ON(gmap_is_shadow(gmap));
/* Create higher level tables in the gmap page table */
table = gmap->table;
if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
@@ -552,116 +682,1412 @@ static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);
/**
- * gmap_register_ipte_notifier - register a pte invalidation callback
+ * gmap_register_pte_notifier - register a pte invalidation callback
* @nb: pointer to the gmap notifier block
*/
-void gmap_register_ipte_notifier(struct gmap_notifier *nb)
+void gmap_register_pte_notifier(struct gmap_notifier *nb)
{
spin_lock(&gmap_notifier_lock);
- list_add(&nb->list, &gmap_notifier_list);
+ list_add_rcu(&nb->list, &gmap_notifier_list);
spin_unlock(&gmap_notifier_lock);
}
-EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);
+EXPORT_SYMBOL_GPL(gmap_register_pte_notifier);
/**
- * gmap_unregister_ipte_notifier - remove a pte invalidation callback
+ * gmap_unregister_pte_notifier - remove a pte invalidation callback
* @nb: pointer to the gmap notifier block
*/
-void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
+void gmap_unregister_pte_notifier(struct gmap_notifier *nb)
{
spin_lock(&gmap_notifier_lock);
- list_del_init(&nb->list);
+ list_del_rcu(&nb->list);
spin_unlock(&gmap_notifier_lock);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier);
+
+/**
+ * gmap_call_notifier - call all registered invalidation callbacks
+ * @gmap: pointer to guest mapping meta data structure
+ * @start: start virtual address in the guest address space
+ * @end: end virtual address in the guest address space
+ */
+static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
+ unsigned long end)
+{
+ struct gmap_notifier *nb;
+
+ list_for_each_entry(nb, &gmap_notifier_list, list)
+ nb->notifier_call(gmap, start, end);
+}
+
+/**
+ * gmap_table_walk - walk the gmap page tables
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @level: page table level to stop at
+ *
+ * Returns a table entry pointer for the given guest address and @level
+ * @level=0 : returns a pointer to a page table table entry (or NULL)
+ * @level=1 : returns a pointer to a segment table entry (or NULL)
+ * @level=2 : returns a pointer to a region-3 table entry (or NULL)
+ * @level=3 : returns a pointer to a region-2 table entry (or NULL)
+ * @level=4 : returns a pointer to a region-1 table entry (or NULL)
+ *
+ * Returns NULL if the gmap page tables could not be walked to the
+ * requested level.
+ *
+ * Note: Can also be called for shadow gmaps.
+ */
+static inline unsigned long *gmap_table_walk(struct gmap *gmap,
+ unsigned long gaddr, int level)
+{
+ unsigned long *table;
+
+ if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4))
+ return NULL;
+ if (gmap_is_shadow(gmap) && gmap->removed)
+ return NULL;
+ if (gaddr & (-1UL << (31 + ((gmap->asce & _ASCE_TYPE_MASK) >> 2)*11)))
+ return NULL;
+ table = gmap->table;
+ switch (gmap->asce & _ASCE_TYPE_MASK) {
+ case _ASCE_TYPE_REGION1:
+ table += (gaddr >> 53) & 0x7ff;
+ if (level == 4)
+ break;
+ if (*table & _REGION_ENTRY_INVALID)
+ return NULL;
+ table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ /* Fallthrough */
+ case _ASCE_TYPE_REGION2:
+ table += (gaddr >> 42) & 0x7ff;
+ if (level == 3)
+ break;
+ if (*table & _REGION_ENTRY_INVALID)
+ return NULL;
+ table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ /* Fallthrough */
+ case _ASCE_TYPE_REGION3:
+ table += (gaddr >> 31) & 0x7ff;
+ if (level == 2)
+ break;
+ if (*table & _REGION_ENTRY_INVALID)
+ return NULL;
+ table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ /* Fallthrough */
+ case _ASCE_TYPE_SEGMENT:
+ table += (gaddr >> 20) & 0x7ff;
+ if (level == 1)
+ break;
+ if (*table & _REGION_ENTRY_INVALID)
+ return NULL;
+ table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
+ table += (gaddr >> 12) & 0xff;
+ }
+ return table;
+}
+
+/**
+ * gmap_pte_op_walk - walk the gmap page table, get the page table lock
+ * and return the pte pointer
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @ptl: pointer to the spinlock pointer
+ *
+ * Returns a pointer to the locked pte for a guest address, or NULL
+ *
+ * Note: Can also be called for shadow gmaps.
+ */
+static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
+ spinlock_t **ptl)
+{
+ unsigned long *table;
+
+ if (gmap_is_shadow(gmap))
+ spin_lock(&gmap->guest_table_lock);
+ /* Walk the gmap page table, lock and get pte pointer */
+ table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
+ if (!table || *table & _SEGMENT_ENTRY_INVALID) {
+ if (gmap_is_shadow(gmap))
+ spin_unlock(&gmap->guest_table_lock);
+ return NULL;
+ }
+ if (gmap_is_shadow(gmap)) {
+ *ptl = &gmap->guest_table_lock;
+ return pte_offset_map((pmd_t *) table, gaddr);
+ }
+ return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
+}
+
+/**
+ * gmap_pte_op_fixup - force a page in and connect the gmap page table
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @vmaddr: address in the host process address space
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ *
+ * Returns 0 if the caller can retry __gmap_translate (might fail again),
+ * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
+ * up or connecting the gmap page table.
+ */
+static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
+ unsigned long vmaddr, int prot)
+{
+ struct mm_struct *mm = gmap->mm;
+ unsigned int fault_flags;
+ bool unlocked = false;
+
+ BUG_ON(gmap_is_shadow(gmap));
+ fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
+ if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked))
+ return -EFAULT;
+ if (unlocked)
+ /* lost mmap_sem, caller has to retry __gmap_translate */
+ return 0;
+ /* Connect the page tables */
+ return __gmap_link(gmap, gaddr, vmaddr);
}
-EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);
/**
- * gmap_ipte_notify - mark a range of ptes for invalidation notification
+ * gmap_pte_op_end - release the page table lock
+ * @ptl: pointer to the spinlock pointer
+ */
+static void gmap_pte_op_end(spinlock_t *ptl)
+{
+ spin_unlock(ptl);
+}
+
+/*
+ * gmap_protect_range - remove access rights to memory and set pgste bits
* @gmap: pointer to guest mapping meta data structure
* @gaddr: virtual address in the guest address space
* @len: size of area
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ * @bits: pgste notification bits to set
*
- * Returns 0 if for each page in the given range a gmap mapping exists and
- * the invalidation notification could be set. If the gmap mapping is missing
- * for one or more pages -EFAULT is returned. If no memory could be allocated
- * -ENOMEM is returned. This function establishes missing page table entries.
+ * Returns 0 if successfully protected, -ENOMEM if out of memory and
+ * -EFAULT if gaddr is invalid (or mapping for shadows is missing).
+ *
+ * Called with sg->mm->mmap_sem in read.
+ *
+ * Note: Can also be called for shadow gmaps.
*/
-int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
+static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
+ unsigned long len, int prot, unsigned long bits)
{
- unsigned long addr;
+ unsigned long vmaddr;
spinlock_t *ptl;
pte_t *ptep;
- bool unlocked;
- int rc = 0;
+ int rc;
+
+ while (len) {
+ rc = -EAGAIN;
+ ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
+ if (ptep) {
+ rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits);
+ gmap_pte_op_end(ptl);
+ }
+ if (rc) {
+ vmaddr = __gmap_translate(gmap, gaddr);
+ if (IS_ERR_VALUE(vmaddr))
+ return vmaddr;
+ rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
+ if (rc)
+ return rc;
+ continue;
+ }
+ gaddr += PAGE_SIZE;
+ len -= PAGE_SIZE;
+ }
+ return 0;
+}
+
+/**
+ * gmap_mprotect_notify - change access rights for a range of ptes and
+ * call the notifier if any pte changes again
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @len: size of area
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ *
+ * Returns 0 if for each page in the given range a gmap mapping exists,
+ * the new access rights could be set and the notifier could be armed.
+ * If the gmap mapping is missing for one or more pages -EFAULT is
+ * returned. If no memory could be allocated -ENOMEM is returned.
+ * This function establishes missing page table entries.
+ */
+int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
+ unsigned long len, int prot)
+{
+ int rc;
- if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
+ if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap))
+ return -EINVAL;
+ if (!MACHINE_HAS_ESOP && prot == PROT_READ)
return -EINVAL;
down_read(&gmap->mm->mmap_sem);
- while (len) {
- unlocked = false;
- /* Convert gmap address and connect the page tables */
- addr = __gmap_translate(gmap, gaddr);
- if (IS_ERR_VALUE(addr)) {
- rc = addr;
+ rc = gmap_protect_range(gmap, gaddr, len, prot, PGSTE_IN_BIT);
+ up_read(&gmap->mm->mmap_sem);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
+
+/**
+ * gmap_read_table - get an unsigned long value from a guest page table using
+ * absolute addressing, without marking the page referenced.
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @val: pointer to the unsigned long value to return
+ *
+ * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
+ * if reading using the virtual address failed.
+ *
+ * Called with gmap->mm->mmap_sem in read.
+ */
+int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
+{
+ unsigned long address, vmaddr;
+ spinlock_t *ptl;
+ pte_t *ptep, pte;
+ int rc;
+
+ while (1) {
+ rc = -EAGAIN;
+ ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
+ if (ptep) {
+ pte = *ptep;
+ if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
+ address = pte_val(pte) & PAGE_MASK;
+ address += gaddr & ~PAGE_MASK;
+ *val = *(unsigned long *) address;
+ pte_val(*ptep) |= _PAGE_YOUNG;
+ /* Do *NOT* clear the _PAGE_INVALID bit! */
+ rc = 0;
+ }
+ gmap_pte_op_end(ptl);
+ }
+ if (!rc)
+ break;
+ vmaddr = __gmap_translate(gmap, gaddr);
+ if (IS_ERR_VALUE(vmaddr)) {
+ rc = vmaddr;
break;
}
- /* Get the page mapped */
- if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE,
- &unlocked)) {
- rc = -EFAULT;
+ rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
+ if (rc)
break;
+ }
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_read_table);
+
+/**
+ * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree
+ * @sg: pointer to the shadow guest address space structure
+ * @vmaddr: vm address associated with the rmap
+ * @rmap: pointer to the rmap structure
+ *
+ * Called with the sg->guest_table_lock
+ */
+static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
+ struct gmap_rmap *rmap)
+{
+ void **slot;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
+ if (slot) {
+ rmap->next = radix_tree_deref_slot_protected(slot,
+ &sg->guest_table_lock);
+ radix_tree_replace_slot(slot, rmap);
+ } else {
+ rmap->next = NULL;
+ radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
+ rmap);
+ }
+}
+
+/**
+ * gmap_protect_rmap - modify access rights to memory and create an rmap
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow gmap
+ * @paddr: address in the parent guest address space
+ * @len: length of the memory area to protect
+ * @prot: indicates access rights: none, read-only or read-write
+ *
+ * Returns 0 if successfully protected and the rmap was created, -ENOMEM
+ * if out of memory and -EFAULT if paddr is invalid.
+ */
+static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
+ unsigned long paddr, unsigned long len, int prot)
+{
+ struct gmap *parent;
+ struct gmap_rmap *rmap;
+ unsigned long vmaddr;
+ spinlock_t *ptl;
+ pte_t *ptep;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ parent = sg->parent;
+ while (len) {
+ vmaddr = __gmap_translate(parent, paddr);
+ if (IS_ERR_VALUE(vmaddr))
+ return vmaddr;
+ rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+ if (!rmap)
+ return -ENOMEM;
+ rmap->raddr = raddr;
+ rc = radix_tree_preload(GFP_KERNEL);
+ if (rc) {
+ kfree(rmap);
+ return rc;
+ }
+ rc = -EAGAIN;
+ ptep = gmap_pte_op_walk(parent, paddr, &ptl);
+ if (ptep) {
+ spin_lock(&sg->guest_table_lock);
+ rc = ptep_force_prot(parent->mm, paddr, ptep, prot,
+ PGSTE_VSIE_BIT);
+ if (!rc)
+ gmap_insert_rmap(sg, vmaddr, rmap);
+ spin_unlock(&sg->guest_table_lock);
+ gmap_pte_op_end(ptl);
}
- /* While trying to map mmap_sem got unlocked. Let us retry */
- if (unlocked)
+ radix_tree_preload_end();
+ if (rc) {
+ kfree(rmap);
+ rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
+ if (rc)
+ return rc;
continue;
- rc = __gmap_link(gmap, gaddr, addr);
+ }
+ paddr += PAGE_SIZE;
+ len -= PAGE_SIZE;
+ }
+ return 0;
+}
+
+#define _SHADOW_RMAP_MASK 0x7
+#define _SHADOW_RMAP_REGION1 0x5
+#define _SHADOW_RMAP_REGION2 0x4
+#define _SHADOW_RMAP_REGION3 0x3
+#define _SHADOW_RMAP_SEGMENT 0x2
+#define _SHADOW_RMAP_PGTABLE 0x1
+
+/**
+ * gmap_idte_one - invalidate a single region or segment table entry
+ * @asce: region or segment table *origin* + table-type bits
+ * @vaddr: virtual address to identify the table entry to flush
+ *
+ * The invalid bit of a single region or segment table entry is set
+ * and the associated TLB entries depending on the entry are flushed.
+ * The table-type of the @asce identifies the portion of the @vaddr
+ * that is used as the invalidation index.
+ */
+static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
+{
+ asm volatile(
+ " .insn rrf,0xb98e0000,%0,%1,0,0"
+ : : "a" (asce), "a" (vaddr) : "cc", "memory");
+}
+
+/**
+ * gmap_unshadow_page - remove a page from a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
+{
+ unsigned long *table;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
+ if (!table || *table & _PAGE_INVALID)
+ return;
+ gmap_call_notifier(sg, raddr, raddr + (1UL << 12) - 1);
+ ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
+}
+
+/**
+ * __gmap_unshadow_pgt - remove all entries from a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @pgt: pointer to the start of a shadow page table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
+ unsigned long *pgt)
+{
+ int i;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ for (i = 0; i < 256; i++, raddr += 1UL << 12)
+ pgt[i] = _PAGE_INVALID;
+}
+
+/**
+ * gmap_unshadow_pgt - remove a shadow page table from a segment entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
+{
+ unsigned long sto, *ste, *pgt;
+ struct page *page;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
+ if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
+ return;
+ gmap_call_notifier(sg, raddr, raddr + (1UL << 20) - 1);
+ sto = (unsigned long) (ste - ((raddr >> 20) & 0x7ff));
+ gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
+ pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN);
+ *ste = _SEGMENT_ENTRY_EMPTY;
+ __gmap_unshadow_pgt(sg, raddr, pgt);
+ /* Free page table */
+ page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ page_table_free_pgste(page);
+}
+
+/**
+ * __gmap_unshadow_sgt - remove all entries from a shadow segment table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @sgt: pointer to the start of a shadow segment table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
+ unsigned long *sgt)
+{
+ unsigned long asce, *pgt;
+ struct page *page;
+ int i;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ asce = (unsigned long) sgt | _ASCE_TYPE_SEGMENT;
+ for (i = 0; i < 2048; i++, raddr += 1UL << 20) {
+ if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
+ continue;
+ pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN);
+ sgt[i] = _SEGMENT_ENTRY_EMPTY;
+ __gmap_unshadow_pgt(sg, raddr, pgt);
+ /* Free page table */
+ page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ page_table_free_pgste(page);
+ }
+}
+
+/**
+ * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the shadow->guest_table_lock
+ */
+static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
+{
+ unsigned long r3o, *r3e, *sgt;
+ struct page *page;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
+ if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN))
+ return;
+ gmap_call_notifier(sg, raddr, raddr + (1UL << 31) - 1);
+ r3o = (unsigned long) (r3e - ((raddr >> 31) & 0x7ff));
+ gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr);
+ sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN);
+ *r3e = _REGION3_ENTRY_EMPTY;
+ __gmap_unshadow_sgt(sg, raddr, sgt);
+ /* Free segment table */
+ page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, 2);
+}
+
+/**
+ * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: address in the shadow guest address space
+ * @r3t: pointer to the start of a shadow region-3 table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
+ unsigned long *r3t)
+{
+ unsigned long asce, *sgt;
+ struct page *page;
+ int i;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ asce = (unsigned long) r3t | _ASCE_TYPE_REGION3;
+ for (i = 0; i < 2048; i++, raddr += 1UL << 31) {
+ if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
+ continue;
+ sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN);
+ r3t[i] = _REGION3_ENTRY_EMPTY;
+ __gmap_unshadow_sgt(sg, raddr, sgt);
+ /* Free segment table */
+ page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, 2);
+ }
+}
+
+/**
+ * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
+{
+ unsigned long r2o, *r2e, *r3t;
+ struct page *page;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
+ if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN))
+ return;
+ gmap_call_notifier(sg, raddr, raddr + (1UL << 42) - 1);
+ r2o = (unsigned long) (r2e - ((raddr >> 42) & 0x7ff));
+ gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr);
+ r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN);
+ *r2e = _REGION2_ENTRY_EMPTY;
+ __gmap_unshadow_r3t(sg, raddr, r3t);
+ /* Free region 3 table */
+ page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, 2);
+}
+
+/**
+ * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @r2t: pointer to the start of a shadow region-2 table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
+ unsigned long *r2t)
+{
+ unsigned long asce, *r3t;
+ struct page *page;
+ int i;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ asce = (unsigned long) r2t | _ASCE_TYPE_REGION2;
+ for (i = 0; i < 2048; i++, raddr += 1UL << 42) {
+ if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
+ continue;
+ r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN);
+ r2t[i] = _REGION2_ENTRY_EMPTY;
+ __gmap_unshadow_r3t(sg, raddr, r3t);
+ /* Free region 3 table */
+ page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, 2);
+ }
+}
+
+/**
+ * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
+{
+ unsigned long r1o, *r1e, *r2t;
+ struct page *page;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
+ if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN))
+ return;
+ gmap_call_notifier(sg, raddr, raddr + (1UL << 53) - 1);
+ r1o = (unsigned long) (r1e - ((raddr >> 53) & 0x7ff));
+ gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr);
+ r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN);
+ *r1e = _REGION1_ENTRY_EMPTY;
+ __gmap_unshadow_r2t(sg, raddr, r2t);
+ /* Free region 2 table */
+ page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, 2);
+}
+
+/**
+ * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @r1t: pointer to the start of a shadow region-1 table
+ *
+ * Called with the shadow->guest_table_lock
+ */
+static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
+ unsigned long *r1t)
+{
+ unsigned long asce, *r2t;
+ struct page *page;
+ int i;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ asce = (unsigned long) r1t | _ASCE_TYPE_REGION1;
+ for (i = 0; i < 2048; i++, raddr += 1UL << 53) {
+ if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
+ continue;
+ r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN);
+ __gmap_unshadow_r2t(sg, raddr, r2t);
+ /* Clear entry and flush translation r1t -> r2t */
+ gmap_idte_one(asce, raddr);
+ r1t[i] = _REGION1_ENTRY_EMPTY;
+ /* Free region 2 table */
+ page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, 2);
+ }
+}
+
+/**
+ * gmap_unshadow - remove a shadow page table completely
+ * @sg: pointer to the shadow guest address space structure
+ *
+ * Called with sg->guest_table_lock
+ */
+static void gmap_unshadow(struct gmap *sg)
+{
+ unsigned long *table;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ if (sg->removed)
+ return;
+ sg->removed = 1;
+ gmap_call_notifier(sg, 0, -1UL);
+ gmap_flush_tlb(sg);
+ table = (unsigned long *)(sg->asce & _ASCE_ORIGIN);
+ switch (sg->asce & _ASCE_TYPE_MASK) {
+ case _ASCE_TYPE_REGION1:
+ __gmap_unshadow_r1t(sg, 0, table);
+ break;
+ case _ASCE_TYPE_REGION2:
+ __gmap_unshadow_r2t(sg, 0, table);
+ break;
+ case _ASCE_TYPE_REGION3:
+ __gmap_unshadow_r3t(sg, 0, table);
+ break;
+ case _ASCE_TYPE_SEGMENT:
+ __gmap_unshadow_sgt(sg, 0, table);
+ break;
+ }
+}
+
+/**
+ * gmap_find_shadow - find a specific asce in the list of shadow tables
+ * @parent: pointer to the parent gmap
+ * @asce: ASCE for which the shadow table is created
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * Returns the pointer to a gmap if a shadow table with the given asce is
+ * already available, ERR_PTR(-EAGAIN) if another one is just being created,
+ * otherwise NULL
+ */
+static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce,
+ int edat_level)
+{
+ struct gmap *sg;
+
+ list_for_each_entry(sg, &parent->children, list) {
+ if (sg->orig_asce != asce || sg->edat_level != edat_level ||
+ sg->removed)
+ continue;
+ if (!sg->initialized)
+ return ERR_PTR(-EAGAIN);
+ atomic_inc(&sg->ref_count);
+ return sg;
+ }
+ return NULL;
+}
+
+/**
+ * gmap_shadow_valid - check if a shadow guest address space matches the
+ * given properties and is still valid
+ * @sg: pointer to the shadow guest address space structure
+ * @asce: ASCE for which the shadow table is requested
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * Returns 1 if the gmap shadow is still valid and matches the given
+ * properties, the caller can continue using it. Returns 0 otherwise, the
+ * caller has to request a new shadow gmap in this case.
+ *
+ */
+int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
+{
+ if (sg->removed)
+ return 0;
+ return sg->orig_asce == asce && sg->edat_level == edat_level;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_valid);
+
+/**
+ * gmap_shadow - create/find a shadow guest address space
+ * @parent: pointer to the parent gmap
+ * @asce: ASCE for which the shadow table is created
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * The pages of the top level page table referred by the asce parameter
+ * will be set to read-only and marked in the PGSTEs of the kvm process.
+ * The shadow table will be removed automatically on any change to the
+ * PTE mapping for the source table.
+ *
+ * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
+ * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
+ * parent gmap table could not be protected.
+ */
+struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
+ int edat_level)
+{
+ struct gmap *sg, *new;
+ unsigned long limit;
+ int rc;
+
+ BUG_ON(gmap_is_shadow(parent));
+ spin_lock(&parent->shadow_lock);
+ sg = gmap_find_shadow(parent, asce, edat_level);
+ spin_unlock(&parent->shadow_lock);
+ if (sg)
+ return sg;
+ /* Create a new shadow gmap */
+ limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
+ if (asce & _ASCE_REAL_SPACE)
+ limit = -1UL;
+ new = gmap_alloc(limit);
+ if (!new)
+ return ERR_PTR(-ENOMEM);
+ new->mm = parent->mm;
+ new->parent = gmap_get(parent);
+ new->orig_asce = asce;
+ new->edat_level = edat_level;
+ new->initialized = false;
+ spin_lock(&parent->shadow_lock);
+ /* Recheck if another CPU created the same shadow */
+ sg = gmap_find_shadow(parent, asce, edat_level);
+ if (sg) {
+ spin_unlock(&parent->shadow_lock);
+ gmap_free(new);
+ return sg;
+ }
+ if (asce & _ASCE_REAL_SPACE) {
+ /* only allow one real-space gmap shadow */
+ list_for_each_entry(sg, &parent->children, list) {
+ if (sg->orig_asce & _ASCE_REAL_SPACE) {
+ spin_lock(&sg->guest_table_lock);
+ gmap_unshadow(sg);
+ spin_unlock(&sg->guest_table_lock);
+ list_del(&sg->list);
+ gmap_put(sg);
+ break;
+ }
+ }
+ }
+ atomic_set(&new->ref_count, 2);
+ list_add(&new->list, &parent->children);
+ if (asce & _ASCE_REAL_SPACE) {
+ /* nothing to protect, return right away */
+ new->initialized = true;
+ spin_unlock(&parent->shadow_lock);
+ return new;
+ }
+ spin_unlock(&parent->shadow_lock);
+ /* protect after insertion, so it will get properly invalidated */
+ down_read(&parent->mm->mmap_sem);
+ rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
+ ((asce & _ASCE_TABLE_LENGTH) + 1) * 4096,
+ PROT_READ, PGSTE_VSIE_BIT);
+ up_read(&parent->mm->mmap_sem);
+ spin_lock(&parent->shadow_lock);
+ new->initialized = true;
+ if (rc) {
+ list_del(&new->list);
+ gmap_free(new);
+ new = ERR_PTR(rc);
+ }
+ spin_unlock(&parent->shadow_lock);
+ return new;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow);
+
+/**
+ * gmap_shadow_r2t - create an empty shadow region 2 table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @r2t: parent gmap address of the region 2 table to get shadowed
+ * @fake: r2t references contiguous guest memory block, not a r2t
+ *
+ * The r2t parameter specifies the address of the source table. The
+ * four pages of the source table are made read-only in the parent gmap
+ * address space. A write to the source table area @r2t will automatically
+ * remove the shadow r2 table and all of its decendents.
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
+ int fake)
+{
+ unsigned long raddr, origin, offset, len;
+ unsigned long *s_r2t, *table;
+ struct page *page;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ /* Allocate a shadow region second table */
+ page = alloc_pages(GFP_KERNEL, 2);
+ if (!page)
+ return -ENOMEM;
+ page->index = r2t & _REGION_ENTRY_ORIGIN;
+ if (fake)
+ page->index |= GMAP_SHADOW_FAKE_TABLE;
+ s_r2t = (unsigned long *) page_to_phys(page);
+ /* Install shadow region second table */
+ spin_lock(&sg->guest_table_lock);
+ table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
+ if (!table) {
+ rc = -EAGAIN; /* Race with unshadow */
+ goto out_free;
+ }
+ if (!(*table & _REGION_ENTRY_INVALID)) {
+ rc = 0; /* Already established */
+ goto out_free;
+ } else if (*table & _REGION_ENTRY_ORIGIN) {
+ rc = -EAGAIN; /* Race with shadow */
+ goto out_free;
+ }
+ crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY);
+ /* mark as invalid as long as the parent table is not protected */
+ *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH |
+ _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
+ if (sg->edat_level >= 1)
+ *table |= (r2t & _REGION_ENTRY_PROTECT);
+ list_add(&page->lru, &sg->crst_list);
+ if (fake) {
+ /* nothing to protect for fake tables */
+ *table &= ~_REGION_ENTRY_INVALID;
+ spin_unlock(&sg->guest_table_lock);
+ return 0;
+ }
+ spin_unlock(&sg->guest_table_lock);
+ /* Make r2t read-only in parent gmap page table */
+ raddr = (saddr & 0xffe0000000000000UL) | _SHADOW_RMAP_REGION1;
+ origin = r2t & _REGION_ENTRY_ORIGIN;
+ offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+ len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+ rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+ spin_lock(&sg->guest_table_lock);
+ if (!rc) {
+ table = gmap_table_walk(sg, saddr, 4);
+ if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+ (unsigned long) s_r2t)
+ rc = -EAGAIN; /* Race with unshadow */
+ else
+ *table &= ~_REGION_ENTRY_INVALID;
+ } else {
+ gmap_unshadow_r2t(sg, raddr);
+ }
+ spin_unlock(&sg->guest_table_lock);
+ return rc;
+out_free:
+ spin_unlock(&sg->guest_table_lock);
+ __free_pages(page, 2);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
+
+/**
+ * gmap_shadow_r3t - create a shadow region 3 table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @r3t: parent gmap address of the region 3 table to get shadowed
+ * @fake: r3t references contiguous guest memory block, not a r3t
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
+ int fake)
+{
+ unsigned long raddr, origin, offset, len;
+ unsigned long *s_r3t, *table;
+ struct page *page;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ /* Allocate a shadow region second table */
+ page = alloc_pages(GFP_KERNEL, 2);
+ if (!page)
+ return -ENOMEM;
+ page->index = r3t & _REGION_ENTRY_ORIGIN;
+ if (fake)
+ page->index |= GMAP_SHADOW_FAKE_TABLE;
+ s_r3t = (unsigned long *) page_to_phys(page);
+ /* Install shadow region second table */
+ spin_lock(&sg->guest_table_lock);
+ table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
+ if (!table) {
+ rc = -EAGAIN; /* Race with unshadow */
+ goto out_free;
+ }
+ if (!(*table & _REGION_ENTRY_INVALID)) {
+ rc = 0; /* Already established */
+ goto out_free;
+ } else if (*table & _REGION_ENTRY_ORIGIN) {
+ rc = -EAGAIN; /* Race with shadow */
+ }
+ crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY);
+ /* mark as invalid as long as the parent table is not protected */
+ *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH |
+ _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
+ if (sg->edat_level >= 1)
+ *table |= (r3t & _REGION_ENTRY_PROTECT);
+ list_add(&page->lru, &sg->crst_list);
+ if (fake) {
+ /* nothing to protect for fake tables */
+ *table &= ~_REGION_ENTRY_INVALID;
+ spin_unlock(&sg->guest_table_lock);
+ return 0;
+ }
+ spin_unlock(&sg->guest_table_lock);
+ /* Make r3t read-only in parent gmap page table */
+ raddr = (saddr & 0xfffffc0000000000UL) | _SHADOW_RMAP_REGION2;
+ origin = r3t & _REGION_ENTRY_ORIGIN;
+ offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+ len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+ rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+ spin_lock(&sg->guest_table_lock);
+ if (!rc) {
+ table = gmap_table_walk(sg, saddr, 3);
+ if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+ (unsigned long) s_r3t)
+ rc = -EAGAIN; /* Race with unshadow */
+ else
+ *table &= ~_REGION_ENTRY_INVALID;
+ } else {
+ gmap_unshadow_r3t(sg, raddr);
+ }
+ spin_unlock(&sg->guest_table_lock);
+ return rc;
+out_free:
+ spin_unlock(&sg->guest_table_lock);
+ __free_pages(page, 2);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
+
+/**
+ * gmap_shadow_sgt - create a shadow segment table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @sgt: parent gmap address of the segment table to get shadowed
+ * @fake: sgt references contiguous guest memory block, not a sgt
+ *
+ * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
+ int fake)
+{
+ unsigned long raddr, origin, offset, len;
+ unsigned long *s_sgt, *table;
+ struct page *page;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
+ /* Allocate a shadow segment table */
+ page = alloc_pages(GFP_KERNEL, 2);
+ if (!page)
+ return -ENOMEM;
+ page->index = sgt & _REGION_ENTRY_ORIGIN;
+ if (fake)
+ page->index |= GMAP_SHADOW_FAKE_TABLE;
+ s_sgt = (unsigned long *) page_to_phys(page);
+ /* Install shadow region second table */
+ spin_lock(&sg->guest_table_lock);
+ table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
+ if (!table) {
+ rc = -EAGAIN; /* Race with unshadow */
+ goto out_free;
+ }
+ if (!(*table & _REGION_ENTRY_INVALID)) {
+ rc = 0; /* Already established */
+ goto out_free;
+ } else if (*table & _REGION_ENTRY_ORIGIN) {
+ rc = -EAGAIN; /* Race with shadow */
+ goto out_free;
+ }
+ crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY);
+ /* mark as invalid as long as the parent table is not protected */
+ *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH |
+ _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
+ if (sg->edat_level >= 1)
+ *table |= sgt & _REGION_ENTRY_PROTECT;
+ list_add(&page->lru, &sg->crst_list);
+ if (fake) {
+ /* nothing to protect for fake tables */
+ *table &= ~_REGION_ENTRY_INVALID;
+ spin_unlock(&sg->guest_table_lock);
+ return 0;
+ }
+ spin_unlock(&sg->guest_table_lock);
+ /* Make sgt read-only in parent gmap page table */
+ raddr = (saddr & 0xffffffff80000000UL) | _SHADOW_RMAP_REGION3;
+ origin = sgt & _REGION_ENTRY_ORIGIN;
+ offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+ len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+ rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+ spin_lock(&sg->guest_table_lock);
+ if (!rc) {
+ table = gmap_table_walk(sg, saddr, 2);
+ if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+ (unsigned long) s_sgt)
+ rc = -EAGAIN; /* Race with unshadow */
+ else
+ *table &= ~_REGION_ENTRY_INVALID;
+ } else {
+ gmap_unshadow_sgt(sg, raddr);
+ }
+ spin_unlock(&sg->guest_table_lock);
+ return rc;
+out_free:
+ spin_unlock(&sg->guest_table_lock);
+ __free_pages(page, 2);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
+
+/**
+ * gmap_shadow_lookup_pgtable - find a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: the address in the shadow aguest address space
+ * @pgt: parent gmap address of the page table to get shadowed
+ * @dat_protection: if the pgtable is marked as protected by dat
+ * @fake: pgt references contiguous guest memory block, not a pgtable
+ *
+ * Returns 0 if the shadow page table was found and -EAGAIN if the page
+ * table was not found.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
+ unsigned long *pgt, int *dat_protection,
+ int *fake)
+{
+ unsigned long *table;
+ struct page *page;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ spin_lock(&sg->guest_table_lock);
+ table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
+ if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
+ /* Shadow page tables are full pages (pte+pgste) */
+ page = pfn_to_page(*table >> PAGE_SHIFT);
+ *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE;
+ *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
+ *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE);
+ rc = 0;
+ } else {
+ rc = -EAGAIN;
+ }
+ spin_unlock(&sg->guest_table_lock);
+ return rc;
+
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);
+
+/**
+ * gmap_shadow_pgt - instantiate a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @pgt: parent gmap address of the page table to get shadowed
+ * @fake: pgt references contiguous guest memory block, not a pgtable
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory,
+ * -EFAULT if an address in the parent gmap could not be resolved and
+ *
+ * Called with gmap->mm->mmap_sem in read
+ */
+int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
+ int fake)
+{
+ unsigned long raddr, origin;
+ unsigned long *s_pgt, *table;
+ struct page *page;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
+ /* Allocate a shadow page table */
+ page = page_table_alloc_pgste(sg->mm);
+ if (!page)
+ return -ENOMEM;
+ page->index = pgt & _SEGMENT_ENTRY_ORIGIN;
+ if (fake)
+ page->index |= GMAP_SHADOW_FAKE_TABLE;
+ s_pgt = (unsigned long *) page_to_phys(page);
+ /* Install shadow page table */
+ spin_lock(&sg->guest_table_lock);
+ table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
+ if (!table) {
+ rc = -EAGAIN; /* Race with unshadow */
+ goto out_free;
+ }
+ if (!(*table & _SEGMENT_ENTRY_INVALID)) {
+ rc = 0; /* Already established */
+ goto out_free;
+ } else if (*table & _SEGMENT_ENTRY_ORIGIN) {
+ rc = -EAGAIN; /* Race with shadow */
+ goto out_free;
+ }
+ /* mark as invalid as long as the parent table is not protected */
+ *table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
+ (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
+ list_add(&page->lru, &sg->pt_list);
+ if (fake) {
+ /* nothing to protect for fake tables */
+ *table &= ~_SEGMENT_ENTRY_INVALID;
+ spin_unlock(&sg->guest_table_lock);
+ return 0;
+ }
+ spin_unlock(&sg->guest_table_lock);
+ /* Make pgt read-only in parent gmap page table (not the pgste) */
+ raddr = (saddr & 0xfffffffffff00000UL) | _SHADOW_RMAP_SEGMENT;
+ origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
+ rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, PROT_READ);
+ spin_lock(&sg->guest_table_lock);
+ if (!rc) {
+ table = gmap_table_walk(sg, saddr, 1);
+ if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) !=
+ (unsigned long) s_pgt)
+ rc = -EAGAIN; /* Race with unshadow */
+ else
+ *table &= ~_SEGMENT_ENTRY_INVALID;
+ } else {
+ gmap_unshadow_pgt(sg, raddr);
+ }
+ spin_unlock(&sg->guest_table_lock);
+ return rc;
+out_free:
+ spin_unlock(&sg->guest_table_lock);
+ page_table_free_pgste(page);
+ return rc;
+
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
+
+/**
+ * gmap_shadow_page - create a shadow page mapping
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @pte: pte in parent gmap address space to get shadowed
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
+{
+ struct gmap *parent;
+ struct gmap_rmap *rmap;
+ unsigned long vmaddr, paddr;
+ spinlock_t *ptl;
+ pte_t *sptep, *tptep;
+ int prot;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ parent = sg->parent;
+ prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;
+
+ rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+ if (!rmap)
+ return -ENOMEM;
+ rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
+
+ while (1) {
+ paddr = pte_val(pte) & PAGE_MASK;
+ vmaddr = __gmap_translate(parent, paddr);
+ if (IS_ERR_VALUE(vmaddr)) {
+ rc = vmaddr;
+ break;
+ }
+ rc = radix_tree_preload(GFP_KERNEL);
if (rc)
break;
- /* Walk the process page table, lock and get pte pointer */
- ptep = get_locked_pte(gmap->mm, addr, &ptl);
- VM_BUG_ON(!ptep);
- /* Set notification bit in the pgste of the pte */
- if ((pte_val(*ptep) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
- ptep_set_notify(gmap->mm, addr, ptep);
- gaddr += PAGE_SIZE;
- len -= PAGE_SIZE;
+ rc = -EAGAIN;
+ sptep = gmap_pte_op_walk(parent, paddr, &ptl);
+ if (sptep) {
+ spin_lock(&sg->guest_table_lock);
+ /* Get page table pointer */
+ tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
+ if (!tptep) {
+ spin_unlock(&sg->guest_table_lock);
+ gmap_pte_op_end(ptl);
+ radix_tree_preload_end();
+ break;
+ }
+ rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte);
+ if (rc > 0) {
+ /* Success and a new mapping */
+ gmap_insert_rmap(sg, vmaddr, rmap);
+ rmap = NULL;
+ rc = 0;
+ }
+ gmap_pte_op_end(ptl);
+ spin_unlock(&sg->guest_table_lock);
}
- pte_unmap_unlock(ptep, ptl);
+ radix_tree_preload_end();
+ if (!rc)
+ break;
+ rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
+ if (rc)
+ break;
}
- up_read(&gmap->mm->mmap_sem);
+ kfree(rmap);
return rc;
}
-EXPORT_SYMBOL_GPL(gmap_ipte_notify);
+EXPORT_SYMBOL_GPL(gmap_shadow_page);
+
+/**
+ * gmap_shadow_notify - handle notifications for shadow gmap
+ *
+ * Called with sg->parent->shadow_lock.
+ */
+static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
+ unsigned long offset, pte_t *pte)
+{
+ struct gmap_rmap *rmap, *rnext, *head;
+ unsigned long gaddr, start, end, bits, raddr;
+ unsigned long *table;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ spin_lock(&sg->parent->guest_table_lock);
+ table = radix_tree_lookup(&sg->parent->host_to_guest,
+ vmaddr >> PMD_SHIFT);
+ gaddr = table ? __gmap_segment_gaddr(table) + offset : 0;
+ spin_unlock(&sg->parent->guest_table_lock);
+ if (!table)
+ return;
+
+ spin_lock(&sg->guest_table_lock);
+ if (sg->removed) {
+ spin_unlock(&sg->guest_table_lock);
+ return;
+ }
+ /* Check for top level table */
+ start = sg->orig_asce & _ASCE_ORIGIN;
+ end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * 4096;
+ if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start &&
+ gaddr < end) {
+ /* The complete shadow table has to go */
+ gmap_unshadow(sg);
+ spin_unlock(&sg->guest_table_lock);
+ list_del(&sg->list);
+ gmap_put(sg);
+ return;
+ }
+ /* Remove the page table tree from on specific entry */
+ head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> 12);
+ gmap_for_each_rmap_safe(rmap, rnext, head) {
+ bits = rmap->raddr & _SHADOW_RMAP_MASK;
+ raddr = rmap->raddr ^ bits;
+ switch (bits) {
+ case _SHADOW_RMAP_REGION1:
+ gmap_unshadow_r2t(sg, raddr);
+ break;
+ case _SHADOW_RMAP_REGION2:
+ gmap_unshadow_r3t(sg, raddr);
+ break;
+ case _SHADOW_RMAP_REGION3:
+ gmap_unshadow_sgt(sg, raddr);
+ break;
+ case _SHADOW_RMAP_SEGMENT:
+ gmap_unshadow_pgt(sg, raddr);
+ break;
+ case _SHADOW_RMAP_PGTABLE:
+ gmap_unshadow_page(sg, raddr);
+ break;
+ }
+ kfree(rmap);
+ }
+ spin_unlock(&sg->guest_table_lock);
+}
/**
* ptep_notify - call all invalidation callbacks for a specific pte.
* @mm: pointer to the process mm_struct
* @addr: virtual address in the process address space
* @pte: pointer to the page table entry
+ * @bits: bits from the pgste that caused the notify call
*
* This function is assumed to be called with the page table lock held
* for the pte to notify.
*/
-void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
+void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
+ pte_t *pte, unsigned long bits)
{
unsigned long offset, gaddr;
unsigned long *table;
- struct gmap_notifier *nb;
- struct gmap *gmap;
+ struct gmap *gmap, *sg, *next;
offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
offset = offset * (4096 / sizeof(pte_t));
- spin_lock(&gmap_notifier_lock);
- list_for_each_entry(gmap, &mm->context.gmap_list, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
+ if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
+ spin_lock(&gmap->shadow_lock);
+ list_for_each_entry_safe(sg, next,
+ &gmap->children, list)
+ gmap_shadow_notify(sg, vmaddr, offset, pte);
+ spin_unlock(&gmap->shadow_lock);
+ }
+ if (!(bits & PGSTE_IN_BIT))
+ continue;
+ spin_lock(&gmap->guest_table_lock);
table = radix_tree_lookup(&gmap->host_to_guest,
vmaddr >> PMD_SHIFT);
- if (!table)
- continue;
- gaddr = __gmap_segment_gaddr(table) + offset;
- list_for_each_entry(nb, &gmap_notifier_list, list)
- nb->notifier_call(gmap, gaddr);
+ if (table)
+ gaddr = __gmap_segment_gaddr(table) + offset;
+ spin_unlock(&gmap->guest_table_lock);
+ if (table)
+ gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
}
- spin_unlock(&gmap_notifier_lock);
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(ptep_notify);
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index e2565d2d0c32..995f78532cc2 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -137,6 +137,29 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
return new;
}
+#ifdef CONFIG_PGSTE
+
+struct page *page_table_alloc_pgste(struct mm_struct *mm)
+{
+ struct page *page;
+ unsigned long *table;
+
+ page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
+ if (page) {
+ table = (unsigned long *) page_to_phys(page);
+ clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
+ clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
+ }
+ return page;
+}
+
+void page_table_free_pgste(struct page *page)
+{
+ __free_page(page);
+}
+
+#endif /* CONFIG_PGSTE */
+
/*
* page table entry allocation/free routines.
*/
@@ -149,7 +172,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
/* Try to get a fragment of a 4K page as a 2K page table */
if (!mm_alloc_pgste(mm)) {
table = NULL;
- spin_lock_bh(&mm->context.list_lock);
+ spin_lock_bh(&mm->context.pgtable_lock);
if (!list_empty(&mm->context.pgtable_list)) {
page = list_first_entry(&mm->context.pgtable_list,
struct page, lru);
@@ -164,7 +187,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
list_del(&page->lru);
}
}
- spin_unlock_bh(&mm->context.list_lock);
+ spin_unlock_bh(&mm->context.pgtable_lock);
if (table)
return table;
}
@@ -187,9 +210,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
/* Return the first 2K fragment of the page */
atomic_set(&page->_mapcount, 1);
clear_table(table, _PAGE_INVALID, PAGE_SIZE);
- spin_lock_bh(&mm->context.list_lock);
+ spin_lock_bh(&mm->context.pgtable_lock);
list_add(&page->lru, &mm->context.pgtable_list);
- spin_unlock_bh(&mm->context.list_lock);
+ spin_unlock_bh(&mm->context.pgtable_lock);
}
return table;
}
@@ -203,13 +226,13 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
if (!mm_alloc_pgste(mm)) {
/* Free 2K page table fragment of a 4K page */
bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
- spin_lock_bh(&mm->context.list_lock);
+ spin_lock_bh(&mm->context.pgtable_lock);
mask = atomic_xor_bits(&page->_mapcount, 1U << bit);
if (mask & 3)
list_add(&page->lru, &mm->context.pgtable_list);
else
list_del(&page->lru);
- spin_unlock_bh(&mm->context.list_lock);
+ spin_unlock_bh(&mm->context.pgtable_lock);
if (mask != 0)
return;
}
@@ -235,13 +258,13 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
return;
}
bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
- spin_lock_bh(&mm->context.list_lock);
+ spin_lock_bh(&mm->context.pgtable_lock);
mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit);
if (mask & 3)
list_add_tail(&page->lru, &mm->context.pgtable_list);
else
list_del(&page->lru);
- spin_unlock_bh(&mm->context.list_lock);
+ spin_unlock_bh(&mm->context.pgtable_lock);
table = (unsigned long *) (__pa(table) | (1U << bit));
tlb_remove_table(tlb, table);
}
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index b98d1a152d46..5f092015aaa7 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -174,14 +174,17 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
return pgste;
}
-static inline pgste_t pgste_ipte_notify(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep, pgste_t pgste)
+static inline pgste_t pgste_pte_notify(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
- if (pgste_val(pgste) & PGSTE_IN_BIT) {
- pgste_val(pgste) &= ~PGSTE_IN_BIT;
- ptep_notify(mm, addr, ptep);
+ unsigned long bits;
+
+ bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
+ if (bits) {
+ pgste_val(pgste) ^= bits;
+ ptep_notify(mm, addr, ptep, bits);
}
#endif
return pgste;
@@ -194,7 +197,7 @@ static inline pgste_t ptep_xchg_start(struct mm_struct *mm,
if (mm_has_pgste(mm)) {
pgste = pgste_get_lock(ptep);
- pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
+ pgste = pgste_pte_notify(mm, addr, ptep, pgste);
}
return pgste;
}
@@ -459,6 +462,90 @@ void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
preempt_enable();
}
+/**
+ * ptep_force_prot - change access rights of a locked pte
+ * @mm: pointer to the process mm_struct
+ * @addr: virtual address in the guest address space
+ * @ptep: pointer to the page table entry
+ * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ * @bit: pgste bit to set (e.g. for notification)
+ *
+ * Returns 0 if the access rights were changed and -EAGAIN if the current
+ * and requested access rights are incompatible.
+ */
+int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, int prot, unsigned long bit)
+{
+ pte_t entry;
+ pgste_t pgste;
+ int pte_i, pte_p;
+
+ pgste = pgste_get_lock(ptep);
+ entry = *ptep;
+ /* Check pte entry after all locks have been acquired */
+ pte_i = pte_val(entry) & _PAGE_INVALID;
+ pte_p = pte_val(entry) & _PAGE_PROTECT;
+ if ((pte_i && (prot != PROT_NONE)) ||
+ (pte_p && (prot & PROT_WRITE))) {
+ pgste_set_unlock(ptep, pgste);
+ return -EAGAIN;
+ }
+ /* Change access rights and set pgste bit */
+ if (prot == PROT_NONE && !pte_i) {
+ ptep_flush_direct(mm, addr, ptep);
+ pgste = pgste_update_all(entry, pgste, mm);
+ pte_val(entry) |= _PAGE_INVALID;
+ }
+ if (prot == PROT_READ && !pte_p) {
+ ptep_flush_direct(mm, addr, ptep);
+ pte_val(entry) &= ~_PAGE_INVALID;
+ pte_val(entry) |= _PAGE_PROTECT;
+ }
+ pgste_val(pgste) |= bit;
+ pgste = pgste_set_pte(ptep, pgste, entry);
+ pgste_set_unlock(ptep, pgste);
+ return 0;
+}
+
+int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
+ pte_t *sptep, pte_t *tptep, pte_t pte)
+{
+ pgste_t spgste, tpgste;
+ pte_t spte, tpte;
+ int rc = -EAGAIN;
+
+ if (!(pte_val(*tptep) & _PAGE_INVALID))
+ return 0; /* already shadowed */
+ spgste = pgste_get_lock(sptep);
+ spte = *sptep;
+ if (!(pte_val(spte) & _PAGE_INVALID) &&
+ !((pte_val(spte) & _PAGE_PROTECT) &&
+ !(pte_val(pte) & _PAGE_PROTECT))) {
+ pgste_val(spgste) |= PGSTE_VSIE_BIT;
+ tpgste = pgste_get_lock(tptep);
+ pte_val(tpte) = (pte_val(spte) & PAGE_MASK) |
+ (pte_val(pte) & _PAGE_PROTECT);
+ /* don't touch the storage key - it belongs to parent pgste */
+ tpgste = pgste_set_pte(tptep, tpgste, tpte);
+ pgste_set_unlock(tptep, tpgste);
+ rc = 1;
+ }
+ pgste_set_unlock(sptep, spgste);
+ return rc;
+}
+
+void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep)
+{
+ pgste_t pgste;
+
+ pgste = pgste_get_lock(ptep);
+ /* notifier is called by the caller */
+ ptep_flush_direct(mm, saddr, ptep);
+ /* don't touch the storage key - it belongs to parent pgste */
+ pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID));
+ pgste_set_unlock(ptep, pgste);
+}
+
static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
{
if (!non_swap_entry(entry))
@@ -532,7 +619,7 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr)
pgste_val(pgste) &= ~PGSTE_UC_BIT;
pte = *ptep;
if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
- pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
+ pgste = pgste_pte_notify(mm, addr, ptep, pgste);
__ptep_ipte(addr, ptep);
if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
pte_val(pte) |= _PAGE_PROTECT;
@@ -555,12 +642,9 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
pgste_t old, new;
pte_t *ptep;
- down_read(&mm->mmap_sem);
ptep = get_locked_pte(mm, addr, &ptl);
- if (unlikely(!ptep)) {
- up_read(&mm->mmap_sem);
+ if (unlikely(!ptep))
return -EFAULT;
- }
new = old = pgste_get_lock(ptep);
pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
@@ -587,45 +671,100 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
pgste_set_unlock(ptep, new);
pte_unmap_unlock(ptep, ptl);
- up_read(&mm->mmap_sem);
return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);
-unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
+/**
+ * Conditionally set a guest storage key (handling csske).
+ * oldkey will be updated when either mr or mc is set and a pointer is given.
+ *
+ * Returns 0 if a guests storage key update wasn't necessary, 1 if the guest
+ * storage key was updated and -EFAULT on access errors.
+ */
+int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+ unsigned char key, unsigned char *oldkey,
+ bool nq, bool mr, bool mc)
+{
+ unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT;
+ int rc;
+
+ /* we can drop the pgste lock between getting and setting the key */
+ if (mr | mc) {
+ rc = get_guest_storage_key(current->mm, addr, &tmp);
+ if (rc)
+ return rc;
+ if (oldkey)
+ *oldkey = tmp;
+ if (!mr)
+ mask |= _PAGE_REFERENCED;
+ if (!mc)
+ mask |= _PAGE_CHANGED;
+ if (!((tmp ^ key) & mask))
+ return 0;
+ }
+ rc = set_guest_storage_key(current->mm, addr, key, nq);
+ return rc < 0 ? rc : 1;
+}
+EXPORT_SYMBOL(cond_set_guest_storage_key);
+
+/**
+ * Reset a guest reference bit (rrbe), returning the reference and changed bit.
+ *
+ * Returns < 0 in case of error, otherwise the cc to be reported to the guest.
+ */
+int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
{
- unsigned char key;
spinlock_t *ptl;
- pgste_t pgste;
+ pgste_t old, new;
pte_t *ptep;
+ int cc = 0;
- down_read(&mm->mmap_sem);
ptep = get_locked_pte(mm, addr, &ptl);
- if (unlikely(!ptep)) {
- up_read(&mm->mmap_sem);
+ if (unlikely(!ptep))
return -EFAULT;
- }
- pgste = pgste_get_lock(ptep);
- if (pte_val(*ptep) & _PAGE_INVALID) {
- key = (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56;
- key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56;
- key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48;
- key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48;
- } else {
- key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK);
+ new = old = pgste_get_lock(ptep);
+ /* Reset guest reference bit only */
+ pgste_val(new) &= ~PGSTE_GR_BIT;
- /* Reflect guest's logical view, not physical */
- if (pgste_val(pgste) & PGSTE_GR_BIT)
- key |= _PAGE_REFERENCED;
- if (pgste_val(pgste) & PGSTE_GC_BIT)
- key |= _PAGE_CHANGED;
+ if (!(pte_val(*ptep) & _PAGE_INVALID)) {
+ cc = page_reset_referenced(pte_val(*ptep) & PAGE_MASK);
+ /* Merge real referenced bit into host-set */
+ pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT;
}
+ /* Reflect guest's logical view, not physical */
+ cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49;
+ /* Changing the guest storage key is considered a change of the page */
+ if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT)
+ pgste_val(new) |= PGSTE_UC_BIT;
+
+ pgste_set_unlock(ptep, new);
+ pte_unmap_unlock(ptep, ptl);
+ return 0;
+}
+EXPORT_SYMBOL(reset_guest_reference_bit);
+
+int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+ unsigned char *key)
+{
+ spinlock_t *ptl;
+ pgste_t pgste;
+ pte_t *ptep;
+ ptep = get_locked_pte(mm, addr, &ptl);
+ if (unlikely(!ptep))
+ return -EFAULT;
+
+ pgste = pgste_get_lock(ptep);
+ *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
+ if (!(pte_val(*ptep) & _PAGE_INVALID))
+ *key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK);
+ /* Reflect guest's logical view, not physical */
+ *key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
pgste_set_unlock(ptep, pgste);
pte_unmap_unlock(ptep, ptl);
- up_read(&mm->mmap_sem);
- return key;
+ return 0;
}
EXPORT_SYMBOL(get_guest_storage_key);
#endif
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 69e62862b622..33ae3a4d0159 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -35,8 +35,9 @@
#include <asm/asm.h>
#include <asm/kvm_page_track.h>
-#define KVM_MAX_VCPUS 255
-#define KVM_SOFT_MAX_VCPUS 160
+#define KVM_MAX_VCPUS 288
+#define KVM_SOFT_MAX_VCPUS 240
+#define KVM_MAX_VCPU_ID 1023
#define KVM_USER_MEM_SLOTS 509
/* memory slots that are not exposed to userspace */
#define KVM_PRIVATE_MEM_SLOTS 3
@@ -599,6 +600,7 @@ struct kvm_vcpu_arch {
u64 mcg_cap;
u64 mcg_status;
u64 mcg_ctl;
+ u64 mcg_ext_ctl;
u64 *mce_banks;
/* Cache MMIO info */
@@ -682,9 +684,12 @@ struct kvm_arch_memory_slot {
struct kvm_apic_map {
struct rcu_head rcu;
u8 mode;
- struct kvm_lapic *phys_map[256];
- /* first index is cluster id second is cpu id in a cluster */
- struct kvm_lapic *logical_map[16][16];
+ u32 max_apic_id;
+ union {
+ struct kvm_lapic *xapic_flat_map[8];
+ struct kvm_lapic *xapic_cluster_map[16][4];
+ };
+ struct kvm_lapic *phys_map[];
};
/* Hyper-V emulation context */
@@ -779,6 +784,9 @@ struct kvm_arch {
u32 ldr_mode;
struct page *avic_logical_id_table_page;
struct page *avic_physical_id_table_page;
+
+ bool x2apic_format;
+ bool x2apic_broadcast_quirk_disabled;
};
struct kvm_vm_stat {
@@ -1006,6 +1014,11 @@ struct kvm_x86_ops {
int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
+
+ int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc);
+ void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
+
+ void (*setup_mce)(struct kvm_vcpu *vcpu);
};
struct kvm_arch_async_pf {
@@ -1026,7 +1039,7 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu);
void kvm_mmu_init_vm(struct kvm *kvm);
void kvm_mmu_uninit_vm(struct kvm *kvm);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask);
+ u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask);
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
@@ -1077,6 +1090,10 @@ extern u32 kvm_max_guest_tsc_khz;
extern u8 kvm_tsc_scaling_ratio_frac_bits;
/* maximum allowed value of TSC scaling ratio */
extern u64 kvm_max_tsc_scaling_ratio;
+/* 1ull << kvm_tsc_scaling_ratio_frac_bits */
+extern u64 kvm_default_tsc_scaling_ratio;
+
+extern u64 kvm_mce_cap_supported;
enum emulation_result {
EMULATE_DONE, /* no further processing */
@@ -1352,7 +1369,7 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu);
-void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
struct kvm_lapic_irq *irq);
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index d0fe23ec7e98..14824fc78f7e 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -193,7 +193,6 @@ struct __attribute__ ((__packed__)) vmcb {
struct vmcb_save_area save;
};
-#define SVM_CPUID_FEATURE_SHIFT 2
#define SVM_CPUID_FUNC 0x8000000a
#define SVM_VM_CR_SVM_DISABLE 4
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index cce9ee68e335..0116b2ee9e64 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -83,23 +83,19 @@ static inline void cpu_emergency_vmxoff(void)
*/
static inline int cpu_has_svm(const char **msg)
{
- uint32_t eax, ebx, ecx, edx;
-
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
if (msg)
*msg = "not amd";
return 0;
}
- cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
- if (eax < SVM_CPUID_FUNC) {
+ if (boot_cpu_data.extended_cpuid_level < SVM_CPUID_FUNC) {
if (msg)
*msg = "can't execute cpuid_8000000a";
return 0;
}
- cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
- if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
+ if (!boot_cpu_has(X86_FEATURE_SVM)) {
if (msg)
*msg = "svm not available";
return 0;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 639a6e34500c..ab8e32f7b9a8 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -32,7 +32,6 @@ config KVM
select HAVE_KVM_IRQ_BYPASS
select HAVE_KVM_IRQ_ROUTING
select HAVE_KVM_EVENTFD
- select KVM_APIC_ARCHITECTURE
select KVM_ASYNC_PF
select USER_RETURN_NOTIFIER
select KVM_MMIO
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index a4bf5b45d65a..5fb6c620180e 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -645,7 +645,6 @@ static const struct kvm_io_device_ops speaker_dev_ops = {
.write = speaker_ioport_write,
};
-/* Caller must hold slots_lock */
struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
{
struct kvm_pit *pit;
@@ -690,6 +689,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
kvm_pit_set_reinject(pit, true);
+ mutex_lock(&kvm->slots_lock);
kvm_iodevice_init(&pit->dev, &pit_dev_ops);
ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS,
KVM_PIT_MEM_LENGTH, &pit->dev);
@@ -704,12 +704,14 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
if (ret < 0)
goto fail_register_speaker;
}
+ mutex_unlock(&kvm->slots_lock);
return pit;
fail_register_speaker:
kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
fail_register_pit:
+ mutex_unlock(&kvm->slots_lock);
kvm_pit_set_reinject(pit, false);
kthread_stop(pit->worker_task);
fail_kthread:
diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c
index 95e0e6481f07..b181426f67b4 100644
--- a/arch/x86/kvm/iommu.c
+++ b/arch/x86/kvm/iommu.c
@@ -28,9 +28,7 @@
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/stat.h>
-#include <linux/dmar.h>
#include <linux/iommu.h>
-#include <linux/intel-iommu.h>
#include "assigned-dev.h"
static bool allow_unsafe_assigned_interrupts;
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index dfb4c6476877..25810b144b58 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -110,13 +110,17 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
return r;
}
-void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
struct kvm_lapic_irq *irq)
{
- trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
+ trace_kvm_msi_set_irq(e->msi.address_lo | (kvm->arch.x2apic_format ?
+ (u64)e->msi.address_hi << 32 : 0),
+ e->msi.data);
irq->dest_id = (e->msi.address_lo &
MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
+ if (kvm->arch.x2apic_format)
+ irq->dest_id |= MSI_ADDR_EXT_DEST_ID(e->msi.address_hi);
irq->vector = (e->msi.data &
MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
@@ -129,15 +133,24 @@ void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
}
EXPORT_SYMBOL_GPL(kvm_set_msi_irq);
+static inline bool kvm_msi_route_invalid(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e)
+{
+ return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff);
+}
+
int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level, bool line_status)
{
struct kvm_lapic_irq irq;
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+
if (!level)
return -1;
- kvm_set_msi_irq(e, &irq);
+ kvm_set_msi_irq(kvm, e, &irq);
return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
}
@@ -153,7 +166,10 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
if (unlikely(e->type != KVM_IRQ_ROUTING_MSI))
return -EWOULDBLOCK;
- kvm_set_msi_irq(e, &irq);
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+
+ kvm_set_msi_irq(kvm, e, &irq);
if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
return r;
@@ -248,7 +264,8 @@ static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e,
return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint);
}
-int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+int kvm_set_routing_entry(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
{
int r = -EINVAL;
@@ -285,6 +302,9 @@ int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
e->msi.address_lo = ue->u.msi.address_lo;
e->msi.address_hi = ue->u.msi.address_hi;
e->msi.data = ue->u.msi.data;
+
+ if (kvm_msi_route_invalid(kvm, e))
+ goto out;
break;
case KVM_IRQ_ROUTING_HV_SINT:
e->set = kvm_hv_set_sint;
@@ -388,21 +408,16 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
kvm->arch.nr_reserved_ioapic_pins);
for (i = 0; i < nr_ioapic_pins; ++i) {
hlist_for_each_entry(entry, &table->map[i], link) {
- u32 dest_id, dest_mode;
- bool level;
+ struct kvm_lapic_irq irq;
if (entry->type != KVM_IRQ_ROUTING_MSI)
continue;
- dest_id = (entry->msi.address_lo >> 12) & 0xff;
- dest_mode = (entry->msi.address_lo >> 2) & 0x1;
- level = entry->msi.data & MSI_DATA_TRIGGER_LEVEL;
- if (level && kvm_apic_match_dest(vcpu, NULL, 0,
- dest_id, dest_mode)) {
- u32 vector = entry->msi.data & 0xff;
-
- __set_bit(vector,
- ioapic_handled_vectors);
- }
+
+ kvm_set_msi_irq(vcpu->kvm, entry, &irq);
+
+ if (irq.level && kvm_apic_match_dest(vcpu, NULL, 0,
+ irq.dest_id, irq.dest_mode))
+ __set_bit(irq.vector, ioapic_handled_vectors);
}
}
srcu_read_unlock(&kvm->irq_srcu, idx);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 57549ed47ca5..730cf174090a 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -115,26 +115,43 @@ static inline int apic_enabled(struct kvm_lapic *apic)
(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
-/* The logical map is definitely wrong if we have multiple
- * modes at the same time. (Physical map is always right.)
- */
-static inline bool kvm_apic_logical_map_valid(struct kvm_apic_map *map)
-{
- return !(map->mode & (map->mode - 1));
+static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
+ u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) {
+ switch (map->mode) {
+ case KVM_APIC_MODE_X2APIC: {
+ u32 offset = (dest_id >> 16) * 16;
+ u32 max_apic_id = map->max_apic_id;
+
+ if (offset <= max_apic_id) {
+ u8 cluster_size = min(max_apic_id - offset + 1, 16U);
+
+ *cluster = &map->phys_map[offset];
+ *mask = dest_id & (0xffff >> (16 - cluster_size));
+ } else {
+ *mask = 0;
+ }
+
+ return true;
+ }
+ case KVM_APIC_MODE_XAPIC_FLAT:
+ *cluster = map->xapic_flat_map;
+ *mask = dest_id & 0xff;
+ return true;
+ case KVM_APIC_MODE_XAPIC_CLUSTER:
+ *cluster = map->xapic_cluster_map[dest_id >> 4];
+ *mask = dest_id & 0xf;
+ return true;
+ default:
+ /* Not optimized. */
+ return false;
+ }
}
-static inline void
-apic_logical_id(struct kvm_apic_map *map, u32 dest_id, u16 *cid, u16 *lid)
+static void kvm_apic_map_free(struct rcu_head *rcu)
{
- unsigned lid_bits;
+ struct kvm_apic_map *map = container_of(rcu, struct kvm_apic_map, rcu);
- BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_CLUSTER != 4);
- BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_FLAT != 8);
- BUILD_BUG_ON(KVM_APIC_MODE_X2APIC != 16);
- lid_bits = map->mode;
-
- *cid = dest_id >> lid_bits;
- *lid = dest_id & ((1 << lid_bits) - 1);
+ kvfree(map);
}
static void recalculate_apic_map(struct kvm *kvm)
@@ -142,17 +159,26 @@ static void recalculate_apic_map(struct kvm *kvm)
struct kvm_apic_map *new, *old = NULL;
struct kvm_vcpu *vcpu;
int i;
-
- new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL);
+ u32 max_id = 255;
mutex_lock(&kvm->arch.apic_map_lock);
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ if (kvm_apic_present(vcpu))
+ max_id = max(max_id, kvm_apic_id(vcpu->arch.apic));
+
+ new = kvm_kvzalloc(sizeof(struct kvm_apic_map) +
+ sizeof(struct kvm_lapic *) * ((u64)max_id + 1));
+
if (!new)
goto out;
+ new->max_apic_id = max_id;
+
kvm_for_each_vcpu(i, vcpu, kvm) {
struct kvm_lapic *apic = vcpu->arch.apic;
- u16 cid, lid;
+ struct kvm_lapic **cluster;
+ u16 mask;
u32 ldr, aid;
if (!kvm_apic_present(vcpu))
@@ -161,7 +187,7 @@ static void recalculate_apic_map(struct kvm *kvm)
aid = kvm_apic_id(apic);
ldr = kvm_lapic_get_reg(apic, APIC_LDR);
- if (aid < ARRAY_SIZE(new->phys_map))
+ if (aid <= new->max_apic_id)
new->phys_map[aid] = apic;
if (apic_x2apic_mode(apic)) {
@@ -174,13 +200,11 @@ static void recalculate_apic_map(struct kvm *kvm)
new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER;
}
- if (!kvm_apic_logical_map_valid(new))
+ if (!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask))
continue;
- apic_logical_id(new, ldr, &cid, &lid);
-
- if (lid && cid < ARRAY_SIZE(new->logical_map))
- new->logical_map[cid][ffs(lid) - 1] = apic;
+ if (mask)
+ cluster[ffs(mask) - 1] = apic;
}
out:
old = rcu_dereference_protected(kvm->arch.apic_map,
@@ -189,7 +213,7 @@ out:
mutex_unlock(&kvm->arch.apic_map_lock);
if (old)
- kfree_rcu(old, rcu);
+ call_rcu(&old->rcu, kvm_apic_map_free);
kvm_make_scan_ioapic_request(kvm);
}
@@ -210,7 +234,7 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
}
}
-static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
+static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
{
kvm_lapic_set_reg(apic, APIC_ID, id << 24);
recalculate_apic_map(apic->vcpu->kvm);
@@ -222,11 +246,11 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
recalculate_apic_map(apic->vcpu->kvm);
}
-static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u8 id)
+static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
{
u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
- kvm_lapic_set_reg(apic, APIC_ID, id << 24);
+ kvm_lapic_set_reg(apic, APIC_ID, id);
kvm_lapic_set_reg(apic, APIC_LDR, ldr);
recalculate_apic_map(apic->vcpu->kvm);
}
@@ -599,17 +623,30 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
}
}
-/* KVM APIC implementation has two quirks
- * - dest always begins at 0 while xAPIC MDA has offset 24,
- * - IOxAPIC messages have to be delivered (directly) to x2APIC.
+/* The KVM local APIC implementation has two quirks:
+ *
+ * - the xAPIC MDA stores the destination at bits 24-31, while this
+ * is not true of struct kvm_lapic_irq's dest_id field. This is
+ * just a quirk in the API and is not problematic.
+ *
+ * - in-kernel IOAPIC messages have to be delivered directly to
+ * x2APIC, because the kernel does not support interrupt remapping.
+ * In order to support broadcast without interrupt remapping, x2APIC
+ * rewrites the destination of non-IPI messages from APIC_BROADCAST
+ * to X2APIC_BROADCAST.
+ *
+ * The broadcast quirk can be disabled with KVM_CAP_X2APIC_API. This is
+ * important when userspace wants to use x2APIC-format MSIs, because
+ * APIC_BROADCAST (0xff) is a legal route for "cluster 0, CPUs 0-7".
*/
-static u32 kvm_apic_mda(unsigned int dest_id, struct kvm_lapic *source,
- struct kvm_lapic *target)
+static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id,
+ struct kvm_lapic *source, struct kvm_lapic *target)
{
bool ipi = source != NULL;
bool x2apic_mda = apic_x2apic_mode(ipi ? source : target);
- if (!ipi && dest_id == APIC_BROADCAST && x2apic_mda)
+ if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled &&
+ !ipi && dest_id == APIC_BROADCAST && x2apic_mda)
return X2APIC_BROADCAST;
return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id);
@@ -619,7 +656,7 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int short_hand, unsigned int dest, int dest_mode)
{
struct kvm_lapic *target = vcpu->arch.apic;
- u32 mda = kvm_apic_mda(dest, source, target);
+ u32 mda = kvm_apic_mda(vcpu, dest, source, target);
apic_debug("target %p, source %p, dest 0x%x, "
"dest_mode 0x%x, short_hand 0x%x\n",
@@ -671,102 +708,126 @@ static void kvm_apic_disabled_lapic_found(struct kvm *kvm)
}
}
-bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
- struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
+static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src,
+ struct kvm_lapic_irq *irq, struct kvm_apic_map *map)
{
- struct kvm_apic_map *map;
- unsigned long bitmap = 1;
- struct kvm_lapic **dst;
- int i;
- bool ret, x2apic_ipi;
+ if (kvm->arch.x2apic_broadcast_quirk_disabled) {
+ if ((irq->dest_id == APIC_BROADCAST &&
+ map->mode != KVM_APIC_MODE_X2APIC))
+ return true;
+ if (irq->dest_id == X2APIC_BROADCAST)
+ return true;
+ } else {
+ bool x2apic_ipi = src && *src && apic_x2apic_mode(*src);
+ if (irq->dest_id == (x2apic_ipi ?
+ X2APIC_BROADCAST : APIC_BROADCAST))
+ return true;
+ }
- *r = -1;
+ return false;
+}
- if (irq->shorthand == APIC_DEST_SELF) {
- *r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
- return true;
- }
+/* Return true if the interrupt can be handled by using *bitmap as index mask
+ * for valid destinations in *dst array.
+ * Return false if kvm_apic_map_get_dest_lapic did nothing useful.
+ * Note: we may have zero kvm_lapic destinations when we return true, which
+ * means that the interrupt should be dropped. In this case, *bitmap would be
+ * zero and *dst undefined.
+ */
+static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm,
+ struct kvm_lapic **src, struct kvm_lapic_irq *irq,
+ struct kvm_apic_map *map, struct kvm_lapic ***dst,
+ unsigned long *bitmap)
+{
+ int i, lowest;
- if (irq->shorthand)
+ if (irq->shorthand == APIC_DEST_SELF && src) {
+ *dst = src;
+ *bitmap = 1;
+ return true;
+ } else if (irq->shorthand)
return false;
- x2apic_ipi = src && apic_x2apic_mode(src);
- if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST))
+ if (!map || kvm_apic_is_broadcast_dest(kvm, src, irq, map))
return false;
- ret = true;
- rcu_read_lock();
- map = rcu_dereference(kvm->arch.apic_map);
-
- if (!map) {
- ret = false;
- goto out;
+ if (irq->dest_mode == APIC_DEST_PHYSICAL) {
+ if (irq->dest_id > map->max_apic_id) {
+ *bitmap = 0;
+ } else {
+ *dst = &map->phys_map[irq->dest_id];
+ *bitmap = 1;
+ }
+ return true;
}
- if (irq->dest_mode == APIC_DEST_PHYSICAL) {
- if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
- goto out;
+ *bitmap = 0;
+ if (!kvm_apic_map_get_logical_dest(map, irq->dest_id, dst,
+ (u16 *)bitmap))
+ return false;
- dst = &map->phys_map[irq->dest_id];
- } else {
- u16 cid;
+ if (!kvm_lowest_prio_delivery(irq))
+ return true;
- if (!kvm_apic_logical_map_valid(map)) {
- ret = false;
- goto out;
+ if (!kvm_vector_hashing_enabled()) {
+ lowest = -1;
+ for_each_set_bit(i, bitmap, 16) {
+ if (!(*dst)[i])
+ continue;
+ if (lowest < 0)
+ lowest = i;
+ else if (kvm_apic_compare_prio((*dst)[i]->vcpu,
+ (*dst)[lowest]->vcpu) < 0)
+ lowest = i;
}
+ } else {
+ if (!*bitmap)
+ return true;
- apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
+ lowest = kvm_vector_to_index(irq->vector, hweight16(*bitmap),
+ bitmap, 16);
- if (cid >= ARRAY_SIZE(map->logical_map))
- goto out;
+ if (!(*dst)[lowest]) {
+ kvm_apic_disabled_lapic_found(kvm);
+ *bitmap = 0;
+ return true;
+ }
+ }
- dst = map->logical_map[cid];
+ *bitmap = (lowest >= 0) ? 1 << lowest : 0;
- if (!kvm_lowest_prio_delivery(irq))
- goto set_irq;
+ return true;
+}
- if (!kvm_vector_hashing_enabled()) {
- int l = -1;
- for_each_set_bit(i, &bitmap, 16) {
- if (!dst[i])
- continue;
- if (l < 0)
- l = i;
- else if (kvm_apic_compare_prio(dst[i]->vcpu,
- dst[l]->vcpu) < 0)
- l = i;
- }
- bitmap = (l >= 0) ? 1 << l : 0;
- } else {
- int idx;
- unsigned int dest_vcpus;
+bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
+{
+ struct kvm_apic_map *map;
+ unsigned long bitmap;
+ struct kvm_lapic **dst = NULL;
+ int i;
+ bool ret;
- dest_vcpus = hweight16(bitmap);
- if (dest_vcpus == 0)
- goto out;
+ *r = -1;
- idx = kvm_vector_to_index(irq->vector,
- dest_vcpus, &bitmap, 16);
+ if (irq->shorthand == APIC_DEST_SELF) {
+ *r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
+ return true;
+ }
- if (!dst[idx]) {
- kvm_apic_disabled_lapic_found(kvm);
- goto out;
- }
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
- bitmap = (idx >= 0) ? 1 << idx : 0;
+ ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap);
+ if (ret)
+ for_each_set_bit(i, &bitmap, 16) {
+ if (!dst[i])
+ continue;
+ if (*r < 0)
+ *r = 0;
+ *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
}
- }
-set_irq:
- for_each_set_bit(i, &bitmap, 16) {
- if (!dst[i])
- continue;
- if (*r < 0)
- *r = 0;
- *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
- }
-out:
rcu_read_unlock();
return ret;
}
@@ -789,8 +850,9 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu)
{
struct kvm_apic_map *map;
+ unsigned long bitmap;
+ struct kvm_lapic **dst = NULL;
bool ret = false;
- struct kvm_lapic *dst = NULL;
if (irq->shorthand)
return false;
@@ -798,69 +860,16 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
rcu_read_lock();
map = rcu_dereference(kvm->arch.apic_map);
- if (!map)
- goto out;
-
- if (irq->dest_mode == APIC_DEST_PHYSICAL) {
- if (irq->dest_id == 0xFF)
- goto out;
-
- if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
- goto out;
-
- dst = map->phys_map[irq->dest_id];
- if (dst && kvm_apic_present(dst->vcpu))
- *dest_vcpu = dst->vcpu;
- else
- goto out;
- } else {
- u16 cid;
- unsigned long bitmap = 1;
- int i, r = 0;
-
- if (!kvm_apic_logical_map_valid(map))
- goto out;
-
- apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
-
- if (cid >= ARRAY_SIZE(map->logical_map))
- goto out;
-
- if (kvm_vector_hashing_enabled() &&
- kvm_lowest_prio_delivery(irq)) {
- int idx;
- unsigned int dest_vcpus;
+ if (kvm_apic_map_get_dest_lapic(kvm, NULL, irq, map, &dst, &bitmap) &&
+ hweight16(bitmap) == 1) {
+ unsigned long i = find_first_bit(&bitmap, 16);
- dest_vcpus = hweight16(bitmap);
- if (dest_vcpus == 0)
- goto out;
-
- idx = kvm_vector_to_index(irq->vector, dest_vcpus,
- &bitmap, 16);
-
- dst = map->logical_map[cid][idx];
- if (!dst) {
- kvm_apic_disabled_lapic_found(kvm);
- goto out;
- }
-
- *dest_vcpu = dst->vcpu;
- } else {
- for_each_set_bit(i, &bitmap, 16) {
- dst = map->logical_map[cid][i];
- if (++r == 2)
- goto out;
- }
-
- if (dst && kvm_apic_present(dst->vcpu))
- *dest_vcpu = dst->vcpu;
- else
- goto out;
+ if (dst[i]) {
+ *dest_vcpu = dst[i]->vcpu;
+ ret = true;
}
}
- ret = true;
-out:
rcu_read_unlock();
return ret;
}
@@ -1127,12 +1136,6 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
return 0;
switch (offset) {
- case APIC_ID:
- if (apic_x2apic_mode(apic))
- val = kvm_apic_id(apic);
- else
- val = kvm_apic_id(apic) << 24;
- break;
case APIC_ARBPRI:
apic_debug("Access APIC ARBPRI register which is for P6\n");
break;
@@ -1314,6 +1317,108 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
nsec_to_cycles(vcpu, lapic_timer_advance_ns)));
}
+static void start_sw_tscdeadline(struct kvm_lapic *apic)
+{
+ u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
+ u64 ns = 0;
+ ktime_t expire;
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
+ unsigned long flags;
+ ktime_t now;
+
+ if (unlikely(!tscdeadline || !this_tsc_khz))
+ return;
+
+ local_irq_save(flags);
+
+ now = apic->lapic_timer.timer.base->get_time();
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ if (likely(tscdeadline > guest_tsc)) {
+ ns = (tscdeadline - guest_tsc) * 1000000ULL;
+ do_div(ns, this_tsc_khz);
+ expire = ktime_add_ns(now, ns);
+ expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
+ hrtimer_start(&apic->lapic_timer.timer,
+ expire, HRTIMER_MODE_ABS_PINNED);
+ } else
+ apic_timer_expired(apic);
+
+ local_irq_restore(flags);
+}
+
+bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.apic->lapic_timer.hv_timer_in_use;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
+
+static void cancel_hv_tscdeadline(struct kvm_lapic *apic)
+{
+ kvm_x86_ops->cancel_hv_timer(apic->vcpu);
+ apic->lapic_timer.hv_timer_in_use = false;
+}
+
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+ WARN_ON(swait_active(&vcpu->wq));
+ cancel_hv_tscdeadline(apic);
+ apic_timer_expired(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
+
+static bool start_hv_tscdeadline(struct kvm_lapic *apic)
+{
+ u64 tscdeadline = apic->lapic_timer.tscdeadline;
+
+ if (atomic_read(&apic->lapic_timer.pending) ||
+ kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) {
+ if (apic->lapic_timer.hv_timer_in_use)
+ cancel_hv_tscdeadline(apic);
+ } else {
+ apic->lapic_timer.hv_timer_in_use = true;
+ hrtimer_cancel(&apic->lapic_timer.timer);
+
+ /* In case the sw timer triggered in the window */
+ if (atomic_read(&apic->lapic_timer.pending))
+ cancel_hv_tscdeadline(apic);
+ }
+ trace_kvm_hv_timer_state(apic->vcpu->vcpu_id,
+ apic->lapic_timer.hv_timer_in_use);
+ return apic->lapic_timer.hv_timer_in_use;
+}
+
+void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ WARN_ON(apic->lapic_timer.hv_timer_in_use);
+
+ if (apic_lvtt_tscdeadline(apic))
+ start_hv_tscdeadline(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
+
+void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ /* Possibly the TSC deadline timer is not enabled yet */
+ if (!apic->lapic_timer.hv_timer_in_use)
+ return;
+
+ cancel_hv_tscdeadline(apic);
+
+ if (atomic_read(&apic->lapic_timer.pending))
+ return;
+
+ start_sw_tscdeadline(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
+
static void start_apic_timer(struct kvm_lapic *apic)
{
ktime_t now;
@@ -1360,32 +1465,8 @@ static void start_apic_timer(struct kvm_lapic *apic)
ktime_to_ns(ktime_add_ns(now,
apic->lapic_timer.period)));
} else if (apic_lvtt_tscdeadline(apic)) {
- /* lapic timer in tsc deadline mode */
- u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
- u64 ns = 0;
- ktime_t expire;
- struct kvm_vcpu *vcpu = apic->vcpu;
- unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
- unsigned long flags;
-
- if (unlikely(!tscdeadline || !this_tsc_khz))
- return;
-
- local_irq_save(flags);
-
- now = apic->lapic_timer.timer.base->get_time();
- guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
- if (likely(tscdeadline > guest_tsc)) {
- ns = (tscdeadline - guest_tsc) * 1000000ULL;
- do_div(ns, this_tsc_khz);
- expire = ktime_add_ns(now, ns);
- expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
- hrtimer_start(&apic->lapic_timer.timer,
- expire, HRTIMER_MODE_ABS_PINNED);
- } else
- apic_timer_expired(apic);
-
- local_irq_restore(flags);
+ if (!(kvm_x86_ops->set_hv_timer && start_hv_tscdeadline(apic)))
+ start_sw_tscdeadline(apic);
}
}
@@ -1413,7 +1494,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
switch (reg) {
case APIC_ID: /* Local APIC ID */
if (!apic_x2apic_mode(apic))
- kvm_apic_set_id(apic, val >> 24);
+ kvm_apic_set_xapic_id(apic, val >> 24);
else
ret = 1;
break;
@@ -1674,9 +1755,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
/* update jump label if enable bit changes */
if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) {
- if (value & MSR_IA32_APICBASE_ENABLE)
+ if (value & MSR_IA32_APICBASE_ENABLE) {
+ kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
static_key_slow_dec_deferred(&apic_hw_disabled);
- else
+ } else
static_key_slow_inc(&apic_hw_disabled.key);
recalculate_apic_map(vcpu->kvm);
}
@@ -1716,8 +1798,11 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
/* Stop the timer in case it's a reset to an active apic */
hrtimer_cancel(&apic->lapic_timer.timer);
- if (!init_event)
- kvm_apic_set_id(apic, vcpu->vcpu_id);
+ if (!init_event) {
+ kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE |
+ MSR_IA32_APICBASE_ENABLE);
+ kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+ }
kvm_apic_set_version(apic->vcpu);
for (i = 0; i < KVM_APIC_LVT_NUM; i++)
@@ -1856,9 +1941,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
* thinking that APIC satet has changed.
*/
vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
- kvm_lapic_set_base(vcpu,
- APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE);
-
static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
kvm_lapic_reset(vcpu, false);
kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
@@ -1938,17 +2020,48 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
return vector;
}
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
- struct kvm_lapic_state *s)
+static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_state *s, bool set)
+{
+ if (apic_x2apic_mode(vcpu->arch.apic)) {
+ u32 *id = (u32 *)(s->regs + APIC_ID);
+
+ if (vcpu->kvm->arch.x2apic_format) {
+ if (*id != vcpu->vcpu_id)
+ return -EINVAL;
+ } else {
+ if (set)
+ *id >>= 24;
+ else
+ *id <<= 24;
+ }
+ }
+
+ return 0;
+}
+
+int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
+{
+ memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s));
+ return kvm_apic_state_fixup(vcpu, s, false);
+}
+
+int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
{
struct kvm_lapic *apic = vcpu->arch.apic;
+ int r;
+
kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
/* set SPIV separately to get count of SW disabled APICs right */
apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
+
+ r = kvm_apic_state_fixup(vcpu, s, true);
+ if (r)
+ return r;
memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
- /* call kvm_apic_set_id() to put apic into apic_map */
- kvm_apic_set_id(apic, kvm_apic_id(apic));
+
+ recalculate_apic_map(vcpu->kvm);
kvm_apic_set_version(vcpu);
apic_update_ppr(apic);
@@ -1974,6 +2087,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
kvm_rtc_eoi_tracking_restore_one(vcpu);
vcpu->arch.apic_arb_prio = 0;
+
+ return 0;
}
void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 891c6da7d4aa..f60d01c29d51 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -20,6 +20,7 @@ struct kvm_timer {
u64 tscdeadline;
u64 expired_tscdeadline;
atomic_t pending; /* accumulated triggered timers */
+ bool hv_timer_in_use;
};
struct kvm_lapic {
@@ -80,8 +81,8 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
- struct kvm_lapic_state *s);
+int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
+int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
@@ -199,9 +200,15 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
}
-static inline int kvm_apic_id(struct kvm_lapic *apic)
+static inline u32 kvm_apic_id(struct kvm_lapic *apic)
{
- return (kvm_lapic_get_reg(apic, APIC_ID) >> 24) & 0xff;
+ /* To avoid a race between apic_base and following APIC_ID update when
+ * switching to x2apic_mode, the x2apic mode returns initial x2apic id.
+ */
+ if (apic_x2apic_mode(apic))
+ return apic->vcpu->vcpu_id;
+
+ return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
}
bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
@@ -212,4 +219,8 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu);
int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
const unsigned long *bitmap, u32 bitmap_size);
+void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
+bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 745a5f445ae2..3d4cc8cc56a3 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -176,6 +176,7 @@ static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
static u64 __read_mostly shadow_mmio_mask;
+static u64 __read_mostly shadow_present_mask;
static void mmu_spte_set(u64 *sptep, u64 spte);
static void mmu_free_roots(struct kvm_vcpu *vcpu);
@@ -283,13 +284,14 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
}
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask)
+ u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask)
{
shadow_user_mask = user_mask;
shadow_accessed_mask = accessed_mask;
shadow_dirty_mask = dirty_mask;
shadow_nx_mask = nx_mask;
shadow_x_mask = x_mask;
+ shadow_present_mask = p_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
@@ -305,7 +307,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
static int is_shadow_present_pte(u64 pte)
{
- return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
+ return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte);
}
static int is_large_pte(u64 pte)
@@ -524,7 +526,7 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
}
/* Rules for using mmu_spte_update:
- * Update the state bits, it means the mapped pfn is not changged.
+ * Update the state bits, it means the mapped pfn is not changed.
*
* Whenever we overwrite a writable spte with a read-only one we
* should flush remote TLBs. Otherwise rmap_write_protect
@@ -2246,10 +2248,9 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
{
u64 spte;
- BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK ||
- VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
+ BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
- spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
+ spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
mmu_spte_set(sptep, spte);
@@ -2516,13 +2517,19 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
gfn_t gfn, kvm_pfn_t pfn, bool speculative,
bool can_unsync, bool host_writable)
{
- u64 spte;
+ u64 spte = 0;
int ret = 0;
if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
return 0;
- spte = PT_PRESENT_MASK;
+ /*
+ * For the EPT case, shadow_present_mask is 0 if hardware
+ * supports exec-only page table entries. In that case,
+ * ACC_USER_MASK and shadow_user_mask are used to represent
+ * read access. See FNAME(gpte_access) in paging_tmpl.h.
+ */
+ spte |= shadow_present_mask;
if (!speculative)
spte |= shadow_accessed_mask;
@@ -3190,7 +3197,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
MMU_WARN_ON(VALID_PAGE(root));
if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
- if (!is_present_gpte(pdptr)) {
+ if (!(pdptr & PT_PRESENT_MASK)) {
vcpu->arch.mmu.pae_root[i] = 0;
continue;
}
@@ -3915,9 +3922,7 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
* clearer.
*/
smap = cr4_smap && u && !uf && !ff;
- } else
- /* Not really needed: no U/S accesses on ept */
- u = 1;
+ }
fault = (ff && !x) || (uf && !u) || (wf && !w) ||
(smapf && smap);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 66b33b96a31b..ddc56e91f2e4 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -93,11 +93,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
return kvm_mmu_load(vcpu);
}
-static inline int is_present_gpte(unsigned long pte)
-{
- return pte & PT_PRESENT_MASK;
-}
-
/*
* Currently, we have two sorts of write-protection, a) the first one
* write-protects guest page to sync the guest modification, b) another one is
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index bc019f70e0b6..a01105485315 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -131,7 +131,7 @@ static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
static inline int FNAME(is_present_gpte)(unsigned long pte)
{
#if PTTYPE != PTTYPE_EPT
- return is_present_gpte(pte);
+ return pte & PT_PRESENT_MASK;
#else
return pte & 7;
#endif
@@ -181,13 +181,19 @@ no_present:
return true;
}
+/*
+ * For PTTYPE_EPT, a page table can be executable but not readable
+ * on supported processors. Therefore, set_spte does not automatically
+ * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK
+ * to signify readability since it isn't used in the EPT case
+ */
static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
{
unsigned access;
#if PTTYPE == PTTYPE_EPT
access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
- ACC_USER_MASK;
+ ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
#else
BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
BUILD_BUG_ON(ACC_EXEC_MASK != 1);
diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c
index ab38af4f4947..9d4a8504a95a 100644
--- a/arch/x86/kvm/pmu_intel.c
+++ b/arch/x86/kvm/pmu_intel.c
@@ -93,7 +93,7 @@ static unsigned intel_find_fixed_event(int idx)
return intel_arch_events[fixed_pmc_events[idx]].event_type;
}
-/* check if a PMC is enabled by comparising it with globl_ctrl bits. */
+/* check if a PMC is enabled by comparing it with globl_ctrl bits. */
static bool intel_pmc_is_enabled(struct kvm_pmc *pmc)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 16ef31b87452..af523d84d102 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1577,7 +1577,7 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
/*
- * Any change of EFLAGS.VM is accompained by a reload of SS
+ * Any change of EFLAGS.VM is accompanied by a reload of SS
* (caused by either a task switch or an inter-privilege IRET),
* so we do not need to update the CPL here.
*/
@@ -4940,6 +4940,12 @@ out:
static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
{
local_irq_enable();
+ /*
+ * We must have an instruction with interrupts enabled, so
+ * the timer interrupt isn't delayed by the interrupt shadow.
+ */
+ asm("nop");
+ local_irq_disable();
}
static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 8de925031b5c..0a6cc6754ec5 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1348,6 +1348,21 @@ TRACE_EVENT(kvm_avic_unaccelerated_access,
__entry->vec)
);
+TRACE_EVENT(kvm_hv_timer_state,
+ TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use),
+ TP_ARGS(vcpu_id, hv_timer_in_use),
+ TP_STRUCT__entry(
+ __field(unsigned int, vcpu_id)
+ __field(unsigned int, hv_timer_in_use)
+ ),
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->hv_timer_in_use = hv_timer_in_use;
+ ),
+ TP_printk("vcpu_id %x hv_timer %x\n",
+ __entry->vcpu_id,
+ __entry->hv_timer_in_use)
+);
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index df07a0a4611f..bc354f003ce1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -110,6 +110,13 @@ module_param_named(pml, enable_pml, bool, S_IRUGO);
#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
+/* Guest_tsc -> host_tsc conversion requires 64-bit division. */
+static int __read_mostly cpu_preemption_timer_multi;
+static bool __read_mostly enable_preemption_timer = 1;
+#ifdef CONFIG_X86_64
+module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
+#endif
+
#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
#define KVM_VM_CR0_ALWAYS_ON \
@@ -398,6 +405,12 @@ struct nested_vmx {
/* The host-usable pointer to the above */
struct page *current_vmcs12_page;
struct vmcs12 *current_vmcs12;
+ /*
+ * Cache of the guest's VMCS, existing outside of guest memory.
+ * Loaded from guest memory during VMPTRLD. Flushed to guest
+ * memory during VMXOFF, VMCLEAR, VMPTRLD.
+ */
+ struct vmcs12 *cached_vmcs12;
struct vmcs *current_shadow_vmcs;
/*
* Indicates if the shadow vmcs must be updated with the
@@ -421,7 +434,6 @@ struct nested_vmx {
struct pi_desc *pi_desc;
bool pi_pending;
u16 posted_intr_nv;
- u64 msr_ia32_feature_control;
struct hrtimer preemption_timer;
bool preemption_timer_expired;
@@ -597,11 +609,22 @@ struct vcpu_vmx {
#define PML_ENTITY_NUM 512
struct page *pml_pg;
+ /* apic deadline value in host tsc */
+ u64 hv_deadline_tsc;
+
u64 current_tsc_ratio;
bool guest_pkru_valid;
u32 guest_pkru;
u32 host_pkru;
+
+ /*
+ * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
+ * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
+ * in msr_ia32_feature_control_valid_bits.
+ */
+ u64 msr_ia32_feature_control;
+ u64 msr_ia32_feature_control_valid_bits;
};
enum segment_cache_field {
@@ -841,7 +864,7 @@ static inline short vmcs_field_to_offset(unsigned long field)
static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
{
- return to_vmx(vcpu)->nested.current_vmcs12;
+ return to_vmx(vcpu)->nested.cached_vmcs12;
}
static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
@@ -1056,6 +1079,58 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void)
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
}
+/*
+ * Comment's format: document - errata name - stepping - processor name.
+ * Refer from
+ * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
+ */
+static u32 vmx_preemption_cpu_tfms[] = {
+/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */
+0x000206E6,
+/* 323056.pdf - AAX65 - C2 - Xeon L3406 */
+/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
+/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020652,
+/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020655,
+/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */
+/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */
+/*
+ * 320767.pdf - AAP86 - B1 -
+ * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
+ */
+0x000106E5,
+/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
+0x000106A0,
+/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
+0x000106A1,
+/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
+0x000106A4,
+ /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
+ /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
+ /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
+0x000106A5,
+};
+
+static inline bool cpu_has_broken_vmx_preemption_timer(void)
+{
+ u32 eax = cpuid_eax(0x00000001), i;
+
+ /* Clear the reserved bits */
+ eax &= ~(0x3U << 14 | 0xfU << 28);
+ for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
+ if (eax == vmx_preemption_cpu_tfms[i])
+ return true;
+
+ return false;
+}
+
+static inline bool cpu_has_vmx_preemption_timer(void)
+{
+ return vmcs_config.pin_based_exec_ctrl &
+ PIN_BASED_VMX_PREEMPTION_TIMER;
+}
+
static inline bool cpu_has_vmx_posted_intr(void)
{
return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
@@ -1603,6 +1678,11 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
__vmcs_writel(field, __vmcs_readl(field) | mask);
}
+static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx)
+{
+ vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS);
+}
+
static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
{
vmcs_write32(VM_ENTRY_CONTROLS, val);
@@ -1631,6 +1711,11 @@ static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
}
+static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx)
+{
+ vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS);
+}
+
static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
{
vmcs_write32(VM_EXIT_CONTROLS, val);
@@ -2121,22 +2206,14 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+ bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
if (!vmm_exclusive)
kvm_cpu_vmxon(phys_addr);
- else if (vmx->loaded_vmcs->cpu != cpu)
+ else if (!already_loaded)
loaded_vmcs_clear(vmx->loaded_vmcs);
- if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
- per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
- vmcs_load(vmx->loaded_vmcs->vmcs);
- }
-
- if (vmx->loaded_vmcs->cpu != cpu) {
- struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
- unsigned long sysenter_esp;
-
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ if (!already_loaded) {
local_irq_disable();
crash_disable_local_vmclear(cpu);
@@ -2151,6 +2228,18 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
&per_cpu(loaded_vmcss_on_cpu, cpu));
crash_enable_local_vmclear(cpu);
local_irq_enable();
+ }
+
+ if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
+ per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
+ vmcs_load(vmx->loaded_vmcs->vmcs);
+ }
+
+ if (!already_loaded) {
+ struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
+ unsigned long sysenter_esp;
+
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
/*
* Linux uses per-cpu TSS and GDT, so set these when switching
@@ -2716,6 +2805,9 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
VMX_EPT_INVEPT_BIT;
+ if (cpu_has_vmx_ept_execute_only())
+ vmx->nested.nested_vmx_ept_caps |=
+ VMX_EPT_EXECUTE_ONLY_BIT;
vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
/*
* For nested guests, we don't do anything specific
@@ -2864,6 +2956,14 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
return 0;
}
+static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
+ uint64_t val)
+{
+ uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
+
+ return !(val & ~valid_bits);
+}
+
/*
* Reads an msr value (of 'msr_index') into 'pdata'.
* Returns 0 on success, non-0 otherwise.
@@ -2905,10 +3005,15 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
msr_info->data = vmcs_read64(GUEST_BNDCFGS);
break;
- case MSR_IA32_FEATURE_CONTROL:
- if (!nested_vmx_allowed(vcpu))
+ case MSR_IA32_MCG_EXT_CTL:
+ if (!msr_info->host_initiated &&
+ !(to_vmx(vcpu)->msr_ia32_feature_control &
+ FEATURE_CONTROL_LMCE))
return 1;
- msr_info->data = to_vmx(vcpu)->nested.msr_ia32_feature_control;
+ msr_info->data = vcpu->arch.mcg_ext_ctl;
+ break;
+ case MSR_IA32_FEATURE_CONTROL:
+ msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control;
break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
if (!nested_vmx_allowed(vcpu))
@@ -2998,12 +3103,20 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_TSC_ADJUST:
ret = kvm_set_msr_common(vcpu, msr_info);
break;
+ case MSR_IA32_MCG_EXT_CTL:
+ if ((!msr_info->host_initiated &&
+ !(to_vmx(vcpu)->msr_ia32_feature_control &
+ FEATURE_CONTROL_LMCE)) ||
+ (data & ~MCG_EXT_CTL_LMCE_EN))
+ return 1;
+ vcpu->arch.mcg_ext_ctl = data;
+ break;
case MSR_IA32_FEATURE_CONTROL:
- if (!nested_vmx_allowed(vcpu) ||
- (to_vmx(vcpu)->nested.msr_ia32_feature_control &
+ if (!vmx_feature_control_msr_valid(vcpu, data) ||
+ (to_vmx(vcpu)->msr_ia32_feature_control &
FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
return 1;
- vmx->nested.msr_ia32_feature_control = data;
+ vmx->msr_ia32_feature_control = data;
if (msr_info->host_initiated && data == 0)
vmx_leave_nested(vcpu);
break;
@@ -3297,25 +3410,27 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
vmx_capability.ept, vmx_capability.vpid);
}
- min = VM_EXIT_SAVE_DEBUG_CONTROLS;
+ min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
#ifdef CONFIG_X86_64
min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
#endif
opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
- VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS;
+ VM_EXIT_CLEAR_BNDCFGS;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
&_vmexit_control) < 0)
return -EIO;
min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
- opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
+ opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
+ PIN_BASED_VMX_PREEMPTION_TIMER;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
&_pin_based_exec_control) < 0)
return -EIO;
+ if (cpu_has_broken_vmx_preemption_timer())
+ _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
if (!(_cpu_based_2nd_exec_control &
- SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
- !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
@@ -3364,7 +3479,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
/*
* Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL
- * but due to arrata below it can't be used. Workaround is to use
+ * but due to errata below it can't be used. Workaround is to use
* msr load mechanism to switch IA32_PERF_GLOBAL_CTRL.
*
* VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
@@ -4781,6 +4896,8 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
if (!kvm_vcpu_apicv_active(&vmx->vcpu))
pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+ /* Enable the preemption timer dynamically */
+ pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
return pin_based_exec_ctrl;
}
@@ -4896,6 +5013,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
/* Control */
vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
+ vmx->hv_deadline_tsc = -1;
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
@@ -6016,12 +6134,14 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
trace_kvm_page_fault(gpa, exit_qualification);
- /* It is a write fault? */
- error_code = exit_qualification & PFERR_WRITE_MASK;
+ /* it is a read fault? */
+ error_code = (exit_qualification << 2) & PFERR_USER_MASK;
+ /* it is a write fault? */
+ error_code |= exit_qualification & PFERR_WRITE_MASK;
/* It is a fetch fault? */
error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK;
/* ept page table is present? */
- error_code |= (exit_qualification >> 3) & PFERR_PRESENT_MASK;
+ error_code |= (exit_qualification & 0x38) != 0;
vcpu->arch.exit_qualification = exit_qualification;
@@ -6355,9 +6475,6 @@ static __init int hardware_setup(void)
for (msr = 0x800; msr <= 0x8ff; msr++)
vmx_disable_intercept_msr_read_x2apic(msr);
- /* According SDM, in x2apic mode, the whole id reg is used. But in
- * KVM, it only use the highest eight bits. Need to intercept it */
- vmx_enable_intercept_msr_read_x2apic(0x802);
/* TMCCT */
vmx_enable_intercept_msr_read_x2apic(0x839);
/* TPR */
@@ -6368,10 +6485,12 @@ static __init int hardware_setup(void)
vmx_disable_intercept_msr_write_x2apic(0x83f);
if (enable_ept) {
- kvm_mmu_set_mask_ptes(0ull,
+ kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
(enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
(enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
- 0ull, VMX_EPT_EXECUTABLE_MASK);
+ 0ull, VMX_EPT_EXECUTABLE_MASK,
+ cpu_has_vmx_ept_execute_only() ?
+ 0ull : VMX_EPT_READABLE_MASK);
ept_set_mmio_spte_mask();
kvm_enable_tdp();
} else
@@ -6393,8 +6512,21 @@ static __init int hardware_setup(void)
kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
}
+ if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
+ u64 vmx_msr;
+
+ rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
+ cpu_preemption_timer_multi =
+ vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+ } else {
+ kvm_x86_ops->set_hv_timer = NULL;
+ kvm_x86_ops->cancel_hv_timer = NULL;
+ }
+
kvm_set_posted_intr_wakeup_handler(wakeup_handler);
+ kvm_mce_cap_supported |= MCG_LMCE_P;
+
return alloc_kvm_area();
out8:
@@ -6862,16 +6994,22 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
return 1;
}
- if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
+ if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
!= VMXON_NEEDED_FEATURES) {
kvm_inject_gp(vcpu, 0);
return 1;
}
+ vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
+ if (!vmx->nested.cached_vmcs12)
+ return -ENOMEM;
+
if (enable_shadow_vmcs) {
shadow_vmcs = alloc_vmcs();
- if (!shadow_vmcs)
+ if (!shadow_vmcs) {
+ kfree(vmx->nested.cached_vmcs12);
return -ENOMEM;
+ }
/* mark vmcs as shadow */
shadow_vmcs->revision_id |= (1u << 31);
/* init shadow vmcs */
@@ -6942,6 +7080,11 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
vmcs_write64(VMCS_LINK_POINTER, -1ull);
}
vmx->nested.posted_intr_nv = -1;
+
+ /* Flush VMCS12 to guest memory */
+ memcpy(vmx->nested.current_vmcs12, vmx->nested.cached_vmcs12,
+ VMCS12_SIZE);
+
kunmap(vmx->nested.current_vmcs12_page);
nested_release_page(vmx->nested.current_vmcs12_page);
vmx->nested.current_vmptr = -1ull;
@@ -6962,6 +7105,7 @@ static void free_nested(struct vcpu_vmx *vmx)
nested_release_vmcs12(vmx);
if (enable_shadow_vmcs)
free_vmcs(vmx->nested.current_shadow_vmcs);
+ kfree(vmx->nested.cached_vmcs12);
/* Unpin physical memory we referred to in current vmcs02 */
if (vmx->nested.apic_access_page) {
nested_release_page(vmx->nested.apic_access_page);
@@ -7365,6 +7509,13 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
vmx->nested.current_vmptr = vmptr;
vmx->nested.current_vmcs12 = new_vmcs12;
vmx->nested.current_vmcs12_page = page;
+ /*
+ * Load VMCS12 from guest memory since it is not already
+ * cached.
+ */
+ memcpy(vmx->nested.cached_vmcs12,
+ vmx->nested.current_vmcs12, VMCS12_SIZE);
+
if (enable_shadow_vmcs) {
vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
SECONDARY_EXEC_SHADOW_VMCS);
@@ -7560,6 +7711,12 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+{
+ kvm_lapic_expired_hv_timer(vcpu);
+ return 1;
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -7610,6 +7767,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_XSAVES] = handle_xsaves,
[EXIT_REASON_XRSTORS] = handle_xrstors,
[EXIT_REASON_PML_FULL] = handle_pml_full,
+ [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
};
static const int kvm_vmx_max_exit_handlers =
@@ -7918,6 +8076,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
* the XSS exit bitmap in vmcs12.
*/
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
+ case EXIT_REASON_PREEMPTION_TIMER:
+ return false;
default:
return true;
}
@@ -8303,7 +8463,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
* the next L2->L1 exit.
*/
if (!is_guest_mode(vcpu) ||
- !nested_cpu_has2(vmx->nested.current_vmcs12,
+ !nested_cpu_has2(get_vmcs12(&vmx->vcpu),
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
vmcs_write64(APIC_ACCESS_ADDR, hpa);
}
@@ -8436,7 +8596,6 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
"push %[sp]\n\t"
#endif
"pushf\n\t"
- "orl $0x200, (%%" _ASM_SP ")\n\t"
__ASM_SIZE(push) " $%c[cs]\n\t"
"call *%[entry]\n\t"
:
@@ -8449,8 +8608,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
[ss]"i"(__KERNEL_DS),
[cs]"i"(__KERNEL_CS)
);
- } else
- local_irq_enable();
+ }
}
static bool vmx_has_high_real_mode_segbase(void)
@@ -8601,6 +8759,26 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
msrs[i].host);
}
+void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 tscl;
+ u32 delta_tsc;
+
+ if (vmx->hv_deadline_tsc == -1)
+ return;
+
+ tscl = rdtsc();
+ if (vmx->hv_deadline_tsc > tscl)
+ /* sure to be 32 bit only because checked on set_hv_timer */
+ delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
+ cpu_preemption_timer_multi);
+ else
+ delta_tsc = 0;
+
+ vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
+}
+
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -8650,6 +8828,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
atomic_switch_perf_msrs(vmx);
debugctlmsr = get_debugctlmsr();
+ vmx_arm_hv_timer(vcpu);
+
vmx->__launched = vmx->loaded_vmcs->launched;
asm(
/* Store host registers */
@@ -8940,6 +9120,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
vmx->nested.current_vmptr = -1ull;
vmx->nested.current_vmcs12 = NULL;
+ vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
+
return &vmx->vcpu;
free_vmcs:
@@ -9080,6 +9262,13 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
if (cpu_has_secondary_exec_ctrls())
vmcs_set_secondary_exec_control(secondary_exec_ctl);
+
+ if (nested_vmx_allowed(vcpu))
+ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+ FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+ else
+ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+ ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
}
static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -9636,9 +9825,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs_write64(VMCS_LINK_POINTER, -1ull);
exec_control = vmcs12->pin_based_vm_exec_control;
- exec_control |= vmcs_config.pin_based_exec_ctrl;
+
+ /* Preemption timer setting is only taken from vmcs01. */
exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ exec_control |= vmcs_config.pin_based_exec_ctrl;
+ if (vmx->hv_deadline_tsc == -1)
+ exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ /* Posted interrupts setting is only taken from vmcs12. */
if (nested_cpu_has_posted_intr(vmcs12)) {
/*
* Note that we use L0's vector here and in
@@ -10556,8 +10750,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vmcs12->vm_exit_intr_error_code,
KVM_ISA_VMX);
- vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS));
- vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS));
+ vm_entry_controls_reset_shadow(vmx);
+ vm_exit_controls_reset_shadow(vmx);
vmx_segment_cache_clear(vmx);
/* if no vmcs02 cache requested, remove the one we used */
@@ -10566,8 +10760,14 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
load_vmcs12_host_state(vcpu, vmcs12);
- /* Update TSC_OFFSET if TSC was changed while L2 ran */
+ /* Update any VMCS fields that might have changed while L2 ran */
vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
+ if (vmx->hv_deadline_tsc == -1)
+ vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_VMX_PREEMPTION_TIMER);
+ else
+ vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_VMX_PREEMPTION_TIMER);
/* This is needed for same reason as it was needed in prepare_vmcs02 */
vmx->host_rsp = 0;
@@ -10647,6 +10847,64 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
return X86EMUL_CONTINUE;
}
+#ifdef CONFIG_X86_64
+/* (a << shift) / divisor, return 1 if overflow otherwise 0 */
+static inline int u64_shl_div_u64(u64 a, unsigned int shift,
+ u64 divisor, u64 *result)
+{
+ u64 low = a << shift, high = a >> (64 - shift);
+
+ /* To avoid the overflow on divq */
+ if (high >= divisor)
+ return 1;
+
+ /* Low hold the result, high hold rem which is discarded */
+ asm("divq %2\n\t" : "=a" (low), "=d" (high) :
+ "rm" (divisor), "0" (low), "1" (high));
+ *result = low;
+
+ return 0;
+}
+
+static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 tscl = rdtsc();
+ u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
+ u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
+
+ /* Convert to host delta tsc if tsc scaling is enabled */
+ if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
+ u64_shl_div_u64(delta_tsc,
+ kvm_tsc_scaling_ratio_frac_bits,
+ vcpu->arch.tsc_scaling_ratio,
+ &delta_tsc))
+ return -ERANGE;
+
+ /*
+ * If the delta tsc can't fit in the 32 bit after the multi shift,
+ * we can't use the preemption timer.
+ * It's possible that it fits on later vmentries, but checking
+ * on every vmentry is costly so we just use an hrtimer.
+ */
+ if (delta_tsc >> (cpu_preemption_timer_multi + 32))
+ return -ERANGE;
+
+ vmx->hv_deadline_tsc = tscl + delta_tsc;
+ vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_VMX_PREEMPTION_TIMER);
+ return 0;
+}
+
+static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ vmx->hv_deadline_tsc = -1;
+ vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_VMX_PREEMPTION_TIMER);
+}
+#endif
+
static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
if (ple_gap)
@@ -10691,7 +10949,7 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
* this case, return 1, otherwise, return 0.
*
*/
-static int vmx_pre_block(struct kvm_vcpu *vcpu)
+static int pi_pre_block(struct kvm_vcpu *vcpu)
{
unsigned long flags;
unsigned int dest;
@@ -10758,7 +11016,18 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
return 0;
}
-static void vmx_post_block(struct kvm_vcpu *vcpu)
+static int vmx_pre_block(struct kvm_vcpu *vcpu)
+{
+ if (pi_pre_block(vcpu))
+ return 1;
+
+ if (kvm_lapic_hv_timer_in_use(vcpu))
+ kvm_lapic_switch_to_sw_timer(vcpu);
+
+ return 0;
+}
+
+static void pi_post_block(struct kvm_vcpu *vcpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
struct pi_desc old, new;
@@ -10800,6 +11069,14 @@ static void vmx_post_block(struct kvm_vcpu *vcpu)
}
}
+static void vmx_post_block(struct kvm_vcpu *vcpu)
+{
+ if (kvm_x86_ops->set_hv_timer)
+ kvm_lapic_switch_to_hv_timer(vcpu);
+
+ pi_post_block(vcpu);
+}
+
/*
* vmx_update_pi_irte - set IRTE for Posted-Interrupts
*
@@ -10844,7 +11121,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
* We will support full lowest-priority interrupt later.
*/
- kvm_set_msi_irq(e, &irq);
+ kvm_set_msi_irq(kvm, e, &irq);
if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
/*
* Make sure the IRTE is in remapped mode if
@@ -10889,6 +11166,16 @@ out:
return ret;
}
+static void vmx_setup_mce(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.mcg_cap & MCG_LMCE_P)
+ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+ FEATURE_CONTROL_LMCE;
+ else
+ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+ ~FEATURE_CONTROL_LMCE;
+}
+
static struct kvm_x86_ops vmx_x86_ops = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -11013,6 +11300,13 @@ static struct kvm_x86_ops vmx_x86_ops = {
.pmu_ops = &intel_pmu_ops,
.update_pi_irte = vmx_update_pi_irte,
+
+#ifdef CONFIG_X86_64
+ .set_hv_timer = vmx_set_hv_timer,
+ .cancel_hv_timer = vmx_cancel_hv_timer,
+#endif
+
+ .setup_mce = vmx_setup_mce,
};
static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9c496c7e8c00..19f9f9e05c2a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -71,7 +71,8 @@
#define MAX_IO_MSRS 256
#define KVM_MAX_MCE_BANKS 32
-#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
+u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
+EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
#define emul_to_vcpu(ctxt) \
container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
@@ -90,8 +91,12 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
+#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
+ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
+
static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
+static void enter_smm(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
struct kvm_x86_ops *kvm_x86_ops __read_mostly;
@@ -114,7 +119,8 @@ u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits;
EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
u64 __read_mostly kvm_max_tsc_scaling_ratio;
EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
-static u64 __read_mostly kvm_default_tsc_scaling_ratio;
+u64 __read_mostly kvm_default_tsc_scaling_ratio;
+EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
static u32 __read_mostly tsc_tolerance_ppm = 250;
@@ -538,7 +544,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
goto out;
}
for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
- if (is_present_gpte(pdpte[i]) &&
+ if ((pdpte[i] & PT_PRESENT_MASK) &&
(pdpte[i] &
vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) {
ret = 0;
@@ -983,6 +989,7 @@ static u32 emulated_msrs[] = {
MSR_IA32_MISC_ENABLE,
MSR_IA32_MCG_STATUS,
MSR_IA32_MCG_CTL,
+ MSR_IA32_MCG_EXT_CTL,
MSR_IA32_SMBASE,
};
@@ -1162,7 +1169,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
int version;
int r;
struct pvclock_wall_clock wc;
- struct timespec boot;
+ struct timespec64 boot;
if (!wall_clock)
return;
@@ -1185,13 +1192,13 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
* wall clock specified here. guest system time equals host
* system time for us, thus we must fill in host boot time here.
*/
- getboottime(&boot);
+ getboottime64(&boot);
if (kvm->arch.kvmclock_offset) {
- struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset);
- boot = timespec_sub(boot, ts);
+ struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
+ boot = timespec64_sub(boot, ts);
}
- wc.sec = boot.tv_sec;
+ wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
wc.nsec = boot.tv_nsec;
wc.version = version;
@@ -2616,6 +2623,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_TSC_CONTROL:
r = kvm_has_tsc_control;
break;
+ case KVM_CAP_X2APIC_API:
+ r = KVM_X2APIC_API_VALID_FLAGS;
+ break;
default:
r = 0;
break;
@@ -2678,11 +2688,9 @@ long kvm_arch_dev_ioctl(struct file *filp,
break;
}
case KVM_X86_GET_MCE_CAP_SUPPORTED: {
- u64 mce_cap;
-
- mce_cap = KVM_MCE_CAP_SUPPORTED;
r = -EFAULT;
- if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
+ if (copy_to_user(argp, &kvm_mce_cap_supported,
+ sizeof(kvm_mce_cap_supported)))
goto out;
r = 0;
break;
@@ -2734,6 +2742,11 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
rdtsc() - vcpu->arch.last_host_tsc;
if (tsc_delta < 0)
mark_tsc_unstable("KVM discovered backwards TSC");
+
+ if (kvm_lapic_hv_timer_in_use(vcpu) &&
+ kvm_x86_ops->set_hv_timer(vcpu,
+ kvm_get_lapic_tscdeadline_msr(vcpu)))
+ kvm_lapic_switch_to_sw_timer(vcpu);
if (check_tsc_unstable()) {
u64 offset = kvm_compute_tsc_offset(vcpu,
vcpu->arch.last_guest_tsc);
@@ -2767,15 +2780,17 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
if (vcpu->arch.apicv_active)
kvm_x86_ops->sync_pir_to_irr(vcpu);
- memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
-
- return 0;
+ return kvm_apic_get_state(vcpu, s);
}
static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- kvm_apic_post_state_restore(vcpu, s);
+ int r;
+
+ r = kvm_apic_set_state(vcpu, s);
+ if (r)
+ return r;
update_cr8_intercept(vcpu);
return 0;
@@ -2860,7 +2875,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
r = -EINVAL;
if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
goto out;
- if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
+ if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
goto out;
r = 0;
vcpu->arch.mcg_cap = mcg_cap;
@@ -2870,6 +2885,9 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
/* Init IA32_MCi_CTL to all 1s */
for (bank = 0; bank < bank_num; bank++)
vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+
+ if (kvm_x86_ops->setup_mce)
+ kvm_x86_ops->setup_mce(vcpu);
out:
return r;
}
@@ -3768,7 +3786,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
r = -EEXIST;
if (irqchip_in_kernel(kvm))
goto split_irqchip_unlock;
- if (atomic_read(&kvm->online_vcpus))
+ if (kvm->created_vcpus)
goto split_irqchip_unlock;
r = kvm_setup_empty_irq_routing(kvm);
if (r)
@@ -3782,6 +3800,18 @@ split_irqchip_unlock:
mutex_unlock(&kvm->lock);
break;
}
+ case KVM_CAP_X2APIC_API:
+ r = -EINVAL;
+ if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
+ break;
+
+ if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
+ kvm->arch.x2apic_format = true;
+ if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
+ kvm->arch.x2apic_broadcast_quirk_disabled = true;
+
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
@@ -3833,7 +3863,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (kvm->arch.vpic)
goto create_irqchip_unlock;
r = -EINVAL;
- if (atomic_read(&kvm->online_vcpus))
+ if (kvm->created_vcpus)
goto create_irqchip_unlock;
r = -ENOMEM;
vpic = kvm_create_pic(kvm);
@@ -3873,7 +3903,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
sizeof(struct kvm_pit_config)))
goto out;
create_pit:
- mutex_lock(&kvm->slots_lock);
+ mutex_lock(&kvm->lock);
r = -EEXIST;
if (kvm->arch.vpit)
goto create_pit_unlock;
@@ -3882,7 +3912,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (kvm->arch.vpit)
r = 0;
create_pit_unlock:
- mutex_unlock(&kvm->slots_lock);
+ mutex_unlock(&kvm->lock);
break;
case KVM_GET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
@@ -3989,7 +4019,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
case KVM_SET_BOOT_CPU_ID:
r = 0;
mutex_lock(&kvm->lock);
- if (atomic_read(&kvm->online_vcpus) != 0)
+ if (kvm->created_vcpus)
r = -EBUSY;
else
kvm->arch.bsp_vcpu_id = arg;
@@ -5297,13 +5327,8 @@ static void kvm_smm_changed(struct kvm_vcpu *vcpu)
/* This is a good place to trace that we are exiting SMM. */
trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
- if (unlikely(vcpu->arch.smi_pending)) {
- kvm_make_request(KVM_REQ_SMI, vcpu);
- vcpu->arch.smi_pending = 0;
- } else {
- /* Process a latched INIT, if any. */
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- }
+ /* Process a latched INIT or SMI, if any. */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
}
kvm_mmu_reset_context(vcpu);
@@ -5849,8 +5874,8 @@ int kvm_arch_init(void *opaque)
kvm_x86_ops = ops;
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
- PT_DIRTY_MASK, PT64_NX_MASK, 0);
-
+ PT_DIRTY_MASK, PT64_NX_MASK, 0,
+ PT_PRESENT_MASK);
kvm_timer_init();
perf_register_guest_info_callbacks(&kvm_guest_cbs);
@@ -6084,7 +6109,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
}
/* try to inject new event if pending */
- if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
+ if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
+ vcpu->arch.smi_pending = false;
+ enter_smm(vcpu);
+ } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
--vcpu->arch.nmi_pending;
vcpu->arch.nmi_injected = true;
kvm_x86_ops->set_nmi(vcpu);
@@ -6107,6 +6135,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
kvm_x86_ops->set_irq(vcpu);
}
}
+
return 0;
}
@@ -6130,7 +6159,7 @@ static void process_nmi(struct kvm_vcpu *vcpu)
#define put_smstate(type, buf, offset, val) \
*(type *)((buf) + (offset) - 0x7e00) = val
-static u32 process_smi_get_segment_flags(struct kvm_segment *seg)
+static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
{
u32 flags = 0;
flags |= seg->g << 23;
@@ -6144,7 +6173,7 @@ static u32 process_smi_get_segment_flags(struct kvm_segment *seg)
return flags;
}
-static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
+static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
{
struct kvm_segment seg;
int offset;
@@ -6159,11 +6188,11 @@ static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
put_smstate(u32, buf, offset + 8, seg.base);
put_smstate(u32, buf, offset + 4, seg.limit);
- put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg));
+ put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg));
}
#ifdef CONFIG_X86_64
-static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
+static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
{
struct kvm_segment seg;
int offset;
@@ -6172,7 +6201,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
kvm_get_segment(vcpu, &seg, n);
offset = 0x7e00 + n * 16;
- flags = process_smi_get_segment_flags(&seg) >> 8;
+ flags = enter_smm_get_segment_flags(&seg) >> 8;
put_smstate(u16, buf, offset, seg.selector);
put_smstate(u16, buf, offset + 2, flags);
put_smstate(u32, buf, offset + 4, seg.limit);
@@ -6180,7 +6209,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
}
#endif
-static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
+static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
{
struct desc_ptr dt;
struct kvm_segment seg;
@@ -6204,13 +6233,13 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7fc4, seg.selector);
put_smstate(u32, buf, 0x7f64, seg.base);
put_smstate(u32, buf, 0x7f60, seg.limit);
- put_smstate(u32, buf, 0x7f5c, process_smi_get_segment_flags(&seg));
+ put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg));
kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
put_smstate(u32, buf, 0x7fc0, seg.selector);
put_smstate(u32, buf, 0x7f80, seg.base);
put_smstate(u32, buf, 0x7f7c, seg.limit);
- put_smstate(u32, buf, 0x7f78, process_smi_get_segment_flags(&seg));
+ put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
kvm_x86_ops->get_gdt(vcpu, &dt);
put_smstate(u32, buf, 0x7f74, dt.address);
@@ -6221,7 +6250,7 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7f54, dt.size);
for (i = 0; i < 6; i++)
- process_smi_save_seg_32(vcpu, buf, i);
+ enter_smm_save_seg_32(vcpu, buf, i);
put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
@@ -6230,7 +6259,7 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
}
-static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
+static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
{
#ifdef CONFIG_X86_64
struct desc_ptr dt;
@@ -6262,7 +6291,7 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
put_smstate(u16, buf, 0x7e90, seg.selector);
- put_smstate(u16, buf, 0x7e92, process_smi_get_segment_flags(&seg) >> 8);
+ put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8);
put_smstate(u32, buf, 0x7e94, seg.limit);
put_smstate(u64, buf, 0x7e98, seg.base);
@@ -6272,7 +6301,7 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
put_smstate(u16, buf, 0x7e70, seg.selector);
- put_smstate(u16, buf, 0x7e72, process_smi_get_segment_flags(&seg) >> 8);
+ put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8);
put_smstate(u32, buf, 0x7e74, seg.limit);
put_smstate(u64, buf, 0x7e78, seg.base);
@@ -6281,31 +6310,26 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u64, buf, 0x7e68, dt.address);
for (i = 0; i < 6; i++)
- process_smi_save_seg_64(vcpu, buf, i);
+ enter_smm_save_seg_64(vcpu, buf, i);
#else
WARN_ON_ONCE(1);
#endif
}
-static void process_smi(struct kvm_vcpu *vcpu)
+static void enter_smm(struct kvm_vcpu *vcpu)
{
struct kvm_segment cs, ds;
struct desc_ptr dt;
char buf[512];
u32 cr0;
- if (is_smm(vcpu)) {
- vcpu->arch.smi_pending = true;
- return;
- }
-
trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
vcpu->arch.hflags |= HF_SMM_MASK;
memset(buf, 0, 512);
if (guest_cpuid_has_longmode(vcpu))
- process_smi_save_state_64(vcpu, buf);
+ enter_smm_save_state_64(vcpu, buf);
else
- process_smi_save_state_32(vcpu, buf);
+ enter_smm_save_state_32(vcpu, buf);
kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
@@ -6361,6 +6385,12 @@ static void process_smi(struct kvm_vcpu *vcpu)
kvm_mmu_reset_context(vcpu);
}
+static void process_smi(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.smi_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
void kvm_make_scan_ioapic_request(struct kvm *kvm)
{
kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
@@ -6555,8 +6585,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (inject_pending_event(vcpu, req_int_win) != 0)
req_immediate_exit = true;
- /* enable NMI/IRQ window open exits if needed */
else {
+ /* Enable NMI/IRQ window open exits if needed.
+ *
+ * SMIs have two cases: 1) they can be nested, and
+ * then there is nothing to do here because RSM will
+ * cause a vmexit anyway; 2) or the SMI can be pending
+ * because inject_pending_event has completed the
+ * injection of an IRQ or NMI from the previous vmexit,
+ * and then we request an immediate exit to inject the SMI.
+ */
+ if (vcpu->arch.smi_pending && !is_smm(vcpu))
+ req_immediate_exit = true;
if (vcpu->arch.nmi_pending)
kvm_x86_ops->enable_nmi_window(vcpu);
if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
@@ -6607,12 +6647,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_load_guest_xcr0(vcpu);
- if (req_immediate_exit)
+ if (req_immediate_exit) {
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
smp_send_reschedule(vcpu->cpu);
+ }
trace_kvm_entry(vcpu->vcpu_id);
wait_lapic_expire(vcpu);
- __kvm_guest_enter();
+ guest_enter_irqoff();
if (unlikely(vcpu->arch.switch_db_regs)) {
set_debugreg(0, 7);
@@ -6663,16 +6705,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
++vcpu->stat.exits;
- /*
- * We must have an instruction between local_irq_enable() and
- * kvm_guest_exit(), so the timer interrupt isn't delayed by
- * the interrupt shadow. The stat.exits increment will do nicely.
- * But we need to prevent reordering, hence this barrier():
- */
- barrier();
-
- kvm_guest_exit();
+ guest_exit_irqoff();
+ local_irq_enable();
preempt_enable();
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
@@ -7409,6 +7444,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
vcpu->arch.hflags = 0;
+ vcpu->arch.smi_pending = 0;
atomic_set(&vcpu->arch.nmi_queued, 0);
vcpu->arch.nmi_pending = 0;
vcpu->arch.nmi_injected = false;
@@ -7601,11 +7637,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
}
-bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
-{
- return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu);
-}
-
struct static_key kvm_no_apic_vcpu __read_mostly;
EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
@@ -7872,7 +7903,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
kvm_free_vcpus(kvm);
- kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+ kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
kvm_mmu_uninit_vm(kvm);
}
@@ -8380,7 +8411,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
/*
* When producer of consumer is unregistered, we change back to
* remapped mode, so we can re-use the current implementation
- * when the irq is masked/disabed or the consumer side (KVM
+ * when the irq is masked/disabled or the consumer side (KVM
* int this case doesn't want to receive the interrupts.
*/
ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c
index 0ac520dd1b21..c71df0c7dedc 100644
--- a/drivers/s390/char/sclp_early.c
+++ b/drivers/s390/char/sclp_early.c
@@ -46,7 +46,8 @@ struct read_info_sccb {
u64 rnmax2; /* 104-111 */
u8 _pad_112[116 - 112]; /* 112-115 */
u8 fac116; /* 116 */
- u8 _pad_117[119 - 117]; /* 117-118 */
+ u8 fac117; /* 117 */
+ u8 _pad_118; /* 118 */
u8 fac119; /* 119 */
u16 hcpua; /* 120-121 */
u8 _pad_122[124 - 122]; /* 122-123 */
@@ -114,7 +115,12 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb)
sclp.facilities = sccb->facilities;
sclp.has_sprp = !!(sccb->fac84 & 0x02);
sclp.has_core_type = !!(sccb->fac84 & 0x01);
+ sclp.has_gsls = !!(sccb->fac85 & 0x80);
+ sclp.has_64bscao = !!(sccb->fac116 & 0x80);
+ sclp.has_cmma = !!(sccb->fac116 & 0x40);
sclp.has_esca = !!(sccb->fac116 & 0x08);
+ sclp.has_pfmfi = !!(sccb->fac117 & 0x40);
+ sclp.has_ibs = !!(sccb->fac117 & 0x20);
sclp.has_hvs = !!(sccb->fac119 & 0x80);
if (sccb->fac85 & 0x02)
S390_lowcore.machine_flags |= MACHINE_FLAG_ESOP;
@@ -145,6 +151,10 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb)
sclp.has_siif = cpue->siif;
sclp.has_sigpif = cpue->sigpif;
sclp.has_sief2 = cpue->sief2;
+ sclp.has_gpere = cpue->gpere;
+ sclp.has_ib = cpue->ib;
+ sclp.has_cei = cpue->cei;
+ sclp.has_skey = cpue->skey;
break;
}
diff --git a/drivers/s390/char/sclp_ocf.c b/drivers/s390/char/sclp_ocf.c
index 2553db0fdb52..f59b71776bbd 100644
--- a/drivers/s390/char/sclp_ocf.c
+++ b/drivers/s390/char/sclp_ocf.c
@@ -26,7 +26,7 @@
#define OCF_LENGTH_CPC_NAME 8UL
static char hmc_network[OCF_LENGTH_HMC_NETWORK + 1];
-static char cpc_name[OCF_LENGTH_CPC_NAME + 1];
+static char cpc_name[OCF_LENGTH_CPC_NAME]; /* in EBCDIC */
static DEFINE_SPINLOCK(sclp_ocf_lock);
static struct work_struct sclp_ocf_change_work;
@@ -72,9 +72,8 @@ static void sclp_ocf_handler(struct evbuf_header *evbuf)
}
if (cpc) {
size = min(OCF_LENGTH_CPC_NAME, (size_t) cpc->length);
+ memset(cpc_name, 0, OCF_LENGTH_CPC_NAME);
memcpy(cpc_name, cpc + 1, size);
- EBCASC(cpc_name, size);
- cpc_name[size] = 0;
}
spin_unlock(&sclp_ocf_lock);
schedule_work(&sclp_ocf_change_work);
@@ -85,15 +84,23 @@ static struct sclp_register sclp_ocf_event = {
.receiver_fn = sclp_ocf_handler,
};
+void sclp_ocf_cpc_name_copy(char *dst)
+{
+ spin_lock_irq(&sclp_ocf_lock);
+ memcpy(dst, cpc_name, OCF_LENGTH_CPC_NAME);
+ spin_unlock_irq(&sclp_ocf_lock);
+}
+EXPORT_SYMBOL(sclp_ocf_cpc_name_copy);
+
static ssize_t cpc_name_show(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
{
- int rc;
+ char name[OCF_LENGTH_CPC_NAME + 1];
- spin_lock_irq(&sclp_ocf_lock);
- rc = snprintf(page, PAGE_SIZE, "%s\n", cpc_name);
- spin_unlock_irq(&sclp_ocf_lock);
- return rc;
+ sclp_ocf_cpc_name_copy(name);
+ name[OCF_LENGTH_CPC_NAME] = 0;
+ EBCASC(name, OCF_LENGTH_CPC_NAME);
+ return snprintf(page, PAGE_SIZE, "%s\n", name);
}
static struct kobj_attribute cpc_name_attr =
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index da0a524802cb..540da5149ba7 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -1,6 +1,5 @@
/*
- * Copyright (C) 2012 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
+ * Copyright (C) 2015, 2016 ARM Ltd.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -12,16 +11,10 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-
-#ifndef __ASM_ARM_KVM_VGIC_H
-#define __ASM_ARM_KVM_VGIC_H
-
-#ifdef CONFIG_KVM_NEW_VGIC
-#include <kvm/vgic/vgic.h>
-#else
+#ifndef __KVM_ARM_VGIC_H
+#define __KVM_ARM_VGIC_H
#include <linux/kernel.h>
#include <linux/kvm.h>
@@ -29,248 +22,187 @@
#include <linux/spinlock.h>
#include <linux/types.h>
#include <kvm/iodev.h>
-#include <linux/irqchip/arm-gic-common.h>
+#include <linux/list.h>
-#define VGIC_NR_IRQS_LEGACY 256
+#define VGIC_V3_MAX_CPUS 255
+#define VGIC_V2_MAX_CPUS 8
+#define VGIC_NR_IRQS_LEGACY 256
#define VGIC_NR_SGIS 16
#define VGIC_NR_PPIS 16
#define VGIC_NR_PRIVATE_IRQS (VGIC_NR_SGIS + VGIC_NR_PPIS)
+#define VGIC_MAX_PRIVATE (VGIC_NR_PRIVATE_IRQS - 1)
+#define VGIC_MAX_SPI 1019
+#define VGIC_MAX_RESERVED 1023
+#define VGIC_MIN_LPI 8192
-#define VGIC_V2_MAX_LRS (1 << 6)
-#define VGIC_V3_MAX_LRS 16
-#define VGIC_MAX_IRQS 1024
-#define VGIC_V2_MAX_CPUS 8
-#define VGIC_V3_MAX_CPUS 255
+enum vgic_type {
+ VGIC_V2, /* Good ol' GICv2 */
+ VGIC_V3, /* New fancy GICv3 */
+};
-#if (VGIC_NR_IRQS_LEGACY & 31)
-#error "VGIC_NR_IRQS must be a multiple of 32"
-#endif
+/* same for all guests, as depending only on the _host's_ GIC model */
+struct vgic_global {
+ /* type of the host GIC */
+ enum vgic_type type;
-#if (VGIC_NR_IRQS_LEGACY > VGIC_MAX_IRQS)
-#error "VGIC_NR_IRQS must be <= 1024"
-#endif
+ /* Physical address of vgic virtual cpu interface */
+ phys_addr_t vcpu_base;
-/*
- * The GIC distributor registers describing interrupts have two parts:
- * - 32 per-CPU interrupts (SGI + PPI)
- * - a bunch of shared interrupts (SPI)
- */
-struct vgic_bitmap {
- /*
- * - One UL per VCPU for private interrupts (assumes UL is at
- * least 32 bits)
- * - As many UL as necessary for shared interrupts.
- *
- * The private interrupts are accessed via the "private"
- * field, one UL per vcpu (the state for vcpu n is in
- * private[n]). The shared interrupts are accessed via the
- * "shared" pointer (IRQn state is at bit n-32 in the bitmap).
- */
- unsigned long *private;
- unsigned long *shared;
-};
+ /* virtual control interface mapping */
+ void __iomem *vctrl_base;
-struct vgic_bytemap {
- /*
- * - 8 u32 per VCPU for private interrupts
- * - As many u32 as necessary for shared interrupts.
- *
- * The private interrupts are accessed via the "private"
- * field, (the state for vcpu n is in private[n*8] to
- * private[n*8 + 7]). The shared interrupts are accessed via
- * the "shared" pointer (IRQn state is at byte (n-32)%4 of the
- * shared[(n-32)/4] word).
- */
- u32 *private;
- u32 *shared;
-};
+ /* Number of implemented list registers */
+ int nr_lr;
-struct kvm_vcpu;
+ /* Maintenance IRQ number */
+ unsigned int maint_irq;
-enum vgic_type {
- VGIC_V2, /* Good ol' GICv2 */
- VGIC_V3, /* New fancy GICv3 */
+ /* maximum number of VCPUs allowed (GICv2 limits us to 8) */
+ int max_gic_vcpus;
+
+ /* Only needed for the legacy KVM_CREATE_IRQCHIP */
+ bool can_emulate_gicv2;
};
-#define LR_STATE_PENDING (1 << 0)
-#define LR_STATE_ACTIVE (1 << 1)
-#define LR_STATE_MASK (3 << 0)
-#define LR_EOI_INT (1 << 2)
-#define LR_HW (1 << 3)
+extern struct vgic_global kvm_vgic_global_state;
-struct vgic_lr {
- unsigned irq:10;
- union {
- unsigned hwirq:10;
- unsigned source:3;
- };
- unsigned state:4;
-};
+#define VGIC_V2_MAX_LRS (1 << 6)
+#define VGIC_V3_MAX_LRS 16
+#define VGIC_V3_LR_INDEX(lr) (VGIC_V3_MAX_LRS - 1 - lr)
-struct vgic_vmcr {
- u32 ctlr;
- u32 abpr;
- u32 bpr;
- u32 pmr;
+enum vgic_irq_config {
+ VGIC_CONFIG_EDGE = 0,
+ VGIC_CONFIG_LEVEL
};
-struct vgic_ops {
- struct vgic_lr (*get_lr)(const struct kvm_vcpu *, int);
- void (*set_lr)(struct kvm_vcpu *, int, struct vgic_lr);
- u64 (*get_elrsr)(const struct kvm_vcpu *vcpu);
- u64 (*get_eisr)(const struct kvm_vcpu *vcpu);
- void (*clear_eisr)(struct kvm_vcpu *vcpu);
- u32 (*get_interrupt_status)(const struct kvm_vcpu *vcpu);
- void (*enable_underflow)(struct kvm_vcpu *vcpu);
- void (*disable_underflow)(struct kvm_vcpu *vcpu);
- void (*get_vmcr)(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
- void (*set_vmcr)(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
- void (*enable)(struct kvm_vcpu *vcpu);
+struct vgic_irq {
+ spinlock_t irq_lock; /* Protects the content of the struct */
+ struct list_head lpi_list; /* Used to link all LPIs together */
+ struct list_head ap_list;
+
+ struct kvm_vcpu *vcpu; /* SGIs and PPIs: The VCPU
+ * SPIs and LPIs: The VCPU whose ap_list
+ * this is queued on.
+ */
+
+ struct kvm_vcpu *target_vcpu; /* The VCPU that this interrupt should
+ * be sent to, as a result of the
+ * targets reg (v2) or the
+ * affinity reg (v3).
+ */
+
+ u32 intid; /* Guest visible INTID */
+ bool pending;
+ bool line_level; /* Level only */
+ bool soft_pending; /* Level only */
+ bool active; /* not used for LPIs */
+ bool enabled;
+ bool hw; /* Tied to HW IRQ */
+ struct kref refcount; /* Used for LPIs */
+ u32 hwintid; /* HW INTID number */
+ union {
+ u8 targets; /* GICv2 target VCPUs mask */
+ u32 mpidr; /* GICv3 target VCPU */
+ };
+ u8 source; /* GICv2 SGIs only */
+ u8 priority;
+ enum vgic_irq_config config; /* Level or edge */
};
-struct vgic_params {
- /* vgic type */
- enum vgic_type type;
- /* Physical address of vgic virtual cpu interface */
- phys_addr_t vcpu_base;
- /* Number of list registers */
- u32 nr_lr;
- /* Interrupt number */
- unsigned int maint_irq;
- /* Virtual control interface base address */
- void __iomem *vctrl_base;
- int max_gic_vcpus;
- /* Only needed for the legacy KVM_CREATE_IRQCHIP */
- bool can_emulate_gicv2;
-};
+struct vgic_register_region;
+struct vgic_its;
-struct vgic_vm_ops {
- bool (*queue_sgi)(struct kvm_vcpu *, int irq);
- void (*add_sgi_source)(struct kvm_vcpu *, int irq, int source);
- int (*init_model)(struct kvm *);
- int (*map_resources)(struct kvm *, const struct vgic_params *);
+enum iodev_type {
+ IODEV_CPUIF,
+ IODEV_DIST,
+ IODEV_REDIST,
+ IODEV_ITS
};
struct vgic_io_device {
- gpa_t addr;
- int len;
- const struct vgic_io_range *reg_ranges;
- struct kvm_vcpu *redist_vcpu;
+ gpa_t base_addr;
+ union {
+ struct kvm_vcpu *redist_vcpu;
+ struct vgic_its *its;
+ };
+ const struct vgic_register_region *regions;
+ enum iodev_type iodev_type;
+ int nr_regions;
struct kvm_io_device dev;
};
-struct irq_phys_map {
- u32 virt_irq;
- u32 phys_irq;
-};
-
-struct irq_phys_map_entry {
- struct list_head entry;
- struct rcu_head rcu;
- struct irq_phys_map map;
+struct vgic_its {
+ /* The base address of the ITS control register frame */
+ gpa_t vgic_its_base;
+
+ bool enabled;
+ bool initialized;
+ struct vgic_io_device iodev;
+ struct kvm_device *dev;
+
+ /* These registers correspond to GITS_BASER{0,1} */
+ u64 baser_device_table;
+ u64 baser_coll_table;
+
+ /* Protects the command queue */
+ struct mutex cmd_lock;
+ u64 cbaser;
+ u32 creadr;
+ u32 cwriter;
+
+ /* Protects the device and collection lists */
+ struct mutex its_lock;
+ struct list_head device_list;
+ struct list_head collection_list;
};
struct vgic_dist {
- spinlock_t lock;
bool in_kernel;
bool ready;
+ bool initialized;
/* vGIC model the kernel emulates for the guest (GICv2 or GICv3) */
u32 vgic_model;
- int nr_cpus;
- int nr_irqs;
+ /* Do injected MSIs require an additional device ID? */
+ bool msis_require_devid;
+
+ int nr_spis;
+ /* TODO: Consider moving to global state */
/* Virtual control interface mapping */
void __iomem *vctrl_base;
- /* Distributor and vcpu interface mapping in the guest */
- phys_addr_t vgic_dist_base;
- /* GICv2 and GICv3 use different mapped register blocks */
+ /* base addresses in guest physical address space: */
+ gpa_t vgic_dist_base; /* distributor */
union {
- phys_addr_t vgic_cpu_base;
- phys_addr_t vgic_redist_base;
+ /* either a GICv2 CPU interface */
+ gpa_t vgic_cpu_base;
+ /* or a number of GICv3 redistributor regions */
+ gpa_t vgic_redist_base;
};
- /* Distributor enabled */
- u32 enabled;
-
- /* Interrupt enabled (one bit per IRQ) */
- struct vgic_bitmap irq_enabled;
-
- /* Level-triggered interrupt external input is asserted */
- struct vgic_bitmap irq_level;
-
- /*
- * Interrupt state is pending on the distributor
- */
- struct vgic_bitmap irq_pending;
-
- /*
- * Tracks writes to GICD_ISPENDRn and GICD_ICPENDRn for level-triggered
- * interrupts. Essentially holds the state of the flip-flop in
- * Figure 4-10 on page 4-101 in ARM IHI 0048B.b.
- * Once set, it is only cleared for level-triggered interrupts on
- * guest ACKs (when we queue it) or writes to GICD_ICPENDRn.
- */
- struct vgic_bitmap irq_soft_pend;
-
- /* Level-triggered interrupt queued on VCPU interface */
- struct vgic_bitmap irq_queued;
-
- /* Interrupt was active when unqueue from VCPU interface */
- struct vgic_bitmap irq_active;
-
- /* Interrupt priority. Not used yet. */
- struct vgic_bytemap irq_priority;
+ /* distributor enabled */
+ bool enabled;
- /* Level/edge triggered */
- struct vgic_bitmap irq_cfg;
+ struct vgic_irq *spis;
- /*
- * Source CPU per SGI and target CPU:
- *
- * Each byte represent a SGI observable on a VCPU, each bit of
- * this byte indicating if the corresponding VCPU has
- * generated this interrupt. This is a GICv2 feature only.
- *
- * For VCPUn (n < 8), irq_sgi_sources[n*16] to [n*16 + 15] are
- * the SGIs observable on VCPUn.
- */
- u8 *irq_sgi_sources;
+ struct vgic_io_device dist_iodev;
- /*
- * Target CPU for each SPI:
- *
- * Array of available SPI, each byte indicating the target
- * VCPU for SPI. IRQn (n >=32) is at irq_spi_cpu[n-32].
- */
- u8 *irq_spi_cpu;
+ bool has_its;
/*
- * Reverse lookup of irq_spi_cpu for faster compute pending:
- *
- * Array of bitmaps, one per VCPU, describing if IRQn is
- * routed to a particular VCPU.
+ * Contains the attributes and gpa of the LPI configuration table.
+ * Since we report GICR_TYPER.CommonLPIAff as 0b00, we can share
+ * one address across all redistributors.
+ * GICv3 spec: 6.1.2 "LPI Configuration tables"
*/
- struct vgic_bitmap *irq_spi_target;
-
- /* Target MPIDR for each IRQ (needed for GICv3 IROUTERn) only */
- u32 *irq_spi_mpidr;
+ u64 propbaser;
- /* Bitmap indicating which CPU has something pending */
- unsigned long *irq_pending_on_cpu;
-
- /* Bitmap indicating which CPU has active IRQs */
- unsigned long *irq_active_on_cpu;
-
- struct vgic_vm_ops vm_ops;
- struct vgic_io_device dist_iodev;
- struct vgic_io_device *redist_iodevs;
-
- /* Virtual irq to hwirq mapping */
- spinlock_t irq_phys_map_lock;
- struct list_head irq_phys_map_list;
+ /* Protects the lpi_list and the count value below. */
+ spinlock_t lpi_list_lock;
+ struct list_head lpi_list_head;
+ int lpi_list_count;
};
struct vgic_v2_cpu_if {
@@ -298,78 +230,88 @@ struct vgic_v3_cpu_if {
};
struct vgic_cpu {
- /* Pending/active/both interrupts on this VCPU */
- DECLARE_BITMAP(pending_percpu, VGIC_NR_PRIVATE_IRQS);
- DECLARE_BITMAP(active_percpu, VGIC_NR_PRIVATE_IRQS);
- DECLARE_BITMAP(pend_act_percpu, VGIC_NR_PRIVATE_IRQS);
-
- /* Pending/active/both shared interrupts, dynamically sized */
- unsigned long *pending_shared;
- unsigned long *active_shared;
- unsigned long *pend_act_shared;
-
/* CPU vif control registers for world switch */
union {
struct vgic_v2_cpu_if vgic_v2;
struct vgic_v3_cpu_if vgic_v3;
};
- /* Protected by the distributor's irq_phys_map_lock */
- struct list_head irq_phys_map_list;
+ unsigned int used_lrs;
+ struct vgic_irq private_irqs[VGIC_NR_PRIVATE_IRQS];
- u64 live_lrs;
-};
+ spinlock_t ap_list_lock; /* Protects the ap_list */
+
+ /*
+ * List of IRQs that this VCPU should consider because they are either
+ * Active or Pending (hence the name; AP list), or because they recently
+ * were one of the two and need to be migrated off this list to another
+ * VCPU.
+ */
+ struct list_head ap_list_head;
-#define LR_EMPTY 0xff
+ u64 live_lrs;
-#define INT_STATUS_EOI (1 << 0)
-#define INT_STATUS_UNDERFLOW (1 << 1)
+ /*
+ * Members below are used with GICv3 emulation only and represent
+ * parts of the redistributor.
+ */
+ struct vgic_io_device rd_iodev;
+ struct vgic_io_device sgi_iodev;
-struct kvm;
-struct kvm_vcpu;
+ /* Contains the attributes and gpa of the LPI pending tables. */
+ u64 pendbaser;
+
+ bool lpis_enabled;
+};
int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
-int kvm_vgic_hyp_init(void);
-int kvm_vgic_map_resources(struct kvm *kvm);
-int kvm_vgic_get_max_vcpus(void);
void kvm_vgic_early_init(struct kvm *kvm);
int kvm_vgic_create(struct kvm *kvm, u32 type);
void kvm_vgic_destroy(struct kvm *kvm);
void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu);
void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu);
-void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
-void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
-int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
+int kvm_vgic_map_resources(struct kvm *kvm);
+int kvm_vgic_hyp_init(void);
+
+int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
bool level);
-int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid,
- unsigned int virt_irq, bool level);
-void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
-int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
-int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int phys_irq);
+int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, unsigned int intid,
+ bool level);
+int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq);
int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq);
bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq);
+int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
+
#define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel))
-#define vgic_initialized(k) (!!((k)->arch.vgic.nr_cpus))
+#define vgic_initialized(k) ((k)->arch.vgic.initialized)
#define vgic_ready(k) ((k)->arch.vgic.ready)
#define vgic_valid_spi(k, i) (((i) >= VGIC_NR_PRIVATE_IRQS) && \
- ((i) < (k)->arch.vgic.nr_irqs))
+ ((i) < (k)->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS))
+
+bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu);
+void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
+void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
-int vgic_v2_probe(const struct gic_kvm_info *gic_kvm_info,
- const struct vgic_ops **ops,
- const struct vgic_params **params);
#ifdef CONFIG_KVM_ARM_VGIC_V3
-int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info,
- const struct vgic_ops **ops,
- const struct vgic_params **params);
+void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
#else
-static inline int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info,
- const struct vgic_ops **ops,
- const struct vgic_params **params)
+static inline void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
{
- return -ENODEV;
}
#endif
-#endif /* old VGIC include */
-#endif
+/**
+ * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
+ *
+ * The host's GIC naturally limits the maximum amount of VCPUs a guest
+ * can use.
+ */
+static inline int kvm_vgic_get_max_vcpus(void)
+{
+ return kvm_vgic_global_state.max_gic_vcpus;
+}
+
+int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi);
+
+#endif /* __KVM_ARM_VGIC_H */
diff --git a/include/kvm/vgic/vgic.h b/include/kvm/vgic/vgic.h
deleted file mode 100644
index 3fbd175265ae..000000000000
--- a/include/kvm/vgic/vgic.h
+++ /dev/null
@@ -1,246 +0,0 @@
-/*
- * Copyright (C) 2015, 2016 ARM Ltd.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-#ifndef __ASM_ARM_KVM_VGIC_VGIC_H
-#define __ASM_ARM_KVM_VGIC_VGIC_H
-
-#include <linux/kernel.h>
-#include <linux/kvm.h>
-#include <linux/irqreturn.h>
-#include <linux/spinlock.h>
-#include <linux/types.h>
-#include <kvm/iodev.h>
-
-#define VGIC_V3_MAX_CPUS 255
-#define VGIC_V2_MAX_CPUS 8
-#define VGIC_NR_IRQS_LEGACY 256
-#define VGIC_NR_SGIS 16
-#define VGIC_NR_PPIS 16
-#define VGIC_NR_PRIVATE_IRQS (VGIC_NR_SGIS + VGIC_NR_PPIS)
-#define VGIC_MAX_PRIVATE (VGIC_NR_PRIVATE_IRQS - 1)
-#define VGIC_MAX_SPI 1019
-#define VGIC_MAX_RESERVED 1023
-#define VGIC_MIN_LPI 8192
-
-enum vgic_type {
- VGIC_V2, /* Good ol' GICv2 */
- VGIC_V3, /* New fancy GICv3 */
-};
-
-/* same for all guests, as depending only on the _host's_ GIC model */
-struct vgic_global {
- /* type of the host GIC */
- enum vgic_type type;
-
- /* Physical address of vgic virtual cpu interface */
- phys_addr_t vcpu_base;
-
- /* virtual control interface mapping */
- void __iomem *vctrl_base;
-
- /* Number of implemented list registers */
- int nr_lr;
-
- /* Maintenance IRQ number */
- unsigned int maint_irq;
-
- /* maximum number of VCPUs allowed (GICv2 limits us to 8) */
- int max_gic_vcpus;
-
- /* Only needed for the legacy KVM_CREATE_IRQCHIP */
- bool can_emulate_gicv2;
-};
-
-extern struct vgic_global kvm_vgic_global_state;
-
-#define VGIC_V2_MAX_LRS (1 << 6)
-#define VGIC_V3_MAX_LRS 16
-#define VGIC_V3_LR_INDEX(lr) (VGIC_V3_MAX_LRS - 1 - lr)
-
-enum vgic_irq_config {
- VGIC_CONFIG_EDGE = 0,
- VGIC_CONFIG_LEVEL
-};
-
-struct vgic_irq {
- spinlock_t irq_lock; /* Protects the content of the struct */
- struct list_head ap_list;
-
- struct kvm_vcpu *vcpu; /* SGIs and PPIs: The VCPU
- * SPIs and LPIs: The VCPU whose ap_list
- * this is queued on.
- */
-
- struct kvm_vcpu *target_vcpu; /* The VCPU that this interrupt should
- * be sent to, as a result of the
- * targets reg (v2) or the
- * affinity reg (v3).
- */
-
- u32 intid; /* Guest visible INTID */
- bool pending;
- bool line_level; /* Level only */
- bool soft_pending; /* Level only */
- bool active; /* not used for LPIs */
- bool enabled;
- bool hw; /* Tied to HW IRQ */
- u32 hwintid; /* HW INTID number */
- union {
- u8 targets; /* GICv2 target VCPUs mask */
- u32 mpidr; /* GICv3 target VCPU */
- };
- u8 source; /* GICv2 SGIs only */
- u8 priority;
- enum vgic_irq_config config; /* Level or edge */
-};
-
-struct vgic_register_region;
-
-struct vgic_io_device {
- gpa_t base_addr;
- struct kvm_vcpu *redist_vcpu;
- const struct vgic_register_region *regions;
- int nr_regions;
- struct kvm_io_device dev;
-};
-
-struct vgic_dist {
- bool in_kernel;
- bool ready;
- bool initialized;
-
- /* vGIC model the kernel emulates for the guest (GICv2 or GICv3) */
- u32 vgic_model;
-
- int nr_spis;
-
- /* TODO: Consider moving to global state */
- /* Virtual control interface mapping */
- void __iomem *vctrl_base;
-
- /* base addresses in guest physical address space: */
- gpa_t vgic_dist_base; /* distributor */
- union {
- /* either a GICv2 CPU interface */
- gpa_t vgic_cpu_base;
- /* or a number of GICv3 redistributor regions */
- gpa_t vgic_redist_base;
- };
-
- /* distributor enabled */
- bool enabled;
-
- struct vgic_irq *spis;
-
- struct vgic_io_device dist_iodev;
- struct vgic_io_device *redist_iodevs;
-};
-
-struct vgic_v2_cpu_if {
- u32 vgic_hcr;
- u32 vgic_vmcr;
- u32 vgic_misr; /* Saved only */
- u64 vgic_eisr; /* Saved only */
- u64 vgic_elrsr; /* Saved only */
- u32 vgic_apr;
- u32 vgic_lr[VGIC_V2_MAX_LRS];
-};
-
-struct vgic_v3_cpu_if {
-#ifdef CONFIG_KVM_ARM_VGIC_V3
- u32 vgic_hcr;
- u32 vgic_vmcr;
- u32 vgic_sre; /* Restored only, change ignored */
- u32 vgic_misr; /* Saved only */
- u32 vgic_eisr; /* Saved only */
- u32 vgic_elrsr; /* Saved only */
- u32 vgic_ap0r[4];
- u32 vgic_ap1r[4];
- u64 vgic_lr[VGIC_V3_MAX_LRS];
-#endif
-};
-
-struct vgic_cpu {
- /* CPU vif control registers for world switch */
- union {
- struct vgic_v2_cpu_if vgic_v2;
- struct vgic_v3_cpu_if vgic_v3;
- };
-
- unsigned int used_lrs;
- struct vgic_irq private_irqs[VGIC_NR_PRIVATE_IRQS];
-
- spinlock_t ap_list_lock; /* Protects the ap_list */
-
- /*
- * List of IRQs that this VCPU should consider because they are either
- * Active or Pending (hence the name; AP list), or because they recently
- * were one of the two and need to be migrated off this list to another
- * VCPU.
- */
- struct list_head ap_list_head;
-
- u64 live_lrs;
-};
-
-int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
-void kvm_vgic_early_init(struct kvm *kvm);
-int kvm_vgic_create(struct kvm *kvm, u32 type);
-void kvm_vgic_destroy(struct kvm *kvm);
-void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu);
-void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu);
-int kvm_vgic_map_resources(struct kvm *kvm);
-int kvm_vgic_hyp_init(void);
-
-int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
- bool level);
-int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, unsigned int intid,
- bool level);
-int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq);
-int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq);
-bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq);
-
-int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
-
-#define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel))
-#define vgic_initialized(k) ((k)->arch.vgic.initialized)
-#define vgic_ready(k) ((k)->arch.vgic.ready)
-#define vgic_valid_spi(k, i) (((i) >= VGIC_NR_PRIVATE_IRQS) && \
- ((i) < (k)->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS))
-
-bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu);
-void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
-void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
-
-#ifdef CONFIG_KVM_ARM_VGIC_V3
-void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
-#else
-static inline void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
-{
-}
-#endif
-
-/**
- * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
- *
- * The host's GIC naturally limits the maximum amount of VCPUs a guest
- * can use.
- */
-static inline int kvm_vgic_get_max_vcpus(void)
-{
- return kvm_vgic_global_state.max_gic_vcpus;
-}
-
-#endif /* __ASM_ARM_KVM_VGIC_VGIC_H */
diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index d9aef2a0ec8e..c78fc27418f2 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -99,7 +99,8 @@ static inline void context_tracking_init(void) { }
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static inline void guest_enter(void)
+/* must be called with irqs disabled */
+static inline void guest_enter_irqoff(void)
{
if (vtime_accounting_cpu_enabled())
vtime_guest_enter(current);
@@ -108,9 +109,19 @@ static inline void guest_enter(void)
if (context_tracking_is_enabled())
__context_tracking_enter(CONTEXT_GUEST);
+
+ /* KVM does not hold any references to rcu protected data when it
+ * switches CPU into a guest mode. In fact switching to a guest mode
+ * is very similar to exiting to userspace from rcu point of view. In
+ * addition CPU may stay in a guest mode for quite a long time (up to
+ * one time slice). Lets treat guest mode as quiescent state, just like
+ * we do with user-mode execution.
+ */
+ if (!context_tracking_cpu_is_enabled())
+ rcu_virt_note_context_switch(smp_processor_id());
}
-static inline void guest_exit(void)
+static inline void guest_exit_irqoff(void)
{
if (context_tracking_is_enabled())
__context_tracking_exit(CONTEXT_GUEST);
@@ -122,7 +133,7 @@ static inline void guest_exit(void)
}
#else
-static inline void guest_enter(void)
+static inline void guest_enter_irqoff(void)
{
/*
* This is running in ioctl context so its safe
@@ -131,9 +142,10 @@ static inline void guest_enter(void)
*/
vtime_account_system(current);
current->flags |= PF_VCPU;
+ rcu_virt_note_context_switch(smp_processor_id());
}
-static inline void guest_exit(void)
+static inline void guest_exit_irqoff(void)
{
/* Flush the guest cputime we spent on the guest */
vtime_account_system(current);
@@ -141,4 +153,22 @@ static inline void guest_exit(void)
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
+static inline void guest_enter(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ guest_enter_irqoff();
+ local_irq_restore(flags);
+}
+
+static inline void guest_exit(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ guest_exit_irqoff();
+ local_irq_restore(flags);
+}
+
#endif
diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index 107eed475b94..56b0b7ec66aa 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -112,34 +112,76 @@
#define GICR_WAKER_ProcessorSleep (1U << 1)
#define GICR_WAKER_ChildrenAsleep (1U << 2)
-#define GICR_PROPBASER_NonShareable (0U << 10)
-#define GICR_PROPBASER_InnerShareable (1U << 10)
-#define GICR_PROPBASER_OuterShareable (2U << 10)
-#define GICR_PROPBASER_SHAREABILITY_MASK (3UL << 10)
-#define GICR_PROPBASER_nCnB (0U << 7)
-#define GICR_PROPBASER_nC (1U << 7)
-#define GICR_PROPBASER_RaWt (2U << 7)
-#define GICR_PROPBASER_RaWb (3U << 7)
-#define GICR_PROPBASER_WaWt (4U << 7)
-#define GICR_PROPBASER_WaWb (5U << 7)
-#define GICR_PROPBASER_RaWaWt (6U << 7)
-#define GICR_PROPBASER_RaWaWb (7U << 7)
-#define GICR_PROPBASER_CACHEABILITY_MASK (7U << 7)
-#define GICR_PROPBASER_IDBITS_MASK (0x1f)
-
-#define GICR_PENDBASER_NonShareable (0U << 10)
-#define GICR_PENDBASER_InnerShareable (1U << 10)
-#define GICR_PENDBASER_OuterShareable (2U << 10)
-#define GICR_PENDBASER_SHAREABILITY_MASK (3UL << 10)
-#define GICR_PENDBASER_nCnB (0U << 7)
-#define GICR_PENDBASER_nC (1U << 7)
-#define GICR_PENDBASER_RaWt (2U << 7)
-#define GICR_PENDBASER_RaWb (3U << 7)
-#define GICR_PENDBASER_WaWt (4U << 7)
-#define GICR_PENDBASER_WaWb (5U << 7)
-#define GICR_PENDBASER_RaWaWt (6U << 7)
-#define GICR_PENDBASER_RaWaWb (7U << 7)
-#define GICR_PENDBASER_CACHEABILITY_MASK (7U << 7)
+#define GIC_BASER_CACHE_nCnB 0ULL
+#define GIC_BASER_CACHE_SameAsInner 0ULL
+#define GIC_BASER_CACHE_nC 1ULL
+#define GIC_BASER_CACHE_RaWt 2ULL
+#define GIC_BASER_CACHE_RaWb 3ULL
+#define GIC_BASER_CACHE_WaWt 4ULL
+#define GIC_BASER_CACHE_WaWb 5ULL
+#define GIC_BASER_CACHE_RaWaWt 6ULL
+#define GIC_BASER_CACHE_RaWaWb 7ULL
+#define GIC_BASER_CACHE_MASK 7ULL
+#define GIC_BASER_NonShareable 0ULL
+#define GIC_BASER_InnerShareable 1ULL
+#define GIC_BASER_OuterShareable 2ULL
+#define GIC_BASER_SHAREABILITY_MASK 3ULL
+
+#define GIC_BASER_CACHEABILITY(reg, inner_outer, type) \
+ (GIC_BASER_CACHE_##type << reg##_##inner_outer##_CACHEABILITY_SHIFT)
+
+#define GIC_BASER_SHAREABILITY(reg, type) \
+ (GIC_BASER_##type << reg##_SHAREABILITY_SHIFT)
+
+#define GICR_PROPBASER_SHAREABILITY_SHIFT (10)
+#define GICR_PROPBASER_INNER_CACHEABILITY_SHIFT (7)
+#define GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT (56)
+#define GICR_PROPBASER_SHAREABILITY_MASK \
+ GIC_BASER_SHAREABILITY(GICR_PROPBASER, SHAREABILITY_MASK)
+#define GICR_PROPBASER_INNER_CACHEABILITY_MASK \
+ GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, MASK)
+#define GICR_PROPBASER_OUTER_CACHEABILITY_MASK \
+ GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, MASK)
+#define GICR_PROPBASER_CACHEABILITY_MASK GICR_PROPBASER_INNER_CACHEABILITY_MASK
+
+#define GICR_PROPBASER_InnerShareable \
+ GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable)
+
+#define GICR_PROPBASER_nCnB GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nCnB)
+#define GICR_PROPBASER_nC GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nC)
+#define GICR_PROPBASER_RaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWt)
+#define GICR_PROPBASER_RaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWt)
+#define GICR_PROPBASER_WaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWt)
+#define GICR_PROPBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWb)
+#define GICR_PROPBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWt)
+#define GICR_PROPBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWb)
+
+#define GICR_PROPBASER_IDBITS_MASK (0x1f)
+
+#define GICR_PENDBASER_SHAREABILITY_SHIFT (10)
+#define GICR_PENDBASER_INNER_CACHEABILITY_SHIFT (7)
+#define GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT (56)
+#define GICR_PENDBASER_SHAREABILITY_MASK \
+ GIC_BASER_SHAREABILITY(GICR_PENDBASER, SHAREABILITY_MASK)
+#define GICR_PENDBASER_INNER_CACHEABILITY_MASK \
+ GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, MASK)
+#define GICR_PENDBASER_OUTER_CACHEABILITY_MASK \
+ GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, MASK)
+#define GICR_PENDBASER_CACHEABILITY_MASK GICR_PENDBASER_INNER_CACHEABILITY_MASK
+
+#define GICR_PENDBASER_InnerShareable \
+ GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable)
+
+#define GICR_PENDBASER_nCnB GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nCnB)
+#define GICR_PENDBASER_nC GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nC)
+#define GICR_PENDBASER_RaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWt)
+#define GICR_PENDBASER_RaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWt)
+#define GICR_PENDBASER_WaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWt)
+#define GICR_PENDBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWb)
+#define GICR_PENDBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWt)
+#define GICR_PENDBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWb)
+
+#define GICR_PENDBASER_PTZ BIT_ULL(62)
/*
* Re-Distributor registers, offsets from SGI_base
@@ -175,54 +217,83 @@
#define GITS_CWRITER 0x0088
#define GITS_CREADR 0x0090
#define GITS_BASER 0x0100
+#define GITS_IDREGS_BASE 0xffd0
+#define GITS_PIDR0 0xffe0
+#define GITS_PIDR1 0xffe4
#define GITS_PIDR2 GICR_PIDR2
+#define GITS_PIDR4 0xffd0
+#define GITS_CIDR0 0xfff0
+#define GITS_CIDR1 0xfff4
+#define GITS_CIDR2 0xfff8
+#define GITS_CIDR3 0xfffc
#define GITS_TRANSLATER 0x10040
#define GITS_CTLR_ENABLE (1U << 0)
#define GITS_CTLR_QUIESCENT (1U << 31)
+#define GITS_TYPER_PLPIS (1UL << 0)
+#define GITS_TYPER_IDBITS_SHIFT 8
#define GITS_TYPER_DEVBITS_SHIFT 13
#define GITS_TYPER_DEVBITS(r) ((((r) >> GITS_TYPER_DEVBITS_SHIFT) & 0x1f) + 1)
#define GITS_TYPER_PTA (1UL << 19)
-
-#define GITS_CBASER_VALID (1UL << 63)
-#define GITS_CBASER_nCnB (0UL << 59)
-#define GITS_CBASER_nC (1UL << 59)
-#define GITS_CBASER_RaWt (2UL << 59)
-#define GITS_CBASER_RaWb (3UL << 59)
-#define GITS_CBASER_WaWt (4UL << 59)
-#define GITS_CBASER_WaWb (5UL << 59)
-#define GITS_CBASER_RaWaWt (6UL << 59)
-#define GITS_CBASER_RaWaWb (7UL << 59)
-#define GITS_CBASER_CACHEABILITY_MASK (7UL << 59)
-#define GITS_CBASER_NonShareable (0UL << 10)
-#define GITS_CBASER_InnerShareable (1UL << 10)
-#define GITS_CBASER_OuterShareable (2UL << 10)
-#define GITS_CBASER_SHAREABILITY_MASK (3UL << 10)
+#define GITS_TYPER_HWCOLLCNT_SHIFT 24
+
+#define GITS_CBASER_VALID (1UL << 63)
+#define GITS_CBASER_SHAREABILITY_SHIFT (10)
+#define GITS_CBASER_INNER_CACHEABILITY_SHIFT (59)
+#define GITS_CBASER_OUTER_CACHEABILITY_SHIFT (53)
+#define GITS_CBASER_SHAREABILITY_MASK \
+ GIC_BASER_SHAREABILITY(GITS_CBASER, SHAREABILITY_MASK)
+#define GITS_CBASER_INNER_CACHEABILITY_MASK \
+ GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, MASK)
+#define GITS_CBASER_OUTER_CACHEABILITY_MASK \
+ GIC_BASER_CACHEABILITY(GITS_CBASER, OUTER, MASK)
+#define GITS_CBASER_CACHEABILITY_MASK GITS_CBASER_INNER_CACHEABILITY_MASK
+
+#define GITS_CBASER_InnerShareable \
+ GIC_BASER_SHAREABILITY(GITS_CBASER, InnerShareable)
+
+#define GITS_CBASER_nCnB GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nCnB)
+#define GITS_CBASER_nC GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nC)
+#define GITS_CBASER_RaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWt)
+#define GITS_CBASER_RaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWt)
+#define GITS_CBASER_WaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWt)
+#define GITS_CBASER_WaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWb)
+#define GITS_CBASER_RaWaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWt)
+#define GITS_CBASER_RaWaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWb)
#define GITS_BASER_NR_REGS 8
-#define GITS_BASER_VALID (1UL << 63)
-#define GITS_BASER_INDIRECT (1UL << 62)
-#define GITS_BASER_nCnB (0UL << 59)
-#define GITS_BASER_nC (1UL << 59)
-#define GITS_BASER_RaWt (2UL << 59)
-#define GITS_BASER_RaWb (3UL << 59)
-#define GITS_BASER_WaWt (4UL << 59)
-#define GITS_BASER_WaWb (5UL << 59)
-#define GITS_BASER_RaWaWt (6UL << 59)
-#define GITS_BASER_RaWaWb (7UL << 59)
-#define GITS_BASER_CACHEABILITY_MASK (7UL << 59)
-#define GITS_BASER_TYPE_SHIFT (56)
+#define GITS_BASER_VALID (1UL << 63)
+#define GITS_BASER_INDIRECT (1ULL << 62)
+
+#define GITS_BASER_INNER_CACHEABILITY_SHIFT (59)
+#define GITS_BASER_OUTER_CACHEABILITY_SHIFT (53)
+#define GITS_BASER_INNER_CACHEABILITY_MASK \
+ GIC_BASER_CACHEABILITY(GITS_BASER, INNER, MASK)
+#define GITS_BASER_CACHEABILITY_MASK GITS_BASER_INNER_CACHEABILITY_MASK
+#define GITS_BASER_OUTER_CACHEABILITY_MASK \
+ GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, MASK)
+#define GITS_BASER_SHAREABILITY_MASK \
+ GIC_BASER_SHAREABILITY(GITS_BASER, SHAREABILITY_MASK)
+
+#define GITS_BASER_nCnB GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nCnB)
+#define GITS_BASER_nC GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nC)
+#define GITS_BASER_RaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWt)
+#define GITS_BASER_RaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWt)
+#define GITS_BASER_WaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWt)
+#define GITS_BASER_WaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWb)
+#define GITS_BASER_RaWaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWt)
+#define GITS_BASER_RaWaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWb)
+
+#define GITS_BASER_TYPE_SHIFT (56)
#define GITS_BASER_TYPE(r) (((r) >> GITS_BASER_TYPE_SHIFT) & 7)
-#define GITS_BASER_ENTRY_SIZE_SHIFT (48)
+#define GITS_BASER_ENTRY_SIZE_SHIFT (48)
#define GITS_BASER_ENTRY_SIZE(r) ((((r) >> GITS_BASER_ENTRY_SIZE_SHIFT) & 0xff) + 1)
-#define GITS_BASER_NonShareable (0UL << 10)
-#define GITS_BASER_InnerShareable (1UL << 10)
-#define GITS_BASER_OuterShareable (2UL << 10)
#define GITS_BASER_SHAREABILITY_SHIFT (10)
-#define GITS_BASER_SHAREABILITY_MASK (3UL << GITS_BASER_SHAREABILITY_SHIFT)
+#define GITS_BASER_InnerShareable \
+ GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable)
#define GITS_BASER_PAGE_SIZE_SHIFT (8)
#define GITS_BASER_PAGE_SIZE_4K (0UL << GITS_BASER_PAGE_SIZE_SHIFT)
#define GITS_BASER_PAGE_SIZE_16K (1UL << GITS_BASER_PAGE_SIZE_SHIFT)
@@ -230,6 +301,7 @@
#define GITS_BASER_PAGE_SIZE_MASK (3UL << GITS_BASER_PAGE_SIZE_SHIFT)
#define GITS_BASER_PAGES_MAX 256
#define GITS_BASER_PAGES_SHIFT (0)
+#define GITS_BASER_NR_PAGES(r) (((r) & 0xff) + 1)
#define GITS_BASER_TYPE_NONE 0
#define GITS_BASER_TYPE_DEVICE 1
@@ -247,7 +319,10 @@
*/
#define GITS_CMD_MAPD 0x08
#define GITS_CMD_MAPC 0x09
-#define GITS_CMD_MAPVI 0x0a
+#define GITS_CMD_MAPTI 0x0a
+/* older GIC documentation used MAPVI for this command */
+#define GITS_CMD_MAPVI GITS_CMD_MAPTI
+#define GITS_CMD_MAPI 0x0b
#define GITS_CMD_MOVI 0x01
#define GITS_CMD_DISCARD 0x0f
#define GITS_CMD_INV 0x0c
@@ -258,6 +333,22 @@
#define GITS_CMD_SYNC 0x05
/*
+ * ITS error numbers
+ */
+#define E_ITS_MOVI_UNMAPPED_INTERRUPT 0x010107
+#define E_ITS_MOVI_UNMAPPED_COLLECTION 0x010109
+#define E_ITS_CLEAR_UNMAPPED_INTERRUPT 0x010507
+#define E_ITS_MAPD_DEVICE_OOR 0x010801
+#define E_ITS_MAPC_PROCNUM_OOR 0x010902
+#define E_ITS_MAPC_COLLECTION_OOR 0x010903
+#define E_ITS_MAPTI_UNMAPPED_DEVICE 0x010a04
+#define E_ITS_MAPTI_PHYSICALID_OOR 0x010a06
+#define E_ITS_INV_UNMAPPED_INTERRUPT 0x010c07
+#define E_ITS_INVALL_UNMAPPED_COLLECTION 0x010d09
+#define E_ITS_MOVALL_PROCNUM_OOR 0x010e01
+#define E_ITS_DISCARD_UNMAPPED_INTERRUPT 0x010f07
+
+/*
* CPU interface registers
*/
#define ICC_CTLR_EL1_EOImode_drop_dir (0U << 1)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1c9c973a7dd9..aafd702f3e21 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -164,6 +164,8 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
int len, struct kvm_io_device *dev);
int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
struct kvm_io_device *dev);
+struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+ gpa_t addr);
#ifdef CONFIG_KVM_ASYNC_PF
struct kvm_async_pf {
@@ -371,7 +373,15 @@ struct kvm {
struct srcu_struct srcu;
struct srcu_struct irq_srcu;
struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+
+ /*
+ * created_vcpus is protected by kvm->lock, and is incremented
+ * at the beginning of KVM_CREATE_VCPU. online_vcpus is only
+ * incremented after storing the kvm_vcpu pointer in vcpus,
+ * and is accessed atomically.
+ */
atomic_t online_vcpus;
+ int created_vcpus;
int last_boosted_vcpu;
struct list_head vm_list;
struct mutex lock;
@@ -867,45 +877,6 @@ static inline void kvm_iommu_unmap_pages(struct kvm *kvm,
}
#endif
-/* must be called with irqs disabled */
-static inline void __kvm_guest_enter(void)
-{
- guest_enter();
- /* KVM does not hold any references to rcu protected data when it
- * switches CPU into a guest mode. In fact switching to a guest mode
- * is very similar to exiting to userspace from rcu point of view. In
- * addition CPU may stay in a guest mode for quite a long time (up to
- * one time slice). Lets treat guest mode as quiescent state, just like
- * we do with user-mode execution.
- */
- if (!context_tracking_cpu_is_enabled())
- rcu_virt_note_context_switch(smp_processor_id());
-}
-
-/* must be called with irqs disabled */
-static inline void __kvm_guest_exit(void)
-{
- guest_exit();
-}
-
-static inline void kvm_guest_enter(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __kvm_guest_enter();
- local_irq_restore(flags);
-}
-
-static inline void kvm_guest_exit(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __kvm_guest_exit();
- local_irq_restore(flags);
-}
-
/*
* search_memslots() and __gfn_to_memslot() are here because they are
* used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c.
@@ -1042,7 +1013,8 @@ int kvm_set_irq_routing(struct kvm *kvm,
const struct kvm_irq_routing_entry *entries,
unsigned nr,
unsigned flags);
-int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+int kvm_set_routing_entry(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue);
void kvm_free_irq_routing(struct kvm *kvm);
@@ -1097,12 +1069,6 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
#endif /* CONFIG_HAVE_KVM_EVENTFD */
-#ifdef CONFIG_KVM_APIC_ARCHITECTURE
-bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu);
-#else
-static inline bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { return true; }
-#endif
-
static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
{
/*
diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
index 8b5e0a9f2431..610e13271918 100644
--- a/include/linux/page_ref.h
+++ b/include/linux/page_ref.h
@@ -124,6 +124,15 @@ static inline int page_ref_sub_and_test(struct page *page, int nr)
return ret;
}
+static inline int page_ref_inc_return(struct page *page)
+{
+ int ret = atomic_inc_return(&page->_refcount);
+
+ if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return))
+ __page_ref_mod_and_return(page, 1, ret);
+ return ret;
+}
+
static inline int page_ref_dec_and_test(struct page *page)
{
int ret = atomic_dec_and_test(&page->_refcount);
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index f28292d73ddb..8ade3eb6c640 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -151,8 +151,9 @@ TRACE_EVENT(kvm_msi_set_irq,
__entry->data = data;
),
- TP_printk("dst %u vec %u (%s|%s|%s%s)",
- (u8)(__entry->address >> 12), (u8)__entry->data,
+ TP_printk("dst %llx vec %u (%s|%s|%s%s)",
+ (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00),
+ (u8)__entry->data,
__print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode),
(__entry->address & (1<<2)) ? "logical" : "physical",
(__entry->data & (1<<15)) ? "level" : "edge",
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 05ebf475104c..e98bb4cce639 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -866,6 +866,10 @@ struct kvm_ppc_smmu_info {
#define KVM_CAP_ARM_PMU_V3 126
#define KVM_CAP_VCPU_ATTRIBUTES 127
#define KVM_CAP_MAX_VCPU_ID 128
+#define KVM_CAP_X2APIC_API 129
+#define KVM_CAP_S390_USER_INSTR0 130
+#define KVM_CAP_MSI_DEVID 131
+#define KVM_CAP_PPC_HTM 132
#ifdef KVM_CAP_IRQ_ROUTING
@@ -1024,12 +1028,14 @@ struct kvm_one_reg {
__u64 addr;
};
+#define KVM_MSI_VALID_DEVID (1U << 0)
struct kvm_msi {
__u32 address_lo;
__u32 address_hi;
__u32 data;
__u32 flags;
- __u8 pad[16];
+ __u32 devid;
+ __u8 pad[12];
};
struct kvm_arm_device_addr {
@@ -1074,6 +1080,8 @@ enum kvm_device_type {
#define KVM_DEV_TYPE_FLIC KVM_DEV_TYPE_FLIC
KVM_DEV_TYPE_ARM_VGIC_V3,
#define KVM_DEV_TYPE_ARM_VGIC_V3 KVM_DEV_TYPE_ARM_VGIC_V3
+ KVM_DEV_TYPE_ARM_VGIC_ITS,
+#define KVM_DEV_TYPE_ARM_VGIC_ITS KVM_DEV_TYPE_ARM_VGIC_ITS
KVM_DEV_TYPE_MAX,
};
@@ -1313,4 +1321,7 @@ struct kvm_assigned_msix_entry {
__u16 padding[3];
};
+#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0)
+#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1)
+
#endif /* __LINUX_KVM_H */
diff --git a/mm/gup.c b/mm/gup.c
index 547741f5f7a7..96b2b2fd0fbd 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -723,6 +723,7 @@ retry:
}
return 0;
}
+EXPORT_SYMBOL_GPL(fixup_user_fault);
static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
struct mm_struct *mm,
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index e5d6108f5e85..b0cc1a34db27 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -16,9 +16,6 @@ config HAVE_KVM_EVENTFD
bool
select EVENTFD
-config KVM_APIC_ARCHITECTURE
- bool
-
config KVM_MMIO
bool
diff --git a/virt/kvm/arm/hyp/vgic-v2-sr.c b/virt/kvm/arm/hyp/vgic-v2-sr.c
index 3a3a699b7489..7cffd9338c49 100644
--- a/virt/kvm/arm/hyp/vgic-v2-sr.c
+++ b/virt/kvm/arm/hyp/vgic-v2-sr.c
@@ -21,18 +21,11 @@
#include <asm/kvm_hyp.h>
-#ifdef CONFIG_KVM_NEW_VGIC
-extern struct vgic_global kvm_vgic_global_state;
-#define vgic_v2_params kvm_vgic_global_state
-#else
-extern struct vgic_params vgic_v2_params;
-#endif
-
static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu,
void __iomem *base)
{
struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
- int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
+ int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr;
u32 eisr0, eisr1;
int i;
bool expect_mi;
@@ -74,7 +67,7 @@ static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu,
static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base)
{
struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
- int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
+ int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr;
u32 elrsr0, elrsr1;
elrsr0 = readl_relaxed(base + GICH_ELRSR0);
@@ -93,7 +86,7 @@ static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base)
static void __hyp_text save_lrs(struct kvm_vcpu *vcpu, void __iomem *base)
{
struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
- int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
+ int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr;
int i;
for (i = 0; i < nr_lr; i++) {
@@ -147,7 +140,7 @@ void __hyp_text __vgic_v2_restore_state(struct kvm_vcpu *vcpu)
struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
struct vgic_dist *vgic = &kvm->arch.vgic;
void __iomem *base = kern_hyp_va(vgic->vctrl_base);
- int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
+ int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr;
int i;
u64 live_lrs = 0;
diff --git a/virt/kvm/arm/vgic-v2-emul.c b/virt/kvm/arm/vgic-v2-emul.c
deleted file mode 100644
index 1b0bee095427..000000000000
--- a/virt/kvm/arm/vgic-v2-emul.c
+++ /dev/null
@@ -1,856 +0,0 @@
-/*
- * Contains GICv2 specific emulation code, was in vgic.c before.
- *
- * Copyright (C) 2012 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-#include <linux/uaccess.h>
-
-#include <linux/irqchip/arm-gic.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-
-#include "vgic.h"
-
-#define GICC_ARCH_VERSION_V2 0x2
-
-static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg);
-static u8 *vgic_get_sgi_sources(struct vgic_dist *dist, int vcpu_id, int sgi)
-{
- return dist->irq_sgi_sources + vcpu_id * VGIC_NR_SGIS + sgi;
-}
-
-static bool handle_mmio_misc(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
- u32 reg;
- u32 word_offset = offset & 3;
-
- switch (offset & ~3) {
- case 0: /* GICD_CTLR */
- reg = vcpu->kvm->arch.vgic.enabled;
- vgic_reg_access(mmio, &reg, word_offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
- if (mmio->is_write) {
- vcpu->kvm->arch.vgic.enabled = reg & 1;
- vgic_update_state(vcpu->kvm);
- return true;
- }
- break;
-
- case 4: /* GICD_TYPER */
- reg = (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5;
- reg |= (vcpu->kvm->arch.vgic.nr_irqs >> 5) - 1;
- vgic_reg_access(mmio, &reg, word_offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
- break;
-
- case 8: /* GICD_IIDR */
- reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
- vgic_reg_access(mmio, &reg, word_offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
- break;
- }
-
- return false;
-}
-
-static bool handle_mmio_set_enable_reg(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
- vcpu->vcpu_id, ACCESS_WRITE_SETBIT);
-}
-
-static bool handle_mmio_clear_enable_reg(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
- vcpu->vcpu_id, ACCESS_WRITE_CLEARBIT);
-}
-
-static bool handle_mmio_set_pending_reg(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
- vcpu->vcpu_id);
-}
-
-static bool handle_mmio_clear_pending_reg(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
- vcpu->vcpu_id);
-}
-
-static bool handle_mmio_set_active_reg(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset,
- vcpu->vcpu_id);
-}
-
-static bool handle_mmio_clear_active_reg(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset,
- vcpu->vcpu_id);
-}
-
-static bool handle_mmio_priority_reg(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- u32 *reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
- vcpu->vcpu_id, offset);
- vgic_reg_access(mmio, reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
- return false;
-}
-
-#define GICD_ITARGETSR_SIZE 32
-#define GICD_CPUTARGETS_BITS 8
-#define GICD_IRQS_PER_ITARGETSR (GICD_ITARGETSR_SIZE / GICD_CPUTARGETS_BITS)
-static u32 vgic_get_target_reg(struct kvm *kvm, int irq)
-{
- struct vgic_dist *dist = &kvm->arch.vgic;
- int i;
- u32 val = 0;
-
- irq -= VGIC_NR_PRIVATE_IRQS;
-
- for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++)
- val |= 1 << (dist->irq_spi_cpu[irq + i] + i * 8);
-
- return val;
-}
-
-static void vgic_set_target_reg(struct kvm *kvm, u32 val, int irq)
-{
- struct vgic_dist *dist = &kvm->arch.vgic;
- struct kvm_vcpu *vcpu;
- int i, c;
- unsigned long *bmap;
- u32 target;
-
- irq -= VGIC_NR_PRIVATE_IRQS;
-
- /*
- * Pick the LSB in each byte. This ensures we target exactly
- * one vcpu per IRQ. If the byte is null, assume we target
- * CPU0.
- */
- for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++) {
- int shift = i * GICD_CPUTARGETS_BITS;
-
- target = ffs((val >> shift) & 0xffU);
- target = target ? (target - 1) : 0;
- dist->irq_spi_cpu[irq + i] = target;
- kvm_for_each_vcpu(c, vcpu, kvm) {
- bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[c]);
- if (c == target)
- set_bit(irq + i, bmap);
- else
- clear_bit(irq + i, bmap);
- }
- }
-}
-
-static bool handle_mmio_target_reg(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- u32 reg;
-
- /* We treat the banked interrupts targets as read-only */
- if (offset < 32) {
- u32 roreg;
-
- roreg = 1 << vcpu->vcpu_id;
- roreg |= roreg << 8;
- roreg |= roreg << 16;
-
- vgic_reg_access(mmio, &roreg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
- return false;
- }
-
- reg = vgic_get_target_reg(vcpu->kvm, offset & ~3U);
- vgic_reg_access(mmio, &reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
- if (mmio->is_write) {
- vgic_set_target_reg(vcpu->kvm, reg, offset & ~3U);
- vgic_update_state(vcpu->kvm);
- return true;
- }
-
- return false;
-}
-
-static bool handle_mmio_cfg_reg(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
- u32 *reg;
-
- reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
- vcpu->vcpu_id, offset >> 1);
-
- return vgic_handle_cfg_reg(reg, mmio, offset);
-}
-
-static bool handle_mmio_sgi_reg(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
- u32 reg;
-
- vgic_reg_access(mmio, &reg, offset,
- ACCESS_READ_RAZ | ACCESS_WRITE_VALUE);
- if (mmio->is_write) {
- vgic_dispatch_sgi(vcpu, reg);
- vgic_update_state(vcpu->kvm);
- return true;
- }
-
- return false;
-}
-
-/* Handle reads of GICD_CPENDSGIRn and GICD_SPENDSGIRn */
-static bool read_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- int sgi;
- int min_sgi = (offset & ~0x3);
- int max_sgi = min_sgi + 3;
- int vcpu_id = vcpu->vcpu_id;
- u32 reg = 0;
-
- /* Copy source SGIs from distributor side */
- for (sgi = min_sgi; sgi <= max_sgi; sgi++) {
- u8 sources = *vgic_get_sgi_sources(dist, vcpu_id, sgi);
-
- reg |= ((u32)sources) << (8 * (sgi - min_sgi));
- }
-
- mmio_data_write(mmio, ~0, reg);
- return false;
-}
-
-static bool write_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset, bool set)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- int sgi;
- int min_sgi = (offset & ~0x3);
- int max_sgi = min_sgi + 3;
- int vcpu_id = vcpu->vcpu_id;
- u32 reg;
- bool updated = false;
-
- reg = mmio_data_read(mmio, ~0);
-
- /* Clear pending SGIs on the distributor */
- for (sgi = min_sgi; sgi <= max_sgi; sgi++) {
- u8 mask = reg >> (8 * (sgi - min_sgi));
- u8 *src = vgic_get_sgi_sources(dist, vcpu_id, sgi);
-
- if (set) {
- if ((*src & mask) != mask)
- updated = true;
- *src |= mask;
- } else {
- if (*src & mask)
- updated = true;
- *src &= ~mask;
- }
- }
-
- if (updated)
- vgic_update_state(vcpu->kvm);
-
- return updated;
-}
-
-static bool handle_mmio_sgi_set(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- if (!mmio->is_write)
- return read_set_clear_sgi_pend_reg(vcpu, mmio, offset);
- else
- return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, true);
-}
-
-static bool handle_mmio_sgi_clear(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- if (!mmio->is_write)
- return read_set_clear_sgi_pend_reg(vcpu, mmio, offset);
- else
- return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, false);
-}
-
-static const struct vgic_io_range vgic_dist_ranges[] = {
- {
- .base = GIC_DIST_SOFTINT,
- .len = 4,
- .handle_mmio = handle_mmio_sgi_reg,
- },
- {
- .base = GIC_DIST_CTRL,
- .len = 12,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_misc,
- },
- {
- .base = GIC_DIST_IGROUP,
- .len = VGIC_MAX_IRQS / 8,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- .base = GIC_DIST_ENABLE_SET,
- .len = VGIC_MAX_IRQS / 8,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_set_enable_reg,
- },
- {
- .base = GIC_DIST_ENABLE_CLEAR,
- .len = VGIC_MAX_IRQS / 8,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_clear_enable_reg,
- },
- {
- .base = GIC_DIST_PENDING_SET,
- .len = VGIC_MAX_IRQS / 8,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_set_pending_reg,
- },
- {
- .base = GIC_DIST_PENDING_CLEAR,
- .len = VGIC_MAX_IRQS / 8,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_clear_pending_reg,
- },
- {
- .base = GIC_DIST_ACTIVE_SET,
- .len = VGIC_MAX_IRQS / 8,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_set_active_reg,
- },
- {
- .base = GIC_DIST_ACTIVE_CLEAR,
- .len = VGIC_MAX_IRQS / 8,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_clear_active_reg,
- },
- {
- .base = GIC_DIST_PRI,
- .len = VGIC_MAX_IRQS,
- .bits_per_irq = 8,
- .handle_mmio = handle_mmio_priority_reg,
- },
- {
- .base = GIC_DIST_TARGET,
- .len = VGIC_MAX_IRQS,
- .bits_per_irq = 8,
- .handle_mmio = handle_mmio_target_reg,
- },
- {
- .base = GIC_DIST_CONFIG,
- .len = VGIC_MAX_IRQS / 4,
- .bits_per_irq = 2,
- .handle_mmio = handle_mmio_cfg_reg,
- },
- {
- .base = GIC_DIST_SGI_PENDING_CLEAR,
- .len = VGIC_NR_SGIS,
- .handle_mmio = handle_mmio_sgi_clear,
- },
- {
- .base = GIC_DIST_SGI_PENDING_SET,
- .len = VGIC_NR_SGIS,
- .handle_mmio = handle_mmio_sgi_set,
- },
- {}
-};
-
-static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg)
-{
- struct kvm *kvm = vcpu->kvm;
- struct vgic_dist *dist = &kvm->arch.vgic;
- int nrcpus = atomic_read(&kvm->online_vcpus);
- u8 target_cpus;
- int sgi, mode, c, vcpu_id;
-
- vcpu_id = vcpu->vcpu_id;
-
- sgi = reg & 0xf;
- target_cpus = (reg >> 16) & 0xff;
- mode = (reg >> 24) & 3;
-
- switch (mode) {
- case 0:
- if (!target_cpus)
- return;
- break;
-
- case 1:
- target_cpus = ((1 << nrcpus) - 1) & ~(1 << vcpu_id) & 0xff;
- break;
-
- case 2:
- target_cpus = 1 << vcpu_id;
- break;
- }
-
- kvm_for_each_vcpu(c, vcpu, kvm) {
- if (target_cpus & 1) {
- /* Flag the SGI as pending */
- vgic_dist_irq_set_pending(vcpu, sgi);
- *vgic_get_sgi_sources(dist, c, sgi) |= 1 << vcpu_id;
- kvm_debug("SGI%d from CPU%d to CPU%d\n",
- sgi, vcpu_id, c);
- }
-
- target_cpus >>= 1;
- }
-}
-
-static bool vgic_v2_queue_sgi(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- unsigned long sources;
- int vcpu_id = vcpu->vcpu_id;
- int c;
-
- sources = *vgic_get_sgi_sources(dist, vcpu_id, irq);
-
- for_each_set_bit(c, &sources, dist->nr_cpus) {
- if (vgic_queue_irq(vcpu, c, irq))
- clear_bit(c, &sources);
- }
-
- *vgic_get_sgi_sources(dist, vcpu_id, irq) = sources;
-
- /*
- * If the sources bitmap has been cleared it means that we
- * could queue all the SGIs onto link registers (see the
- * clear_bit above), and therefore we are done with them in
- * our emulated gic and can get rid of them.
- */
- if (!sources) {
- vgic_dist_irq_clear_pending(vcpu, irq);
- vgic_cpu_irq_clear(vcpu, irq);
- return true;
- }
-
- return false;
-}
-
-/**
- * kvm_vgic_map_resources - Configure global VGIC state before running any VCPUs
- * @kvm: pointer to the kvm struct
- *
- * Map the virtual CPU interface into the VM before running any VCPUs. We
- * can't do this at creation time, because user space must first set the
- * virtual CPU interface address in the guest physical address space.
- */
-static int vgic_v2_map_resources(struct kvm *kvm,
- const struct vgic_params *params)
-{
- struct vgic_dist *dist = &kvm->arch.vgic;
- int ret = 0;
-
- if (!irqchip_in_kernel(kvm))
- return 0;
-
- mutex_lock(&kvm->lock);
-
- if (vgic_ready(kvm))
- goto out;
-
- if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
- IS_VGIC_ADDR_UNDEF(dist->vgic_cpu_base)) {
- kvm_err("Need to set vgic cpu and dist addresses first\n");
- ret = -ENXIO;
- goto out;
- }
-
- vgic_register_kvm_io_dev(kvm, dist->vgic_dist_base,
- KVM_VGIC_V2_DIST_SIZE,
- vgic_dist_ranges, -1, &dist->dist_iodev);
-
- /*
- * Initialize the vgic if this hasn't already been done on demand by
- * accessing the vgic state from userspace.
- */
- ret = vgic_init(kvm);
- if (ret) {
- kvm_err("Unable to allocate maps\n");
- goto out_unregister;
- }
-
- ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base,
- params->vcpu_base, KVM_VGIC_V2_CPU_SIZE,
- true);
- if (ret) {
- kvm_err("Unable to remap VGIC CPU to VCPU\n");
- goto out_unregister;
- }
-
- dist->ready = true;
- goto out;
-
-out_unregister:
- kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dist->dist_iodev.dev);
-
-out:
- if (ret)
- kvm_vgic_destroy(kvm);
- mutex_unlock(&kvm->lock);
- return ret;
-}
-
-static void vgic_v2_add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- *vgic_get_sgi_sources(dist, vcpu->vcpu_id, irq) |= 1 << source;
-}
-
-static int vgic_v2_init_model(struct kvm *kvm)
-{
- int i;
-
- for (i = VGIC_NR_PRIVATE_IRQS; i < kvm->arch.vgic.nr_irqs; i += 4)
- vgic_set_target_reg(kvm, 0, i);
-
- return 0;
-}
-
-void vgic_v2_init_emulation(struct kvm *kvm)
-{
- struct vgic_dist *dist = &kvm->arch.vgic;
-
- dist->vm_ops.queue_sgi = vgic_v2_queue_sgi;
- dist->vm_ops.add_sgi_source = vgic_v2_add_sgi_source;
- dist->vm_ops.init_model = vgic_v2_init_model;
- dist->vm_ops.map_resources = vgic_v2_map_resources;
-
- kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS;
-}
-
-static bool handle_cpu_mmio_misc(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
- bool updated = false;
- struct vgic_vmcr vmcr;
- u32 *vmcr_field;
- u32 reg;
-
- vgic_get_vmcr(vcpu, &vmcr);
-
- switch (offset & ~0x3) {
- case GIC_CPU_CTRL:
- vmcr_field = &vmcr.ctlr;
- break;
- case GIC_CPU_PRIMASK:
- vmcr_field = &vmcr.pmr;
- break;
- case GIC_CPU_BINPOINT:
- vmcr_field = &vmcr.bpr;
- break;
- case GIC_CPU_ALIAS_BINPOINT:
- vmcr_field = &vmcr.abpr;
- break;
- default:
- BUG();
- }
-
- if (!mmio->is_write) {
- reg = *vmcr_field;
- mmio_data_write(mmio, ~0, reg);
- } else {
- reg = mmio_data_read(mmio, ~0);
- if (reg != *vmcr_field) {
- *vmcr_field = reg;
- vgic_set_vmcr(vcpu, &vmcr);
- updated = true;
- }
- }
- return updated;
-}
-
-static bool handle_mmio_abpr(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
- return handle_cpu_mmio_misc(vcpu, mmio, GIC_CPU_ALIAS_BINPOINT);
-}
-
-static bool handle_cpu_mmio_ident(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- u32 reg;
-
- if (mmio->is_write)
- return false;
-
- /* GICC_IIDR */
- reg = (PRODUCT_ID_KVM << 20) |
- (GICC_ARCH_VERSION_V2 << 16) |
- (IMPLEMENTER_ARM << 0);
- mmio_data_write(mmio, ~0, reg);
- return false;
-}
-
-/*
- * CPU Interface Register accesses - these are not accessed by the VM, but by
- * user space for saving and restoring VGIC state.
- */
-static const struct vgic_io_range vgic_cpu_ranges[] = {
- {
- .base = GIC_CPU_CTRL,
- .len = 12,
- .handle_mmio = handle_cpu_mmio_misc,
- },
- {
- .base = GIC_CPU_ALIAS_BINPOINT,
- .len = 4,
- .handle_mmio = handle_mmio_abpr,
- },
- {
- .base = GIC_CPU_ACTIVEPRIO,
- .len = 16,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- .base = GIC_CPU_IDENT,
- .len = 4,
- .handle_mmio = handle_cpu_mmio_ident,
- },
-};
-
-static int vgic_attr_regs_access(struct kvm_device *dev,
- struct kvm_device_attr *attr,
- u32 *reg, bool is_write)
-{
- const struct vgic_io_range *r = NULL, *ranges;
- phys_addr_t offset;
- int ret, cpuid, c;
- struct kvm_vcpu *vcpu, *tmp_vcpu;
- struct vgic_dist *vgic;
- struct kvm_exit_mmio mmio;
- u32 data;
-
- offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
- cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >>
- KVM_DEV_ARM_VGIC_CPUID_SHIFT;
-
- mutex_lock(&dev->kvm->lock);
-
- ret = vgic_init(dev->kvm);
- if (ret)
- goto out;
-
- if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) {
- ret = -EINVAL;
- goto out;
- }
-
- vcpu = kvm_get_vcpu(dev->kvm, cpuid);
- vgic = &dev->kvm->arch.vgic;
-
- mmio.len = 4;
- mmio.is_write = is_write;
- mmio.data = &data;
- if (is_write)
- mmio_data_write(&mmio, ~0, *reg);
- switch (attr->group) {
- case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
- mmio.phys_addr = vgic->vgic_dist_base + offset;
- ranges = vgic_dist_ranges;
- break;
- case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
- mmio.phys_addr = vgic->vgic_cpu_base + offset;
- ranges = vgic_cpu_ranges;
- break;
- default:
- BUG();
- }
- r = vgic_find_range(ranges, 4, offset);
-
- if (unlikely(!r || !r->handle_mmio)) {
- ret = -ENXIO;
- goto out;
- }
-
-
- spin_lock(&vgic->lock);
-
- /*
- * Ensure that no other VCPU is running by checking the vcpu->cpu
- * field. If no other VPCUs are running we can safely access the VGIC
- * state, because even if another VPU is run after this point, that
- * VCPU will not touch the vgic state, because it will block on
- * getting the vgic->lock in kvm_vgic_sync_hwstate().
- */
- kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm) {
- if (unlikely(tmp_vcpu->cpu != -1)) {
- ret = -EBUSY;
- goto out_vgic_unlock;
- }
- }
-
- /*
- * Move all pending IRQs from the LRs on all VCPUs so the pending
- * state can be properly represented in the register state accessible
- * through this API.
- */
- kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm)
- vgic_unqueue_irqs(tmp_vcpu);
-
- offset -= r->base;
- r->handle_mmio(vcpu, &mmio, offset);
-
- if (!is_write)
- *reg = mmio_data_read(&mmio, ~0);
-
- ret = 0;
-out_vgic_unlock:
- spin_unlock(&vgic->lock);
-out:
- mutex_unlock(&dev->kvm->lock);
- return ret;
-}
-
-static int vgic_v2_create(struct kvm_device *dev, u32 type)
-{
- return kvm_vgic_create(dev->kvm, type);
-}
-
-static void vgic_v2_destroy(struct kvm_device *dev)
-{
- kfree(dev);
-}
-
-static int vgic_v2_set_attr(struct kvm_device *dev,
- struct kvm_device_attr *attr)
-{
- int ret;
-
- ret = vgic_set_common_attr(dev, attr);
- if (ret != -ENXIO)
- return ret;
-
- switch (attr->group) {
- case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
- case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
- u32 __user *uaddr = (u32 __user *)(long)attr->addr;
- u32 reg;
-
- if (get_user(reg, uaddr))
- return -EFAULT;
-
- return vgic_attr_regs_access(dev, attr, &reg, true);
- }
-
- }
-
- return -ENXIO;
-}
-
-static int vgic_v2_get_attr(struct kvm_device *dev,
- struct kvm_device_attr *attr)
-{
- int ret;
-
- ret = vgic_get_common_attr(dev, attr);
- if (ret != -ENXIO)
- return ret;
-
- switch (attr->group) {
- case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
- case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
- u32 __user *uaddr = (u32 __user *)(long)attr->addr;
- u32 reg = 0;
-
- ret = vgic_attr_regs_access(dev, attr, &reg, false);
- if (ret)
- return ret;
- return put_user(reg, uaddr);
- }
-
- }
-
- return -ENXIO;
-}
-
-static int vgic_v2_has_attr(struct kvm_device *dev,
- struct kvm_device_attr *attr)
-{
- phys_addr_t offset;
-
- switch (attr->group) {
- case KVM_DEV_ARM_VGIC_GRP_ADDR:
- switch (attr->attr) {
- case KVM_VGIC_V2_ADDR_TYPE_DIST:
- case KVM_VGIC_V2_ADDR_TYPE_CPU:
- return 0;
- }
- break;
- case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
- offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
- return vgic_has_attr_regs(vgic_dist_ranges, offset);
- case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
- offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
- return vgic_has_attr_regs(vgic_cpu_ranges, offset);
- case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
- return 0;
- case KVM_DEV_ARM_VGIC_GRP_CTRL:
- switch (attr->attr) {
- case KVM_DEV_ARM_VGIC_CTRL_INIT:
- return 0;
- }
- }
- return -ENXIO;
-}
-
-struct kvm_device_ops kvm_arm_vgic_v2_ops = {
- .name = "kvm-arm-vgic-v2",
- .create = vgic_v2_create,
- .destroy = vgic_v2_destroy,
- .set_attr = vgic_v2_set_attr,
- .get_attr = vgic_v2_get_attr,
- .has_attr = vgic_v2_has_attr,
-};
diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c
deleted file mode 100644
index 334cd7a89106..000000000000
--- a/virt/kvm/arm/vgic-v2.c
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * Copyright (C) 2012,2013 ARM Limited, All Rights Reserved.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-
-#include <linux/irqchip/arm-gic.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-
-static struct vgic_lr vgic_v2_get_lr(const struct kvm_vcpu *vcpu, int lr)
-{
- struct vgic_lr lr_desc;
- u32 val = vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr];
-
- lr_desc.irq = val & GICH_LR_VIRTUALID;
- if (lr_desc.irq <= 15)
- lr_desc.source = (val >> GICH_LR_PHYSID_CPUID_SHIFT) & 0x7;
- else
- lr_desc.source = 0;
- lr_desc.state = 0;
-
- if (val & GICH_LR_PENDING_BIT)
- lr_desc.state |= LR_STATE_PENDING;
- if (val & GICH_LR_ACTIVE_BIT)
- lr_desc.state |= LR_STATE_ACTIVE;
- if (val & GICH_LR_EOI)
- lr_desc.state |= LR_EOI_INT;
- if (val & GICH_LR_HW) {
- lr_desc.state |= LR_HW;
- lr_desc.hwirq = (val & GICH_LR_PHYSID_CPUID) >> GICH_LR_PHYSID_CPUID_SHIFT;
- }
-
- return lr_desc;
-}
-
-static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr,
- struct vgic_lr lr_desc)
-{
- u32 lr_val;
-
- lr_val = lr_desc.irq;
-
- if (lr_desc.state & LR_STATE_PENDING)
- lr_val |= GICH_LR_PENDING_BIT;
- if (lr_desc.state & LR_STATE_ACTIVE)
- lr_val |= GICH_LR_ACTIVE_BIT;
- if (lr_desc.state & LR_EOI_INT)
- lr_val |= GICH_LR_EOI;
-
- if (lr_desc.state & LR_HW) {
- lr_val |= GICH_LR_HW;
- lr_val |= (u32)lr_desc.hwirq << GICH_LR_PHYSID_CPUID_SHIFT;
- }
-
- if (lr_desc.irq < VGIC_NR_SGIS)
- lr_val |= (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT);
-
- vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = lr_val;
-
- if (!(lr_desc.state & LR_STATE_MASK))
- vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr |= (1ULL << lr);
- else
- vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr &= ~(1ULL << lr);
-}
-
-static u64 vgic_v2_get_elrsr(const struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr;
-}
-
-static u64 vgic_v2_get_eisr(const struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.vgic_cpu.vgic_v2.vgic_eisr;
-}
-
-static void vgic_v2_clear_eisr(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.vgic_cpu.vgic_v2.vgic_eisr = 0;
-}
-
-static u32 vgic_v2_get_interrupt_status(const struct kvm_vcpu *vcpu)
-{
- u32 misr = vcpu->arch.vgic_cpu.vgic_v2.vgic_misr;
- u32 ret = 0;
-
- if (misr & GICH_MISR_EOI)
- ret |= INT_STATUS_EOI;
- if (misr & GICH_MISR_U)
- ret |= INT_STATUS_UNDERFLOW;
-
- return ret;
-}
-
-static void vgic_v2_enable_underflow(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr |= GICH_HCR_UIE;
-}
-
-static void vgic_v2_disable_underflow(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr &= ~GICH_HCR_UIE;
-}
-
-static void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
- u32 vmcr = vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr;
-
- vmcrp->ctlr = (vmcr & GICH_VMCR_CTRL_MASK) >> GICH_VMCR_CTRL_SHIFT;
- vmcrp->abpr = (vmcr & GICH_VMCR_ALIAS_BINPOINT_MASK) >> GICH_VMCR_ALIAS_BINPOINT_SHIFT;
- vmcrp->bpr = (vmcr & GICH_VMCR_BINPOINT_MASK) >> GICH_VMCR_BINPOINT_SHIFT;
- vmcrp->pmr = (vmcr & GICH_VMCR_PRIMASK_MASK) >> GICH_VMCR_PRIMASK_SHIFT;
-}
-
-static void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
- u32 vmcr;
-
- vmcr = (vmcrp->ctlr << GICH_VMCR_CTRL_SHIFT) & GICH_VMCR_CTRL_MASK;
- vmcr |= (vmcrp->abpr << GICH_VMCR_ALIAS_BINPOINT_SHIFT) & GICH_VMCR_ALIAS_BINPOINT_MASK;
- vmcr |= (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) & GICH_VMCR_BINPOINT_MASK;
- vmcr |= (vmcrp->pmr << GICH_VMCR_PRIMASK_SHIFT) & GICH_VMCR_PRIMASK_MASK;
-
- vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = vmcr;
-}
-
-static void vgic_v2_enable(struct kvm_vcpu *vcpu)
-{
- /*
- * By forcing VMCR to zero, the GIC will restore the binary
- * points to their reset values. Anything else resets to zero
- * anyway.
- */
- vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0;
- vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr = ~0;
-
- /* Get the show on the road... */
- vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN;
-}
-
-static const struct vgic_ops vgic_v2_ops = {
- .get_lr = vgic_v2_get_lr,
- .set_lr = vgic_v2_set_lr,
- .get_elrsr = vgic_v2_get_elrsr,
- .get_eisr = vgic_v2_get_eisr,
- .clear_eisr = vgic_v2_clear_eisr,
- .get_interrupt_status = vgic_v2_get_interrupt_status,
- .enable_underflow = vgic_v2_enable_underflow,
- .disable_underflow = vgic_v2_disable_underflow,
- .get_vmcr = vgic_v2_get_vmcr,
- .set_vmcr = vgic_v2_set_vmcr,
- .enable = vgic_v2_enable,
-};
-
-struct vgic_params __section(.hyp.text) vgic_v2_params;
-
-static void vgic_cpu_init_lrs(void *params)
-{
- struct vgic_params *vgic = params;
- int i;
-
- for (i = 0; i < vgic->nr_lr; i++)
- writel_relaxed(0, vgic->vctrl_base + GICH_LR0 + (i * 4));
-}
-
-/**
- * vgic_v2_probe - probe for a GICv2 compatible interrupt controller
- * @gic_kvm_info: pointer to the GIC description
- * @ops: address of a pointer to the GICv2 operations
- * @params: address of a pointer to HW-specific parameters
- *
- * Returns 0 if a GICv2 has been found, with the low level operations
- * in *ops and the HW parameters in *params. Returns an error code
- * otherwise.
- */
-int vgic_v2_probe(const struct gic_kvm_info *gic_kvm_info,
- const struct vgic_ops **ops,
- const struct vgic_params **params)
-{
- int ret;
- struct vgic_params *vgic = &vgic_v2_params;
- const struct resource *vctrl_res = &gic_kvm_info->vctrl;
- const struct resource *vcpu_res = &gic_kvm_info->vcpu;
-
- memset(vgic, 0, sizeof(*vgic));
-
- if (!gic_kvm_info->maint_irq) {
- kvm_err("error getting vgic maintenance irq\n");
- ret = -ENXIO;
- goto out;
- }
- vgic->maint_irq = gic_kvm_info->maint_irq;
-
- if (!gic_kvm_info->vctrl.start) {
- kvm_err("GICH not present in the firmware table\n");
- ret = -ENXIO;
- goto out;
- }
-
- vgic->vctrl_base = ioremap(gic_kvm_info->vctrl.start,
- resource_size(&gic_kvm_info->vctrl));
- if (!vgic->vctrl_base) {
- kvm_err("Cannot ioremap GICH\n");
- ret = -ENOMEM;
- goto out;
- }
-
- vgic->nr_lr = readl_relaxed(vgic->vctrl_base + GICH_VTR);
- vgic->nr_lr = (vgic->nr_lr & 0x3f) + 1;
-
- ret = create_hyp_io_mappings(vgic->vctrl_base,
- vgic->vctrl_base + resource_size(vctrl_res),
- vctrl_res->start);
- if (ret) {
- kvm_err("Cannot map VCTRL into hyp\n");
- goto out_unmap;
- }
-
- if (!PAGE_ALIGNED(vcpu_res->start)) {
- kvm_err("GICV physical address 0x%llx not page aligned\n",
- (unsigned long long)vcpu_res->start);
- ret = -ENXIO;
- goto out_unmap;
- }
-
- if (!PAGE_ALIGNED(resource_size(vcpu_res))) {
- kvm_err("GICV size 0x%llx not a multiple of page size 0x%lx\n",
- (unsigned long long)resource_size(vcpu_res),
- PAGE_SIZE);
- ret = -ENXIO;
- goto out_unmap;
- }
-
- vgic->can_emulate_gicv2 = true;
- kvm_register_device_ops(&kvm_arm_vgic_v2_ops, KVM_DEV_TYPE_ARM_VGIC_V2);
-
- vgic->vcpu_base = vcpu_res->start;
-
- kvm_info("GICH base=0x%llx, GICV base=0x%llx, IRQ=%d\n",
- gic_kvm_info->vctrl.start, vgic->vcpu_base, vgic->maint_irq);
-
- vgic->type = VGIC_V2;
- vgic->max_gic_vcpus = VGIC_V2_MAX_CPUS;
-
- on_each_cpu(vgic_cpu_init_lrs, vgic, 1);
-
- *ops = &vgic_v2_ops;
- *params = vgic;
- goto out;
-
-out_unmap:
- iounmap(vgic->vctrl_base);
-out:
- return ret;
-}
diff --git a/virt/kvm/arm/vgic-v3-emul.c b/virt/kvm/arm/vgic-v3-emul.c
deleted file mode 100644
index e661e7fb9d91..000000000000
--- a/virt/kvm/arm/vgic-v3-emul.c
+++ /dev/null
@@ -1,1074 +0,0 @@
-/*
- * GICv3 distributor and redistributor emulation
- *
- * GICv3 emulation is currently only supported on a GICv3 host (because
- * we rely on the hardware's CPU interface virtualization support), but
- * supports both hardware with or without the optional GICv2 backwards
- * compatibility features.
- *
- * Limitations of the emulation:
- * (RAZ/WI: read as zero, write ignore, RAO/WI: read as one, write ignore)
- * - We do not support LPIs (yet). TYPER.LPIS is reported as 0 and is RAZ/WI.
- * - We do not support the message based interrupts (MBIs) triggered by
- * writes to the GICD_{SET,CLR}SPI_* registers. TYPER.MBIS is reported as 0.
- * - We do not support the (optional) backwards compatibility feature.
- * GICD_CTLR.ARE resets to 1 and is RAO/WI. If the _host_ GIC supports
- * the compatiblity feature, you can use a GICv2 in the guest, though.
- * - We only support a single security state. GICD_CTLR.DS is 1 and is RAO/WI.
- * - Priorities are not emulated (same as the GICv2 emulation). Linux
- * as a guest is fine with this, because it does not use priorities.
- * - We only support Group1 interrupts. Again Linux uses only those.
- *
- * Copyright (C) 2014 ARM Ltd.
- * Author: Andre Przywara <andre.przywara@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-
-#include <linux/irqchip/arm-gic-v3.h>
-#include <kvm/arm_vgic.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-
-#include "vgic.h"
-
-static bool handle_mmio_rao_wi(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
- u32 reg = 0xffffffff;
-
- vgic_reg_access(mmio, &reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-
- return false;
-}
-
-static bool handle_mmio_ctlr(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
- u32 reg = 0;
-
- /*
- * Force ARE and DS to 1, the guest cannot change this.
- * For the time being we only support Group1 interrupts.
- */
- if (vcpu->kvm->arch.vgic.enabled)
- reg = GICD_CTLR_ENABLE_SS_G1;
- reg |= GICD_CTLR_ARE_NS | GICD_CTLR_DS;
-
- vgic_reg_access(mmio, &reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
- if (mmio->is_write) {
- vcpu->kvm->arch.vgic.enabled = !!(reg & GICD_CTLR_ENABLE_SS_G1);
- vgic_update_state(vcpu->kvm);
- return true;
- }
- return false;
-}
-
-/*
- * As this implementation does not provide compatibility
- * with GICv2 (ARE==1), we report zero CPUs in bits [5..7].
- * Also LPIs and MBIs are not supported, so we set the respective bits to 0.
- * Also we report at most 2**10=1024 interrupt IDs (to match 1024 SPIs).
- */
-#define INTERRUPT_ID_BITS 10
-static bool handle_mmio_typer(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
- u32 reg;
-
- reg = (min(vcpu->kvm->arch.vgic.nr_irqs, 1024) >> 5) - 1;
-
- reg |= (INTERRUPT_ID_BITS - 1) << 19;
-
- vgic_reg_access(mmio, &reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-
- return false;
-}
-
-static bool handle_mmio_iidr(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
- u32 reg;
-
- reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
- vgic_reg_access(mmio, &reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-
- return false;
-}
-
-static bool handle_mmio_set_enable_reg_dist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
- return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
- vcpu->vcpu_id,
- ACCESS_WRITE_SETBIT);
-
- vgic_reg_access(mmio, NULL, offset,
- ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
- return false;
-}
-
-static bool handle_mmio_clear_enable_reg_dist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
- return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
- vcpu->vcpu_id,
- ACCESS_WRITE_CLEARBIT);
-
- vgic_reg_access(mmio, NULL, offset,
- ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
- return false;
-}
-
-static bool handle_mmio_set_pending_reg_dist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
- return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
- vcpu->vcpu_id);
-
- vgic_reg_access(mmio, NULL, offset,
- ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
- return false;
-}
-
-static bool handle_mmio_clear_pending_reg_dist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
- return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
- vcpu->vcpu_id);
-
- vgic_reg_access(mmio, NULL, offset,
- ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
- return false;
-}
-
-static bool handle_mmio_set_active_reg_dist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
- return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset,
- vcpu->vcpu_id);
-
- vgic_reg_access(mmio, NULL, offset,
- ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
- return false;
-}
-
-static bool handle_mmio_clear_active_reg_dist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
- return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset,
- vcpu->vcpu_id);
-
- vgic_reg_access(mmio, NULL, offset,
- ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
- return false;
-}
-
-static bool handle_mmio_priority_reg_dist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- u32 *reg;
-
- if (unlikely(offset < VGIC_NR_PRIVATE_IRQS)) {
- vgic_reg_access(mmio, NULL, offset,
- ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
- return false;
- }
-
- reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
- vcpu->vcpu_id, offset);
- vgic_reg_access(mmio, reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
- return false;
-}
-
-static bool handle_mmio_cfg_reg_dist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- u32 *reg;
-
- if (unlikely(offset < VGIC_NR_PRIVATE_IRQS / 4)) {
- vgic_reg_access(mmio, NULL, offset,
- ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
- return false;
- }
-
- reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
- vcpu->vcpu_id, offset >> 1);
-
- return vgic_handle_cfg_reg(reg, mmio, offset);
-}
-
-/*
- * We use a compressed version of the MPIDR (all 32 bits in one 32-bit word)
- * when we store the target MPIDR written by the guest.
- */
-static u32 compress_mpidr(unsigned long mpidr)
-{
- u32 ret;
-
- ret = MPIDR_AFFINITY_LEVEL(mpidr, 0);
- ret |= MPIDR_AFFINITY_LEVEL(mpidr, 1) << 8;
- ret |= MPIDR_AFFINITY_LEVEL(mpidr, 2) << 16;
- ret |= MPIDR_AFFINITY_LEVEL(mpidr, 3) << 24;
-
- return ret;
-}
-
-static unsigned long uncompress_mpidr(u32 value)
-{
- unsigned long mpidr;
-
- mpidr = ((value >> 0) & 0xFF) << MPIDR_LEVEL_SHIFT(0);
- mpidr |= ((value >> 8) & 0xFF) << MPIDR_LEVEL_SHIFT(1);
- mpidr |= ((value >> 16) & 0xFF) << MPIDR_LEVEL_SHIFT(2);
- mpidr |= (u64)((value >> 24) & 0xFF) << MPIDR_LEVEL_SHIFT(3);
-
- return mpidr;
-}
-
-/*
- * Lookup the given MPIDR value to get the vcpu_id (if there is one)
- * and store that in the irq_spi_cpu[] array.
- * This limits the number of VCPUs to 255 for now, extending the data
- * type (or storing kvm_vcpu pointers) should lift the limit.
- * Store the original MPIDR value in an extra array to support read-as-written.
- * Unallocated MPIDRs are translated to a special value and caught
- * before any array accesses.
- */
-static bool handle_mmio_route_reg(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- struct kvm *kvm = vcpu->kvm;
- struct vgic_dist *dist = &kvm->arch.vgic;
- int spi;
- u32 reg;
- int vcpu_id;
- unsigned long *bmap, mpidr;
-
- /*
- * The upper 32 bits of each 64 bit register are zero,
- * as we don't support Aff3.
- */
- if ((offset & 4)) {
- vgic_reg_access(mmio, NULL, offset,
- ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
- return false;
- }
-
- /* This region only covers SPIs, so no handling of private IRQs here. */
- spi = offset / 8;
-
- /* get the stored MPIDR for this IRQ */
- mpidr = uncompress_mpidr(dist->irq_spi_mpidr[spi]);
- reg = mpidr;
-
- vgic_reg_access(mmio, &reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
-
- if (!mmio->is_write)
- return false;
-
- /*
- * Now clear the currently assigned vCPU from the map, making room
- * for the new one to be written below
- */
- vcpu = kvm_mpidr_to_vcpu(kvm, mpidr);
- if (likely(vcpu)) {
- vcpu_id = vcpu->vcpu_id;
- bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]);
- __clear_bit(spi, bmap);
- }
-
- dist->irq_spi_mpidr[spi] = compress_mpidr(reg);
- vcpu = kvm_mpidr_to_vcpu(kvm, reg & MPIDR_HWID_BITMASK);
-
- /*
- * The spec says that non-existent MPIDR values should not be
- * forwarded to any existent (v)CPU, but should be able to become
- * pending anyway. We simply keep the irq_spi_target[] array empty, so
- * the interrupt will never be injected.
- * irq_spi_cpu[irq] gets a magic value in this case.
- */
- if (likely(vcpu)) {
- vcpu_id = vcpu->vcpu_id;
- dist->irq_spi_cpu[spi] = vcpu_id;
- bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]);
- __set_bit(spi, bmap);
- } else {
- dist->irq_spi_cpu[spi] = VCPU_NOT_ALLOCATED;
- }
-
- vgic_update_state(kvm);
-
- return true;
-}
-
-/*
- * We should be careful about promising too much when a guest reads
- * this register. Don't claim to be like any hardware implementation,
- * but just report the GIC as version 3 - which is what a Linux guest
- * would check.
- */
-static bool handle_mmio_idregs(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- u32 reg = 0;
-
- switch (offset + GICD_IDREGS) {
- case GICD_PIDR2:
- reg = 0x3b;
- break;
- }
-
- vgic_reg_access(mmio, &reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-
- return false;
-}
-
-static const struct vgic_io_range vgic_v3_dist_ranges[] = {
- {
- .base = GICD_CTLR,
- .len = 0x04,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_ctlr,
- },
- {
- .base = GICD_TYPER,
- .len = 0x04,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_typer,
- },
- {
- .base = GICD_IIDR,
- .len = 0x04,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_iidr,
- },
- {
- /* this register is optional, it is RAZ/WI if not implemented */
- .base = GICD_STATUSR,
- .len = 0x04,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- /* this write only register is WI when TYPER.MBIS=0 */
- .base = GICD_SETSPI_NSR,
- .len = 0x04,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- /* this write only register is WI when TYPER.MBIS=0 */
- .base = GICD_CLRSPI_NSR,
- .len = 0x04,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- /* this is RAZ/WI when DS=1 */
- .base = GICD_SETSPI_SR,
- .len = 0x04,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- /* this is RAZ/WI when DS=1 */
- .base = GICD_CLRSPI_SR,
- .len = 0x04,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- .base = GICD_IGROUPR,
- .len = 0x80,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_rao_wi,
- },
- {
- .base = GICD_ISENABLER,
- .len = 0x80,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_set_enable_reg_dist,
- },
- {
- .base = GICD_ICENABLER,
- .len = 0x80,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_clear_enable_reg_dist,
- },
- {
- .base = GICD_ISPENDR,
- .len = 0x80,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_set_pending_reg_dist,
- },
- {
- .base = GICD_ICPENDR,
- .len = 0x80,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_clear_pending_reg_dist,
- },
- {
- .base = GICD_ISACTIVER,
- .len = 0x80,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_set_active_reg_dist,
- },
- {
- .base = GICD_ICACTIVER,
- .len = 0x80,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_clear_active_reg_dist,
- },
- {
- .base = GICD_IPRIORITYR,
- .len = 0x400,
- .bits_per_irq = 8,
- .handle_mmio = handle_mmio_priority_reg_dist,
- },
- {
- /* TARGETSRn is RES0 when ARE=1 */
- .base = GICD_ITARGETSR,
- .len = 0x400,
- .bits_per_irq = 8,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- .base = GICD_ICFGR,
- .len = 0x100,
- .bits_per_irq = 2,
- .handle_mmio = handle_mmio_cfg_reg_dist,
- },
- {
- /* this is RAZ/WI when DS=1 */
- .base = GICD_IGRPMODR,
- .len = 0x80,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- /* this is RAZ/WI when DS=1 */
- .base = GICD_NSACR,
- .len = 0x100,
- .bits_per_irq = 2,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- /* this is RAZ/WI when ARE=1 */
- .base = GICD_SGIR,
- .len = 0x04,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- /* this is RAZ/WI when ARE=1 */
- .base = GICD_CPENDSGIR,
- .len = 0x10,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- /* this is RAZ/WI when ARE=1 */
- .base = GICD_SPENDSGIR,
- .len = 0x10,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- .base = GICD_IROUTER + 0x100,
- .len = 0x1ee0,
- .bits_per_irq = 64,
- .handle_mmio = handle_mmio_route_reg,
- },
- {
- .base = GICD_IDREGS,
- .len = 0x30,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_idregs,
- },
- {},
-};
-
-static bool handle_mmio_ctlr_redist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- /* since we don't support LPIs, this register is zero for now */
- vgic_reg_access(mmio, NULL, offset,
- ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
- return false;
-}
-
-static bool handle_mmio_typer_redist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- u32 reg;
- u64 mpidr;
- struct kvm_vcpu *redist_vcpu = mmio->private;
- int target_vcpu_id = redist_vcpu->vcpu_id;
-
- /* the upper 32 bits contain the affinity value */
- if ((offset & ~3) == 4) {
- mpidr = kvm_vcpu_get_mpidr_aff(redist_vcpu);
- reg = compress_mpidr(mpidr);
-
- vgic_reg_access(mmio, &reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
- return false;
- }
-
- reg = redist_vcpu->vcpu_id << 8;
- if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1)
- reg |= GICR_TYPER_LAST;
- vgic_reg_access(mmio, &reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
- return false;
-}
-
-static bool handle_mmio_set_enable_reg_redist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- struct kvm_vcpu *redist_vcpu = mmio->private;
-
- return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
- redist_vcpu->vcpu_id,
- ACCESS_WRITE_SETBIT);
-}
-
-static bool handle_mmio_clear_enable_reg_redist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- struct kvm_vcpu *redist_vcpu = mmio->private;
-
- return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
- redist_vcpu->vcpu_id,
- ACCESS_WRITE_CLEARBIT);
-}
-
-static bool handle_mmio_set_active_reg_redist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- struct kvm_vcpu *redist_vcpu = mmio->private;
-
- return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset,
- redist_vcpu->vcpu_id);
-}
-
-static bool handle_mmio_clear_active_reg_redist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- struct kvm_vcpu *redist_vcpu = mmio->private;
-
- return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset,
- redist_vcpu->vcpu_id);
-}
-
-static bool handle_mmio_set_pending_reg_redist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- struct kvm_vcpu *redist_vcpu = mmio->private;
-
- return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
- redist_vcpu->vcpu_id);
-}
-
-static bool handle_mmio_clear_pending_reg_redist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- struct kvm_vcpu *redist_vcpu = mmio->private;
-
- return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
- redist_vcpu->vcpu_id);
-}
-
-static bool handle_mmio_priority_reg_redist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- struct kvm_vcpu *redist_vcpu = mmio->private;
- u32 *reg;
-
- reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
- redist_vcpu->vcpu_id, offset);
- vgic_reg_access(mmio, reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
- return false;
-}
-
-static bool handle_mmio_cfg_reg_redist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- struct kvm_vcpu *redist_vcpu = mmio->private;
-
- u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
- redist_vcpu->vcpu_id, offset >> 1);
-
- return vgic_handle_cfg_reg(reg, mmio, offset);
-}
-
-#define SGI_base(x) ((x) + SZ_64K)
-
-static const struct vgic_io_range vgic_redist_ranges[] = {
- {
- .base = GICR_CTLR,
- .len = 0x04,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_ctlr_redist,
- },
- {
- .base = GICR_TYPER,
- .len = 0x08,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_typer_redist,
- },
- {
- .base = GICR_IIDR,
- .len = 0x04,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_iidr,
- },
- {
- .base = GICR_WAKER,
- .len = 0x04,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- .base = GICR_IDREGS,
- .len = 0x30,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_idregs,
- },
- {
- .base = SGI_base(GICR_IGROUPR0),
- .len = 0x04,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_rao_wi,
- },
- {
- .base = SGI_base(GICR_ISENABLER0),
- .len = 0x04,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_set_enable_reg_redist,
- },
- {
- .base = SGI_base(GICR_ICENABLER0),
- .len = 0x04,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_clear_enable_reg_redist,
- },
- {
- .base = SGI_base(GICR_ISPENDR0),
- .len = 0x04,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_set_pending_reg_redist,
- },
- {
- .base = SGI_base(GICR_ICPENDR0),
- .len = 0x04,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_clear_pending_reg_redist,
- },
- {
- .base = SGI_base(GICR_ISACTIVER0),
- .len = 0x04,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_set_active_reg_redist,
- },
- {
- .base = SGI_base(GICR_ICACTIVER0),
- .len = 0x04,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_clear_active_reg_redist,
- },
- {
- .base = SGI_base(GICR_IPRIORITYR0),
- .len = 0x20,
- .bits_per_irq = 8,
- .handle_mmio = handle_mmio_priority_reg_redist,
- },
- {
- .base = SGI_base(GICR_ICFGR0),
- .len = 0x08,
- .bits_per_irq = 2,
- .handle_mmio = handle_mmio_cfg_reg_redist,
- },
- {
- .base = SGI_base(GICR_IGRPMODR0),
- .len = 0x04,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- .base = SGI_base(GICR_NSACR),
- .len = 0x04,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {},
-};
-
-static bool vgic_v3_queue_sgi(struct kvm_vcpu *vcpu, int irq)
-{
- if (vgic_queue_irq(vcpu, 0, irq)) {
- vgic_dist_irq_clear_pending(vcpu, irq);
- vgic_cpu_irq_clear(vcpu, irq);
- return true;
- }
-
- return false;
-}
-
-static int vgic_v3_map_resources(struct kvm *kvm,
- const struct vgic_params *params)
-{
- int ret = 0;
- struct vgic_dist *dist = &kvm->arch.vgic;
- gpa_t rdbase = dist->vgic_redist_base;
- struct vgic_io_device *iodevs = NULL;
- int i;
-
- if (!irqchip_in_kernel(kvm))
- return 0;
-
- mutex_lock(&kvm->lock);
-
- if (vgic_ready(kvm))
- goto out;
-
- if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
- IS_VGIC_ADDR_UNDEF(dist->vgic_redist_base)) {
- kvm_err("Need to set vgic distributor addresses first\n");
- ret = -ENXIO;
- goto out;
- }
-
- /*
- * For a VGICv3 we require the userland to explicitly initialize
- * the VGIC before we need to use it.
- */
- if (!vgic_initialized(kvm)) {
- ret = -EBUSY;
- goto out;
- }
-
- ret = vgic_register_kvm_io_dev(kvm, dist->vgic_dist_base,
- GIC_V3_DIST_SIZE, vgic_v3_dist_ranges,
- -1, &dist->dist_iodev);
- if (ret)
- goto out;
-
- iodevs = kcalloc(dist->nr_cpus, sizeof(iodevs[0]), GFP_KERNEL);
- if (!iodevs) {
- ret = -ENOMEM;
- goto out_unregister;
- }
-
- for (i = 0; i < dist->nr_cpus; i++) {
- ret = vgic_register_kvm_io_dev(kvm, rdbase,
- SZ_128K, vgic_redist_ranges,
- i, &iodevs[i]);
- if (ret)
- goto out_unregister;
- rdbase += GIC_V3_REDIST_SIZE;
- }
-
- dist->redist_iodevs = iodevs;
- dist->ready = true;
- goto out;
-
-out_unregister:
- kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dist->dist_iodev.dev);
- if (iodevs) {
- for (i = 0; i < dist->nr_cpus; i++) {
- if (iodevs[i].dev.ops)
- kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
- &iodevs[i].dev);
- }
- }
-
-out:
- if (ret)
- kvm_vgic_destroy(kvm);
- mutex_unlock(&kvm->lock);
- return ret;
-}
-
-static int vgic_v3_init_model(struct kvm *kvm)
-{
- int i;
- u32 mpidr;
- struct vgic_dist *dist = &kvm->arch.vgic;
- int nr_spis = dist->nr_irqs - VGIC_NR_PRIVATE_IRQS;
-
- dist->irq_spi_mpidr = kcalloc(nr_spis, sizeof(dist->irq_spi_mpidr[0]),
- GFP_KERNEL);
-
- if (!dist->irq_spi_mpidr)
- return -ENOMEM;
-
- /* Initialize the target VCPUs for each IRQ to VCPU 0 */
- mpidr = compress_mpidr(kvm_vcpu_get_mpidr_aff(kvm_get_vcpu(kvm, 0)));
- for (i = VGIC_NR_PRIVATE_IRQS; i < dist->nr_irqs; i++) {
- dist->irq_spi_cpu[i - VGIC_NR_PRIVATE_IRQS] = 0;
- dist->irq_spi_mpidr[i - VGIC_NR_PRIVATE_IRQS] = mpidr;
- vgic_bitmap_set_irq_val(dist->irq_spi_target, 0, i, 1);
- }
-
- return 0;
-}
-
-/* GICv3 does not keep track of SGI sources anymore. */
-static void vgic_v3_add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source)
-{
-}
-
-void vgic_v3_init_emulation(struct kvm *kvm)
-{
- struct vgic_dist *dist = &kvm->arch.vgic;
-
- dist->vm_ops.queue_sgi = vgic_v3_queue_sgi;
- dist->vm_ops.add_sgi_source = vgic_v3_add_sgi_source;
- dist->vm_ops.init_model = vgic_v3_init_model;
- dist->vm_ops.map_resources = vgic_v3_map_resources;
-
- kvm->arch.max_vcpus = KVM_MAX_VCPUS;
-}
-
-/*
- * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI
- * generation register ICC_SGI1R_EL1) with a given VCPU.
- * If the VCPU's MPIDR matches, return the level0 affinity, otherwise
- * return -1.
- */
-static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu)
-{
- unsigned long affinity;
- int level0;
-
- /*
- * Split the current VCPU's MPIDR into affinity level 0 and the
- * rest as this is what we have to compare against.
- */
- affinity = kvm_vcpu_get_mpidr_aff(vcpu);
- level0 = MPIDR_AFFINITY_LEVEL(affinity, 0);
- affinity &= ~MPIDR_LEVEL_MASK;
-
- /* bail out if the upper three levels don't match */
- if (sgi_aff != affinity)
- return -1;
-
- /* Is this VCPU's bit set in the mask ? */
- if (!(sgi_cpu_mask & BIT(level0)))
- return -1;
-
- return level0;
-}
-
-#define SGI_AFFINITY_LEVEL(reg, level) \
- ((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \
- >> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level))
-
-/**
- * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs
- * @vcpu: The VCPU requesting a SGI
- * @reg: The value written into the ICC_SGI1R_EL1 register by that VCPU
- *
- * With GICv3 (and ARE=1) CPUs trigger SGIs by writing to a system register.
- * This will trap in sys_regs.c and call this function.
- * This ICC_SGI1R_EL1 register contains the upper three affinity levels of the
- * target processors as well as a bitmask of 16 Aff0 CPUs.
- * If the interrupt routing mode bit is not set, we iterate over all VCPUs to
- * check for matching ones. If this bit is set, we signal all, but not the
- * calling VCPU.
- */
-void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
-{
- struct kvm *kvm = vcpu->kvm;
- struct kvm_vcpu *c_vcpu;
- struct vgic_dist *dist = &kvm->arch.vgic;
- u16 target_cpus;
- u64 mpidr;
- int sgi, c;
- int vcpu_id = vcpu->vcpu_id;
- bool broadcast;
- int updated = 0;
-
- sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT;
- broadcast = reg & BIT(ICC_SGI1R_IRQ_ROUTING_MODE_BIT);
- target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT;
- mpidr = SGI_AFFINITY_LEVEL(reg, 3);
- mpidr |= SGI_AFFINITY_LEVEL(reg, 2);
- mpidr |= SGI_AFFINITY_LEVEL(reg, 1);
-
- /*
- * We take the dist lock here, because we come from the sysregs
- * code path and not from the MMIO one (which already takes the lock).
- */
- spin_lock(&dist->lock);
-
- /*
- * We iterate over all VCPUs to find the MPIDRs matching the request.
- * If we have handled one CPU, we clear it's bit to detect early
- * if we are already finished. This avoids iterating through all
- * VCPUs when most of the times we just signal a single VCPU.
- */
- kvm_for_each_vcpu(c, c_vcpu, kvm) {
-
- /* Exit early if we have dealt with all requested CPUs */
- if (!broadcast && target_cpus == 0)
- break;
-
- /* Don't signal the calling VCPU */
- if (broadcast && c == vcpu_id)
- continue;
-
- if (!broadcast) {
- int level0;
-
- level0 = match_mpidr(mpidr, target_cpus, c_vcpu);
- if (level0 == -1)
- continue;
-
- /* remove this matching VCPU from the mask */
- target_cpus &= ~BIT(level0);
- }
-
- /* Flag the SGI as pending */
- vgic_dist_irq_set_pending(c_vcpu, sgi);
- updated = 1;
- kvm_debug("SGI%d from CPU%d to CPU%d\n", sgi, vcpu_id, c);
- }
- if (updated)
- vgic_update_state(vcpu->kvm);
- spin_unlock(&dist->lock);
- if (updated)
- vgic_kick_vcpus(vcpu->kvm);
-}
-
-static int vgic_v3_create(struct kvm_device *dev, u32 type)
-{
- return kvm_vgic_create(dev->kvm, type);
-}
-
-static void vgic_v3_destroy(struct kvm_device *dev)
-{
- kfree(dev);
-}
-
-static int vgic_v3_set_attr(struct kvm_device *dev,
- struct kvm_device_attr *attr)
-{
- int ret;
-
- ret = vgic_set_common_attr(dev, attr);
- if (ret != -ENXIO)
- return ret;
-
- switch (attr->group) {
- case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
- case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
- return -ENXIO;
- }
-
- return -ENXIO;
-}
-
-static int vgic_v3_get_attr(struct kvm_device *dev,
- struct kvm_device_attr *attr)
-{
- int ret;
-
- ret = vgic_get_common_attr(dev, attr);
- if (ret != -ENXIO)
- return ret;
-
- switch (attr->group) {
- case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
- case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
- return -ENXIO;
- }
-
- return -ENXIO;
-}
-
-static int vgic_v3_has_attr(struct kvm_device *dev,
- struct kvm_device_attr *attr)
-{
- switch (attr->group) {
- case KVM_DEV_ARM_VGIC_GRP_ADDR:
- switch (attr->attr) {
- case KVM_VGIC_V2_ADDR_TYPE_DIST:
- case KVM_VGIC_V2_ADDR_TYPE_CPU:
- return -ENXIO;
- case KVM_VGIC_V3_ADDR_TYPE_DIST:
- case KVM_VGIC_V3_ADDR_TYPE_REDIST:
- return 0;
- }
- break;
- case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
- case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
- return -ENXIO;
- case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
- return 0;
- case KVM_DEV_ARM_VGIC_GRP_CTRL:
- switch (attr->attr) {
- case KVM_DEV_ARM_VGIC_CTRL_INIT:
- return 0;
- }
- }
- return -ENXIO;
-}
-
-struct kvm_device_ops kvm_arm_vgic_v3_ops = {
- .name = "kvm-arm-vgic-v3",
- .create = vgic_v3_create,
- .destroy = vgic_v3_destroy,
- .set_attr = vgic_v3_set_attr,
- .get_attr = vgic_v3_get_attr,
- .has_attr = vgic_v3_has_attr,
-};
diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c
deleted file mode 100644
index 75b02fa86436..000000000000
--- a/virt/kvm/arm/vgic-v3.c
+++ /dev/null
@@ -1,279 +0,0 @@
-/*
- * Copyright (C) 2013 ARM Limited, All Rights Reserved.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-
-#include <linux/irqchip/arm-gic-v3.h>
-#include <linux/irqchip/arm-gic-common.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_asm.h>
-#include <asm/kvm_mmu.h>
-
-static u32 ich_vtr_el2;
-
-static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr)
-{
- struct vgic_lr lr_desc;
- u64 val = vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr];
-
- if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
- lr_desc.irq = val & ICH_LR_VIRTUAL_ID_MASK;
- else
- lr_desc.irq = val & GICH_LR_VIRTUALID;
-
- lr_desc.source = 0;
- if (lr_desc.irq <= 15 &&
- vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2)
- lr_desc.source = (val >> GICH_LR_PHYSID_CPUID_SHIFT) & 0x7;
-
- lr_desc.state = 0;
-
- if (val & ICH_LR_PENDING_BIT)
- lr_desc.state |= LR_STATE_PENDING;
- if (val & ICH_LR_ACTIVE_BIT)
- lr_desc.state |= LR_STATE_ACTIVE;
- if (val & ICH_LR_EOI)
- lr_desc.state |= LR_EOI_INT;
- if (val & ICH_LR_HW) {
- lr_desc.state |= LR_HW;
- lr_desc.hwirq = (val >> ICH_LR_PHYS_ID_SHIFT) & GENMASK(9, 0);
- }
-
- return lr_desc;
-}
-
-static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr,
- struct vgic_lr lr_desc)
-{
- u64 lr_val;
-
- lr_val = lr_desc.irq;
-
- /*
- * Currently all guest IRQs are Group1, as Group0 would result
- * in a FIQ in the guest, which it wouldn't expect.
- * Eventually we want to make this configurable, so we may revisit
- * this in the future.
- */
- switch (vcpu->kvm->arch.vgic.vgic_model) {
- case KVM_DEV_TYPE_ARM_VGIC_V3:
- lr_val |= ICH_LR_GROUP;
- break;
- case KVM_DEV_TYPE_ARM_VGIC_V2:
- if (lr_desc.irq < VGIC_NR_SGIS)
- lr_val |= (u32)lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT;
- break;
- default:
- BUG();
- }
-
- if (lr_desc.state & LR_STATE_PENDING)
- lr_val |= ICH_LR_PENDING_BIT;
- if (lr_desc.state & LR_STATE_ACTIVE)
- lr_val |= ICH_LR_ACTIVE_BIT;
- if (lr_desc.state & LR_EOI_INT)
- lr_val |= ICH_LR_EOI;
- if (lr_desc.state & LR_HW) {
- lr_val |= ICH_LR_HW;
- lr_val |= ((u64)lr_desc.hwirq) << ICH_LR_PHYS_ID_SHIFT;
- }
-
- vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = lr_val;
-
- if (!(lr_desc.state & LR_STATE_MASK))
- vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr |= (1U << lr);
- else
- vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr &= ~(1U << lr);
-}
-
-static u64 vgic_v3_get_elrsr(const struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr;
-}
-
-static u64 vgic_v3_get_eisr(const struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.vgic_cpu.vgic_v3.vgic_eisr;
-}
-
-static void vgic_v3_clear_eisr(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.vgic_cpu.vgic_v3.vgic_eisr = 0;
-}
-
-static u32 vgic_v3_get_interrupt_status(const struct kvm_vcpu *vcpu)
-{
- u32 misr = vcpu->arch.vgic_cpu.vgic_v3.vgic_misr;
- u32 ret = 0;
-
- if (misr & ICH_MISR_EOI)
- ret |= INT_STATUS_EOI;
- if (misr & ICH_MISR_U)
- ret |= INT_STATUS_UNDERFLOW;
-
- return ret;
-}
-
-static void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
- u32 vmcr = vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr;
-
- vmcrp->ctlr = (vmcr & ICH_VMCR_CTLR_MASK) >> ICH_VMCR_CTLR_SHIFT;
- vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT;
- vmcrp->bpr = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT;
- vmcrp->pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
-}
-
-static void vgic_v3_enable_underflow(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr |= ICH_HCR_UIE;
-}
-
-static void vgic_v3_disable_underflow(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr &= ~ICH_HCR_UIE;
-}
-
-static void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
- u32 vmcr;
-
- vmcr = (vmcrp->ctlr << ICH_VMCR_CTLR_SHIFT) & ICH_VMCR_CTLR_MASK;
- vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK;
- vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK;
- vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK;
-
- vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr = vmcr;
-}
-
-static void vgic_v3_enable(struct kvm_vcpu *vcpu)
-{
- struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
-
- /*
- * By forcing VMCR to zero, the GIC will restore the binary
- * points to their reset values. Anything else resets to zero
- * anyway.
- */
- vgic_v3->vgic_vmcr = 0;
- vgic_v3->vgic_elrsr = ~0;
-
- /*
- * If we are emulating a GICv3, we do it in an non-GICv2-compatible
- * way, so we force SRE to 1 to demonstrate this to the guest.
- * This goes with the spec allowing the value to be RAO/WI.
- */
- if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
- vgic_v3->vgic_sre = ICC_SRE_EL1_SRE;
- else
- vgic_v3->vgic_sre = 0;
-
- /* Get the show on the road... */
- vgic_v3->vgic_hcr = ICH_HCR_EN;
-}
-
-static const struct vgic_ops vgic_v3_ops = {
- .get_lr = vgic_v3_get_lr,
- .set_lr = vgic_v3_set_lr,
- .get_elrsr = vgic_v3_get_elrsr,
- .get_eisr = vgic_v3_get_eisr,
- .clear_eisr = vgic_v3_clear_eisr,
- .get_interrupt_status = vgic_v3_get_interrupt_status,
- .enable_underflow = vgic_v3_enable_underflow,
- .disable_underflow = vgic_v3_disable_underflow,
- .get_vmcr = vgic_v3_get_vmcr,
- .set_vmcr = vgic_v3_set_vmcr,
- .enable = vgic_v3_enable,
-};
-
-static struct vgic_params vgic_v3_params;
-
-static void vgic_cpu_init_lrs(void *params)
-{
- kvm_call_hyp(__vgic_v3_init_lrs);
-}
-
-/**
- * vgic_v3_probe - probe for a GICv3 compatible interrupt controller
- * @gic_kvm_info: pointer to the GIC description
- * @ops: address of a pointer to the GICv3 operations
- * @params: address of a pointer to HW-specific parameters
- *
- * Returns 0 if a GICv3 has been found, with the low level operations
- * in *ops and the HW parameters in *params. Returns an error code
- * otherwise.
- */
-int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info,
- const struct vgic_ops **ops,
- const struct vgic_params **params)
-{
- int ret = 0;
- struct vgic_params *vgic = &vgic_v3_params;
- const struct resource *vcpu_res = &gic_kvm_info->vcpu;
-
- vgic->maint_irq = gic_kvm_info->maint_irq;
-
- ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2);
-
- /*
- * The ListRegs field is 5 bits, but there is a architectural
- * maximum of 16 list registers. Just ignore bit 4...
- */
- vgic->nr_lr = (ich_vtr_el2 & 0xf) + 1;
- vgic->can_emulate_gicv2 = false;
-
- if (!vcpu_res->start) {
- kvm_info("GICv3: no GICV resource entry\n");
- vgic->vcpu_base = 0;
- } else if (!PAGE_ALIGNED(vcpu_res->start)) {
- pr_warn("GICV physical address 0x%llx not page aligned\n",
- (unsigned long long)vcpu_res->start);
- vgic->vcpu_base = 0;
- } else if (!PAGE_ALIGNED(resource_size(vcpu_res))) {
- pr_warn("GICV size 0x%llx not a multiple of page size 0x%lx\n",
- (unsigned long long)resource_size(vcpu_res),
- PAGE_SIZE);
- } else {
- vgic->vcpu_base = vcpu_res->start;
- vgic->can_emulate_gicv2 = true;
- kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
- KVM_DEV_TYPE_ARM_VGIC_V2);
- }
- if (vgic->vcpu_base == 0)
- kvm_info("disabling GICv2 emulation\n");
- kvm_register_device_ops(&kvm_arm_vgic_v3_ops, KVM_DEV_TYPE_ARM_VGIC_V3);
-
- vgic->vctrl_base = NULL;
- vgic->type = VGIC_V3;
- vgic->max_gic_vcpus = VGIC_V3_MAX_CPUS;
-
- kvm_info("GICV base=0x%llx, IRQ=%d\n",
- vgic->vcpu_base, vgic->maint_irq);
-
- on_each_cpu(vgic_cpu_init_lrs, vgic, 1);
-
- *ops = &vgic_v3_ops;
- *params = vgic;
-
- return ret;
-}
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
deleted file mode 100644
index 67cb5e948be2..000000000000
--- a/virt/kvm/arm/vgic.c
+++ /dev/null
@@ -1,2417 +0,0 @@
-/*
- * Copyright (C) 2012 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-#include <linux/irq.h>
-#include <linux/rculist.h>
-#include <linux/uaccess.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-#include <trace/events/kvm.h>
-#include <asm/kvm.h>
-#include <kvm/iodev.h>
-#include <linux/irqchip/arm-gic-common.h>
-
-#define CREATE_TRACE_POINTS
-#include "trace.h"
-
-/*
- * How the whole thing works (courtesy of Christoffer Dall):
- *
- * - At any time, the dist->irq_pending_on_cpu is the oracle that knows if
- * something is pending on the CPU interface.
- * - Interrupts that are pending on the distributor are stored on the
- * vgic.irq_pending vgic bitmap (this bitmap is updated by both user land
- * ioctls and guest mmio ops, and other in-kernel peripherals such as the
- * arch. timers).
- * - Every time the bitmap changes, the irq_pending_on_cpu oracle is
- * recalculated
- * - To calculate the oracle, we need info for each cpu from
- * compute_pending_for_cpu, which considers:
- * - PPI: dist->irq_pending & dist->irq_enable
- * - SPI: dist->irq_pending & dist->irq_enable & dist->irq_spi_target
- * - irq_spi_target is a 'formatted' version of the GICD_ITARGETSRn
- * registers, stored on each vcpu. We only keep one bit of
- * information per interrupt, making sure that only one vcpu can
- * accept the interrupt.
- * - If any of the above state changes, we must recalculate the oracle.
- * - The same is true when injecting an interrupt, except that we only
- * consider a single interrupt at a time. The irq_spi_cpu array
- * contains the target CPU for each SPI.
- *
- * The handling of level interrupts adds some extra complexity. We
- * need to track when the interrupt has been EOIed, so we can sample
- * the 'line' again. This is achieved as such:
- *
- * - When a level interrupt is moved onto a vcpu, the corresponding
- * bit in irq_queued is set. As long as this bit is set, the line
- * will be ignored for further interrupts. The interrupt is injected
- * into the vcpu with the GICH_LR_EOI bit set (generate a
- * maintenance interrupt on EOI).
- * - When the interrupt is EOIed, the maintenance interrupt fires,
- * and clears the corresponding bit in irq_queued. This allows the
- * interrupt line to be sampled again.
- * - Note that level-triggered interrupts can also be set to pending from
- * writes to GICD_ISPENDRn and lowering the external input line does not
- * cause the interrupt to become inactive in such a situation.
- * Conversely, writes to GICD_ICPENDRn do not cause the interrupt to become
- * inactive as long as the external input line is held high.
- *
- *
- * Initialization rules: there are multiple stages to the vgic
- * initialization, both for the distributor and the CPU interfaces.
- *
- * Distributor:
- *
- * - kvm_vgic_early_init(): initialization of static data that doesn't
- * depend on any sizing information or emulation type. No allocation
- * is allowed there.
- *
- * - vgic_init(): allocation and initialization of the generic data
- * structures that depend on sizing information (number of CPUs,
- * number of interrupts). Also initializes the vcpu specific data
- * structures. Can be executed lazily for GICv2.
- * [to be renamed to kvm_vgic_init??]
- *
- * CPU Interface:
- *
- * - kvm_vgic_cpu_early_init(): initialization of static data that
- * doesn't depend on any sizing information or emulation type. No
- * allocation is allowed there.
- */
-
-#include "vgic.h"
-
-static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu);
-static void vgic_retire_lr(int lr_nr, struct kvm_vcpu *vcpu);
-static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr);
-static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc);
-static u64 vgic_get_elrsr(struct kvm_vcpu *vcpu);
-static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu,
- int virt_irq);
-static int compute_pending_for_cpu(struct kvm_vcpu *vcpu);
-
-static const struct vgic_ops *vgic_ops;
-static const struct vgic_params *vgic;
-
-static void add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source)
-{
- vcpu->kvm->arch.vgic.vm_ops.add_sgi_source(vcpu, irq, source);
-}
-
-static bool queue_sgi(struct kvm_vcpu *vcpu, int irq)
-{
- return vcpu->kvm->arch.vgic.vm_ops.queue_sgi(vcpu, irq);
-}
-
-int kvm_vgic_map_resources(struct kvm *kvm)
-{
- return kvm->arch.vgic.vm_ops.map_resources(kvm, vgic);
-}
-
-/*
- * struct vgic_bitmap contains a bitmap made of unsigned longs, but
- * extracts u32s out of them.
- *
- * This does not work on 64-bit BE systems, because the bitmap access
- * will store two consecutive 32-bit words with the higher-addressed
- * register's bits at the lower index and the lower-addressed register's
- * bits at the higher index.
- *
- * Therefore, swizzle the register index when accessing the 32-bit word
- * registers to access the right register's value.
- */
-#if defined(CONFIG_CPU_BIG_ENDIAN) && BITS_PER_LONG == 64
-#define REG_OFFSET_SWIZZLE 1
-#else
-#define REG_OFFSET_SWIZZLE 0
-#endif
-
-static int vgic_init_bitmap(struct vgic_bitmap *b, int nr_cpus, int nr_irqs)
-{
- int nr_longs;
-
- nr_longs = nr_cpus + BITS_TO_LONGS(nr_irqs - VGIC_NR_PRIVATE_IRQS);
-
- b->private = kzalloc(sizeof(unsigned long) * nr_longs, GFP_KERNEL);
- if (!b->private)
- return -ENOMEM;
-
- b->shared = b->private + nr_cpus;
-
- return 0;
-}
-
-static void vgic_free_bitmap(struct vgic_bitmap *b)
-{
- kfree(b->private);
- b->private = NULL;
- b->shared = NULL;
-}
-
-/*
- * Call this function to convert a u64 value to an unsigned long * bitmask
- * in a way that works on both 32-bit and 64-bit LE and BE platforms.
- *
- * Warning: Calling this function may modify *val.
- */
-static unsigned long *u64_to_bitmask(u64 *val)
-{
-#if defined(CONFIG_CPU_BIG_ENDIAN) && BITS_PER_LONG == 32
- *val = (*val >> 32) | (*val << 32);
-#endif
- return (unsigned long *)val;
-}
-
-u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x, int cpuid, u32 offset)
-{
- offset >>= 2;
- if (!offset)
- return (u32 *)(x->private + cpuid) + REG_OFFSET_SWIZZLE;
- else
- return (u32 *)(x->shared) + ((offset - 1) ^ REG_OFFSET_SWIZZLE);
-}
-
-static int vgic_bitmap_get_irq_val(struct vgic_bitmap *x,
- int cpuid, int irq)
-{
- if (irq < VGIC_NR_PRIVATE_IRQS)
- return test_bit(irq, x->private + cpuid);
-
- return test_bit(irq - VGIC_NR_PRIVATE_IRQS, x->shared);
-}
-
-void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid,
- int irq, int val)
-{
- unsigned long *reg;
-
- if (irq < VGIC_NR_PRIVATE_IRQS) {
- reg = x->private + cpuid;
- } else {
- reg = x->shared;
- irq -= VGIC_NR_PRIVATE_IRQS;
- }
-
- if (val)
- set_bit(irq, reg);
- else
- clear_bit(irq, reg);
-}
-
-static unsigned long *vgic_bitmap_get_cpu_map(struct vgic_bitmap *x, int cpuid)
-{
- return x->private + cpuid;
-}
-
-unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x)
-{
- return x->shared;
-}
-
-static int vgic_init_bytemap(struct vgic_bytemap *x, int nr_cpus, int nr_irqs)
-{
- int size;
-
- size = nr_cpus * VGIC_NR_PRIVATE_IRQS;
- size += nr_irqs - VGIC_NR_PRIVATE_IRQS;
-
- x->private = kzalloc(size, GFP_KERNEL);
- if (!x->private)
- return -ENOMEM;
-
- x->shared = x->private + nr_cpus * VGIC_NR_PRIVATE_IRQS / sizeof(u32);
- return 0;
-}
-
-static void vgic_free_bytemap(struct vgic_bytemap *b)
-{
- kfree(b->private);
- b->private = NULL;
- b->shared = NULL;
-}
-
-u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset)
-{
- u32 *reg;
-
- if (offset < VGIC_NR_PRIVATE_IRQS) {
- reg = x->private;
- offset += cpuid * VGIC_NR_PRIVATE_IRQS;
- } else {
- reg = x->shared;
- offset -= VGIC_NR_PRIVATE_IRQS;
- }
-
- return reg + (offset / sizeof(u32));
-}
-
-#define VGIC_CFG_LEVEL 0
-#define VGIC_CFG_EDGE 1
-
-static bool vgic_irq_is_edge(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- int irq_val;
-
- irq_val = vgic_bitmap_get_irq_val(&dist->irq_cfg, vcpu->vcpu_id, irq);
- return irq_val == VGIC_CFG_EDGE;
-}
-
-static int vgic_irq_is_enabled(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- return vgic_bitmap_get_irq_val(&dist->irq_enabled, vcpu->vcpu_id, irq);
-}
-
-static int vgic_irq_is_queued(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- return vgic_bitmap_get_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq);
-}
-
-static int vgic_irq_is_active(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- return vgic_bitmap_get_irq_val(&dist->irq_active, vcpu->vcpu_id, irq);
-}
-
-static void vgic_irq_set_queued(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- vgic_bitmap_set_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq, 1);
-}
-
-static void vgic_irq_clear_queued(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- vgic_bitmap_set_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq, 0);
-}
-
-static void vgic_irq_set_active(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 1);
-}
-
-static void vgic_irq_clear_active(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 0);
-}
-
-static int vgic_dist_irq_get_level(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- return vgic_bitmap_get_irq_val(&dist->irq_level, vcpu->vcpu_id, irq);
-}
-
-static void vgic_dist_irq_set_level(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- vgic_bitmap_set_irq_val(&dist->irq_level, vcpu->vcpu_id, irq, 1);
-}
-
-static void vgic_dist_irq_clear_level(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- vgic_bitmap_set_irq_val(&dist->irq_level, vcpu->vcpu_id, irq, 0);
-}
-
-static int vgic_dist_irq_soft_pend(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- return vgic_bitmap_get_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq);
-}
-
-static void vgic_dist_irq_clear_soft_pend(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- vgic_bitmap_set_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq, 0);
- if (!vgic_dist_irq_get_level(vcpu, irq)) {
- vgic_dist_irq_clear_pending(vcpu, irq);
- if (!compute_pending_for_cpu(vcpu))
- clear_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
- }
-}
-
-static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- return vgic_bitmap_get_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq);
-}
-
-void vgic_dist_irq_set_pending(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- vgic_bitmap_set_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq, 1);
-}
-
-void vgic_dist_irq_clear_pending(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- vgic_bitmap_set_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq, 0);
-}
-
-static void vgic_cpu_irq_set(struct kvm_vcpu *vcpu, int irq)
-{
- if (irq < VGIC_NR_PRIVATE_IRQS)
- set_bit(irq, vcpu->arch.vgic_cpu.pending_percpu);
- else
- set_bit(irq - VGIC_NR_PRIVATE_IRQS,
- vcpu->arch.vgic_cpu.pending_shared);
-}
-
-void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq)
-{
- if (irq < VGIC_NR_PRIVATE_IRQS)
- clear_bit(irq, vcpu->arch.vgic_cpu.pending_percpu);
- else
- clear_bit(irq - VGIC_NR_PRIVATE_IRQS,
- vcpu->arch.vgic_cpu.pending_shared);
-}
-
-static bool vgic_can_sample_irq(struct kvm_vcpu *vcpu, int irq)
-{
- return !vgic_irq_is_queued(vcpu, irq);
-}
-
-/**
- * vgic_reg_access - access vgic register
- * @mmio: pointer to the data describing the mmio access
- * @reg: pointer to the virtual backing of vgic distributor data
- * @offset: least significant 2 bits used for word offset
- * @mode: ACCESS_ mode (see defines above)
- *
- * Helper to make vgic register access easier using one of the access
- * modes defined for vgic register access
- * (read,raz,write-ignored,setbit,clearbit,write)
- */
-void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg,
- phys_addr_t offset, int mode)
-{
- int word_offset = (offset & 3) * 8;
- u32 mask = (1UL << (mmio->len * 8)) - 1;
- u32 regval;
-
- /*
- * Any alignment fault should have been delivered to the guest
- * directly (ARM ARM B3.12.7 "Prioritization of aborts").
- */
-
- if (reg) {
- regval = *reg;
- } else {
- BUG_ON(mode != (ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED));
- regval = 0;
- }
-
- if (mmio->is_write) {
- u32 data = mmio_data_read(mmio, mask) << word_offset;
- switch (ACCESS_WRITE_MASK(mode)) {
- case ACCESS_WRITE_IGNORED:
- return;
-
- case ACCESS_WRITE_SETBIT:
- regval |= data;
- break;
-
- case ACCESS_WRITE_CLEARBIT:
- regval &= ~data;
- break;
-
- case ACCESS_WRITE_VALUE:
- regval = (regval & ~(mask << word_offset)) | data;
- break;
- }
- *reg = regval;
- } else {
- switch (ACCESS_READ_MASK(mode)) {
- case ACCESS_READ_RAZ:
- regval = 0;
- /* fall through */
-
- case ACCESS_READ_VALUE:
- mmio_data_write(mmio, mask, regval >> word_offset);
- }
- }
-}
-
-bool handle_mmio_raz_wi(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- vgic_reg_access(mmio, NULL, offset,
- ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
- return false;
-}
-
-bool vgic_handle_enable_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
- phys_addr_t offset, int vcpu_id, int access)
-{
- u32 *reg;
- int mode = ACCESS_READ_VALUE | access;
- struct kvm_vcpu *target_vcpu = kvm_get_vcpu(kvm, vcpu_id);
-
- reg = vgic_bitmap_get_reg(&kvm->arch.vgic.irq_enabled, vcpu_id, offset);
- vgic_reg_access(mmio, reg, offset, mode);
- if (mmio->is_write) {
- if (access & ACCESS_WRITE_CLEARBIT) {
- if (offset < 4) /* Force SGI enabled */
- *reg |= 0xffff;
- vgic_retire_disabled_irqs(target_vcpu);
- }
- vgic_update_state(kvm);
- return true;
- }
-
- return false;
-}
-
-bool vgic_handle_set_pending_reg(struct kvm *kvm,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset, int vcpu_id)
-{
- u32 *reg, orig;
- u32 level_mask;
- int mode = ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT;
- struct vgic_dist *dist = &kvm->arch.vgic;
-
- reg = vgic_bitmap_get_reg(&dist->irq_cfg, vcpu_id, offset);
- level_mask = (~(*reg));
-
- /* Mark both level and edge triggered irqs as pending */
- reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu_id, offset);
- orig = *reg;
- vgic_reg_access(mmio, reg, offset, mode);
-
- if (mmio->is_write) {
- /* Set the soft-pending flag only for level-triggered irqs */
- reg = vgic_bitmap_get_reg(&dist->irq_soft_pend,
- vcpu_id, offset);
- vgic_reg_access(mmio, reg, offset, mode);
- *reg &= level_mask;
-
- /* Ignore writes to SGIs */
- if (offset < 2) {
- *reg &= ~0xffff;
- *reg |= orig & 0xffff;
- }
-
- vgic_update_state(kvm);
- return true;
- }
-
- return false;
-}
-
-bool vgic_handle_clear_pending_reg(struct kvm *kvm,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset, int vcpu_id)
-{
- u32 *level_active;
- u32 *reg, orig;
- int mode = ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT;
- struct vgic_dist *dist = &kvm->arch.vgic;
-
- reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu_id, offset);
- orig = *reg;
- vgic_reg_access(mmio, reg, offset, mode);
- if (mmio->is_write) {
- /* Re-set level triggered level-active interrupts */
- level_active = vgic_bitmap_get_reg(&dist->irq_level,
- vcpu_id, offset);
- reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu_id, offset);
- *reg |= *level_active;
-
- /* Ignore writes to SGIs */
- if (offset < 2) {
- *reg &= ~0xffff;
- *reg |= orig & 0xffff;
- }
-
- /* Clear soft-pending flags */
- reg = vgic_bitmap_get_reg(&dist->irq_soft_pend,
- vcpu_id, offset);
- vgic_reg_access(mmio, reg, offset, mode);
-
- vgic_update_state(kvm);
- return true;
- }
- return false;
-}
-
-bool vgic_handle_set_active_reg(struct kvm *kvm,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset, int vcpu_id)
-{
- u32 *reg;
- struct vgic_dist *dist = &kvm->arch.vgic;
-
- reg = vgic_bitmap_get_reg(&dist->irq_active, vcpu_id, offset);
- vgic_reg_access(mmio, reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT);
-
- if (mmio->is_write) {
- vgic_update_state(kvm);
- return true;
- }
-
- return false;
-}
-
-bool vgic_handle_clear_active_reg(struct kvm *kvm,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset, int vcpu_id)
-{
- u32 *reg;
- struct vgic_dist *dist = &kvm->arch.vgic;
-
- reg = vgic_bitmap_get_reg(&dist->irq_active, vcpu_id, offset);
- vgic_reg_access(mmio, reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT);
-
- if (mmio->is_write) {
- vgic_update_state(kvm);
- return true;
- }
-
- return false;
-}
-
-static u32 vgic_cfg_expand(u16 val)
-{
- u32 res = 0;
- int i;
-
- /*
- * Turn a 16bit value like abcd...mnop into a 32bit word
- * a0b0c0d0...m0n0o0p0, which is what the HW cfg register is.
- */
- for (i = 0; i < 16; i++)
- res |= ((val >> i) & VGIC_CFG_EDGE) << (2 * i + 1);
-
- return res;
-}
-
-static u16 vgic_cfg_compress(u32 val)
-{
- u16 res = 0;
- int i;
-
- /*
- * Turn a 32bit word a0b0c0d0...m0n0o0p0 into 16bit value like
- * abcd...mnop which is what we really care about.
- */
- for (i = 0; i < 16; i++)
- res |= ((val >> (i * 2 + 1)) & VGIC_CFG_EDGE) << i;
-
- return res;
-}
-
-/*
- * The distributor uses 2 bits per IRQ for the CFG register, but the
- * LSB is always 0. As such, we only keep the upper bit, and use the
- * two above functions to compress/expand the bits
- */
-bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- u32 val;
-
- if (offset & 4)
- val = *reg >> 16;
- else
- val = *reg & 0xffff;
-
- val = vgic_cfg_expand(val);
- vgic_reg_access(mmio, &val, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
- if (mmio->is_write) {
- /* Ignore writes to read-only SGI and PPI bits */
- if (offset < 8)
- return false;
-
- val = vgic_cfg_compress(val);
- if (offset & 4) {
- *reg &= 0xffff;
- *reg |= val << 16;
- } else {
- *reg &= 0xffff << 16;
- *reg |= val;
- }
- }
-
- return false;
-}
-
-/**
- * vgic_unqueue_irqs - move pending/active IRQs from LRs to the distributor
- * @vgic_cpu: Pointer to the vgic_cpu struct holding the LRs
- *
- * Move any IRQs that have already been assigned to LRs back to the
- * emulated distributor state so that the complete emulated state can be read
- * from the main emulation structures without investigating the LRs.
- */
-void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
-{
- u64 elrsr = vgic_get_elrsr(vcpu);
- unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
- int i;
-
- for_each_clear_bit(i, elrsr_ptr, vgic->nr_lr) {
- struct vgic_lr lr = vgic_get_lr(vcpu, i);
-
- /*
- * There are three options for the state bits:
- *
- * 01: pending
- * 10: active
- * 11: pending and active
- */
- BUG_ON(!(lr.state & LR_STATE_MASK));
-
- /* Reestablish SGI source for pending and active IRQs */
- if (lr.irq < VGIC_NR_SGIS)
- add_sgi_source(vcpu, lr.irq, lr.source);
-
- /*
- * If the LR holds an active (10) or a pending and active (11)
- * interrupt then move the active state to the
- * distributor tracking bit.
- */
- if (lr.state & LR_STATE_ACTIVE)
- vgic_irq_set_active(vcpu, lr.irq);
-
- /*
- * Reestablish the pending state on the distributor and the
- * CPU interface and mark the LR as free for other use.
- */
- vgic_retire_lr(i, vcpu);
-
- /* Finally update the VGIC state. */
- vgic_update_state(vcpu->kvm);
- }
-}
-
-const
-struct vgic_io_range *vgic_find_range(const struct vgic_io_range *ranges,
- int len, gpa_t offset)
-{
- while (ranges->len) {
- if (offset >= ranges->base &&
- (offset + len) <= (ranges->base + ranges->len))
- return ranges;
- ranges++;
- }
-
- return NULL;
-}
-
-static bool vgic_validate_access(const struct vgic_dist *dist,
- const struct vgic_io_range *range,
- unsigned long offset)
-{
- int irq;
-
- if (!range->bits_per_irq)
- return true; /* Not an irq-based access */
-
- irq = offset * 8 / range->bits_per_irq;
- if (irq >= dist->nr_irqs)
- return false;
-
- return true;
-}
-
-/*
- * Call the respective handler function for the given range.
- * We split up any 64 bit accesses into two consecutive 32 bit
- * handler calls and merge the result afterwards.
- * We do this in a little endian fashion regardless of the host's
- * or guest's endianness, because the GIC is always LE and the rest of
- * the code (vgic_reg_access) also puts it in a LE fashion already.
- * At this point we have already identified the handle function, so
- * range points to that one entry and offset is relative to this.
- */
-static bool call_range_handler(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- unsigned long offset,
- const struct vgic_io_range *range)
-{
- struct kvm_exit_mmio mmio32;
- bool ret;
-
- if (likely(mmio->len <= 4))
- return range->handle_mmio(vcpu, mmio, offset);
-
- /*
- * Any access bigger than 4 bytes (that we currently handle in KVM)
- * is actually 8 bytes long, caused by a 64-bit access
- */
-
- mmio32.len = 4;
- mmio32.is_write = mmio->is_write;
- mmio32.private = mmio->private;
-
- mmio32.phys_addr = mmio->phys_addr + 4;
- mmio32.data = &((u32 *)mmio->data)[1];
- ret = range->handle_mmio(vcpu, &mmio32, offset + 4);
-
- mmio32.phys_addr = mmio->phys_addr;
- mmio32.data = &((u32 *)mmio->data)[0];
- ret |= range->handle_mmio(vcpu, &mmio32, offset);
-
- return ret;
-}
-
-/**
- * vgic_handle_mmio_access - handle an in-kernel MMIO access
- * This is called by the read/write KVM IO device wrappers below.
- * @vcpu: pointer to the vcpu performing the access
- * @this: pointer to the KVM IO device in charge
- * @addr: guest physical address of the access
- * @len: size of the access
- * @val: pointer to the data region
- * @is_write: read or write access
- *
- * returns true if the MMIO access could be performed
- */
-static int vgic_handle_mmio_access(struct kvm_vcpu *vcpu,
- struct kvm_io_device *this, gpa_t addr,
- int len, void *val, bool is_write)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- struct vgic_io_device *iodev = container_of(this,
- struct vgic_io_device, dev);
- const struct vgic_io_range *range;
- struct kvm_exit_mmio mmio;
- bool updated_state;
- gpa_t offset;
-
- offset = addr - iodev->addr;
- range = vgic_find_range(iodev->reg_ranges, len, offset);
- if (unlikely(!range || !range->handle_mmio)) {
- pr_warn("Unhandled access %d %08llx %d\n", is_write, addr, len);
- return -ENXIO;
- }
-
- mmio.phys_addr = addr;
- mmio.len = len;
- mmio.is_write = is_write;
- mmio.data = val;
- mmio.private = iodev->redist_vcpu;
-
- spin_lock(&dist->lock);
- offset -= range->base;
- if (vgic_validate_access(dist, range, offset)) {
- updated_state = call_range_handler(vcpu, &mmio, offset, range);
- } else {
- if (!is_write)
- memset(val, 0, len);
- updated_state = false;
- }
- spin_unlock(&dist->lock);
-
- if (updated_state)
- vgic_kick_vcpus(vcpu->kvm);
-
- return 0;
-}
-
-static int vgic_handle_mmio_read(struct kvm_vcpu *vcpu,
- struct kvm_io_device *this,
- gpa_t addr, int len, void *val)
-{
- return vgic_handle_mmio_access(vcpu, this, addr, len, val, false);
-}
-
-static int vgic_handle_mmio_write(struct kvm_vcpu *vcpu,
- struct kvm_io_device *this,
- gpa_t addr, int len, const void *val)
-{
- return vgic_handle_mmio_access(vcpu, this, addr, len, (void *)val,
- true);
-}
-
-static struct kvm_io_device_ops vgic_io_ops = {
- .read = vgic_handle_mmio_read,
- .write = vgic_handle_mmio_write,
-};
-
-/**
- * vgic_register_kvm_io_dev - register VGIC register frame on the KVM I/O bus
- * @kvm: The VM structure pointer
- * @base: The (guest) base address for the register frame
- * @len: Length of the register frame window
- * @ranges: Describing the handler functions for each register
- * @redist_vcpu_id: The VCPU ID to pass on to the handlers on call
- * @iodev: Points to memory to be passed on to the handler
- *
- * @iodev stores the parameters of this function to be usable by the handler
- * respectively the dispatcher function (since the KVM I/O bus framework lacks
- * an opaque parameter). Initialization is done in this function, but the
- * reference should be valid and unique for the whole VGIC lifetime.
- * If the register frame is not mapped for a specific VCPU, pass -1 to
- * @redist_vcpu_id.
- */
-int vgic_register_kvm_io_dev(struct kvm *kvm, gpa_t base, int len,
- const struct vgic_io_range *ranges,
- int redist_vcpu_id,
- struct vgic_io_device *iodev)
-{
- struct kvm_vcpu *vcpu = NULL;
- int ret;
-
- if (redist_vcpu_id >= 0)
- vcpu = kvm_get_vcpu(kvm, redist_vcpu_id);
-
- iodev->addr = base;
- iodev->len = len;
- iodev->reg_ranges = ranges;
- iodev->redist_vcpu = vcpu;
-
- kvm_iodevice_init(&iodev->dev, &vgic_io_ops);
-
- mutex_lock(&kvm->slots_lock);
-
- ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, base, len,
- &iodev->dev);
- mutex_unlock(&kvm->slots_lock);
-
- /* Mark the iodev as invalid if registration fails. */
- if (ret)
- iodev->dev.ops = NULL;
-
- return ret;
-}
-
-static int vgic_nr_shared_irqs(struct vgic_dist *dist)
-{
- return dist->nr_irqs - VGIC_NR_PRIVATE_IRQS;
-}
-
-static int compute_active_for_cpu(struct kvm_vcpu *vcpu)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- unsigned long *active, *enabled, *act_percpu, *act_shared;
- unsigned long active_private, active_shared;
- int nr_shared = vgic_nr_shared_irqs(dist);
- int vcpu_id;
-
- vcpu_id = vcpu->vcpu_id;
- act_percpu = vcpu->arch.vgic_cpu.active_percpu;
- act_shared = vcpu->arch.vgic_cpu.active_shared;
-
- active = vgic_bitmap_get_cpu_map(&dist->irq_active, vcpu_id);
- enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id);
- bitmap_and(act_percpu, active, enabled, VGIC_NR_PRIVATE_IRQS);
-
- active = vgic_bitmap_get_shared_map(&dist->irq_active);
- enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled);
- bitmap_and(act_shared, active, enabled, nr_shared);
- bitmap_and(act_shared, act_shared,
- vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]),
- nr_shared);
-
- active_private = find_first_bit(act_percpu, VGIC_NR_PRIVATE_IRQS);
- active_shared = find_first_bit(act_shared, nr_shared);
-
- return (active_private < VGIC_NR_PRIVATE_IRQS ||
- active_shared < nr_shared);
-}
-
-static int compute_pending_for_cpu(struct kvm_vcpu *vcpu)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- unsigned long *pending, *enabled, *pend_percpu, *pend_shared;
- unsigned long pending_private, pending_shared;
- int nr_shared = vgic_nr_shared_irqs(dist);
- int vcpu_id;
-
- vcpu_id = vcpu->vcpu_id;
- pend_percpu = vcpu->arch.vgic_cpu.pending_percpu;
- pend_shared = vcpu->arch.vgic_cpu.pending_shared;
-
- if (!dist->enabled) {
- bitmap_zero(pend_percpu, VGIC_NR_PRIVATE_IRQS);
- bitmap_zero(pend_shared, nr_shared);
- return 0;
- }
-
- pending = vgic_bitmap_get_cpu_map(&dist->irq_pending, vcpu_id);
- enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id);
- bitmap_and(pend_percpu, pending, enabled, VGIC_NR_PRIVATE_IRQS);
-
- pending = vgic_bitmap_get_shared_map(&dist->irq_pending);
- enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled);
- bitmap_and(pend_shared, pending, enabled, nr_shared);
- bitmap_and(pend_shared, pend_shared,
- vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]),
- nr_shared);
-
- pending_private = find_first_bit(pend_percpu, VGIC_NR_PRIVATE_IRQS);
- pending_shared = find_first_bit(pend_shared, nr_shared);
- return (pending_private < VGIC_NR_PRIVATE_IRQS ||
- pending_shared < vgic_nr_shared_irqs(dist));
-}
-
-/*
- * Update the interrupt state and determine which CPUs have pending
- * or active interrupts. Must be called with distributor lock held.
- */
-void vgic_update_state(struct kvm *kvm)
-{
- struct vgic_dist *dist = &kvm->arch.vgic;
- struct kvm_vcpu *vcpu;
- int c;
-
- kvm_for_each_vcpu(c, vcpu, kvm) {
- if (compute_pending_for_cpu(vcpu))
- set_bit(c, dist->irq_pending_on_cpu);
-
- if (compute_active_for_cpu(vcpu))
- set_bit(c, dist->irq_active_on_cpu);
- else
- clear_bit(c, dist->irq_active_on_cpu);
- }
-}
-
-static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr)
-{
- return vgic_ops->get_lr(vcpu, lr);
-}
-
-static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr,
- struct vgic_lr vlr)
-{
- vgic_ops->set_lr(vcpu, lr, vlr);
-}
-
-static inline u64 vgic_get_elrsr(struct kvm_vcpu *vcpu)
-{
- return vgic_ops->get_elrsr(vcpu);
-}
-
-static inline u64 vgic_get_eisr(struct kvm_vcpu *vcpu)
-{
- return vgic_ops->get_eisr(vcpu);
-}
-
-static inline void vgic_clear_eisr(struct kvm_vcpu *vcpu)
-{
- vgic_ops->clear_eisr(vcpu);
-}
-
-static inline u32 vgic_get_interrupt_status(struct kvm_vcpu *vcpu)
-{
- return vgic_ops->get_interrupt_status(vcpu);
-}
-
-static inline void vgic_enable_underflow(struct kvm_vcpu *vcpu)
-{
- vgic_ops->enable_underflow(vcpu);
-}
-
-static inline void vgic_disable_underflow(struct kvm_vcpu *vcpu)
-{
- vgic_ops->disable_underflow(vcpu);
-}
-
-void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
-{
- vgic_ops->get_vmcr(vcpu, vmcr);
-}
-
-void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
-{
- vgic_ops->set_vmcr(vcpu, vmcr);
-}
-
-static inline void vgic_enable(struct kvm_vcpu *vcpu)
-{
- vgic_ops->enable(vcpu);
-}
-
-static void vgic_retire_lr(int lr_nr, struct kvm_vcpu *vcpu)
-{
- struct vgic_lr vlr = vgic_get_lr(vcpu, lr_nr);
-
- vgic_irq_clear_queued(vcpu, vlr.irq);
-
- /*
- * We must transfer the pending state back to the distributor before
- * retiring the LR, otherwise we may loose edge-triggered interrupts.
- */
- if (vlr.state & LR_STATE_PENDING) {
- vgic_dist_irq_set_pending(vcpu, vlr.irq);
- vlr.hwirq = 0;
- }
-
- vlr.state = 0;
- vgic_set_lr(vcpu, lr_nr, vlr);
-}
-
-static bool dist_active_irq(struct kvm_vcpu *vcpu)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- return test_bit(vcpu->vcpu_id, dist->irq_active_on_cpu);
-}
-
-bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq)
-{
- int i;
-
- for (i = 0; i < vgic->nr_lr; i++) {
- struct vgic_lr vlr = vgic_get_lr(vcpu, i);
-
- if (vlr.irq == virt_irq && vlr.state & LR_STATE_ACTIVE)
- return true;
- }
-
- return vgic_irq_is_active(vcpu, virt_irq);
-}
-
-/*
- * An interrupt may have been disabled after being made pending on the
- * CPU interface (the classic case is a timer running while we're
- * rebooting the guest - the interrupt would kick as soon as the CPU
- * interface gets enabled, with deadly consequences).
- *
- * The solution is to examine already active LRs, and check the
- * interrupt is still enabled. If not, just retire it.
- */
-static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu)
-{
- u64 elrsr = vgic_get_elrsr(vcpu);
- unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
- int lr;
-
- for_each_clear_bit(lr, elrsr_ptr, vgic->nr_lr) {
- struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
-
- if (!vgic_irq_is_enabled(vcpu, vlr.irq))
- vgic_retire_lr(lr, vcpu);
- }
-}
-
-static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
- int lr_nr, struct vgic_lr vlr)
-{
- if (vgic_irq_is_active(vcpu, irq)) {
- vlr.state |= LR_STATE_ACTIVE;
- kvm_debug("Set active, clear distributor: 0x%x\n", vlr.state);
- vgic_irq_clear_active(vcpu, irq);
- vgic_update_state(vcpu->kvm);
- } else {
- WARN_ON(!vgic_dist_irq_is_pending(vcpu, irq));
- vlr.state |= LR_STATE_PENDING;
- kvm_debug("Set pending: 0x%x\n", vlr.state);
- }
-
- if (!vgic_irq_is_edge(vcpu, irq))
- vlr.state |= LR_EOI_INT;
-
- if (vlr.irq >= VGIC_NR_SGIS) {
- struct irq_phys_map *map;
- map = vgic_irq_map_search(vcpu, irq);
-
- if (map) {
- vlr.hwirq = map->phys_irq;
- vlr.state |= LR_HW;
- vlr.state &= ~LR_EOI_INT;
-
- /*
- * Make sure we're not going to sample this
- * again, as a HW-backed interrupt cannot be
- * in the PENDING_ACTIVE stage.
- */
- vgic_irq_set_queued(vcpu, irq);
- }
- }
-
- vgic_set_lr(vcpu, lr_nr, vlr);
-}
-
-/*
- * Queue an interrupt to a CPU virtual interface. Return true on success,
- * or false if it wasn't possible to queue it.
- * sgi_source must be zero for any non-SGI interrupts.
- */
-bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- u64 elrsr = vgic_get_elrsr(vcpu);
- unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
- struct vgic_lr vlr;
- int lr;
-
- /* Sanitize the input... */
- BUG_ON(sgi_source_id & ~7);
- BUG_ON(sgi_source_id && irq >= VGIC_NR_SGIS);
- BUG_ON(irq >= dist->nr_irqs);
-
- kvm_debug("Queue IRQ%d\n", irq);
-
- /* Do we have an active interrupt for the same CPUID? */
- for_each_clear_bit(lr, elrsr_ptr, vgic->nr_lr) {
- vlr = vgic_get_lr(vcpu, lr);
- if (vlr.irq == irq && vlr.source == sgi_source_id) {
- kvm_debug("LR%d piggyback for IRQ%d\n", lr, vlr.irq);
- vgic_queue_irq_to_lr(vcpu, irq, lr, vlr);
- return true;
- }
- }
-
- /* Try to use another LR for this interrupt */
- lr = find_first_bit(elrsr_ptr, vgic->nr_lr);
- if (lr >= vgic->nr_lr)
- return false;
-
- kvm_debug("LR%d allocated for IRQ%d %x\n", lr, irq, sgi_source_id);
-
- vlr.irq = irq;
- vlr.source = sgi_source_id;
- vlr.state = 0;
- vgic_queue_irq_to_lr(vcpu, irq, lr, vlr);
-
- return true;
-}
-
-static bool vgic_queue_hwirq(struct kvm_vcpu *vcpu, int irq)
-{
- if (!vgic_can_sample_irq(vcpu, irq))
- return true; /* level interrupt, already queued */
-
- if (vgic_queue_irq(vcpu, 0, irq)) {
- if (vgic_irq_is_edge(vcpu, irq)) {
- vgic_dist_irq_clear_pending(vcpu, irq);
- vgic_cpu_irq_clear(vcpu, irq);
- } else {
- vgic_irq_set_queued(vcpu, irq);
- }
-
- return true;
- }
-
- return false;
-}
-
-/*
- * Fill the list registers with pending interrupts before running the
- * guest.
- */
-static void __kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
-{
- struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- unsigned long *pa_percpu, *pa_shared;
- int i, vcpu_id;
- int overflow = 0;
- int nr_shared = vgic_nr_shared_irqs(dist);
-
- vcpu_id = vcpu->vcpu_id;
-
- pa_percpu = vcpu->arch.vgic_cpu.pend_act_percpu;
- pa_shared = vcpu->arch.vgic_cpu.pend_act_shared;
-
- bitmap_or(pa_percpu, vgic_cpu->pending_percpu, vgic_cpu->active_percpu,
- VGIC_NR_PRIVATE_IRQS);
- bitmap_or(pa_shared, vgic_cpu->pending_shared, vgic_cpu->active_shared,
- nr_shared);
- /*
- * We may not have any pending interrupt, or the interrupts
- * may have been serviced from another vcpu. In all cases,
- * move along.
- */
- if (!kvm_vgic_vcpu_pending_irq(vcpu) && !dist_active_irq(vcpu))
- goto epilog;
-
- /* SGIs */
- for_each_set_bit(i, pa_percpu, VGIC_NR_SGIS) {
- if (!queue_sgi(vcpu, i))
- overflow = 1;
- }
-
- /* PPIs */
- for_each_set_bit_from(i, pa_percpu, VGIC_NR_PRIVATE_IRQS) {
- if (!vgic_queue_hwirq(vcpu, i))
- overflow = 1;
- }
-
- /* SPIs */
- for_each_set_bit(i, pa_shared, nr_shared) {
- if (!vgic_queue_hwirq(vcpu, i + VGIC_NR_PRIVATE_IRQS))
- overflow = 1;
- }
-
-
-
-
-epilog:
- if (overflow) {
- vgic_enable_underflow(vcpu);
- } else {
- vgic_disable_underflow(vcpu);
- /*
- * We're about to run this VCPU, and we've consumed
- * everything the distributor had in store for
- * us. Claim we don't have anything pending. We'll
- * adjust that if needed while exiting.
- */
- clear_bit(vcpu_id, dist->irq_pending_on_cpu);
- }
-}
-
-static int process_queued_irq(struct kvm_vcpu *vcpu,
- int lr, struct vgic_lr vlr)
-{
- int pending = 0;
-
- /*
- * If the IRQ was EOIed (called from vgic_process_maintenance) or it
- * went from active to non-active (called from vgic_sync_hwirq) it was
- * also ACKed and we we therefore assume we can clear the soft pending
- * state (should it had been set) for this interrupt.
- *
- * Note: if the IRQ soft pending state was set after the IRQ was
- * acked, it actually shouldn't be cleared, but we have no way of
- * knowing that unless we start trapping ACKs when the soft-pending
- * state is set.
- */
- vgic_dist_irq_clear_soft_pend(vcpu, vlr.irq);
-
- /*
- * Tell the gic to start sampling this interrupt again.
- */
- vgic_irq_clear_queued(vcpu, vlr.irq);
-
- /* Any additional pending interrupt? */
- if (vgic_irq_is_edge(vcpu, vlr.irq)) {
- BUG_ON(!(vlr.state & LR_HW));
- pending = vgic_dist_irq_is_pending(vcpu, vlr.irq);
- } else {
- if (vgic_dist_irq_get_level(vcpu, vlr.irq)) {
- vgic_cpu_irq_set(vcpu, vlr.irq);
- pending = 1;
- } else {
- vgic_dist_irq_clear_pending(vcpu, vlr.irq);
- vgic_cpu_irq_clear(vcpu, vlr.irq);
- }
- }
-
- /*
- * Despite being EOIed, the LR may not have
- * been marked as empty.
- */
- vlr.state = 0;
- vlr.hwirq = 0;
- vgic_set_lr(vcpu, lr, vlr);
-
- return pending;
-}
-
-static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
-{
- u32 status = vgic_get_interrupt_status(vcpu);
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- struct kvm *kvm = vcpu->kvm;
- int level_pending = 0;
-
- kvm_debug("STATUS = %08x\n", status);
-
- if (status & INT_STATUS_EOI) {
- /*
- * Some level interrupts have been EOIed. Clear their
- * active bit.
- */
- u64 eisr = vgic_get_eisr(vcpu);
- unsigned long *eisr_ptr = u64_to_bitmask(&eisr);
- int lr;
-
- for_each_set_bit(lr, eisr_ptr, vgic->nr_lr) {
- struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
-
- WARN_ON(vgic_irq_is_edge(vcpu, vlr.irq));
- WARN_ON(vlr.state & LR_STATE_MASK);
-
-
- /*
- * kvm_notify_acked_irq calls kvm_set_irq()
- * to reset the IRQ level, which grabs the dist->lock
- * so we call this before taking the dist->lock.
- */
- kvm_notify_acked_irq(kvm, 0,
- vlr.irq - VGIC_NR_PRIVATE_IRQS);
-
- spin_lock(&dist->lock);
- level_pending |= process_queued_irq(vcpu, lr, vlr);
- spin_unlock(&dist->lock);
- }
- }
-
- if (status & INT_STATUS_UNDERFLOW)
- vgic_disable_underflow(vcpu);
-
- /*
- * In the next iterations of the vcpu loop, if we sync the vgic state
- * after flushing it, but before entering the guest (this happens for
- * pending signals and vmid rollovers), then make sure we don't pick
- * up any old maintenance interrupts here.
- */
- vgic_clear_eisr(vcpu);
-
- return level_pending;
-}
-
-/*
- * Save the physical active state, and reset it to inactive.
- *
- * Return true if there's a pending forwarded interrupt to queue.
- */
-static bool vgic_sync_hwirq(struct kvm_vcpu *vcpu, int lr, struct vgic_lr vlr)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- bool level_pending;
-
- if (!(vlr.state & LR_HW))
- return false;
-
- if (vlr.state & LR_STATE_ACTIVE)
- return false;
-
- spin_lock(&dist->lock);
- level_pending = process_queued_irq(vcpu, lr, vlr);
- spin_unlock(&dist->lock);
- return level_pending;
-}
-
-/* Sync back the VGIC state after a guest run */
-static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- u64 elrsr;
- unsigned long *elrsr_ptr;
- int lr, pending;
- bool level_pending;
-
- level_pending = vgic_process_maintenance(vcpu);
-
- /* Deal with HW interrupts, and clear mappings for empty LRs */
- for (lr = 0; lr < vgic->nr_lr; lr++) {
- struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
-
- level_pending |= vgic_sync_hwirq(vcpu, lr, vlr);
- BUG_ON(vlr.irq >= dist->nr_irqs);
- }
-
- /* Check if we still have something up our sleeve... */
- elrsr = vgic_get_elrsr(vcpu);
- elrsr_ptr = u64_to_bitmask(&elrsr);
- pending = find_first_zero_bit(elrsr_ptr, vgic->nr_lr);
- if (level_pending || pending < vgic->nr_lr)
- set_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
-}
-
-void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- if (!irqchip_in_kernel(vcpu->kvm))
- return;
-
- spin_lock(&dist->lock);
- __kvm_vgic_flush_hwstate(vcpu);
- spin_unlock(&dist->lock);
-}
-
-void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
-{
- if (!irqchip_in_kernel(vcpu->kvm))
- return;
-
- __kvm_vgic_sync_hwstate(vcpu);
-}
-
-int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- if (!irqchip_in_kernel(vcpu->kvm))
- return 0;
-
- return test_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
-}
-
-void vgic_kick_vcpus(struct kvm *kvm)
-{
- struct kvm_vcpu *vcpu;
- int c;
-
- /*
- * We've injected an interrupt, time to find out who deserves
- * a good kick...
- */
- kvm_for_each_vcpu(c, vcpu, kvm) {
- if (kvm_vgic_vcpu_pending_irq(vcpu))
- kvm_vcpu_kick(vcpu);
- }
-}
-
-static int vgic_validate_injection(struct kvm_vcpu *vcpu, int irq, int level)
-{
- int edge_triggered = vgic_irq_is_edge(vcpu, irq);
-
- /*
- * Only inject an interrupt if:
- * - edge triggered and we have a rising edge
- * - level triggered and we change level
- */
- if (edge_triggered) {
- int state = vgic_dist_irq_is_pending(vcpu, irq);
- return level > state;
- } else {
- int state = vgic_dist_irq_get_level(vcpu, irq);
- return level != state;
- }
-}
-
-static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
- unsigned int irq_num, bool level)
-{
- struct vgic_dist *dist = &kvm->arch.vgic;
- struct kvm_vcpu *vcpu;
- int edge_triggered, level_triggered;
- int enabled;
- bool ret = true, can_inject = true;
-
- trace_vgic_update_irq_pending(cpuid, irq_num, level);
-
- if (irq_num >= min(kvm->arch.vgic.nr_irqs, 1020))
- return -EINVAL;
-
- spin_lock(&dist->lock);
-
- vcpu = kvm_get_vcpu(kvm, cpuid);
- edge_triggered = vgic_irq_is_edge(vcpu, irq_num);
- level_triggered = !edge_triggered;
-
- if (!vgic_validate_injection(vcpu, irq_num, level)) {
- ret = false;
- goto out;
- }
-
- if (irq_num >= VGIC_NR_PRIVATE_IRQS) {
- cpuid = dist->irq_spi_cpu[irq_num - VGIC_NR_PRIVATE_IRQS];
- if (cpuid == VCPU_NOT_ALLOCATED) {
- /* Pretend we use CPU0, and prevent injection */
- cpuid = 0;
- can_inject = false;
- }
- vcpu = kvm_get_vcpu(kvm, cpuid);
- }
-
- kvm_debug("Inject IRQ%d level %d CPU%d\n", irq_num, level, cpuid);
-
- if (level) {
- if (level_triggered)
- vgic_dist_irq_set_level(vcpu, irq_num);
- vgic_dist_irq_set_pending(vcpu, irq_num);
- } else {
- if (level_triggered) {
- vgic_dist_irq_clear_level(vcpu, irq_num);
- if (!vgic_dist_irq_soft_pend(vcpu, irq_num)) {
- vgic_dist_irq_clear_pending(vcpu, irq_num);
- vgic_cpu_irq_clear(vcpu, irq_num);
- if (!compute_pending_for_cpu(vcpu))
- clear_bit(cpuid, dist->irq_pending_on_cpu);
- }
- }
-
- ret = false;
- goto out;
- }
-
- enabled = vgic_irq_is_enabled(vcpu, irq_num);
-
- if (!enabled || !can_inject) {
- ret = false;
- goto out;
- }
-
- if (!vgic_can_sample_irq(vcpu, irq_num)) {
- /*
- * Level interrupt in progress, will be picked up
- * when EOId.
- */
- ret = false;
- goto out;
- }
-
- if (level) {
- vgic_cpu_irq_set(vcpu, irq_num);
- set_bit(cpuid, dist->irq_pending_on_cpu);
- }
-
-out:
- spin_unlock(&dist->lock);
-
- if (ret) {
- /* kick the specified vcpu */
- kvm_vcpu_kick(kvm_get_vcpu(kvm, cpuid));
- }
-
- return 0;
-}
-
-static int vgic_lazy_init(struct kvm *kvm)
-{
- int ret = 0;
-
- if (unlikely(!vgic_initialized(kvm))) {
- /*
- * We only provide the automatic initialization of the VGIC
- * for the legacy case of a GICv2. Any other type must
- * be explicitly initialized once setup with the respective
- * KVM device call.
- */
- if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2)
- return -EBUSY;
-
- mutex_lock(&kvm->lock);
- ret = vgic_init(kvm);
- mutex_unlock(&kvm->lock);
- }
-
- return ret;
-}
-
-/**
- * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic
- * @kvm: The VM structure pointer
- * @cpuid: The CPU for PPIs
- * @irq_num: The IRQ number that is assigned to the device. This IRQ
- * must not be mapped to a HW interrupt.
- * @level: Edge-triggered: true: to trigger the interrupt
- * false: to ignore the call
- * Level-sensitive true: raise the input signal
- * false: lower the input signal
- *
- * The GIC is not concerned with devices being active-LOW or active-HIGH for
- * level-sensitive interrupts. You can think of the level parameter as 1
- * being HIGH and 0 being LOW and all devices being active-HIGH.
- */
-int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
- bool level)
-{
- struct irq_phys_map *map;
- int ret;
-
- ret = vgic_lazy_init(kvm);
- if (ret)
- return ret;
-
- map = vgic_irq_map_search(kvm_get_vcpu(kvm, cpuid), irq_num);
- if (map)
- return -EINVAL;
-
- return vgic_update_irq_pending(kvm, cpuid, irq_num, level);
-}
-
-/**
- * kvm_vgic_inject_mapped_irq - Inject a physically mapped IRQ to the vgic
- * @kvm: The VM structure pointer
- * @cpuid: The CPU for PPIs
- * @virt_irq: The virtual IRQ to be injected
- * @level: Edge-triggered: true: to trigger the interrupt
- * false: to ignore the call
- * Level-sensitive true: raise the input signal
- * false: lower the input signal
- *
- * The GIC is not concerned with devices being active-LOW or active-HIGH for
- * level-sensitive interrupts. You can think of the level parameter as 1
- * being HIGH and 0 being LOW and all devices being active-HIGH.
- */
-int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid,
- unsigned int virt_irq, bool level)
-{
- int ret;
-
- ret = vgic_lazy_init(kvm);
- if (ret)
- return ret;
-
- return vgic_update_irq_pending(kvm, cpuid, virt_irq, level);
-}
-
-static irqreturn_t vgic_maintenance_handler(int irq, void *data)
-{
- /*
- * We cannot rely on the vgic maintenance interrupt to be
- * delivered synchronously. This means we can only use it to
- * exit the VM, and we perform the handling of EOIed
- * interrupts on the exit path (see vgic_process_maintenance).
- */
- return IRQ_HANDLED;
-}
-
-static struct list_head *vgic_get_irq_phys_map_list(struct kvm_vcpu *vcpu,
- int virt_irq)
-{
- if (virt_irq < VGIC_NR_PRIVATE_IRQS)
- return &vcpu->arch.vgic_cpu.irq_phys_map_list;
- else
- return &vcpu->kvm->arch.vgic.irq_phys_map_list;
-}
-
-/**
- * kvm_vgic_map_phys_irq - map a virtual IRQ to a physical IRQ
- * @vcpu: The VCPU pointer
- * @virt_irq: The virtual IRQ number for the guest
- * @phys_irq: The hardware IRQ number of the host
- *
- * Establish a mapping between a guest visible irq (@virt_irq) and a
- * hardware irq (@phys_irq). On injection, @virt_irq will be associated with
- * the physical interrupt represented by @phys_irq. This mapping can be
- * established multiple times as long as the parameters are the same.
- *
- * Returns 0 on success or an error value otherwise.
- */
-int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int phys_irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- struct list_head *root = vgic_get_irq_phys_map_list(vcpu, virt_irq);
- struct irq_phys_map *map;
- struct irq_phys_map_entry *entry;
- int ret = 0;
-
- /* Create a new mapping */
- entry = kzalloc(sizeof(*entry), GFP_KERNEL);
- if (!entry)
- return -ENOMEM;
-
- spin_lock(&dist->irq_phys_map_lock);
-
- /* Try to match an existing mapping */
- map = vgic_irq_map_search(vcpu, virt_irq);
- if (map) {
- /* Make sure this mapping matches */
- if (map->phys_irq != phys_irq)
- ret = -EINVAL;
-
- /* Found an existing, valid mapping */
- goto out;
- }
-
- map = &entry->map;
- map->virt_irq = virt_irq;
- map->phys_irq = phys_irq;
-
- list_add_tail_rcu(&entry->entry, root);
-
-out:
- spin_unlock(&dist->irq_phys_map_lock);
- /* If we've found a hit in the existing list, free the useless
- * entry */
- if (ret || map != &entry->map)
- kfree(entry);
- return ret;
-}
-
-static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu,
- int virt_irq)
-{
- struct list_head *root = vgic_get_irq_phys_map_list(vcpu, virt_irq);
- struct irq_phys_map_entry *entry;
- struct irq_phys_map *map;
-
- rcu_read_lock();
-
- list_for_each_entry_rcu(entry, root, entry) {
- map = &entry->map;
- if (map->virt_irq == virt_irq) {
- rcu_read_unlock();
- return map;
- }
- }
-
- rcu_read_unlock();
-
- return NULL;
-}
-
-static void vgic_free_phys_irq_map_rcu(struct rcu_head *rcu)
-{
- struct irq_phys_map_entry *entry;
-
- entry = container_of(rcu, struct irq_phys_map_entry, rcu);
- kfree(entry);
-}
-
-/**
- * kvm_vgic_unmap_phys_irq - Remove a virtual to physical IRQ mapping
- * @vcpu: The VCPU pointer
- * @virt_irq: The virtual IRQ number to be unmapped
- *
- * Remove an existing mapping between virtual and physical interrupts.
- */
-int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- struct irq_phys_map_entry *entry;
- struct list_head *root;
-
- root = vgic_get_irq_phys_map_list(vcpu, virt_irq);
-
- spin_lock(&dist->irq_phys_map_lock);
-
- list_for_each_entry(entry, root, entry) {
- if (entry->map.virt_irq == virt_irq) {
- list_del_rcu(&entry->entry);
- call_rcu(&entry->rcu, vgic_free_phys_irq_map_rcu);
- break;
- }
- }
-
- spin_unlock(&dist->irq_phys_map_lock);
-
- return 0;
-}
-
-static void vgic_destroy_irq_phys_map(struct kvm *kvm, struct list_head *root)
-{
- struct vgic_dist *dist = &kvm->arch.vgic;
- struct irq_phys_map_entry *entry;
-
- spin_lock(&dist->irq_phys_map_lock);
-
- list_for_each_entry(entry, root, entry) {
- list_del_rcu(&entry->entry);
- call_rcu(&entry->rcu, vgic_free_phys_irq_map_rcu);
- }
-
- spin_unlock(&dist->irq_phys_map_lock);
-}
-
-void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
-{
- struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-
- kfree(vgic_cpu->pending_shared);
- kfree(vgic_cpu->active_shared);
- kfree(vgic_cpu->pend_act_shared);
- vgic_destroy_irq_phys_map(vcpu->kvm, &vgic_cpu->irq_phys_map_list);
- vgic_cpu->pending_shared = NULL;
- vgic_cpu->active_shared = NULL;
- vgic_cpu->pend_act_shared = NULL;
-}
-
-static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs)
-{
- struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
- int nr_longs = BITS_TO_LONGS(nr_irqs - VGIC_NR_PRIVATE_IRQS);
- int sz = nr_longs * sizeof(unsigned long);
- vgic_cpu->pending_shared = kzalloc(sz, GFP_KERNEL);
- vgic_cpu->active_shared = kzalloc(sz, GFP_KERNEL);
- vgic_cpu->pend_act_shared = kzalloc(sz, GFP_KERNEL);
-
- if (!vgic_cpu->pending_shared
- || !vgic_cpu->active_shared
- || !vgic_cpu->pend_act_shared) {
- kvm_vgic_vcpu_destroy(vcpu);
- return -ENOMEM;
- }
-
- return 0;
-}
-
-/**
- * kvm_vgic_vcpu_early_init - Earliest possible per-vcpu vgic init stage
- *
- * No memory allocation should be performed here, only static init.
- */
-void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu)
-{
- struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
- INIT_LIST_HEAD(&vgic_cpu->irq_phys_map_list);
-}
-
-/**
- * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
- *
- * The host's GIC naturally limits the maximum amount of VCPUs a guest
- * can use.
- */
-int kvm_vgic_get_max_vcpus(void)
-{
- return vgic->max_gic_vcpus;
-}
-
-void kvm_vgic_destroy(struct kvm *kvm)
-{
- struct vgic_dist *dist = &kvm->arch.vgic;
- struct kvm_vcpu *vcpu;
- int i;
-
- kvm_for_each_vcpu(i, vcpu, kvm)
- kvm_vgic_vcpu_destroy(vcpu);
-
- vgic_free_bitmap(&dist->irq_enabled);
- vgic_free_bitmap(&dist->irq_level);
- vgic_free_bitmap(&dist->irq_pending);
- vgic_free_bitmap(&dist->irq_soft_pend);
- vgic_free_bitmap(&dist->irq_queued);
- vgic_free_bitmap(&dist->irq_cfg);
- vgic_free_bytemap(&dist->irq_priority);
- if (dist->irq_spi_target) {
- for (i = 0; i < dist->nr_cpus; i++)
- vgic_free_bitmap(&dist->irq_spi_target[i]);
- }
- kfree(dist->irq_sgi_sources);
- kfree(dist->irq_spi_cpu);
- kfree(dist->irq_spi_mpidr);
- kfree(dist->irq_spi_target);
- kfree(dist->irq_pending_on_cpu);
- kfree(dist->irq_active_on_cpu);
- vgic_destroy_irq_phys_map(kvm, &dist->irq_phys_map_list);
- dist->irq_sgi_sources = NULL;
- dist->irq_spi_cpu = NULL;
- dist->irq_spi_target = NULL;
- dist->irq_pending_on_cpu = NULL;
- dist->irq_active_on_cpu = NULL;
- dist->nr_cpus = 0;
-}
-
-/*
- * Allocate and initialize the various data structures. Must be called
- * with kvm->lock held!
- */
-int vgic_init(struct kvm *kvm)
-{
- struct vgic_dist *dist = &kvm->arch.vgic;
- struct kvm_vcpu *vcpu;
- int nr_cpus, nr_irqs;
- int ret, i, vcpu_id;
-
- if (vgic_initialized(kvm))
- return 0;
-
- nr_cpus = dist->nr_cpus = atomic_read(&kvm->online_vcpus);
- if (!nr_cpus) /* No vcpus? Can't be good... */
- return -ENODEV;
-
- /*
- * If nobody configured the number of interrupts, use the
- * legacy one.
- */
- if (!dist->nr_irqs)
- dist->nr_irqs = VGIC_NR_IRQS_LEGACY;
-
- nr_irqs = dist->nr_irqs;
-
- ret = vgic_init_bitmap(&dist->irq_enabled, nr_cpus, nr_irqs);
- ret |= vgic_init_bitmap(&dist->irq_level, nr_cpus, nr_irqs);
- ret |= vgic_init_bitmap(&dist->irq_pending, nr_cpus, nr_irqs);
- ret |= vgic_init_bitmap(&dist->irq_soft_pend, nr_cpus, nr_irqs);
- ret |= vgic_init_bitmap(&dist->irq_queued, nr_cpus, nr_irqs);
- ret |= vgic_init_bitmap(&dist->irq_active, nr_cpus, nr_irqs);
- ret |= vgic_init_bitmap(&dist->irq_cfg, nr_cpus, nr_irqs);
- ret |= vgic_init_bytemap(&dist->irq_priority, nr_cpus, nr_irqs);
-
- if (ret)
- goto out;
-
- dist->irq_sgi_sources = kzalloc(nr_cpus * VGIC_NR_SGIS, GFP_KERNEL);
- dist->irq_spi_cpu = kzalloc(nr_irqs - VGIC_NR_PRIVATE_IRQS, GFP_KERNEL);
- dist->irq_spi_target = kzalloc(sizeof(*dist->irq_spi_target) * nr_cpus,
- GFP_KERNEL);
- dist->irq_pending_on_cpu = kzalloc(BITS_TO_LONGS(nr_cpus) * sizeof(long),
- GFP_KERNEL);
- dist->irq_active_on_cpu = kzalloc(BITS_TO_LONGS(nr_cpus) * sizeof(long),
- GFP_KERNEL);
- if (!dist->irq_sgi_sources ||
- !dist->irq_spi_cpu ||
- !dist->irq_spi_target ||
- !dist->irq_pending_on_cpu ||
- !dist->irq_active_on_cpu) {
- ret = -ENOMEM;
- goto out;
- }
-
- for (i = 0; i < nr_cpus; i++)
- ret |= vgic_init_bitmap(&dist->irq_spi_target[i],
- nr_cpus, nr_irqs);
-
- if (ret)
- goto out;
-
- ret = kvm->arch.vgic.vm_ops.init_model(kvm);
- if (ret)
- goto out;
-
- kvm_for_each_vcpu(vcpu_id, vcpu, kvm) {
- ret = vgic_vcpu_init_maps(vcpu, nr_irqs);
- if (ret) {
- kvm_err("VGIC: Failed to allocate vcpu memory\n");
- break;
- }
-
- /*
- * Enable and configure all SGIs to be edge-triggere and
- * configure all PPIs as level-triggered.
- */
- for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
- if (i < VGIC_NR_SGIS) {
- /* SGIs */
- vgic_bitmap_set_irq_val(&dist->irq_enabled,
- vcpu->vcpu_id, i, 1);
- vgic_bitmap_set_irq_val(&dist->irq_cfg,
- vcpu->vcpu_id, i,
- VGIC_CFG_EDGE);
- } else if (i < VGIC_NR_PRIVATE_IRQS) {
- /* PPIs */
- vgic_bitmap_set_irq_val(&dist->irq_cfg,
- vcpu->vcpu_id, i,
- VGIC_CFG_LEVEL);
- }
- }
-
- vgic_enable(vcpu);
- }
-
-out:
- if (ret)
- kvm_vgic_destroy(kvm);
-
- return ret;
-}
-
-static int init_vgic_model(struct kvm *kvm, int type)
-{
- switch (type) {
- case KVM_DEV_TYPE_ARM_VGIC_V2:
- vgic_v2_init_emulation(kvm);
- break;
-#ifdef CONFIG_KVM_ARM_VGIC_V3
- case KVM_DEV_TYPE_ARM_VGIC_V3:
- vgic_v3_init_emulation(kvm);
- break;
-#endif
- default:
- return -ENODEV;
- }
-
- if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus)
- return -E2BIG;
-
- return 0;
-}
-
-/**
- * kvm_vgic_early_init - Earliest possible vgic initialization stage
- *
- * No memory allocation should be performed here, only static init.
- */
-void kvm_vgic_early_init(struct kvm *kvm)
-{
- spin_lock_init(&kvm->arch.vgic.lock);
- spin_lock_init(&kvm->arch.vgic.irq_phys_map_lock);
- INIT_LIST_HEAD(&kvm->arch.vgic.irq_phys_map_list);
-}
-
-int kvm_vgic_create(struct kvm *kvm, u32 type)
-{
- int i, vcpu_lock_idx = -1, ret;
- struct kvm_vcpu *vcpu;
-
- mutex_lock(&kvm->lock);
-
- if (irqchip_in_kernel(kvm)) {
- ret = -EEXIST;
- goto out;
- }
-
- /*
- * This function is also called by the KVM_CREATE_IRQCHIP handler,
- * which had no chance yet to check the availability of the GICv2
- * emulation. So check this here again. KVM_CREATE_DEVICE does
- * the proper checks already.
- */
- if (type == KVM_DEV_TYPE_ARM_VGIC_V2 && !vgic->can_emulate_gicv2) {
- ret = -ENODEV;
- goto out;
- }
-
- /*
- * Any time a vcpu is run, vcpu_load is called which tries to grab the
- * vcpu->mutex. By grabbing the vcpu->mutex of all VCPUs we ensure
- * that no other VCPUs are run while we create the vgic.
- */
- ret = -EBUSY;
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (!mutex_trylock(&vcpu->mutex))
- goto out_unlock;
- vcpu_lock_idx = i;
- }
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (vcpu->arch.has_run_once)
- goto out_unlock;
- }
- ret = 0;
-
- ret = init_vgic_model(kvm, type);
- if (ret)
- goto out_unlock;
-
- kvm->arch.vgic.in_kernel = true;
- kvm->arch.vgic.vgic_model = type;
- kvm->arch.vgic.vctrl_base = vgic->vctrl_base;
- kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
- kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
- kvm->arch.vgic.vgic_redist_base = VGIC_ADDR_UNDEF;
-
-out_unlock:
- for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
- vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
- mutex_unlock(&vcpu->mutex);
- }
-
-out:
- mutex_unlock(&kvm->lock);
- return ret;
-}
-
-static int vgic_ioaddr_overlap(struct kvm *kvm)
-{
- phys_addr_t dist = kvm->arch.vgic.vgic_dist_base;
- phys_addr_t cpu = kvm->arch.vgic.vgic_cpu_base;
-
- if (IS_VGIC_ADDR_UNDEF(dist) || IS_VGIC_ADDR_UNDEF(cpu))
- return 0;
- if ((dist <= cpu && dist + KVM_VGIC_V2_DIST_SIZE > cpu) ||
- (cpu <= dist && cpu + KVM_VGIC_V2_CPU_SIZE > dist))
- return -EBUSY;
- return 0;
-}
-
-static int vgic_ioaddr_assign(struct kvm *kvm, phys_addr_t *ioaddr,
- phys_addr_t addr, phys_addr_t size)
-{
- int ret;
-
- if (addr & ~KVM_PHYS_MASK)
- return -E2BIG;
-
- if (addr & (SZ_4K - 1))
- return -EINVAL;
-
- if (!IS_VGIC_ADDR_UNDEF(*ioaddr))
- return -EEXIST;
- if (addr + size < addr)
- return -EINVAL;
-
- *ioaddr = addr;
- ret = vgic_ioaddr_overlap(kvm);
- if (ret)
- *ioaddr = VGIC_ADDR_UNDEF;
-
- return ret;
-}
-
-/**
- * kvm_vgic_addr - set or get vgic VM base addresses
- * @kvm: pointer to the vm struct
- * @type: the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX
- * @addr: pointer to address value
- * @write: if true set the address in the VM address space, if false read the
- * address
- *
- * Set or get the vgic base addresses for the distributor and the virtual CPU
- * interface in the VM physical address space. These addresses are properties
- * of the emulated core/SoC and therefore user space initially knows this
- * information.
- */
-int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
-{
- int r = 0;
- struct vgic_dist *vgic = &kvm->arch.vgic;
- int type_needed;
- phys_addr_t *addr_ptr, block_size;
- phys_addr_t alignment;
-
- mutex_lock(&kvm->lock);
- switch (type) {
- case KVM_VGIC_V2_ADDR_TYPE_DIST:
- type_needed = KVM_DEV_TYPE_ARM_VGIC_V2;
- addr_ptr = &vgic->vgic_dist_base;
- block_size = KVM_VGIC_V2_DIST_SIZE;
- alignment = SZ_4K;
- break;
- case KVM_VGIC_V2_ADDR_TYPE_CPU:
- type_needed = KVM_DEV_TYPE_ARM_VGIC_V2;
- addr_ptr = &vgic->vgic_cpu_base;
- block_size = KVM_VGIC_V2_CPU_SIZE;
- alignment = SZ_4K;
- break;
-#ifdef CONFIG_KVM_ARM_VGIC_V3
- case KVM_VGIC_V3_ADDR_TYPE_DIST:
- type_needed = KVM_DEV_TYPE_ARM_VGIC_V3;
- addr_ptr = &vgic->vgic_dist_base;
- block_size = KVM_VGIC_V3_DIST_SIZE;
- alignment = SZ_64K;
- break;
- case KVM_VGIC_V3_ADDR_TYPE_REDIST:
- type_needed = KVM_DEV_TYPE_ARM_VGIC_V3;
- addr_ptr = &vgic->vgic_redist_base;
- block_size = KVM_VGIC_V3_REDIST_SIZE;
- alignment = SZ_64K;
- break;
-#endif
- default:
- r = -ENODEV;
- goto out;
- }
-
- if (vgic->vgic_model != type_needed) {
- r = -ENODEV;
- goto out;
- }
-
- if (write) {
- if (!IS_ALIGNED(*addr, alignment))
- r = -EINVAL;
- else
- r = vgic_ioaddr_assign(kvm, addr_ptr, *addr,
- block_size);
- } else {
- *addr = *addr_ptr;
- }
-
-out:
- mutex_unlock(&kvm->lock);
- return r;
-}
-
-int vgic_set_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
-{
- int r;
-
- switch (attr->group) {
- case KVM_DEV_ARM_VGIC_GRP_ADDR: {
- u64 __user *uaddr = (u64 __user *)(long)attr->addr;
- u64 addr;
- unsigned long type = (unsigned long)attr->attr;
-
- if (copy_from_user(&addr, uaddr, sizeof(addr)))
- return -EFAULT;
-
- r = kvm_vgic_addr(dev->kvm, type, &addr, true);
- return (r == -ENODEV) ? -ENXIO : r;
- }
- case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
- u32 __user *uaddr = (u32 __user *)(long)attr->addr;
- u32 val;
- int ret = 0;
-
- if (get_user(val, uaddr))
- return -EFAULT;
-
- /*
- * We require:
- * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs
- * - at most 1024 interrupts
- * - a multiple of 32 interrupts
- */
- if (val < (VGIC_NR_PRIVATE_IRQS + 32) ||
- val > VGIC_MAX_IRQS ||
- (val & 31))
- return -EINVAL;
-
- mutex_lock(&dev->kvm->lock);
-
- if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_irqs)
- ret = -EBUSY;
- else
- dev->kvm->arch.vgic.nr_irqs = val;
-
- mutex_unlock(&dev->kvm->lock);
-
- return ret;
- }
- case KVM_DEV_ARM_VGIC_GRP_CTRL: {
- switch (attr->attr) {
- case KVM_DEV_ARM_VGIC_CTRL_INIT:
- r = vgic_init(dev->kvm);
- return r;
- }
- break;
- }
- }
-
- return -ENXIO;
-}
-
-int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
-{
- int r = -ENXIO;
-
- switch (attr->group) {
- case KVM_DEV_ARM_VGIC_GRP_ADDR: {
- u64 __user *uaddr = (u64 __user *)(long)attr->addr;
- u64 addr;
- unsigned long type = (unsigned long)attr->attr;
-
- r = kvm_vgic_addr(dev->kvm, type, &addr, false);
- if (r)
- return (r == -ENODEV) ? -ENXIO : r;
-
- if (copy_to_user(uaddr, &addr, sizeof(addr)))
- return -EFAULT;
- break;
- }
- case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
- u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-
- r = put_user(dev->kvm->arch.vgic.nr_irqs, uaddr);
- break;
- }
-
- }
-
- return r;
-}
-
-int vgic_has_attr_regs(const struct vgic_io_range *ranges, phys_addr_t offset)
-{
- if (vgic_find_range(ranges, 4, offset))
- return 0;
- else
- return -ENXIO;
-}
-
-static int vgic_starting_cpu(unsigned int cpu)
-{
- enable_percpu_irq(vgic->maint_irq, 0);
- return 0;
-}
-
-static int vgic_dying_cpu(unsigned int cpu)
-{
- disable_percpu_irq(vgic->maint_irq);
- return 0;
-}
-
-static int kvm_vgic_probe(void)
-{
- const struct gic_kvm_info *gic_kvm_info;
- int ret;
-
- gic_kvm_info = gic_get_kvm_info();
- if (!gic_kvm_info)
- return -ENODEV;
-
- switch (gic_kvm_info->type) {
- case GIC_V2:
- ret = vgic_v2_probe(gic_kvm_info, &vgic_ops, &vgic);
- break;
- case GIC_V3:
- ret = vgic_v3_probe(gic_kvm_info, &vgic_ops, &vgic);
- break;
- default:
- ret = -ENODEV;
- }
-
- return ret;
-}
-
-int kvm_vgic_hyp_init(void)
-{
- int ret;
-
- ret = kvm_vgic_probe();
- if (ret) {
- kvm_err("error: KVM vGIC probing failed\n");
- return ret;
- }
-
- ret = request_percpu_irq(vgic->maint_irq, vgic_maintenance_handler,
- "vgic", kvm_get_running_vcpus());
- if (ret) {
- kvm_err("Cannot register interrupt %d\n", vgic->maint_irq);
- return ret;
- }
-
- cpuhp_setup_state(CPUHP_AP_KVM_ARM_VGIC_STARTING,
- "AP_KVM_ARM_VGIC_STARTING", vgic_starting_cpu,
- vgic_dying_cpu);
- return 0;
-}
-
-int kvm_irq_map_gsi(struct kvm *kvm,
- struct kvm_kernel_irq_routing_entry *entries,
- int gsi)
-{
- return 0;
-}
-
-int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin)
-{
- return pin;
-}
-
-int kvm_set_irq(struct kvm *kvm, int irq_source_id,
- u32 irq, int level, bool line_status)
-{
- unsigned int spi = irq + VGIC_NR_PRIVATE_IRQS;
-
- trace_kvm_set_irq(irq, level, irq_source_id);
-
- BUG_ON(!vgic_initialized(kvm));
-
- return kvm_vgic_inject_irq(kvm, 0, spi, level);
-}
-
-/* MSI not implemented yet */
-int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id,
- int level, bool line_status)
-{
- return 0;
-}
diff --git a/virt/kvm/arm/vgic.h b/virt/kvm/arm/vgic.h
deleted file mode 100644
index 0df74cbb6200..000000000000
--- a/virt/kvm/arm/vgic.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright (C) 2012-2014 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * Derived from virt/kvm/arm/vgic.c
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef __KVM_VGIC_H__
-#define __KVM_VGIC_H__
-
-#include <kvm/iodev.h>
-
-#define VGIC_ADDR_UNDEF (-1)
-#define IS_VGIC_ADDR_UNDEF(_x) ((_x) == VGIC_ADDR_UNDEF)
-
-#define PRODUCT_ID_KVM 0x4b /* ASCII code K */
-#define IMPLEMENTER_ARM 0x43b
-
-#define ACCESS_READ_VALUE (1 << 0)
-#define ACCESS_READ_RAZ (0 << 0)
-#define ACCESS_READ_MASK(x) ((x) & (1 << 0))
-#define ACCESS_WRITE_IGNORED (0 << 1)
-#define ACCESS_WRITE_SETBIT (1 << 1)
-#define ACCESS_WRITE_CLEARBIT (2 << 1)
-#define ACCESS_WRITE_VALUE (3 << 1)
-#define ACCESS_WRITE_MASK(x) ((x) & (3 << 1))
-
-#define VCPU_NOT_ALLOCATED ((u8)-1)
-
-unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x);
-
-void vgic_update_state(struct kvm *kvm);
-int vgic_init_common_maps(struct kvm *kvm);
-
-u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x, int cpuid, u32 offset);
-u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset);
-
-void vgic_dist_irq_set_pending(struct kvm_vcpu *vcpu, int irq);
-void vgic_dist_irq_clear_pending(struct kvm_vcpu *vcpu, int irq);
-void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq);
-void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid,
- int irq, int val);
-
-void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-
-bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq);
-void vgic_unqueue_irqs(struct kvm_vcpu *vcpu);
-
-struct kvm_exit_mmio {
- phys_addr_t phys_addr;
- void *data;
- u32 len;
- bool is_write;
- void *private;
-};
-
-void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg,
- phys_addr_t offset, int mode);
-bool handle_mmio_raz_wi(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
- phys_addr_t offset);
-
-static inline
-u32 mmio_data_read(struct kvm_exit_mmio *mmio, u32 mask)
-{
- return le32_to_cpu(*((u32 *)mmio->data)) & mask;
-}
-
-static inline
-void mmio_data_write(struct kvm_exit_mmio *mmio, u32 mask, u32 value)
-{
- *((u32 *)mmio->data) = cpu_to_le32(value) & mask;
-}
-
-struct vgic_io_range {
- phys_addr_t base;
- unsigned long len;
- int bits_per_irq;
- bool (*handle_mmio)(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
- phys_addr_t offset);
-};
-
-int vgic_register_kvm_io_dev(struct kvm *kvm, gpa_t base, int len,
- const struct vgic_io_range *ranges,
- int redist_id,
- struct vgic_io_device *iodev);
-
-static inline bool is_in_range(phys_addr_t addr, unsigned long len,
- phys_addr_t baseaddr, unsigned long size)
-{
- return (addr >= baseaddr) && (addr + len <= baseaddr + size);
-}
-
-const
-struct vgic_io_range *vgic_find_range(const struct vgic_io_range *ranges,
- int len, gpa_t offset);
-
-bool vgic_handle_enable_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
- phys_addr_t offset, int vcpu_id, int access);
-
-bool vgic_handle_set_pending_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
- phys_addr_t offset, int vcpu_id);
-
-bool vgic_handle_clear_pending_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
- phys_addr_t offset, int vcpu_id);
-
-bool vgic_handle_set_active_reg(struct kvm *kvm,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset, int vcpu_id);
-
-bool vgic_handle_clear_active_reg(struct kvm *kvm,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset, int vcpu_id);
-
-bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
- phys_addr_t offset);
-
-void vgic_kick_vcpus(struct kvm *kvm);
-
-int vgic_has_attr_regs(const struct vgic_io_range *ranges, phys_addr_t offset);
-int vgic_set_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr);
-int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr);
-
-int vgic_init(struct kvm *kvm);
-void vgic_v2_init_emulation(struct kvm *kvm);
-void vgic_v3_init_emulation(struct kvm *kvm);
-
-#endif
diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c
index 2c7f0d5a62ea..1e30ce08700d 100644
--- a/virt/kvm/arm/vgic/vgic-init.c
+++ b/virt/kvm/arm/vgic/vgic-init.c
@@ -157,6 +157,9 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0);
int i;
+ INIT_LIST_HEAD(&dist->lpi_list_head);
+ spin_lock_init(&dist->lpi_list_lock);
+
dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL);
if (!dist->spis)
return -ENOMEM;
@@ -177,6 +180,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
spin_lock_init(&irq->irq_lock);
irq->vcpu = NULL;
irq->target_vcpu = vcpu0;
+ kref_init(&irq->refcount);
if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2)
irq->targets = 0;
else
@@ -211,6 +215,7 @@ static void kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
irq->vcpu = NULL;
irq->target_vcpu = vcpu;
irq->targets = 1U << vcpu->vcpu_id;
+ kref_init(&irq->refcount);
if (vgic_irq_is_sgi(i)) {
/* SGIs */
irq->enabled = 1;
@@ -253,6 +258,9 @@ int vgic_init(struct kvm *kvm)
if (ret)
goto out;
+ if (vgic_has_its(kvm))
+ dist->msis_require_devid = true;
+
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_vgic_vcpu_init(vcpu);
@@ -271,7 +279,6 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm)
dist->initialized = false;
kfree(dist->spis);
- kfree(dist->redist_iodevs);
dist->nr_spis = 0;
mutex_unlock(&kvm->lock);
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
new file mode 100644
index 000000000000..07411cf967b9
--- /dev/null
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -0,0 +1,1500 @@
+/*
+ * GICv3 ITS emulation
+ *
+ * Copyright (C) 2015,2016 ARM Ltd.
+ * Author: Andre Przywara <andre.przywara@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/cpu.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/uaccess.h>
+
+#include <linux/irqchip/arm-gic-v3.h>
+
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_mmu.h>
+
+#include "vgic.h"
+#include "vgic-mmio.h"
+
+/*
+ * Creates a new (reference to a) struct vgic_irq for a given LPI.
+ * If this LPI is already mapped on another ITS, we increase its refcount
+ * and return a pointer to the existing structure.
+ * If this is a "new" LPI, we allocate and initialize a new struct vgic_irq.
+ * This function returns a pointer to the _unlocked_ structure.
+ */
+static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intid), *oldirq;
+
+ /* In this case there is no put, since we keep the reference. */
+ if (irq)
+ return irq;
+
+ irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL);
+ if (!irq)
+ return NULL;
+
+ INIT_LIST_HEAD(&irq->lpi_list);
+ INIT_LIST_HEAD(&irq->ap_list);
+ spin_lock_init(&irq->irq_lock);
+
+ irq->config = VGIC_CONFIG_EDGE;
+ kref_init(&irq->refcount);
+ irq->intid = intid;
+
+ spin_lock(&dist->lpi_list_lock);
+
+ /*
+ * There could be a race with another vgic_add_lpi(), so we need to
+ * check that we don't add a second list entry with the same LPI.
+ */
+ list_for_each_entry(oldirq, &dist->lpi_list_head, lpi_list) {
+ if (oldirq->intid != intid)
+ continue;
+
+ /* Someone was faster with adding this LPI, lets use that. */
+ kfree(irq);
+ irq = oldirq;
+
+ /*
+ * This increases the refcount, the caller is expected to
+ * call vgic_put_irq() on the returned pointer once it's
+ * finished with the IRQ.
+ */
+ vgic_get_irq_kref(irq);
+
+ goto out_unlock;
+ }
+
+ list_add_tail(&irq->lpi_list, &dist->lpi_list_head);
+ dist->lpi_list_count++;
+
+out_unlock:
+ spin_unlock(&dist->lpi_list_lock);
+
+ return irq;
+}
+
+struct its_device {
+ struct list_head dev_list;
+
+ /* the head for the list of ITTEs */
+ struct list_head itt_head;
+ u32 device_id;
+};
+
+#define COLLECTION_NOT_MAPPED ((u32)~0)
+
+struct its_collection {
+ struct list_head coll_list;
+
+ u32 collection_id;
+ u32 target_addr;
+};
+
+#define its_is_collection_mapped(coll) ((coll) && \
+ ((coll)->target_addr != COLLECTION_NOT_MAPPED))
+
+struct its_itte {
+ struct list_head itte_list;
+
+ struct vgic_irq *irq;
+ struct its_collection *collection;
+ u32 lpi;
+ u32 event_id;
+};
+
+/*
+ * Find and returns a device in the device table for an ITS.
+ * Must be called with the its_lock mutex held.
+ */
+static struct its_device *find_its_device(struct vgic_its *its, u32 device_id)
+{
+ struct its_device *device;
+
+ list_for_each_entry(device, &its->device_list, dev_list)
+ if (device_id == device->device_id)
+ return device;
+
+ return NULL;
+}
+
+/*
+ * Find and returns an interrupt translation table entry (ITTE) for a given
+ * Device ID/Event ID pair on an ITS.
+ * Must be called with the its_lock mutex held.
+ */
+static struct its_itte *find_itte(struct vgic_its *its, u32 device_id,
+ u32 event_id)
+{
+ struct its_device *device;
+ struct its_itte *itte;
+
+ device = find_its_device(its, device_id);
+ if (device == NULL)
+ return NULL;
+
+ list_for_each_entry(itte, &device->itt_head, itte_list)
+ if (itte->event_id == event_id)
+ return itte;
+
+ return NULL;
+}
+
+/* To be used as an iterator this macro misses the enclosing parentheses */
+#define for_each_lpi_its(dev, itte, its) \
+ list_for_each_entry(dev, &(its)->device_list, dev_list) \
+ list_for_each_entry(itte, &(dev)->itt_head, itte_list)
+
+/*
+ * We only implement 48 bits of PA at the moment, although the ITS
+ * supports more. Let's be restrictive here.
+ */
+#define BASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 16))
+#define CBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 12))
+#define PENDBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 16))
+#define PROPBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 12))
+
+#define GIC_LPI_OFFSET 8192
+
+/*
+ * Finds and returns a collection in the ITS collection table.
+ * Must be called with the its_lock mutex held.
+ */
+static struct its_collection *find_collection(struct vgic_its *its, int coll_id)
+{
+ struct its_collection *collection;
+
+ list_for_each_entry(collection, &its->collection_list, coll_list) {
+ if (coll_id == collection->collection_id)
+ return collection;
+ }
+
+ return NULL;
+}
+
+#define LPI_PROP_ENABLE_BIT(p) ((p) & LPI_PROP_ENABLED)
+#define LPI_PROP_PRIORITY(p) ((p) & 0xfc)
+
+/*
+ * Reads the configuration data for a given LPI from guest memory and
+ * updates the fields in struct vgic_irq.
+ * If filter_vcpu is not NULL, applies only if the IRQ is targeting this
+ * VCPU. Unconditionally applies if filter_vcpu is NULL.
+ */
+static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
+ struct kvm_vcpu *filter_vcpu)
+{
+ u64 propbase = PROPBASER_ADDRESS(kvm->arch.vgic.propbaser);
+ u8 prop;
+ int ret;
+
+ ret = kvm_read_guest(kvm, propbase + irq->intid - GIC_LPI_OFFSET,
+ &prop, 1);
+
+ if (ret)
+ return ret;
+
+ spin_lock(&irq->irq_lock);
+
+ if (!filter_vcpu || filter_vcpu == irq->target_vcpu) {
+ irq->priority = LPI_PROP_PRIORITY(prop);
+ irq->enabled = LPI_PROP_ENABLE_BIT(prop);
+
+ vgic_queue_irq_unlock(kvm, irq);
+ } else {
+ spin_unlock(&irq->irq_lock);
+ }
+
+ return 0;
+}
+
+/*
+ * Create a snapshot of the current LPI list, so that we can enumerate all
+ * LPIs without holding any lock.
+ * Returns the array length and puts the kmalloc'ed array into intid_ptr.
+ */
+static int vgic_copy_lpi_list(struct kvm *kvm, u32 **intid_ptr)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ struct vgic_irq *irq;
+ u32 *intids;
+ int irq_count = dist->lpi_list_count, i = 0;
+
+ /*
+ * We use the current value of the list length, which may change
+ * after the kmalloc. We don't care, because the guest shouldn't
+ * change anything while the command handling is still running,
+ * and in the worst case we would miss a new IRQ, which one wouldn't
+ * expect to be covered by this command anyway.
+ */
+ intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL);
+ if (!intids)
+ return -ENOMEM;
+
+ spin_lock(&dist->lpi_list_lock);
+ list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+ /* We don't need to "get" the IRQ, as we hold the list lock. */
+ intids[i] = irq->intid;
+ if (++i == irq_count)
+ break;
+ }
+ spin_unlock(&dist->lpi_list_lock);
+
+ *intid_ptr = intids;
+ return irq_count;
+}
+
+/*
+ * Promotes the ITS view of affinity of an ITTE (which redistributor this LPI
+ * is targeting) to the VGIC's view, which deals with target VCPUs.
+ * Needs to be called whenever either the collection for a LPIs has
+ * changed or the collection itself got retargeted.
+ */
+static void update_affinity_itte(struct kvm *kvm, struct its_itte *itte)
+{
+ struct kvm_vcpu *vcpu;
+
+ if (!its_is_collection_mapped(itte->collection))
+ return;
+
+ vcpu = kvm_get_vcpu(kvm, itte->collection->target_addr);
+
+ spin_lock(&itte->irq->irq_lock);
+ itte->irq->target_vcpu = vcpu;
+ spin_unlock(&itte->irq->irq_lock);
+}
+
+/*
+ * Updates the target VCPU for every LPI targeting this collection.
+ * Must be called with the its_lock mutex held.
+ */
+static void update_affinity_collection(struct kvm *kvm, struct vgic_its *its,
+ struct its_collection *coll)
+{
+ struct its_device *device;
+ struct its_itte *itte;
+
+ for_each_lpi_its(device, itte, its) {
+ if (!itte->collection || coll != itte->collection)
+ continue;
+
+ update_affinity_itte(kvm, itte);
+ }
+}
+
+static u32 max_lpis_propbaser(u64 propbaser)
+{
+ int nr_idbits = (propbaser & 0x1f) + 1;
+
+ return 1U << min(nr_idbits, INTERRUPT_ID_BITS_ITS);
+}
+
+/*
+ * Scan the whole LPI pending table and sync the pending bit in there
+ * with our own data structures. This relies on the LPI being
+ * mapped before.
+ */
+static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu)
+{
+ gpa_t pendbase = PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);
+ struct vgic_irq *irq;
+ int last_byte_offset = -1;
+ int ret = 0;
+ u32 *intids;
+ int nr_irqs, i;
+
+ nr_irqs = vgic_copy_lpi_list(vcpu->kvm, &intids);
+ if (nr_irqs < 0)
+ return nr_irqs;
+
+ for (i = 0; i < nr_irqs; i++) {
+ int byte_offset, bit_nr;
+ u8 pendmask;
+
+ byte_offset = intids[i] / BITS_PER_BYTE;
+ bit_nr = intids[i] % BITS_PER_BYTE;
+
+ /*
+ * For contiguously allocated LPIs chances are we just read
+ * this very same byte in the last iteration. Reuse that.
+ */
+ if (byte_offset != last_byte_offset) {
+ ret = kvm_read_guest(vcpu->kvm, pendbase + byte_offset,
+ &pendmask, 1);
+ if (ret) {
+ kfree(intids);
+ return ret;
+ }
+ last_byte_offset = byte_offset;
+ }
+
+ irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]);
+ spin_lock(&irq->irq_lock);
+ irq->pending = pendmask & (1U << bit_nr);
+ vgic_queue_irq_unlock(vcpu->kvm, irq);
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ kfree(intids);
+
+ return ret;
+}
+
+static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len)
+{
+ u32 reg = 0;
+
+ mutex_lock(&its->cmd_lock);
+ if (its->creadr == its->cwriter)
+ reg |= GITS_CTLR_QUIESCENT;
+ if (its->enabled)
+ reg |= GITS_CTLR_ENABLE;
+ mutex_unlock(&its->cmd_lock);
+
+ return reg;
+}
+
+static void vgic_mmio_write_its_ctlr(struct kvm *kvm, struct vgic_its *its,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ its->enabled = !!(val & GITS_CTLR_ENABLE);
+}
+
+static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len)
+{
+ u64 reg = GITS_TYPER_PLPIS;
+
+ /*
+ * We use linear CPU numbers for redistributor addressing,
+ * so GITS_TYPER.PTA is 0.
+ * Also we force all PROPBASER registers to be the same, so
+ * CommonLPIAff is 0 as well.
+ * To avoid memory waste in the guest, we keep the number of IDBits and
+ * DevBits low - as least for the time being.
+ */
+ reg |= 0x0f << GITS_TYPER_DEVBITS_SHIFT;
+ reg |= 0x0f << GITS_TYPER_IDBITS_SHIFT;
+
+ return extract_bytes(reg, addr & 7, len);
+}
+
+static unsigned long vgic_mmio_read_its_iidr(struct kvm *kvm,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len)
+{
+ return (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
+}
+
+static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len)
+{
+ switch (addr & 0xffff) {
+ case GITS_PIDR0:
+ return 0x92; /* part number, bits[7:0] */
+ case GITS_PIDR1:
+ return 0xb4; /* part number, bits[11:8] */
+ case GITS_PIDR2:
+ return GIC_PIDR2_ARCH_GICv3 | 0x0b;
+ case GITS_PIDR4:
+ return 0x40; /* This is a 64K software visible page */
+ /* The following are the ID registers for (any) GIC. */
+ case GITS_CIDR0:
+ return 0x0d;
+ case GITS_CIDR1:
+ return 0xf0;
+ case GITS_CIDR2:
+ return 0x05;
+ case GITS_CIDR3:
+ return 0xb1;
+ }
+
+ return 0;
+}
+
+/*
+ * Find the target VCPU and the LPI number for a given devid/eventid pair
+ * and make this IRQ pending, possibly injecting it.
+ * Must be called with the its_lock mutex held.
+ */
+static void vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its,
+ u32 devid, u32 eventid)
+{
+ struct its_itte *itte;
+
+ if (!its->enabled)
+ return;
+
+ itte = find_itte(its, devid, eventid);
+ /* Triggering an unmapped IRQ gets silently dropped. */
+ if (itte && its_is_collection_mapped(itte->collection)) {
+ struct kvm_vcpu *vcpu;
+
+ vcpu = kvm_get_vcpu(kvm, itte->collection->target_addr);
+ if (vcpu && vcpu->arch.vgic_cpu.lpis_enabled) {
+ spin_lock(&itte->irq->irq_lock);
+ itte->irq->pending = true;
+ vgic_queue_irq_unlock(kvm, itte->irq);
+ }
+ }
+}
+
+/*
+ * Queries the KVM IO bus framework to get the ITS pointer from the given
+ * doorbell address.
+ * We then call vgic_its_trigger_msi() with the decoded data.
+ */
+int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+ u64 address;
+ struct kvm_io_device *kvm_io_dev;
+ struct vgic_io_device *iodev;
+
+ if (!vgic_has_its(kvm))
+ return -ENODEV;
+
+ if (!(msi->flags & KVM_MSI_VALID_DEVID))
+ return -EINVAL;
+
+ address = (u64)msi->address_hi << 32 | msi->address_lo;
+
+ kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, address);
+ if (!kvm_io_dev)
+ return -ENODEV;
+
+ iodev = container_of(kvm_io_dev, struct vgic_io_device, dev);
+
+ mutex_lock(&iodev->its->its_lock);
+ vgic_its_trigger_msi(kvm, iodev->its, msi->devid, msi->data);
+ mutex_unlock(&iodev->its->its_lock);
+
+ return 0;
+}
+
+/* Requires the its_lock to be held. */
+static void its_free_itte(struct kvm *kvm, struct its_itte *itte)
+{
+ list_del(&itte->itte_list);
+
+ /* This put matches the get in vgic_add_lpi. */
+ vgic_put_irq(kvm, itte->irq);
+
+ kfree(itte);
+}
+
+static u64 its_cmd_mask_field(u64 *its_cmd, int word, int shift, int size)
+{
+ return (le64_to_cpu(its_cmd[word]) >> shift) & (BIT_ULL(size) - 1);
+}
+
+#define its_cmd_get_command(cmd) its_cmd_mask_field(cmd, 0, 0, 8)
+#define its_cmd_get_deviceid(cmd) its_cmd_mask_field(cmd, 0, 32, 32)
+#define its_cmd_get_id(cmd) its_cmd_mask_field(cmd, 1, 0, 32)
+#define its_cmd_get_physical_id(cmd) its_cmd_mask_field(cmd, 1, 32, 32)
+#define its_cmd_get_collection(cmd) its_cmd_mask_field(cmd, 2, 0, 16)
+#define its_cmd_get_target_addr(cmd) its_cmd_mask_field(cmd, 2, 16, 32)
+#define its_cmd_get_validbit(cmd) its_cmd_mask_field(cmd, 2, 63, 1)
+
+/*
+ * The DISCARD command frees an Interrupt Translation Table Entry (ITTE).
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u32 device_id = its_cmd_get_deviceid(its_cmd);
+ u32 event_id = its_cmd_get_id(its_cmd);
+ struct its_itte *itte;
+
+
+ itte = find_itte(its, device_id, event_id);
+ if (itte && itte->collection) {
+ /*
+ * Though the spec talks about removing the pending state, we
+ * don't bother here since we clear the ITTE anyway and the
+ * pending state is a property of the ITTE struct.
+ */
+ its_free_itte(kvm, itte);
+ return 0;
+ }
+
+ return E_ITS_DISCARD_UNMAPPED_INTERRUPT;
+}
+
+/*
+ * The MOVI command moves an ITTE to a different collection.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u32 device_id = its_cmd_get_deviceid(its_cmd);
+ u32 event_id = its_cmd_get_id(its_cmd);
+ u32 coll_id = its_cmd_get_collection(its_cmd);
+ struct kvm_vcpu *vcpu;
+ struct its_itte *itte;
+ struct its_collection *collection;
+
+ itte = find_itte(its, device_id, event_id);
+ if (!itte)
+ return E_ITS_MOVI_UNMAPPED_INTERRUPT;
+
+ if (!its_is_collection_mapped(itte->collection))
+ return E_ITS_MOVI_UNMAPPED_COLLECTION;
+
+ collection = find_collection(its, coll_id);
+ if (!its_is_collection_mapped(collection))
+ return E_ITS_MOVI_UNMAPPED_COLLECTION;
+
+ itte->collection = collection;
+ vcpu = kvm_get_vcpu(kvm, collection->target_addr);
+
+ spin_lock(&itte->irq->irq_lock);
+ itte->irq->target_vcpu = vcpu;
+ spin_unlock(&itte->irq->irq_lock);
+
+ return 0;
+}
+
+/*
+ * Check whether an ID can be stored into the corresponding guest table.
+ * For a direct table this is pretty easy, but gets a bit nasty for
+ * indirect tables. We check whether the resulting guest physical address
+ * is actually valid (covered by a memslot and guest accessbible).
+ * For this we have to read the respective first level entry.
+ */
+static bool vgic_its_check_id(struct vgic_its *its, u64 baser, int id)
+{
+ int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
+ int index;
+ u64 indirect_ptr;
+ gfn_t gfn;
+
+ if (!(baser & GITS_BASER_INDIRECT)) {
+ phys_addr_t addr;
+
+ if (id >= (l1_tbl_size / GITS_BASER_ENTRY_SIZE(baser)))
+ return false;
+
+ addr = BASER_ADDRESS(baser) + id * GITS_BASER_ENTRY_SIZE(baser);
+ gfn = addr >> PAGE_SHIFT;
+
+ return kvm_is_visible_gfn(its->dev->kvm, gfn);
+ }
+
+ /* calculate and check the index into the 1st level */
+ index = id / (SZ_64K / GITS_BASER_ENTRY_SIZE(baser));
+ if (index >= (l1_tbl_size / sizeof(u64)))
+ return false;
+
+ /* Each 1st level entry is represented by a 64-bit value. */
+ if (kvm_read_guest(its->dev->kvm,
+ BASER_ADDRESS(baser) + index * sizeof(indirect_ptr),
+ &indirect_ptr, sizeof(indirect_ptr)))
+ return false;
+
+ indirect_ptr = le64_to_cpu(indirect_ptr);
+
+ /* check the valid bit of the first level entry */
+ if (!(indirect_ptr & BIT_ULL(63)))
+ return false;
+
+ /*
+ * Mask the guest physical address and calculate the frame number.
+ * Any address beyond our supported 48 bits of PA will be caught
+ * by the actual check in the final step.
+ */
+ indirect_ptr &= GENMASK_ULL(51, 16);
+
+ /* Find the address of the actual entry */
+ index = id % (SZ_64K / GITS_BASER_ENTRY_SIZE(baser));
+ indirect_ptr += index * GITS_BASER_ENTRY_SIZE(baser);
+ gfn = indirect_ptr >> PAGE_SHIFT;
+
+ return kvm_is_visible_gfn(its->dev->kvm, gfn);
+}
+
+static int vgic_its_alloc_collection(struct vgic_its *its,
+ struct its_collection **colp,
+ u32 coll_id)
+{
+ struct its_collection *collection;
+
+ if (!vgic_its_check_id(its, its->baser_coll_table, coll_id))
+ return E_ITS_MAPC_COLLECTION_OOR;
+
+ collection = kzalloc(sizeof(*collection), GFP_KERNEL);
+
+ collection->collection_id = coll_id;
+ collection->target_addr = COLLECTION_NOT_MAPPED;
+
+ list_add_tail(&collection->coll_list, &its->collection_list);
+ *colp = collection;
+
+ return 0;
+}
+
+static void vgic_its_free_collection(struct vgic_its *its, u32 coll_id)
+{
+ struct its_collection *collection;
+ struct its_device *device;
+ struct its_itte *itte;
+
+ /*
+ * Clearing the mapping for that collection ID removes the
+ * entry from the list. If there wasn't any before, we can
+ * go home early.
+ */
+ collection = find_collection(its, coll_id);
+ if (!collection)
+ return;
+
+ for_each_lpi_its(device, itte, its)
+ if (itte->collection &&
+ itte->collection->collection_id == coll_id)
+ itte->collection = NULL;
+
+ list_del(&collection->coll_list);
+ kfree(collection);
+}
+
+/*
+ * The MAPTI and MAPI commands map LPIs to ITTEs.
+ * Must be called with its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u32 device_id = its_cmd_get_deviceid(its_cmd);
+ u32 event_id = its_cmd_get_id(its_cmd);
+ u32 coll_id = its_cmd_get_collection(its_cmd);
+ struct its_itte *itte;
+ struct its_device *device;
+ struct its_collection *collection, *new_coll = NULL;
+ int lpi_nr;
+
+ device = find_its_device(its, device_id);
+ if (!device)
+ return E_ITS_MAPTI_UNMAPPED_DEVICE;
+
+ if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI)
+ lpi_nr = its_cmd_get_physical_id(its_cmd);
+ else
+ lpi_nr = event_id;
+ if (lpi_nr < GIC_LPI_OFFSET ||
+ lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser))
+ return E_ITS_MAPTI_PHYSICALID_OOR;
+
+ collection = find_collection(its, coll_id);
+ if (!collection) {
+ int ret = vgic_its_alloc_collection(its, &collection, coll_id);
+ if (ret)
+ return ret;
+ new_coll = collection;
+ }
+
+ itte = find_itte(its, device_id, event_id);
+ if (!itte) {
+ itte = kzalloc(sizeof(struct its_itte), GFP_KERNEL);
+ if (!itte) {
+ if (new_coll)
+ vgic_its_free_collection(its, coll_id);
+ return -ENOMEM;
+ }
+
+ itte->event_id = event_id;
+ list_add_tail(&itte->itte_list, &device->itt_head);
+ }
+
+ itte->collection = collection;
+ itte->lpi = lpi_nr;
+ itte->irq = vgic_add_lpi(kvm, lpi_nr);
+ update_affinity_itte(kvm, itte);
+
+ /*
+ * We "cache" the configuration table entries in out struct vgic_irq's.
+ * However we only have those structs for mapped IRQs, so we read in
+ * the respective config data from memory here upon mapping the LPI.
+ */
+ update_lpi_config(kvm, itte->irq, NULL);
+
+ return 0;
+}
+
+/* Requires the its_lock to be held. */
+static void vgic_its_unmap_device(struct kvm *kvm, struct its_device *device)
+{
+ struct its_itte *itte, *temp;
+
+ /*
+ * The spec says that unmapping a device with still valid
+ * ITTEs associated is UNPREDICTABLE. We remove all ITTEs,
+ * since we cannot leave the memory unreferenced.
+ */
+ list_for_each_entry_safe(itte, temp, &device->itt_head, itte_list)
+ its_free_itte(kvm, itte);
+
+ list_del(&device->dev_list);
+ kfree(device);
+}
+
+/*
+ * MAPD maps or unmaps a device ID to Interrupt Translation Tables (ITTs).
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u32 device_id = its_cmd_get_deviceid(its_cmd);
+ bool valid = its_cmd_get_validbit(its_cmd);
+ struct its_device *device;
+
+ if (!vgic_its_check_id(its, its->baser_device_table, device_id))
+ return E_ITS_MAPD_DEVICE_OOR;
+
+ device = find_its_device(its, device_id);
+
+ /*
+ * The spec says that calling MAPD on an already mapped device
+ * invalidates all cached data for this device. We implement this
+ * by removing the mapping and re-establishing it.
+ */
+ if (device)
+ vgic_its_unmap_device(kvm, device);
+
+ /*
+ * The spec does not say whether unmapping a not-mapped device
+ * is an error, so we are done in any case.
+ */
+ if (!valid)
+ return 0;
+
+ device = kzalloc(sizeof(struct its_device), GFP_KERNEL);
+ if (!device)
+ return -ENOMEM;
+
+ device->device_id = device_id;
+ INIT_LIST_HEAD(&device->itt_head);
+
+ list_add_tail(&device->dev_list, &its->device_list);
+
+ return 0;
+}
+
+/*
+ * The MAPC command maps collection IDs to redistributors.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u16 coll_id;
+ u32 target_addr;
+ struct its_collection *collection;
+ bool valid;
+
+ valid = its_cmd_get_validbit(its_cmd);
+ coll_id = its_cmd_get_collection(its_cmd);
+ target_addr = its_cmd_get_target_addr(its_cmd);
+
+ if (target_addr >= atomic_read(&kvm->online_vcpus))
+ return E_ITS_MAPC_PROCNUM_OOR;
+
+ if (!valid) {
+ vgic_its_free_collection(its, coll_id);
+ } else {
+ collection = find_collection(its, coll_id);
+
+ if (!collection) {
+ int ret;
+
+ ret = vgic_its_alloc_collection(its, &collection,
+ coll_id);
+ if (ret)
+ return ret;
+ collection->target_addr = target_addr;
+ } else {
+ collection->target_addr = target_addr;
+ update_affinity_collection(kvm, its, collection);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * The CLEAR command removes the pending state for a particular LPI.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_clear(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u32 device_id = its_cmd_get_deviceid(its_cmd);
+ u32 event_id = its_cmd_get_id(its_cmd);
+ struct its_itte *itte;
+
+
+ itte = find_itte(its, device_id, event_id);
+ if (!itte)
+ return E_ITS_CLEAR_UNMAPPED_INTERRUPT;
+
+ itte->irq->pending = false;
+
+ return 0;
+}
+
+/*
+ * The INV command syncs the configuration bits from the memory table.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u32 device_id = its_cmd_get_deviceid(its_cmd);
+ u32 event_id = its_cmd_get_id(its_cmd);
+ struct its_itte *itte;
+
+
+ itte = find_itte(its, device_id, event_id);
+ if (!itte)
+ return E_ITS_INV_UNMAPPED_INTERRUPT;
+
+ return update_lpi_config(kvm, itte->irq, NULL);
+}
+
+/*
+ * The INVALL command requests flushing of all IRQ data in this collection.
+ * Find the VCPU mapped to that collection, then iterate over the VM's list
+ * of mapped LPIs and update the configuration for each IRQ which targets
+ * the specified vcpu. The configuration will be read from the in-memory
+ * configuration table.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u32 coll_id = its_cmd_get_collection(its_cmd);
+ struct its_collection *collection;
+ struct kvm_vcpu *vcpu;
+ struct vgic_irq *irq;
+ u32 *intids;
+ int irq_count, i;
+
+ collection = find_collection(its, coll_id);
+ if (!its_is_collection_mapped(collection))
+ return E_ITS_INVALL_UNMAPPED_COLLECTION;
+
+ vcpu = kvm_get_vcpu(kvm, collection->target_addr);
+
+ irq_count = vgic_copy_lpi_list(kvm, &intids);
+ if (irq_count < 0)
+ return irq_count;
+
+ for (i = 0; i < irq_count; i++) {
+ irq = vgic_get_irq(kvm, NULL, intids[i]);
+ if (!irq)
+ continue;
+ update_lpi_config(kvm, irq, vcpu);
+ vgic_put_irq(kvm, irq);
+ }
+
+ kfree(intids);
+
+ return 0;
+}
+
+/*
+ * The MOVALL command moves the pending state of all IRQs targeting one
+ * redistributor to another. We don't hold the pending state in the VCPUs,
+ * but in the IRQs instead, so there is really not much to do for us here.
+ * However the spec says that no IRQ must target the old redistributor
+ * afterwards, so we make sure that no LPI is using the associated target_vcpu.
+ * This command affects all LPIs in the system that target that redistributor.
+ */
+static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ u32 target1_addr = its_cmd_get_target_addr(its_cmd);
+ u32 target2_addr = its_cmd_mask_field(its_cmd, 3, 16, 32);
+ struct kvm_vcpu *vcpu1, *vcpu2;
+ struct vgic_irq *irq;
+
+ if (target1_addr >= atomic_read(&kvm->online_vcpus) ||
+ target2_addr >= atomic_read(&kvm->online_vcpus))
+ return E_ITS_MOVALL_PROCNUM_OOR;
+
+ if (target1_addr == target2_addr)
+ return 0;
+
+ vcpu1 = kvm_get_vcpu(kvm, target1_addr);
+ vcpu2 = kvm_get_vcpu(kvm, target2_addr);
+
+ spin_lock(&dist->lpi_list_lock);
+
+ list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+ spin_lock(&irq->irq_lock);
+
+ if (irq->target_vcpu == vcpu1)
+ irq->target_vcpu = vcpu2;
+
+ spin_unlock(&irq->irq_lock);
+ }
+
+ spin_unlock(&dist->lpi_list_lock);
+
+ return 0;
+}
+
+/*
+ * The INT command injects the LPI associated with that DevID/EvID pair.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_int(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u32 msi_data = its_cmd_get_id(its_cmd);
+ u64 msi_devid = its_cmd_get_deviceid(its_cmd);
+
+ vgic_its_trigger_msi(kvm, its, msi_devid, msi_data);
+
+ return 0;
+}
+
+/*
+ * This function is called with the its_cmd lock held, but the ITS data
+ * structure lock dropped.
+ */
+static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ int ret = -ENODEV;
+
+ mutex_lock(&its->its_lock);
+ switch (its_cmd_get_command(its_cmd)) {
+ case GITS_CMD_MAPD:
+ ret = vgic_its_cmd_handle_mapd(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_MAPC:
+ ret = vgic_its_cmd_handle_mapc(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_MAPI:
+ ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_MAPTI:
+ ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_MOVI:
+ ret = vgic_its_cmd_handle_movi(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_DISCARD:
+ ret = vgic_its_cmd_handle_discard(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_CLEAR:
+ ret = vgic_its_cmd_handle_clear(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_MOVALL:
+ ret = vgic_its_cmd_handle_movall(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_INT:
+ ret = vgic_its_cmd_handle_int(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_INV:
+ ret = vgic_its_cmd_handle_inv(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_INVALL:
+ ret = vgic_its_cmd_handle_invall(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_SYNC:
+ /* we ignore this command: we are in sync all of the time */
+ ret = 0;
+ break;
+ }
+ mutex_unlock(&its->its_lock);
+
+ return ret;
+}
+
+static u64 vgic_sanitise_its_baser(u64 reg)
+{
+ reg = vgic_sanitise_field(reg, GITS_BASER_SHAREABILITY_MASK,
+ GITS_BASER_SHAREABILITY_SHIFT,
+ vgic_sanitise_shareability);
+ reg = vgic_sanitise_field(reg, GITS_BASER_INNER_CACHEABILITY_MASK,
+ GITS_BASER_INNER_CACHEABILITY_SHIFT,
+ vgic_sanitise_inner_cacheability);
+ reg = vgic_sanitise_field(reg, GITS_BASER_OUTER_CACHEABILITY_MASK,
+ GITS_BASER_OUTER_CACHEABILITY_SHIFT,
+ vgic_sanitise_outer_cacheability);
+
+ /* Bits 15:12 contain bits 51:48 of the PA, which we don't support. */
+ reg &= ~GENMASK_ULL(15, 12);
+
+ /* We support only one (ITS) page size: 64K */
+ reg = (reg & ~GITS_BASER_PAGE_SIZE_MASK) | GITS_BASER_PAGE_SIZE_64K;
+
+ return reg;
+}
+
+static u64 vgic_sanitise_its_cbaser(u64 reg)
+{
+ reg = vgic_sanitise_field(reg, GITS_CBASER_SHAREABILITY_MASK,
+ GITS_CBASER_SHAREABILITY_SHIFT,
+ vgic_sanitise_shareability);
+ reg = vgic_sanitise_field(reg, GITS_CBASER_INNER_CACHEABILITY_MASK,
+ GITS_CBASER_INNER_CACHEABILITY_SHIFT,
+ vgic_sanitise_inner_cacheability);
+ reg = vgic_sanitise_field(reg, GITS_CBASER_OUTER_CACHEABILITY_MASK,
+ GITS_CBASER_OUTER_CACHEABILITY_SHIFT,
+ vgic_sanitise_outer_cacheability);
+
+ /*
+ * Sanitise the physical address to be 64k aligned.
+ * Also limit the physical addresses to 48 bits.
+ */
+ reg &= ~(GENMASK_ULL(51, 48) | GENMASK_ULL(15, 12));
+
+ return reg;
+}
+
+static unsigned long vgic_mmio_read_its_cbaser(struct kvm *kvm,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len)
+{
+ return extract_bytes(its->cbaser, addr & 7, len);
+}
+
+static void vgic_mmio_write_its_cbaser(struct kvm *kvm, struct vgic_its *its,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ /* When GITS_CTLR.Enable is 1, this register is RO. */
+ if (its->enabled)
+ return;
+
+ mutex_lock(&its->cmd_lock);
+ its->cbaser = update_64bit_reg(its->cbaser, addr & 7, len, val);
+ its->cbaser = vgic_sanitise_its_cbaser(its->cbaser);
+ its->creadr = 0;
+ /*
+ * CWRITER is architecturally UNKNOWN on reset, but we need to reset
+ * it to CREADR to make sure we start with an empty command buffer.
+ */
+ its->cwriter = its->creadr;
+ mutex_unlock(&its->cmd_lock);
+}
+
+#define ITS_CMD_BUFFER_SIZE(baser) ((((baser) & 0xff) + 1) << 12)
+#define ITS_CMD_SIZE 32
+#define ITS_CMD_OFFSET(reg) ((reg) & GENMASK(19, 5))
+
+/*
+ * By writing to CWRITER the guest announces new commands to be processed.
+ * To avoid any races in the first place, we take the its_cmd lock, which
+ * protects our ring buffer variables, so that there is only one user
+ * per ITS handling commands at a given time.
+ */
+static void vgic_mmio_write_its_cwriter(struct kvm *kvm, struct vgic_its *its,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ gpa_t cbaser;
+ u64 cmd_buf[4];
+ u32 reg;
+
+ if (!its)
+ return;
+
+ mutex_lock(&its->cmd_lock);
+
+ reg = update_64bit_reg(its->cwriter, addr & 7, len, val);
+ reg = ITS_CMD_OFFSET(reg);
+ if (reg >= ITS_CMD_BUFFER_SIZE(its->cbaser)) {
+ mutex_unlock(&its->cmd_lock);
+ return;
+ }
+
+ its->cwriter = reg;
+ cbaser = CBASER_ADDRESS(its->cbaser);
+
+ while (its->cwriter != its->creadr) {
+ int ret = kvm_read_guest(kvm, cbaser + its->creadr,
+ cmd_buf, ITS_CMD_SIZE);
+ /*
+ * If kvm_read_guest() fails, this could be due to the guest
+ * programming a bogus value in CBASER or something else going
+ * wrong from which we cannot easily recover.
+ * According to section 6.3.2 in the GICv3 spec we can just
+ * ignore that command then.
+ */
+ if (!ret)
+ vgic_its_handle_command(kvm, its, cmd_buf);
+
+ its->creadr += ITS_CMD_SIZE;
+ if (its->creadr == ITS_CMD_BUFFER_SIZE(its->cbaser))
+ its->creadr = 0;
+ }
+
+ mutex_unlock(&its->cmd_lock);
+}
+
+static unsigned long vgic_mmio_read_its_cwriter(struct kvm *kvm,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len)
+{
+ return extract_bytes(its->cwriter, addr & 0x7, len);
+}
+
+static unsigned long vgic_mmio_read_its_creadr(struct kvm *kvm,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len)
+{
+ return extract_bytes(its->creadr, addr & 0x7, len);
+}
+
+#define BASER_INDEX(addr) (((addr) / sizeof(u64)) & 0x7)
+static unsigned long vgic_mmio_read_its_baser(struct kvm *kvm,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len)
+{
+ u64 reg;
+
+ switch (BASER_INDEX(addr)) {
+ case 0:
+ reg = its->baser_device_table;
+ break;
+ case 1:
+ reg = its->baser_coll_table;
+ break;
+ default:
+ reg = 0;
+ break;
+ }
+
+ return extract_bytes(reg, addr & 7, len);
+}
+
+#define GITS_BASER_RO_MASK (GENMASK_ULL(52, 48) | GENMASK_ULL(58, 56))
+static void vgic_mmio_write_its_baser(struct kvm *kvm,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ u64 entry_size, device_type;
+ u64 reg, *regptr, clearbits = 0;
+
+ /* When GITS_CTLR.Enable is 1, we ignore write accesses. */
+ if (its->enabled)
+ return;
+
+ switch (BASER_INDEX(addr)) {
+ case 0:
+ regptr = &its->baser_device_table;
+ entry_size = 8;
+ device_type = GITS_BASER_TYPE_DEVICE;
+ break;
+ case 1:
+ regptr = &its->baser_coll_table;
+ entry_size = 8;
+ device_type = GITS_BASER_TYPE_COLLECTION;
+ clearbits = GITS_BASER_INDIRECT;
+ break;
+ default:
+ return;
+ }
+
+ reg = update_64bit_reg(*regptr, addr & 7, len, val);
+ reg &= ~GITS_BASER_RO_MASK;
+ reg &= ~clearbits;
+
+ reg |= (entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT;
+ reg |= device_type << GITS_BASER_TYPE_SHIFT;
+ reg = vgic_sanitise_its_baser(reg);
+
+ *regptr = reg;
+}
+
+#define REGISTER_ITS_DESC(off, rd, wr, length, acc) \
+{ \
+ .reg_offset = off, \
+ .len = length, \
+ .access_flags = acc, \
+ .its_read = rd, \
+ .its_write = wr, \
+}
+
+static void its_mmio_write_wi(struct kvm *kvm, struct vgic_its *its,
+ gpa_t addr, unsigned int len, unsigned long val)
+{
+ /* Ignore */
+}
+
+static struct vgic_register_region its_registers[] = {
+ REGISTER_ITS_DESC(GITS_CTLR,
+ vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr, 4,
+ VGIC_ACCESS_32bit),
+ REGISTER_ITS_DESC(GITS_IIDR,
+ vgic_mmio_read_its_iidr, its_mmio_write_wi, 4,
+ VGIC_ACCESS_32bit),
+ REGISTER_ITS_DESC(GITS_TYPER,
+ vgic_mmio_read_its_typer, its_mmio_write_wi, 8,
+ VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+ REGISTER_ITS_DESC(GITS_CBASER,
+ vgic_mmio_read_its_cbaser, vgic_mmio_write_its_cbaser, 8,
+ VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+ REGISTER_ITS_DESC(GITS_CWRITER,
+ vgic_mmio_read_its_cwriter, vgic_mmio_write_its_cwriter, 8,
+ VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+ REGISTER_ITS_DESC(GITS_CREADR,
+ vgic_mmio_read_its_creadr, its_mmio_write_wi, 8,
+ VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+ REGISTER_ITS_DESC(GITS_BASER,
+ vgic_mmio_read_its_baser, vgic_mmio_write_its_baser, 0x40,
+ VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+ REGISTER_ITS_DESC(GITS_IDREGS_BASE,
+ vgic_mmio_read_its_idregs, its_mmio_write_wi, 0x30,
+ VGIC_ACCESS_32bit),
+};
+
+/* This is called on setting the LPI enable bit in the redistributor. */
+void vgic_enable_lpis(struct kvm_vcpu *vcpu)
+{
+ if (!(vcpu->arch.vgic_cpu.pendbaser & GICR_PENDBASER_PTZ))
+ its_sync_lpi_pending_table(vcpu);
+}
+
+static int vgic_its_init_its(struct kvm *kvm, struct vgic_its *its)
+{
+ struct vgic_io_device *iodev = &its->iodev;
+ int ret;
+
+ if (its->initialized)
+ return 0;
+
+ if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base))
+ return -ENXIO;
+
+ iodev->regions = its_registers;
+ iodev->nr_regions = ARRAY_SIZE(its_registers);
+ kvm_iodevice_init(&iodev->dev, &kvm_io_gic_ops);
+
+ iodev->base_addr = its->vgic_its_base;
+ iodev->iodev_type = IODEV_ITS;
+ iodev->its = its;
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, iodev->base_addr,
+ KVM_VGIC_V3_ITS_SIZE, &iodev->dev);
+ mutex_unlock(&kvm->slots_lock);
+
+ if (!ret)
+ its->initialized = true;
+
+ return ret;
+}
+
+#define INITIAL_BASER_VALUE \
+ (GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb) | \
+ GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner) | \
+ GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) | \
+ ((8ULL - 1) << GITS_BASER_ENTRY_SIZE_SHIFT) | \
+ GITS_BASER_PAGE_SIZE_64K)
+
+#define INITIAL_PROPBASER_VALUE \
+ (GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb) | \
+ GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, SameAsInner) | \
+ GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable))
+
+static int vgic_its_create(struct kvm_device *dev, u32 type)
+{
+ struct vgic_its *its;
+
+ if (type != KVM_DEV_TYPE_ARM_VGIC_ITS)
+ return -ENODEV;
+
+ its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL);
+ if (!its)
+ return -ENOMEM;
+
+ mutex_init(&its->its_lock);
+ mutex_init(&its->cmd_lock);
+
+ its->vgic_its_base = VGIC_ADDR_UNDEF;
+
+ INIT_LIST_HEAD(&its->device_list);
+ INIT_LIST_HEAD(&its->collection_list);
+
+ dev->kvm->arch.vgic.has_its = true;
+ its->initialized = false;
+ its->enabled = false;
+ its->dev = dev;
+
+ its->baser_device_table = INITIAL_BASER_VALUE |
+ ((u64)GITS_BASER_TYPE_DEVICE << GITS_BASER_TYPE_SHIFT);
+ its->baser_coll_table = INITIAL_BASER_VALUE |
+ ((u64)GITS_BASER_TYPE_COLLECTION << GITS_BASER_TYPE_SHIFT);
+ dev->kvm->arch.vgic.propbaser = INITIAL_PROPBASER_VALUE;
+
+ dev->private = its;
+
+ return 0;
+}
+
+static void vgic_its_destroy(struct kvm_device *kvm_dev)
+{
+ struct kvm *kvm = kvm_dev->kvm;
+ struct vgic_its *its = kvm_dev->private;
+ struct its_device *dev;
+ struct its_itte *itte;
+ struct list_head *dev_cur, *dev_temp;
+ struct list_head *cur, *temp;
+
+ /*
+ * We may end up here without the lists ever having been initialized.
+ * Check this and bail out early to avoid dereferencing a NULL pointer.
+ */
+ if (!its->device_list.next)
+ return;
+
+ mutex_lock(&its->its_lock);
+ list_for_each_safe(dev_cur, dev_temp, &its->device_list) {
+ dev = container_of(dev_cur, struct its_device, dev_list);
+ list_for_each_safe(cur, temp, &dev->itt_head) {
+ itte = (container_of(cur, struct its_itte, itte_list));
+ its_free_itte(kvm, itte);
+ }
+ list_del(dev_cur);
+ kfree(dev);
+ }
+
+ list_for_each_safe(cur, temp, &its->collection_list) {
+ list_del(cur);
+ kfree(container_of(cur, struct its_collection, coll_list));
+ }
+ mutex_unlock(&its->its_lock);
+
+ kfree(its);
+}
+
+static int vgic_its_has_attr(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_ADDR:
+ switch (attr->attr) {
+ case KVM_VGIC_ITS_ADDR_TYPE:
+ return 0;
+ }
+ break;
+ case KVM_DEV_ARM_VGIC_GRP_CTRL:
+ switch (attr->attr) {
+ case KVM_DEV_ARM_VGIC_CTRL_INIT:
+ return 0;
+ }
+ break;
+ }
+ return -ENXIO;
+}
+
+static int vgic_its_set_attr(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ struct vgic_its *its = dev->private;
+ int ret;
+
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+ u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+ unsigned long type = (unsigned long)attr->attr;
+ u64 addr;
+
+ if (type != KVM_VGIC_ITS_ADDR_TYPE)
+ return -ENODEV;
+
+ if (its->initialized)
+ return -EBUSY;
+
+ if (copy_from_user(&addr, uaddr, sizeof(addr)))
+ return -EFAULT;
+
+ ret = vgic_check_ioaddr(dev->kvm, &its->vgic_its_base,
+ addr, SZ_64K);
+ if (ret)
+ return ret;
+
+ its->vgic_its_base = addr;
+
+ return 0;
+ }
+ case KVM_DEV_ARM_VGIC_GRP_CTRL:
+ switch (attr->attr) {
+ case KVM_DEV_ARM_VGIC_CTRL_INIT:
+ return vgic_its_init_its(dev->kvm, its);
+ }
+ break;
+ }
+ return -ENXIO;
+}
+
+static int vgic_its_get_attr(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+ struct vgic_its *its = dev->private;
+ u64 addr = its->vgic_its_base;
+ u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+ unsigned long type = (unsigned long)attr->attr;
+
+ if (type != KVM_VGIC_ITS_ADDR_TYPE)
+ return -ENODEV;
+
+ if (copy_to_user(uaddr, &addr, sizeof(addr)))
+ return -EFAULT;
+ break;
+ default:
+ return -ENXIO;
+ }
+ }
+
+ return 0;
+}
+
+static struct kvm_device_ops kvm_arm_vgic_its_ops = {
+ .name = "kvm-arm-vgic-its",
+ .create = vgic_its_create,
+ .destroy = vgic_its_destroy,
+ .set_attr = vgic_its_set_attr,
+ .get_attr = vgic_its_get_attr,
+ .has_attr = vgic_its_has_attr,
+};
+
+int kvm_vgic_register_its_device(void)
+{
+ return kvm_register_device_ops(&kvm_arm_vgic_its_ops,
+ KVM_DEV_TYPE_ARM_VGIC_ITS);
+}
diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c
index 0130c4b147b7..1813f93b5cde 100644
--- a/virt/kvm/arm/vgic/vgic-kvm-device.c
+++ b/virt/kvm/arm/vgic/vgic-kvm-device.c
@@ -21,8 +21,8 @@
/* common helpers */
-static int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
- phys_addr_t addr, phys_addr_t alignment)
+int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
+ phys_addr_t addr, phys_addr_t alignment)
{
if (addr & ~KVM_PHYS_MASK)
return -E2BIG;
@@ -210,20 +210,27 @@ static void vgic_destroy(struct kvm_device *dev)
kfree(dev);
}
-void kvm_register_vgic_device(unsigned long type)
+int kvm_register_vgic_device(unsigned long type)
{
+ int ret = -ENODEV;
+
switch (type) {
case KVM_DEV_TYPE_ARM_VGIC_V2:
- kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
- KVM_DEV_TYPE_ARM_VGIC_V2);
+ ret = kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
+ KVM_DEV_TYPE_ARM_VGIC_V2);
break;
#ifdef CONFIG_KVM_ARM_VGIC_V3
case KVM_DEV_TYPE_ARM_VGIC_V3:
- kvm_register_device_ops(&kvm_arm_vgic_v3_ops,
- KVM_DEV_TYPE_ARM_VGIC_V3);
+ ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops,
+ KVM_DEV_TYPE_ARM_VGIC_V3);
+ if (ret)
+ break;
+ ret = kvm_vgic_register_its_device();
break;
#endif
}
+
+ return ret;
}
/** vgic_attr_regs_access: allows user space to read/write VGIC registers
@@ -428,4 +435,3 @@ struct kvm_device_ops kvm_arm_vgic_v3_ops = {
};
#endif /* CONFIG_KVM_ARM_VGIC_V3 */
-
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c
index a21393637e4b..b44b359cbbad 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v2.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c
@@ -102,6 +102,7 @@ static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu,
irq->source |= 1U << source_vcpu->vcpu_id;
vgic_queue_irq_unlock(source_vcpu->kvm, irq);
+ vgic_put_irq(source_vcpu->kvm, irq);
}
}
@@ -116,6 +117,8 @@ static unsigned long vgic_mmio_read_target(struct kvm_vcpu *vcpu,
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
val |= (u64)irq->targets << (i * 8);
+
+ vgic_put_irq(vcpu->kvm, irq);
}
return val;
@@ -143,6 +146,7 @@ static void vgic_mmio_write_target(struct kvm_vcpu *vcpu,
irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target);
spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
}
}
@@ -157,6 +161,8 @@ static unsigned long vgic_mmio_read_sgipend(struct kvm_vcpu *vcpu,
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
val |= (u64)irq->source << (i * 8);
+
+ vgic_put_irq(vcpu->kvm, irq);
}
return val;
}
@@ -178,6 +184,7 @@ static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu,
irq->pending = false;
spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
}
}
@@ -201,6 +208,7 @@ static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu,
} else {
spin_unlock(&irq->irq_lock);
}
+ vgic_put_irq(vcpu->kvm, irq);
}
}
@@ -429,6 +437,7 @@ int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write,
struct vgic_io_device dev = {
.regions = vgic_v2_cpu_registers,
.nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers),
+ .iodev_type = IODEV_CPUIF,
};
return vgic_uaccess(vcpu, &dev, is_write, offset, val);
@@ -440,6 +449,7 @@ int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
struct vgic_io_device dev = {
.regions = vgic_v2_dist_registers,
.nr_regions = ARRAY_SIZE(vgic_v2_dist_registers),
+ .iodev_type = IODEV_DIST,
};
return vgic_uaccess(vcpu, &dev, is_write, offset, val);
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c
index a0c515a412a7..ff668e0dd586 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v3.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c
@@ -23,12 +23,35 @@
#include "vgic-mmio.h"
/* extract @num bytes at @offset bytes offset in data */
-static unsigned long extract_bytes(unsigned long data, unsigned int offset,
- unsigned int num)
+unsigned long extract_bytes(unsigned long data, unsigned int offset,
+ unsigned int num)
{
return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0);
}
+/* allows updates of any half of a 64-bit register (or the whole thing) */
+u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
+ unsigned long val)
+{
+ int lower = (offset & 4) * 8;
+ int upper = lower + 8 * len - 1;
+
+ reg &= ~GENMASK_ULL(upper, lower);
+ val &= GENMASK_ULL(len * 8 - 1, 0);
+
+ return reg | ((u64)val << lower);
+}
+
+bool vgic_has_its(struct kvm *kvm)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+
+ if (dist->vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3)
+ return false;
+
+ return dist->has_its;
+}
+
static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len)
{
@@ -43,7 +66,12 @@ static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
case GICD_TYPER:
value = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
value = (value >> 5) - 1;
- value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19;
+ if (vgic_has_its(vcpu->kvm)) {
+ value |= (INTERRUPT_ID_BITS_ITS - 1) << 19;
+ value |= GICD_TYPER_LPIS;
+ } else {
+ value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19;
+ }
break;
case GICD_IIDR:
value = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
@@ -80,15 +108,17 @@ static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu,
{
int intid = VGIC_ADDR_TO_INTID(addr, 64);
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid);
+ unsigned long ret = 0;
if (!irq)
return 0;
/* The upper word is RAZ for us. */
- if (addr & 4)
- return 0;
+ if (!(addr & 4))
+ ret = extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len);
- return extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len);
+ vgic_put_irq(vcpu->kvm, irq);
+ return ret;
}
static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
@@ -96,15 +126,17 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
unsigned long val)
{
int intid = VGIC_ADDR_TO_INTID(addr, 64);
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid);
-
- if (!irq)
- return;
+ struct vgic_irq *irq;
/* The upper word is WI for us since we don't implement Aff3. */
if (addr & 4)
return;
+ irq = vgic_get_irq(vcpu->kvm, NULL, intid);
+
+ if (!irq)
+ return;
+
spin_lock(&irq->irq_lock);
/* We only care about and preserve Aff0, Aff1 and Aff2. */
@@ -112,6 +144,32 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr);
spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
+}
+
+static unsigned long vgic_mmio_read_v3r_ctlr(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+ return vgic_cpu->lpis_enabled ? GICR_CTLR_ENABLE_LPIS : 0;
+}
+
+
+static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ bool was_enabled = vgic_cpu->lpis_enabled;
+
+ if (!vgic_has_its(vcpu->kvm))
+ return;
+
+ vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS;
+
+ if (!was_enabled && vgic_cpu->lpis_enabled)
+ vgic_enable_lpis(vcpu);
}
static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
@@ -125,6 +183,8 @@ static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
value |= ((target_vcpu_id & 0xffff) << 8);
if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1)
value |= GICR_TYPER_LAST;
+ if (vgic_has_its(vcpu->kvm))
+ value |= GICR_TYPER_PLPIS;
return extract_bytes(value, addr & 7, len);
}
@@ -147,6 +207,142 @@ static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu,
return 0;
}
+/* We want to avoid outer shareable. */
+u64 vgic_sanitise_shareability(u64 field)
+{
+ switch (field) {
+ case GIC_BASER_OuterShareable:
+ return GIC_BASER_InnerShareable;
+ default:
+ return field;
+ }
+}
+
+/* Avoid any inner non-cacheable mapping. */
+u64 vgic_sanitise_inner_cacheability(u64 field)
+{
+ switch (field) {
+ case GIC_BASER_CACHE_nCnB:
+ case GIC_BASER_CACHE_nC:
+ return GIC_BASER_CACHE_RaWb;
+ default:
+ return field;
+ }
+}
+
+/* Non-cacheable or same-as-inner are OK. */
+u64 vgic_sanitise_outer_cacheability(u64 field)
+{
+ switch (field) {
+ case GIC_BASER_CACHE_SameAsInner:
+ case GIC_BASER_CACHE_nC:
+ return field;
+ default:
+ return GIC_BASER_CACHE_nC;
+ }
+}
+
+u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
+ u64 (*sanitise_fn)(u64))
+{
+ u64 field = (reg & field_mask) >> field_shift;
+
+ field = sanitise_fn(field) << field_shift;
+ return (reg & ~field_mask) | field;
+}
+
+#define PROPBASER_RES0_MASK \
+ (GENMASK_ULL(63, 59) | GENMASK_ULL(55, 52) | GENMASK_ULL(6, 5))
+#define PENDBASER_RES0_MASK \
+ (BIT_ULL(63) | GENMASK_ULL(61, 59) | GENMASK_ULL(55, 52) | \
+ GENMASK_ULL(15, 12) | GENMASK_ULL(6, 0))
+
+static u64 vgic_sanitise_pendbaser(u64 reg)
+{
+ reg = vgic_sanitise_field(reg, GICR_PENDBASER_SHAREABILITY_MASK,
+ GICR_PENDBASER_SHAREABILITY_SHIFT,
+ vgic_sanitise_shareability);
+ reg = vgic_sanitise_field(reg, GICR_PENDBASER_INNER_CACHEABILITY_MASK,
+ GICR_PENDBASER_INNER_CACHEABILITY_SHIFT,
+ vgic_sanitise_inner_cacheability);
+ reg = vgic_sanitise_field(reg, GICR_PENDBASER_OUTER_CACHEABILITY_MASK,
+ GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT,
+ vgic_sanitise_outer_cacheability);
+
+ reg &= ~PENDBASER_RES0_MASK;
+ reg &= ~GENMASK_ULL(51, 48);
+
+ return reg;
+}
+
+static u64 vgic_sanitise_propbaser(u64 reg)
+{
+ reg = vgic_sanitise_field(reg, GICR_PROPBASER_SHAREABILITY_MASK,
+ GICR_PROPBASER_SHAREABILITY_SHIFT,
+ vgic_sanitise_shareability);
+ reg = vgic_sanitise_field(reg, GICR_PROPBASER_INNER_CACHEABILITY_MASK,
+ GICR_PROPBASER_INNER_CACHEABILITY_SHIFT,
+ vgic_sanitise_inner_cacheability);
+ reg = vgic_sanitise_field(reg, GICR_PROPBASER_OUTER_CACHEABILITY_MASK,
+ GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT,
+ vgic_sanitise_outer_cacheability);
+
+ reg &= ~PROPBASER_RES0_MASK;
+ reg &= ~GENMASK_ULL(51, 48);
+ return reg;
+}
+
+static unsigned long vgic_mmio_read_propbase(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+ return extract_bytes(dist->propbaser, addr & 7, len);
+}
+
+static void vgic_mmio_write_propbase(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ u64 propbaser = dist->propbaser;
+
+ /* Storing a value with LPIs already enabled is undefined */
+ if (vgic_cpu->lpis_enabled)
+ return;
+
+ propbaser = update_64bit_reg(propbaser, addr & 4, len, val);
+ propbaser = vgic_sanitise_propbaser(propbaser);
+
+ dist->propbaser = propbaser;
+}
+
+static unsigned long vgic_mmio_read_pendbase(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+ return extract_bytes(vgic_cpu->pendbaser, addr & 7, len);
+}
+
+static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ u64 pendbaser = vgic_cpu->pendbaser;
+
+ /* Storing a value with LPIs already enabled is undefined */
+ if (vgic_cpu->lpis_enabled)
+ return;
+
+ pendbaser = update_64bit_reg(pendbaser, addr & 4, len, val);
+ pendbaser = vgic_sanitise_pendbaser(pendbaser);
+
+ vgic_cpu->pendbaser = pendbaser;
+}
+
/*
* The GICv3 per-IRQ registers are split to control PPIs and SGIs in the
* redistributors, while SPIs are covered by registers in the distributor
@@ -218,7 +414,7 @@ static const struct vgic_register_region vgic_v3_dist_registers[] = {
static const struct vgic_register_region vgic_v3_rdbase_registers[] = {
REGISTER_DESC_WITH_LENGTH(GICR_CTLR,
- vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
+ vgic_mmio_read_v3r_ctlr, vgic_mmio_write_v3r_ctlr, 4,
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_LENGTH(GICR_IIDR,
vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4,
@@ -227,10 +423,10 @@ static const struct vgic_register_region vgic_v3_rdbase_registers[] = {
vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, 8,
VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_LENGTH(GICR_PROPBASER,
- vgic_mmio_read_raz, vgic_mmio_write_wi, 8,
+ vgic_mmio_read_propbase, vgic_mmio_write_propbase, 8,
VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_LENGTH(GICR_PENDBASER,
- vgic_mmio_read_raz, vgic_mmio_write_wi, 8,
+ vgic_mmio_read_pendbase, vgic_mmio_write_pendbase, 8,
VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_LENGTH(GICR_IDREGS,
vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48,
@@ -285,24 +481,18 @@ unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev)
int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address)
{
- int nr_vcpus = atomic_read(&kvm->online_vcpus);
struct kvm_vcpu *vcpu;
- struct vgic_io_device *devices;
int c, ret = 0;
- devices = kmalloc(sizeof(struct vgic_io_device) * nr_vcpus * 2,
- GFP_KERNEL);
- if (!devices)
- return -ENOMEM;
-
kvm_for_each_vcpu(c, vcpu, kvm) {
gpa_t rd_base = redist_base_address + c * SZ_64K * 2;
gpa_t sgi_base = rd_base + SZ_64K;
- struct vgic_io_device *rd_dev = &devices[c * 2];
- struct vgic_io_device *sgi_dev = &devices[c * 2 + 1];
+ struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev;
+ struct vgic_io_device *sgi_dev = &vcpu->arch.vgic_cpu.sgi_iodev;
kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops);
rd_dev->base_addr = rd_base;
+ rd_dev->iodev_type = IODEV_REDIST;
rd_dev->regions = vgic_v3_rdbase_registers;
rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rdbase_registers);
rd_dev->redist_vcpu = vcpu;
@@ -317,6 +507,7 @@ int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address)
kvm_iodevice_init(&sgi_dev->dev, &kvm_io_gic_ops);
sgi_dev->base_addr = sgi_base;
+ sgi_dev->iodev_type = IODEV_REDIST;
sgi_dev->regions = vgic_v3_sgibase_registers;
sgi_dev->nr_regions = ARRAY_SIZE(vgic_v3_sgibase_registers);
sgi_dev->redist_vcpu = vcpu;
@@ -335,14 +526,15 @@ int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address)
if (ret) {
/* The current c failed, so we start with the previous one. */
for (c--; c >= 0; c--) {
+ struct vgic_cpu *vgic_cpu;
+
+ vcpu = kvm_get_vcpu(kvm, c);
+ vgic_cpu = &vcpu->arch.vgic_cpu;
kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
- &devices[c * 2].dev);
+ &vgic_cpu->rd_iodev.dev);
kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
- &devices[c * 2 + 1].dev);
+ &vgic_cpu->sgi_iodev.dev);
}
- kfree(devices);
- } else {
- kvm->arch.vgic.redist_iodevs = devices;
}
return ret;
@@ -451,5 +643,6 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
irq->pending = true;
vgic_queue_irq_unlock(vcpu->kvm, irq);
+ vgic_put_irq(vcpu->kvm, irq);
}
}
diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
index 9f6fab74dce7..3bad3c5ed431 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.c
+++ b/virt/kvm/arm/vgic/vgic-mmio.c
@@ -56,6 +56,8 @@ unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu,
if (irq->enabled)
value |= (1U << i);
+
+ vgic_put_irq(vcpu->kvm, irq);
}
return value;
@@ -74,6 +76,8 @@ void vgic_mmio_write_senable(struct kvm_vcpu *vcpu,
spin_lock(&irq->irq_lock);
irq->enabled = true;
vgic_queue_irq_unlock(vcpu->kvm, irq);
+
+ vgic_put_irq(vcpu->kvm, irq);
}
}
@@ -92,6 +96,7 @@ void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
irq->enabled = false;
spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
}
}
@@ -108,6 +113,8 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
if (irq->pending)
value |= (1U << i);
+
+ vgic_put_irq(vcpu->kvm, irq);
}
return value;
@@ -129,6 +136,7 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
irq->soft_pending = true;
vgic_queue_irq_unlock(vcpu->kvm, irq);
+ vgic_put_irq(vcpu->kvm, irq);
}
}
@@ -152,6 +160,7 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
}
spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
}
}
@@ -168,6 +177,8 @@ unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
if (irq->active)
value |= (1U << i);
+
+ vgic_put_irq(vcpu->kvm, irq);
}
return value;
@@ -242,6 +253,7 @@ void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
for_each_set_bit(i, &val, len * 8) {
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
vgic_mmio_change_active(vcpu, irq, false);
+ vgic_put_irq(vcpu->kvm, irq);
}
vgic_change_active_finish(vcpu, intid);
}
@@ -257,6 +269,7 @@ void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
for_each_set_bit(i, &val, len * 8) {
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
vgic_mmio_change_active(vcpu, irq, true);
+ vgic_put_irq(vcpu->kvm, irq);
}
vgic_change_active_finish(vcpu, intid);
}
@@ -272,6 +285,8 @@ unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
val |= (u64)irq->priority << (i * 8);
+
+ vgic_put_irq(vcpu->kvm, irq);
}
return val;
@@ -298,6 +313,8 @@ void vgic_mmio_write_priority(struct kvm_vcpu *vcpu,
/* Narrow the priority range to what we actually support */
irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS);
spin_unlock(&irq->irq_lock);
+
+ vgic_put_irq(vcpu->kvm, irq);
}
}
@@ -313,6 +330,8 @@ unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu,
if (irq->config == VGIC_CONFIG_EDGE)
value |= (2U << (i * 2));
+
+ vgic_put_irq(vcpu->kvm, irq);
}
return value;
@@ -326,7 +345,7 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
int i;
for (i = 0; i < len * 4; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq;
/*
* The configuration cannot be changed for SGIs in general,
@@ -337,14 +356,18 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
if (intid + i < VGIC_NR_PRIVATE_IRQS)
continue;
+ irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
spin_lock(&irq->irq_lock);
+
if (test_bit(i * 2 + 1, &val)) {
irq->config = VGIC_CONFIG_EDGE;
} else {
irq->config = VGIC_CONFIG_LEVEL;
irq->pending = irq->line_level | irq->soft_pending;
}
+
spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
}
}
@@ -450,8 +473,7 @@ static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
{
struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
const struct vgic_register_region *region;
- struct kvm_vcpu *r_vcpu;
- unsigned long data;
+ unsigned long data = 0;
region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions,
addr - iodev->base_addr);
@@ -460,8 +482,21 @@ static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
return 0;
}
- r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu;
- data = region->read(r_vcpu, addr, len);
+ switch (iodev->iodev_type) {
+ case IODEV_CPUIF:
+ data = region->read(vcpu, addr, len);
+ break;
+ case IODEV_DIST:
+ data = region->read(vcpu, addr, len);
+ break;
+ case IODEV_REDIST:
+ data = region->read(iodev->redist_vcpu, addr, len);
+ break;
+ case IODEV_ITS:
+ data = region->its_read(vcpu->kvm, iodev->its, addr, len);
+ break;
+ }
+
vgic_data_host_to_mmio_bus(val, len, data);
return 0;
}
@@ -471,7 +506,6 @@ static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
{
struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
const struct vgic_register_region *region;
- struct kvm_vcpu *r_vcpu;
unsigned long data = vgic_data_mmio_bus_to_host(val, len);
region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions,
@@ -482,8 +516,21 @@ static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
if (!check_region(region, addr, len))
return 0;
- r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu;
- region->write(r_vcpu, addr, len, data);
+ switch (iodev->iodev_type) {
+ case IODEV_CPUIF:
+ region->write(vcpu, addr, len, data);
+ break;
+ case IODEV_DIST:
+ region->write(vcpu, addr, len, data);
+ break;
+ case IODEV_REDIST:
+ region->write(iodev->redist_vcpu, addr, len, data);
+ break;
+ case IODEV_ITS:
+ region->its_write(vcpu->kvm, iodev->its, addr, len, data);
+ break;
+ }
+
return 0;
}
@@ -513,6 +560,7 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
}
io_device->base_addr = dist_base_address;
+ io_device->iodev_type = IODEV_DIST;
io_device->redist_vcpu = NULL;
mutex_lock(&kvm->slots_lock);
diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h
index 850901482aec..0b3ecf9d100e 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.h
+++ b/virt/kvm/arm/vgic/vgic-mmio.h
@@ -21,10 +21,19 @@ struct vgic_register_region {
unsigned int len;
unsigned int bits_per_irq;
unsigned int access_flags;
- unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr,
- unsigned int len);
- void (*write)(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len,
- unsigned long val);
+ union {
+ unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr,
+ unsigned int len);
+ unsigned long (*its_read)(struct kvm *kvm, struct vgic_its *its,
+ gpa_t addr, unsigned int len);
+ };
+ union {
+ void (*write)(struct kvm_vcpu *vcpu, gpa_t addr,
+ unsigned int len, unsigned long val);
+ void (*its_write)(struct kvm *kvm, struct vgic_its *its,
+ gpa_t addr, unsigned int len,
+ unsigned long val);
+ };
};
extern struct kvm_io_device_ops kvm_io_gic_ops;
@@ -87,6 +96,12 @@ unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len);
void vgic_data_host_to_mmio_bus(void *buf, unsigned int len,
unsigned long data);
+unsigned long extract_bytes(unsigned long data, unsigned int offset,
+ unsigned int num);
+
+u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
+ unsigned long val);
+
unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len);
@@ -147,4 +162,12 @@ unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev);
unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev);
+#ifdef CONFIG_KVM_ARM_VGIC_V3
+u64 vgic_sanitise_outer_cacheability(u64 reg);
+u64 vgic_sanitise_inner_cacheability(u64 reg);
+u64 vgic_sanitise_shareability(u64 reg);
+u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
+ u64 (*sanitise_fn)(u64));
+#endif
+
#endif
diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c
index e31405ee5515..0bf6709d1006 100644
--- a/virt/kvm/arm/vgic/vgic-v2.c
+++ b/virt/kvm/arm/vgic/vgic-v2.c
@@ -124,6 +124,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
}
spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
}
}
@@ -332,20 +333,25 @@ int vgic_v2_probe(const struct gic_kvm_info *info)
vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR);
kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1;
+ ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+ if (ret) {
+ kvm_err("Cannot register GICv2 KVM device\n");
+ iounmap(kvm_vgic_global_state.vctrl_base);
+ return ret;
+ }
+
ret = create_hyp_io_mappings(kvm_vgic_global_state.vctrl_base,
kvm_vgic_global_state.vctrl_base +
resource_size(&info->vctrl),
info->vctrl.start);
-
if (ret) {
kvm_err("Cannot map VCTRL into hyp\n");
+ kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2);
iounmap(kvm_vgic_global_state.vctrl_base);
return ret;
}
kvm_vgic_global_state.can_emulate_gicv2 = true;
- kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
-
kvm_vgic_global_state.vcpu_base = info->vcpu.start;
kvm_vgic_global_state.type = VGIC_V2;
kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS;
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
index 346b4ad12b49..0506543df38a 100644
--- a/virt/kvm/arm/vgic/vgic-v3.c
+++ b/virt/kvm/arm/vgic/vgic-v3.c
@@ -81,6 +81,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
else
intid = val & GICH_LR_VIRTUALID;
irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
+ if (!irq) /* An LPI could have been unmapped. */
+ continue;
spin_lock(&irq->irq_lock);
@@ -113,6 +115,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
}
spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
}
}
@@ -190,6 +193,11 @@ void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
vmcrp->pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
}
+#define INITIAL_PENDBASER_VALUE \
+ (GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb) | \
+ GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner) | \
+ GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable))
+
void vgic_v3_enable(struct kvm_vcpu *vcpu)
{
struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
@@ -207,10 +215,12 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu)
* way, so we force SRE to 1 to demonstrate this to the guest.
* This goes with the spec allowing the value to be RAO/WI.
*/
- if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
+ if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
vgic_v3->vgic_sre = ICC_SRE_EL1_SRE;
- else
+ vcpu->arch.vgic_cpu.pendbaser = INITIAL_PENDBASER_VALUE;
+ } else {
vgic_v3->vgic_sre = 0;
+ }
/* Get the show on the road... */
vgic_v3->vgic_hcr = ICH_HCR_EN;
@@ -296,6 +306,7 @@ out:
int vgic_v3_probe(const struct gic_kvm_info *info)
{
u32 ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2);
+ int ret;
/*
* The ListRegs field is 5 bits, but there is a architectural
@@ -319,12 +330,22 @@ int vgic_v3_probe(const struct gic_kvm_info *info)
} else {
kvm_vgic_global_state.vcpu_base = info->vcpu.start;
kvm_vgic_global_state.can_emulate_gicv2 = true;
- kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+ ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+ if (ret) {
+ kvm_err("Cannot register GICv2 KVM device.\n");
+ return ret;
+ }
kvm_info("vgic-v2@%llx\n", info->vcpu.start);
}
+ ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3);
+ if (ret) {
+ kvm_err("Cannot register GICv3 KVM device.\n");
+ kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2);
+ return ret;
+ }
+
if (kvm_vgic_global_state.vcpu_base == 0)
kvm_info("disabling GICv2 emulation\n");
- kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3);
kvm_vgic_global_state.vctrl_base = NULL;
kvm_vgic_global_state.type = VGIC_V3;
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index 69b61abefa19..39f3358c6d91 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -33,10 +33,17 @@ struct vgic_global __section(.hyp.text) kvm_vgic_global_state;
/*
* Locking order is always:
- * vgic_cpu->ap_list_lock
- * vgic_irq->irq_lock
+ * its->cmd_lock (mutex)
+ * its->its_lock (mutex)
+ * vgic_cpu->ap_list_lock
+ * kvm->lpi_list_lock
+ * vgic_irq->irq_lock
*
- * (that is, always take the ap_list_lock before the struct vgic_irq lock).
+ * If you need to take multiple locks, always take the upper lock first,
+ * then the lower ones, e.g. first take the its_lock, then the irq_lock.
+ * If you are already holding a lock and need to take a higher one, you
+ * have to drop the lower ranking lock first and re-aquire it after having
+ * taken the upper one.
*
* When taking more than one ap_list_lock at the same time, always take the
* lowest numbered VCPU's ap_list_lock first, so:
@@ -45,6 +52,41 @@ struct vgic_global __section(.hyp.text) kvm_vgic_global_state;
* spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock);
*/
+/*
+ * Iterate over the VM's list of mapped LPIs to find the one with a
+ * matching interrupt ID and return a reference to the IRQ structure.
+ */
+static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ struct vgic_irq *irq = NULL;
+
+ spin_lock(&dist->lpi_list_lock);
+
+ list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+ if (irq->intid != intid)
+ continue;
+
+ /*
+ * This increases the refcount, the caller is expected to
+ * call vgic_put_irq() later once it's finished with the IRQ.
+ */
+ vgic_get_irq_kref(irq);
+ goto out_unlock;
+ }
+ irq = NULL;
+
+out_unlock:
+ spin_unlock(&dist->lpi_list_lock);
+
+ return irq;
+}
+
+/*
+ * This looks up the virtual interrupt ID to get the corresponding
+ * struct vgic_irq. It also increases the refcount, so any caller is expected
+ * to call vgic_put_irq() once it's finished with this IRQ.
+ */
struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
u32 intid)
{
@@ -56,14 +98,43 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
if (intid <= VGIC_MAX_SPI)
return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS];
- /* LPIs are not yet covered */
+ /* LPIs */
if (intid >= VGIC_MIN_LPI)
- return NULL;
+ return vgic_get_lpi(kvm, intid);
WARN(1, "Looking up struct vgic_irq for reserved INTID");
return NULL;
}
+/*
+ * We can't do anything in here, because we lack the kvm pointer to
+ * lock and remove the item from the lpi_list. So we keep this function
+ * empty and use the return value of kref_put() to trigger the freeing.
+ */
+static void vgic_irq_release(struct kref *ref)
+{
+}
+
+void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
+{
+ struct vgic_dist *dist;
+
+ if (irq->intid < VGIC_MIN_LPI)
+ return;
+
+ if (!kref_put(&irq->refcount, vgic_irq_release))
+ return;
+
+ dist = &kvm->arch.vgic;
+
+ spin_lock(&dist->lpi_list_lock);
+ list_del(&irq->lpi_list);
+ dist->lpi_list_count--;
+ spin_unlock(&dist->lpi_list_lock);
+
+ kfree(irq);
+}
+
/**
* kvm_vgic_target_oracle - compute the target vcpu for an irq
*
@@ -236,6 +307,11 @@ retry:
goto retry;
}
+ /*
+ * Grab a reference to the irq to reflect the fact that it is
+ * now in the ap_list.
+ */
+ vgic_get_irq_kref(irq);
list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
irq->vcpu = vcpu;
@@ -269,14 +345,17 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
if (!irq)
return -EINVAL;
- if (irq->hw != mapped_irq)
+ if (irq->hw != mapped_irq) {
+ vgic_put_irq(kvm, irq);
return -EINVAL;
+ }
spin_lock(&irq->irq_lock);
if (!vgic_validate_injection(irq, level)) {
/* Nothing to see here, move along... */
spin_unlock(&irq->irq_lock);
+ vgic_put_irq(kvm, irq);
return 0;
}
@@ -288,6 +367,7 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
}
vgic_queue_irq_unlock(kvm, irq);
+ vgic_put_irq(kvm, irq);
return 0;
}
@@ -330,25 +410,28 @@ int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq)
irq->hwintid = phys_irq;
spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
return 0;
}
int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq)
{
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq);
-
- BUG_ON(!irq);
+ struct vgic_irq *irq;
if (!vgic_initialized(vcpu->kvm))
return -EAGAIN;
+ irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq);
+ BUG_ON(!irq);
+
spin_lock(&irq->irq_lock);
irq->hw = false;
irq->hwintid = 0;
spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
return 0;
}
@@ -386,6 +469,15 @@ retry:
list_del(&irq->ap_list);
irq->vcpu = NULL;
spin_unlock(&irq->irq_lock);
+
+ /*
+ * This vgic_put_irq call matches the
+ * vgic_get_irq_kref in vgic_queue_irq_unlock,
+ * where we added the LPI to the ap_list. As
+ * we remove the irq from the list, we drop
+ * also drop the refcount.
+ */
+ vgic_put_irq(vcpu->kvm, irq);
continue;
}
@@ -614,6 +706,15 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq)
spin_lock(&irq->irq_lock);
map_is_active = irq->hw && irq->active;
spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
return map_is_active;
}
+
+int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+ if (vgic_has_its(kvm))
+ return vgic_its_inject_msi(kvm, msi);
+ else
+ return -ENODEV;
+}
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index 7b300ca370b7..1d8e21d5c13f 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -25,6 +25,7 @@
#define IS_VGIC_ADDR_UNDEF(_x) ((_x) == VGIC_ADDR_UNDEF)
#define INTERRUPT_ID_BITS_SPIS 10
+#define INTERRUPT_ID_BITS_ITS 16
#define VGIC_PRI_BITS 5
#define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS)
@@ -38,9 +39,13 @@ struct vgic_vmcr {
struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
u32 intid);
+void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq);
bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq);
void vgic_kick_vcpus(struct kvm *kvm);
+int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
+ phys_addr_t addr, phys_addr_t alignment);
+
void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu);
void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu);
void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
@@ -59,6 +64,14 @@ int vgic_v2_map_resources(struct kvm *kvm);
int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
enum vgic_type);
+static inline void vgic_get_irq_kref(struct vgic_irq *irq)
+{
+ if (irq->intid < VGIC_MIN_LPI)
+ return;
+
+ kref_get(&irq->refcount);
+}
+
#ifdef CONFIG_KVM_ARM_VGIC_V3
void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu);
void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
@@ -71,6 +84,10 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu);
int vgic_v3_probe(const struct gic_kvm_info *info);
int vgic_v3_map_resources(struct kvm *kvm);
int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address);
+bool vgic_has_its(struct kvm *kvm);
+int kvm_vgic_register_its_device(void);
+void vgic_enable_lpis(struct kvm_vcpu *vcpu);
+int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi);
#else
static inline void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu)
{
@@ -122,9 +139,28 @@ static inline int vgic_register_redist_iodevs(struct kvm *kvm,
{
return -ENODEV;
}
+
+static inline bool vgic_has_its(struct kvm *kvm)
+{
+ return false;
+}
+
+static inline int kvm_vgic_register_its_device(void)
+{
+ return -ENODEV;
+}
+
+static inline void vgic_enable_lpis(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+ return -ENODEV;
+}
#endif
-void kvm_register_vgic_device(unsigned long type);
+int kvm_register_vgic_device(unsigned long type);
int vgic_lazy_init(struct kvm *kvm);
int vgic_init(struct kvm *kvm);
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index 8db197bb6c7a..df99e9c3b64d 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -135,7 +135,8 @@ void kvm_free_irq_routing(struct kvm *kvm)
free_irq_routing_table(rt);
}
-static int setup_routing_entry(struct kvm_irq_routing_table *rt,
+static int setup_routing_entry(struct kvm *kvm,
+ struct kvm_irq_routing_table *rt,
struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
{
@@ -154,7 +155,7 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
e->gsi = ue->gsi;
e->type = ue->type;
- r = kvm_set_routing_entry(e, ue);
+ r = kvm_set_routing_entry(kvm, e, ue);
if (r)
goto out;
if (e->type == KVM_IRQ_ROUTING_IRQCHIP)
@@ -211,7 +212,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
kfree(e);
goto out;
}
- r = setup_routing_entry(new, e, ue);
+ r = setup_routing_entry(kvm, new, e, ue);
if (r) {
kfree(e);
goto out;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2e791367c576..cc081ccfcaa3 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1444,6 +1444,52 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
return true;
}
+static int hva_to_pfn_remapped(struct vm_area_struct *vma,
+ unsigned long addr, bool *async,
+ bool write_fault, kvm_pfn_t *p_pfn)
+{
+ unsigned long pfn;
+ int r;
+
+ r = follow_pfn(vma, addr, &pfn);
+ if (r) {
+ /*
+ * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
+ * not call the fault handler, so do it here.
+ */
+ bool unlocked = false;
+ r = fixup_user_fault(current, current->mm, addr,
+ (write_fault ? FAULT_FLAG_WRITE : 0),
+ &unlocked);
+ if (unlocked)
+ return -EAGAIN;
+ if (r)
+ return r;
+
+ r = follow_pfn(vma, addr, &pfn);
+ if (r)
+ return r;
+
+ }
+
+
+ /*
+ * Get a reference here because callers of *hva_to_pfn* and
+ * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
+ * returned pfn. This is only needed if the VMA has VM_MIXEDMAP
+ * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will
+ * simply do nothing for reserved pfns.
+ *
+ * Whoever called remap_pfn_range is also going to call e.g.
+ * unmap_mapping_range before the underlying pages are freed,
+ * causing a call to our MMU notifier.
+ */
+ kvm_get_pfn(pfn);
+
+ *p_pfn = pfn;
+ return 0;
+}
+
/*
* Pin guest page in memory and return its pfn.
* @addr: host virtual address which maps memory to the guest
@@ -1463,7 +1509,7 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
{
struct vm_area_struct *vma;
kvm_pfn_t pfn = 0;
- int npages;
+ int npages, r;
/* we can do it either atomically or asynchronously, not both */
BUG_ON(atomic && async);
@@ -1485,14 +1531,17 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
goto exit;
}
+retry:
vma = find_vma_intersection(current->mm, addr, addr + 1);
if (vma == NULL)
pfn = KVM_PFN_ERR_FAULT;
- else if ((vma->vm_flags & VM_PFNMAP)) {
- pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
- vma->vm_pgoff;
- BUG_ON(!kvm_is_reserved_pfn(pfn));
+ else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
+ r = hva_to_pfn_remapped(vma, addr, async, write_fault, &pfn);
+ if (r == -EAGAIN)
+ goto retry;
+ if (r < 0)
+ pfn = KVM_PFN_ERR_FAULT;
} else {
if (async && vma_is_valid(vma, write_fault))
*async = true;
@@ -2348,9 +2397,20 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
if (id >= KVM_MAX_VCPU_ID)
return -EINVAL;
+ mutex_lock(&kvm->lock);
+ if (kvm->created_vcpus == KVM_MAX_VCPUS) {
+ mutex_unlock(&kvm->lock);
+ return -EINVAL;
+ }
+
+ kvm->created_vcpus++;
+ mutex_unlock(&kvm->lock);
+
vcpu = kvm_arch_vcpu_create(kvm, id);
- if (IS_ERR(vcpu))
- return PTR_ERR(vcpu);
+ if (IS_ERR(vcpu)) {
+ r = PTR_ERR(vcpu);
+ goto vcpu_decrement;
+ }
preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
@@ -2359,14 +2419,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
goto vcpu_destroy;
mutex_lock(&kvm->lock);
- if (!kvm_vcpu_compatible(vcpu)) {
- r = -EINVAL;
- goto unlock_vcpu_destroy;
- }
- if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
- r = -EINVAL;
- goto unlock_vcpu_destroy;
- }
if (kvm_get_vcpu_by_id(kvm, id)) {
r = -EEXIST;
goto unlock_vcpu_destroy;
@@ -2399,6 +2451,10 @@ unlock_vcpu_destroy:
mutex_unlock(&kvm->lock);
vcpu_destroy:
kvm_arch_vcpu_destroy(vcpu);
+vcpu_decrement:
+ mutex_lock(&kvm->lock);
+ kvm->created_vcpus--;
+ mutex_unlock(&kvm->lock);
return r;
}
@@ -3487,6 +3543,30 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
return r;
}
+struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+ gpa_t addr)
+{
+ struct kvm_io_bus *bus;
+ int dev_idx, srcu_idx;
+ struct kvm_io_device *iodev = NULL;
+
+ srcu_idx = srcu_read_lock(&kvm->srcu);
+
+ bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+
+ dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
+ if (dev_idx < 0)
+ goto out_unlock;
+
+ iodev = bus->range[dev_idx].dev;
+
+out_unlock:
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+ return iodev;
+}
+EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
+
static int kvm_debugfs_open(struct inode *inode, struct file *file,
int (*get)(void *, u64 *), int (*set)(void *, u64),
const char *fmt)